[Likwid-commit] [likwid] 01/01: Initial version of LIKWID 4.0.0. I added some new patches in the patch dir but dpkg-buildpackage does not apply them

Thomas Röhl eatmyfear-guest at moszumanska.debian.org
Fri Nov 6 15:07:14 UTC 2015


This is an automated email from the git hooks/post-receive script.

eatmyfear-guest pushed a commit to branch likwid-4.0
in repository likwid.

commit 3bbca2d3dc11f06652877a7f6ec53e8433c2894f
Author: Thomas.Roehl <Thomas.Roehl at googlemail.com>
Date:   Fri Nov 6 16:04:39 2015 +0100

    Initial version of LIKWID 4.0.0. I added some new patches in the patch dir but dpkg-buildpackage does not apply them
---
 INSTALL                                       |  158 +-
 Makefile                                      |  429 +-
 README                                        |   56 +-
 bench/Makefile                                |  154 +
 bench/includes/allocator.h                    |   50 +
 bench/includes/barrier.h                      |   57 +
 bench/includes/barrier_types.h                |   49 +
 bench/includes/bstrlib.h                      |    1 +
 bench/includes/likwid.h                       | 1069 +++++
 bench/includes/strUtil.h                      |   59 +
 bench/includes/test_types.h                   |  105 +
 bench/includes/threads.h                      |  113 +
 bench/includes/threads_types.h                |   56 +
 bench/likwid-bench.c                          |  424 ++
 bench/perl/AsmGen.pl                          |  284 ++
 bench/perl/Parse/RecDescent.pm                | 3045 ++++++++++++
 bench/perl/Template.pm                        |  916 ++++
 bench/perl/Template/Base.pm                   |  283 ++
 bench/perl/Template/Config.pm                 |  428 ++
 bench/perl/Template/Constants.pm              |  265 ++
 bench/perl/Template/Context.pm                | 1477 ++++++
 bench/perl/Template/Directive.pm              | 1040 ++++
 bench/perl/Template/Document.pm               |  490 ++
 bench/perl/Template/Exception.pm              |  229 +
 bench/perl/Template/Filters.pm                |  811 ++++
 bench/perl/Template/Grammar.pm                | 6252 +++++++++++++++++++++++++
 bench/perl/Template/Iterator.pm               |  493 ++
 bench/perl/Template/Namespace/Constants.pm    |  176 +
 bench/perl/Template/Parser.pm                 | 1131 +++++
 bench/perl/Template/Plugin.pm                 |  369 ++
 bench/perl/Template/Plugin/Assert.pm          |  155 +
 bench/perl/Template/Plugin/CGI.pm             |  135 +
 bench/perl/Template/Plugin/Datafile.pm        |  166 +
 bench/perl/Template/Plugin/Date.pm            |  355 ++
 bench/perl/Template/Plugin/Directory.pm       |  386 ++
 bench/perl/Template/Plugin/Dumper.pm          |  152 +
 bench/perl/Template/Plugin/File.pm            |  391 ++
 bench/perl/Template/Plugin/Filter.pm          |  411 ++
 bench/perl/Template/Plugin/Format.pm          |   93 +
 bench/perl/Template/Plugin/HTML.pm            |  163 +
 bench/perl/Template/Plugin/Image.pm           |  436 ++
 bench/perl/Template/Plugin/Iterator.pm        |   88 +
 bench/perl/Template/Plugin/Math.pm            |  242 +
 bench/perl/Template/Plugin/Pod.pm             |   87 +
 bench/perl/Template/Plugin/Procedural.pm      |  133 +
 bench/perl/Template/Plugin/Scalar.pm          |  163 +
 bench/perl/Template/Plugin/String.pm          |  761 +++
 bench/perl/Template/Plugin/Table.pm           |  441 ++
 bench/perl/Template/Plugin/URL.pm             |  203 +
 bench/perl/Template/Plugin/View.pm            |   97 +
 bench/perl/Template/Plugin/Wrap.pm            |  142 +
 bench/perl/Template/Plugins.pm                |  466 ++
 bench/perl/Template/Provider.pm               | 1391 ++++++
 bench/perl/Template/Service.pm                |  573 +++
 bench/perl/Template/Stash.pm                  |  839 ++++
 bench/perl/Template/Stash/Context.pm          |  773 +++
 bench/perl/Template/Stash/XS.pm               |  137 +
 bench/perl/Template/Test.pm                   |  709 +++
 bench/perl/Template/VMethods.pm               |  587 +++
 bench/perl/Template/View.pm                   |  743 +++
 bench/perl/gas.pm                             |  211 +
 bench/perl/generatePas.pl                     |  163 +
 bench/perl/isax86.pm                          |   45 +
 bench/perl/isax86_64.pm                       |   66 +
 bench/perl/templates/bench.tt                 |   36 +
 bench/perl/templates/group.tt                 |  157 +
 bench/perl/templates/group_types.tt           |   13 +
 bench/perl/templates/testcases.tt             |   19 +
 bench/src/allocator.c                         |  171 +
 bench/src/barrier.c                           |  159 +
 bench/src/bench.c                             |  772 +++
 bench/src/strUtil.c                           |  315 ++
 bench/src/threads.c                           |  287 ++
 bench/x86-64/branch.ptt                       |   36 -
 bench/x86-64/peak.ptt                         |   49 -
 bench/x86-64/peak_avx.ptt                     |   49 -
 bench/x86-64/peak_sse.ptt                     |   49 -
 bench/x86-64/peakflops.ptt                    |   37 -
 bench/x86-64/peakflops_avx.ptt                |   37 -
 bench/x86-64/peakflops_sse.ptt                |   37 -
 bench/x86-64/stream_avx.ptt                   |   32 +-
 bench/x86-64/triad_avx.ptt                    |   12 +-
 config.mk                                     |   27 +-
 debian/changelog                              |    6 +
 debian/control                                |    2 +-
 debian/copyright                              |    4 +-
 debian/docs                                   |    1 +
 debian/files                                  |    1 +
 debian/likwid.lintian-overrides               |    4 +-
 debian/likwid.symbols                         |    2 -
 debian/patches/01-manpages.patch              |  547 +--
 debian/patches/03-Makefile-man.patch          |   73 +-
 debian/patches/11-hwloc-soname.patch          |   16 +
 debian/patches/12-lua-soname.patch            |   16 +
 debian/patches/13-likwid-soname.patch         |   16 +
 debian/patches/14-man-bench-fix.patch         |   13 +
 debian/watch                                  |    2 +-
 doc/Doxyfile                                  | 1781 +++++++
 doc/applications/likwid-accessD.md            |   55 +
 doc/applications/likwid-agent.md              |   94 +
 doc/applications/likwid-bench.md              |   93 +
 doc/applications/likwid-genTopoCfg.md         |   29 +
 doc/applications/likwid-memsweeper.md         |   34 +
 doc/applications/likwid-mpirun.md             |   83 +
 doc/applications/likwid-perfctr.md            |  260 +
 doc/applications/likwid-perfscope.md          |  107 +
 doc/applications/likwid-pin.md                |  170 +
 doc/applications/likwid-powermeter.md         |   75 +
 doc/applications/likwid-setFreq.md            |   13 +
 doc/applications/likwid-setFrequencies.md     |   50 +
 doc/applications/likwid-topology.md           |   68 +
 doc/archs/atom.md                             |  104 +
 doc/archs/broadwell.md                        |  203 +
 doc/archs/core2.md                            |  103 +
 doc/archs/haswell.md                          |  203 +
 doc/archs/haswellep.md                        |  896 ++++
 doc/archs/interlagos.md                       |  107 +
 doc/archs/ivybridge.md                        |  190 +
 doc/archs/ivybridgeep.md                      |  790 ++++
 doc/archs/k10.md                              |   68 +
 doc/archs/k8.md                               |   68 +
 doc/archs/kabini.md                           |  162 +
 doc/archs/nehalem.md                          |  237 +
 doc/archs/nehalemex.md                        |  554 +++
 doc/archs/pentiumm.md                         |   63 +
 doc/archs/phi.md                              |   78 +
 doc/archs/sandybridge.md                      |  189 +
 doc/archs/sandybridgeep.md                    |  775 +++
 doc/archs/silvermont.md                       |  175 +
 doc/archs/westmere.md                         |  239 +
 doc/archs/westmereex.md                       |  555 +++
 doc/bstrlib.txt                               | 3201 +++++++++++++
 doc/likwid-accessD.1                          |   10 +-
 doc/likwid-agent.1                            |   94 +
 doc/likwid-bench.1                            |  144 +-
 doc/likwid-doxygen.md                         |  262 ++
 doc/likwid-features.1                         |   22 +-
 doc/likwid-genCfg.1                           |   30 -
 doc/likwid-genTopoCfg.1                       |   30 +
 doc/likwid-memsweeper.1                       |   16 +-
 doc/likwid-mpirun.1                           |   69 +-
 doc/likwid-perfctr.1                          |  245 +-
 doc/likwid-perfscope.1                        |  178 +-
 doc/likwid-pin.1                              |  226 +-
 doc/likwid-powermeter.1                       |   69 +-
 doc/likwid-setFreq.1                          |   10 +-
 doc/likwid-setFrequencies.1                   |   40 +-
 doc/likwid-topology.1                         |   46 +-
 doc/likwid.cfg.md                             |   38 +
 doc/logo.png                                  |  Bin 0 -> 6776 bytes
 doc/lua-doxygen.md                            | 2615 +++++++++++
 examples/C-likwidAPI.c                        |  136 +
 examples/C-markerAPI.c                        |   87 +
 examples/F-markerAPI.F90                      |   79 +
 examples/Lua-likwidAPI.lua                    |   93 +
 examples/Makefile                             |   36 +
 ext/hwloc/AUTHORS                             |    8 +
 ext/hwloc/COPYING                             |   28 +
 ext/hwloc/Makefile                            |   64 +
 ext/hwloc/hwloc/base64.c                      |  306 ++
 ext/hwloc/hwloc/bind.c                        |  781 +++
 ext/hwloc/hwloc/bitmap.c                      | 1492 ++++++
 ext/hwloc/hwloc/components.c                  |  792 ++++
 ext/hwloc/hwloc/diff.c                        |  426 ++
 ext/hwloc/hwloc/distances.c                   |  995 ++++
 ext/hwloc/hwloc/dolib.c                       |   47 +
 ext/hwloc/hwloc/misc.c                        |  166 +
 ext/hwloc/hwloc/pci-common.c                  |  482 ++
 ext/hwloc/hwloc/topology-bgq.cb               |  246 +
 ext/hwloc/hwloc/topology-darwin.cb            |  307 ++
 ext/hwloc/hwloc/topology-fake.c               |   61 +
 ext/hwloc/hwloc/topology-freebsd.cb           |  255 +
 ext/hwloc/hwloc/topology-linux.c              | 5133 ++++++++++++++++++++
 ext/hwloc/hwloc/topology-noos.c               |   58 +
 ext/hwloc/hwloc/topology-opencl.cb            |  346 ++
 ext/hwloc/hwloc/topology-osf.cb               |  392 ++
 ext/hwloc/hwloc/topology-synthetic.c          | 1128 +++++
 ext/hwloc/hwloc/topology-x86.c                | 1386 ++++++
 ext/hwloc/hwloc/topology.c                    | 3436 ++++++++++++++
 ext/hwloc/hwloc/traversal.c                   |  701 +++
 ext/hwloc/include/hwloc.h                     | 2206 +++++++++
 ext/hwloc/include/hwloc/autogen/config.h      |  202 +
 ext/hwloc/include/hwloc/autogen/config.h.in   |  201 +
 ext/hwloc/include/hwloc/autogen/stamp-h2      |    1 +
 ext/hwloc/include/hwloc/bitmap.h              |  359 ++
 ext/hwloc/include/hwloc/cuda.h                |  224 +
 ext/hwloc/include/hwloc/cudart.h              |  184 +
 ext/hwloc/include/hwloc/deprecated.h          |  114 +
 ext/hwloc/include/hwloc/diff.h                |  299 ++
 ext/hwloc/include/hwloc/export.h              |  221 +
 ext/hwloc/include/hwloc/gl.h                  |  135 +
 ext/hwloc/include/hwloc/glibc-sched.h         |  125 +
 ext/hwloc/include/hwloc/helper.h              | 1249 +++++
 ext/hwloc/include/hwloc/inlines.h             |  154 +
 ext/hwloc/include/hwloc/intel-mic.h           |  143 +
 ext/hwloc/include/hwloc/linux-libnuma.h       |  273 ++
 ext/hwloc/include/hwloc/linux.h               |   77 +
 ext/hwloc/include/hwloc/myriexpress.h         |  127 +
 ext/hwloc/include/hwloc/nvml.h                |  176 +
 ext/hwloc/include/hwloc/opencl.h              |  199 +
 ext/hwloc/include/hwloc/openfabrics-verbs.h   |  155 +
 ext/hwloc/include/hwloc/plugins.h             |  433 ++
 ext/hwloc/include/hwloc/rename.h              |  651 +++
 ext/hwloc/include/numa.h                      |  468 ++
 ext/hwloc/include/pci/config.h                |   16 +
 ext/hwloc/include/pci/header.h                | 1195 +++++
 ext/hwloc/include/pci/pci.h                   |  240 +
 ext/hwloc/include/pci/types.h                 |   65 +
 ext/hwloc/include/private/autogen/README.txt  |    3 +
 ext/hwloc/include/private/autogen/config.h    |  761 +++
 ext/hwloc/include/private/components.h        |   40 +
 ext/hwloc/include/private/cpuid-x86.h         |   89 +
 ext/hwloc/include/private/cpuid.h             |   80 +
 ext/hwloc/include/private/debug.h             |   57 +
 ext/hwloc/include/private/map.h               |  110 +
 ext/hwloc/include/private/misc.h              |  382 ++
 ext/hwloc/include/private/private.h           |  335 ++
 ext/hwloc/include/private/solaris-chiptype.h  |   59 +
 ext/hwloc/include/private/xml.h               |   98 +
 ext/hwloc/include/static-components.h         |   17 +
 ext/lua/Makefile                              |   66 +
 ext/lua/includes/lapi.h                       |   24 +
 ext/lua/includes/lauxlib.h                    |  212 +
 ext/lua/includes/lcode.h                      |   83 +
 ext/lua/includes/lctype.h                     |   95 +
 ext/lua/includes/ldebug.h                     |   34 +
 ext/lua/includes/ldo.h                        |   46 +
 ext/lua/includes/lfunc.h                      |   33 +
 ext/lua/includes/lgc.h                        |  157 +
 ext/lua/includes/llex.h                       |   78 +
 ext/lua/includes/llimits.h                    |  309 ++
 ext/lua/includes/lmem.h                       |   57 +
 ext/lua/includes/lobject.h                    |  607 +++
 ext/lua/includes/lopcodes.h                   |  288 ++
 ext/lua/includes/lparser.h                    |  119 +
 ext/lua/includes/lstate.h                     |  228 +
 ext/lua/includes/lstring.h                    |   46 +
 ext/lua/includes/ltable.h                     |   41 +
 ext/lua/includes/ltm.h                        |   57 +
 ext/lua/includes/lua.h                        |  444 ++
 ext/lua/includes/luaconf.h                    |  551 +++
 ext/lua/includes/lualib.h                     |   55 +
 ext/lua/includes/lundump.h                    |   28 +
 ext/lua/includes/lvm.h                        |   44 +
 ext/lua/includes/lzio.h                       |   65 +
 ext/lua/includes/readline/chardefs.h          |  152 +
 ext/lua/includes/readline/history.h           |  267 ++
 ext/lua/includes/readline/keymaps.h           |   97 +
 ext/lua/includes/readline/readline.h          |  894 ++++
 ext/lua/includes/readline/rlconf.h            |   61 +
 ext/lua/includes/readline/rlstdc.h            |   45 +
 ext/lua/includes/readline/rltypedefs.h        |   93 +
 ext/lua/includes/readline/tilde.h             |   80 +
 ext/lua/src/lapi.c                            | 1284 +++++
 ext/lua/src/lauxlib.c                         |  959 ++++
 ext/lua/src/lbaselib.c                        |  458 ++
 ext/lua/src/lbitlib.c                         |  211 +
 ext/lua/src/lcode.c                           |  881 ++++
 ext/lua/src/lcorolib.c                        |  155 +
 ext/lua/src/lctype.c                          |   52 +
 ext/lua/src/ldblib.c                          |  398 ++
 ext/lua/src/ldebug.c                          |  580 +++
 ext/lua/src/ldo.c                             |  673 +++
 ext/lua/src/ldump.c                           |  173 +
 ext/lua/src/lfunc.c                           |  161 +
 ext/lua/src/lgc.c                             | 1213 +++++
 ext/lua/src/linit.c                           |   67 +
 ext/lua/src/liolib.c                          |  665 +++
 ext/lua/src/llex.c                            |  527 +++
 ext/lua/src/lmathlib.c                        |  279 ++
 ext/lua/src/lmem.c                            |   99 +
 ext/lua/src/loadlib.c                         |  725 +++
 ext/lua/src/lobject.c                         |  287 ++
 ext/lua/src/lopcodes.c                        |  107 +
 ext/lua/src/loslib.c                          |  323 ++
 ext/lua/src/lparser.c                         | 1638 +++++++
 ext/lua/src/lstate.c                          |  322 ++
 ext/lua/src/lstring.c                         |  185 +
 ext/lua/src/lstrlib.c                         | 1019 ++++
 ext/lua/src/ltable.c                          |  588 +++
 ext/lua/src/ltablib.c                         |  283 ++
 ext/lua/src/ltm.c                             |   77 +
 ext/lua/src/lua.c                             |  497 ++
 ext/lua/src/lundump.c                         |  258 +
 ext/lua/src/lvm.c                             |  867 ++++
 ext/lua/src/lzio.c                            |   76 +
 filters/csv                                   |  114 -
 filters/xml                                   |  184 +-
 groups/atom/BRANCH.txt                        |   16 +-
 groups/atom/DATA.txt                          |   12 +-
 groups/atom/MEM.txt                           |   12 +-
 groups/broadwell/BRANCH.txt                   |   31 +
 groups/broadwell/CLOCK.txt                    |   23 +
 groups/broadwell/DATA.txt                     |   22 +
 groups/broadwell/ENERGY.txt                   |   39 +
 groups/broadwell/FLOPS_AVX.txt                |   24 +
 groups/broadwell/FLOPS_DP.txt                 |   29 +
 groups/broadwell/FLOPS_SP.txt                 |   29 +
 groups/broadwell/ICACHE.txt                   |   25 +
 groups/broadwell/L2.txt                       |   37 +
 groups/broadwell/L2CACHE.txt                  |   34 +
 groups/broadwell/L3.txt                       |   36 +
 groups/broadwell/L3CACHE.txt                  |   35 +
 groups/broadwell/TLB_DATA.txt                 |   35 +
 groups/broadwell/TLB_INSTR.txt                |   28 +
 groups/broadwellEP/BRANCH.txt                 |   31 +
 groups/broadwellEP/CLOCK.txt                  |   23 +
 groups/broadwellEP/DATA.txt                   |   22 +
 groups/broadwellEP/ENERGY.txt                 |   35 +
 groups/broadwellEP/FLOPS_AVX.txt              |   24 +
 groups/broadwellEP/FLOPS_DP.txt               |   29 +
 groups/broadwellEP/FLOPS_SP.txt               |   29 +
 groups/broadwellEP/ICACHE.txt                 |   25 +
 groups/broadwellEP/L2.txt                     |   37 +
 groups/broadwellEP/L2CACHE.txt                |   34 +
 groups/broadwellEP/L3.txt                     |   36 +
 groups/broadwellEP/L3CACHE.txt                |   35 +
 groups/{haswell => broadwellEP}/TLB_DATA.txt  |    0
 groups/{haswell => broadwellEP}/TLB_INSTR.txt |    0
 groups/core2/BRANCH.txt                       |   10 +-
 groups/core2/CACHE.txt                        |    1 -
 groups/core2/DATA.txt                         |    6 +-
 groups/core2/FLOPS_DP.txt                     |    1 -
 groups/core2/FLOPS_SP.txt                     |    1 -
 groups/core2/FLOPS_X87.txt                    |    1 -
 groups/core2/L2.txt                           |   15 +-
 groups/core2/MEM.txt                          |    4 +-
 groups/core2/TLB.txt                          |    1 -
 groups/haswell/BRANCH.txt                     |   10 +-
 groups/haswell/DATA.txt                       |    4 +-
 groups/haswell/ENERGY.txt                     |   13 +-
 groups/haswell/ICACHE.txt                     |    6 +-
 groups/haswell/L2.txt                         |   18 +-
 groups/haswell/L2CACHE.txt                    |    9 +-
 groups/haswell/L3.txt                         |   16 +-
 groups/haswell/L3CACHE.txt                    |   12 +-
 groups/haswell/TLB_DATA.txt                   |   16 +-
 groups/haswell/TLB_INSTR.txt                  |    8 +-
 groups/haswellEP/BRANCH.txt                   |   31 +
 groups/haswellEP/CBOX.txt                     |   60 +
 groups/haswellEP/CLOCK.txt                    |   23 +
 groups/haswellEP/DATA.txt                     |   22 +
 groups/haswellEP/ENERGY.txt                   |   35 +
 groups/haswellEP/ICACHE.txt                   |   25 +
 groups/haswellEP/L2.txt                       |   37 +
 groups/haswellEP/L2CACHE.txt                  |   34 +
 groups/haswellEP/L3.txt                       |   36 +
 groups/haswellEP/L3CACHE.txt                  |   35 +
 groups/haswellEP/MEM.txt                      |   51 +
 groups/haswellEP/NUMA.txt                     |   33 +
 groups/haswellEP/QPI.txt                      |   37 +
 groups/haswellEP/SBOX.txt                     |   28 +
 groups/haswellEP/TLB_DATA.txt                 |   35 +
 groups/haswellEP/TLB_INSTR.txt                |   28 +
 groups/interlagos/BRANCH.txt                  |   16 +-
 groups/interlagos/DATA.txt                    |    4 +-
 groups/interlagos/FLOPS_DP.txt                |    2 +-
 groups/interlagos/FLOPS_SP.txt                |    2 +-
 groups/interlagos/ICACHE.txt                  |   16 +-
 groups/interlagos/L2CACHE.txt                 |    8 +-
 groups/interlagos/L3.txt                      |   19 +-
 groups/interlagos/L3CACHE.txt                 |    8 +-
 groups/ivybridge/BRANCH.txt                   |    8 +-
 groups/ivybridge/DATA.txt                     |    4 +-
 groups/ivybridge/ENERGY.txt                   |   12 +-
 groups/ivybridge/FLOPS_AVX.txt                |   10 +-
 groups/ivybridge/FLOPS_DP.txt                 |    6 +-
 groups/ivybridge/FLOPS_SP.txt                 |   10 +-
 groups/ivybridge/ICACHE.txt                   |    6 +-
 groups/ivybridge/L2.txt                       |   26 +-
 groups/ivybridge/L2CACHE.txt                  |    7 +-
 groups/ivybridge/L3.txt                       |   16 +-
 groups/ivybridge/L3CACHE.txt                  |   13 +-
 groups/ivybridge/MEM.txt                      |   32 -
 groups/ivybridge/MEM_DP.txt                   |   57 -
 groups/ivybridge/MEM_SP.txt                   |   57 -
 groups/ivybridge/TLB_DATA.txt                 |   16 +-
 groups/ivybridge/TLB_INSTR.txt                |    8 +-
 groups/ivybridgeEP/BRANCH.txt                 |   31 +
 groups/ivybridgeEP/CACHES.txt                 |   55 +
 groups/ivybridgeEP/CBOX.txt                   |   55 +
 groups/ivybridgeEP/CLOCK.txt                  |   23 +
 groups/ivybridgeEP/DATA.txt                   |   22 +
 groups/ivybridgeEP/ENERGY.txt                 |   33 +
 groups/ivybridgeEP/FLOPS_AVX.txt              |   25 +
 groups/ivybridgeEP/FLOPS_DP.txt               |   30 +
 groups/ivybridgeEP/FLOPS_SP.txt               |   31 +
 groups/ivybridgeEP/ICACHE.txt                 |   25 +
 groups/ivybridgeEP/L2.txt                     |   38 +
 groups/ivybridgeEP/L2CACHE.txt                |   34 +
 groups/ivybridgeEP/L3.txt                     |   36 +
 groups/ivybridgeEP/L3CACHE.txt                |   36 +
 groups/ivybridgeEP/MEM.txt                    |   49 +
 groups/ivybridgeEP/MEM_DP.txt                 |   68 +
 groups/ivybridgeEP/MEM_SP.txt                 |   70 +
 groups/ivybridgeEP/NUMA.txt                   |   33 +
 groups/ivybridgeEP/QPI.txt                    |   52 +
 groups/ivybridgeEP/TLB_DATA.txt               |   35 +
 groups/ivybridgeEP/TLB_INSTR.txt              |   28 +
 groups/ivybridgeEP/UNCORECLOCK.txt            |   84 +
 groups/k10/BRANCH.txt                         |   16 +-
 groups/k10/ICACHE.txt                         |   16 +-
 groups/k10/L2.txt                             |   16 +-
 groups/k10/L3CACHE.txt                        |    6 +-
 groups/k10/MEM.txt                            |   15 +-
 groups/k8/BRANCH.txt                          |   16 +-
 groups/k8/ICACHE.txt                          |   16 +-
 groups/kabini/BRANCH.txt                      |   16 +-
 groups/kabini/DATA.txt                        |    4 +-
 groups/kabini/ICACHE.txt                      |   16 +-
 groups/kabini/L2.txt                          |   16 +-
 groups/nehalem/BRANCH.txt                     |    8 +-
 groups/nehalem/DATA.txt                       |    4 +-
 groups/nehalem/FLOPS_DP.txt                   |    8 +-
 groups/nehalem/FLOPS_SP.txt                   |    8 +-
 groups/nehalem/ICACHE.txt                     |   25 +
 groups/nehalem/L2.txt                         |   28 +-
 groups/nehalem/L2CACHE.txt                    |    8 +-
 groups/nehalem/L3.txt                         |   16 +-
 groups/nehalem/L3CACHE.txt                    |   18 +-
 groups/nehalem/MEM.txt                        |   45 +-
 groups/nehalem/VIEW.txt                       |   50 -
 groups/nehalemEX/BRANCH.txt                   |    8 +-
 groups/nehalemEX/DATA.txt                     |    4 +-
 groups/nehalemEX/FLOPS_DP.txt                 |    8 +-
 groups/nehalemEX/FLOPS_SP.txt                 |    8 +-
 groups/nehalemEX/ICACHE.txt                   |   25 +
 groups/nehalemEX/L2.txt                       |   25 +-
 groups/nehalemEX/L2CACHE.txt                  |    9 +-
 groups/nehalemEX/L3.txt                       |   37 +
 groups/nehalemEX/L3CACHE.txt                  |   48 +
 groups/nehalemEX/MEM.txt                      |   53 +-
 groups/pentiumm/BRANCH.txt                    |   17 +
 groups/pentiumm/CPI.txt                       |   18 +
 groups/pentiumm/FLOPS_DP.txt                  |   18 +
 groups/pentiumm/FLOPS_SP.txt                  |   18 +
 groups/pentiumm/L3.txt                        |   30 +
 groups/phi/CACHE.txt                          |    9 +-
 groups/phi/COMPUTE_TO_DATA_RATIO.txt          |   22 +
 groups/phi/L2CACHE.txt                        |   19 -
 groups/phi/MEM.txt                            |   18 +
 groups/phi/MEM1.txt                           |   10 +-
 groups/phi/MEM2.txt                           |   10 +-
 groups/phi/MEM3.txt                           |    8 +-
 groups/phi/MEM4.txt                           |   10 +-
 groups/phi/MEM5.txt                           |   10 +-
 groups/phi/MEM6.txt                           |   10 +-
 groups/phi/MEM_READ.txt                       |   20 +
 groups/phi/MEM_WRITE.txt                      |   20 +
 groups/phi/PAIRING.txt                        |   14 +-
 groups/phi/READ_MISS_RATIO.txt                |    9 +-
 groups/phi/TLB.txt                            |   23 +
 groups/phi/TLB_L1.txt                         |   23 +
 groups/phi/TLB_L2.txt                         |   21 +
 groups/phi/VECTOR.txt                         |   10 +-
 groups/phi/VECTOR2.txt                        |    9 +-
 groups/phi/VPU_FILL_RATIO_DBL.txt             |   12 +-
 groups/phi/VPU_PAIRING.txt                    |   15 +-
 groups/phi/VPU_READ_MISS_RATIO.txt            |   10 +-
 groups/phi/VPU_WRITE_MISS_RATIO.txt           |   10 +-
 groups/phi/WRITE_MISS_RATIO.txt               |    9 +-
 groups/sandybridge/BRANCH.txt                 |    8 +-
 groups/sandybridge/DATA.txt                   |    4 +-
 groups/sandybridge/ENERGY.txt                 |   14 +-
 groups/sandybridge/FLOPS_AVX.txt              |    8 +-
 groups/sandybridge/FLOPS_DP.txt               |    8 +-
 groups/sandybridge/FLOPS_SP.txt               |   14 +-
 groups/sandybridge/ICACHE.txt                 |   25 +
 groups/sandybridge/L2.txt                     |   24 +-
 groups/sandybridge/L2CACHE.txt                |    7 +-
 groups/sandybridge/L3.txt                     |   16 +-
 groups/sandybridge/L3CACHE.txt                |   12 +-
 groups/sandybridge/MEM.txt                    |   32 -
 groups/sandybridge/MEM_DP.txt                 |   55 -
 groups/sandybridge/MEM_SP.txt                 |   56 -
 groups/sandybridge/TLB_DATA.txt               |   16 +-
 groups/sandybridge/TLB_INSTR.txt              |    8 +-
 groups/sandybridgeEP/BRANCH.txt               |   31 +
 groups/sandybridgeEP/CACHES.txt               |   76 +
 groups/sandybridgeEP/CLOCK.txt                |   27 +
 groups/sandybridgeEP/DATA.txt                 |   22 +
 groups/sandybridgeEP/ENERGY.txt               |   33 +
 groups/sandybridgeEP/FLOPS_AVX.txt            |   25 +
 groups/sandybridgeEP/FLOPS_DP.txt             |   31 +
 groups/sandybridgeEP/FLOPS_SP.txt             |   31 +
 groups/sandybridgeEP/ICACHE.txt               |   25 +
 groups/sandybridgeEP/L2.txt                   |   38 +
 groups/sandybridgeEP/L2CACHE.txt              |   34 +
 groups/sandybridgeEP/L3.txt                   |   36 +
 groups/sandybridgeEP/L3CACHE.txt              |   35 +
 groups/sandybridgeEP/MEM.txt                  |   40 +
 groups/sandybridgeEP/MEM_DP.txt               |   59 +
 groups/sandybridgeEP/MEM_SP.txt               |   61 +
 groups/sandybridgeEP/NUMA.txt                 |   33 +
 groups/sandybridgeEP/QPI.txt                  |   27 +
 groups/sandybridgeEP/TLB_DATA.txt             |   35 +
 groups/sandybridgeEP/TLB_INSTR.txt            |   28 +
 groups/silvermont/BRANCH.txt                  |    8 +-
 groups/silvermont/CLOCK.txt                   |   23 +
 groups/silvermont/DATA.txt                    |   22 +
 groups/silvermont/ENERGY.txt                  |    6 +-
 groups/silvermont/ICACHE.txt                  |    6 +-
 groups/silvermont/L1TOL2.txt                  |   28 -
 groups/silvermont/L2CACHE.txt                 |   34 +
 groups/silvermont/L2TOMEM.txt                 |   26 -
 groups/silvermont/MEM.txt                     |   37 +
 groups/silvermont/MEM_LAT.txt                 |   23 +
 groups/silvermont/TLB_DATA.txt                |   27 +
 groups/silvermont/TLB_INSTR.txt               |   27 +
 groups/westmere/BRANCH.txt                    |   10 +-
 groups/westmere/DATA.txt                      |    4 +-
 groups/westmere/FLOPS_DP.txt                  |    8 +-
 groups/westmere/FLOPS_SP.txt                  |    8 +-
 groups/westmere/ICACHE.txt                    |   25 +
 groups/westmere/L2.txt                        |   24 +-
 groups/westmere/L2CACHE.txt                   |    9 +-
 groups/westmere/L3.txt                        |   16 +-
 groups/westmere/L3CACHE.txt                   |   16 +-
 groups/westmere/MEM.txt                       |   47 +-
 groups/westmere/TLB.txt                       |   22 -
 groups/westmere/TLB_DATA.txt                  |   35 +
 groups/westmere/TLB_INSTR.txt                 |   27 +
 groups/westmereEX/BRANCH.txt                  |   10 +-
 groups/westmereEX/DATA.txt                    |    4 +-
 groups/westmereEX/FLOPS_DP.txt                |    6 +-
 groups/westmereEX/FLOPS_SP.txt                |    6 +-
 groups/westmereEX/ICACHE.txt                  |   25 +
 groups/westmereEX/L2.txt                      |   24 +-
 groups/westmereEX/L2CACHE.txt                 |    9 +-
 groups/westmereEX/L3.txt                      |   16 +-
 groups/westmereEX/L3CACHE.txt                 |   52 +
 groups/westmereEX/MEM.txt                     |   49 +-
 groups/westmereEX/NUMA.txt                    |   33 +
 groups/westmereEX/TLB.txt                     |   22 -
 groups/westmereEX/TLB_DATA.txt                |   35 +
 groups/westmereEX/TLB_INSTR.txt               |   27 +
 kernel/Makefile                               |    3 +-
 make/config_checks.mk                         |   38 +
 make/config_defines.mk                        |   99 +
 make/include_CLANG.mk                         |   28 +
 make/include_GCC.mk                           |   21 +-
 make/include_GCCX86.mk                        |   16 +-
 make/include_ICC.mk                           |   11 +-
 make/include_MIC.mk                           |    6 +-
 monitoring/README.agent                       |   66 +
 monitoring/groups/atom/BW_MEM.txt             |   10 +
 monitoring/groups/atom/FLOPS_DP.txt           |   13 +
 monitoring/groups/atom/FLOPS_SP.txt           |   12 +
 monitoring/groups/broadwell/BW.txt            |   13 +
 monitoring/groups/broadwell/ENERGY.txt        |   18 +
 monitoring/groups/broadwell/FLOPS_DP.txt      |   22 +
 monitoring/groups/broadwell/FLOPS_SP.txt      |   22 +
 monitoring/groups/broadwellEP/BW.txt          |   13 +
 monitoring/groups/broadwellEP/ENERGY.txt      |   18 +
 monitoring/groups/core2/BW_L2.txt             |   11 +
 monitoring/groups/core2/BW_MEM.txt            |   10 +
 monitoring/groups/haswell/BW.txt              |   13 +
 monitoring/groups/haswell/ENERGY.txt          |   18 +
 monitoring/groups/haswellEP/BW.txt            |   32 +
 monitoring/groups/haswellEP/ENERGY.txt        |   18 +
 monitoring/groups/interlagos/BW.txt           |   16 +
 monitoring/groups/interlagos/CPI.txt          |   19 +
 monitoring/groups/interlagos/FLOPS.txt        |   18 +
 monitoring/groups/ivybridge/BW.txt            |   13 +
 monitoring/groups/ivybridge/ENERGY.txt        |   18 +
 monitoring/groups/ivybridge/FLOPS_DP.txt      |   23 +
 monitoring/groups/ivybridge/FLOPS_SP.txt      |   24 +
 monitoring/groups/ivybridgeEP/BW.txt          |   32 +
 monitoring/groups/ivybridgeEP/ENERGY.txt      |   18 +
 monitoring/groups/ivybridgeEP/FLOPS_DP.txt    |   23 +
 monitoring/groups/ivybridgeEP/FLOPS_SP.txt    |   24 +
 monitoring/groups/kabini/BW.txt               |   14 +
 monitoring/groups/kabini/CPI.txt              |   19 +
 monitoring/groups/kabini/FLOPS.txt            |   14 +
 monitoring/groups/nehalem/BW.txt              |   20 +
 monitoring/groups/nehalem/CPI.txt             |   14 +
 monitoring/groups/nehalem/FLOPS.txt           |   20 +
 monitoring/groups/nehalemEX/BW.txt            |   29 +
 monitoring/groups/nehalemEX/CPI.txt           |   12 +
 monitoring/groups/nehalemEX/FLOPS.txt         |   20 +
 monitoring/groups/pentiumm/BW.txt             |   12 +
 monitoring/groups/pentiumm/CPI.txt            |   17 +
 monitoring/groups/phi/CPI.txt                 |   17 +
 monitoring/groups/sandybridge/BW.txt          |   13 +
 monitoring/groups/sandybridge/ENERGY.txt      |   18 +
 monitoring/groups/sandybridge/FLOPS_DP.txt    |   24 +
 monitoring/groups/sandybridge/FLOPS_SP.txt    |   24 +
 monitoring/groups/sandybridgeEP/BW.txt        |   24 +
 monitoring/groups/sandybridgeEP/ENERGY.txt    |   18 +
 monitoring/groups/sandybridgeEP/FLOPS_DP.txt  |   24 +
 monitoring/groups/sandybridgeEP/FLOPS_SP.txt  |   24 +
 monitoring/groups/silvermont/BW.txt           |   12 +
 monitoring/groups/silvermont/CPI.txt          |   14 +
 monitoring/groups/silvermont/ENERGY.txt       |   16 +
 monitoring/groups/westmere/BW.txt             |   19 +
 monitoring/groups/westmere/CPI.txt            |   14 +
 monitoring/groups/westmere/FLOPS.txt          |   20 +
 monitoring/groups/westmereEX/BW.txt           |   20 +
 monitoring/groups/westmereEX/CPI.txt          |   14 +
 monitoring/groups/westmereEX/FLOPS.txt        |   20 +
 monitoring/likwid-agent.conf                  |   52 +
 perl/feedGnuplot                              | 1543 ++++--
 perl/gen_events.pl                            |   77 +-
 perl/generatePas.pl                           |    2 +-
 perl/likwid-mpirun                            |  456 --
 perl/likwid-perfscope                         |  110 -
 perl/likwid-setFrequencies                    |  185 -
 perl/set_license.pl                           |  226 +-
 perl/templates/group.tt                       |   63 +-
 src/access-daemon/Makefile                    |   19 +-
 src/access-daemon/accessDaemon.c              |  602 +--
 src/access-daemon/setFreq.c                   |  243 +-
 src/access.c                                  |  224 +
 src/accessClient.c                            |  143 +-
 src/affinity.c                                |  299 +-
 src/allocator.c                               |  199 -
 src/applications/likwid-agent.lua             |  573 +++
 src/applications/likwid-bench.c               |  536 ---
 src/applications/likwid-features.c            |  191 -
 src/applications/likwid-genCfg.c              |  122 -
 src/applications/likwid-genTopoCfg.lua        |  140 +
 src/applications/likwid-memsweeper.c          |  138 -
 src/applications/likwid-memsweeper.lua        |   86 +
 src/applications/likwid-mpirun.lua            | 1490 ++++++
 src/applications/likwid-perfctr.c             |  528 ---
 src/applications/likwid-perfctr.lua           |  715 +++
 src/applications/likwid-perfscope.lua         |  599 +++
 src/applications/likwid-pin.c                 |  346 --
 src/applications/likwid-pin.lua               |  250 +
 src/applications/likwid-powermeter.c          |  507 --
 src/applications/likwid-powermeter.lua        |  331 ++
 src/applications/likwid-setFrequencies.lua    |  314 ++
 src/applications/likwid-topology.c            |  509 --
 src/applications/likwid-topology.lua          |  383 ++
 src/applications/likwid.lua                   | 1532 ++++++
 src/asciiBoxes.c                              |  256 -
 src/asciiTable.c                              |  236 -
 src/barrier.c                                 |  155 -
 src/bench.c                                   |  537 ---
 src/bitUtil.c                                 |    8 +-
 src/bstrlib.c                                 | 3072 ++++++------
 src/configuration.c                           |  183 +
 src/cpuFeatures.c                             |  272 +-
 src/cpuid.c                                   | 1244 -----
 src/daemon.c                                  |  123 -
 src/ghash.c                                   |   47 +-
 src/hashTable.c                               |   78 +-
 src/includes/access.h                         |   41 +
 src/includes/accessClient.h                   |   21 +-
 src/includes/accessClient_types.h             |   20 +-
 src/includes/affinity.h                       |   24 +-
 src/includes/affinity_types.h                 |   42 -
 src/includes/allocator.h                      |   48 -
 src/includes/asciiBoxes.h                     |   42 -
 src/includes/asciiBoxes_types.h               |   47 -
 src/includes/asciiTable.h                     |   45 -
 src/includes/asciiTable_types.h               |   48 -
 src/includes/barrier.h                        |   62 -
 src/includes/barrier_types.h                  |   49 -
 src/includes/bitUtil.h                        |    8 +-
 src/includes/bstrlib.h                        |   46 +-
 src/includes/configuration.h                  |   46 +
 src/includes/cpuFeatures.h                    |    8 +-
 src/includes/cpuFeatures_types.h              |   42 +-
 src/includes/cpuid.h                          |  130 -
 src/includes/cpuid_types.h                    |  115 -
 src/includes/daemon.h                         |   42 -
 src/includes/error.h                          |   70 +-
 src/includes/ghash.h                          |   42 +-
 src/includes/hashTable.h                      |    9 +-
 src/includes/libperfctr_types.h               |   10 +-
 src/includes/likwid.h                         | 1025 +++-
 src/includes/lock.h                           |   10 +-
 src/includes/memsweep.h                       |   15 +-
 src/includes/msr.h                            |   20 +-
 src/includes/multiplex.h                      |   40 -
 src/includes/multiplex_types.h                |   42 -
 src/includes/numa.h                           |   43 +-
 src/includes/numa_hwloc.h                     |   40 +
 src/includes/numa_proc.h                      |   39 +
 src/includes/numa_types.h                     |   52 -
 src/includes/pci.h                            |   21 +-
 src/includes/pci_hwloc.h                      |   37 +
 src/includes/pci_proc.h                       |   37 +
 src/includes/pci_types.h                      |   69 +-
 src/includes/perfmon.h                        |   88 +-
 src/includes/perfmon_atom.h                   |   11 +-
 src/includes/perfmon_atom_events.txt          |   17 +-
 src/includes/perfmon_broadwell.h              |  506 ++
 src/includes/perfmon_broadwell_counters.h     |   64 +
 src/includes/perfmon_broadwell_events.txt     |  393 ++
 src/includes/perfmon_core2.h                  |  335 +-
 src/includes/perfmon_core2_counters.h         |   31 +-
 src/includes/perfmon_core2_events.txt         |   56 +-
 src/includes/perfmon_haswell.h                | 1833 +++++++-
 src/includes/perfmon_haswellEP_counters.h     |  316 ++
 src/includes/perfmon_haswellEP_events.txt     | 2391 ++++++++++
 src/includes/perfmon_haswell_counters.h       |   65 +-
 src/includes/perfmon_haswell_events.txt       |  108 +-
 src/includes/perfmon_interlagos.h             |  326 +-
 src/includes/perfmon_interlagos_counters.h    |   35 +-
 src/includes/perfmon_interlagos_events.txt    |  130 +-
 src/includes/perfmon_ivybridge.h              | 1765 ++++---
 src/includes/perfmon_ivybridgeEP_events.txt   | 1955 ++++++++
 src/includes/perfmon_ivybridge_counters.h     |  344 +-
 src/includes/perfmon_ivybridge_events.txt     |  339 +-
 src/includes/perfmon_k10.h                    |  225 +-
 src/includes/perfmon_k10_counters.h           |   26 +-
 src/includes/perfmon_k10_events.txt           |   53 +-
 src/includes/perfmon_k8.h                     |   17 +-
 src/includes/perfmon_k8_events.txt            |   42 +-
 src/includes/perfmon_kabini.h                 |  386 +-
 src/includes/perfmon_kabini_counters.h        |   39 +-
 src/includes/perfmon_kabini_events.txt        |   30 +-
 src/includes/perfmon_nehalem.h                |  688 ++-
 src/includes/perfmon_nehalemEX.h              | 1786 ++++---
 src/includes/perfmon_nehalemEX_counters.h     |  185 +
 src/includes/perfmon_nehalemEX_events.txt     |  425 +-
 src/includes/perfmon_nehalem_counters.h       |   58 +-
 src/includes/perfmon_nehalem_events.txt       |   33 +-
 src/includes/perfmon_p6_events.txt            |   19 +-
 src/includes/perfmon_phi.h                    |  235 +-
 src/includes/perfmon_phi_counters.h           |   23 +-
 src/includes/perfmon_phi_events.txt           |   17 +-
 src/includes/perfmon_pm.h                     |  243 +-
 src/includes/perfmon_pm_counters.h            |   22 +-
 src/includes/perfmon_pm_events.txt            |   36 +-
 src/includes/perfmon_sandybridge.h            | 1950 ++++++--
 src/includes/perfmon_sandybridgeEP_events.txt | 1282 +++++
 src/includes/perfmon_sandybridge_counters.h   |  250 +-
 src/includes/perfmon_sandybridge_events.txt   |  167 +-
 src/includes/perfmon_silvermont.h             |  520 +-
 src/includes/perfmon_silvermont_counters.h    |   37 +-
 src/includes/perfmon_silvermont_events.txt    |  424 +-
 src/includes/perfmon_types.h                  |  304 +-
 src/includes/perfmon_westmere.h               |   13 +-
 src/includes/perfmon_westmereEX.h             | 1898 +++++---
 src/includes/perfmon_westmereEX_counters.h    |  274 +-
 src/includes/perfmon_westmereEX_events.txt    |  405 +-
 src/includes/perfmon_westmere_events.txt      |   63 +-
 src/includes/power.h                          |  178 +-
 src/includes/power_types.h                    |   39 +-
 src/includes/registers.h                      |  415 +-
 src/includes/registers_types.h                |  198 +
 src/includes/strUtil.h                        |   55 -
 src/includes/strUtil_types.h                  |   61 -
 src/includes/test_types.h                     |  108 -
 src/includes/textcolor.h                      |    8 +-
 src/includes/thermal.h                        |   50 +-
 src/includes/thermal_types.h                  |   18 +-
 src/includes/threads.h                        |  107 -
 src/includes/threads_types.h                  |   57 -
 src/includes/timer.h                          |   74 +-
 src/includes/timer_types.h                    |    8 +-
 src/includes/tlb-info.h                       |   89 +
 src/includes/topology.h                       |  139 +
 src/includes/topology_cpuid.h                 |   43 +
 src/includes/topology_hwloc.h                 |   51 +
 src/includes/topology_proc.h                  |   51 +
 src/includes/topology_types.h                 |   73 +
 src/includes/tree.h                           |    9 +-
 src/includes/tree_types.h                     |   34 +-
 src/includes/types.h                          |   29 +-
 src/libperfctr.c                              |  666 +--
 src/likwid.f90                                |  102 +-
 src/likwid_f90_interface.c                    |   47 +-
 src/loadData.S                                |   44 +
 src/loadData.s                                |   22 -
 src/loadData.s.tmp                            |    0
 src/luawid.c                                  | 1681 +++++++
 src/memsweep.c                                |   59 +-
 src/msr.c                                     |  264 +-
 src/multiplex.c                               |  165 -
 src/numa.c                                    |  401 +-
 src/numa_hwloc.c                              |  393 ++
 src/numa_proc.c                               |  372 ++
 src/pci.c                                     |  429 +-
 src/pci_hwloc.c                               |   82 +
 src/pci_proc.c                                |  126 +
 src/perfmon.c                                 | 2572 +++++-----
 src/power.c                                   |  489 +-
 src/pthread-overload/Makefile                 |   23 +-
 src/pthread-overload/pthread-overload.c       |   26 +-
 src/strUtil.c                                 |  975 ----
 src/thermal.c                                 |   20 +-
 src/threads.c                                 |  217 -
 src/timer.c                                   |  119 +-
 src/topology.c                                |  965 ++++
 src/topology_cpuid.c                          |  943 ++++
 src/topology_hwloc.c                          |  277 ++
 src/topology_proc.c                           |  627 +++
 src/tree.c                                    |   54 +-
 test/MPI_pin_test.c                           |   53 +-
 test/Makefile                                 |   16 +-
 test/accuracy/Makefile                        |   42 +-
 test/accuracy/README                          |    7 +-
 test/accuracy/TESTS/FLOPS_AVX.txt             |    8 +-
 test/accuracy/TESTS/FLOPS_DP.txt              |    2 +-
 test/accuracy/TESTS/FLOPS_SP.txt              |   10 +-
 test/accuracy/TESTS/L2.txt                    |    7 +-
 test/accuracy/TESTS/L3.txt                    |    7 +-
 test/accuracy/TESTS/MEM.txt                   |    7 +-
 test/accuracy/likwid-accuracy.py              |  240 +-
 test/accuracy/likwid-tester                   |  220 -
 test/accuracy/likwid-tester-plot              |   78 -
 test/executable_tests/Makefile                |   22 -
 test/executable_tests/README                  |    8 -
 test/executable_tests/likwid-bench.txt        |   29 -
 test/executable_tests/likwid-features.txt     |    9 -
 test/executable_tests/likwid-genCfg.txt       |    5 -
 test/executable_tests/likwid-memsweeper.txt   |    8 -
 test/executable_tests/likwid-perfctr.txt      |   38 -
 test/executable_tests/likwid-pin.txt          |   26 -
 test/executable_tests/likwid-powermeter.txt   |   14 -
 test/executable_tests/likwid-setFreq.txt      |    6 -
 test/executable_tests/likwid-topology.txt     |   11 -
 test/executable_tests/tester.sh               |   80 -
 test/stream.c                                 |  191 +
 818 files changed, 148876 insertions(+), 23680 deletions(-)

diff --git a/INSTALL b/INSTALL
index 5939aa9..e85caf6 100644
--- a/INSTALL
+++ b/INSTALL
@@ -1,25 +1,25 @@
 == Basic build ==
 
 1. Edit config.mk. Follow the comments there.
-   Optionally you can change compiler settings in include_[GCC|ICC|GCCX86].mk.
-   Please note that only the default compiler flags are supported and tested.
-2. make
-3. make install (required)
-4. setup access to the msr device files (see end of this document)
+   Optionally you can change compiler settings in include_[GCC|CLANG|ICC|MIC].mk.
+   Please note that only the default compiler flags for GCC are supported and tested.
+2. make (Builds hwloc, lua, Likwid libraries, access daemons and likwid-bench)
+3. make install (this is required for likwid-pin and if you use the accessDaemon)
 
 Only the default flags set are tested. As it is not possible to test all
-compiler setting variants the Intel icc compiler is only build tested. A basic
-function test is done for the icc binary. The only variant fully tested is gcc
-with default compiler flags. It is therefore recommended to use gcc with the
-default flags. If you want to use and build the Fortran interface you can mix
-GCC with the Intel Fortran Compiler. More information on this can be found in
-the WIKI. On 32bit systems you have to pick the GCCX86 compiler target.
+compiler setting variants the Intel icc compiler and Clang are only build tested.
+A basic function test is done for the icc binary. The only variant fully tested
+is gcc with default compiler flags. It is therefore recommended to use gcc with
+the default flags. If you want to use and build the Fortran interface you can mix
+GCC with the Intel Fortran Compiler (default setup). You can change the Fortran
+compiler in make/include_[GCC|CLANG|ICC|MIC].mk.
 
 *NOTICE*
 
-All generated files are located in the [GCC|ICC|GCCX86] build directory.
-This includes the dependency files, object files and also the
-generated source files and the pas and assembly files for likwid-bench.
+All generated files are located in the [GCC|ICC|CLANG|MIC] build directory.
+This includes the dependency files and object files. The
+generated source files and the pas and assembly files for likwid-bench are built
+in bench/[GCC|ICC|CLANG|MIC].
 If you debug your likwid-bench benchmarks you can look at all
 intermediate build files and also the final assembly code.
 
@@ -28,79 +28,96 @@ intermediate build files and also the final assembly code.
 On very old systems with old kernels (< 2.6.7) or old glibc versions likwid
 is build with reduced funtionality. This includes missing support for NUMA
 and pinning.
+likwid-setFrequencies can only be used if the acpi_cpufreq module is loaded. It
+is not possible to fix the frequency with the intel_pstate module.
 
 == Additional Targets ==
 
 make clean     -  clean the object directory
 make distclean -  clean also the executables/libraries
 make uninstall -  delete installed files
+make docs      -  generate html documentation using doxygen
+make local     -  set paths in Lua files to work from current directory
+                  (for testing only! Uses already installed access daemons and
+                  libraries.)
 
-== Build  accessDaemon ==
+== Dependencies ==
+Most parts of the Likwid suite do not have external dependencies that need to be
+installed before you can build Likwid. If external libraries are used, they are
+shipped with Likwid.
 
-To build the accessDaemon:
+Included dependencies:
+- hwloc
+- Lua
+- Perl Template toolkit
 
-1. Set the desired default ACCESSMODE. You can overwrite this on the command line.
-2. make will also build the accessDaemon
-3. Install with
-   make install
+Build dependencies:
+- C compiler (commonly gcc, but clang and icc are also possible)
+- make
+- Perl
 
-With the standard  make install target the daemon will also be installed in
-${PREFIX}/sbin . Don't forget to copy the dameon if you configured a different
-path in ACCESSDAEMON.
+Runtime dependencies for likwid-perfscope:
+- gnuplot
 
-== Setup of msr module ==
+Runtime dependencies for likwid-agent (if enabled in configfile):
+- gmetric (Output to Ganglia Monitoring System)
+- rrdtool (Output to RRDs)
+- logger (Output to syslog)
 
-likwid-perfctr, likwid-powermeter and likwid-features require the Linux msr kernel module. This module
-is part of most standard distro kernels. You have to be root to do the initial setup.
+For the HTML documentation you further need doxygen.
 
-Check if msr device files are there with 'ls /dev/cpu/0/'. If msr device files are not there try:
+== Build  accessDaemon ==
+
+Change path for the accessDaemon:
 
-1. Check if the msr module is loaded with  'lsmod | grep msr' . There should be an output.
-2. It the module is not loaded load it with  'modprobe msr' . For automatic loading at startup
-consult your distros documentation how to do so.
+1. Edit config.mk and configure the path in the ACCESSDAEMON variable. You can
+   overwrite it later in likwid.cfg.
+2. Set the desired default ACCESSMODE. You can overwrite this on the command
+   line or in likwid.cfg.
+3. make will also build the accessDaemon
+4. Install with (sudo) make install
 
-Once you have the msr device files avilable:
-3. Adopt access rights on the msr device files for normal user. To allow everybody access you can
-use 'chmod o+rw /dev/cpu/*/msr' . This is only recommended on save single user desktop systems.
+With the standard make install target the daemon will also be installed
+to the path in $ACCESSDAEMON. It also sets the owner to root and the suid bit.
+
+== Setup of msr module ==
+
+likwid-perfctr, likwid-powermeter and likwid-agent require the Linux msr kernel
+module. This module is part of most standard distro kernels. You have to be root
+to do the initial setup.
+
+1. Check if the msr module is loaded with 'lsmod | grep msr'.
+   There should be an output.
+2. If the module is not loaded, load it with 'modprobe msr'. For automatic
+   loading at startup consult your distro's documentation on how to do so, commonly
+   by adding 'msr' to /etc/modules.
+3. Adapt the access rights on the msr device files for normal users. To allow
+   everybody access you can use 'chmod o+rw /dev/cpu/*/msr'.
+   This is only recommended on safe single-user desktop systems and might not
+   be enough to grant access to everybody because of POSIX capabilities or other
+   security features of your distro.
 
 As a general access to the msr registers is not desired on security sensitive
-systems you can either implement a more sophisticated access rights settings
+systems, you can either implement a more sophisticated access rights settings
 with e.g. setgid. A common solution used on many other device files, e.g. for
 audio, is to introduce a group and make a chown on the msr device files to that
-group. Now if you execute likwid-perfctr with setgid on that group the
-executing user can use the tool but cannot directly write or read the msr
-device files.
+group or use dbus rules. Now if you execute likwid-perfctr with setgid on that
+group the executing user can use the tool but cannot directly write or read the
+msr device files.
 
 A secure solution is to use the accessDaemon, which encapsulates the access to
-the msr device files and performs a address check for allowed registers. For
-more information how to setup and use this solution have a look at the WIKI
-page:
-
-http://code.google.com/p/likwid/wiki/MSRDaemon
-
-A common solution to give access is to use the likwid-accessD and make it suid root.
-Starting with version 3.1.3 make install will do those steps. Of course this will only
-work as long as you are root while calling make install.
+the msr device files and performs an address check for allowed registers. For
+more information on how to set it up, look at the HTML documentation.
 
-If for you are not root and someone else needs to install the daemon the
-following steps need to be carried out:
-
-1. Go to the directory where you installed the likwid tools.
-2. Change to the sbin directory there.
-3. Execute (as root): chown root.<some user group>  likwid-accessD
-4. Execute (as root): chmod u+s likwid-accessD
-
-
-This should be sufficient on many machines.
-You need to perform the same procedure for likwid-setFreq.
-
-=== THIS IS USUALLY NOT NECESSARY ANYMORE ==
 A demo for a root exploit involving the msr device files was published. As
 a consequence the security settings for access to the msr device files are
-tightened in recent kernels.
+tightened in recent kernels. The exploit used a specific register to redirect
+the entry point of the current process to malware. The daemon grants access only
+to hardware performance counter related registers.
+
 Just setting the file access rights or using suid root on the access daemon is
-not sufficient anymore. You have to register your binary now to get access.
-This is only necessary if above setup dos not work.
+not sufficient anymore for some distros. You have to register your binary with
+libcap now to get access. This is only necessary if the above setup does not work.
 
 You register the necessary capability by calling
 
@@ -108,27 +125,10 @@ sudo setcap cap_sys_rawio+ep EXECUTABLE
 
 on the executables. This is only possible on local file systems.
 The only feasable way is to register the likwid-accessD and proxy all access over it.
-=== SNIP ==
 
 If you have still problems please let me know on the likwid mailing list:
-
 http://groups.google.com/group/likwid-users
 
-== NOTICE for Intel Xeon Phi (KNC) ==
-
-If you want to use LIKWID on a Xeon Phi you have to use set MIC as COMPILER in
-config.mk. This build of LIKWID won't be binary compatible with other X86
-processors. It is required to set the default access mode to direct in
-and disable the build of likwid-accessD in config.mk.
-
-To use LIKWID you have to turn of power management on the MIC. LIKWID relies on
-RDTSC being used for wallclock time. On the MIC this is only given if power
-management is turned off. This can be configured in
-/etc/sysconfig/mic/default.conf.
-
-At the end of this file the power management is configured. The following configuration worked:
-
-    PowerManagement "cpufreq_off;corec6_off;pc3_off;pc6_off"
 
 
 
diff --git a/Makefile b/Makefile
index eecd4e9..b0bc261 100644
--- a/Makefile
+++ b/Makefile
@@ -1,16 +1,17 @@
+#
 # =======================================================================================
 #
 #      Filename:  Makefile
 #
 #      Description:  Central Makefile
 #
-#      Version:   3.1.3
-#      Released:  4.11.2014
+#      Version:   <VERSION>
+#      Released:  <DATE>
 #
 #      Author:  Jan Treibig (jt), jan.treibig at gmail.com
 #      Project:  likwid
 #
-#      Copyright (C) 2014 Jan Treibig
+#      Copyright (C) 2013 Jan Treibig
 #
 #      This program is free software: you can redistribute it and/or modify it under
 #      the terms of the GNU General Public License as published by the Free Software
@@ -31,26 +32,10 @@ DOC_DIR     = ./doc
 GROUP_DIR   = ./groups
 FILTER_DIR  = ./filters
 MAKE_DIR    = ./make
+EXT_TARGETS = ./ext/lua
 
 #DO NOT EDIT BELOW
 
-# determine kernel Version
-KERNEL_VERSION_MAJOR := $(shell uname -r | awk '{split($$1,a,"."); print a[1]}' | cut -d '-' -f1)
-KERNEL_VERSION := $(shell uname -r | awk  '{split($$1,a,"."); print a[2]}' | cut -d '-' -f1)
-KERNEL_VERSION_MINOR := $(shell uname -r | awk '{split($$1,a,"."); print a[3]}' | cut -d '-' -f1)
-
-HAS_MEMPOLICY = $(shell if [ $(KERNEL_VERSION) -lt 7 -a $(KERNEL_VERSION_MAJOR) -lt 3 -a $(KERNEL_VERSION_MINOR) -lt 7 ]; then \
-               echo 0;  else echo 1; \
-			   fi; )
-
-HAS_RDTSCP = $(shell  /bin/bash -c "cat /proc/cpuinfo | grep -c rdtscp")
-
-# determine glibc Version
-GLIBC_VERSION := $(shell ldd --version | grep ldd |  awk '{ print $$NF }' | awk -F. '{ print $$2 }')
-
-HAS_SCHEDAFFINITY = $(shell if [ $(GLIBC_VERSION) -lt 4 ]; then \
-               echo 0;  else echo 1; \
-			   fi; )
 
 # Dependency chains:
 # *.[ch] -> *.o -> executables
@@ -59,161 +44,104 @@ HAS_SCHEDAFFINITY = $(shell if [ $(GLIBC_VERSION) -lt 4 ]; then \
 
 include ./config.mk
 include $(MAKE_DIR)/include_$(COMPILER).mk
-INCLUDES  += -I./src/includes  -I$(BUILD_DIR)
+
+DYNAMIC_TARGET_LIB := liblikwid.so
+STATIC_TARGET_LIB := liblikwid.a
+LIBLUA := ext/lua/liblua.a
+
+include $(MAKE_DIR)/config_checks.mk
+include $(MAKE_DIR)/config_defines.mk
+
+INCLUDES  += -I./src/includes -I./ext/lua/includes -I./ext/hwloc/include -I$(BUILD_DIR)
 LIBS      +=
-DEFINES   += -DVERSION=$(VERSION)         \
-		 -DRELEASE=$(RELEASE)                 \
-		 -DCFGFILE=$(CFG_FILE_PATH)           \
-		 -DMAX_NUM_THREADS=$(MAX_NUM_THREADS) \
-		 -DMAX_NUM_NODES=$(MAX_NUM_NODES)     \
-		 -DHASH_TABLE_SIZE=$(HASH_TABLE_SIZE) \
-		 -DLIBLIKWIDPIN=$(LIBLIKWIDPIN)       \
-		 -DLIKWIDFILTERPATH=$(LIKWIDFILTERPATH)
 
 #CONFIGURE BUILD SYSTEM
 BUILD_DIR  = ./$(COMPILER)
 Q         ?= @
 GENGROUPLOCK = .gengroup
 
-ifeq ($(COMPILER),MIC)
-BENCH_DIR   = ./bench/phi
-else
-ifeq ($(COMPILER),GCCX86)
-BENCH_DIR   = ./bench/x86
-else
-BENCH_DIR   = ./bench/x86-64
-endif
-endif
-
-LIKWID_LIB = liblikwid
-ifeq ($(SHARED_LIBRARY),true)
-CFLAGS += $(SHARED_CFLAGS) -ggdb
-DYNAMIC_TARGET_LIB := $(LIKWID_LIB).so
-TARGET_LIB := $(DYNAMIC_TARGET_LIB)
-LIBS += -L. -llikwid
-SHARED_LFLAGS += -lm -lpthread
-else
-STATIC_TARGET_LIB := $(LIKWID_LIB).a
-TARGET_LIB := $(STATIC_TARGET_LIB)
-endif
-
-ifneq ($(COLOR),NONE)
-DEFINES += -DCOLOR=$(COLOR)
-endif
-
-ifneq ($(COMPILER),MIC)
-    DAEMON_TARGET = likwid-accessD
-else
-    $(info Info: Compiling for Xeon Phi. Disabling build of likwid-accessD.);
-endif
-
-ifeq ($(INSTRUMENT_BENCH),true)
-DEFINES += -DPERFMON
-endif
-
-ifeq ($(HAS_MEMPOLICY),1)
-DEFINES += -DHAS_MEMPOLICY
-else
-$(info Kernel $(KERNEL_VERSION_MAJOR).$(KERNEL_VERSION).$(KERNEL_VERSION_MINOR) has no mempolicy support! First Linux kernel with memory policies has version 2.6.7);
-endif
-
-ifeq ($(HAS_RDTSCP),0)
-$(info Building without RDTSCP timing support!);
-else
-ifneq ($(COMPILER),MIC)
-DEFINES += -DHAS_RDTSCP
-else
-    $(info Info: Compiling for Xeon Phi. Disabling RDTSCP support.);
-endif
-endif
-
-ifeq ($(HAS_SCHEDAFFINITY),1)
-DEFINES += -DHAS_SCHEDAFFINITY
-PINLIB  = liblikwidpin.so
-else
-$(info GLIBC version 2.$(GLIBC_VERSION) has no pthread_setaffinity_np support!);
-PINLIB  =
-endif
-
-DEFINES += -DACCESSDAEMON=$(ACCESSDAEMON)
-
-ifeq ($(ACCESSMODE),accessdaemon)
-ifneq ($(COMPILER),MIC)
-    DEFINES += -DACCESSMODE=1
-else
-    $(info Info: Compiling for Xeon Phi. Set accessmode to direct.);
-    DEFINES += -DACCESSMODE=0
-endif
-else
-    DEFINES += -DACCESSMODE=0
-endif
-
-SETFREQ_TARGET = likwid-setFreq
-
 VPATH     = $(SRC_DIR)
 OBJ       = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.c))
-OBJ      += $(patsubst $(SRC_DIR)/%.s, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.s))
 OBJ      += $(patsubst $(SRC_DIR)/%.cc, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.cc))
+OBJ      += $(patsubst $(SRC_DIR)/%.S, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.S))
+ifeq ($(FILTER_HWLOC_OBJ),yes)
+OBJ := $(filter-out $(BUILD_DIR)/topology_hwloc.o,$(OBJ))
+OBJ := $(filter-out $(BUILD_DIR)/numa_hwloc.o,$(OBJ))
+OBJ := $(filter-out $(BUILD_DIR)/pci_hwloc.o,$(OBJ))
+endif
+ifneq ($(FORTRAN_INTERFACE),true)
+OBJ := $(filter-out $(BUILD_DIR)/likwid_f90_interface.o,$(OBJ))
+endif
 PERFMONHEADERS  = $(patsubst $(SRC_DIR)/includes/%.txt, $(BUILD_DIR)/%.h,$(wildcard $(SRC_DIR)/includes/*.txt))
-OBJ_BENCH  =  $(patsubst $(BENCH_DIR)/%.ptt, $(BUILD_DIR)/%.o,$(wildcard $(BENCH_DIR)/*.ptt))
-
-APPS      = likwid-perfctr    \
-            likwid-features   \
-            likwid-powermeter \
-            likwid-memsweeper \
-            likwid-topology   \
-            likwid-genCfg     \
-            likwid-pin        \
-            likwid-bench
-
-PERL_APPS = likwid-mpirun         \
-            likwid-setFrequencies \
-            likwid-perfscope
-
-DAEMON_APPS = $(SETFREQ_TARGET) \
-			$(DAEMON_TARGET)
+OBJ_LUA    =  $(wildcard ./ext/lua/$(COMPILER)/*.o)
+OBJ_HWLOC  =  $(wildcard ./ext/hwloc/$(COMPILER)/*.o)
+BENCH_TARGET = likwid-bench
+
+L_APPS      =   likwid-perfctr \
+				likwid-pin \
+				likwid-powermeter \
+				likwid-topology \
+				likwid-memsweeper \
+				likwid-agent \
+				likwid-mpirun \
+				likwid-perfscope \
+				likwid-genTopoCfg
+C_APPS      =   bench/likwid-bench
+L_HELPER    =   likwid.lua
+ifeq ($(BUILDFREQ),true)
+	L_APPS += likwid-setFrequencies
+endif
 
 CPPFLAGS := $(CPPFLAGS) $(DEFINES) $(INCLUDES)
 
-ifneq ($(FORTRAN_INTERFACE),false)
-HAS_FORTRAN_COMPILER = $(shell $(FC) --version 2>/dev/null || echo 'NOFORTRAN' )
-ifeq ($(HAS_FORTRAN_COMPILER),NOFORTRAN)
-FORTRAN_INTERFACE=
-$(info Warning: You have selected the fortran interface in config.mk, but there seems to be no fortran compiler - not compiling it!)
-else
-FORTRAN_INTERFACE = likwid.mod
-FORTRAN_INSTALL =  @cp -f likwid.mod  $(PREFIX)/include/
-endif
-else
-FORTRAN_INTERFACE =
-FORTRAN_INSTALL =
-endif
-
-all: $(BUILD_DIR) $(GENGROUPLOCK) $(PERFMONHEADERS) $(OBJ) $(OBJ_BENCH) $(TARGET_LIB) $(APPS) $(FORTRAN_INTERFACE)  $(PINLIB)  $(DAEMON_TARGET) $(SETFREQ_TARGET)
+all: $(BUILD_DIR) $(EXT_TARGETS) $(PERFMONHEADERS) $(OBJ) $(STATIC_TARGET_LIB) $(DYNAMIC_TARGET_LIB) $(FORTRAN_IF)  $(PINLIB) $(L_APPS) $(L_HELPER) $(DAEMON_TARGET) $(FREQ_TARGET) $(BENCH_TARGET)
 
 tags:
 	@echo "===>  GENERATE  TAGS"
 	$(Q)ctags -R
 
-$(APPS):  $(addprefix $(SRC_DIR)/applications/,$(addsuffix  .c,$(APPS))) $(BUILD_DIR) $(GENGROUPLOCK)  $(OBJ) $(OBJ_BENCH)
-	@echo "===>  LINKING  $@"
-	$(Q)${CC} $(CFLAGS) $(ANSI_CFLAGS) $(CPPFLAGS) ${LFLAGS} -o $@  $(addprefix $(SRC_DIR)/applications/,$(addsuffix  .c,$@)) $(OBJ_BENCH) $(STATIC_TARGET_LIB) $(LIBS)
-
-$(STATIC_TARGET_LIB): $(OBJ)
+docs:
+	@echo "===>  GENERATE DOXYGEN DOCS"
+	@cp doc/lua-doxygen.md doc/lua-doxygen.md.safe
+	@cp doc/likwid-doxygen.md doc/likwid-doxygen.md.safe
+	@sed -i -e s+'<PREFIX>'+$(PREFIX)+g -e s+'<VERSION>'+$(VERSION)+g -e s+'<DATE>'+'$(DATE)'+g -e s+'<RELEASE>'+$(RELEASE)+g doc/lua-doxygen.md
+	@sed -i -e s+'<PREFIX>'+$(PREFIX)+g -e s+'<VERSION>'+$(VERSION)+g -e s+'<DATE>'+'$(DATE)'+g -e s+'<RELEASE>'+$(RELEASE)+g doc/likwid-doxygen.md
+	$(Q)doxygen doc/Doxyfile
+	@mv doc/lua-doxygen.md.safe doc/lua-doxygen.md
+	@mv doc/likwid-doxygen.md.safe doc/likwid-doxygen.md
+
+$(L_APPS):  $(addprefix $(SRC_DIR)/applications/,$(addsuffix  .lua,$(L_APPS)))
+	@echo "===>  ADJUSTING  $@"
+	@if [ "$(ACCESSMODE)" = "direct" ]; then sed -i -e s/"access_mode = 1"/"access_mode = 0"/g $(SRC_DIR)/applications/$@.lua;fi
+	@sed -e s/'<PREFIX>'/$(subst /,\\/,$(PREFIX))/g \
+		-e s/'<VERSION>'/$(VERSION).$(RELEASE)/g \
+		-e s/'<DATE>'/$(DATE)/g \
+		$(addprefix $(SRC_DIR)/applications/,$(addsuffix  .lua,$@)) > $@
+	@if [ "$(ACCESSMODE)" = "direct" ]; then sed -i -e s/"access_mode = 0"/"access_mode = 1"/g $(SRC_DIR)/applications/$@.lua;fi
+
+$(L_HELPER):
+	@echo "===>  ADJUSTING  $@"
+	@sed -e s/'<PREFIX>'/$(subst /,\\/,$(PREFIX))/g \
+		-e s/'<VERSION>'/$(VERSION)/g \
+		-e s/'<RELEASE>'/$(RELEASE)/g \
+		$(SRC_DIR)/applications/$@ > $@
+
+$(STATIC_TARGET_LIB): $(BUILD_DIR) $(PERFMONHEADERS) $(OBJ)
 	@echo "===>  CREATE STATIC LIB  $(STATIC_TARGET_LIB)"
-	$(Q)${AR} -crus $(STATIC_TARGET_LIB) $(OBJ)
+	$(Q)${AR} -crus $(STATIC_TARGET_LIB) $(OBJ) $(LIBHWLOC) $(LIBLUA)
+
 
-$(DYNAMIC_TARGET_LIB): $(OBJ)
+$(DYNAMIC_TARGET_LIB): $(BUILD_DIR) $(PERFMONHEADERS) $(OBJ)
 	@echo "===>  CREATE SHARED LIB  $(DYNAMIC_TARGET_LIB)"
-	$(Q)${CC} $(SHARED_CFLAGS) -o $(DYNAMIC_TARGET_LIB) $(OBJ) -lm $(SHARED_LFLAGS)
+	$(Q)${CC} $(DEBUG_FLAGS) $(SHARED_LFLAGS) $(SHARED_CFLAGS) -Wl,-soname,$(DYNAMIC_TARGET_LIB).$(VERSION) -o $(DYNAMIC_TARGET_LIB) $(OBJ) $(LIBS) $(LIBHWLOC) $(LIBLUA)
 
 $(DAEMON_TARGET): $(SRC_DIR)/access-daemon/accessDaemon.c
-	@echo "===>  Build access daemon $(DAEMON_TARGET)"
-	$(Q)$(MAKE) -s -C  $(SRC_DIR)/access-daemon $(DAEMON_TARGET)
+	@echo "===>  Build access daemon likwid-accessD"
+	$(Q)$(MAKE) -C  $(SRC_DIR)/access-daemon likwid-accessD
 
-$(SETFREQ_TARGET): $(SRC_DIR)/access-daemon/setFreq.c
-	@echo "===>  Build frequency daemon $(SETFREQ_TARGET)"
-	$(Q)$(MAKE) -s -C  $(SRC_DIR)/access-daemon $(SETFREQ_TARGET)
+$(FREQ_TARGET): $(SRC_DIR)/access-daemon/setFreq.c
+	@echo "===>  Build frequency daemon likwid-setFreq"
+	$(Q)$(MAKE) -C  $(SRC_DIR)/access-daemon likwid-setFreq
 
 $(BUILD_DIR):
 	@mkdir $(BUILD_DIR)
@@ -227,135 +155,204 @@ $(GENGROUPLOCK): $(foreach directory,$(shell ls $(GROUP_DIR)), $(wildcard $(GROU
 	$(Q)$(GEN_GROUPS) ./groups  $(BUILD_DIR) ./perl/templates
 	$(Q)touch $(GENGROUPLOCK)
 
-$(FORTRAN_INTERFACE): $(SRC_DIR)/likwid.f90
+$(FORTRAN_IF): $(SRC_DIR)/likwid.f90
 	@echo "===>  COMPILE FORTRAN INTERFACE  $@"
 	$(Q)$(FC) -c  $(FCFLAGS) $<
 	@rm -f likwid.o
 
+$(EXT_TARGETS):
+	@echo "===>  ENTER  $@"
+	$(Q)$(MAKE) --no-print-directory -C $@ $(MAKECMDGOALS)
+
+$(BENCH_TARGET):
+	@echo "===>  ENTER  $@"
+	$(Q)$(MAKE) --no-print-directory -C bench $(MAKECMDGOALS)
+
 #PATTERN RULES
 $(BUILD_DIR)/%.o:  %.c
 	@echo "===>  COMPILE  $@"
-	$(Q)$(CC) -c  $(CFLAGS) $(ANSI_CFLAGS) $(CPPFLAGS) $< -o $@
-	$(Q)$(CC) $(CPPFLAGS) -MT $(@:.d=.o) -MM  $< > $(BUILD_DIR)/$*.d
-
-$(BUILD_DIR)/%.o:  %.s
-	@echo "===>  ASSEMBLE  $@"
-	$(Q)$(AS) $(ASFLAGS)  $< -o $@
+	$(Q)$(CC) -g -c $(DEBUG_FLAGS) $(CFLAGS) $(ANSI_CFLAGS) $(CPPFLAGS) $< -o $@
+	$(Q)$(CC) -g $(DEBUG_FLAGS) $(CPPFLAGS) -MT $(@:.d=.o) -MM  $< > $(BUILD_DIR)/$*.d
 
 $(BUILD_DIR)/%.o:  %.cc
 	@echo "===>  COMPILE  $@"
-	$(Q)$(CXX) -c  $(CXXFLAGS) $(CPPFLAGS) $< -o $@
-	$(Q)$(CXX) $(CXXFLAGS) $(CPPFLAGS) -MT $(@:.d=.o) -MM  $< > $(BUILD_DIR)/$*.d
+	$(Q)$(CXX) -c $(DEBUG_FLAGS) $(CXXFLAGS) $(CPPFLAGS) $< -o $@
+	$(Q)$(CXX) $(DEBUG_FLAGS) $(CXXFLAGS) $(CPPFLAGS) -MT $(@:.d=.o) -MM  $< > $(BUILD_DIR)/$*.d
 
+$(BUILD_DIR)/%.o:  %.S
+	@echo "===>  COMPILE  $@"
+	$(Q)$(CC) -c $(DEBUG_FLAGS) $(CFLAGS) $(CXXFLAGS) $(CPPFLAGS) $< -o $@
 
-$(BUILD_DIR)/%.pas:  $(BENCH_DIR)/%.ptt
-	@echo "===>  GENERATE BENCHMARKS"
-	$(Q)$(GEN_PAS)  $(BENCH_DIR) $(BUILD_DIR) ./perl/templates
 
 $(BUILD_DIR)/%.h:  $(SRC_DIR)/includes/%.txt
 	@echo "===>  GENERATE HEADER $@"
 	$(Q)$(GEN_PMHEADER) $< $@
 
-$(BUILD_DIR)/%.o:  $(BUILD_DIR)/%.pas
-	@echo "===>  ASSEMBLE  $@"
-	$(Q)$(PAS) -i $(PASFLAGS) -o $(BUILD_DIR)/$*.s $<  '$(DEFINES)'
-	$(Q)$(AS) $(ASFLAGS)  $(BUILD_DIR)/$*.s -o $@
 
 ifeq ($(findstring $(MAKECMDGOALS),clean),)
 -include $(OBJ:.o=.d)
 endif
 
-.PHONY: clean distclean install uninstall
+.PHONY: clean distclean install uninstall $(EXT_TARGETS)
+
 
 .PRECIOUS: $(BUILD_DIR)/%.pas
 
 .NOTPARALLEL:
 
 
-clean:
+clean: $(EXT_TARGETS) $(BENCH_TARGET)
 	@echo "===>  CLEAN"
 	@rm -rf $(BUILD_DIR)
 	@rm -f $(GENGROUPLOCK)
 
 distclean: clean
 	@echo "===>  DIST CLEAN"
-	@rm -f likwid-*
-	@rm -f $(LIKWID_LIB)*
-	@rm -f $(FORTRAN_INTERFACE)
+	@for APP in $(L_APPS); do \
+		rm -f $$APP; \
+	done
+	@rm -f likwid.lua
+	@rm -f $(STATIC_TARGET_LIB)
+	@rm -f $(DYNAMIC_TARGET_LIB)
+	@rm -f $(FORTRAN_IF_NAME)
+	@rm -f $(FREQ_TARGET) $(DAEMON_TARGET)
 	@rm -f $(PINLIB)
+	@rm -rf doc/html
 	@rm -f tags
 
-install:
+ifeq ($(BUILDDAEMON),true)
+install_daemon:
+	@echo "===> INSTALL access daemon to $(ACCESSDAEMON)"
+	@mkdir -p `dirname $(ACCESSDAEMON)`
+	@install -m 4775 -g root -o root $(DAEMON_TARGET) $(ACCESSDAEMON)
+uninstall_daemon:
+	@echo "===> REMOVING access daemon from $(ACCESSDAEMON)"
+	@rm -f $(ACCESSDAEMON)
+else
+install_daemon:
+	@echo "===> No INSTALL of the access daemon"
+uninstall_daemon:
+	@echo "===> No UNINSTALL of the access daemon"
+endif
+
+ifeq ($(BUILDFREQ),true)
+install_freq:
+	@echo "===> INSTALL setFrequencies tool to $(PREFIX)/sbin/$(FREQ_TARGET)"
+	@mkdir -p $(PREFIX)/sbin
+	@install -m 4775 -g root -o root $(FREQ_TARGET) $(PREFIX)/sbin/$(FREQ_TARGET)
+uninstall_freq:
+	@echo "===> REMOVING setFrequencies tool from $(PREFIX)/sbin/$(FREQ_TARGET)"
+	@rm -f $(PREFIX)/sbin/$(FREQ_TARGET)
+else
+install_freq:
+	@echo "===> No INSTALL of setFrequencies tool"
+uninstall_freq:
+	@echo "===> No UNINSTALL of setFrequencies tool"
+endif
+
+install: install_daemon install_freq
 	@echo "===> INSTALL applications to $(PREFIX)/bin"
 	@mkdir -p $(PREFIX)/bin
-	@for app in $(APPS); do \
-		cp -f $$app $(PREFIX)/bin; \
-	done
-	@cp -f perl/feedGnuplot  $(PREFIX)/bin
-	@for app in $(PERL_APPS); do \
-		sed -e "s+<PREFIX>+$(PREFIX)+g" perl/$$app > $(PREFIX)/bin/$$app; \
+	@for APP in $(L_APPS); do \
+		install -m 755 $$APP  $(PREFIX)/bin; \
 	done
-	@chmod 755 $(PREFIX)/bin/likwid-*
-	@echo "===> INSTALL daemon applications to $(PREFIX)/sbin"
-	@mkdir -p $(PREFIX)/sbin
-	@for app in $(DAEMON_APPS); do \
-		cp -f $$app $(PREFIX)/sbin; \
-		if [ $(shell id -u) = "0" ]; then \
-			chown root $(PREFIX)/sbin/$$app; \
-			chmod 4775 $(PREFIX)/sbin/$$app; \
-		else \
-			echo "Only root can adjust the privileges of the daemon applications in $(PREFIX)/sbin"; \
-		fi; \
+	@for APP in $(C_APPS); do \
+		install -m 755 $$APP  $(PREFIX)/bin; \
 	done
+	@install -m 755 ext/lua/lua $(PREFIX)/bin/likwid-lua
+	@echo "===> INSTALL helper applications"
+	@install -m 755 perl/feedGnuplot $(PREFIX)/bin
+	@echo "===> INSTALL lua to likwid interface to $(PREFIX)/share/lua"
+	@mkdir -p $(PREFIX)/share/lua
+	@install -m 755 likwid.lua $(PREFIX)/share/lua
+	@echo "===> INSTALL libraries to $(PREFIX)/lib"
+	@mkdir -p $(PREFIX)/lib
+	@install -m 755 liblikwid.so $(PREFIX)/lib
+	@install -m 644 liblikwid.a $(PREFIX)/lib
+	@install -m 755 liblikwidpin.so $(PREFIX)/lib
+	@install -m 644 ext/lua/liblua.a $(PREFIX)/lib
+	@install -m 755 ext/hwloc/libhwloc.so $(PREFIX)/lib
+	@install -m 644 ext/hwloc/libhwloc.a $(PREFIX)/lib
 	@echo "===> INSTALL man pages to $(MANPREFIX)/man1"
 	@mkdir -p $(MANPREFIX)/man1
 	@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-topology.1 > $(MANPREFIX)/man1/likwid-topology.1
 	@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-features.1 > $(MANPREFIX)/man1/likwid-features.1
-	@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-perfctr.1 > $(MANPREFIX)/man1/likwid-perfctr.1
+	@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" -e "s+<PREFIX>+$(PREFIX)+g" < $(DOC_DIR)/likwid-perfctr.1 > $(MANPREFIX)/man1/likwid-perfctr.1
 	@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-powermeter.1 > $(MANPREFIX)/man1/likwid-powermeter.1
 	@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-pin.1 > $(MANPREFIX)/man1/likwid-pin.1
-	@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-setFrequencies.1 > $(MANPREFIX)/man1/likwid-setFrequencies.1
-	@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-bench.1 > $(MANPREFIX)/man1/likwid-bench.1
 	@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/feedGnuplot.1 > $(MANPREFIX)/man1/feedGnuplot.1
 	@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-accessD.1 > $(MANPREFIX)/man1/likwid-accessD.1
-	@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-genCfg.1 > $(MANPREFIX)/man1/likwid-genCfg.1
+	@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-genTopoCfg.1 > $(MANPREFIX)/man1/likwid-genTopoCfg.1
 	@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-memsweeper.1 > $(MANPREFIX)/man1/likwid-memsweeper.1
 	@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-mpirun.1 > $(MANPREFIX)/man1/likwid-mpirun.1
 	@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-perfscope.1 > $(MANPREFIX)/man1/likwid-perfscope.1
 	@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-setFreq.1 > $(MANPREFIX)/man1/likwid-setFreq.1
 	@chmod 644 $(MANPREFIX)/man1/likwid-*
 	@echo "===> INSTALL headers to $(PREFIX)/include"
-	@mkdir -p $(PREFIX)/include/likwid
-	@cp -f src/includes/likwid*.h  $(PREFIX)/include/
-	@cp -f src/includes/*  $(PREFIX)/include/likwid
-	@cp -f GCC/perfmon_group_types.h  $(PREFIX)/include/likwid
+	@mkdir -p $(PREFIX)/include
+	@install -m 644 src/includes/likwid.h  $(PREFIX)/include/
+	@install -m 644 src/includes/bstrlib.h  $(PREFIX)/include/
 	$(FORTRAN_INSTALL)
-	@echo "===> INSTALL libraries to $(PREFIX)/lib"
-	@mkdir -p $(PREFIX)/lib
-	@cp -f $(LIKWID_LIB)*  $(PREFIX)/lib
-	@chmod 755 $(PREFIX)/lib/$(PINLIB)
+	@echo "===> INSTALL groups to $(PREFIX)/share/likwid/perfgroups"
+	@mkdir -p $(PREFIX)/share/likwid/perfgroups
+	@cp -rf groups/* $(PREFIX)/share/likwid/perfgroups
+	@echo "===> INSTALL monitoring groups to $(PREFIX)/share/likwid/mongroups"
+	@mkdir -p $(PREFIX)/share/likwid/mongroups
+	@cp -rf monitoring/groups/* $(PREFIX)/share/likwid/mongroups
+	@mkdir -p $(PREFIX)/share/likwid/docs
+	@install -m 644 doc/bstrlib.txt $(PREFIX)/share/likwid/docs
+	@mkdir -p $(PREFIX)/share/likwid/examples
+	@install -m 644 examples/* $(PREFIX)/share/likwid/examples
+	@echo "===> INSTALL default likwid-agent.conf to $(PREFIX)/etc"
+	@sed -e "s+<PREFIX>+$(PREFIX)+g" monitoring/likwid-agent.conf > $(PREFIX)/share/likwid/mongroups/likwid-agent.conf
+	@chmod 600 $(PREFIX)/share/likwid/mongroups/likwid-agent.conf
 	@echo "===> INSTALL filters to $(LIKWIDFILTERPATH)"
 	@mkdir -p $(LIKWIDFILTERPATH)
 	@cp -f filters/*  $(LIKWIDFILTERPATH)
 	@chmod 755 $(LIKWIDFILTERPATH)/*
 
-uninstall:
+
+uninstall: uninstall_daemon uninstall_freq
 	@echo "===> REMOVING applications from $(PREFIX)/bin"
-	@rm -f $(addprefix $(PREFIX)/bin/,$(APPS))
-	@rm -f $(addprefix $(PREFIX)/bin/,$(PERL_APPS))
+	@rm -f $(addprefix $(PREFIX)/bin/,$(addsuffix  .lua,$(L_APPS)))
+	@for APP in $(L_APPS); do \
+		rm -f $(PREFIX)/bin/$$APP; \
+	done
+	@for APP in $(C_APPS); do \
+		rm -f $(PREFIX)/bin/$$APP; \
+	done
 	@rm -f $(PREFIX)/bin/feedGnuplot
-	@echo "===> REMOVING daemon applications from $(PREFIX)/sbin"
-	@rm -f $(addprefix $(PREFIX)/sbin/,$(DAEMON_APPS))
+	@rm -rf $(PREFIX)/bin/likwid-lua
+	@echo "===> REMOVING Lua to likwid interface from $(PREFIX)/share/lua"
+	@rm -rf  $(PREFIX)/share/lua/likwid.lua
+	@echo "===> REMOVING libs from $(PREFIX)/lib"
+	@rm -f $(PREFIX)/lib/liblikwid*
+	@rm -f $(PREFIX)/lib/libhwloc*
+	@rm -f $(PREFIX)/lib/liblua*
 	@echo "===> REMOVING man pages from $(MANPREFIX)/man1"
-	@rm -f $(MANPREFIX)/man1/likwid-*
+	@rm -f $(addprefix $(MANPREFIX)/man1/,$(addsuffix  .1,$(L_APPS)))
 	@rm -f $(MANPREFIX)/man1/feedGnuplot.1
-	@echo "===> REMOVING headers from $(PREFIX)/include"
-	@rm -f $(PREFIX)/include/likwid*.h
-	@rm -rf $(PREFIX)/include/likwid
-	@echo "===> REMOVING libs from $(PREFIX)/lib"
-	@rm -f $(PREFIX)/lib/$(LIKWID_LIB)*
-	@echo "===> REMOVING filter from $(PREFIX)/share"
-	@rm -rf  $(PREFIX)/share/likwid
-
-
+	@rm -f $(MANPREFIX)/man1/likwid-setFreq.1
+	@rm -f $(MANPREFIX)/man1/likwid-accessD.1
+	@echo "===> REMOVING header from $(PREFIX)/include"
+	@rm -f $(PREFIX)/include/likwid.h
+	@rm -f $(PREFIX)/include/bstrlib.h
+	$(FORTRAN_REMOVE)
+	@echo "===> REMOVING filter, groups and default configs from $(PREFIX)/share/likwid"
+	@rm -rf $(LIKWIDFILTERPATH)
+	@rm -rf $(PREFIX)/share/likwid/mongroups
+	@rm -rf $(PREFIX)/share/likwid/perfgroups
+	@rm -rf $(PREFIX)/share/likwid/docs
+	@rm -rf $(PREFIX)/share/likwid/examples
+
+# local: rewrite the generated Lua scripts so they run from the build tree.
+# $(CURDIR) is used because a shell assignment does not outlive its recipe line.
+local: $(L_APPS) likwid.lua
+	@echo "===> Setting Lua scripts to run from current directory"
+	@for APP in $(L_APPS); do \
+		sed -i -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" -e "s/<RELEASE>/$(RELEASE)/g" -e "s+$(PREFIX)/bin/likwid-lua+$(CURDIR)/ext/lua/lua+" -e "s+$(PREFIX)/share/lua/?.lua+$(CURDIR)/?.lua+" $$APP; \
+		chmod +x $$APP; \
+	done
+	@sed -i -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" -e "s/<RELEASE>/$(RELEASE)/g" -e "s+$(PREFIX)/lib+$(CURDIR)+g" -e "s+$(PREFIX)/share/likwid/perfgroups+$(CURDIR)/groups+g" likwid.lua;
+	@sed -i -e "s+$(PREFIX)/share/likwid/mongroups+$(CURDIR)/monitoring/groups+g" likwid-agent
 
diff --git a/README b/README
index f47ac01..8660eef 100644
--- a/README
+++ b/README
@@ -1,29 +1,55 @@
+--------------------------------------------------------------------------------
+Introduction
+--------------------------------------------------------------------------------
 Likwid is a simple to install and use toolsuite of command line applications
 for performance oriented programmers. It works for Intel and AMD processors
 on the Linux operating system.
 
 It consists of:
 
-likwid-topology       - print thread and cache topology
-likwid-features       - view and toggle feature reagister on Intel processors
-likwid-perfctr        - configure and read out hardware performance counters on Intel and AMD processors
-likwid-powermeter     - read out RAPL Energy information and get info about Turbo Mode steps
-likwid-setFrequencies - read out RAPL Energy information and get info about Turbo Mode steps
-likwid-memsweeper     - cleans up filled NUMA memory domains and evicts dirty cacheline from cache hierarchy
-likwid-pin            - pin your threaded application (pthread, Intel and gcc OpenMP to dedicated processors
-likwid-bench          - Micro benchmarking platform
-likwid-gencfg         - Dumps topology information to a file
-likwid-mpirun         - Wrapper to start MPI and Hybrid MPI/OpenMP applications (Supports Intel MPI and OpenMPI)
-likwid-scope          - Frontend to the timeline mode of likwid-perfctr, plots live graphs of performance metrics
-
+likwid-topology   - print thread, cache and NUMA topology
+likwid-perfctr    - configure and read out hardware performance counters on 
+                    Intel and AMD processors
+likwid-powermeter - read out RAPL Energy information and get info about Turbo 
+                    mode steps
+likwid-pin        - pin your threaded application (pthread, Intel and gcc
+                    OpenMP) to dedicated processors
+likwid-bench      - Micro benchmarking platform
+likwid-genTopoCfg - Dumps topology information to a file
+likwid-mpirun     - Wrapper to start MPI and Hybrid MPI/OpenMP applications
+                    (Supports Intel MPI, OpenMPI and MPICH)
+likwid-perfscope  - Frontend to the timeline mode of likwid-perfctr, plots live
+                    graphs of performance metrics using gnuplot
+likwid-agent      - Monitoring agent for hardware performance counters
+likwid-memsweeper - Sweep memory of NUMA domains and evict cachelines from the
+                    last level cache
+likwid-setFrequencies - Tool to control the CPU frequency
+
+--------------------------------------------------------------------------------
+Download, Build and Install
+--------------------------------------------------------------------------------
+You can get the releases of LIKWID at:
+http://ftp.fau.de/pub/likwid/
+
+For build and installation hints see INSTALL file
+
+--------------------------------------------------------------------------------
+Documentation
+--------------------------------------------------------------------------------
 For a detailed  documentation on the usage of the tools have a look at the
-likwid wiki pages at:
+HTML documentation built with doxygen. Call
+
+make docs
 
-http://code.google.com/p/likwid/wiki/Introduction
+or after installation, look at the man pages.
 
-If you have problems or suggestions please let us know on the likwid mailing list:
+There is also a wiki at the github page:
+https://github.com/rrze-likwid/likwid/
 
+If you have problems or suggestions please let me know on the likwid mailing list:
 http://groups.google.com/group/likwid-users
+or, if it is a bug, open an issue at:
+https://github.com/rrze-likwid/likwid/issues
 
 
 
diff --git a/bench/Makefile b/bench/Makefile
new file mode 100644
index 0000000..7d15a12
--- /dev/null
+++ b/bench/Makefile
@@ -0,0 +1,154 @@
+#
+# =======================================================================================
+#
+#      Filename:  Makefile
+#
+#      Description:  likwid-bench Makefile
+#
+#      Version:   <VERSION>
+#      Released:  <DATE>
+#
+#      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+#      Project:  likwid
+#
+#      Copyright (C) 2013 Jan Treibig
+#
+#      This program is free software: you can redistribute it and/or modify it under
+#      the terms of the GNU General Public License as published by the Free Software
+#      Foundation, either version 3 of the License, or (at your option) any later
+#      version.
+#
+#      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+#      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+#      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+#
+#      You should have received a copy of the GNU General Public License along with
+#      this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+# =======================================================================================
+
+SRC_DIR     = ./src
+MAKE_DIR    = ../make
+
+#DO NOT EDIT BELOW
+
+
+# Dependency chains:
+# *.[ch] -> *.o -> executables
+# *.ptt -> *.pas -> *.s -> *.o -> executables
+# *.txt -> *.h (generated)
+
+include ../config.mk
+include $(MAKE_DIR)/include_$(COMPILER).mk
+include $(MAKE_DIR)/config_checks.mk
+include $(MAKE_DIR)/config_defines.mk
+
+#INCLUDES  += -I./includes -I../src/includes -I../ext/hwloc/include -I../$(COMPILER) -I$(BUILD_DIR)
+INCLUDES  += -I./includes -I$(BUILD_DIR)
+LIBS      +=
+
+#CONFIGURE BUILD SYSTEM
+BUILD_DIR  = ./$(COMPILER)
+Q         ?= @
+
+ifeq ($(COMPILER),MIC)
+BENCH_DIR   = ./phi
+else
+ifeq ($(COMPILER),GCCX86)
+BENCH_DIR   = ./x86
+else
+BENCH_DIR   = ./x86-64
+endif
+endif
+
+STATIC_TARGET_LIB := ../liblikwid.a
+STATIC_HWLOC_LIB := ../ext/hwloc/libhwloc.a
+
+BENCH_LIBS := $(STATIC_TARGET_LIB)
+ifeq ($(USE_HWLOC),true)
+BENCH_LIBS += $(STATIC_HWLOC_LIB)
+endif
+
+ifeq ($(DEBUG),true)
+DEBUG_FLAGS = -g
+DEFINES += -DDEBUG_LIKWID
+else
+DEBUG_FLAGS =
+endif
+
+
+VPATH     = $(SRC_DIR)
+OBJ       = $(patsubst $(SRC_DIR)/%.c, $(BUILD_DIR)/%.o,$(wildcard $(SRC_DIR)/*.c))
+
+OBJ_BENCH  =  $(patsubst $(BENCH_DIR)/%.ptt, $(BUILD_DIR)/%.o,$(wildcard $(BENCH_DIR)/*.ptt))
+
+CPPFLAGS := $(CPPFLAGS) $(DEFINES) $(INCLUDES)
+
+all: $(BUILD_DIR) $(OBJ) $(OBJ_BENCH) likwid-bench_target
+
+# Build ../liblikwid.a via a sub-make if it is missing; $(MAKE) forwards the caller's make flags.
+$(STATIC_TARGET_LIB):
+	@echo "===> Building likwid"
+	$(MAKE) -C ..
+
+likwid-bench_target:  $(BUILD_DIR) $(OBJ) $(OBJ_BENCH) $(STATIC_TARGET_LIB)
+	@echo "===>  LINKING  likwid-bench"
+	$(Q)${CC} $(DEBUG_FLAGS) $(CFLAGS) $(ANSI_CFLAGS) $(CPPFLAGS) ${LFLAGS} -o likwid-bench  likwid-bench.c $(OBJ_BENCH) $(OBJ) $(BENCH_LIBS) -lm
+
+
+$(BUILD_DIR):
+	@mkdir $(BUILD_DIR)
+
+
+#PATTERN RULES
+$(BUILD_DIR)/%.o:  %.c
+	@echo "===>  COMPILE C $@"
+	$(Q)$(CC) -g -c $(DEBUG_FLAGS) $(CFLAGS) $(ANSI_CFLAGS) $(CPPFLAGS) $< -o $@
+	$(Q)$(CC) -g $(DEBUG_FLAGS) $(CPPFLAGS) -MT $(@:.d=.o) -MM  $< > $(BUILD_DIR)/$*.d
+
+
+$(BUILD_DIR)/%.pas:  $(BENCH_DIR)/%.ptt
+	@echo "===>  GENERATE BENCHMARKS"
+	$(Q)$(GEN_PAS) $(BENCH_DIR) $(BUILD_DIR) ./perl/templates
+
+
+$(BUILD_DIR)/%.o:  $(BUILD_DIR)/%.pas
+	@echo "===>  ASSEMBLE  $@"
+	$(Q)$(PAS) -i $(PASFLAGS) -o $(BUILD_DIR)/$*.s $<  '$(DEFINES)'
+	$(Q)$(AS) $(ASFLAGS)  $(BUILD_DIR)/$*.s -o $@
+
+ifeq ($(findstring $(MAKECMDGOALS),clean),)
+-include $(OBJ:.o=.d)
+endif
+
+.PHONY: clean distclean install uninstall
+
+
+.PRECIOUS: $(BUILD_DIR)/%.pas
+
+.NOTPARALLEL:
+
+
+clean:
+	@echo "===>  CLEAN"
+	@rm -rf $(BUILD_DIR)
+	@rm -rf likwid-bench
+
+distclean: clean
+
+install:
+	@echo "===> INSTALL applications to $(PREFIX)/bin"
+	cp -f likwid-bench $(PREFIX)/bin
+	@echo "===> INSTALL man pages to $(MANPREFIX)/man1"
+	@mkdir -p $(MANPREFIX)/man1
+	@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-bench.1 > $(MANPREFIX)/man1/likwid-bench.1
+
+
+uninstall:
+	@echo "===> REMOVING applications from $(PREFIX)/bin"
+	rm -rf $(PREFIX)/bin/likwid-bench
+	@echo "===> REMOVING man pages from $(MANPREFIX)/man1"
+	@rm -f $(MANPREFIX)/man1/likwid-bench.1
+
+
+
diff --git a/bench/includes/allocator.h b/bench/includes/allocator.h
new file mode 100644
index 0000000..a5aba78
--- /dev/null
+++ b/bench/includes/allocator.h
@@ -0,0 +1,50 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  allocator.h
+ *
+ *      Description:  Header File allocator Module. 
+ *
+ *      Version:   4.0
+ *      Released:  16.6.2015
+ *
+ *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Project:  none
+ *
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#ifndef ALLOCATOR_H
+#define ALLOCATOR_H
+
+#include <stdint.h>
+#include <bstrlib.h>
+#include <test_types.h>
+
+#define LLU_CAST (unsigned long long)
+
+extern void allocator_init(int numVectors);
+extern void allocator_finalize();
+extern void allocator_allocateVector(void** ptr,
+        int alignment,
+        uint64_t size,
+        int offset,
+        DataType type,
+        bstring domain);
+
+#endif /*ALLOCATOR_H*/
+
diff --git a/bench/includes/barrier.h b/bench/includes/barrier.h
new file mode 100644
index 0000000..99d4a88
--- /dev/null
+++ b/bench/includes/barrier.h
@@ -0,0 +1,57 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  barrier.h
+ *
+ *      Description:  Header File barrier Module
+ *
+ *      Version:   4.0
+ *      Released:  16.6.2015
+ *
+ *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#ifndef BARRIER_H
+#define BARRIER_H
+
+#include <barrier_types.h>
+
+/**
+ * @brief  Initialize the barrier module
+ * @param  numberOfGroups The total number of barrier groups
+ */
+extern void barrier_init(int numberOfGroups);
+
+/**
+ * @brief  Register a thread for a barrier
+ * @param  threadId The id of the thread to register
+ */
+extern int barrier_registerGroup(int numThreads);
+extern void barrier_registerThread(BarrierData* barr, int groupsId, int threadId);
+
+/**
+ * @brief  Synchronize threads
+ * @param  barr Barrier data of the calling thread (presumably the one
+ *         passed to barrier_registerThread() -- TODO confirm)
+ */
+extern void  barrier_synchronize(BarrierData* barr);
+
+
+#endif /*BARRIER_H*/
diff --git a/bench/includes/barrier_types.h b/bench/includes/barrier_types.h
new file mode 100644
index 0000000..65ae411
--- /dev/null
+++ b/bench/includes/barrier_types.h
@@ -0,0 +1,49 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  barrier_types.h
+ *
+ *      Description:  Type Definitions for barrier Module
+ *
+ *      Version:   4.0
+ *      Released:  16.6.2015
+ *
+ *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#ifndef BARRIER_TYPES_H
+#define BARRIER_TYPES_H
+
+#include <stdint.h>
+
+typedef struct {
+    int        numberOfThreads;
+    int        offset;
+    int        val;
+    int*       index;
+    volatile int*  bval;
+} BarrierData;
+
+typedef struct {
+    int*       groupBval;
+    int        numberOfThreads;
+} BarrierGroup;
+
+#endif /*BARRIER_TYPES_H*/
diff --git a/bench/includes/bstrlib.h b/bench/includes/bstrlib.h
new file mode 120000
index 0000000..daa8a68
--- /dev/null
+++ b/bench/includes/bstrlib.h
@@ -0,0 +1 @@
+../../src/includes/bstrlib.h
\ No newline at end of file
diff --git a/bench/includes/likwid.h b/bench/includes/likwid.h
new file mode 100644
index 0000000..82d3b4e
--- /dev/null
+++ b/bench/includes/likwid.h
@@ -0,0 +1,1069 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  likwid.h
+ *
+ *      Description:  Header File of likwid API
+ *
+ *      Version:   4.0
+ *      Released:  16.6.2015
+ *
+ *      Authors:  Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#ifndef LIKWID_H
+#define LIKWID_H
+
+#include <stdint.h>
+#include <errno.h>
+#include <string.h>
+
+#include <bstrlib.h>
+
+#define DEBUGLEV_ONLY_ERROR 0
+#define DEBUGLEV_INFO 1
+#define DEBUGLEV_DETAIL 2
+#define DEBUGLEV_DEVELOP 3
+
+extern int perfmon_verbosity;
+
+/** \addtogroup MarkerAPI Marker API module
+*  @{
+*/
+/*!
+\def LIKWID_MARKER_INIT
+Shortcut for likwid_markerInit() if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_MARKER_THREADINIT
+Shortcut for likwid_markerThreadInit() if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_MARKER_REGISTER(regionTag)
+Shortcut for likwid_markerRegisterRegion() with \a regionTag if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_MARKER_START(regionTag)
+Shortcut for likwid_markerStartRegion() with \a regionTag if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_MARKER_STOP(regionTag)
+Shortcut for likwid_markerStopRegion() with \a regionTag if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_MARKER_GET(regionTag, nevents, events, time, count)
+Shortcut for likwid_markerGetRegion() for \a regionTag if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_MARKER_SWITCH
+Shortcut for likwid_markerNextGroup() if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_MARKER_CLOSE
+Shortcut for likwid_markerClose() if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/** @}*/
+
+#ifdef LIKWID_PERFMON
+#define LIKWID_MARKER_INIT likwid_markerInit()
+#define LIKWID_MARKER_THREADINIT likwid_markerThreadInit()
+#define LIKWID_MARKER_SWITCH likwid_markerNextGroup()
+#define LIKWID_MARKER_REGISTER(regionTag) likwid_markerRegisterRegion(regionTag)
+#define LIKWID_MARKER_START(regionTag) likwid_markerStartRegion(regionTag)
+#define LIKWID_MARKER_STOP(regionTag) likwid_markerStopRegion(regionTag)
+#define LIKWID_MARKER_CLOSE likwid_markerClose()
+#define LIKWID_MARKER_GET(regionTag, nevents, events, time, count) likwid_markerGetRegion(regionTag, nevents, events, time, count)
+#else
+#define LIKWID_MARKER_INIT
+#define LIKWID_MARKER_THREADINIT
+#define LIKWID_MARKER_SWITCH
+#define LIKWID_MARKER_REGISTER(regionTag)
+#define LIKWID_MARKER_START(regionTag)
+#define LIKWID_MARKER_STOP(regionTag)
+#define LIKWID_MARKER_CLOSE
+#define LIKWID_MARKER_GET(regionTag, nevents, events, time, count)
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+
+/*
+################################################################################
+# Marker API related functions
+################################################################################
+*/
+/** \addtogroup MarkerAPI Marker API module
+*  @{
+*/
+/*! \brief Initialize LIKWID's marker API
+
+Must be called in serial region of the application to set up basic data structures
+of LIKWID. 
+Reads environment variables: 
+- LIKWID_MODE (access mode)
+- LIKWID_MASK (event bitmask)
+- LIKWID_EVENTS (event string)
+- LIKWID_THREADS (cpu list separated by ,)
+- LIKWID_GROUPS (amount of groups)
+*/
+extern void likwid_markerInit(void);
+/*! \brief Initialize LIKWID's marker API for the current thread
+
+Must be called in parallel region of the application to set up basic data structures
+of LIKWID. Before you can call likwid_markerThreadInit() you have to call likwid_markerInit().
+
+*/
+extern void likwid_markerThreadInit(void);
+/*! \brief Select next group to measure
+
+Must be called in parallel region of the application to switch group on every CPU.
+*/
+extern void likwid_markerNextGroup(void);
+/*! \brief Close LIKWID's marker API
+
+Must be called in serial region of the application. It gathers all data of regions and
+writes them out to a file (filepath in env variable LIKWID_FILEPATH).
+*/
+extern void likwid_markerClose(void);
+/*! \brief Register a measurement region
+
+Initializes the hashTable entry in order to reduce execution time of likwid_markerStartRegion()
+@param regionTag [in] Initialize data using this string
+@return Error code
+*/
+extern int likwid_markerRegisterRegion(const char* regionTag);
+/*! \brief Start a measurement region
+
+Reads the values of all configured counters and saves the results under the name given
+in regionTag.
+@param regionTag [in] Store data using this string
+@return Error code of start operation
+*/
+extern int likwid_markerStartRegion(const char* regionTag);
+/*! \brief Stop a measurement region
+
+Reads the values of all configured counters and saves the results under the name given
+in regionTag. The measurement data of the stopped region gets summed up in global region counters.
+@param regionTag [in] Store data using this string
+@return Error code of stop operation
+*/
+extern int likwid_markerStopRegion(const char* regionTag);
+
+/*! \brief Get accumulated data of a code region
+
+Get the accumulated data of the current thread for the given regionTag.
+@param regionTag [in] Print data using this string
+@param nr_events [in,out] Length of events array
+@param events [out] Events array for the intermediate results
+@param time [out] Accumulated measurement time
+@param count [out] Call count of the code region
+*/
+extern void likwid_markerGetRegion(const char* regionTag, int* nr_events, double* events, double *time, int *count);
+/* utility routines */
+/*! \brief Get CPU ID of the current process/thread
+
+Returns the ID of the CPU the current process or thread is running on.
+@return current CPU ID
+*/
+extern int  likwid_getProcessorId(void);
+/*! \brief Pin the current process to given CPU
+
+Pin the current process to the given CPU ID. The process cannot be scheduled to 
+another CPU after pinning but the pinning can be changed anytime with this function.
+@param [in] processorId CPU ID to pin the current process to
+@return error code (1 for success, 0 for error)
+*/
+extern int  likwid_pinProcess(int processorId);
+/*! \brief Pin the current thread to given CPU
+
+Pin the current thread to the given CPU ID. The thread cannot be scheduled to 
+another CPU after pinning but the pinning can be changed anytime with this function
+@param [in] processorId CPU ID to pin the current thread to
+@return error code (1 for success, 0 for error)
+*/
+extern int  likwid_pinThread(int processorId);
+/** @}*/
+
+/* 
+################################################################################
+# Access client related functions
+################################################################################
+*/
+/** \addtogroup AccessClient Access client module
+ *  @{
+ */
+
+/*! \brief Enum for the access modes
+
+LIKWID supports multiple access modes to the MSR and PCI performance monitoring 
+registers. For direct access the user must have sufficient privileges to access the
+MSR and PCI devices. The daemon mode forwards the operations to a daemon with 
+higher privileges.
+*/
+typedef enum {
+    ACCESSMODE_DIRECT = 0, /*!< \brief Access performance monitoring registers directly */
+    ACCESSMODE_DAEMON = 1 /*!< \brief Use the access daemon to access the registers */
+} AccessMode;
+
+/*! \brief Set accessClient mode
+
+Sets the mode in which the MSR and PCI registers are accessed: 0 for direct access (probably root privileges required) and 1 for access through the access daemon. It must be called before accessClient_init().
+@param [in] mode (0=direct, 1=daemon)
+*/
+extern void accessClient_setaccessmode(int mode);
+/*! \brief Initialize socket for communication with the access daemon
+
+Initializes the file descriptor by starting and connecting to a new access daemon.
+@param [out] socket_fd Pointer to socket file descriptor
+*/
+extern void accessClient_init(int* socket_fd);
+/*! \brief Destroy socket for communication with the access daemon
+
+Destroys the file descriptor by disconnecting and shutting down the access daemon
+@param [in] socket_fd socket file descriptor
+*/
+extern void accessClient_finalize(int socket_fd);
+/** @}*/
+
+/*
+################################################################################
+# Config file related functions
+################################################################################
+*/
+/** \addtogroup Config Config file module
+*  @{
+*/
+/*! \brief Structure holding values of the configuration file
+
+LIKWID supports the definition of runtime values in a configuration file. The 
+most important configurations in most cases are the path to the access daemon and 
+the corresponding access mode. In order to avoid reading in the system topology
+at each start, a path to a topology file can be set. The other values are mostly
+used internally.
+*/
+typedef struct {
+    char* configFileName; /*!< \brief Path to the configuration file */
+    char* topologyCfgFileName; /*!< \brief Path to the topology file */
+    char* daemonPath; /*!< \brief Path of the access daemon */
+    AccessMode daemonMode; /*!< \brief Access mode to the MSR and PCI registers */
+    int maxNumThreads; /*!< \brief Maximum number of HW threads */
+    int maxNumNodes; /*!< \brief Maximum number of NUMA nodes */
+} Configuration;
+
+/** \brief Pointer for exporting the Configuration data structure */
+typedef Configuration* Configuration_t;
+/*! \brief Read the config file of LIKWID, if it exists
+
+Search for the LIKWID config file and read in its values.
+Currently the paths /usr/local/etc/likwid.cfg, /etc/likwid.cfg and the path
+defined in config.mk are checked.
+@return error code (0 for success, -EFAULT if no file can be found)
+*/
+extern int init_configuration(void);
+/*! \brief Destroy the config structure
+
+Destroys the current config structure and frees all allocated memory for path names
+@return error code (0 for success, -EFAULT if config structure not initialized)
+*/
+extern int destroy_configuration(void);
+
+
+/*! \brief Retrieve the config structure
+
+Get the initialized configuration
+\sa Configuration_t
+@return Configuration_t (pointer to internal Configuration structure)
+*/
+extern Configuration_t get_configuration(void);
+/** @}*/
+/* 
+################################################################################
+# CPU topology related functions
+################################################################################
+*/
+/** \addtogroup CPUTopology CPU information module
+*  @{
+*/
+/*! \brief Structure with general CPU information
+
+General information covers CPU family, model, name and current clock and vendor 
+specific information like the version of Intel's performance monitoring facility.
+*/
+typedef struct {
+    uint32_t family; /*!< \brief CPU family ID*/
+    uint32_t model; /*!< \brief CPU model ID */
+    uint32_t stepping; /*!< \brief Stepping (version) of the CPU */
+    uint64_t clock; /*!< \brief Current clock frequency of the executing CPU*/
+    int      turbo; /*!< \brief Flag if CPU has a turbo mode */
+    char*  osname; /*!< \brief Name of the CPU reported by OS */
+    char*  name; /*!< \brief Name of the CPU as identified by LIKWID */
+    char*  short_name; /*!< \brief Short name of the CPU*/
+    char*  features; /*!< \brief String with all features supported by the CPU*/
+    int         isIntel; /*!< \brief Flag if it is an Intel CPU*/
+    int     supportUncore; /*!< \brief Flag if system has Uncore performance monitors */
+    uint32_t featureFlags; /*!< \brief Mask of all features supported by the CPU*/
+    uint32_t perf_version; /*!< \brief Version of Intel's performance monitoring facility */
+    uint32_t perf_num_ctr; /*!< \brief Number of general purpose core-local performance monitoring counters */
+    uint32_t perf_width_ctr; /*!< \brief Bit width of fixed and general purpose counters */
+    uint32_t perf_num_fixed_ctr; /*!< \brief Number of fixed purpose core-local performance monitoring counters */
+} CpuInfo;
+
+/*! \brief Structure with IDs of a HW thread
+
+For each HW thread this structure stores the ID of the thread inside a CPU, the
+CPU core ID of the HW thread and the CPU socket ID.
+\extends CpuTopology
+*/
+typedef struct {
+    uint32_t threadId; /*!< \brief ID of HW thread inside the CPU core */
+    uint32_t coreId; /*!< \brief ID of CPU core that executes the HW thread */
+    uint32_t packageId; /*!< \brief ID of CPU socket containing the HW thread */
+    uint32_t apicId; /*!< \brief ID of HW thread retrieved through the Advanced Programmable Interrupt Controller */
+    uint32_t inCpuSet; /*!< \brief Flag if the HW thread is inside the CPU set (presumed from the field name, TODO confirm) */
+} HWThread;
+
+/*! \brief Enum of possible caches
+
+CPU caches can have different tasks and hold different kind of data. This enum lists all shapes used in all supported CPUs
+\extends CacheLevel
+*/
+typedef enum {
+    NOCACHE=0, /*!< \brief No cache used as undef value */
+    DATACACHE, /*!< \brief Cache holding data cache lines */
+    INSTRUCTIONCACHE, /*!< \brief Cache holding instruction cache lines */
+    UNIFIEDCACHE, /*!< \brief Cache holding both instruction and data cache lines */
+    ITLB, /*!< \brief Translation Lookaside Buffer cache for instruction pages */
+    DTLB /*!< \brief Translation Lookaside Buffer cache for data pages */
+} CacheType;
+
+/*! \brief Structure describing a cache level
+
+CPUs are connected to a cache hierarchy with different amount of caches at each level. The CacheLevel structure holds general information about the cache.
+\extends CpuTopology
+*/
+typedef struct {
+    uint32_t level; /*!< \brief Level of the cache in the hierarchy */
+    CacheType type; /*!< \brief Type of the cache */
+    uint32_t associativity; /*!< \brief Amount of cache lines held by each set */
+    uint32_t sets; /*!< \brief Amount of sets */
+    uint32_t lineSize; /*!< \brief Size in bytes of one cache line */
+    uint32_t size; /*!< \brief Size in bytes of the cache */
+    uint32_t threads; /*!< \brief Number of HW threads connected to the cache */
+    uint32_t inclusive; /*!< \brief Flag if cache is inclusive (holds also cache lines available in caches nearer to the CPU) or exclusive */
+} CacheLevel;
+
+/*! \brief Structure describing the topology of the HW threads in the system
+
+This structure describes the topology at HW thread level like the amount of HW threads, how they are distributed over the CPU sockets/packages and how the caching hierarchy is assembled.
+*/
+typedef struct {
+    uint32_t numHWThreads; /*!< \brief Amount of HW threads in the system and length of \a threadPool */
+    uint32_t activeHWThreads; /*!< \brief Amount of active HW threads in the system (NOTE(review): what counts as active is not visible in this header, confirm) */
+    uint32_t numSockets; /*!< \brief Amount of CPU sockets/packages in the system */
+    uint32_t numCoresPerSocket; /*!< \brief Amount of physical cores in one CPU socket/package */
+    uint32_t numThreadsPerCore; /*!< \brief Amount of HW threads in one physical CPU core */
+    uint32_t numCacheLevels; /*!< \brief Amount of caches for each HW thread and length of \a cacheLevels */
+    HWThread* threadPool; /*!< \brief List of all HW thread descriptions */
+    CacheLevel*  cacheLevels; /*!< \brief List of all caches in the hierarchy */
+    struct treeNode* topologyTree; /*!< \brief Anchor for a tree structure describing the system topology */
+} CpuTopology;
+
+/*! \brief Variable holding the global cpu information structure */
+extern CpuInfo cpuid_info;
+/*! \brief Variable holding the global cpu topology structure */
+extern CpuTopology cpuid_topology;
+
+/** \brief Pointer for exporting the CpuInfo data structure */
+typedef CpuInfo* CpuInfo_t;
+/** \brief Pointer for exporting the CpuTopology data structure */
+typedef CpuTopology* CpuTopology_t;
+/*! \brief Initialize topology information
+
+CpuInfo_t and CpuTopology_t are initialized by either HWLOC, CPUID/ProcFS or topology file if present. The topology file name can be configured in the configuration file. Furthermore, the paths /etc/likwid_topo.cfg and <PREFIX>/etc/likwid_topo.cfg are checked.
+\sa CpuInfo_t and CpuTopology_t
+@return always 0
+*/
+extern int topology_init(void);
+/*! \brief Retrieve CPU topology of the current machine
+
+\sa CpuTopology_t
+@return CpuTopology_t (pointer to internal cpuid_topology structure)
+*/
+extern CpuTopology_t get_cpuTopology(void);
+/*! \brief Retrieve CPU information of the current machine
+
+Get the previously initialized CPU info structure containing number of CPUs/Threads
+\sa CpuInfo_t
+@return CpuInfo_t (pointer to internal cpuid_info structure)
+*/
+extern CpuInfo_t get_cpuInfo(void);
+/*! \brief Destroy topology structures CpuInfo_t and CpuTopology_t.
+
+Retrieved pointers to the structures are not valid anymore after this function call
+\sa CpuInfo_t and CpuTopology_t
+*/
+extern void topology_finalize(void);
+/*! \brief Print all supported architectures
+*/
+extern void print_supportedCPUs(void);
+/** @}*/
+/* 
+################################################################################
+# NUMA related functions
+################################################################################
+*/
+/** \addtogroup NumaTopology NUMA memory topology module
+ *  @{
+ */
+/*! \brief CPUs in NUMA node and general information about a NUMA domain
+
+The NumaNode structure describes the topology and holds general information of a
+NUMA node. The structure is filled by calling numa_init() by either the HWLOC 
+library or by evaluating the /proc filesystem.
+\extends NumaTopology
+*/
+typedef struct {
+    uint32_t id; /*!< \brief ID of the NUMA node */
+    uint64_t totalMemory; /*!< \brief Amount of memory in the NUMA node */
+    uint64_t freeMemory; /*!< \brief Amount of free memory in the NUMA node */
+    uint32_t numberOfProcessors; /*!< \brief Number of processors covered by the NUMA node and length of \a processors */
+    uint32_t*  processors; /*!< \brief List of HW threads in the NUMA node */
+    uint32_t*  processorsCompact; /*!< \brief Currently unused */
+    uint32_t numberOfDistances; /*!< \brief Amount of distances to the other NUMA nodes in the system and self  */
+    uint32_t*  distances; /*!< \brief List of distances to the other NUMA nodes and self */
+} NumaNode;
+
+
+/*! \brief  The NumaTopology structure describes all NUMA nodes in the current system.
+*/
+typedef struct {
+    uint32_t numberOfNodes; /*!< \brief Number of NUMA nodes in the system and length of \a nodes  */
+    NumaNode* nodes; /*!< \brief List of NUMA nodes */
+} NumaTopology;
+
+/*! \brief Variable holding the global NUMA information structure */
+extern NumaTopology numa_info;
+
+/** \brief Pointer for exporting the NumaTopology data structure */
+typedef NumaTopology* NumaTopology_t;
+
+/*! \brief Initialize NUMA information
+
+Initialize NUMA information NumaTopology_t using either HWLOC or CPUID/ProcFS. If
+a topology config file is present it is read at topology_init() and fills \a NumaTopology_t
+\sa NumaTopology_t
+@return error code (0 for success, -1 if initialization failed)
+*/
+extern int numa_init(void);
+/*! \brief Retrieve NUMA information of the current machine
+
+Get the previously initialized NUMA info structure
+\sa NumaTopology_t
+@return NumaTopology_t (pointer to internal numa_info structure)
+*/
+extern NumaTopology_t get_numaTopology(void);
+/*! \brief Set memory allocation policy to interleaved
+
+Set the memory allocation policy to interleaved for given list of CPUs
+@param [in] processorList List of processors
+@param [in] numberOfProcessors Length of processor list
+*/
+extern void numa_setInterleaved(int* processorList, int numberOfProcessors);
+/*! \brief Allocate memory from a specific NUMA node
+@param [in,out] ptr Start pointer of memory
+@param [in] size Size for the allocation
+@param [in] domainId ID of NUMA node for the allocation
+*/
+extern void numa_membind(void* ptr, size_t size, int domainId);
+/*! \brief Destroy NUMA information structure
+
+Destroys the NUMA information structure NumaTopology_t. Retrieved pointers
+to the structures are not valid anymore after this function call
+\sa NumaTopology_t
+*/
+extern void numa_finalize(void);
+/*! \brief Retrieve the number of NUMA nodes
+
+Returns the number of NUMA nodes of the current machine. Can also be read out of
+NumaTopology_t
+\sa NumaTopology_t
+@return Number of NUMA nodes
+*/
+extern int likwid_getNumberOfNodes(void);
+/** @}*/
+/* 
+################################################################################
+# Affinity domains related functions
+################################################################################
+*/
+/** \addtogroup AffinityDomains Thread affinity module
+ *  @{
+ */
+
+/*! \brief The AffinityDomain data structure describes a single domain in the current system
+
+The AffinityDomain data structure describes a single domain in the current system. Example domains are NUMA nodes, CPU sockets/packages or LLC (Last Level Cache) cache domains.
+\extends AffinityDomains
+*/
+typedef struct {
+    bstring tag; /*!< \brief Bstring with the ID for the affinity domain. Currently possible values: N (node), SX (socket/package X), CX (LLC cache domain X) and MX (memory domain X) */
+    uint32_t numberOfProcessors; /*!< \brief Number of HW threads in the domain and length of \a processorList */
+    uint32_t numberOfCores; /*!< \brief Number of CPU cores in the domain */
+    int*  processorList; /*!< \brief List of HW thread IDs in the domain */
+} AffinityDomain;
+
+/*! \brief The AffinityDomains data structure holds different count variables describing the
+various system layers
+
+Affinity domains are for example the amount of NUMA domains, CPU sockets/packages or LLC 
+(Last Level Cache) cache domains of the current machine. Moreover a list of
+\a domains holds the processor lists for each domain that are used for
+scheduling processes to domain specific HW threads. Some amounts are duplicates
+or derivation of values in \a CpuInfo, \a CpuTopology and \a NumaTopology.
+*/
+typedef struct {
+    uint32_t numberOfSocketDomains; /*!< \brief Number of CPU sockets/packages in the system */
+    uint32_t numberOfNumaDomains; /*!< \brief Number of NUMA nodes in the system */
+    uint32_t numberOfProcessorsPerSocket; /*!< \brief Number of HW threads per socket/package in the system */
+    uint32_t numberOfCacheDomains; /*!< \brief Number of LLC caches in the system */
+    uint32_t numberOfCoresPerCache; /*!< \brief Number of CPU cores per LLC cache in the system */
+    uint32_t numberOfProcessorsPerCache; /*!< \brief Number of HW threads per LLC cache in the system */
+    uint32_t numberOfAffinityDomains; /*!< \brief Number of affinity domains in the current system  and length of \a domains array */
+    AffinityDomain* domains; /*!< \brief List of all domains in the system */
+} AffinityDomains;
+
+/** \brief Pointer for exporting the AffinityDomains data structure */
+typedef AffinityDomains* AffinityDomains_t;
+
+/*! \brief Initialize affinity information
+
+Initialize affinity information AffinityDomains_t using the data of the structures
+\a CpuInfo_t, CpuTopology_t and NumaTopology_t
+\sa AffinityDomains_t
+*/
+extern void affinity_init(void);
+/*! \brief Retrieve affinity structure
+
+Get the previously initialized affinity info structure
+\sa AffinityDomains_t
+@return AffinityDomains_t (pointer to internal affinityDomains structure)
+*/
+extern AffinityDomains_t get_affinityDomains(void);
+/*! \brief Pin process to a CPU
+
+Pin process to a CPU. Duplicate of likwid_pinProcess()
+@param [in] processorId CPU ID for pinning
+*/
+extern void affinity_pinProcess(int processorId);
+/*! \brief Pin thread to a CPU
+
+Pin thread to a CPU. Duplicate of likwid_pinThread()
+@param [in] processorId CPU ID for pinning
+*/
+extern void affinity_pinThread(int processorId);
+/*! \brief Return the CPU ID where the current process runs.
+
+@return CPU ID
+*/
+extern int affinity_processGetProcessorId(void);
+/*! \brief Return the CPU ID where the current thread runs.
+
+@return CPU ID
+*/
+extern int affinity_threadGetProcessorId(void);
+/*! \brief Destroy affinity information structure
+
+Destroys the affinity information structure AffinityDomains_t. Retrieved pointers
+to the structures are not valid anymore after this function call
+\sa AffinityDomains_t
+*/
+extern void affinity_finalize(void);
+/** @}*/
+
+/*
+################################################################################
+# Performance monitoring related functions
+################################################################################
+*/
+/** \addtogroup PerfMon Performance monitoring module
+ *  @{
+ */
+/*! \brief Initialize performance monitoring facility
+
+Initialize the performance monitoring feature by creating basic data structures.
+The access mode must already be set when calling perfmon_init()
+@param [in] nrThreads Amount of threads
+@param [in] threadsToCpu List of CPUs
+@return error code (0 on success, -ERRORCODE on failure)
+*/
+extern int perfmon_init(int nrThreads, int threadsToCpu[]);
+
+/*! \brief Initialize performance monitoring maps
+
+Initialize the performance monitoring maps for counters, events and Uncore boxes
+for the current architecture. topology_init() and numa_init() must be called before calling
+perfmon_init_maps()
+\sa RegisterMap list, PerfmonEvent list and BoxMap list
+*/
+extern void perfmon_init_maps(void);
+/*! \brief Add an event string to LIKWID
+
+An event string looks like Eventname:Countername(:Option1:Option2:...),...
+The eventname, countername and options are checked for availability.
+@param [in] eventCString Event string
+@return Returns the ID of the new eventSet
+*/
+extern int perfmon_addEventSet(char* eventCString);
+/*! \brief Setup all performance monitoring counters of an eventSet
+
+An event string looks like Eventname:Countername(:Option1:Option2:...),...
+The eventname, countername and options are checked for availability.
+@param [in] groupId ID of the eventSet (returned from perfmon_addEventSet())
+@return error code (-ENOENT if groupId is invalid and -1 if the counters of one CPU cannot be set up)
+*/
+extern int perfmon_setupCounters(int groupId);
+/*! \brief Start performance monitoring counters
+
+Start the counters that have been previously set up by perfmon_setupCounters().
+The counter registers are zeroed before enabling the counters.
+@return 0 on success and -(thread_id+1) for error
+*/
+extern int perfmon_startCounters(void);
+/*! \brief Stop performance monitoring counters 
+
+Stop the counters that have been previously started by perfmon_startCounters().
+All config registers get zeroed before reading the counter register.
+@return 0 on success and -(thread_id+1) for error
+*/
+extern int perfmon_stopCounters(void);
+/*! \brief Read the performance monitoring counters on all CPUs
+
+Read the counters that have been previously started by perfmon_startCounters().
+The counters are stopped directly to avoid interference of LIKWID with the measured
+code. Before returning, the counters are started again.
+@return 0 on success and -(thread_id+1) for error
+*/
+extern int perfmon_readCounters(void);
+/*! \brief Read the performance monitoring counters on one CPU
+
+Read the counters that have been previously started by perfmon_startCounters().
+The counters are stopped directly to avoid interference of LIKWID with the measured
+code. Before returning, the counters are started again. Only one CPU is read.
+@param [in] cpu_id CPU ID of the CPU that should be read
+@return 0 on success and -(thread_id+1) for error
+*/
+extern int perfmon_readCountersCpu(int cpu_id);
+/*! \brief Switch the active eventSet to a new one
+
+Stops the currently running counters, switches the eventSet by setting up the
+counters and starts the counters.
+@param [in] new_group ID of group that should be switched to.
+@return 0 on success and -(thread_id+1) for error
+*/
+extern int perfmon_switchActiveGroup(int new_group);
+/*! \brief Close the performance monitoring facility of LIKWID
+
+Deallocates all internal data that is used during performance monitoring. Also
+the counter values are not accessible after this function.
+*/
+extern void perfmon_finalize(void);
+/*! \brief Get the results of the specified group, counter and thread
+
+Get the result of the last measurement cycle. The function takes care of counter
+overflows that occurred and of counter values that need to be calculated with multipliers.
+@param [in] groupId ID of the group that should be read
+@param [in] eventId ID of the event that should be read
+@param [in] threadId ID of the thread/cpu that should be read
+@return The counter result
+*/
+extern double perfmon_getResult(int groupId, int eventId, int threadId);
+/*! \brief Get the number of configured event groups
+
+@return Number of groups
+*/
+extern int perfmon_getNumberOfGroups(void);
+/*! \brief Get the number of configured eventSets in group
+
+@param [in] groupId ID of group
+@return Number of eventSets
+*/
+extern int perfmon_getNumberOfEvents(int groupId);
+/*! \brief Get the measurement time of a group
+
+@param [in] groupId ID of group
+@return Time in seconds the event group was measured
+*/
+extern double perfmon_getTimeOfGroup(int groupId);
+/*! \brief Get the ID of the currently set up event group
+
+@return Number of active group
+*/
+extern int perfmon_getIdOfActiveGroup(void);
+/*! \brief Get the number of threads specified at perfmon_init()
+
+@return Number of threads
+*/
+extern int perfmon_getNumberOfThreads(void);
+/** @}*/
+
+/*
+################################################################################
+# Time measurements related functions
+################################################################################
+*/
+
+/** \addtogroup TimerMon Time measurement module
+ *  @{
+ */
+
+/*! \brief Union holding one cycle count, either as a single 64 bit value or as two 32 bit fields
+\extends TimerData
+*/
+typedef union
+{
+    uint64_t int64; /*!< \brief Cycle count in 64 bit */
+    struct {uint32_t lo, hi;} int32; /*!< \brief Cycle count stored in two 32 bit fields */
+} TscCounter;
+
+/*! \brief Struct defining the start and stop time of a time interval
+*/
+typedef struct {
+    TscCounter start; /*!< \brief Cycles at start */
+    TscCounter stop; /*!< \brief Cycles at stop */
+} TimerData;
+
+/*! \brief Initialize timer by retrieving baseline frequency and cpu clock
+*/
+extern void timer_init( void );
+/*! \brief Return the measured interval in seconds
+
+@param [in] time Structure holding the cycle count at start and stop
+@return Time in seconds
+*/
+extern double timer_print( TimerData* time);
+/*! \brief Return the measured interval in cycles
+
+@param [in] time Structure holding the cycle count at start and stop
+@return Time in cycles
+*/
+extern uint64_t timer_printCycles( TimerData* time);
+/*! \brief Return the CPU clock determined at timer_init
+
+@return CPU clock
+*/
+extern uint64_t timer_getCpuClock( void );
+/*! \brief Return the baseline CPU clock determined at timer_init
+
+@return Baseline CPU clock
+*/
+extern uint64_t timer_getBaseline( void );
+/*! \brief Start time measurement
+
+@param [in,out] time Structure holding the cycle count at start
+*/
+extern void timer_start( TimerData* time );
+/*! \brief Stop time measurement
+
+@param [in,out] time Structure holding the cycle count at stop
+*/
+extern void timer_stop ( TimerData* time);
+
+/** @}*/
+
+/* 
+################################################################################
+# Power measurements related functions
+################################################################################
+*/
+/** \addtogroup PowerMon Power and Energy monitoring module
+ *  @{
+ */
+
+/*!
+\def NUM_POWER_DOMAINS
+Amount of currently supported RAPL domains
+*/
+#define NUM_POWER_DOMAINS 4
+/*! \brief List of all RAPL domain names
+*/
+extern const char* power_names[NUM_POWER_DOMAINS];
+
+/*!
+\def POWER_DOMAIN_SUPPORT_STATUS
+Flag to check in PowerDomain's supportFlag if the status msr registers are available
+*/
+#define POWER_DOMAIN_SUPPORT_STATUS (1ULL<<0)
+/*!
+\def POWER_DOMAIN_SUPPORT_LIMIT
+Flag to check in PowerDomain's supportFlag if the limit msr registers are available
+*/
+#define POWER_DOMAIN_SUPPORT_LIMIT (1ULL<<1)
+/*!
+\def POWER_DOMAIN_SUPPORT_POLICY
+Flag to check in PowerDomain's supportFlag if the policy msr registers are available
+*/
+#define POWER_DOMAIN_SUPPORT_POLICY (1ULL<<2)
+/*!
+\def POWER_DOMAIN_SUPPORT_PERF
+Flag to check in PowerDomain's supportFlag if the perf msr registers are available
+*/
+#define POWER_DOMAIN_SUPPORT_PERF (1ULL<<3)
+/*!
+\def POWER_DOMAIN_SUPPORT_INFO
+Flag to check in PowerDomain's supportFlag if the info msr registers are available
+*/
+#define POWER_DOMAIN_SUPPORT_INFO (1ULL<<4)
+
+
+/*! \brief Information structure of CPU's turbo mode
+\extends PowerInfo
+*/
+typedef struct {
+    int numSteps; /*!< \brief Amount of turbo mode steps/frequencies */
+    double* steps; /*!< \brief List of turbo mode steps */
+} TurboBoost;
+
+/*! \brief Enum for all supported RAPL domains
+\extends PowerDomain
+*/
+typedef enum {
+    PKG = 0, /*!< \brief PKG domain, mostly one CPU socket/package */
+    PP0 = 1, /*!< \brief PP0 domain, not clearly defined by Intel */
+    PP1 = 2, /*!< \brief PP1 domain, not clearly defined by Intel */
+    DRAM = 3 /*!< \brief DRAM domain, the memory modules */
+} PowerType;
+
+/*! \brief Structure describing an RAPL power domain
+\extends PowerInfo
+*/
+typedef struct {
+    PowerType type; /*!< \brief Identifier which RAPL domain is managed by this struct */
+    uint32_t supportFlags; /*!< \brief Bitmask which features are supported by the power domain */
+    double energyUnit; /*!< \brief Multiplier for energy measurements */
+    double tdp; /*!< \brief Thermal Design Power (maximum amount of heat generated by the CPU) */
+    double minPower; /*!< \brief Minimal power consumption of the CPU */
+    double maxPower; /*!< \brief Maximal power consumption of the CPU */
+    double maxTimeWindow; /*!< \brief Minimal power measurement interval */
+} PowerDomain;
+
+/*! \brief Information structure of CPU's power measurement facility
+*/
+typedef struct {
+    double baseFrequency; /*!< \brief Base frequency of the CPU */
+    double minFrequency; /*!< \brief Minimal frequency of the CPU */
+    TurboBoost turbo; /*!< \brief Turbo boost information */
+    int hasRAPL; /*!< \brief RAPL support flag */
+    double powerUnit; /*!< \brief Multiplier for power measurements */
+    double timeUnit; /*!< \brief Multiplier for time information */
+    PowerDomain domains[NUM_POWER_DOMAINS]; /*!< \brief List of power domains */
+} PowerInfo;
+
+/*! \brief Power measurement data for start/stop measurements
+*/
+typedef struct {
+    int domain; /*!< \brief RAPL domain identifier */
+    uint32_t before; /*!< \brief Counter state at start */
+    uint32_t after; /*!< \brief Counter state at stop */
+} PowerData;
+
+/*! \brief Variable holding the global power information structure */
+extern PowerInfo power_info;
+
+/** \brief Pointer for exporting the PowerInfo data structure */
+typedef PowerInfo* PowerInfo_t;
+/** \brief Pointer for exporting the PowerData data structure */
+typedef PowerData* PowerData_t;
+
+/*! \brief Initialize power measurements on specific CPU
+
+Additionally, it reads basic information about the power measurements like 
+minimal measurement time.
+@param [in] cpuId Initialize power facility for this CPU
+@return error code
+*/
+extern int power_init(int cpuId);
+/*! \brief Get a pointer to the power facility information
+
+@return PowerInfo_t pointer
+\sa PowerInfo_t
+*/
+extern PowerInfo_t get_powerInfo(void);
+/*! \brief Read the current power value
+
+@param [in] cpuId Read power facility for this CPU
+@param [in] reg Power register
+@param [out] data Power data
+*/
+extern int power_read(int cpuId, uint64_t reg, uint32_t *data);
+/*! \brief Read the current power value using a specific communication socket
+
+@param [in] socket_fd Communication socket for the read operation
+@param [in] cpuId Read power facility for this CPU
+@param [in] reg Power register
+@param [out] data Power data
+*/
+extern int power_tread(int socket_fd, int cpuId, uint64_t reg, uint32_t *data);
+/*! \brief Start power measurements
+
+@param [in,out] data Data structure holding start and stop values for power measurements
+@param [in] cpuId Start power facility for this CPU
+@param [in] type Which type should be measured
+@return error code
+*/
+extern int power_start(PowerData_t data, int cpuId, PowerType type);
+/*! \brief Stop power measurements
+
+@param [in,out] data Data structure holding start and stop values for power measurements
+@param [in] cpuId Stop power facility for this CPU
+@param [in] type Which type should be measured
+@return error code
+*/
+extern int power_stop(PowerData_t data, int cpuId, PowerType type);
+/*! \brief Print power measurements gathered by power_start() and power_stop()
+
+@param [in] data Data structure holding start and stop values for power measurements
+@return Consumed energy in Joules
+*/
+extern double power_printEnergy(PowerData* data);
+/*! \brief Get energy Unit
+
+@param [in] domain RAPL domain ID
+@return Energy unit of the given RAPL domain
+*/
+extern double power_getEnergyUnit(int domain);
+
+/*! \brief Get the values of the limit register of a domain
+
+@param [in] cpuId CPU ID
+@param [in] domain RAPL domain ID
+@param [out] power Power limit
+@param [out] time Time limit
+@return error code
+*/
+int power_limitGet(int cpuId, PowerType domain, double* power, double* time);
+
+/*! \brief Set the values of the limit register of a domain
+
+@param [in] cpuId CPU ID
+@param [in] domain RAPL domain ID
+@param [in] power Power limit
+@param [in] time Time limit
+@param [in] doClamping Activate clamping (going below OS-requested power level)
+@return error code
+*/
+int power_limitSet(int cpuId, PowerType domain, double power, double time, int doClamping);
+
+/*! \brief Get the state of a power limit, activated or deactivated
+
+@param [in] cpuId CPU ID
+@param [in] domain RAPL domain ID
+@return state, 1 for active, 0 for inactive
+*/
+int power_limitState(int cpuId, PowerType domain);
+
+/*! \brief Free space of power_unit
+*/
+extern void power_finalize(void);
+/** @}*/
+
+/* 
+################################################################################
+# Thermal measurements related functions
+################################################################################
+*/
+/** \addtogroup ThermalMon Thermal monitoring module
+ *  @{
+ */
+/*! \brief Initialize thermal measurements on specific CPU
+
+@param [in] cpuId Initialize thermal facility for this CPU
+*/
+extern void thermal_init(int cpuId);
+/*! \brief Read the current thermal value
+
+@param [in] cpuId Read thermal facility for this CPU
+@param [out] data Thermal data
+*/
+extern int thermal_read(int cpuId, uint32_t *data);
+/*! \brief Read the current thermal value using a specific communication socket
+
+@param [in] socket_fd Communication socket for the read operation
+@param [in] cpuId Read thermal facility for this CPU
+@param [out] data Thermal data
+*/
+extern int thermal_tread(int socket_fd, int cpuId, uint32_t *data);
+/** @}*/
+
+/* 
+################################################################################
+# Timeline daemon related functions
+################################################################################
+*/
+/** \addtogroup Daemon Timeline daemon module
+ *  @{
+ */
+/*! \brief Start timeline daemon
+
+Starts the timeline daemon which reads and prints the counter values after each \a duration time
+@param [in] duration Time interval in ns
+@param [in] outfile File to write the intermediate readings or NULL to write to stderr
+@return 0 on success and -EFAULT if counters cannot be started
+*/
+extern int daemon_start(uint64_t duration, const char* outfile);
+/*! \brief Stop timeline daemon
+
+Stop the timeline daemon using the signal \a sig
+@param [in] sig Signal code to kill the daemon (see signal.h for signal codes)
+@return 0 on success and the negative error code at failure
+*/
+extern int daemon_stop(int sig);
+/** @}*/
+
+/* 
+################################################################################
+# Memory sweeping related functions
+################################################################################
+*/
+/** \addtogroup MemSweep Memory sweeping module
+ *  @{
+ */
+/*! \brief Sweeping the memory of a NUMA node
+
+Sweeps (zeros) the memory of NUMA node with ID \a domainId
+@param [in] domainId NUMA node ID
+*/
+extern void memsweep_domain(int domainId);
+/*! \brief Sweeping the memory of all NUMA nodes covered by CPU list
+
+Sweeps (zeros) the memory of all NUMA nodes containing the CPUs in \a processorList
+@param [in] processorList List of CPU IDs
+@param [in] numberOfProcessors Number of CPUs in list
+*/
+extern void memsweep_threadGroup(int* processorList, int numberOfProcessors);
+/** @}*/
+#ifdef __cplusplus
+}
+#endif
+
+#endif /*LIKWID_H*/
diff --git a/bench/includes/strUtil.h b/bench/includes/strUtil.h
new file mode 100644
index 0000000..7c773a7
--- /dev/null
+++ b/bench/includes/strUtil.h
@@ -0,0 +1,59 @@
+/*
+ * =======================================================================================
+ *      Filename:  strUtil.h
+ *
+ *      Description:  Some string functions
+ *
+ *      Version:   4.0
+ *      Released:  16.6.2015
+ *
+ *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+#ifndef STRUTIL_H
+#define STRUTIL_H
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <bstrlib.h>
+#include <likwid.h>
+
+#include <test_types.h>
+
+typedef struct {
+    bstring domain; /* handed to allocator_allocateVector() as its last argument by likwid-bench.c */
+    int offset;     /* likwid-bench.c rejects offsets that are not a multiple of the test's stride */
+    void* ptr;      /* its address is passed to allocator_allocateVector(), which presumably stores the buffer here */
+} Stream;
+
+typedef struct {
+    uint32_t numberOfThreads; /* number of entries read from processorIds */
+    int* processorIds;        /* copied into ThreadUserData.processors by likwid-bench.c */
+    uint64_t size;            /* passed to allocator_allocateVector() for every stream of the group */
+    Stream* streams;          /* indexed 0 .. test->streams-1 by likwid-bench.c */
+} Workgroup;
+
+
+extern void bstr_to_workgroup(Workgroup* group, const_bstring str, DataType type, int numberOfStreams); /* likwid-bench.c calls this with each -w argument as str and reads *group afterwards — presumably it parses str into *group; confirm in strUtil.c */
+
+#endif
diff --git a/bench/includes/test_types.h b/bench/includes/test_types.h
new file mode 100644
index 0000000..b2cb514
--- /dev/null
+++ b/bench/includes/test_types.h
@@ -0,0 +1,105 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  test_types.h
+ *
+ *      Description:  Type definitions for benchmarking framework
+ *
+ *      Version:   4.0
+ *      Released:  16.6.2015
+ *
+ *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#ifndef TEST_TYPES_H
+#define TEST_TYPES_H
+
+#include <stdint.h>
+#include <bstrlib.h>
+
+typedef void (*FuncPrototype)(); /* type of TestCase.kernel; the empty parentheses declare no parameter list */
+
+typedef enum {
+    SINGLE = 0, /* single precision float */
+    DOUBLE} DataType; /* DOUBLE: double precision float */
+
+typedef enum { /* stream count of a test; the enumerators are consecutive, so STREAM_n == n */
+    STREAM_1 = 1,
+    STREAM_2,
+    STREAM_3,
+    STREAM_4,
+    STREAM_5,
+    STREAM_6,
+    STREAM_7,
+    STREAM_8,
+    STREAM_9,
+    STREAM_10,
+    STREAM_11,
+    STREAM_12,
+    STREAM_13,
+    STREAM_14,
+    STREAM_15,
+    STREAM_16,
+    STREAM_17,
+    STREAM_18,
+    STREAM_19,
+    STREAM_20,
+    STREAM_21,
+    STREAM_22,
+    STREAM_23,
+    STREAM_24,
+    STREAM_25,
+    STREAM_26,
+    STREAM_27,
+    STREAM_28,
+    STREAM_29,
+    STREAM_30,
+    STREAM_31,
+    STREAM_32,
+    STREAM_33,
+    STREAM_34,
+    STREAM_35,
+    STREAM_36,
+    STREAM_37,
+    STREAM_38,
+    MAX_STREAMS} Pattern; /* MAX_STREAMS == 39, one past STREAM_38 */
+
+typedef struct {
+    char* name;           /* compared with the -t / -l argument in likwid-bench.c */
+    Pattern streams;      /* number of streams the test uses */
+    DataType type ;       /* SINGLE or DOUBLE precision data */
+    int stride;           /* loop stride; stream offsets must be a multiple of it */
+    FuncPrototype kernel; /* presumably the benchmark kernel to call — not called in likwid-bench.c's main() */
+    int  flops;           /* factor in the "Number of Flops" and "MFlops/s" output */
+    int  bytes;           /* factor in the "Size", "Data volume" and "MByte/s" output */
+} TestCase;
+
+typedef struct {
+    uint64_t   size;          /* set from Workgroup.size */
+    uint64_t   iter;          /* iteration count; likwid-bench.c starts with 100 */
+    uint32_t   min_runtime;   /* value of the -s option (default 1) */
+    const TestCase* test;     /* the selected test case */
+    uint64_t   cycles;        /* NOTE(review): not assigned in likwid-bench.c's main() — confirm who sets it */
+    uint32_t numberOfThreads; /* number of entries in processors */
+    int* processors;          /* processor IDs, copied from Workgroup.processorIds */
+    void** streams;           /* stream pointers, test->streams entries */
+} ThreadUserData;
+
+#endif /*TEST_TYPES_H*/
diff --git a/bench/includes/threads.h b/bench/includes/threads.h
new file mode 100644
index 0000000..45521c2
--- /dev/null
+++ b/bench/includes/threads.h
@@ -0,0 +1,113 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  threads.h
+ *
+ *      Description:  Header file of pthread interface module
+ *
+ *      Version:   4.0
+ *      Released:  16.6.2015
+ *
+ *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#ifndef THREADS_H
+#define THREADS_H
+
+#include <pthread.h>
+#include <threads_types.h>
+
+#define THREADS_BARRIER pthread_barrier_wait(&threads_barrier)
+
+extern pthread_barrier_t threads_barrier;
+extern ThreadData* threads_data;
+extern ThreadGroup* threads_groups;
+
+
+/**
+ * @brief  Test the maximal possible thread count
+ * @return  numberOfThreads  The number of available threads
+ */
+extern int threads_test(void);
+
+/**
+ * @brief  Initialization of the thread module
+ * @param  numberOfThreads  The total number of threads
+ */
+extern void threads_init(int numberOfThreads);
+
+/**
+ * @brief  Create all threads
+ * @param  startRoutine thread entry function pointer
+ */
+extern void threads_create(void *(*startRoutine)(void*));
+
+/**
+ * @brief  Register User thread data for all threads
+ * @param  data  Reference to the user data struct
+ * @param  func  Optional function pointer to copy data
+ */
+extern void threads_registerDataAll(
+        ThreadUserData* data,
+        threads_copyDataFunc func);
+
+/**
+ * @brief  Register User thread data for one thread
+ * @param  threadId thread Id 
+ * @param  data  Reference to the user data struct
+ * @param  func  Optional function pointer to copy data
+ */
+extern void threads_registerDataThread(
+        int threadId,
+        ThreadUserData* data,
+        threads_copyDataFunc func);
+
+/**
+ * @brief  Register User thread data for a thread group
+ * @param  groupId  group Id
+ * @param  data  Reference to the user data struct
+ * @param  func  Optional function pointer to copy data
+ */
+extern void threads_registerDataGroup(
+        int groupId,
+        ThreadUserData* data,
+        threads_copyDataFunc func);
+
+extern size_t threads_updateIterations(int groupId, size_t demandIter); /* NOTE(review): undocumented; likwid-bench.c calls it once per workgroup before the runTest pass — confirm semantics in threads.c */
+
+/**
+ * @brief  Join the threads and free pthread related data structures
+ * @param
+ */
+extern void threads_join(void);
+
+/**
+ * @brief  Free memory of thread data structures
+ * @param  numberOfGroups The number of groups to destroy
+ */
+extern void threads_destroy(int numberOfGroups);
+
+/**
+ * @brief  Create Thread groups
+ * @param  numberOfGroups The number of groups to create
+ */
+extern void threads_createGroups(int numberOfGroups);
+
+#endif /* THREADS_H */
diff --git a/bench/includes/threads_types.h b/bench/includes/threads_types.h
new file mode 100644
index 0000000..3fc089e
--- /dev/null
+++ b/bench/includes/threads_types.h
@@ -0,0 +1,56 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  threads_types.h
+ *
+ *      Description:  Types file for threads module.
+ *
+ *      Version:   4.0
+ *      Released:  16.6.2015
+ *
+ *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#ifndef THREADS_TYPES_H
+#define THREADS_TYPES_H
+
+#include <stdint.h>
+#include <test_types.h>
+
+typedef struct {
+    int        globalNumberOfThreads;
+    int        numberOfThreads;
+    int        globalThreadId;
+    int        threadId;
+    int        numberOfGroups;
+    int        groupId;
+    double     time;
+    uint64_t   cycles;   /* likwid-bench.c reports the maximum of this value over all threads */
+    ThreadUserData data; /* likwid-bench.c reads size and iter from it — presumably filled via threads_registerData*() */
+} ThreadData;
+
+typedef struct {
+    int        numberOfThreads; /* presumably the number of entries in threadIds — confirm in threads.c */
+    int*       threadIds;
+} ThreadGroup;
+
+typedef void (*threads_copyDataFunc)(ThreadUserData* src,ThreadUserData* dst); /* copy callback of threads_registerData*(); likwid-bench.c passes copyThreadData */
+
+#endif /*THREADS_TYPES_H*/
diff --git a/bench/likwid-bench.c b/bench/likwid-bench.c
new file mode 100644
index 0000000..77f6bd0
--- /dev/null
+++ b/bench/likwid-bench.c
@@ -0,0 +1,424 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  likwid-bench.c
+ *
+ *      Description:  A flexible and extensible benchmarking toolbox
+ *
+ *      Version:   4.0
+ *      Released:  16.6.2015
+ *
+ *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+#include <stdlib.h>
+#include <errno.h>
+#include <stdio.h>
+#include <string.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <ctype.h>
+#include <inttypes.h>
+
+#include <bstrlib.h>
+#include <errno.h>
+#include <threads.h>
+#include <barrier.h>
+#include <testcases.h>
+#include <strUtil.h>
+#include <allocator.h>
+
+#include <likwid.h>
+
+extern void* runTest(void* arg);
+extern void* getIter(void* arg);
+
+/* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
+
+#define HELP_MSG printf("Threaded Memory Hierarchy Benchmark --  Version  %d.%d \n\n",VERSION,RELEASE); \
+    printf("\n"); \
+    printf("Supported Options:\n"); \
+    printf("-h\t Help message\n"); \
+    printf("-a\t List available benchmarks \n"); \
+    printf("-d\t Delimiter used for physical core list (default ,) \n"); \
+    printf("-p\t List available thread domains\n\t or the physical ids of the cores selected by the -c expression \n"); \
+    printf("-s <TIME>\t Seconds to run the test minimally (default 1)\n");\
+    printf("\t If resulting iteration count is below 10, it is normalized to 10.\n");\
+    printf("-l <TEST>\t list properties of benchmark \n"); \
+    printf("-t <TEST>\t type of test \n"); \
+    printf("-w\t <thread_domain>:<size>[:<num_threads>[:<chunk size>:<stride>]-<streamId>:<domain_id>[:<offset>], size in kB, MB or GB  (mandatory)\n"); \
+    printf("\n"); \
+    printf("Usage: likwid-bench -t copy -w S0:100kB:1 \n")
+
+#define VERSION_MSG /* prints the version banner; VERSION and RELEASE are not defined in this file */ \
+    printf("likwid-bench   %d.%d \n\n",VERSION,RELEASE)
+
+/* #####   FUNCTION DEFINITIONS  -  LOCAL TO THIS SOURCE FILE  ############ */
+
+/* Deep-copy *src into *dst: the processors and streams arrays are duplicated so that
+ * dst does not alias storage of src. A failed allocation terminates the program. */
+void copyThreadData(ThreadUserData* src,ThreadUserData* dst)
+{
+    size_t numStreams = (size_t) src->test->streams;
+    size_t numThreads = (size_t) src->numberOfThreads;
+    size_t i;
+
+    *dst = *src;
+    dst->processors = malloc(numThreads * sizeof *dst->processors);
+    dst->streams = malloc(numStreams * sizeof *dst->streams);
+    if ((numThreads && !dst->processors) || (numStreams && !dst->streams))
+    {   /* void callback: failure cannot be reported, and continuing would write through NULL */
+        fprintf(stderr, "copyThreadData: failed to allocate thread data\n");
+        exit(EXIT_FAILURE);
+    }
+    for (i = 0; i < numStreams; i++) dst->streams[i] = src->streams[i];
+    for (i = 0; i < numThreads; i++) dst->processors[i] = src->processors[i];
+}
+
+
+
+/* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
+
+/*
+ * Entry point of likwid-bench. A first getopt pass selects the test (-t) and counts
+ * the workgroups (-w); a second pass parses every -w expression and allocates its
+ * streams. Then the threads are run with runTest and the results are printed.
+ */
+int main(int argc, char** argv)
+{
+    uint64_t iter = 100;
+    uint32_t i;
+    uint32_t j;
+    int globalNumberOfThreads = 0;
+    int optPrintDomains = 0;
+    int c;
+    ThreadUserData myData;
+    bstring testcase = NULL; /* always assigned from optarg before use, so no placeholder string is needed */
+    uint64_t numberOfWorkgroups = 0;
+    int tmp = 0;
+    double time;
+    double cycPerUp = 0.0;
+    const TestCase* test = NULL;
+    uint64_t realSize = 0;
+    uint64_t realIter = 0;
+    uint64_t maxCycles = 0;
+    uint64_t cpuClock = 0;
+    uint64_t demandIter = 0;
+    Workgroup* currentWorkgroup = NULL;
+    Workgroup* groups = NULL;
+    uint32_t min_runtime = 1; /* 1s */
+    bstring HLINE = bfromcstr("");
+    binsertch(HLINE, 0, 80, '-');
+    binsertch(HLINE, 80, 1, '\n');
+
+    /* Handling of command line options */
+    if (argc ==  1)
+    {
+        HELP_MSG;
+        exit(EXIT_SUCCESS);
+    }
+
+    while ((c = getopt (argc, argv, "w:t:s:l:aphv")) != -1) {
+        switch (c)
+        {
+            case 'h':
+                HELP_MSG;
+                exit (EXIT_SUCCESS);
+            case 'v':
+                VERSION_MSG;
+                exit (EXIT_SUCCESS);
+            case 'a':
+                printf(TESTS"\n");
+                exit (EXIT_SUCCESS);
+            case 'w':
+                numberOfWorkgroups++;
+                break;
+            case 's':
+                min_runtime = atoi(optarg);
+                break;
+            case 'l':
+                testcase = bfromcstr(optarg);
+                for (i=0; i<NUMKERNELS; i++)
+                {
+                    if (biseqcstr(testcase, kernels[i].name))
+                    {
+                        test = kernels+i;
+                        break;
+                    }
+                }
+
+                if (i == NUMKERNELS) /* loop ran to the end: no kernel has that name */
+                {
+                    fprintf (stderr, "Unknown test case %s\n",optarg);
+                    return EXIT_FAILURE;
+                }
+                else
+                {
+                    printf("Name: %s\n",test->name);
+                    printf("Number of streams: %d\n",test->streams);
+                    printf("Loop stride: %d\n",test->stride);
+                    printf("Flops: %d\n",test->flops);
+                    printf("Bytes: %d\n",test->bytes);
+                    switch (test->type)
+                    {
+                        case SINGLE:
+                            printf("Data Type: Single precision float\n");
+                            break;
+                        case DOUBLE:
+                            printf("Data Type: Double precision float\n");
+                            break;
+                    }
+                }
+                bdestroy(testcase);
+                exit (EXIT_SUCCESS);
+            case 'p':
+                optPrintDomains = 1;
+                break;
+            case 't':
+                testcase = bfromcstr(optarg);
+
+                for (i=0; i<NUMKERNELS; i++)
+                {
+                    if (biseqcstr(testcase, kernels[i].name))
+                    {
+                        test = kernels+i;
+                        break;
+                    }
+                }
+
+                if (i == NUMKERNELS) /* loop ran to the end: no kernel has that name */
+                {
+                    fprintf (stderr, "Unknown test case %s\n",optarg);
+                    return EXIT_FAILURE;
+                }
+                bdestroy(testcase);
+                break;
+            case '?':
+                if (isprint (optopt))
+                    fprintf (stderr, "Unknown option `-%c'.\n", optopt);
+                else
+                    fprintf (stderr,
+                            "Unknown option character `\\x%x'.\n",
+                            optopt);
+                return EXIT_FAILURE;
+            default:
+                HELP_MSG;
+        }
+    }
+    if ((numberOfWorkgroups == 0) && (!optPrintDomains))
+    {
+        fprintf(stderr, "At least one workgroup (-w) must be set on commandline\n");
+        exit (EXIT_FAILURE);
+    }
+
+    /* The second pass and the report dereference test, so -t is required unless only -p was asked for */
+    if ((test == NULL) && (!optPrintDomains))
+    {
+        fprintf(stderr, "A test case (-t) must be set on commandline\n");
+        exit (EXIT_FAILURE);
+    }
+
+    if (topology_init() != EXIT_SUCCESS)
+    {
+        fprintf(stderr, "Unsupported processor!\n");
+        exit(EXIT_FAILURE);
+    }
+    numa_init();
+    affinity_init();
+    timer_init();
+
+    allocator_init(numberOfWorkgroups * MAX_STREAMS);
+    groups = (Workgroup*) malloc(numberOfWorkgroups*sizeof(Workgroup));
+    tmp = 0;
+
+    if (optPrintDomains)
+    {
+        AffinityDomains_t affinity = get_affinityDomains();
+        printf("Number of Domains %d\n",affinity->numberOfAffinityDomains);
+        for (i=0; i < affinity->numberOfAffinityDomains; i++ )
+        {
+            printf("Domain %d:\n",i);
+            printf("\tTag %s:",bdata(affinity->domains[i].tag));
+
+            for ( j=0; j < affinity->domains[i].numberOfProcessors; j++ )
+            {
+                printf(" %d",affinity->domains[i].processorList[j]);
+            }
+            printf("\n");
+        }
+        exit (EXIT_SUCCESS);
+    }
+
+    optind = 0;
+    while ((c = getopt (argc, argv, "w:t:s:l:i:aphv")) != -1)
+    {
+        switch (c)
+        {
+            case 'w':
+                currentWorkgroup = groups+tmp;
+                testcase = bfromcstr(optarg);
+                bstr_to_workgroup(currentWorkgroup, testcase, test->type, test->streams);
+                bdestroy(testcase);
+                for (i=0; i<  test->streams; i++)
+                {
+                    if (currentWorkgroup->streams[i].offset%test->stride)
+                    {
+                        fprintf (stderr, "Stream %d: offset is not a multiple of stride!\n",i);
+                        return EXIT_FAILURE;
+                    }
+                    allocator_allocateVector(&(currentWorkgroup->streams[i].ptr),
+                            PAGE_ALIGNMENT,
+                            currentWorkgroup->size,
+                            currentWorkgroup->streams[i].offset,
+                            test->type,
+                            currentWorkgroup->streams[i].domain);
+                }
+                tmp++;
+                break;
+            default:
+                continue;
+                break;
+        }
+    }
+
+    /* :WARNING:05/04/2010 08:58:05 AM:jt: At the moment the thread
+     * module only allows equally sized thread groups*/
+    for (i=0; i<numberOfWorkgroups; i++)
+    {
+        globalNumberOfThreads += groups[i].numberOfThreads;
+    }
+
+    printf("%s", bdata(HLINE));
+    printf("LIKWID MICRO BENCHMARK\n");
+    printf("Test: %s\n",test->name);
+    printf("%s", bdata(HLINE));
+    printf("Using %" PRIu64 " work groups\n",numberOfWorkgroups);
+    printf("Using %d threads\n",globalNumberOfThreads);
+    printf("%s", bdata(HLINE));
+
+    threads_init(globalNumberOfThreads);
+    threads_createGroups(numberOfWorkgroups);
+
+    /* we configure global barriers only */
+    barrier_init(1);
+    barrier_registerGroup(globalNumberOfThreads);
+    cpuClock = timer_getCpuClock();
+
+#ifdef PERFMON
+    printf("Using Likwid Marker API\n");
+    likwid_markerInit();
+#endif
+
+
+    /* initialize data structures for threads */
+    for (i=0; i<numberOfWorkgroups; i++)
+    {
+        myData.iter = iter;
+        if (demandIter > 0)
+        {
+            myData.iter = demandIter;
+        }
+        myData.min_runtime = min_runtime;
+        myData.size = groups[i].size;
+        myData.test = test;
+        myData.numberOfThreads = groups[i].numberOfThreads;
+        myData.processors = (int*) malloc(myData.numberOfThreads * sizeof(int));
+        myData.streams = (void**) malloc(test->streams * sizeof(void*));
+
+        for (j=0; j<groups[i].numberOfThreads; j++)
+        {
+            myData.processors[j] = groups[i].processorIds[j];
+        }
+
+        for (j=0; j<  test->streams; j++)
+        {
+            myData.streams[j] = groups[i].streams[j].ptr;
+        }
+
+        threads_registerDataGroup(i, &myData, copyThreadData);
+
+        free(myData.processors);
+        free(myData.streams);
+    }
+
+    if (demandIter == 0)
+    {
+        threads_create(getIter);
+        threads_join();
+    }
+    for (i=0; i<numberOfWorkgroups; i++)
+    {
+        iter = threads_updateIterations(i, demandIter);
+    }
+    threads_create(runTest);
+    threads_join();
+    allocator_finalize();
+
+    for (int t=0; t<globalNumberOfThreads; t++)
+    {
+        realSize += threads_data[t].data.size;
+        realIter += threads_data[t].data.iter;
+        if (threads_data[t].cycles > maxCycles)
+        {
+            maxCycles = threads_data[t].cycles;
+        }
+    }
+
+
+
+    time = (double) maxCycles / (double) cpuClock;
+    printf("%s", bdata(HLINE));
+    printf("Cycles:\t\t\t%" PRIu64 "\n", maxCycles);
+    printf("CPU Clock:\t\t%" PRIu64 "\n", cpuClock);
+    printf("Time:\t\t\t%e sec\n", time);
+    printf("Iterations:\t\t%" PRIu64 "\n", realIter);
+    printf("Iterations per thread:\t%" PRIu64 "\n",threads_data[0].data.iter);
+    printf("Size:\t\t\t%" PRIu64 "\n",  realSize*test->bytes );
+    printf("Size per thread:\t%" PRIu64 "\n", threads_data[0].data.size*test->bytes);
+    printf("Number of Flops:\t%" PRIu64 "\n", (threads_data[0].data.iter * realSize *  test->flops));
+    printf("MFlops/s:\t\t%.2f\n",
+            1.0E-06 * ((double) threads_data[0].data.iter * realSize *  test->flops/  time));
+    printf("Data volume (Byte):\t%llu\n", LLU_CAST (threads_data[0].data.iter * realSize *  test->bytes));
+    printf("MByte/s:\t\t%.2f\n",
+            1.0E-06 * ( (double) threads_data[0].data.iter * realSize *  test->bytes/ time));
+
+    cycPerUp = ((double) maxCycles / (double) (threads_data[0].data.iter * realSize));
+    printf("Cycles per update:\t%f\n", cycPerUp);
+
+    switch ( test->type )
+    {
+        case SINGLE:
+            printf("Cycles per cacheline:\t%f\n", (16.0 * cycPerUp));
+            break;
+        case DOUBLE:
+            printf("Cycles per cacheline:\t%f\n", (8.0 * cycPerUp));
+            break;
+    }
+
+    printf("%s", bdata(HLINE));
+    threads_destroy(numberOfWorkgroups);
+
+#ifdef PERFMON
+    printf("Writing Likwid Marker API results to file\n");
+    likwid_markerClose();
+#endif
+
+    return EXIT_SUCCESS;
+}
+
diff --git a/bench/perl/AsmGen.pl b/bench/perl/AsmGen.pl
new file mode 100755
index 0000000..dcd7946
--- /dev/null
+++ b/bench/perl/AsmGen.pl
@@ -0,0 +1,284 @@
+#!/usr/bin/perl -w
+use strict;
+no strict "refs";
+use warnings;
+use lib './perl';
+use Parse::RecDescent;
+use Data::Dumper;
+use Getopt::Std;
+use Cwd 'abs_path';
+
+use gas;
+
+# Current working directory at start-up, resolved to an absolute path.
+my $ROOT = abs_path('./');
+my $DEBUG=0;                      # set to 1 by -d
+my $VERBOSE=0;                    # set to 1 by -v
+our $ISA = 'x86';                 # default ISA, replaced by the -i argument
+our $AS  = 'gas';                 # default assembler, replaced by the -a argument
+my $OPT_STRING = 'hpvda:i:o:';    # option specification handed to getopts()
+my %OPT;                          # switches found on the command line
+my $INPUTFILE;                    # input name; init() removes the .pas suffix
+my $OUTPUTFILE;                   # -o argument, or "$INPUTFILE.s"
+my $CPP_ARGS='';                  # optional second positional argument, passed to cpp
+
+# Enable warnings within the Parse::RecDescent module.
+$::RD_ERRORS = 1; # Make sure the parser dies when it encounters an error
+#$::RD_WARN   = 1; # Enable warnings. This will warn on unused rules &c.
+#$::RD_HINT   = 1; # Give out hints to help fix problems.
+#$::RD_TRACE  = 1;     # if defined, also trace parsers' behaviour
+# Default action for productions without an explicit one: return all items
+# (including the rule name in $item[0]) as an array reference.
+$::RD_AUTOACTION = q { [@item[0..$#item]] };
+
+# Parse the command line and fill in the script's global configuration.
+#
+# Switches are read into %OPT according to $OPT_STRING. The first positional
+# argument is the input file, which must end in ".pas"; an optional second
+# positional argument is kept in $CPP_ARGS. On return $INPUTFILE holds the
+# input name without its ".pas" suffix and $OUTPUTFILE, $ISA and $AS are set
+# from -o, -i and -a (or their defaults). Finally as::isa_init() is called.
+#
+# Dies when no input file is given or when it does not end in ".pas".
+sub init
+{
+	getopts( "$OPT_STRING", \%OPT ) or usage();
+	if ($OPT{h}) { usage(); };
+	if ($OPT{v}) { $VERBOSE = 1;}
+	if ($OPT{d}) { $DEBUG = 1;}
+
+	if (! $ARGV[0]) {
+		die "ERROR: Please specify a input file!\n\nCall script with argument -h for help.\n";
+	}
+
+	$INPUTFILE = $ARGV[0];
+	$CPP_ARGS = $ARGV[1] if ($ARGV[1]);
+
+	# The dot is escaped and both patterns are anchored at the end, so only a
+	# real ".pas" suffix is accepted and only that suffix is removed (a ".pas"
+	# occurring earlier in the path is left alone).
+	if ($INPUTFILE =~ /\.pas$/) {
+		$INPUTFILE =~ s/\.pas$//;
+	} else {
+		die "ERROR: Input file must have pas ending!\n";
+	}
+	if ($OPT{o}) {
+		$OUTPUTFILE = $OPT{o};
+	}else {
+		$OUTPUTFILE = "$INPUTFILE.s";
+	}
+	if ($OPT{i}) {
+		$ISA = $OPT{i};
+		print "INFO: Using isa $ISA.\n\n" if ($VERBOSE);
+	} else {
+		print "INFO: No isa specified.\n Using default $ISA.\n\n" if ($VERBOSE);
+	}
+	if ($OPT{a}) {
+		$AS = $OPT{a};
+		print "INFO: Using as $AS.\n\n" if ($VERBOSE);
+	} else {
+		print "INFO: No as specified.\n Using default $AS.\n\n" if ($VERBOSE);
+	}
+
+  as::isa_init();
+}
+
+# Print the command line help text and terminate the script.
+# The text interpolates $0 (the script name) and $OPT_STRING; afterwards the
+# process exits with status 0.
+sub usage
+{
+    print <<END;
+usage: $0 [-$OPT_STRING]  <INFILE>
+
+Required:
+<INFILE>  : Input pas file
+
+Optional:
+-h        : this (help) message
+-v        : verbose output
+-d        : debug mode: prints out the parse tree
+-p        : Print out intermediate preprocessed output
+-o <FILE> : Output file
+-a <ASM>  : Specify different assembler (Default: gas)
+-i <ISA>  : Specify different isa (Default: x86)
+
+Example: 
+$0 -i x86-64  -a masm -o out.s  myfile.pas
+
+END
+
+exit(0);
+}
+
+#=======================================
+# GRAMMAR
+#=======================================
+$main::grammar = <<'_EOGRAMMAR_';
+# Terminals
+FUNC        : /func/i
+LOOP        : /loop/i
+ALLOCATE    : /allocate/i
+FACTOR      : /factor/i
+DEFINE      : /define/i
+USE         : /use/i
+STOP        : /stop/i
+START       : /start/i
+LOCAL       : /local/i
+TIMER       : /timer/i
+INCREMENT   : /increment/i
+ALIGN       : /align/i
+INT         : /int/i
+SINGLE      : /single/i
+DOUBLE      : /double/i
+INUMBER     : NUMBER
+UNUMBER     : NUMBER
+SNUMBER     : NUMBER
+FNUMBER     : NUMBER
+OFFSET      : /([0-9]+\,){15}[0-9]+/
+NUMBER      : /[-+]?[0-9]*\.?[0-9]+/
+SYMBOL      : /[.A-Z-a-z_][A-Za-z0-9_]*/
+REG         : /GPR[0-9]+/i
+SREG         : /GPR[0-9]+/i
+COMMENT     : /#.*/
+{'skip'}
+
+type: SINGLE 
+     |DOUBLE
+	 |INT
+
+align: ALIGN <commit> NUMBER
+{
+{FUNC => 'as::align',
+ ARGS => ["$item{NUMBER}[1]"]}
+}
+
+ASMCODE     : /[A-Za-z1-9.:]+.*/
+{
+{FUNC => 'as::emit_code',
+ ARGS => [$item[1]]}
+}
+
+function:  FUNC SYMBOL block
+{[
+ {FUNC => 'as::function_entry',
+  ARGS => [$item{SYMBOL}[1],0]},
+ $item{block},
+ {FUNC => 'as::function_exit',
+  ARGS => [$item{SYMBOL}[1]]}
+]}
+
+function_allocate:  FUNC SYMBOL ALLOCATE NUMBER block
+{[
+ {FUNC => 'as::function_entry',
+  ARGS => [$item{SYMBOL}[1],$item{NUMBER}[1]]},
+ $item{block},
+ {FUNC => 'as::function_exit',
+  ARGS => [$item{SYMBOL}[1]]}
+]}
+
+loop:  LOOP SYMBOL INUMBER SNUMBER block
+{[
+{FUNC => 'as::loop_entry',
+ ARGS => [$item{SYMBOL}[1],$item{SNUMBER}[1][1]]},
+ $item{block},
+{FUNC => 'as::loop_exit',
+ ARGS => [$item{SYMBOL}[1],$item{INUMBER}[1][1]]}
+]}
+| LOOP SYMBOL INUMBER SREG block
+{[
+{FUNC => 'as::loop_entry',
+ ARGS => [$item{SYMBOL}[1],$item{SREG}[1]]},
+ $item{block},
+{FUNC => 'as::loop_exit',
+ ARGS => [$item{SYMBOL}[1],$item{INUMBER}[1][1]]}
+]}
+
+timer: START TIMER
+{
+{FUNC => 'isa::start_timer',
+ ARGS => []}
+}
+| STOP TIMER
+{
+{FUNC => 'isa::stop_timer',
+ ARGS => []}
+}
+
+mode:  START LOCAL
+{
+{FUNC => 'as::mode',
+ ARGS => [$item[1][1]]}
+}
+| STOP LOCAL
+{
+{FUNC => 'as::mode',
+ ARGS => [$item[1][1]]}
+}
+
+block: '{' expression(s) '}'
+{ $item[2] }
+
+define_data: DEFINE type  SYMBOL  OFFSET
+{
+{FUNC => 'as::define_offset',
+ ARGS => [$item{SYMBOL}[1], $item{type}[1][1],"$item{OFFSET}[1]"]}
+}
+
+define_data: DEFINE type  SYMBOL  NUMBER
+{
+{FUNC => 'as::define_data',
+ ARGS => [$item{SYMBOL}[1], $item{type}[1][1],"$item{NUMBER}[1]"]}
+}
+
+
+expression:  align
+            |COMMENT
+            |loop
+            |timer
+            |mode
+			|ASMCODE
+{ $item[1] }
+
+instruction : define_data
+            | align
+            | COMMENT
+            | mode
+            | function
+            | function_allocate
+{ $item[1] }
+
+startrule: instruction(s)
+{ $item[1] }
+
+_EOGRAMMAR_
+
+
+#=======================================
+# MAIN
+#=======================================
+init();
+print "INFO: Calling cpp with arguments $CPP_ARGS.\n" if ($VERBOSE);
+# The parser does not see the .pas file itself but the output of the C
+# preprocessor run over it.
+my $text = `cpp -x assembler-with-cpp $CPP_ARGS $INPUTFILE.pas`;
+
+# -p: keep a copy of the preprocessed text next to the input file.
+if ($OPT{p}) {
+	open(FILE, ">$INPUTFILE.Pas")
+		or die "ERROR: Cannot write $INPUTFILE.Pas: $!\n";
+	print FILE $text;
+	close FILE;
+}
+
+# From here on everything printed to STDOUT ends up in the output file. A
+# failed redirect must stop the script: otherwise the generated code would be
+# lost without any message.
+open(STDOUT, ">$OUTPUTFILE")
+	or die "ERROR: Cannot write $OUTPUTFILE: $!\n";
+print "$as::AS->{HEADER}\n";
+
+my $parser = new Parse::RecDescent ($main::grammar)  or die "ERROR: Bad grammar!\n";
+# A syntax error is reported on STDERR; $parse_tree is then undefined and
+# tree_exec() has nothing to emit.
+my $parse_tree = $parser->startrule($text) or print STDERR "ERROR: Syntax Error\n";
+tree_exec($parse_tree);
+
+# -d: dump the parse tree for inspection.
+if ($DEBUG) {
+	open(FILE, '>parse_tree.txt')
+		or die "ERROR: Cannot write parse_tree.txt: $!\n";
+	print FILE Dumper $parse_tree,"\n";
+	close FILE;
+}
+
+print "$as::AS->{FOOTER}\n";
+
+# Walk a parse tree and run the code generator calls stored in it.
+#
+# $tree is an array reference. Elements whose string form starts with
+# "skip", "instruction", "expression" or "loop" are ignored, nested array
+# references are walked recursively, and every hash reference is executed by
+# calling the function named in its FUNC entry with the list in ARGS.
+# Anything else is silently skipped.
+sub tree_exec
+{
+	my ($tree) = @_;
+
+	for my $node (@$tree) {
+		next if $node =~ /^skip|^instruction|^expression|^loop/;
+
+		my $kind = ref($node);
+		if ($kind eq 'ARRAY') {
+			tree_exec($node);
+		}
+		elsif ($kind eq 'HASH') {
+			# FUNC holds a function name, called through a symbolic reference.
+			&{$node->{FUNC}}(@{$node->{ARGS}});
+		}
+	}
+}
+
+
diff --git a/bench/perl/Parse/RecDescent.pm b/bench/perl/Parse/RecDescent.pm
new file mode 100644
index 0000000..35b9e9d
--- /dev/null
+++ b/bench/perl/Parse/RecDescent.pm
@@ -0,0 +1,3045 @@
+# GENERATE RECURSIVE DESCENT PARSER OBJECTS FROM A GRAMMAR
+# SEE RecDescent.pod FOR FULL DETAILS
+
+use 5.005;
+use strict;
+
+package Parse::RecDescent;
+
+use Text::Balanced qw ( extract_codeblock extract_bracketed extract_quotelike extract_delimited );
+
+use vars qw ( $skip );
+
+   *defskip  = \ '\s*';	# DEFAULT SEPARATOR IS OPTIONAL WHITESPACE
+   $skip  = '\s*';		# UNIVERSAL SEPARATOR IS OPTIONAL WHITESPACE
+my $MAXREP  = 100_000_000;	# REPETITIONS MATCH AT MOST 100,000,000 TIMES
+
+
+# IMPLEMENT PRECOMPILER BEHAVIOUR UNDER:
+#    perl -MParse::RecDescent - <grammarfile> <classname>
+#
+# When the caller's file name starts with '-' and its line number is 0, the
+# two command line arguments are taken as a grammar file and a class name,
+# the grammar is read and handed to Precompile(), and the process exits.
+# In every other case import() does nothing.
+sub import
+{
+	# Local replacement for die: print the message to STDOUT and exit.
+	local *_die = sub { print @_, "\n"; exit };
+
+	my ($package, $file, $line) = caller;
+	if (substr($file,0,1) eq '-' && $line == 0)
+	{
+		# The usage text shows the invocation documented above.
+		_die("Usage: perl -MParse::RecDescent - <grammarfile> <classname>")
+			unless @ARGV == 2;
+
+		my ($sourcefile, $class) = @ARGV;
+
+		local *IN;
+		open IN, $sourcefile
+			or _die("Can't open grammar file '$sourcefile'");
+
+		my $grammar = join '', <IN>;
+
+		Parse::RecDescent->Precompile($grammar, $class, $sourcefile);
+		exit;
+	}
+}
+		
+# Write this parser out as a module for $class. The "saving" flag is set
+# while Precompile() runs (it is called without grammar text) and cleared
+# again afterwards.
+sub Save
+{
+	my $self  = shift;
+	my $class = shift;
+
+	$self->{saving} = 1;
+	$self->Precompile(undef, $class);
+	$self->{saving} = 0;
+}
+
+# Write a parser out as Perl source into a module file.
+#
+# Arguments: ($self, $grammar, $class, $sourcefile).
+#   $class      - must look like a Perl package name; the output file is
+#                 named after its last component plus ".pm".
+#   $grammar    - when true, a new parser is compiled from it and replaces
+#                 $self; when false the existing $self is written out (Save()
+#                 calls it this way, passing undef).
+#   $sourcefile - only used in the progress message on STDERR.
+# Croaks on a bad class name, an uncompilable grammar, or when the module
+# file cannot be opened or closed.
+sub Precompile
+{
+		my ($self, $grammar, $class, $sourcefile) = @_;
+
+		$class =~ /^(\w+::)*\w+$/ or croak("Bad class name: $class");
+
+		# Keep only the last package component for the file name.
+		my $modulefile = $class;
+		$modulefile =~ s/.*:://;
+		$modulefile .= ".pm";
+
+		open OUT, ">$modulefile"
+			or croak("Can't write to new module file '$modulefile'");
+
+		print STDERR "precompiling grammar from file '$sourcefile'\n",
+			     "to class $class in module file '$modulefile'\n"
+					if $grammar && $sourcefile;
+
+		# local $::RD_HINT = 1;
+		$self = Parse::RecDescent->new($grammar,1,$class)
+			|| croak("Can't compile bad grammar")
+				if $grammar;
+
+		# Mark every rule as changed before asking for the code.
+		foreach ( keys %{$self->{rules}} )
+			{ $self->{rules}{$_}{changed} = 1 }
+
+		print OUT "package $class;\nuse Parse::RecDescent;\n\n";
+
+		print OUT "{ my \$ERRORS;\n\n";
+
+		print OUT $self->_code();
+
+		# The generated constructor consists of "my " followed by the
+		# Data::Dumper text for the parser object, dumped under the name "self".
+		print OUT "}\npackage $class; sub new { ";
+		print OUT "my ";
+
+		require Data::Dumper;
+		print OUT Data::Dumper->Dump([$self], [qw(self)]);
+
+		print OUT "}";
+
+		close OUT
+			or croak("Can't write to new module file '$modulefile'");
+}
+
+
+package Parse::RecDescent::LineCounter;
+
+# Tied-scalar class; the code produced by Rule::code ties $thisline and
+# $prevline to it.
+
+# tie() constructor. Arguments: ($classname, \$text, $thisparser, $prevflag)
+sub TIESCALAR
+{
+	my ($classname, $textref, $thisparser, $prevflag) = @_;
+	my %state = (
+		text   => $textref,
+		parser => $thisparser,
+		prev   => ($prevflag ? 1 : 0),
+	);
+	return bless \%state, $classname;
+}
+
+# Line numbers computed so far, keyed by offset into the full text.
+# NOTE(review): file-scoped and keyed by offset only, so entries appear to be
+# shared by every parser and every parse -- confirm this is intended.
+my %counter_cache;
+
+# Read access: the line number for the current offset, computed from the
+# parser's offsetlinenum and the line count of the text from that offset on.
+# A newly computed value is also stored in the parser's lastlinenum.
+sub FETCH
+{
+	my $self   = $_[0];
+	my $parser = $self->{parser};
+	my $from   = $parser->{fulltextlen} - length(${$self->{text}}) - $self->{prev};
+
+	if (!exists $counter_cache{$from})
+	{
+		my $line = $parser->{offsetlinenum}
+			 - Parse::RecDescent::_linecount(substr($parser->{fulltext},$from))
+			 + 1;
+		$parser->{lastlinenum} = $line;
+		$counter_cache{$from}  = $line;
+	}
+	return $counter_cache{$from};
+}
+
+# Write access: shift the parser's offsetlinenum by the difference between
+# the last fetched line number and the assigned value.
+sub STORE
+{
+	my ($self, $newline) = @_;
+	my $parser = $self->{parser};
+	my $delta  = $parser->{lastlinenum} - $newline;
+	$parser->{offsetlinenum} -= $delta;
+	return undef;
+}
+
+# resync($linecounter): adjust offsetlinenum so that the line number derived
+# from the remaining text agrees with lastlinenum again.
+sub resync
+{
+	# tied() has to see $_[0] itself; a copy of the variable is not tied.
+	my $counter = tied($_[0]);
+	unless ($counter =~ /Parse::RecDescent::LineCounter/)
+	{
+		die "Tried to alter something other than a LineCounter\n";
+	}
+
+	my $parser = $counter->{parser};
+	my $apparently = $parser->{offsetlinenum}
+			 - Parse::RecDescent::_linecount(${$counter->{text}})
+			 + 1;
+
+	$parser->{offsetlinenum} += $parser->{lastlinenum} - $apparently;
+	return 1;
+}
+
+package Parse::RecDescent::ColCounter;
+
+# Tied-scalar class; the code produced by Rule::code ties $thiscolumn and
+# $prevcolumn to it.
+
+# tie() constructor. Arguments: ($classname, \$text, $thisparser, $prevflag)
+sub TIESCALAR
+{
+	my ($classname, $textref, $thisparser, $prevflag) = @_;
+	my %state = (
+		text   => $textref,
+		parser => $thisparser,
+		prev   => ($prevflag ? 1 : 0),
+	);
+	return bless \%state, $classname;
+}
+
+# Read access: length of the last line of the full text up to and including
+# the current position.
+sub FETCH
+{
+	my $self    = $_[0];
+	my $parser  = $self->{parser};
+	my $missing = $parser->{fulltextlen} - length(${$self->{text}}) - $self->{prev} + 1;
+	my $sofar   = substr($parser->{fulltext}, 0, $missing);
+	$sofar =~ m/^(.*)\Z/m;
+	return length($1);
+}
+
+# The column is read-only.
+sub STORE
+{
+	die "Can't set column number via \$thiscolumn\n";
+}
+
+
+package Parse::RecDescent::OffsetCounter;
+
+# Tied-scalar class; the code produced by Rule::code ties $thisoffset and
+# $prevoffset to it.
+
+# tie() constructor. Arguments: ($classname, \$text, $thisparser, $prev)
+# A true $prev makes the reported offset one smaller.
+sub TIESCALAR
+{
+	my ($classname, $textref, $thisparser, $prev) = @_;
+	my %state = (
+		text   => $textref,
+		parser => $thisparser,
+		prev   => ($prev ? -1 : 0),
+	);
+	return bless \%state, $classname;
+}
+
+# Read access: number of characters of the full text that are no longer in
+# the remaining text, plus the stored adjustment.
+sub FETCH
+{
+	my $self     = $_[0];
+	my $consumed = $self->{parser}{fulltextlen} - length(${$self->{text}});
+	return $consumed + $self->{prev};
+}
+
+# The offset is read-only.
+sub STORE
+{
+	die "Can't set current offset via \$thisoffset or \$prevoffset\n";
+}
+
+
+
+package Parse::RecDescent::Rule;
+
+# Return the rule called $name of parser $owner, creating and registering an
+# empty one when it does not exist yet.
+# Arguments: ($class_or_object, $name, $owner, $line, $replace).
+# An existing rule is reset first when $replace is true and the rule has not
+# been changed.
+sub new ($$$$$)
+{
+	my ($proto, $name, $owner, $line, $replace) = @_;
+	my $class = ref($proto) || $proto;
+
+	my $existing = $owner->{"rules"}{$name};
+	if (defined $existing)
+	{
+		$existing->reset if $replace && !$existing->{"changed"};
+		return $existing;
+	}
+
+	my %fields = (
+		"name"     => $name,
+		"prods"    => [],
+		"calls"    => [],
+		"changed"  => 0,
+		"line"     => $line,
+		"impcount" => 0,
+		"opcount"  => 0,
+		"vars"	   => "",
+	);
+	my $rule = bless \%fields, $class;
+	$owner->{"rules"}{$name} = $rule;
+	return $rule;
+}
+
+# Empty a rule: the production and call lists are cleared in place, the
+# counters and the "changed" flag go back to 0 and the variable code to "".
+sub reset($)
+{
+	my $rule = $_[0];
+	@{$rule->{"prods"}} = ();
+	@{$rule->{"calls"}} = ();
+	$rule->{$_} = 0 foreach ("changed", "impcount", "opcount");
+	$rule->{"vars"}  = "";
+}
+
+# Empty destructor: nothing is done when a rule object is destroyed.
+sub DESTROY {}
+
+# True (1) when at least one production of this rule reports $ref as its
+# leftmost item, otherwise 0. Productions are asked in order and the search
+# stops at the first hit.
+sub hasleftmost($$)
+{
+	my $self = shift;
+	my $ref  = shift;
+
+	for my $production ( @{$self->{"prods"}} )
+	{
+		next unless $production->hasleftmost($ref);
+		return 1;
+	}
+
+	return 0;
+}
+
+# Collect, in production order, whatever each production returns from
+# leftmostsubrule() (productions may contribute nothing).
+sub leftmostsubrules($)
+{
+	my ($self) = @_;
+
+	my @subrules = map { $_->leftmostsubrule() } @{$self->{"prods"}};
+
+	return @subrules;
+}
+
+# Describe what this rule expects: the distinct, non-empty expectations of
+# its productions, in production order, joined by ', or '.
+sub expected($)
+{
+	my $self = shift;
+	my @expected = ();
+
+	my $prod;
+	foreach $prod ( @{$self->{"prods"}} )
+	{
+		my $next = $prod->expected();
+		# Skip empty descriptions and ones that are already in the list.
+		unless (! $next or _contains($next,@expected) )
+		{
+			push @expected, $next;
+		}
+	}
+
+	return join ', or ', @expected;
+}
+
+# _contains($target, @list): 1 when some element of @list is string-equal to
+# $target, otherwise 0.
+sub _contains($@)
+{
+	my ($target, @candidates) = @_;
+	for my $candidate (@candidates)
+	{
+		return 1 if $target eq $candidate;
+	}
+	return 0;
+}
+
+# Record that this rule calls $subrule; a name already in the "calls" list
+# is not added again.
+sub addcall($$)
+{
+	my $self    = shift;
+	my $subrule = shift;
+	_contains($subrule, @{$self->{"calls"}})
+		or push @{$self->{"calls"}}, $subrule;
+}
+
+# Append production $prod to this rule and return it. The production gets
+# its index in the rule as "number"; the rule is flagged as changed and its
+# implicit-rule and operator counters restart at 0.
+sub addprod($$)
+{
+	my ($rule, $production) = @_;
+
+	push @{$rule->{"prods"}}, $production;
+	$production->{"number"} = $#{$rule->{"prods"}};
+
+	$rule->{"changed"}  = 1;
+	$rule->{"impcount"} = 0;
+	$rule->{"opcount"}  = 0;
+
+	return $production;
+}
+
+# Add a rule variable declaration to the rule's "vars" code.
+# A declaration starting with "local <variable>" is emitted as written and
+# the variable name is appended to the parser's "localvars"; anything else
+# is emitted with a leading "my ". Marks the rule as changed; returns 1.
+sub addvar
+{
+	my ( $self, $var, $parser ) = @_;
+
+	my $is_local = ($var =~ /\A\s*local\s+([%@\$]\w+)/);
+	$parser->{localvars} .= " $1" if $is_local;
+	$self->{"vars"} .= ($is_local ? "" : "my ") . "$var;\n";
+
+	$self->{"changed"} = 1;
+	return 1;
+}
+
+# Remember $code as the rule's autoscore expression and flag the rule as
+# changed. Returns 1.
+sub addautoscore
+{
+	my $self = shift;
+	my $code = shift;
+	@{$self}{"autoscore", "changed"} = ($code, 1);
+	return 1;
+}
+
+# Increment the rule's operator counter and return a name built from the new
+# counter value, the current number of productions and the rule name.
+sub nextoperator($)
+{
+	my ($self) = @_;
+	my $opcount   = ++$self->{"opcount"};
+	my $prodcount = scalar @{$self->{"prods"}};
+	return sprintf("_operator_%s_of_production_%s_of_rule_%s",
+		       $opcount, $prodcount, $self->{name});
+}
+
+# Increment the rule's implicit-alternation counter and return a name built
+# from the new counter value, the current number of productions and the rule
+# name.
+sub nextimplicit($)
+{
+	my ($self) = @_;
+	my $impcount  = ++$self->{"impcount"};
+	my $prodcount = scalar @{$self->{"prods"}};
+	return sprintf("_alternation_%s_of_production_%s_of_rule_%s",
+		       $impcount, $prodcount, $self->{name});
+}
+
+
+# Return the Perl source (as one string) of the subroutine that implements
+# this rule in package $namespace.
+#
+# Arguments: ($self, $namespace, $parser). The text is assembled from three
+# parts: a prologue declaring the per-call state (with optional tied position
+# variables, depending on the $parser->{_check} flags), the code of every
+# production that passes checkleftmost(), and an epilogue that returns undef
+# on failure or the result on success.
+sub code
+{
+	my ($self, $namespace, $parser) = @_;
+
+# Unless the parser is being saved, undefine a sub of the same name first.
+eval 'undef &' . $namespace . '::' . $self->{"name"} unless $parser->{saving};
+
+	my $code =
+'
+# ARGS ARE: ($parser, $text; $repeating, $_noactions, \@args)
+sub ' . $namespace . '::' . $self->{"name"} .  '
+{
+	my $thisparser = $_[0];
+	use vars q{$tracelevel};
+	local $tracelevel = ($tracelevel||0)+1;
+	$ERRORS = 0;
+	my $thisrule = $thisparser->{"rules"}{"' . $self->{"name"} . '"};
+	
+	Parse::RecDescent::_trace(q{Trying rule: [' . $self->{"name"} . ']},
+				  Parse::RecDescent::_tracefirst($_[1]),
+				  q{' . $self->{"name"} . '},
+				  $tracelevel)
+					if defined $::RD_TRACE;
+
+	' . ($parser->{deferrable}
+		? 'my $def_at = @{$thisparser->{deferred}};'
+		: '') .
+	'
+	my $err_at = @{$thisparser->{errors}};
+
+	my $score;
+	my $score_return;
+	my $_tok;
+	my $return = undef;
+	my $_matched=0;
+	my $commit=0;
+	my @item = ();
+	my %item = ();
+	my $repeating =  defined($_[2]) && $_[2];
+	my $_noactions = defined($_[3]) && $_[3];
+ 	my @arg =        defined $_[4] ? @{ &{$_[4]} } : ();
+	my %arg =        ($#arg & 01) ? @arg : (@arg, undef);
+	my $text;
+	my $lastsep="";
+	my $expectation = new Parse::RecDescent::Expectation($thisrule->expected());
+	$expectation->at($_[1]);
+	'. ($parser->{_check}{thisoffset}?'
+	my $thisoffset;
+	tie $thisoffset, q{Parse::RecDescent::OffsetCounter}, \$text, $thisparser;
+	':'') . ($parser->{_check}{prevoffset}?'
+	my $prevoffset;
+	tie $prevoffset, q{Parse::RecDescent::OffsetCounter}, \$text, $thisparser, 1;
+	':'') . ($parser->{_check}{thiscolumn}?'
+	my $thiscolumn;
+	tie $thiscolumn, q{Parse::RecDescent::ColCounter}, \$text, $thisparser;
+	':'') . ($parser->{_check}{prevcolumn}?'
+	my $prevcolumn;
+	tie $prevcolumn, q{Parse::RecDescent::ColCounter}, \$text, $thisparser, 1;
+	':'') . ($parser->{_check}{prevline}?'
+	my $prevline;
+	tie $prevline, q{Parse::RecDescent::LineCounter}, \$text, $thisparser, 1;
+	':'') . '
+	my $thisline;
+	tie $thisline, q{Parse::RecDescent::LineCounter}, \$text, $thisparser;
+
+	'. $self->{vars} .'
+';
+
+	# One code fragment per production; productions rejected by
+	# checkleftmost() contribute nothing.
+	my $prod;
+	foreach $prod ( @{$self->{"prods"}} )
+	{
+		$prod->addscore($self->{autoscore},0,0) if $self->{autoscore};
+		next unless $prod->checkleftmost();
+		$code .= $prod->code($namespace,$self,$parser);
+
+		$code .= $parser->{deferrable}
+				? '		splice
+				@{$thisparser->{deferred}}, $def_at unless $_matched;
+				  '
+				: '';
+	}
+
+	$code .=
+'
+        unless ( $_matched || defined($return) || defined($score) )
+	{
+		' .($parser->{deferrable}
+			? '		splice @{$thisparser->{deferred}}, $def_at;
+			  '
+			: '') . '
+
+		$_[1] = $text;	# NOT SURE THIS IS NEEDED
+		Parse::RecDescent::_trace(q{<<Didn\'t match rule>>},
+					 Parse::RecDescent::_tracefirst($_[1]),
+					 q{' . $self->{"name"} .'},
+					 $tracelevel)
+					if defined $::RD_TRACE;
+		return undef;
+	}
+	if (!defined($return) && defined($score))
+	{
+		Parse::RecDescent::_trace(q{>>Accepted scored production<<}, "",
+					  q{' . $self->{"name"} .'},
+					  $tracelevel)
+						if defined $::RD_TRACE;
+		$return = $score_return;
+	}
+	splice @{$thisparser->{errors}}, $err_at;
+	$return = $item[$#item] unless defined $return;
+	if (defined $::RD_TRACE)
+	{
+		Parse::RecDescent::_trace(q{>>Matched rule<< (return value: [} .
+					  $return . q{])}, "",
+					  q{' . $self->{"name"} .'},
+					  $tracelevel);
+		Parse::RecDescent::_trace(q{(consumed: [} .
+					  Parse::RecDescent::_tracemax(substr($_[1],0,-length($text))) . q{])}, 
+					  Parse::RecDescent::_tracefirst($text),
+					  , q{' . $self->{"name"} .'},
+					  $tracelevel)
+	}
+	$_[1] = $text;
+	return $return;
+}
+';
+
+	return $code;
+}
+
+# Work list shared by all calls of isleftrec(); overwritten on every call.
+my @left;
+
+# Return 1 when this rule can reach itself through leftmost subrules,
+# otherwise 0.
+# Arguments: ($self, $rules) where $rules maps rule names to rule objects.
+sub isleftrec($$)
+{
+	my ($self, $rules) = @_;
+	my $root = $self->{"name"};
+	@left = $self->leftmostsubrules();
+	my $next;
+	# @left grows while it is being traversed: leftmost subrules of every
+	# visited rule are appended unless they are already in the list.
+	foreach $next ( @left )
+	{
+		next unless defined $rules->{$next}; # SKIP NON-EXISTENT RULES
+		return 1 if $next eq $root;
+		my $child;
+		foreach $child ( $rules->{$next}->leftmostsubrules() )
+		{
+		    push(@left, $child)
+			if ! _contains($child, @left) ;
+		}
+	}
+	return 0;
+}
+
+package Parse::RecDescent::Production;
+
+# Describe a production: the true (non-empty) descriptions of its items,
+# separated by single spaces. The optional second argument is passed on to
+# every item's describe().
+sub describe ($;$)
+{
+	my @parts;
+	for my $item (@{$_[0]->{items}})
+	{
+		my $text = $item->describe($_[1]);
+		push @parts, $text if $text;
+	}
+	return join ' ', @parts;
+}
+
+# Create an empty production.
+# Arguments: ($class_or_object, $line [, $uncommit [, $error]]).
+# The item list starts empty and all per-kind item counters start at 0.
+sub new ($$;$$)
+{
+	my ($proto, $line, $uncommit, $error) = @_;
+	my $class = ref($proto) || $proto;
+
+	my %fields = (
+		"items"    => [],
+		"uncommit" => $uncommit,
+		"error"    => $error,
+		"line"     => $line,
+		strcount   => 0,
+		patcount   => 0,
+		dircount   => 0,
+		actcount   => 0,
+	);
+	return bless \%fields, $class;
+}
+
+# What this production expects: describe(1) of its first item, or '' for a
+# production without items.
+sub expected ($)
+{
+	my $items = $_[0]->{"items"};
+	return '' unless @$items;
+	return $items->[0]->describe(1);
+}
+
+# True when the first item of this production is string-equal to $ref;
+# 0 for a production without items.
+sub hasleftmost ($$)
+{
+	my ($self, $ref) = @_;
+	my $items = $self->{"items"};
+	return 0 unless @$items;
+	return $items->[0] eq $ref;
+}
+
+# Return what the first item reports from issubrule() when that is defined;
+# otherwise (or for a production without items) return the empty list.
+sub leftmostsubrule($)
+{
+	my ($self) = @_;
+
+	return () if $#{$self->{"items"}} < 0;
+
+	my $subrule = $self->{"items"}[0]->issubrule();
+	return defined($subrule) ? $subrule : ();
+}
+
+# Inspect the first item of a production before code is generated for it.
+# Returns 1 when code should be generated and 0 when the production is left
+# out because it starts with an unconditional reject or an <autoscore>.
+sub checkleftmost($)
+{
+	my @items = @{$_[0]->{"items"}};
+	# A production consisting only of a conditional <error?> gets an
+	# unconditional <reject> appended.
+	if (@items==1 && ref($items[0]) =~ /\AParse::RecDescent::Error/
+	    && $items[0]->{commitonly} )
+	{
+		Parse::RecDescent::_warn(2,"Lone <error?> in production treated
+					    as <error?> <reject>");
+		Parse::RecDescent::_hint("A production consisting of a single
+					  conditional <error?> directive would 
+					  normally succeed (with the value zero) if the
+					  rule is not 'commited' when it is
+					  tried. Since you almost certainly wanted
+					  '<error?> <reject>' Parse::RecDescent
+					  supplied it for you.");
+		push @{$_[0]->{items}},
+			Parse::RecDescent::UncondReject->new(0,0,'<reject>');
+	}
+	# A lone <rulevar> or <autoscore> item is accepted as it is.
+	elsif (@items==1 && ($items[0]->describe||"") =~ /<rulevar|<autoscore/)
+	{
+		# Do nothing
+	}
+	# Any other production that starts with an unconditional reject or an
+	# <autoscore> is reported and dropped.
+	elsif (@items &&
+		( ref($items[0]) =~ /\AParse::RecDescent::UncondReject/
+		|| ($items[0]->describe||"") =~ /<autoscore/
+		))
+	{
+		Parse::RecDescent::_warn(1,"Optimizing away production: [". $_[0]->describe ."]");
+		my $what = $items[0]->describe =~ /<rulevar/
+				? "a <rulevar> (which acts like an unconditional <reject> during parsing)"
+		         : $items[0]->describe =~ /<autoscore/
+				? "an <autoscore> (which acts like an unconditional <reject> during parsing)"
+				: "an unconditional <reject>";
+		my $caveat = $items[0]->describe =~ /<rulevar/
+				? " after the specified variable was set up"
+				: "";
+		my $advice = @items > 1
+				? "However, there were also other (useless) items after the leading "
+				  . $items[0]->describe
+				  . ", so you may have been expecting some other behaviour."
+				: "You can safely ignore this message.";
+		Parse::RecDescent::_hint("The production starts with $what. That means that the
+					  production can never successfully match, so it was
+					  optimized out of the final parser$caveat. $advice");
+		return 0;
+	}
+	return 1;
+}
+
+# 1 when the code of any Action or Directive item of this production
+# mentions $skip, otherwise 0.
+sub changesskip($)
+{
+	for my $item (@{$_[0]->{"items"}})
+	{
+		next unless ref($item) =~ /Parse::RecDescent::(Action|Directive)/;
+		return 1 if $item->{code} =~ /\$skip/;
+	}
+	return 0;
+}
+
+# Push a record for an opened operator directive onto the production's "op"
+# stack. Besides type, line and name the record stores the number of items
+# the production has at this point.
+sub adddirective
+{
+	my ( $self, $whichop, $line, $name ) = @_;
+	my %pending = (
+		type   => $whichop,
+		line   => $line,
+		name   => $name,
+		offset => scalar(@{$self->{items}}),
+	);
+	push @{$self->{op}}, \%pending;
+}
+
+# Append a <score: ...> directive evaluating $code to this production unless
+# its last item already is one. Always returns 1.
+# Arguments: ($self, $code, $lookahead, $line).
+sub addscore
+{
+	my ( $self, $code, $lookahead, $line ) = @_;
+
+	# A production without items has no last item to ask, and actions
+	# describe themselves as undef; neither can be a <score> directive.
+	my $last     = $self->{items}[-1];
+	my $lastdesc = defined($last) ? $last->describe : undef;
+	return 1 if defined($lastdesc) && $lastdesc =~ /<score/;
+
+	$self->additem(Parse::RecDescent::Directive->new(
+			      "local \$^W;
+			       my \$thisscore = do { $code } + 0;
+			       if (!defined(\$score) || \$thisscore>\$score)
+					{ \$score=\$thisscore; \$score_return=\$item[-1]; }
+			       undef;", $lookahead, $line,"<score: $code>") );
+	return 1;
+}
+
+# Report every operator directive that is still open on the "op" stack (most
+# recently opened first) as an error at $line, emptying the stack.
+# Always returns 1.
+sub check_pending
+{
+	my ( $self, $line ) = @_;
+	if ($self->{op})
+	{
+	    while (my $next = pop @{$self->{op}})
+	    {
+		Parse::RecDescent::_error("Incomplete <$next->{type}op:...>.", $line);
+		Parse::RecDescent::_hint(
+			"The current production ended without completing the
+			 <$next->{type}op:...> directive that started near line
+			 $next->{line}. Did you forget the closing '>'?");
+	    }
+	}
+	return 1;
+}
+
+# Close the most recently opened operator directive.
+# Arguments: ($self, $line, $minrep, $maxrep).
+# For a left/right operator the three items added since the directive was
+# opened are replaced by a single Operator item; any other item count is
+# reported as an error.
+sub enddirective
+{
+	my ( $self, $line, $minrep, $maxrep ) = @_;
+	# NOTE(review): adddirective() creates $self->{op} as an array reference,
+	# which stays true after its last entry has been popped; in that state this
+	# test does not fire and the pop below yields undef -- confirm whether an
+	# emptiness check is wanted here.
+	unless ($self->{op})
+	{
+		Parse::RecDescent::_error("Unmatched > found.", $line);
+		Parse::RecDescent::_hint(
+			"A '>' angle bracket was encountered, which typically
+			 indicates the end of a directive. However no suitable
+			 preceding directive was encountered. Typically this
+			 indicates either a extra '>' in the grammar, or a
+			 problem inside the previous directive.");
+		return;
+	}
+	my $op = pop @{$self->{op}};
+	# Number of items added since the directive was opened.
+	my $span = @{$self->{items}} - $op->{offset};
+	if ($op->{type} =~ /left|right/)
+	{
+	    if ($span != 3)
+	    {
+		Parse::RecDescent::_error(
+			"Incorrect <$op->{type}op:...> specification:
+			 expected 3 args, but found $span instead", $line);
+		Parse::RecDescent::_hint(
+			"The <$op->{type}op:...> directive requires a
+			 sequence of exactly three elements. For example:
+		         <$op->{type}op:leftarg /op/ rightarg>");
+	    }
+	    else
+	    {
+		# splice removes the last three items and passes them to the
+		# Operator constructor.
+		push @{$self->{items}},
+			Parse::RecDescent::Operator->new(
+				$op->{type}, $minrep, $maxrep, splice(@{$self->{"items"}}, -3));
+		$self->{items}[-1]->sethashname($self);
+		$self->{items}[-1]{name} = $op->{name};
+	    }
+	}
+}
+
+# Append a Result item to the production. A production that has no items yet
+# is reported as an incorrect <return:...> specification instead and nothing
+# is appended.
+sub prevwasreturn
+{
+	my ( $self, $line ) = @_;
+	unless (@{$self->{items}})
+	{
+		Parse::RecDescent::_error(
+			"Incorrect <return:...> specification:
+			expected item missing", $line);
+		Parse::RecDescent::_hint(
+			"The <return:...> directive requires a
+			sequence of at least one item. For example:
+		        <return: list>");
+		return;
+	}
+	push @{$self->{items}},
+		Parse::RecDescent::Result->new();
+}
+
+# Append $item to the production and return it. The item is first asked to
+# choose its %item key via sethashname(), which receives the production.
+sub additem
+{
+	my $self = shift;
+	my $item = shift;
+
+	$item->sethashname($self);
+	push @{$self->{"items"}}, $item;
+
+	return $item;
+}
+
+
+# Code template inserted before an item: pushes a new @itempos record whose
+# "from" fields hold the current offset, line and column and whose "to"
+# fields are still undef.
+sub preitempos
+{
+	return q
+	{
+		push @itempos, {'offset' => {'from'=>$thisoffset, 'to'=>undef},
+				'line'   => {'from'=>$thisline,   'to'=>undef},
+				'column' => {'from'=>$thiscolumn, 'to'=>undef} };
+	}
+}
+
+# Code template that moves the "from" fields of the last @itempos record
+# forward: the offset by length($1), line and column to the current values.
+sub incitempos
+{
+	return q
+	{
+		$itempos[$#itempos]{'offset'}{'from'} += length($1);
+		$itempos[$#itempos]{'line'}{'from'}   = $thisline;
+		$itempos[$#itempos]{'column'}{'from'} = $thiscolumn;
+	}
+}
+
+# Code template inserted after an item: fills the "to" fields of the last
+# @itempos record from $prevoffset, $prevline and $prevcolumn.
+sub postitempos
+{
+	return q
+	{
+		$itempos[$#itempos]{'offset'}{'to'} = $prevoffset;
+		$itempos[$#itempos]{'line'}{'to'}   = $prevline;
+		$itempos[$#itempos]{'column'}{'to'} = $prevcolumn;
+	}
+}
+
+# Return the Perl source (as one string) that tries this production inside
+# the subroutine generated for $rule.
+#
+# Arguments: ($self, $namespace, $rule, $parser). The fragment is a while
+# block that is left with "last": the code of each item follows in order, an
+# automatic action is appended when the parser has one and the production
+# does not end in an Action, and the block finishes by setting $_matched.
+sub code($$$$)
+{
+	my ($self,$namespace,$rule,$parser) = @_;
+	my $code =
+'
+	while (!$_matched'
+	. (defined $self->{"uncommit"} ? '' : ' && !$commit')
+	. ')
+	{
+		' .
+		($self->changesskip()
+			? 'local $skip = defined($skip) ? $skip : $Parse::RecDescent::skip;'
+			: '') .'
+		Parse::RecDescent::_trace(q{Trying production: ['
+					  . $self->describe . ']},
+					  Parse::RecDescent::_tracefirst($_[1]),
+					  q{' . $rule ->{name}. '},
+					  $tracelevel)
+						if defined $::RD_TRACE;
+		my $thisprod = $thisrule->{"prods"}[' . $self->{"number"} . '];
+		' . (defined $self->{"error"} ? '' : '$text = $_[1];' ) . '
+		my $_savetext;
+		@item = (q{' . $rule->{"name"} . '});
+		%item = (__RULE__ => q{' . $rule->{"name"} . '});
+		my $repcount = 0;
+
+';
+	$code .= 
+'		my @itempos = ({});
+'			if $parser->{_check}{itempos};
+
+	my $item;
+	my $i;
+
+	# Item code, wrapped in position bookkeeping when @itempos is in use.
+	for ($i = 0; $i < @{$self->{"items"}}; $i++)
+	{
+		$item = ${$self->{items}}[$i];
+
+		$code .= preitempos() if $parser->{_check}{itempos};
+
+		$code .= $item->code($namespace,$rule,$parser->{_check});
+
+		$code .= postitempos() if $parser->{_check}{itempos};
+
+	}
+
+	# After the loop $item is the last item of the production (if any).
+	if ($parser->{_AUTOACTION} && defined($item) && !$item->isa("Parse::RecDescent::Action"))
+	{
+		$code .= $parser->{_AUTOACTION}->code($namespace,$rule);
+		Parse::RecDescent::_warn(1,"Autogenerating action in rule
+					   \"$rule->{name}\":
+					    $parser->{_AUTOACTION}{code}")
+		and
+		Parse::RecDescent::_hint("The \$::RD_AUTOACTION was defined,
+					  so any production not ending in an
+					  explicit action has the specified
+		       			  \"auto-action\" automatically
+					  appended.");
+	}
+	elsif ($parser->{_AUTOTREE} && defined($item) && !$item->isa("Parse::RecDescent::Action"))
+	{
+		if ($i==1 && $item->isterminal)
+		{
+			$code .= $parser->{_AUTOTREE}{TERMINAL}->code($namespace,$rule);
+		}
+		else
+		{
+			$code .= $parser->{_AUTOTREE}{NODE}->code($namespace,$rule);
+		}
+		Parse::RecDescent::_warn(1,"Autogenerating tree-building action in rule
+					   \"$rule->{name}\"")
+		and
+		Parse::RecDescent::_hint("The directive <autotree> was specified,
+                                          so any production not ending
+                                          in an explicit action has
+                                          some parse-tree building code
+                                          automatically appended.");
+	}
+
+	$code .= 
+'
+
+		Parse::RecDescent::_trace(q{>>Matched production: ['
+					  . $self->describe . ']<<},
+					  Parse::RecDescent::_tracefirst($text),
+					  q{' . $rule->{name} . '},
+					  $tracelevel)
+						if defined $::RD_TRACE;
+		$_matched = 1;
+		last;
+	}
+
+';
+	return $code;
+}
+
+1;
+
+package Parse::RecDescent::Action;
+
+# Production item holding a block of user code.
+
+# Actions have no description.
+sub describe { undef }
+
+# Choose the %item key: '__ACTION<n>__', numbered per production.
+sub sethashname { $_[0]->{hashname} = '__ACTION' . ++$_[1]->{actcount} .'__'; }
+
+# Constructor. Arguments: ($class_or_object, $code, $lookahead, $line)
+sub new
+{
+	my $class = ref($_[0]) || $_[0];
+	bless 
+	{
+		"code"      => $_[1],
+		"lookahead" => $_[2],
+		"line"      => $_[3],
+	}, $class;
+}
+
+sub issubrule { undef }
+sub isterminal { 0 }
+
+# Return the Perl source that runs the action inside a production: the
+# action's value goes to $_tok (0 when $_noactions is set), the production is
+# left with "last" when the value is undefined (or defined, for a negative
+# lookahead), and otherwise the value is pushed onto @item. With a lookahead
+# the text is saved before and restored after the action.
+sub code($$$$)
+{
+	my ($self, $namespace, $rule) = @_;
+	
+'
+		Parse::RecDescent::_trace(q{Trying action},
+					  Parse::RecDescent::_tracefirst($text),
+					  q{' . $rule->{name} . '},
+					  $tracelevel)
+						if defined $::RD_TRACE;
+		' . ($self->{"lookahead"} ? '$_savetext = $text;' : '' ) .'
+
+		$_tok = ($_noactions) ? 0 : do ' . $self->{"code"} . ';
+		' . ($self->{"lookahead"}<0?'if':'unless') . ' (defined $_tok)
+		{
+			Parse::RecDescent::_trace(q{<<Didn\'t match action>> (return value: [undef])})
+					if defined $::RD_TRACE;
+			last;
+		}
+		Parse::RecDescent::_trace(q{>>Matched action<< (return value: [}
+					  . $_tok . q{])},
+					  Parse::RecDescent::_tracefirst($text))
+						if defined $::RD_TRACE;
+		push @item, $_tok;
+		' . ($self->{line}>=0 ? '$item{'. $self->{hashname} .'}=$_tok;' : '' ) .'
+		' . ($self->{"lookahead"} ? '$text = $_savetext;' : '' ) .'
+'
+}
+
+
+1;
+
+package Parse::RecDescent::Directive;
+
+# Production item holding the code of a directive.
+
+# Choose the %item key: '__DIRECTIVE<n>__', numbered per production.
+sub sethashname { $_[0]->{hashname} = '__DIRECTIVE' . ++$_[1]->{dircount} .  '__'; }
+
+sub issubrule { undef }
+sub isterminal { 0 }
+# The directive's name, or '' when called with a true argument.
+sub describe { $_[1] ? '' : $_[0]->{name} } 
+
+# Constructor. Arguments: ($class_or_object, $code, $lookahead, $line, $name)
+sub new ($$$$$)
+{
+	my $class = ref($_[0]) || $_[0];
+	bless 
+	{
+		"code"      => $_[1],
+		"lookahead" => $_[2],
+		"line"      => $_[3],
+		"name"      => $_[4],
+	}, $class;
+}
+
+# Return the Perl source that runs the directive inside a production: the
+# directive's value goes to $_tok, the production is left with "last" when
+# the value is undefined (or defined, for a negative lookahead), and
+# otherwise the value is pushed onto @item and stored in %item. With a
+# lookahead the text is saved before and restored after the directive.
+sub code($$$$)
+{
+	my ($self, $namespace, $rule) = @_;
+	
+'
+		' . ($self->{"lookahead"} ? '$_savetext = $text;' : '' ) .'
+
+		Parse::RecDescent::_trace(q{Trying directive: ['
+					. $self->describe . ']},
+					Parse::RecDescent::_tracefirst($text),
+					  q{' . $rule->{name} . '},
+					  $tracelevel)
+						if defined $::RD_TRACE; ' .'
+		$_tok = do { ' . $self->{"code"} . ' };
+		if (defined($_tok))
+		{
+			Parse::RecDescent::_trace(q{>>Matched directive<< (return value: [}
+						. $_tok . q{])},
+						Parse::RecDescent::_tracefirst($text))
+							if defined $::RD_TRACE;
+		}
+		else
+		{
+			Parse::RecDescent::_trace(q{<<Didn\'t match directive>>},
+						Parse::RecDescent::_tracefirst($text))
+							if defined $::RD_TRACE;
+		}
+		' . ($self->{"lookahead"} ? '$text = $_savetext and ' : '' ) .'
+		last '
+		. ($self->{"lookahead"}<0?'if':'unless') . ' defined $_tok;
+		push @item, $item{'.$self->{hashname}.'}=$_tok;
+		' . ($self->{"lookahead"} ? '$text = $_savetext;' : '' ) .'
+'
+}
+
+1;
+
+package Parse::RecDescent::UncondReject;
+
+# Production item whose generated code always sets $_tok to undef.
+
+sub issubrule { undef }
+sub isterminal { 0 }
+# The item's name, or '' when called with a true argument.
+sub describe { $_[1] ? '' : $_[0]->{name} }
+# Choose the %item key: '__DIRECTIVE<n>__', numbered per production.
+sub sethashname { $_[0]->{hashname} = '__DIRECTIVE' . ++$_[1]->{dircount} .  '__'; }
+
+# Constructor. Arguments: ($class_or_object, $lookahead, $line [, $name])
+sub new ($$$;$)
+{
+	my $class = ref($_[0]) || $_[0];
+	bless 
+	{
+		"lookahead" => $_[1],
+		"line"      => $_[2],
+		"name"      => $_[3],
+	}, $class;
+}
+
+# MARK, YOU MAY WANT TO OPTIMIZE THIS.
+
+
+# Return the Perl source for the reject: $return and $_tok are set to undef
+# and the production is left with "last" unless the lookahead is negative.
+sub code($$$$)
+{
+	my ($self, $namespace, $rule) = @_;
+	
+'
+		Parse::RecDescent::_trace(q{>>Rejecting production<< (found '
+					 . $self->describe . ')},
+					 Parse::RecDescent::_tracefirst($text),
+					  q{' . $rule->{name} . '},
+					  $tracelevel)
+						if defined $::RD_TRACE;
+		undef $return;
+		' . ($self->{"lookahead"} ? '$_savetext = $text;' : '' ) .'
+
+		$_tok = undef;
+		' . ($self->{"lookahead"} ? '$text = $_savetext and ' : '' ) .'
+		last '
+		. ($self->{"lookahead"}<0?'if':'unless') . ' defined $_tok;
+'
+}
+
+1;
+
+package Parse::RecDescent::Error;
+
+# Production item for the <error...> and <error?...> directives.
+
+sub issubrule { undef }
+sub isterminal { 0 }
+# '<error?:...>' for a commit-only error, '<error...>' otherwise; '' when
+# called with a true argument.
+sub describe { $_[1] ? '' : $_[0]->{commitonly} ? '<error?:...>' : '<error...>' }
+# Choose the %item key: '__DIRECTIVE<n>__', numbered per production.
+sub sethashname { $_[0]->{hashname} = '__DIRECTIVE' . ++$_[1]->{dircount} .  '__'; }
+
+# Constructor. Arguments: ($class_or_object, $msg, $lookahead, $commitonly, $line)
+sub new ($$$$$)
+{
+	my $class = ref($_[0]) || $_[0];
+	bless 
+	{
+		"msg"        => $_[1],
+		"lookahead"  => $_[2],
+		"commitonly" => $_[3],
+		"line"       => $_[4],
+	}, $class;
+}
+
+# Return the Perl source for the error item. The work is delegated to a
+# temporary Directive whose code pushes [message, $thisline] onto the
+# parser's error list (unless $_noactions is set) and evaluates to undef; for
+# a commit-only error this happens only when $commit is set, otherwise the
+# directive evaluates to 0.
+sub code($$$$)
+{
+	my ($self, $namespace, $rule) = @_;
+	
+	my $action = '';
+	
+	if ($self->{"msg"})  # ERROR MESSAGE SUPPLIED
+	{
+		#WAS: $action .= "Parse::RecDescent::_error(qq{$self->{msg}}" .  ',$thisline);'; 
+		$action .= 'push @{$thisparser->{errors}}, [qq{'.$self->{msg}.'},$thisline];'; 
+
+	}
+	else	  # GENERATE ERROR MESSAGE DURING PARSE
+	{
+		$action .= '
+		my $rule = $item[0];
+		   $rule =~ s/_/ /g;
+		#WAS: Parse::RecDescent::_error("Invalid $rule: " . $expectation->message() ,$thisline);
+		push @{$thisparser->{errors}}, ["Invalid $rule: " . $expectation->message() ,$thisline];
+		'; 
+	}
+
+	my $dir =
+	      new Parse::RecDescent::Directive('if (' .
+		($self->{"commitonly"} ? '$commit' : '1') . 
+		") { do {$action} unless ".' $_noactions; undef } else {0}',
+	        			$self->{"lookahead"},0,$self->describe); 
+	# The temporary directive stores its result under this item's %item key.
+	$dir->{hashname} = $self->{hashname};
+	return $dir->code($namespace, $rule, 0);
+}
+
+1;
+
+package Parse::RecDescent::Token;
+
+# Choose the %item key: '__PATTERN<n>__', numbered per production.
+sub sethashname { $_[0]->{hashname} = '__PATTERN' . ++$_[1]->{patcount} . '__'; }
+
+sub issubrule { undef }
+sub isterminal { 1 }
+# The description string prepared by the constructor.
+sub describe ($) { shift->{'description'}}
+
+
+# Constructor for a regular-expression terminal.
+# ARGS ARE: $self, $pattern, $left_delim, $modifiers, $lookahead, $linenum
+#
+# Derives the right delimiter from the left one, builds an escaped
+# description of the pattern, trial-compiles the pattern (issuing a warning
+# and a hint, not an exception, when that fails), and strips the modifiers
+# and anchors that the generated matching code must not contain.
+sub new ($$$$$$)
+{
+	my $class = ref($_[0]) || $_[0];
+	my $pattern = $_[1];
+	my $pat = $_[1];
+	my $ldel = $_[2];
+	my $rdel = $ldel;
+	# An opening bracket delimiter is closed by its matching bracket;
+	# any other delimiter closes itself.
+	$rdel =~ tr/{[(</}])>/;
+
+	my $mod = $_[3];
+
+	my $desc;
+
+	if ($ldel eq '/') { $desc = "$ldel$pattern$rdel$mod" }
+	else		  { $desc = "m$ldel$pattern$rdel$mod" }
+	# Escape backslashes, a trailing '$' and curly braces in the description.
+	$desc =~ s/\\/\\\\/g;
+	$desc =~ s/\$$/\\\$/g;
+	$desc =~ s/}/\\}/g;
+	$desc =~ s/{/\\{/g;
+
+	# Trial match against the empty string with warnings silenced; only a
+	# failure that also set $@ is reported.
+	if (!eval "no strict;
+		   local \$SIG{__WARN__} = sub {0};
+		   '' =~ m$ldel$pattern$rdel" and $@)
+	{
+		Parse::RecDescent::_warn(3, "Token pattern \"m$ldel$pattern$rdel\"
+					     may not be a valid regular expression",
+					   $_[5]);
+		$@ =~ s/ at \(eval.*/./;
+		Parse::RecDescent::_hint($@);
+	}
+
+	# QUIETLY PREVENT (WELL-INTENTIONED) CALAMITY
+	# (drop the /g and /c modifiers and any unescaped \G from the pattern)
+	$mod =~ s/[gc]//g;
+	$pattern =~ s/(\A|[^\\])\\G/$1/g;
+
+	bless 
+	{
+		"pattern"   => $pattern,
+		"ldelim"      => $ldel,
+		"rdelim"      => $rdel,
+		"mod"         => $mod,
+		"lookahead"   => $_[4],
+		"line"        => $_[5],
+		"description" => $desc,
+	}, $class;
+}
+
+
+# Generate the parser code that matches this regular-expression terminal.
+#
+# The generated code first consumes the $skip prefix, then tries to remove
+# the pattern (anchored with \A) from the front of $text. On failure it
+# records the failed expectation and leaves the enclosing block with
+# 'last'; on success it pushes the matched text ($&) onto @item and stores
+# it in %item under this item's hash name. With a lookahead, $text is saved
+# before and restored after the attempt, and a negative lookahead inverts
+# the test ('if' instead of 'unless').
+sub code($$$$)
+{
+	my ($self, $namespace, $rule, $check) = @_;
+	my $ldel = $self->{"ldelim"};
+	my $rdel = $self->{"rdelim"};
+	my $sdel = $ldel;
+	my $mod  = $self->{"mod"};
+
+	# For bracket delimiters the empty replacement part of s/// is written as '{}'.
+	$sdel =~ s/[[{(<]/{}/;
+	
+my $code = '
+		Parse::RecDescent::_trace(q{Trying terminal: [' . $self->describe
+					  . ']}, Parse::RecDescent::_tracefirst($text),
+					  q{' . $rule->{name} . '},
+					  $tracelevel)
+						if defined $::RD_TRACE;
+		$lastsep = "";
+		$expectation->is(q{' . ($rule->hasleftmost($self) ? ''
+				: $self->describe ) . '})->at($text);
+		' . ($self->{"lookahead"} ? '$_savetext = $text;' : '' ) . '
+
+		' . ($self->{"lookahead"}<0?'if':'unless')
+		. ' ($text =~ s/\A($skip)/$lastsep=$1 and ""/e and '
+		. ($check->{itempos}? 'do {'.Parse::RecDescent::Production::incitempos().' 1} and ' : '')
+		. '  $text =~ s' . $ldel . '\A(?:' . $self->{"pattern"} . ')'
+				 . $rdel . $sdel . $mod . ')
+		{
+			'.($self->{"lookahead"} ? '$text = $_savetext;' : '').'
+			$expectation->failed();
+			Parse::RecDescent::_trace(q{<<Didn\'t match terminal>>},
+						  Parse::RecDescent::_tracefirst($text))
+					if defined $::RD_TRACE;
+
+			last;
+		}
+		Parse::RecDescent::_trace(q{>>Matched terminal<< (return value: [}
+						. $& . q{])},
+						  Parse::RecDescent::_tracefirst($text))
+					if defined $::RD_TRACE;
+		push @item, $item{'.$self->{hashname}.'}=$&;
+		' . ($self->{"lookahead"} ? '$text = $_savetext;' : '' ) .'
+';
+
+	return $code;
+}
+
+1;
+
+package Parse::RecDescent::Literal;
+
+# Store the next '__STRINGn__' key on this item, incrementing the
+# 'strcount' counter kept in the second argument.
+sub sethashname
+{
+	my $self = shift;
+	my $serial = ++$_[0]->{strcount};
+	return $self->{hashname} = '__STRING' . $serial . '__';
+}
+
+# A literal never refers to a subrule.
+sub issubrule
+{
+	return undef;
+}
+
+# A literal is a terminal.
+sub isterminal
+{
+	return 1;
+}
+
+# The quoted, escaped form of the literal prepared by new().
+sub describe ($)
+{
+	my ($self) = @_;
+	return $self->{'description'};
+}
+
+# Constructor for a single-quoted literal terminal.
+# Arguments after the class (or an existing object): literal text,
+# lookahead flag, grammar line number.
+sub new ($$$$)
+{
+	my ($invocant, $pattern, $lookahead, $line) = @_;
+	my $class = ref($invocant) || $invocant;
+
+	# The description is the literal with backslashes and curly braces
+	# escaped, wrapped in single quotes.
+	(my $desc = $pattern) =~ s/\\/\\\\/g;
+	$desc =~ s/}/\\}/g;
+	$desc =~ s/{/\\{/g;
+
+	my $self =
+	{
+		"pattern"     => $pattern,
+		"lookahead"   => $lookahead,
+		"line"        => $line,
+		"description" => "'$desc'",
+	};
+	return bless $self, $class;
+}
+
+
+# Generate the parser code that matches this literal terminal.
+#
+# The generated code consumes the $skip prefix, then removes the literal
+# (escaped with quotemeta and anchored with \A) from the front of $text.
+# On failure it records the failed expectation and leaves the enclosing
+# block with 'last'; on success it pushes the matched text ($&) onto @item
+# and stores it in %item under this item's hash name. With a lookahead,
+# $text is saved before and restored after the attempt, and a negative
+# lookahead inverts the test.
+sub code($$$$)
+{
+	my ($self, $namespace, $rule, $check) = @_;
+	
+my $code = '
+		Parse::RecDescent::_trace(q{Trying terminal: [' . $self->describe
+					  . ']},
+					  Parse::RecDescent::_tracefirst($text),
+					  q{' . $rule->{name} . '},
+					  $tracelevel)
+						if defined $::RD_TRACE;
+		$lastsep = "";
+		$expectation->is(q{' . ($rule->hasleftmost($self) ? ''
+				: $self->describe ) . '})->at($text);
+		' . ($self->{"lookahead"} ? '$_savetext = $text;' : '' ) . '
+
+		' . ($self->{"lookahead"}<0?'if':'unless')
+		. ' ($text =~ s/\A($skip)/$lastsep=$1 and ""/e and '
+		. ($check->{itempos}? 'do {'.Parse::RecDescent::Production::incitempos().' 1} and ' : '')
+		. '  $text =~ s/\A' . quotemeta($self->{"pattern"}) . '//)
+		{
+			'.($self->{"lookahead"} ? '$text = $_savetext;' : '').'
+			$expectation->failed();
+			Parse::RecDescent::_trace(qq{<<Didn\'t match terminal>>},
+						  Parse::RecDescent::_tracefirst($text))
+							if defined $::RD_TRACE;
+			last;
+		}
+		Parse::RecDescent::_trace(q{>>Matched terminal<< (return value: [}
+						. $& . q{])},
+						  Parse::RecDescent::_tracefirst($text))
+							if defined $::RD_TRACE;
+		push @item, $item{'.$self->{hashname}.'}=$&;
+		' . ($self->{"lookahead"} ? '$text = $_savetext;' : '' ) .'
+';
+
+	return $code;
+}
+
+1;
+
+package Parse::RecDescent::InterpLit;
+
+# Store the next '__STRINGn__' key on this item, incrementing the
+# 'strcount' counter kept in the second argument.
+sub sethashname
+{
+	my $self = shift;
+	my $serial = ++$_[0]->{strcount};
+	return $self->{hashname} = '__STRING' . $serial . '__';
+}
+
+# An interpolated literal never refers to a subrule.
+sub issubrule
+{
+	return undef;
+}
+
+# An interpolated literal is a terminal.
+sub isterminal
+{
+	return 1;
+}
+
+# The quoted, escaped form of the literal prepared by new().
+sub describe ($)
+{
+	my ($self) = @_;
+	return $self->{'description'};
+}
+
+# Constructor for a double-quoted (interpolated) literal terminal.
+# Arguments after the class (or an existing object): literal text,
+# lookahead flag, grammar line number.
+sub new ($$$$)
+{
+	my ($invocant, $pattern, $lookahead, $line) = @_;
+	my $class = ref($invocant) || $invocant;
+
+	# Every forward slash in the stored pattern is backslash-escaped.
+	$pattern =~ s#/#\\/#g;
+
+	# The description is the stored pattern with backslashes and curly
+	# braces escaped, wrapped in single quotes.
+	(my $desc = $pattern) =~ s/\\/\\\\/g;
+	$desc =~ s/}/\\}/g;
+	$desc =~ s/{/\\{/g;
+
+	my $self =
+	{
+		"pattern"   => $pattern,
+		"lookahead" => $lookahead,
+		"line"      => $line,
+		"description" => "'$desc'",
+	};
+	return bless $self, $class;
+}
+
+# Generate the parser code that matches this interpolated literal.
+#
+# The generated code consumes the $skip prefix, evaluates the pattern as a
+# double-quoted string into $_tok at parse time, and compares it with the
+# front of $text using substr/eq; a match is removed from $text. On failure
+# it records the failed expectation and leaves the enclosing block with
+# 'last'; on success it pushes $_tok onto @item and stores it in %item
+# under this item's hash name. With a lookahead, $text is saved before and
+# restored after the attempt, and a negative lookahead inverts the test.
+sub code($$$$)
+{
+	my ($self, $namespace, $rule, $check) = @_;
+	
+my $code = '
+		Parse::RecDescent::_trace(q{Trying terminal: [' . $self->describe
+					  . ']},
+					  Parse::RecDescent::_tracefirst($text),
+					  q{' . $rule->{name} . '},
+					  $tracelevel)
+						if defined $::RD_TRACE;
+		$lastsep = "";
+		$expectation->is(q{' . ($rule->hasleftmost($self) ? ''
+				: $self->describe ) . '})->at($text);
+		' . ($self->{"lookahead"} ? '$_savetext = $text;' : '' ) . '
+
+		' . ($self->{"lookahead"}<0?'if':'unless')
+		. ' ($text =~ s/\A($skip)/$lastsep=$1 and ""/e and '
+		. ($check->{itempos}? 'do {'.Parse::RecDescent::Production::incitempos().' 1} and ' : '')
+		. '  do { $_tok = "' . $self->{"pattern"} . '"; 1 } and
+		     substr($text,0,length($_tok)) eq $_tok and
+		     do { substr($text,0,length($_tok)) = ""; 1; }
+		)
+		{
+			'.($self->{"lookahead"} ? '$text = $_savetext;' : '').'
+			$expectation->failed();
+			Parse::RecDescent::_trace(q{<<Didn\'t match terminal>>},
+						  Parse::RecDescent::_tracefirst($text))
+							if defined $::RD_TRACE;
+			last;
+		}
+		Parse::RecDescent::_trace(q{>>Matched terminal<< (return value: [}
+						. $_tok . q{])},
+						  Parse::RecDescent::_tracefirst($text))
+							if defined $::RD_TRACE;
+		push @item, $item{'.$self->{hashname}.'}=$_tok;
+		' . ($self->{"lookahead"} ? '$text = $_savetext;' : '' ) .'
+';
+
+	return $code;
+}
+
+1;
+
+package Parse::RecDescent::Subrule;
+
+# The name of the subrule this item calls.
+sub issubrule ($)
+{
+	my ($self) = @_;
+	return $self->{"subrule"};
+}
+
+# A subrule call is not a terminal.
+sub isterminal
+{
+	return 0;
+}
+
+# Subrule items get no generated hash name; nothing to do.
+sub sethashname
+{
+	return;
+}
+
+# Human-readable label: the implicit description if there is one, otherwise
+# the subrule name; wrapped as <matchrule:...> for matchrule items.
+sub describe ($)
+{
+	my ($self) = @_;
+	my $label = $self->{"implicit"} || $self->{"subrule"};
+	return $self->{"matchrule"} ? "<matchrule:$label>" : $label;
+}
+
+# Source text used to call the subrule. The second argument is the
+# namespace prefix. Matchrule items produce a symbolic call whose name is
+# interpolated when the generated code runs; ordinary items produce the
+# prefixed subrule name.
+sub callsyntax($$)
+{
+	my ($self, $prefix) = @_;
+
+	return $prefix . $self->{"subrule"}
+		unless $self->{"matchrule"};
+
+	return "&{'" . $prefix . "'.qq{" . $self->{subrule} . "}}";
+}
+
+# Constructor for a subrule call.
+# Arguments after the class (or an existing object): subrule name,
+# lookahead flag, grammar line number, and optionally an implicit
+# description, a matchrule flag and argument code. False values for the
+# implicit description and argument code are stored as undef.
+sub new ($$$$;$$$)
+{
+	my ($invocant, $subrule, $lookahead, $line,
+	    $implicit, $matchrule, $argcode) = @_;
+
+	my $self =
+	{
+		"subrule"   => $subrule,
+		"lookahead" => $lookahead,
+		"line"      => $line,
+		"implicit"  => $implicit || undef,
+		"matchrule" => $matchrule,
+		"argcode"   => $argcode || undef,
+	};
+	return bless $self, ref($invocant) || $invocant;
+}
+
+
+# Generate the parser code that calls this subrule.
+#
+# The generated code calls the subrule (via callsyntax) with the parser,
+# the remaining text, $repeating, a no-actions flag (forced to 1 for
+# lookaheads) and an argument-producing sub. If the call returns undef the
+# expectation is marked failed and the enclosing block is left with 'last';
+# otherwise the result is stored in %item under the subrule name and pushed
+# onto @item. With a lookahead, $text is saved before and restored after
+# the call, and a negative lookahead inverts the test.
+#
+# The "Matched subrule" trace message closes its "(return value: [...])"
+# parenthesis, matching the messages emitted for terminals.
+sub code($$$$)
+{
+	my ($self, $namespace, $rule) = @_;
+	
+'
+		Parse::RecDescent::_trace(q{Trying subrule: [' . $self->{"subrule"} . ']},
+				  Parse::RecDescent::_tracefirst($text),
+				  q{' . $rule->{"name"} . '},
+				  $tracelevel)
+					if defined $::RD_TRACE;
+		if (1) { no strict qw{refs};
+		$expectation->is(' . ($rule->hasleftmost($self) ? 'q{}'
+				# WAS : 'qq{'.$self->describe.'}' ) . ')->at($text);
+				: 'q{'.$self->describe.'}' ) . ')->at($text);
+		' . ($self->{"lookahead"} ? '$_savetext = $text;' : '' )
+		. ($self->{"lookahead"}<0?'if':'unless')
+		. ' (defined ($_tok = '
+		. $self->callsyntax($namespace.'::')
+		. '($thisparser,$text,$repeating,'
+		. ($self->{"lookahead"}?'1':'$_noactions')
+		. ($self->{argcode} ? ",sub { return $self->{argcode} }"
+				   : ',sub { \\@arg }')
+		. ')))
+		{
+			'.($self->{"lookahead"} ? '$text = $_savetext;' : '').'
+			Parse::RecDescent::_trace(q{<<Didn\'t match subrule: ['
+			. $self->{subrule} . ']>>},
+						  Parse::RecDescent::_tracefirst($text),
+						  q{' . $rule->{"name"} .'},
+						  $tracelevel)
+							if defined $::RD_TRACE;
+			$expectation->failed();
+			last;
+		}
+		Parse::RecDescent::_trace(q{>>Matched subrule: ['
+					. $self->{subrule} . ']<< (return value: [}
+					. $_tok . q{])},
+					  
+					  Parse::RecDescent::_tracefirst($text),
+					  q{' . $rule->{"name"} .'},
+					  $tracelevel)
+						if defined $::RD_TRACE;
+		$item{q{' . $self->{subrule} . '}} = $_tok;
+		push @item, $_tok;
+		' . ($self->{"lookahead"} ? '$text = $_savetext;' : '' ) .'
+		}
+'
+}
+
+package Parse::RecDescent::Repetition;
+
+# The name of the subrule being repeated.
+sub issubrule ($)
+{
+	my ($self) = @_;
+	return $self->{"subrule"};
+}
+
+# A repetition is not a terminal.
+sub isterminal
+{
+	return 0;
+}
+
+# Repetition items get no generated hash name; nothing to do.
+sub sethashname
+{
+	return;
+}
+
+# Human-readable label: the stored 'expected' text if there is one,
+# otherwise the subrule name; wrapped as <matchrule:...> for matchrule items.
+sub describe ($)
+{
+	my ($self) = @_;
+	my $label = $self->{"expected"} || $self->{"subrule"};
+	return $self->{"matchrule"} ? "<matchrule:$label>" : $label;
+}
+
+# Source text for a code reference to the repeated subrule. The second
+# argument is the namespace prefix. Matchrule items get an anonymous sub
+# that jumps (goto &) to a name interpolated when the generated code runs;
+# ordinary items get a plain reference to the prefixed subrule.
+sub callsyntax($$)
+{
+	my ($self, $prefix) = @_;
+	my $target = $prefix . $self->{subrule};
+
+	return '\\&' . $target
+		unless $self->{matchrule};
+
+	return "sub { goto &{''.qq{" . $target . "}} }";
+}
+
+# Constructor for a repeated subrule.
+# Arguments after the class (or an existing object): subrule name,
+# repetition specifier text, minimum and maximum counts, lookahead flag,
+# grammar line number, the parser object, matchrule flag and argument code.
+#
+# Note that with a lookahead and a minimum above zero this returns a plain
+# Parse::RecDescent::Subrule instead of a Repetition. With a lookahead and
+# a zero minimum it reports an error and a hint, then still constructs the
+# Repetition.
+sub new ($$$$$$$$$$)
+{
+	my ($self, $subrule, $repspec, $min, $max, $lookahead, $line, $parser, $matchrule, $argcode) = @_;
+	my $class = ref($self) || $self;
+	# Tolerate the bounds being given in the wrong order.
+	($max, $min) = ( $min, $max) if ($max<$min);
+
+	my $desc;
+	# For a generated alternation rule, describe it by what that rule expects.
+	if ($subrule=~/\A_alternation_\d+_of_production_\d+_of_rule/)
+		{ $desc = $parser->{"rules"}{$subrule}->expected }
+
+	if ($lookahead)
+	{
+		if ($min>0)
+		{
+		   return new Parse::RecDescent::Subrule($subrule,$lookahead,$line,$desc,$matchrule,$argcode);
+		}
+		else
+		{
+			Parse::RecDescent::_error("Not symbol (\"!\") before
+				            \"$subrule\" doesn't make
+					    sense.",$line);
+			Parse::RecDescent::_hint("Lookahead for negated optional
+					   repetitions (such as
+					   \"!$subrule($repspec)\" can never
+					   succeed, since optional items always
+					   match (zero times at worst). 
+					   Did you mean a single \"!$subrule\", 
+					   instead?");
+		}
+	}
+	bless 
+	{
+		"subrule"   => $subrule,
+		"repspec"   => $repspec,
+		"min"       => $min,
+		"max"       => $max,
+		"lookahead" => $lookahead,
+		"line"      => $line,
+		"expected"  => $desc,
+		"argcode"   => $argcode || undef,
+		"matchrule" => $matchrule,
+	}, $class;
+}
+
+# Generate the parser code for a repeated subrule.
+#
+# The generated code hands a reference to the subrule (via callsyntax),
+# the min/max counts, a no-actions flag (forced to 1 for lookaheads), the
+# expectation object and an optional argument-producing sub to
+# $thisparser->_parserepeat. If that returns undef the enclosing block is
+# left with 'last'; otherwise the result is stored in %item under
+# "subrule(repspec)" and pushed onto @item. With a lookahead, $text is
+# saved before and restored after the attempt.
+sub code($$$$)
+{
+	my ($self, $namespace, $rule) = @_;
+	
+	my ($subrule, $repspec, $min, $max, $lookahead) =
+		@{$self}{ qw{subrule repspec min max lookahead} };
+
+'
+		Parse::RecDescent::_trace(q{Trying repeated subrule: [' . $self->describe . ']},
+				  Parse::RecDescent::_tracefirst($text),
+				  q{' . $rule->{"name"} . '},
+				  $tracelevel)
+					if defined $::RD_TRACE;
+		$expectation->is(' . ($rule->hasleftmost($self) ? 'q{}'
+				# WAS : 'qq{'.$self->describe.'}' ) . ')->at($text);
+				: 'q{'.$self->describe.'}' ) . ')->at($text);
+		' . ($self->{"lookahead"} ? '$_savetext = $text;' : '' ) .'
+		unless (defined ($_tok = $thisparser->_parserepeat($text, '
+		. $self->callsyntax($namespace.'::')
+		. ', ' . $min . ', ' . $max . ', '
+		. ($self->{"lookahead"}?'1':'$_noactions')
+		. ',$expectation,'
+		. ($self->{argcode} ? "sub { return $self->{argcode} }"
+				   : 'undef')
+		. '))) 
+		{
+			Parse::RecDescent::_trace(q{<<Didn\'t match repeated subrule: ['
+			. $self->describe . ']>>},
+						  Parse::RecDescent::_tracefirst($text),
+						  q{' . $rule->{"name"} .'},
+						  $tracelevel)
+							if defined $::RD_TRACE;
+			last;
+		}
+		Parse::RecDescent::_trace(q{>>Matched repeated subrule: ['
+					. $self->{subrule} . ']<< (}
+					. @$_tok . q{ times)},
+					  
+					  Parse::RecDescent::_tracefirst($text),
+					  q{' . $rule->{"name"} .'},
+					  $tracelevel)
+						if defined $::RD_TRACE;
+		$item{q{' . "$self->{subrule}($self->{repspec})" . '}} = $_tok;
+		push @item, $_tok;
+		' . ($self->{"lookahead"} ? '$text = $_savetext;' : '' ) .'
+
+'
+}
+
+package Parse::RecDescent::Result;
+
+# A result item is neither a subrule nor a terminal and has no description.
+sub issubrule
+{
+	return 0;
+}
+
+sub isterminal
+{
+	return 0;
+}
+
+sub describe
+{
+	return '';
+}
+
+# Constructor: returns an empty object of the given class. The second
+# argument is accepted but not used.
+sub new
+{
+	my ($class, $pos) = @_;
+	my $self = {};
+	return bless $self, $class;
+}
+
+# Generated code: make the last matched item the return value.
+sub code($$$$)
+{
+	my ($self, $namespace, $rule) = @_;
+
+	my $snippet = '
+		$return = $item[-1];
+	';
+	return $snippet;
+}
+
+package Parse::RecDescent::Operator;
+
+my @opertype = ( " non-optional", "n optional" );
+
+# An operator directive is neither a subrule nor a terminal.
+sub issubrule
+{
+	return 0;
+}
+
+sub isterminal
+{
+	return 0;
+}
+
+# The '<leftop: ...>' / '<rightop: ...>' text built by new().
+sub describe
+{
+	return $_[0]->{"expected"};
+}
+
+# Store the next '__DIRECTIVEn__' key on this item, incrementing the
+# 'dircount' counter kept in the second argument.
+sub sethashname
+{
+	my $self = shift;
+	my $serial = ++$_[0]->{dircount};
+	return $self->{hashname} = '__DIRECTIVE' . $serial . '__';
+}
+
+
+# Constructor for a <leftop:...> / <rightop:...> directive.
+# Arguments after the class: 'left' or 'right', minimum and maximum
+# repetition counts, then the left argument, operator and right argument
+# items. The description is assembled from the three items' describe().
+sub new
+{
+	my ($class, $type, $minrep, $maxrep, $leftarg, $op, $rightarg) = @_;
+
+	my $kind  = "${type}op";
+	my $left  = $leftarg->describe;
+	my $mid   = $op->describe;
+	my $right = $rightarg->describe;
+
+	my $self =
+	{
+		"type"      => $kind,
+		"leftarg"   => $leftarg,
+		"op"        => $op,
+		"min"       => $minrep,
+		"max"       => $maxrep,
+		"rightarg"  => $rightarg,
+		"expected"  => "<$kind: $left $mid $right>",
+	};
+	return bless $self, $class;
+}
+
+# Generate the parser code for a <leftop:...> / <rightop:...> directive.
+#
+# The generated code runs inside a one-pass OPLOOP with its own @item list
+# and a $repcount counter. For a leftop it matches the left argument once
+# and then repeats (operator, right argument) pairs; for a rightop it
+# repeats (left argument, operator) pairs and then matches the right
+# argument once. Both loops are bounded by the maximum repetition count,
+# and $text is rolled back to the position after the last complete
+# repetition. A terminal operator's own match is popped from @item; for a
+# pattern-token operator a defined $1 capture is pushed instead.
+# If fewer than the minimum number of repetitions matched, the expectation
+# is marked failed and the enclosing block is left with 'last'; otherwise
+# the collected items (as an array reference) are stored in %item and
+# pushed onto the outer @item.
+#
+# The code for the three component items is produced by calling their own
+# code() with this call's namespace and rule arguments (@_[1..2]).
+#
+# The "Matched operator" trace message closes its "(return value: [...])"
+# parenthesis, matching the messages emitted for terminals.
+sub code($$$$)
+{
+	my ($self, $namespace, $rule) = @_;
+	
+	my ($leftarg, $op, $rightarg) =
+		@{$self}{ qw{leftarg op rightarg} };
+
+	my $code = '
+		Parse::RecDescent::_trace(q{Trying operator: [' . $self->describe . ']},
+				  Parse::RecDescent::_tracefirst($text),
+				  q{' . $rule->{"name"} . '},
+				  $tracelevel)
+					if defined $::RD_TRACE;
+		$expectation->is(' . ($rule->hasleftmost($self) ? 'q{}'
+				# WAS : 'qq{'.$self->describe.'}' ) . ')->at($text);
+				: 'q{'.$self->describe.'}' ) . ')->at($text);
+
+		$_tok = undef;
+		OPLOOP: while (1)
+		{
+		  $repcount = 0;
+		  my  @item;
+		  ';
+
+	if ($self->{type} eq "leftop" )
+	{
+		$code .= '
+		  # MATCH LEFTARG
+		  ' . $leftarg->code(@_[1..2]) . '
+
+		  $repcount++;
+
+		  my $savetext = $text;
+		  my $backtrack;
+
+		  # MATCH (OP RIGHTARG)(s)
+		  while ($repcount < ' . $self->{max} . ')
+		  {
+			$backtrack = 0;
+			' . $op->code(@_[1..2]) . '
+			' . ($op->isterminal() ? 'pop @item;' : '$backtrack=1;' ) . '
+			' . (ref($op) eq 'Parse::RecDescent::Token'
+				? 'if (defined $1) {push @item, $item{'.($self->{name}||$self->{hashname}).'}=$1; $backtrack=1;}'
+				: "" ) . '
+			' . $rightarg->code(@_[1..2]) . '
+			$savetext = $text;
+			$repcount++;
+		  }
+		  $text = $savetext;
+		  pop @item if $backtrack;
+
+		  ';
+	}
+	else
+	{
+		$code .= '
+		  my $savetext = $text;
+		  my $backtrack;
+		  # MATCH (LEFTARG OP)(s)
+		  while ($repcount < ' . $self->{max} . ')
+		  {
+			$backtrack = 0;
+			' . $leftarg->code(@_[1..2]) . '
+			$repcount++;
+			$backtrack = 1;
+			' . $op->code(@_[1..2]) . '
+			$savetext = $text;
+			' . ($op->isterminal() ? 'pop @item;' : "" ) . '
+			' . (ref($op) eq 'Parse::RecDescent::Token' ? 'do { push @item, $item{'.($self->{name}||$self->{hashname}).'}=$1; } if defined $1;' : "" ) . '
+		  }
+		  $text = $savetext;
+		  pop @item if $backtrack;
+
+		  # MATCH RIGHTARG
+		  ' . $rightarg->code(@_[1..2]) . '
+		  $repcount++;
+		  ';
+	}
+
+	# With a non-zero minimum, an empty item list means no match at all.
+	$code .= 'unless (@item) { undef $_tok; last }' unless $self->{min}==0;
+
+	$code .= '
+		  $_tok = [ @item ];
+		  last;
+		} 
+
+		unless ($repcount>='.$self->{min}.')
+		{
+			Parse::RecDescent::_trace(q{<<Didn\'t match operator: ['
+						  . $self->describe
+						  . ']>>},
+						  Parse::RecDescent::_tracefirst($text),
+						  q{' . $rule->{"name"} .'},
+						  $tracelevel)
+							if defined $::RD_TRACE;
+			$expectation->failed();
+			last;
+		}
+		Parse::RecDescent::_trace(q{>>Matched operator: ['
+					  . $self->describe
+					  . ']<< (return value: [}
+					  . qq{@{$_tok||[]}} . q{])},
+					  Parse::RecDescent::_tracefirst($text),
+					  q{' . $rule->{"name"} .'},
+					  $tracelevel)
+						if defined $::RD_TRACE;
+
+		push @item, $item{'.($self->{name}||$self->{hashname}).'}=$_tok||[];
+
+';
+	return $code;
+}
+
+
+package Parse::RecDescent::Expectation;
+
+# Records what the parser was expecting and what it found instead, so that
+# a failure message can be produced later.
+
+# Create a tracker. The argument is the fallback description that
+# message() uses when no specific expectation was recorded.
+sub new ($)
+{
+	my $fallback = $_[1];
+	my %state =
+	(
+		"failed"	  => 0,
+		"expected"	  => "",
+		"unexpected"	  => "",
+		"lastexpected"	  => "",
+		"lastunexpected"  => "",
+		"defexpected"	  => $fallback,
+	);
+	return bless \%state;
+}
+
+# Note what is about to be tried; returns the object so calls can chain.
+sub is ($$)
+{
+	my ($self, $wanted) = @_;
+	$self->{lastexpected} = $wanted;
+	return $self;
+}
+
+# Note the text the attempt starts at; returns the object so calls can chain.
+sub at ($$)
+{
+	my ($self, $where) = @_;
+	$self->{lastunexpected} = $where;
+	return $self;
+}
+
+# Mark the most recent attempt as failed. Only the first failure's
+# expected/unexpected pair is kept; attempts with an empty expectation
+# are ignored.
+sub failed ($)
+{
+	my ($self) = @_;
+	return unless $self->{lastexpected};
+	unless ($self->{failed})
+	{
+		$self->{expected}   = $self->{lastexpected};
+		$self->{unexpected} = $self->{lastunexpected};
+	}
+	$self->{failed} = 1;
+}
+
+# Build the "Was expecting ..." message. Underscores in the expectation
+# are shown as spaces; if non-blank text was found instead, the first line
+# of it (after leading whitespace) is quoted in the message.
+sub message ($)
+{
+	my ($self) = @_;
+	$self->{expected} ||= $self->{defexpected};
+	$self->{expected} =~ s/_/ /g;
+
+	my $found = $self->{unexpected};
+	return "Was expecting $self->{expected}"
+		if !$found || $found =~ /\A\s*\Z/s;
+
+	$found =~ /\s*(.*)/;
+	return "Was expecting $self->{expected} but found \"$1\" instead";
+}
+
+1;
+
+package Parse::RecDescent;
+
+use Carp;
+use vars qw ( $AUTOLOAD $VERSION );
+
+my $ERRORS = 0;
+
+$VERSION = '1.94';
+
+# BUILDING A PARSER
+
+# Counter for generated parser namespaces; advanced by string increment.
+my $nextnamespace = "namespace000001";
+
+# Return a fresh "Parse::RecDescent::namespaceNNNNNN" package name and
+# advance the counter.
+sub _nextnamespace()
+{
+	my $current = $nextnamespace++;
+	return "Parse::RecDescent::" . $current;
+}
+
+# Construct a parser object and build it from a grammar.
+# Arguments after the class (or an existing object): the grammar, a value
+# that is localized into $Parse::RecDescent::compiling for the duration of
+# the call, and an optional namespace suffix (a fresh namespace is
+# generated when it is not defined). If $::RD_AUTOACTION is set it is
+# wrapped in braces if necessary and stored as the automatic action.
+# Returns whatever Replace() returns for the remaining arguments.
+sub new ($$$)
+{
+	my $invocant = $_[0];
+	my $class = ref($invocant) || $invocant;
+        local $Parse::RecDescent::compiling = $_[2];
+
+	my $name_space_name;
+	if (defined $_[3])
+		{ $name_space_name = "Parse::RecDescent::".$_[3] }
+	else
+		{ $name_space_name = _nextnamespace() }
+
+	my $self = bless
+	{
+		"rules"     => {},
+		"namespace" => $name_space_name,
+		"startcode" => '',
+		"localvars" => '',
+		"_AUTOACTION" => undef,
+		"_AUTOTREE"   => undef,
+	}, $class;
+
+	if ($::RD_AUTOACTION)
+	{
+		my $sourcecode = $::RD_AUTOACTION;
+		unless ($sourcecode =~ /\A\s*\{.*\}\s*\Z/)
+			{ $sourcecode = "{ $sourcecode }" }
+		# Remember whether the auto-action refers to @itempos.
+		$self->{_check}{itempos} =
+			$sourcecode =~ /\@itempos\b|\$itempos\s*\[/;
+		$self->{_AUTOACTION}
+			= new Parse::RecDescent::Action($sourcecode,0,-1);
+	}
+
+	# Drop the invocant; the rest of @_ is passed on unchanged.
+	shift;
+	return $self->Replace(@_);
+}
+
+# Placeholder: grammar compilation is not available, so this always dies.
+sub Compile($$$$)
+{
+	my $complaint = "Compilation of Parse::RecDescent grammars not yet implemented\n";
+	die $complaint;
+}
+
+sub DESTROY { return }  # SO AUTOLOADER IGNORES IT
+
+# BUILDING A GRAMMAR....
+
+# Build the parser from a grammar with the replace flag set.
+# Inserts 1 as the third argument (the $replace parameter of _generate)
+# in front of any further arguments, then delegates to _generate.
+sub Replace ($$)
+{
+	splice(@_, 2, 0, 1);
+	return _generate(@_);
+}
+
+# Build the parser from a grammar with the replace flag cleared.
+# Inserts 0 as the third argument (the $replace parameter of _generate)
+# in front of any further arguments, then delegates to _generate.
+sub Extend ($$)
+{
+	splice(@_, 2, 0, 0);
+	return _generate(@_);
+}
+
+# Report a grammar item that appears before any rule has been started.
+# Arguments: a description of the kind of item, the grammar line number,
+# and optionally the item's text, which is quoted in the hint.
+sub _no_rule ($$;$)
+{
+	my ($what, $line, $name) = @_;
+
+	_error("Ruleless $what at start of grammar.",$line);
+
+	my $desc = $name ? qq{"$name"} : "";
+	_hint("You need to define a rule for the $what $desc
+	       to be part of.");
+}
+
+# Regular-expression source strings for the pieces of grammar text that
+# _generate recognizes. Most begin with \G so that they match at the
+# current position in the grammar string.
+
+# Lookahead prefixes, rule and production starts, terminals and subrules.
+my $NEGLOOKAHEAD	= '\G(\s*\.\.\.\!)';
+my $POSLOOKAHEAD	= '\G(\s*\.\.\.)';
+my $RULE		= '\G\s*(\w+)[ \t]*:';
+my $PROD		= '\G\s*([|])';
+my $TOKEN		= q{\G\s*/((\\\\/|[^/])*)/([cgimsox]*)};
+my $MTOKEN		= q{\G\s*(m\s*[^\w\s])};
+my $LITERAL		= q{\G\s*'((\\\\['\\\\]|[^'])*)'};
+my $INTERPLIT		= q{\G\s*"((\\\\["\\\\]|[^"])*)"};
+my $SUBRULE		= '\G\s*(\w+)';
+my $MATCHRULE		= '\G(\s*<matchrule:)';
+# Repetition specifiers; each may carry an optional /pattern/ ($SIMPLEPAT).
+my $SIMPLEPAT		= '((\\s+/[^/\\\\]*(?:\\\\.[^/\\\\]*)*/)?)';
+my $OPTIONAL		= '\G\((\?)'.$SIMPLEPAT.'\)';
+my $ANY			= '\G\((s\?)'.$SIMPLEPAT.'\)';
+my $MANY 		= '\G\((s|\.\.)'.$SIMPLEPAT.'\)';
+my $EXACTLY		= '\G\(([1-9]\d*)'.$SIMPLEPAT.'\)';
+my $BETWEEN		= '\G\((\d+)\.\.([1-9]\d*)'.$SIMPLEPAT.'\)';
+my $ATLEAST		= '\G\((\d+)\.\.'.$SIMPLEPAT.'\)';
+my $ATMOST		= '\G\(\.\.([1-9]\d*)'.$SIMPLEPAT.'\)';
+my $BADREP		= '\G\((-?\d+)?\.\.(-?\d+)?'.$SIMPLEPAT.'\)';
+# Actions, implicit subrules, comments and <...> directive markers.
+my $ACTION		= '\G\s*\{';
+my $IMPLICITSUBRULE	= '\G\s*\(';
+my $COMMENT		= '\G\s*(#.*)';
+my $COMMITMK		= '\G\s*<commit>';
+my $UNCOMMITMK		= '\G\s*<uncommit>';
+my $QUOTELIKEMK		= '\G\s*<perl_quotelike>';
+my $CODEBLOCKMK		= '\G\s*<perl_codeblock(?:\s+([][()<>{}]+))?>';
+my $VARIABLEMK		= '\G\s*<perl_variable>';
+my $NOCHECKMK		= '\G\s*<nocheck>';
+my $AUTOTREEMK		= '\G\s*<autotree>';
+my $AUTOSTUBMK		= '\G\s*<autostub>';
+my $AUTORULEMK		= '\G\s*<autorule:(.*?)>';
+my $REJECTMK		= '\G\s*<reject>';
+my $CONDREJECTMK	= '\G\s*<reject:';
+my $SCOREMK		= '\G\s*<score:';
+my $AUTOSCOREMK		= '\G\s*<autoscore:';
+my $SKIPMK		= '\G\s*<skip:';
+my $OPMK		= '\G\s*<(left|right)op(?:=(\'.*?\'))?:';
+my $ENDDIRECTIVEMK	= '\G\s*>';
+my $RESYNCMK		= '\G\s*<resync>';
+my $RESYNCPATMK		= '\G\s*<resync:';
+my $RULEVARPATMK	= '\G\s*<rulevar:';
+my $DEFERPATMK		= '\G\s*<defer:';
+my $TOKENPATMK		= '\G\s*<token:';
+my $AUTOERRORMK		= '\G\s*<error(\??)>';
+my $MSGERRORMK		= '\G\s*<error(\??):';
+my $UNCOMMITPROD	= $PROD.'\s*<uncommit';
+my $ERRORPROD		= $PROD.'\s*<error';
+my $LONECOLON		= '\G\s*:';
+my $OTHER		= '\G\s*([^\s]+)';
+
+my $lines = 0;
+
+sub _generate($$$;$$)
+{
+	my ($self, $grammar, $replace, $isimplicit, $isleftop) = (@_, 0);
+
+	my $aftererror = 0;
+	my $lookahead = 0;
+	my $lookaheadspec = "";
+	$lines = _linecount($grammar) unless $lines;
+	$self->{_check}{itempos} = ($grammar =~ /\@itempos\b|\$itempos\s*\[/)
+		unless $self->{_check}{itempos};
+	for (qw(thisoffset thiscolumn prevline prevoffset prevcolumn))
+	{
+		$self->{_check}{$_} =
+			($grammar =~ /\$$_/) || $self->{_check}{itempos}
+				unless $self->{_check}{$_};
+	}
+	my $line;
+
+	my $rule = undef;
+	my $prod = undef;
+	my $item = undef;
+	my $lastgreedy = '';
+	pos $grammar = 0;
+	study $grammar;
+
+	while (pos $grammar < length $grammar)
+	{
+		$line = $lines - _linecount($grammar) + 1;
+		my $commitonly;
+		my $code = "";
+		my @components = ();
+		if ($grammar =~ m/$COMMENT/gco)
+		{
+			_parse("a comment",0,$line);
+			next;
+		}
+		elsif ($grammar =~ m/$NEGLOOKAHEAD/gco)
+		{
+			_parse("a negative lookahead",$aftererror,$line);
+			$lookahead = $lookahead ? -$lookahead : -1;
+			$lookaheadspec .= $1;
+			next;	# SKIP LOOKAHEAD RESET AT END OF while LOOP
+		}
+		elsif ($grammar =~ m/$POSLOOKAHEAD/gco)
+		{
+			_parse("a positive lookahead",$aftererror,$line);
+			$lookahead = $lookahead ? $lookahead : 1;
+			$lookaheadspec .= $1;
+			next;	# SKIP LOOKAHEAD RESET AT END OF while LOOP
+		}
+		elsif ($grammar =~ m/(?=$ACTION)/gco
+			and do { ($code) = extract_codeblock($grammar); $code })
+		{
+			_parse("an action", $aftererror, $line, $code);
+			$item = new Parse::RecDescent::Action($code,$lookahead,$line);
+			$prod and $prod->additem($item)
+			      or  $self->_addstartcode($code);
+		}
+		elsif ($grammar =~ m/(?=$IMPLICITSUBRULE)/gco
+			and do { ($code) = extract_codeblock($grammar,'{([',undef,'(',1);
+				$code })
+		{
+			$code =~ s/\A\s*\(|\)\Z//g;
+			_parse("an implicit subrule", $aftererror, $line,
+				"( $code )");
+			my $implicit = $rule->nextimplicit;
+			$self->_generate("$implicit : $code",$replace,1);
+			my $pos = pos $grammar;
+			substr($grammar,$pos,0,$implicit);
+			pos $grammar = $pos;;
+		}
+		elsif ($grammar =~ m/$ENDDIRECTIVEMK/gco)
+		{
+
+		# EXTRACT TRAILING REPETITION SPECIFIER (IF ANY)
+
+			my ($minrep,$maxrep) = (1,$MAXREP);
+			if ($grammar =~ m/\G[(]/gc)
+			{
+				pos($grammar)--;
+
+				if ($grammar =~ m/$OPTIONAL/gco)
+					{ ($minrep, $maxrep) = (0,1) }
+				elsif ($grammar =~ m/$ANY/gco)
+					{ $minrep = 0 }
+				elsif ($grammar =~ m/$EXACTLY/gco)
+					{ ($minrep, $maxrep) = ($1,$1) }
+				elsif ($grammar =~ m/$BETWEEN/gco)
+					{ ($minrep, $maxrep) = ($1,$2) }
+				elsif ($grammar =~ m/$ATLEAST/gco)
+					{ $minrep = $1 }
+				elsif ($grammar =~ m/$ATMOST/gco)
+					{ $maxrep = $1 }
+				elsif ($grammar =~ m/$MANY/gco)
+					{ }
+				elsif ($grammar =~ m/$BADREP/gco)
+				{
+					_parse("an invalid repetition specifier", 0,$line);
+					_error("Incorrect specification of a repeated directive",
+					       $line);
+					_hint("Repeated directives cannot have
+					       a maximum repetition of zero, nor can they have
+					       negative components in their ranges.");
+				}
+			}
+			
+			$prod && $prod->enddirective($line,$minrep,$maxrep);
+		}
+		elsif ($grammar =~ m/\G\s*<[^m]/gc)
+		{
+			pos($grammar)-=2;
+
+			if ($grammar =~ m/$OPMK/gco)
+			{
+				# $DB::single=1;
+				_parse("a $1-associative operator directive", $aftererror, $line, "<$1op:...>");
+				$prod->adddirective($1, $line,$2||'');
+			}
+			elsif ($grammar =~ m/$UNCOMMITMK/gco)
+			{
+				_parse("an uncommit marker", $aftererror,$line);
+				$item = new Parse::RecDescent::Directive('$commit=0;1',
+								  $lookahead,$line,"<uncommit>");
+				$prod and $prod->additem($item)
+				      or  _no_rule("<uncommit>",$line);
+			}
+			elsif ($grammar =~ m/$QUOTELIKEMK/gco)
+			{
+				_parse("an perl quotelike marker", $aftererror,$line);
+				$item = new Parse::RecDescent::Directive(
+					'my ($match,@res);
+					 ($match,$text,undef,@res) =
+						  Text::Balanced::extract_quotelike($text,$skip);
+					  $match ? \@res : undef;
+					', $lookahead,$line,"<perl_quotelike>");
+				$prod and $prod->additem($item)
+				      or  _no_rule("<perl_quotelike>",$line);
+			}
+			elsif ($grammar =~ m/$CODEBLOCKMK/gco)
+			{
+				my $outer = $1||"{}";
+				_parse("an perl codeblock marker", $aftererror,$line);
+				$item = new Parse::RecDescent::Directive(
+					'Text::Balanced::extract_codeblock($text,undef,$skip,\''.$outer.'\');
+					', $lookahead,$line,"<perl_codeblock>");
+				$prod and $prod->additem($item)
+				      or  _no_rule("<perl_codeblock>",$line);
+			}
+			elsif ($grammar =~ m/$VARIABLEMK/gco)
+			{
+				_parse("an perl variable marker", $aftererror,$line);
+				$item = new Parse::RecDescent::Directive(
+					'Text::Balanced::extract_variable($text,$skip);
+					', $lookahead,$line,"<perl_variable>");
+				$prod and $prod->additem($item)
+				      or  _no_rule("<perl_variable>",$line);
+			}
+			elsif ($grammar =~ m/$NOCHECKMK/gco)
+			{
+				_parse("a disable checking marker", $aftererror,$line);
+				if ($rule)
+				{
+					_error("<nocheck> directive not at start of grammar", $line);
+					_hint("The <nocheck> directive can only
+					       be specified at the start of a
+					       grammar (before the first rule 
+					       is defined.");
+				}
+				else
+				{
+					local $::RD_CHECK = 1;
+				}
+			}
+			elsif ($grammar =~ m/$AUTOSTUBMK/gco)
+			{
+				_parse("an autostub marker", $aftererror,$line);
+				$::RD_AUTOSTUB = "";
+			}
+			elsif ($grammar =~ m/$AUTORULEMK/gco)
+			{
+				_parse("an autorule marker", $aftererror,$line);
+				$::RD_AUTOSTUB = $1;
+			}
+			elsif ($grammar =~ m/$AUTOTREEMK/gco)
+			{
+				_parse("an autotree marker", $aftererror,$line);
+				if ($rule)
+				{
+					_error("<autotree> directive not at start of grammar", $line);
+					_hint("The <autotree> directive can only
+					       be specified at the start of a
+					       grammar (before the first rule 
+					       is defined.");
+				}
+				else
+				{
+					undef $self->{_AUTOACTION};
+					$self->{_AUTOTREE}{NODE}
+						= new Parse::RecDescent::Action(q{{bless \%item, $item[0]}},0,-1);
+					$self->{_AUTOTREE}{TERMINAL}
+						= new Parse::RecDescent::Action(q{{bless {__VALUE__=>$item[1]}, $item[0]}},0,-1);
+				}
+			}
+
+			elsif ($grammar =~ m/$REJECTMK/gco)
+			{
+				_parse("an reject marker", $aftererror,$line);
+				$item = new Parse::RecDescent::UncondReject($lookahead,$line,"<reject>");
+				$prod and $prod->additem($item)
+				      or  _no_rule("<reject>",$line);
+			}
+			elsif ($grammar =~ m/(?=$CONDREJECTMK)/gco
+				and do { ($code) = extract_codeblock($grammar,'{',undef,'<');
+					  $code })
+			{
+				_parse("a (conditional) reject marker", $aftererror,$line);
+				$code =~ /\A\s*<reject:(.*)>\Z/s;
+				$item = new Parse::RecDescent::Directive(
+					      "($1) ? undef : 1", $lookahead,$line,"<reject:$code>");
+				$prod and $prod->additem($item)
+				      or  _no_rule("<reject:$code>",$line);
+			}
+			elsif ($grammar =~ m/(?=$SCOREMK)/gco
+				and do { ($code) = extract_codeblock($grammar,'{',undef,'<');
+					  $code })
+			{
+				_parse("a score marker", $aftererror,$line);
+				$code =~ /\A\s*<score:(.*)>\Z/s;
+				$prod and $prod->addscore($1, $lookahead, $line)
+				      or  _no_rule($code,$line);
+			}
+			elsif ($grammar =~ m/(?=$AUTOSCOREMK)/gco
+				and do { ($code) = extract_codeblock($grammar,'{',undef,'<');
+					 $code;
+				       } )
+			{
+				_parse("an autoscore specifier", $aftererror,$line,$code);
+				$code =~ /\A\s*<autoscore:(.*)>\Z/s;
+
+				$rule and $rule->addautoscore($1,$self)
+				      or  _no_rule($code,$line);
+
+				$item = new Parse::RecDescent::UncondReject($lookahead,$line,$code);
+				$prod and $prod->additem($item)
+				      or  _no_rule($code,$line);
+			}
+			elsif ($grammar =~ m/$RESYNCMK/gco)
+			{
+				_parse("a resync to newline marker", $aftererror,$line);
+				$item = new Parse::RecDescent::Directive(
+					      'if ($text =~ s/\A[^\n]*\n//) { $return = 0; $& } else { undef }',
+					      $lookahead,$line,"<resync>");
+				$prod and $prod->additem($item)
+				      or  _no_rule("<resync>",$line);
+			}
+			elsif ($grammar =~ m/(?=$RESYNCPATMK)/gco
+				and do { ($code) = extract_bracketed($grammar,'<');
+					  $code })
+			{
+				_parse("a resync with pattern marker", $aftererror,$line);
+				$code =~ /\A\s*<resync:(.*)>\Z/s;
+				$item = new Parse::RecDescent::Directive(
+					      'if ($text =~ s/\A'.$1.'//) { $return = 0; $& } else { undef }',
+					      $lookahead,$line,$code);
+				$prod and $prod->additem($item)
+				      or  _no_rule($code,$line);
+			}
+			elsif ($grammar =~ m/(?=$SKIPMK)/gco
+				and do { ($code) = extract_codeblock($grammar,'<');
+					  $code })
+			{
+				_parse("a skip marker", $aftererror,$line);
+				$code =~ /\A\s*<skip:(.*)>\Z/s;
+				$item = new Parse::RecDescent::Directive(
+					      'my $oldskip = $skip; $skip='.$1.'; $oldskip',
+					      $lookahead,$line,$code);
+				$prod and $prod->additem($item)
+				      or  _no_rule($code,$line);
+			}
+			elsif ($grammar =~ m/(?=$RULEVARPATMK)/gco
+				and do { ($code) = extract_codeblock($grammar,'{',undef,'<');
+					 $code;
+				       } )
+			{
+				_parse("a rule variable specifier", $aftererror,$line,$code);
+				$code =~ /\A\s*<rulevar:(.*)>\Z/s;
+
+				$rule and $rule->addvar($1,$self)
+				      or  _no_rule($code,$line);
+
+				$item = new Parse::RecDescent::UncondReject($lookahead,$line,$code);
+				$prod and $prod->additem($item)
+				      or  _no_rule($code,$line);
+			}
+			elsif ($grammar =~ m/(?=$DEFERPATMK)/gco
+				and do { ($code) = extract_codeblock($grammar,'{',undef,'<');
+					 $code;
+				       } )
+			{
+				_parse("a deferred action specifier", $aftererror,$line,$code);
+				$code =~ s/\A\s*<defer:(.*)>\Z/$1/s;
+				if ($code =~ /\A\s*[^{]|[^}]\s*\Z/)
+				{
+					$code = "{ $code }"
+				}
+
+				$item = new Parse::RecDescent::Directive(
+					      "push \@{\$thisparser->{deferred}}, sub $code;",
+					      $lookahead,$line,"<defer:$code>");
+				$prod and $prod->additem($item)
+				      or  _no_rule("<defer:$code>",$line);
+
+				$self->{deferrable} = 1;
+			}
+			elsif ($grammar =~ m/(?=$TOKENPATMK)/gco
+				and do { ($code) = extract_codeblock($grammar,'{',undef,'<');
+					 $code;
+				       } )
+			{
+				_parse("a token constructor", $aftererror,$line,$code);
+				$code =~ s/\A\s*<token:(.*)>\Z/$1/s;
+
+				my $types = eval 'no strict; local $SIG{__WARN__} = sub {0}; my @arr=('.$code.'); @arr' || (); 
+				if (!$types)
+				{
+					_error("Incorrect token specification: \"$@\"", $line);
+					_hint("The <token:...> directive requires a list
+					       of one or more strings representing possible
+					       types of the specified token. For example:
+					       <token:NOUN,VERB>");
+				}
+				else
+				{
+					$item = new Parse::RecDescent::Directive(
+						      'no strict;
+						       $return = { text => $item[-1] };
+						       @{$return->{type}}{'.$code.'} = (1..'.$types.');',
+						      $lookahead,$line,"<token:$code>");
+					$prod and $prod->additem($item)
+					      or  _no_rule("<token:$code>",$line);
+				}
+			}
+			elsif ($grammar =~ m/$COMMITMK/gco)
+			{
+				_parse("an commit marker", $aftererror,$line);
+				$item = new Parse::RecDescent::Directive('$commit = 1',
+								  $lookahead,$line,"<commit>");
+				$prod and $prod->additem($item)
+				      or  _no_rule("<commit>",$line);
+			}
+			elsif ($grammar =~ m/$AUTOERRORMK/gco)
+			{
+				$commitonly = $1;
+				_parse("an error marker", $aftererror,$line);
+				$item = new Parse::RecDescent::Error('',$lookahead,$1,$line);
+				$prod and $prod->additem($item)
+				      or  _no_rule("<error>",$line);
+				$aftererror = !$commitonly;
+			}
+			elsif ($grammar =~ m/(?=$MSGERRORMK)/gco
+				and do { $commitonly = $1;
+					 ($code) = extract_bracketed($grammar,'<');
+					$code })
+			{
+				_parse("an error marker", $aftererror,$line,$code);
+				$code =~ /\A\s*<error\??:(.*)>\Z/s;
+				$item = new Parse::RecDescent::Error($1,$lookahead,$commitonly,$line);
+				$prod and $prod->additem($item)
+				      or  _no_rule("$code",$line);
+				$aftererror = !$commitonly;
+			}
+			elsif (do { $commitonly = $1;
+					 ($code) = extract_bracketed($grammar,'<');
+					$code })
+			{
+				if ($code =~ /^<[A-Z_]+>$/)
+				{
+					_error("Token items are not yet
+					supported: \"$code\"",
+					       $line);
+					_hint("Items like $code that consist of angle
+					brackets enclosing a sequence of
+					uppercase characters will eventually
+					be used to specify pre-lexed tokens
+					in a grammar. That functionality is not
+					yet implemented. Or did you misspell
+					\"$code\"?");
+				}
+				else
+				{
+					_error("Untranslatable item encountered: \"$code\"",
+					       $line);
+					_hint("Did you misspell \"$code\"
+						   or forget to comment it out?");
+				}
+			}
+		}
+		elsif ($grammar =~ m/$RULE/gco)
+		{
+			_parseunneg("a rule declaration", 0,
+				    $lookahead,$line) or next;
+			my $rulename = $1;
+			if ($rulename =~ /Replace|Extend|Precompile|Save/ )
+			{	
+				_warn(2,"Rule \"$rulename\" hidden by method
+				       Parse::RecDescent::$rulename",$line)
+				and
+				_hint("The rule named \"$rulename\" cannot be directly
+                                       called through the Parse::RecDescent object
+                                       for this grammar (although it may still
+                                       be used as a subrule of other rules).
+                                       It can't be directly called because
+				       Parse::RecDescent::$rulename is already defined (it
+				       is the standard method of all
+				       parsers).");
+			}
+			$rule = new Parse::RecDescent::Rule($rulename,$self,$line,$replace);
+			$prod->check_pending($line) if $prod;
+			$prod = $rule->addprod( new Parse::RecDescent::Production );
+			$aftererror = 0;
+		}
+		elsif ($grammar =~ m/$UNCOMMITPROD/gco)
+		{
+			pos($grammar)-=9;
+			_parseunneg("a new (uncommitted) production",
+				    0, $lookahead, $line) or next;
+
+			$prod->check_pending($line) if $prod;
+			$prod = new Parse::RecDescent::Production($line,1);
+			$rule and $rule->addprod($prod)
+			      or  _no_rule("<uncommit>",$line);
+			$aftererror = 0;
+		}
+		elsif ($grammar =~ m/$ERRORPROD/gco)
+		{
+			pos($grammar)-=6;
+			_parseunneg("a new (error) production", $aftererror,
+				    $lookahead,$line) or next;
+			$prod->check_pending($line) if $prod;
+			$prod = new Parse::RecDescent::Production($line,0,1);
+			$rule and $rule->addprod($prod)
+			      or  _no_rule("<error>",$line);
+			$aftererror = 0;
+		}
+		elsif ($grammar =~ m/$PROD/gco)
+		{
+			_parseunneg("a new production", 0,
+				    $lookahead,$line) or next;
+			$rule
+			  and (!$prod || $prod->check_pending($line))
+			  and $prod = $rule->addprod(new Parse::RecDescent::Production($line))
+			or  _no_rule("production",$line);
+			$aftererror = 0;
+		}
+		elsif ($grammar =~ m/$LITERAL/gco)
+		{
+			($code = $1) =~ s/\\\\/\\/g;
+			_parse("a literal terminal", $aftererror,$line,$1);
+			$item = new Parse::RecDescent::Literal($code,$lookahead,$line);
+			$prod and $prod->additem($item)
+			      or  _no_rule("literal terminal",$line,"'$1'");
+		}
+		elsif ($grammar =~ m/$INTERPLIT/gco)
+		{
+			_parse("an interpolated literal terminal", $aftererror,$line);
+			$item = new Parse::RecDescent::InterpLit($1,$lookahead,$line);
+			$prod and $prod->additem($item)
+			      or  _no_rule("interpolated literal terminal",$line,"'$1'");
+		}
+		elsif ($grammar =~ m/$TOKEN/gco)
+		{
+			_parse("a /../ pattern terminal", $aftererror,$line);
+			$item = new Parse::RecDescent::Token($1,'/',$3?$3:'',$lookahead,$line);
+			$prod and $prod->additem($item)
+			      or  _no_rule("pattern terminal",$line,"/$1/");
+		}
+		elsif ($grammar =~ m/(?=$MTOKEN)/gco
+			and do { ($code, undef, @components)
+					= extract_quotelike($grammar);
+				 $code }
+		      )
+
+		{
+			_parse("an m/../ pattern terminal", $aftererror,$line,$code);
+			$item = new Parse::RecDescent::Token(@components[3,2,8],
+							     $lookahead,$line);
+			$prod and $prod->additem($item)
+			      or  _no_rule("pattern terminal",$line,$code);
+		}
+		elsif ($grammar =~ m/(?=$MATCHRULE)/gco
+				and do { ($code) = extract_bracketed($grammar,'<');
+					 $code
+				       }
+		       or $grammar =~ m/$SUBRULE/gco
+				and $code = $1)
+		{
+			my $name = $code;
+			my $matchrule = 0;
+			if (substr($name,0,1) eq '<')
+			{
+				$name =~ s/$MATCHRULE\s*//;
+				$name =~ s/\s*>\Z//;
+				$matchrule = 1;
+			}
+
+		# EXTRACT TRAILING ARG LIST (IF ANY)
+
+			my ($argcode) = extract_codeblock($grammar, "[]",'') || '';
+
+		# EXTRACT TRAILING REPETITION SPECIFIER (IF ANY)
+
+			if ($grammar =~ m/\G[(]/gc)
+			{
+				pos($grammar)--;
+
+				if ($grammar =~ m/$OPTIONAL/gco)
+				{
+					_parse("an zero-or-one subrule match", $aftererror,$line,"$code$argcode($1)");
+					$item = new Parse::RecDescent::Repetition($name,$1,0,1,
+									   $lookahead,$line,
+									   $self,
+									   $matchrule,
+									   $argcode);
+					$prod and $prod->additem($item)
+					      or  _no_rule("repetition",$line,"$code$argcode($1)");
+
+					!$matchrule and $rule and $rule->addcall($name);
+				}
+				elsif ($grammar =~ m/$ANY/gco)
+				{
+					_parse("a zero-or-more subrule match", $aftererror,$line,"$code$argcode($1)");
+					if ($2)
+					{
+						my $pos = pos $grammar;
+						substr($grammar,$pos,0,
+						       "<leftop='$name(s?)': $name $2 $name>(s?) ");
+
+						pos $grammar = $pos;
+					}
+					else
+					{
+						$item = new Parse::RecDescent::Repetition($name,$1,0,$MAXREP,
+										   $lookahead,$line,
+										   $self,
+										   $matchrule,
+										   $argcode);
+						$prod and $prod->additem($item)
+						      or  _no_rule("repetition",$line,"$code$argcode($1)");
+
+						!$matchrule and $rule and $rule->addcall($name);
+
+						_check_insatiable($name,$1,$grammar,$line) if $::RD_CHECK;
+					}
+				}
+				elsif ($grammar =~ m/$MANY/gco)
+				{
+					_parse("a one-or-more subrule match", $aftererror,$line,"$code$argcode($1)");
+					if ($2)
+					{
+						# $DB::single=1;
+						my $pos = pos $grammar;
+						substr($grammar,$pos,0,
+						       "<leftop='$name(s)': $name $2 $name> ");
+
+						pos $grammar = $pos;
+					}
+					else
+					{
+						$item = new Parse::RecDescent::Repetition($name,$1,1,$MAXREP,
+										   $lookahead,$line,
+										   $self,
+										   $matchrule,
+										   $argcode);
+										   
+						$prod and $prod->additem($item)
+						      or  _no_rule("repetition",$line,"$code$argcode($1)");
+
+						!$matchrule and $rule and $rule->addcall($name);
+
+						_check_insatiable($name,$1,$grammar,$line) if $::RD_CHECK;
+					}
+				}
+				elsif ($grammar =~ m/$EXACTLY/gco)
+				{
+					_parse("an exactly-$1-times subrule match", $aftererror,$line,"$code$argcode($1)");
+					if ($2)
+					{
+						my $pos = pos $grammar;
+						substr($grammar,$pos,0,
+						       "<leftop='$name($1)': $name $2 $name>($1) ");
+
+						pos $grammar = $pos;
+					}
+					else
+					{
+						$item = new Parse::RecDescent::Repetition($name,$1,$1,$1,
+										   $lookahead,$line,
+										   $self,
+										   $matchrule,
+										   $argcode);
+						$prod and $prod->additem($item)
+						      or  _no_rule("repetition",$line,"$code$argcode($1)");
+
+						!$matchrule and $rule and $rule->addcall($name);
+					}
+				}
+				elsif ($grammar =~ m/$BETWEEN/gco)
+				{
+					_parse("a $1-to-$2 subrule match", $aftererror,$line,"$code$argcode($1..$2)");
+					if ($3)
+					{
+						my $pos = pos $grammar;
+						substr($grammar,$pos,0,
+						       "<leftop='$name($1..$2)': $name $3 $name>($1..$2) ");
+
+						pos $grammar = $pos;
+					}
+					else
+					{
+						$item = new Parse::RecDescent::Repetition($name,"$1..$2",$1,$2,
+										   $lookahead,$line,
+										   $self,
+										   $matchrule,
+										   $argcode);
+						$prod and $prod->additem($item)
+						      or  _no_rule("repetition",$line,"$code$argcode($1..$2)");
+
+						!$matchrule and $rule and $rule->addcall($name);
+					}
+				}
+				elsif ($grammar =~ m/$ATLEAST/gco)
+				{
+					_parse("a $1-or-more subrule match", $aftererror,$line,"$code$argcode($1..)");
+					if ($2)
+					{
+						my $pos = pos $grammar;
+						substr($grammar,$pos,0,
+						       "<leftop='$name($1..)': $name $2 $name>($1..) ");
+
+						pos $grammar = $pos;
+					}
+					else
+					{
+						$item = new Parse::RecDescent::Repetition($name,"$1..",$1,$MAXREP,
+										   $lookahead,$line,
+										   $self,
+										   $matchrule,
+										   $argcode);
+						$prod and $prod->additem($item)
+						      or  _no_rule("repetition",$line,"$code$argcode($1..)");
+
+						!$matchrule and $rule and $rule->addcall($name);
+						_check_insatiable($name,"$1..",$grammar,$line) if $::RD_CHECK;
+					}
+				}
+				elsif ($grammar =~ m/$ATMOST/gco)
+				{
+					_parse("a one-to-$1 subrule match", $aftererror,$line,"$code$argcode(..$1)");
+					if ($2)
+					{
+						my $pos = pos $grammar;
+						substr($grammar,$pos,0,
+						       "<leftop='$name(..$1)': $name $2 $name>(..$1) ");
+
+						pos $grammar = $pos;
+					}
+					else
+					{
+						$item = new Parse::RecDescent::Repetition($name,"..$1",1,$1,
+										   $lookahead,$line,
+										   $self,
+										   $matchrule,
+										   $argcode);
+						$prod and $prod->additem($item)
+						      or  _no_rule("repetition",$line,"$code$argcode(..$1)");
+
+						!$matchrule and $rule and $rule->addcall($name);
+					}
+				}
+				elsif ($grammar =~ m/$BADREP/gco)
+				{
+					_parse("an subrule match with invalid repetition specifier", 0,$line);
+					_error("Incorrect specification of a repeated subrule",
+					       $line);
+					_hint("Repeated subrules like \"$code$argcode$&\" cannot have
+					       a maximum repetition of zero, nor can they have
+					       negative components in their ranges.");
+				}
+			}
+			else
+			{
+				_parse("a subrule match", $aftererror,$line,$code);
+				my $desc;
+				if ($name=~/\A_alternation_\d+_of_production_\d+_of_rule/)
+					{ $desc = $self->{"rules"}{$name}->expected }
+				$item = new Parse::RecDescent::Subrule($name,
+								       $lookahead,
+								       $line,
+								       $desc,
+								       $matchrule,
+								       $argcode);
+	 
+				$prod and $prod->additem($item)
+				      or  _no_rule("(sub)rule",$line,$name);
+
+				!$matchrule and $rule and $rule->addcall($name);
+			}
+		}
+		elsif ($grammar =~ m/$LONECOLON/gco   )
+		{
+			_error("Unexpected colon encountered", $line);
+			_hint("Did you mean \"|\" (to start a new production)?
+			           Or perhaps you forgot that the colon
+				   in a rule definition must be
+				   on the same line as the rule name?");
+		}
+		elsif ($grammar =~ m/$ACTION/gco   ) # BAD ACTION, ALREADY FAILED
+		{
+			_error("Malformed action encountered",
+			       $line);
+			_hint("Did you forget the closing curly bracket
+			       or is there a syntax error in the action?");
+		}
+		elsif ($grammar =~ m/$OTHER/gco   )
+		{
+			_error("Untranslatable item encountered: \"$1\"",
+			       $line);
+			_hint("Did you misspell \"$1\"
+			           or forget to comment it out?");
+		}
+
+		if ($lookaheadspec =~ tr /././ > 3)
+		{
+			$lookaheadspec =~ s/\A\s+//;
+			$lookahead = $lookahead<0
+					? 'a negative lookahead ("...!")'
+					: 'a positive lookahead ("...")' ;
+			_warn(1,"Found two or more lookahead specifiers in a
+			       row.",$line)
+			and
+			_hint("Multiple positive and/or negative lookaheads
+			       are simply multiplied together to produce a
+			       single positive or negative lookahead
+			       specification. In this case the sequence
+			       \"$lookaheadspec\" was reduced to $lookahead.
+			       Was this your intention?");
+		}
+		$lookahead = 0;
+		$lookaheadspec = "";
+
+		$grammar =~ m/\G\s+/gc;
+	}
+
+	unless ($ERRORS or $isimplicit or !$::RD_CHECK)
+	{
+		$self->_check_grammar();
+	}
+
+	unless ($ERRORS or $isimplicit or $Parse::RecDescent::compiling)
+	{
+		my $code = $self->_code();
+		if (defined $::RD_TRACE)
+		{
+			print STDERR "printing code (", length($code),") to RD_TRACE\n";
+			local *TRACE_FILE;
+			open TRACE_FILE, ">RD_TRACE"
+			and print TRACE_FILE "my \$ERRORS;\n$code"
+			and close TRACE_FILE;
+		}
+
+		unless ( eval "$code 1" )
+		{
+			_error("Internal error in generated parser code!");
+			$@ =~ s/at grammar/in grammar at/;
+			_hint($@);
+		}
+	}
+
+	if ($ERRORS and !_verbosity("HINT"))
+	{
+		local $::RD_HINT = 1;
+		_hint('Set $::RD_HINT (or -RD_HINT if you\'re using "perl -s")
+		       for hints on fixing these problems.');
+	}
+	if ($ERRORS) { $ERRORS=0; return }
+	return $self;
+}
+
+
+sub _addstartcode($$)	# ($parser, $action): APPEND AN ACTION'S CODE TO $parser->{"startcode"}
+{
+	my ($parser, $action) = @_;
+	# PEEL OFF THE OUTERMOST CURLY BRACKETS, IF THE ACTION IS WRAPPED IN THEM
+	$action =~ s/\A\s*\{(.*)\}\Z/$1/s;
+	$parser->{"startcode"} .= "$action;\n";
+}
+
+# CHECK FOR GRAMMAR PROBLEMS....
+# (CALLED WHILE SCANNING THE GRAMMAR, AFTER A REPEATED SUBRULE, WHEN $::RD_CHECK IS SET)
+sub _check_insatiable($$$$)	# ($subrule, $repspec, $grammar, $line)
+{
+	my ($subrule,$repspec,$grammar,$line) = @_;
+	pos($grammar)=pos($_[2]);	# THE COPY RESUMES WHERE THE CALLER'S SCAN OF THE GRAMMAR STOPPED
+	return if $grammar =~ m/$OPTIONAL/gco || $grammar =~ m/$ANY/gco;	# NOTHING TO REPORT
+	my $min = 1;
+	if ( $grammar =~ m/$MANY/gco
+	  || $grammar =~ m/$EXACTLY/gco
+	  || $grammar =~ m/$ATMOST/gco
+	  || $grammar =~ m/$BETWEEN/gco && do { $min=$2; 1 }	# NOTE(review): THE GRAMMAR SCANNER READS THE
+	  || $grammar =~ m/$ATLEAST/gco && do { $min=$2; 1 }	# LOWER BOUND OF THESE TWO FROM $1 -- CONFIRM $2
+	  || $grammar =~ m/$SUBRULE(?!\s*:)/gco
+	   )
+	{
+		return unless $1 eq $subrule && $min > 0;	# ONLY A FOLLOWING ITEM WITH THE SAME NAME IS SUSPECT
+		_warn(3,"Subrule sequence \"$subrule($repspec) $&\" will
+		       (almost certainly) fail.",$line)
+		and
+		_hint("Unless subrule \"$subrule\" performs some cunning
+		       lookahead, the repetition \"$subrule($repspec)\" will
+		       insatiably consume as many matches of \"$subrule\" as it
+		       can, leaving none to match the \"$&\" that follows.");
+	}
+}
+
+sub _check_grammar ($)	# ($parser): WARN ABOUT UNDEFINED SUBRULES, REPORT LEFT-RECURSIVE RULES
+{
+	my $self = shift;
+	my $rules = $self->{"rules"};
+	my $rule;
+	foreach $rule ( values %$rules )
+	{
+		next if ! $rule->{"changed"};	# ONLY RULES FLAGGED AS CHANGED ARE EXAMINED
+
+	# CHECK FOR UNDEFINED RULES
+
+		my $call;
+		foreach $call ( @{$rule->{"calls"}} )
+		{
+			if (!defined ${$rules}{$call}
+			  &&!defined &{"Parse::RecDescent::$call"})	# NEITHER A RULE NOR A Parse::RecDescent SUB
+			{
+				if (!defined $::RD_AUTOSTUB)
+				{
+					_warn(3,"Undefined (sub)rule \"$call\"
+					      used in a production.")
+					and
+					_hint("Will you be providing this rule
+					       later, or did you perhaps
+					       misspell \"$call\"? Otherwise
+					       it will be treated as an 
+					       immediate <reject>.");
+					eval "sub $self->{namespace}::$call {undef}";	# STUB IN THE PARSER'S NAMESPACE
+				}
+				else	# EXPERIMENTAL
+				{
+					my $rule = $::RD_AUTOSTUB || qq{'$call'};	# STUB BODY: RD_AUTOSTUB, ELSE THE QUOTED NAME
+					_warn(1,"Autogenerating rule: $call")
+					and
+					_hint("A call was made to a subrule
+					       named \"$call\", but no such
+					       rule was specified. However,
+					       since \$::RD_AUTOSTUB
+					       was defined, a rule stub
+					       ($call : $rule) was
+					       automatically created.");
+
+					$self->_generate("$call : $rule",0,1);	# ADD THE STUB RULE TO THIS PARSER
+				}
+			}
+		}
+
+	# CHECK FOR LEFT RECURSION
+
+		if ($rule->isleftrec($rules))
+		{
+			_error("Rule \"$rule->{name}\" is left-recursive.");
+			_hint("Redesign the grammar so it's not left-recursive.
+			       That will probably mean you need to re-implement
+			       repetitions using the '(s)' notation.
+			       For example: \"$rule->{name}(s)\".");
+			next;	# (ALREADY THE LAST STATEMENT OF THE LOOP BODY)
+		}
+	}
+}
+	
+# GENERATE ACTUAL PARSER CODE
+# (RETURNS PERL SOURCE TEXT: A FIXED PREAMBLE FOLLOWED BY THE CODE OF EVERY CHANGED RULE)
+sub _code($)	# ($parser)
+{
+	my $self = shift;
+	my $code = qq{
+package $self->{namespace};
+use strict;
+use vars qw(\$skip \$AUTOLOAD $self->{localvars} );
+\$skip = '$skip';
+$self->{startcode}
+
+{
+local \$SIG{__WARN__} = sub {0};
+# PRETEND TO BE IN Parse::RecDescent NAMESPACE
+*$self->{namespace}::AUTOLOAD	= sub
+{
+	no strict 'refs';
+	\$AUTOLOAD =~ s/^$self->{namespace}/Parse::RecDescent/;
+	goto &{\$AUTOLOAD};
+}
+}
+
+};
+	$code .= "push \@$self->{namespace}\::ISA, 'Parse::RecDescent';";	# INHERIT FROM Parse::RecDescent
+	$self->{"startcode"} = '';	# THE START CODE INTERPOLATED ABOVE IS EMITTED ONLY ONCE
+	# APPEND CODE ONLY FOR RULES MARKED AS CHANGED, CLEARING THE MARK AS WE GO
+	my $rule;
+	foreach $rule ( values %{$self->{"rules"}} )
+	{
+		if ($rule->{"changed"})
+		{
+			$code .= $rule->code($self->{"namespace"},$self);
+			$rule->{"changed"} = 0;
+		}
+	}
+
+	return $code;
+}
+
+
+# EXECUTING A PARSE....
+# (A METHOD NOT DEFINED ON THE PARSER IS TAKEN TO BE THE NAME OF THE START RULE TO RUN)
+sub AUTOLOAD	# ($parser, $text; $linenum, @args)
+{
+	croak "Could not find method: $AUTOLOAD\n" unless ref $_[0];
+	my $class = ref($_[0]) || $_[0];
+	my $text = ref($_[1]) ? ${$_[1]} : $_[1];
+	# COUNT NEWLINES IN THE TEXT ITSELF, NOT IN A STRINGIFIED REFERENCE TO IT
+	$_[0]->{lastlinenum} = ref($_[1]) ? _linecount(${$_[1]}) : _linecount($_[1]);
+	$_[0]->{lastlinenum} += ($_[2]||0) if @_ > 2;	# OPTIONAL STARTING LINE OFFSET
+	$_[0]->{offsetlinenum} = $_[0]->{lastlinenum};
+	$_[0]->{fulltext} = $text;
+	$_[0]->{fulltextlen} = length $text;
+	$_[0]->{deferred} = [];
+	$_[0]->{errors} = [];
+	my @args = @_[3..$#_];
+	my $args = sub { [ @args ] };
+	# REDIRECT THE CALL INTO THE NAMESPACE THAT HOLDS THIS PARSER'S GENERATED RULES
+	$AUTOLOAD =~ s/$class/$_[0]->{namespace}/;
+	no strict "refs";
+	# $AUTOLOAD NOW NAMES THE RULE'S SUB; IT MUST ALREADY EXIST
+	croak "Unknown starting rule ($AUTOLOAD) called\n"
+		unless defined &$AUTOLOAD;
+	my $retval = &{$AUTOLOAD}($_[0],$text,undef,undef,$args);
+	# SUCCESS: RUN THE QUEUED DEFERRED ACTIONS.  FAILURE: REPORT THE QUEUED ERRORS
+	if (defined $retval)
+	{
+		foreach ( @{$_[0]->{deferred}} ) { &$_; }
+	}
+	else
+	{
+		foreach ( @{$_[0]->{errors}} ) { _error(@$_); }
+	}
+	# TEXT PASSED BY REFERENCE IS UPDATED TO $text AS THE RULE LEFT IT
+	if (ref $_[1]) { ${$_[1]} = $text }
+
+	$ERRORS = 0;
+	return $retval;
+}
+
+sub _parserepeat($$$$$$$$$$)	# RETURNS A REF TO AN ARRAY OF MATCHES
+{
+	my ($parser, $text, $prod, $min, $max, $_noactions, $expectation, $argcode) = @_;
+	my @tokens = ();
+	# CALL $prod UP TO $max TIMES, KEEPING EACH DEFINED RESULT
+	my $reps;
+	for ($reps=0; $reps<$max;)
+	{
+		$_[6]->at($text);	 # $_[6] IS $expectation FROM CALLER
+		my $_savetext = $text;
+		my $prevtextlen = length $text;
+		my $_tok;
+		if (! defined ($_tok = &$prod($parser,$text,1,$_noactions,$argcode)))
+		{
+			$text = $_savetext;	# A FAILED ATTEMPT LEAVES THE TEXT AS IT WAS BEFORE THE CALL
+			last;
+		}
+		push @tokens, $_tok if defined $_tok;
+		last if ++$reps >= $min and $prevtextlen == length $text;	# STOP IF THE TEXT LENGTH DID NOT CHANGE
+	}
+	# FEWER THAN $min SUCCESSES: TELL THE EXPECTATION OBJECT AND RETURN undef
+	do { $_[6]->failed(); return undef} if $reps<$min;
+	# OTHERWISE WRITE THE REMAINING TEXT BACK THROUGH THE CALLER'S ARGUMENT
+	$_[1] = $text;
+	return [@tokens];
+}
+
+
+# ERROR REPORTING....
+# (THE REPORTING SUBS BELOW FILL THESE TWO VARIABLES AND THEN "write ERROR")
+my $errortext;	# MESSAGE BODY, WRAPPED OVER AS MANY LINES AS THE FORMAT NEEDS
+my $errorprefix;	# RIGHT-JUSTIFIED LABEL PRINTED BEFORE THE COLON
+# ERROR IS A DUPLICATE OF STDERR THAT CARRIES ITS OWN OUTPUT FORMAT
+open (ERROR, ">&STDERR");
+format ERROR =
+@>>>>>>>>>>>>>>>>>>>>: ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+$errorprefix,          $errortext
+~~                     ^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+                       $errortext
+.
+# TURN ON AUTOFLUSH FOR ERROR ($| APPLIES TO THE CURRENTLY SELECTED HANDLE)
+select ERROR;
+$| = 1;
+
+# TRACING
+# (_trace FILLS THESE VARIABLES, THEN WRITES THE TRACE AND TRACECONTEXT FORMATS)
+my $tracemsg;
+my $tracecontext;
+my $tracerulename;
+use vars '$tracelevel';
+# TRACE IS A DUPLICATE OF STDERR: LEVEL, RULE NAME AND WRAPPED MESSAGE COLUMNS
+open (TRACE, ">&STDERR");
+format TRACE =
+@>|@|||||||||@^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<|
+$tracelevel, $tracerulename, '|', $tracemsg
+  | ~~       |^<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<|
+              $tracemsg
+.
+# TURN ON AUTOFLUSH FOR TRACE
+select TRACE;
+$| = 1;
+# TRACECONTEXT IS ANOTHER DUPLICATE OF STDERR: SAME LEADING COLUMNS, CONTEXT TEXT ON THE RIGHT
+open (TRACECONTEXT, ">&STDERR");
+format TRACECONTEXT =
+@>|@|||||||||@                                      |^<<<<<<<<<<<<<<<<<<<<<<<<<<<
+$tracelevel, $tracerulename, '|',	           $tracecontext
+  | ~~       |                                      |^<<<<<<<<<<<<<<<<<<<<<<<<<<<
+						   $tracecontext
+.
+
+# TURN ON AUTOFLUSH FOR TRACECONTEXT
+select TRACECONTEXT;
+$| = 1;
+# MAKE STDOUT THE SELECTED HANDLE AGAIN
+select STDOUT;
+
+sub _verbosity($)	# IS OUTPUT OF THE REQUESTED KIND (E.G. "ERRORS", "WARN", "HINT") SWITCHED ON?
+{	# A DEFINED $::RD_TRACE ENABLES EVERYTHING; EACH OTHER SWITCH ENABLES THE KINDS LISTED BESIDE IT
+	return defined($::RD_TRACE)
+	    || (defined($::RD_HINT)   && $_[0] =~ /ERRORS|WARN|HINT/)
+	    || (defined($::RD_WARN)   && $_[0] =~ /ERRORS|WARN/)
+	    || (defined($::RD_ERRORS) && $_[0] =~ /ERRORS/);
+}
+
+sub _error($;$)	# ($message; $line): COUNT AN ERROR AND, IF ERROR OUTPUT IS ON, PRINT IT
+{
+	++$ERRORS;			# COUNTED EVEN WHEN NOTHING IS PRINTED
+	_verbosity("ERRORS") or return 0;
+	my $where = $_[1] ? " (line $_[1])" : "";
+	$errorprefix = "ERROR" . $where;
+	($errortext = $_[0]) =~ s/\s+/ /g;	# COLLAPSE EACH RUN OF WHITESPACE TO ONE SPACE
+	_verbosity("WARN") and print ERROR "\n";
+	write ERROR;			# RENDER THROUGH THE ERROR FORMAT
+	return 1;
+}
+
+sub _warn($$;$)	# ($level, $message; $line): PRINT A WARNING IF ITS LEVEL IS ENABLED
+{
+	return 0 if ! _verbosity("WARN");
+	return 0 if ! ($::RD_HINT || $_[0] >= ($::RD_WARN||1));	# A TRUE $::RD_HINT LETS EVERY LEVEL THROUGH
+	$errorprefix = $_[2] ? "Warning (line $_[2])" : "Warning";
+	($errortext = $_[1]) =~ s/\s+/ /g;	# COLLAPSE EACH RUN OF WHITESPACE TO ONE SPACE
+	print ERROR "\n";
+	write ERROR;
+	return 1;
+}
+
+sub _hint($)	# ($message): PRINT A PARENTHESISED HINT WHEN $::RD_HINT IS DEFINED
+{
+	defined $::RD_HINT or return 0;
+	# THE ERROR FORMAT PUTS ": " AFTER THE PREFIX; THE CLOSING ")" IS APPENDED HERE
+	$errorprefix = "(Hint";
+	($errortext = $_[0] . ")") =~ s/\s+/ /g;
+	write ERROR;
+	return 1;
+}
+
+sub _tracemax($)	# ABBREVIATE OVERLONG TEXT TO ITS HEAD AND TAIL, AS LIMITED BY $::RD_TRACE
+{
+	# LEAVE THE TEXT ALONE UNLESS $::RD_TRACE CONTAINS A DIGIT, IS ABOVE 1,
+	# AND IS EXCEEDED BY THE TEXT LENGTH BY MORE THAN 10 CHARACTERS
+	return $_[0]
+		unless defined $::RD_TRACE
+		    && $::RD_TRACE =~ /\d+/
+		    && $::RD_TRACE>1
+		    && $::RD_TRACE+10<length($_[0]);
+
+	my $half    = $::RD_TRACE/2;
+	my $omitted = length($_[0]) - $::RD_TRACE;
+	my $head    = substr($_[0],0,$half);
+	my $tail    = substr($_[0],-$half);
+
+	return $head . "...<$omitted>..." . $tail;
+}
+
+sub _tracefirst($)	# KEEP ONLY THE FIRST $::RD_TRACE CHARACTERS OF OVERLONG TEXT
+{
+	my $limit = $::RD_TRACE;
+	my $abbreviate = defined $limit
+		      && $limit =~ /\d+/
+		      && $limit>1
+		      && $limit+10<length($_[0]);
+
+	# NO USABLE LIMIT, OR TEXT NOT MORE THAN 10 CHARACTERS OVER IT: HAND IT BACK UNTOUCHED
+	return $_[0] unless $abbreviate;
+
+	# OTHERWISE APPEND A MARKER SAYING HOW MANY CHARACTERS WERE DROPPED
+	my $dropped = length($_[0]) - $limit;
+	return substr($_[0],0,$limit) . "...<+$dropped>";
+}
+
+my $lastcontext = '';	# VALUES REMEMBERED BETWEEN CALLS TO _trace
+my $lastrulename = '';
+my $lastlevel = '';
+
+sub _trace($;$$$)	# ($message; $context, $rulename, $level): WRITE ONE TRACE ENTRY
+{
+	$tracemsg      = $_[0];
+	$tracecontext  = $_[1]||$lastcontext;	# FALSE ARGUMENTS FALL BACK TO THE REMEMBERED VALUES
+	$tracerulename = $_[2]||$lastrulename;
+	$tracelevel    = $_[3]||$lastlevel;
+	if ($tracerulename) { $lastrulename = $tracerulename }
+	if ($tracelevel)    { $lastlevel = $tracelevel }
+	# SHOW NEWLINES AS A LITERAL \n, THEN COLLAPSE EACH REMAINING RUN OF WHITESPACE TO ONE SPACE
+	$tracecontext =~ s/\n/\\n/g;
+	$tracecontext =~ s/\s+/ /g;
+	$tracerulename = qq{$tracerulename};
+	write TRACE;
+	if ($tracecontext ne $lastcontext)	# THE CONTEXT IS WRITTEN ONLY WHEN IT DIFFERS FROM THE REMEMBERED ONE
+	{
+		if ($tracecontext)
+		{
+			$lastcontext = _tracefirst($tracecontext);	# REMEMBERED IN (POSSIBLY) ABBREVIATED FORM
+			$tracecontext = qq{"$tracecontext"};
+		}
+		else
+		{
+			$tracecontext = qq{<NO TEXT LEFT>};
+		}
+		write TRACECONTEXT;
+	}
+}
+
+sub _parseunneg($$$$)	# ($what, $aftererror, $lookahead, $line): _parse(), THEN REFUSE NEGATION
+{
+	_parse($_[0],$_[1],$_[3]);
+	if ($_[2]<0)	# A NEGATIVE $lookahead MEANS THE ITEM FOLLOWED A "...!" SPECIFIER
+	{
+		_error("Can't negate \"$&\".",$_[3]);
+		_hint("You can't negate $_[0]. Remove the \"...!\" before
+		       \"$&\".");
+		return 0;	# CALLERS SKIP THE ITEM ("or next")
+	}
+	return 1;
+}
+
+sub _parse($$$;$)	# ($description, $aftererror, $line; $text): NOTE THAT A GRAMMAR ITEM WAS RECOGNISED
+{
+	my $what = $_[3] || $&;	# WITHOUT $text, USE THE TEXT OF THE MOST RECENT PATTERN MATCH
+	   $what =~ s/^\s+//;
+	if ($_[1])	# TRUE WHEN THE ITEM FOLLOWS AN UNCONDITIONAL <error>
+	{
+		_warn(3,"Found $_[0] ($what) after an unconditional <error>",$_[2])
+		and
+		_hint("An unconditional <error> always causes the
+		       production containing it to immediately fail.
+		       \u$_[0] that follows an <error>
+		       will never be reached.  Did you mean to use
+		       <error?> instead?");
+	}
+	# THE "Treating ... as ..." LINE IS WRITTEN ONLY WHEN _verbosity("TRACE") HOLDS
+	return if ! _verbosity("TRACE");
+	$errortext = "Treating \"$what\" as $_[0]";
+	$errorprefix = "Parse::RecDescent";
+	$errortext =~ s/\s+/ /g;
+	write ERROR;
+}
+
+sub _linecount($) {	# NEWLINES IN $_[0], COUNTED FROM ITS pos() (OR FROM THE START IF NONE IS SET)
+	return scalar(substr($_[0], pos($_[0]) || 0) =~ tr/\n//);
+}
+
+
+package main;
+
+use vars qw ( $RD_ERRORS $RD_WARN $RD_HINT $RD_TRACE $RD_CHECK );
+$::RD_CHECK = 1;	# GRAMMAR CHECKS (_check_grammar, _check_insatiable) ARE ON BY DEFAULT
+$::RD_ERRORS = 1;	# DEFINED, SO _verbosity("ERRORS") HOLDS AND _error PRINTS
+$::RD_WARN = 3;	# _warn PRINTS ONLY LEVELS >= 3 (UNLESS $::RD_HINT IS TRUE)
+
+1;
+
diff --git a/bench/perl/Template.pm b/bench/perl/Template.pm
new file mode 100644
index 0000000..76c84c7
--- /dev/null
+++ b/bench/perl/Template.pm
@@ -0,0 +1,916 @@
+#============================================================= -*-perl-*-
+#
+# Template
+#
+# DESCRIPTION
+#   Module implementing a simple, user-oriented front-end to the Template 
+#   Toolkit.
+#
+# AUTHOR
+#   Andy Wardley   <abw at wardley.org>
+#
+# COPYRIGHT
+#   Copyright (C) 1996-2009 Andy Wardley.  All Rights Reserved.
+#
+#   This module is free software; you can redistribute it and/or
+#   modify it under the same terms as Perl itself.
+#
+#========================================================================
+
+package Template;
+
+use strict;
+use warnings;
+use 5.006;
+use base 'Template::Base';
+
+use Template::Config;
+use Template::Constants;
+use Template::Provider;  
+use Template::Service;
+use File::Basename;
+use File::Path;
+use Scalar::Util qw(blessed);
+
+our $VERSION = '2.22';
+our $ERROR   = '';   # class-level error text (see $Template::ERROR in the POD)
+our $DEBUG   = 0;    # when true, process() emits a "set binmode" debug message if binmode is set
+our $BINMODE = 0 unless defined $BINMODE;   # default for process()'s binmode option; an existing value is kept
+our $AUTOLOAD;
+
+# preload all modules if we're running under mod_perl
+Template::Config->preload() if $ENV{ MOD_PERL };
+
+
+#------------------------------------------------------------------------
+# process($input, \%replace, $output, %options)
+#
+# Main entry point for the Template Toolkit.  Rendering is delegated to
+# the SERVICE object; the result is then sent to $output (or to the
+# configured OUTPUT).  Returns 1, or the result of error() on failure.
+#------------------------------------------------------------------------
+
+sub process {
+    my ($self, $template, $vars, $outstream, @opts) = @_;
+    # options arrive either as a single hash reference or as a flat list
+    my $options;
+    if (@opts == 1 && ref($opts[0]) eq 'HASH') {
+        $options = $opts[0];
+    }
+    else {
+        $options = { @opts };
+    }
+    $options->{ binmode } = $BINMODE
+        if ! defined $options->{ binmode };
+
+    # we're using this for testing in t/output.t and t/filter.t so
+    # don't remove it if you don't want tests to fail...
+    $self->DEBUG("set binmode\n") if $DEBUG && $options->{ binmode };
+
+    my $service = $self->{ SERVICE };
+    my $output  = $service->process($template, $vars);
+    return $self->error($service->error)
+        unless defined $output;
+
+    # fall back to the configured OUTPUT; plain names go under OUTPUT_PATH
+    $outstream ||= $self->{ OUTPUT };
+    if (!ref($outstream) && (my $outpath = $self->{ OUTPUT_PATH })) {
+        $outstream = "$outpath/$outstream";
+    }
+
+    # send processed template to output stream, checking for error
+    my $error = &_output($outstream, \$output, $options);
+    return $error ? $self->error($error) : 1;
+}
+
+
+#------------------------------------------------------------------------
+# service()
+#
+# Accessor for the internal SERVICE object, which handles all requests
+# for this Template object.
+#------------------------------------------------------------------------
+
+sub service {
+    my ($self) = @_;
+    return $self->{ SERVICE };
+}
+
+
+#------------------------------------------------------------------------
+# context()
+#
+# Accessor for the CONTEXT object held within the internal SERVICE
+# object.
+#------------------------------------------------------------------------
+
+sub context {
+    my ($self) = @_;
+    return $self->{ SERVICE }{ CONTEXT };
+}
+
+
+#========================================================================
+#                     -- PRIVATE METHODS --
+#========================================================================
+
+#------------------------------------------------------------------------
+# _init(\%config) - sets SERVICE, OUTPUT and OUTPUT_PATH; returns $self
+#------------------------------------------------------------------------
+sub _init {
+    my ($self, $config) = @_;
+
+    # convert any textual DEBUG args to numerical form (bare return if
+    # debug_flags() yields a false value)
+    my $debug = $config->{ DEBUG };
+    $config->{ DEBUG } = Template::Constants::debug_flags($self, $debug)
+        || return if defined $debug && $debug !~ /^\d+$/;
+    
+    # prepare a namespace handler for any CONSTANTS definition
+    if (my $constants = $config->{ CONSTANTS }) {
+        my $ns  = $config->{ NAMESPACE } ||= { };
+        my $cns = $config->{ CONSTANTS_NAMESPACE } || 'constants';
+        $constants = Template::Config->constants($constants)
+            || return $self->error(Template::Config->error);
+        $ns->{ $cns } = $constants;
+    }
+    # use the caller's SERVICE if given, otherwise ask Template::Config for one
+    $self->{ SERVICE } = $config->{ SERVICE }
+        || Template::Config->service($config)
+        || return $self->error(Template::Config->error);
+    # OUTPUT defaults to STDOUT; OUTPUT_PATH is stored exactly as configured
+    $self->{ OUTPUT      } = $config->{ OUTPUT } || \*STDOUT;
+    $self->{ OUTPUT_PATH } = $config->{ OUTPUT_PATH };
+
+    return $self;
+}
+
+
+#------------------------------------------------------------------------
+# _output($where, \$text, \%options) - returns 0, or an error message
+#------------------------------------------------------------------------
+
+sub _output {
+    my ($where, $textref, $options) = @_;
+    my $reftype;
+    my $error = 0;
+    # the kind of $where decides how the text is delivered
+    # call a CODE reference
+    if (($reftype = ref($where)) eq 'CODE') {
+        &$where($$textref);
+    }
+    # print to a glob (such as \*STDOUT)
+    elsif ($reftype eq 'GLOB') {
+        print $where $$textref;
+    }
+    # append output to a SCALAR ref
+    elsif ($reftype eq 'SCALAR') {
+        $$where .= $$textref;
+    }
+    # push onto ARRAY ref
+    elsif ($reftype eq 'ARRAY') {
+        push @$where, $$textref;
+    }
+    # call the print() method on an object that implements the method
+    # (e.g. IO::Handle, Apache::Request, etc)
+    elsif (blessed($where) && $where->can('print')) {
+        $where->print($$textref);
+    }
+    # a simple string is taken as a filename
+    elsif (! $reftype) {
+        local *FP;
+        # make destination directory if it doesn't exist
+        my $dir = dirname($where);
+        eval { mkpath($dir) unless -d $dir; };
+        if ($@) {
+            # strip file name and line number from error raised by die()
+            ($error = $@) =~ s/ at \S+ line \d+\n?$//;
+        }
+        elsif (open(FP, '>', $where)) {   # 3-arg open: $where is used verbatim as the file name
+            # binmode option can be 1 or a specific layer, e.g. :utf8
+            my $bm = $options->{ binmode  };
+            if ($bm && $bm eq 1) {
+                binmode FP;
+            }
+            elsif ($bm){
+                binmode FP, $bm;
+            }
+            print(FP $$textref) or $error = "$where: $!";     # report write failures
+            close(FP)           or $error ||= "$where: $!";   # ... including ones seen at close
+        }
+        else {
+            $error  = "$where: $!";
+        }
+    }
+    # give up, we've done our best
+    else {
+        $error = "output_handler() cannot determine target type ($where)\n";
+    }
+
+    return $error;
+}
+
+
+1;
+
+__END__
+
+=head1 NAME
+
+Template - Front-end module to the Template Toolkit
+
+=head1 SYNOPSIS 
+
+    use Template;
+    
+    # some useful options (see below for full list)
+    my $config = {
+        INCLUDE_PATH => '/search/path',  # or list ref
+        INTERPOLATE  => 1,               # expand "$var" in plain text
+        POST_CHOMP   => 1,               # cleanup whitespace 
+        PRE_PROCESS  => 'header',        # prefix each template
+        EVAL_PERL    => 1,               # evaluate Perl code blocks
+    };
+    
+    # create Template object
+    my $template = Template->new($config);
+    
+    # define template variables for replacement
+    my $vars = {
+        var1  => $value,
+        var2  => \%hash,
+        var3  => \@list,
+        var4  => \&code,
+        var5  => $object,
+    };
+    
+    # specify input filename, or file handle, text reference, etc.
+    my $input = 'myfile.html';
+    
+    # process input template, substituting variables
+    $template->process($input, $vars)
+        || die $template->error();
+
+=head1 DESCRIPTION
+
+This documentation describes the Template module which is the direct
+Perl interface into the Template Toolkit.  It covers the use of the
+module and gives a brief summary of configuration options and template
+directives.  Please see L<Template::Manual> for the complete reference
+manual which goes into much greater depth about the features and use
+of the Template Toolkit.  The L<Template::Tutorial> is also available
+as an introductory guide to using the Template Toolkit.
+
+=head1 METHODS
+
+=head2 new(\%config)
+
+The C<new()> constructor method (implemented by the
+L<Template::Base|Template::Base#new()> base class) instantiates a new
+C<Template> object. A reference to a hash array of configuration items may be
+passed as a parameter.
+
+    my $tt = Template->new({
+        INCLUDE_PATH => '/usr/local/templates',
+        EVAL_PERL    => 1,
+    }) || die $Template::ERROR, "\n";
+
+A reference to a new C<Template> object is returned, or undef on error. In the
+latter case, the error message can be retrieved by calling L<error()> as a
+class method or by examining the C<$Template::ERROR> package variable
+directly.
+
+    my $tt = Template->new(\%config)
+        || die Template->error(), "\n";
+
+    my $tt = Template->new(\%config)
+        || die $Template::ERROR, "\n";
+
+For convenience, configuration items may also be specified as a list
+of items instead of a hash array reference.  These are automatically
+folded into a hash array by the constructor.
+
+    my $tt = Template->new(INCLUDE_PATH => '/tmp', POST_CHOMP => 1)
+        || die $Template::ERROR, "\n";
+
+=head2 process($template, \%vars, $output, %options)
+
+The C<process()> method is called to process a template. The first parameter
+indicates the input template as one of: a filename relative to
+C<INCLUDE_PATH>, if defined; a reference to a text string containing the
+template text; or a file handle reference (e.g. C<IO::Handle> or sub-class) or
+C<GLOB> (e.g. C<\*STDIN>), from which the template can be read. A reference to
+a hash array may be passed as the second parameter, containing definitions of
+template variables.
+
+    # filename
+    $tt->process('welcome.tt2')
+        || die $tt->error(), "\n";
+
+    # text reference
+    $text = "[% INCLUDE header %]\nHello world!\n[% INCLUDE footer %]";
+    $tt->process(\$text)
+        || die $tt->error(), "\n";
+
+    # file handle (GLOB)
+    $tt->process(\*DATA)
+        || die $tt->error(), "\n";
+    
+    __END__
+    [% INCLUDE header %]
+    This is a template defined in the __END__ section which is 
+    accessible via the DATA "file handle".
+    [% INCLUDE footer %]
+
+By default, the processed template output is printed to C<STDOUT>. The
+C<process()> method then returns C<1> to indicate success. A third parameter
+may be passed to the C<process()> method to specify a different output location.
+This value may be one of: a plain string indicating a filename which will be
+opened (relative to C<OUTPUT_PATH>, if defined) and the output written to; a file
+GLOB opened ready for output; a reference to a scalar (e.g. a text string) to
+which output/error is appended; a reference to a subroutine which is called,
+passing the output as a parameter; or any object reference which implements a
+C<print()> method (e.g. C<IO::Handle>, C<Apache::Request>, etc.) which will be called,
+passing the generated output as a parameter.
+
+Examples:
+
+    # output filename
+    $tt->process('welcome.tt2', $vars, 'welcome.html')
+        || die $tt->error(), "\n";
+
+    # reference to output subroutine
+    sub myout {
+        my $output = shift;
+        ...
+    }
+    $tt->process('welcome.tt2', $vars, \&myout)
+        || die $tt->error(), "\n";
+
+    # reference to output text string
+    my $output = '';
+    $tt->process('welcome.tt2', $vars, \$output)
+        || die $tt->error(), "\n";
+    
+    print "output: $output\n";
+
+In an Apache/mod_perl handler:
+
+    sub handler {
+        my $req = shift;
+        
+        # ...your code here...
+        
+        # direct output to Apache::Request via $req->print($output)
+        $tt->process($file, $vars, $req) || do {
+            $req->log_reason($tt->error());
+            return SERVER_ERROR;
+        };
+        return OK;
+    }
+
+After the optional third output argument can come an optional
+reference to a hash or a list of C<(name, value)> pairs providing further
+options for the output.  The only option currently supported is
+C<binmode> which, when set to any true value will ensure that files
+created (but not any existing file handles passed) will be set to
+binary mode.
+
+    # either: hash reference of options
+    $tt->process($infile, $vars, $outfile, { binmode => 1 })
+        || die $tt->error(), "\n";
+    
+    # or: list of name, value pairs
+    $tt->process($infile, $vars, $outfile, binmode => 1)
+        || die $tt->error(), "\n";
+
+Alternately, the C<binmode> argument can specify a particular IO layer such 
+as C<:utf8>.
+
+    $tt->process($infile, $vars, $outfile, binmode => ':utf8')
+        || die $tt->error(), "\n";
+
+The C<OUTPUT> configuration item can be used to specify a default output 
+location other than C<\*STDOUT>.  The C<OUTPUT_PATH> specifies a directory
+which should be prefixed to all output locations specified as filenames.
+
+    my $tt = Template->new({
+        OUTPUT      => sub { ... },       # default
+        OUTPUT_PATH => '/tmp',
+    ...
+    }) || die Template->error(), "\n";
+    
+    # use default OUTPUT (sub is called)
+    $tt->process('welcome.tt2', $vars)
+        || die $tt->error(), "\n";
+        
+    # write file to '/tmp/welcome.html'
+    $tt->process('welcome.tt2', $vars, 'welcome.html')
+        || die $tt->error(), "\n";
+
+The C<process()> method returns C<1> on success or C<undef> on error. The
+error message generated in the latter case can be retrieved by calling the
+L<error()> method. See also L<CONFIGURATION SUMMARY> which describes how error
+handling may be further customised.
+
+=head2 error()
+
+When called as a class method, it returns the value of the C<$ERROR> package
+variable.  Thus, the following are equivalent.
+
+    my $tt = Template->new()
+        || die Template->error(), "\n";
+
+    my $tt = Template->new()
+        || die $Template::ERROR, "\n";
+
+When called as an object method, it returns the value of the internal
+C<_ERROR> variable, as set by an error condition in a previous call to
+process().
+
+    $tt->process('welcome.tt2')
+        || die $tt->error(), "\n";
+
+Errors are represented in the Template Toolkit by objects of the
+L<Template::Exception> class. If the L<process()> method returns a false value
+then the C<error()> method can be called to return an object of this class.
+The L<type()|Template::Exception#type()> and
+L<info()|Template::Exception#info()> methods can be called on the object to
+retrieve the error type and information string, respectively. The 
+L<as_string()|Template::Exception#as_string()>
+method can be called to return a string of the form C<$type - $info>. This
+method is also overloaded onto the stringification operator allowing the
+object reference itself to be printed to return the formatted error string.
+
+    $tt->process('somefile') || do {
+        my $error = $tt->error();
+        print "error type: ", $error->type(), "\n";
+        print "error info: ", $error->info(), "\n";
+        print $error, "\n";
+    };
+
+=head2 service()
+
+The C<Template> module delegates most of the effort of processing templates
+to an underlying L<Template::Service> object.  This method returns a reference
+to that object.
+
+=head2 context()
+
+The L<Template::Service> module uses a core L<Template::Context> object for
+runtime processing of templates.  This method returns a reference to 
+that object and is equivalent to C<< $template-E<gt>service-E<gt>context() >>.
+
+=head1 CONFIGURATION SUMMARY
+
+The following list gives a short summary of each Template Toolkit 
+configuration option.  See L<Template::Manual::Config> for full details.
+
+=head2 Template Style and Parsing Options
+
+=head3 START_TAG, END_TAG
+
+Define tokens that indicate start and end of directives 
+(default: 'C<[%>' and 'C<%]>').
+
+=head3 TAG_STYLE
+
+Set C<START_TAG> and C<END_TAG> according to a pre-defined style (default:
+'C<template>', as above).
+
+=head3 PRE_CHOMP, POST_CHOMP
+
+Removes whitespace before/after directives (default: 0/0).
+
+=head3 TRIM
+
+Remove leading and trailing whitespace from template output (default: 0).
+
+=head3 INTERPOLATE
+
+Interpolate variables embedded like C<$this> or C<${this}> (default: 0).
+
+=head3 ANYCASE
+
+Allow directive keywords in lower case (default: 0 - UPPER only).
+
+=head2 Template Files and Blocks
+
+=head3 INCLUDE_PATH
+
+One or more directories to search for templates.
+
+=head3 DELIMITER
+
+Delimiter for separating paths in C<INCLUDE_PATH> (default: 'C<:>').
+
+=head3 ABSOLUTE
+
+Allow absolute file names, e.g. C</foo/bar.html> (default: 0).
+
+=head3 RELATIVE
+
+Allow relative filenames, e.g. C<../foo/bar.html> (default: 0).
+
+=head3 DEFAULT
+
+Default template to use when another not found.
+
+=head3 BLOCKS
+
+Hash array pre-defining template blocks.
+
+=head3 AUTO_RESET
+
+Enabled by default causing C<BLOCK> definitions to be reset each time a 
+template is processed.  Disable to allow C<BLOCK> definitions to persist.
+
+=head3 RECURSION
+
+Flag to permit recursion into templates (default: 0).
+
+=head2 Template Variables
+
+=head3 VARIABLES
+
+Hash array of variables and values to pre-define in the stash.
+
+=head2 Runtime Processing Options
+
+=head3 EVAL_PERL
+
+Flag to indicate if C<PERL>/C<RAWPERL> blocks should be processed (default: 0).
+
+=head3 PRE_PROCESS, POST_PROCESS
+
+Name of template(s) to process before/after main template.
+
+=head3 PROCESS
+
+Name of template(s) to process instead of main template.
+
+=head3 ERROR
+
+Name of error template or reference to hash array mapping error types to
+templates.
+
+=head3 OUTPUT
+
+Default output location or handler.
+
+=head3 OUTPUT_PATH
+
+Directory into which output files can be written.
+
+=head3 DEBUG
+
+Enable debugging messages.
+
+=head2 Caching and Compiling Options
+
+=head3 CACHE_SIZE
+
+Maximum number of compiled templates to cache in memory (default:
+undef - cache all)
+
+=head3 COMPILE_EXT
+
+Filename extension for compiled template files (default: undef - don't
+compile).
+
+=head3 COMPILE_DIR
+
+Root of directory in which compiled template files should be written
+(default: undef - don't compile).
+
+=head2 Plugins and Filters
+
+=head3 PLUGINS
+
+Reference to a hash array mapping plugin names to Perl packages.
+
+=head3 PLUGIN_BASE
+
+One or more base classes under which plugins may be found.
+
+=head3 LOAD_PERL
+
+Flag to indicate regular Perl modules should be loaded if a named plugin 
+can't be found  (default: 0).
+
+=head3 FILTERS
+
+Hash array mapping filter names to filter subroutines or factories.
+
+=head2 Customisation and Extension
+
+=head3 LOAD_TEMPLATES
+
+List of template providers.
+
+=head3 LOAD_PLUGINS
+
+List of plugin providers.
+
+=head3 LOAD_FILTERS
+
+List of filter providers.
+
+=head3 TOLERANT
+
+Set providers to tolerate errors as declinations (default: 0).
+
+=head3 SERVICE
+
+Reference to a custom service object (default: L<Template::Service>).
+
+=head3 CONTEXT
+
+Reference to a custom context object (default: L<Template::Context>).
+
+=head3 STASH
+
+Reference to a custom stash object (default: L<Template::Stash>).
+
+=head3 PARSER
+
+Reference to a custom parser object (default: L<Template::Parser>).
+
+=head3 GRAMMAR
+
+Reference to a custom grammar object (default: L<Template::Grammar>).
+
+=head1 DIRECTIVE SUMMARY
+
+The following list gives a short summary of each Template Toolkit directive.
+See L<Template::Manual::Directives> for full details.
+
+=head2 GET
+
+Evaluate and print a variable or value.
+
+    [%   GET variable %]    # 'GET' keyword is optional
+    [%       variable %]
+    [%       hash.key %]
+    [%         list.n %]
+    [%     code(args) %]
+    [% obj.meth(args) %]
+    [%  "value: $var" %]
+
+=head2 CALL
+
+As per L<GET> but without printing the result (e.g. to call code).
+
+    [%  CALL variable %]
+
+=head2 SET
+
+Assign values to variables.
+
+    [% SET variable = value %]    # 'SET' also optional
+    [%     variable = other_variable
+           variable = 'literal text @ $100'
+           variable = "interpolated text: $var"
+           list     = [ val, val, val, val, ... ]
+           list     = [ val..val ]
+           hash     = { var => val, var => val, ... }
+    %]
+
+=head2 DEFAULT
+
+Like L<SET>, but variables are only set if currently unset (i.e. have no
+true value).
+
+    [% DEFAULT variable = value %]
+
+=head2 INSERT
+
+Insert a file without any processing performed on the contents.
+
+    [% INSERT legalese.txt %]
+
+=head2 PROCESS
+
+Process another template file or block and insert the generated output.
+Any template L<BLOCK>s or variables defined or updated in the C<PROCESS>ed
+template will thereafter be defined in the calling template.
+
+    [% PROCESS template %]
+    [% PROCESS template  var = val, ... %]
+
+=head2 INCLUDE
+
+Similar to C<PROCESS>, but using a local copy of the current variables.
+Any template C<BLOCK>s or variables defined in the C<INCLUDE>d template
+remain local to it.
+
+    [% INCLUDE template %]
+    [% INCLUDE template  var = val, ... %]
+
+=head2 WRAPPER
+
+The content between the C<WRAPPER> and corresponding C<END> directives is first
+evaluated, with the output generated being stored in the C<content> variable.
+The named template is then processed as per C<INCLUDE>.
+
+    [% WRAPPER layout %]
+       Some template markup [% blah %]...
+    [% END %]
+
+A simple F<layout> template might look something like this:
+
+    Your header here...
+    [% content %]
+    Your footer here...
+
+=head2 BLOCK
+
+Define a named template block for L<INCLUDE>, L<PROCESS> and L<WRAPPER>
+to use.
+
+    [% BLOCK hello %]
+       Hello World
+    [% END %]
+    
+    [% INCLUDE hello %]
+
+=head2 FOREACH
+
+Repeat the enclosed C<FOREACH> ... C<END> block for each value in the list.
+
+    [% FOREACH variable IN [ val, val, val ] %]    # either
+    [% FOREACH variable IN list %]                 # or
+       The variable is set to [% variable %]
+    [% END %]
+
+=head2 WHILE
+
+The block enclosed between C<WHILE> and C<END> is processed while
+the specified condition is true.
+
+    [% WHILE condition %]
+       content
+    [% END %]
+
+=head2 IF / UNLESS / ELSIF / ELSE
+
+The enclosed block is processed if the condition is true / false.
+
+    [% IF condition %]
+       content
+    [% ELSIF condition %]
+     content
+    [% ELSE %]
+     content
+    [% END %]
+
+    [% UNLESS condition %]
+       content
+    [% # ELSIF/ELSE as per IF, above %]
+       content
+    [% END %]
+
+=head2 SWITCH / CASE
+
+Multi-way switch/case statement.
+
+    [% SWITCH variable %]
+    [%   CASE val1 %]
+           content
+    [%   CASE [ val2, val3 ] %]
+           content
+    [%   CASE %]         # or [% CASE DEFAULT %]
+           content
+    [% END %]
+
+=head2 MACRO
+
+Define a named macro.
+
+    [% MACRO name <directive> %]
+    [% MACRO name(arg1, arg2) <directive> %]
+    ...
+    [% name %]
+    [% name(val1, val2) %]
+
+=head2 FILTER
+
+Process enclosed C<FILTER> ... C<END> block then pipe through a filter.
+
+    [% FILTER name %]                       # either
+    [% FILTER name( params ) %]             # or
+    [% FILTER alias = name( params ) %]     # or
+       content
+    [% END %]
+
+=head2 USE
+
+Load a plugin module (see L<Template::Manual::Plugins>), or any regular Perl
+module when the C<LOAD_PERL> option is set.
+
+    [% USE name %]                      # either
+    [% USE name( params ) %]            # or
+    [% USE var = name( params ) %]      # or
+    ...
+    [% name.method %]
+    [% var.method %]
+
+=head2 PERL / RAWPERL
+
+Evaluate enclosed blocks as Perl code (requires the C<EVAL_PERL> option to be
+set).
+
+    [% PERL %]
+     # perl code goes here
+     $stash->set('foo', 10);
+     print "set 'foo' to ", $stash->get('foo'), "\n";
+     print $context->include('footer', { var => $val });
+    [% END %]
+
+    [% RAWPERL %]
+       # raw perl code goes here, no magic but fast.
+       $output .= 'some output';
+    [% END %]
+
+=head2 TRY / THROW / CATCH / FINAL
+
+Exception handling.
+
+    [% TRY %]
+     content
+       [% THROW type info %]
+    [% CATCH type %]
+     catch content
+       [% error.type %] [% error.info %]
+    [% CATCH %] # or [% CATCH DEFAULT %]
+     content
+    [% FINAL %]
+       this block is always processed
+    [% END %]
+
+=head2 NEXT
+
+Jump straight to the next item in a C<FOREACH> or C<WHILE> loop.
+
+    [% NEXT %]
+
+=head2 LAST
+
+Break out of C<FOREACH> or C<WHILE> loop.
+
+    [% LAST %]
+
+=head2 RETURN
+
+Stop processing current template and return to including templates.
+
+    [% RETURN %]
+
+=head2 STOP
+
+Stop processing all templates and return to caller.
+
+    [% STOP %]
+
+=head2 TAGS
+
+Define new tag style or characters (default: C<[%> C<%]>).
+
+    [% TAGS html %]
+    [% TAGS <!-- --> %]
+
+=head2 COMMENTS
+
+Ignored and deleted.
+
+    [% # this is a comment to the end of line
+       foo = 'bar'
+    %]
+
+    [%# placing the '#' immediately inside the directive
+        tag comments out the entire directive
+    %]
+
+=head1 AUTHOR
+
+Andy Wardley E<lt>abw at wardley.orgE<gt> L<http://wardley.org/>
+
+=head1 VERSION
+
+Template Toolkit version 2.20_1, released April 2009.
+
+=head1 COPYRIGHT
+
+Copyright (C) 1996-2009 Andy Wardley.  All Rights Reserved.
+
+This module is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=cut
+
+# Local Variables:
+# mode: perl
+# perl-indent-level: 4
+# indent-tabs-mode: nil
+# End:
+#
+# vim: expandtab shiftwidth=4:
diff --git a/bench/perl/Template/Base.pm b/bench/perl/Template/Base.pm
new file mode 100644
index 0000000..b29b3c9
--- /dev/null
+++ b/bench/perl/Template/Base.pm
@@ -0,0 +1,283 @@
+#============================================================= -*-perl-*-
+#
+# Template::Base
+#
+# DESCRIPTION
+#   Base class module implementing common functionality for various other
+#   Template Toolkit modules.
+#
+# AUTHOR
+#   Andy Wardley   <abw at wardley.org>
+#
+# COPYRIGHT
+#   Copyright (C) 1996-2007 Andy Wardley.  All Rights Reserved.
+#
+#   This module is free software; you can redistribute it and/or
+#   modify it under the same terms as Perl itself.
+#
+#========================================================================
+ 
+package Template::Base;
+
+use strict;
+use warnings;
+use Template::Constants;
+
+our $VERSION = 2.78;
+
+
+#------------------------------------------------------------------------
+# new(\%params)
+#
+# Generic constructor.  Any mandatory positional arguments named in the
+# invoking class's @BASEARGS package variable are shifted off first; the
+# remaining arguments are a hash reference of configuration items or a
+# list of name => value pairs folded into one.  Blesses a hash, calls
+# _init($config) on it and returns the object, or undef on error with
+# the message recorded through the class error() method.
+#------------------------------------------------------------------------
+
+sub new {
+    my $class = shift;
+    my (@mandatory, $config);
+
+    # symbolic lookup of the @BASEARGS list in the invoking class
+    my $names = do {
+        no strict 'refs';
+        no warnings 'once';
+        \@{"$class\::BASEARGS"} || [ ];
+    };
+
+    # every mandatory argument must be present and true
+    for my $name (@$names) {
+        my $value = shift(@_)
+            or return $class->error("no $name specified");
+        push(@mandatory, $value);
+    }
+
+    # what is left is either a ready-made hash ref or a list of pairs
+    $config = (defined $_[0] && ref($_[0]) eq 'HASH') ? shift : { @_ };
+
+    my %fields = map { ($_ => shift @mandatory) } @$names;
+    my $self   = bless { %fields, _ERROR => '', DEBUG => 0 }, $class;
+
+    return $self if $self->_init($config);
+    return $class->error($self->error);
+}
+
+
+#------------------------------------------------------------------------
+# error()
+# error($msg, ...)
+#
+# Gets or sets the current error.  As a class method it operates on the
+# $ERROR package variable of the invoking class; as an object method it
+# operates on $self->{ _ERROR }.  With arguments the error is set (a
+# reference passed first is stored as-is, otherwise all arguments are
+# concatenated) and undef is returned.  In the absence of arguments
+# the current error value is returned.
+#------------------------------------------------------------------------
+
+sub error {
+    my ($self, @message) = @_;
+
+    # pick the storage slot: object member or package variable
+    my $slot = do {
+        no strict qw( refs );
+        ref $self ? \$self->{ _ERROR } : \${"$self\::ERROR"};
+    };
+
+    return $$slot unless @message;
+
+    # setter: record the value and hand undef back to the caller
+    $$slot = ref($message[0]) ? $message[0] : join('', @message);
+    return undef;
+}
+
+
+#------------------------------------------------------------------------
+# _init(\%config)
+#
+# Initialisation hook invoked by new() with a reference to the hash of
+# configuration items.  This default implementation ignores the hash and
+# returns $self.  Subclass versions should return $self on success, or
+# undef on error after calling error() to set the error message.
+#------------------------------------------------------------------------
+
+sub _init {
+    my $self = shift;       # the configuration hash in $_[0] is unused
+    return $self;
+}
+
+
+# Print the concatenated arguments to STDERR, prefixed by the caller's package.
+sub debug {
+    my ($self, @parts) = @_;
+    my ($pkg, $file, $line) = caller();
+    my $msg = join('', @parts);
+
+    if ($msg !~ /\n$/) {
+        $msg .= " at $file line $line"
+            if $self->{ DEBUG } & Template::Constants::DEBUG_CALLER;
+        $msg .= "\n";
+    }
+    print STDERR "[$pkg] $msg";
+}
+
+
+#------------------------------------------------------------------------
+# module_version()
+#
+# Returns $VERSION from the package of the invoking class or object.
+#------------------------------------------------------------------------
+
+sub module_version {
+    my $invocant = shift;
+    my $package  = ref($invocant) || $invocant;
+    no strict 'refs';
+    return ${ $package . '::VERSION' };
+}
+
+
+1;
+
+__END__
+
+=head1 NAME
+
+Template::Base - Base class module implementing common functionality
+
+=head1 SYNOPSIS
+
+    package My::Module;
+    use base qw( Template::Base );
+    
+    sub _init {
+        my ($self, $config) = @_;
+        $self->{ doodah } = $config->{ doodah }
+            || return $self->error("No 'doodah' specified");
+        return $self;
+    }
+    
+    package main;
+    
+    my $object = My::Module->new({ doodah => 'foobar' })
+        || die My::Module->error();
+
+=head1 DESCRIPTION
+
+Base class module which implements a constructor and error reporting 
+functionality for various Template Toolkit modules.
+
+=head1 PUBLIC METHODS
+
+=head2 new(\%config)
+
+Constructor method which accepts a reference to a hash array or a list 
+of C<name =E<gt> value> parameters which are folded into a hash.  The 
+C<_init()> method is then called, passing the configuration hash and should
+return true/false to indicate success or failure.  A new object reference
+is returned, or undef on error.  Any error message raised can be examined
+via the L<error()> class method or directly via the C<$ERROR> package variable 
+in the derived class.
+
+    my $module = My::Module->new({ ... })
+        || die My::Module->error(), "\n";
+
+    my $module = My::Module->new({ ... })
+        || die "constructor error: $My::Module::ERROR\n";
+
+=head2 error($msg, ...)
+
+May be called as an object method to get/set the internal C<_ERROR> member
+or as a class method to get/set the C<$ERROR> variable in the derived class's
+package.
+
+    my $module = My::Module->new({ ... })
+        || die My::Module->error(), "\n";
+
+    $module->do_something() 
+        || die $module->error(), "\n";
+
+When called with parameters (multiple params are concatenated), this
+method will set the relevant variable and return undef.  This is most
+often used within object methods to report errors to the caller.
+
+    package My::Module;
+    
+    sub foobar {
+        my $self = shift;
+        
+        # some other code...
+        
+        return $self->error('some kind of error...')
+            if $some_condition;
+    }
+
+=head2 debug($msg, ...)
+
+Generates a debugging message by concatenating all arguments
+passed into a string and printing it to C<STDERR>.  A prefix is
+added to indicate the module of the caller.
+
+    package My::Module;
+    
+    sub foobar {
+        my $self = shift;
+        
+        $self->debug('called foobar()');
+        
+        # some other code...
+    }
+
+When the C<foobar()> method is called, the following message
+is sent to C<STDERR>:
+
+    [My::Module] called foobar()
+
+Objects can set an internal C<DEBUG> value which the C<debug()>
+method will examine.  If this value sets the relevant bits
+to indicate C<DEBUG_CALLER> then the file and line number of
+the caller will be appended to the message.
+
+    use Template::Constants qw( :debug );
+    
+    my $module = My::Module->new({
+        DEBUG => DEBUG_SERVICE | DEBUG_CONTEXT | DEBUG_CALLER,
+    });
+    
+    $module->foobar();
+
+This generates a debugging message such as:
+
+    [My::Module] called foobar() at My/Module.pm line 6
+
+=head2 module_version()
+
+Returns the version number for a module, as defined by the C<$VERSION>
+package variable.
+
+=head1 AUTHOR
+
+Andy Wardley E<lt>abw at wardley.orgE<gt> L<http://wardley.org/>
+
+=head1 COPYRIGHT
+
+Copyright (C) 1996-2007 Andy Wardley.  All Rights Reserved.
+
+This module is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=head1 SEE ALSO
+
+L<Template>
+
+=cut
+
+# Local Variables:
+# mode: perl
+# perl-indent-level: 4
+# indent-tabs-mode: nil
+# End:
+#
+# vim: expandtab shiftwidth=4:
diff --git a/bench/perl/Template/Config.pm b/bench/perl/Template/Config.pm
new file mode 100644
index 0000000..ef4fe23
--- /dev/null
+++ b/bench/perl/Template/Config.pm
@@ -0,0 +1,428 @@
+#============================================================= -*-perl-*-
+#
+# Template::Config
+#
+# DESCRIPTION
+#   Template Toolkit configuration module.
+#
+# AUTHOR
+#   Andy Wardley   <abw at wardley.org>
+#
+# COPYRIGHT
+#   Copyright (C) 1996-2007 Andy Wardley.  All Rights Reserved.
+#
+#   This module is free software; you can redistribute it and/or
+#   modify it under the same terms as Perl itself.
+#
+#========================================================================
+ 
+package Template::Config;
+
+use strict;
+use warnings;
+use base 'Template::Base';
+use vars qw( $VERSION $DEBUG $ERROR $INSTDIR
+             $PARSER $PROVIDER $PLUGINS $FILTERS $ITERATOR 
+             $LATEX_PATH $PDFLATEX_PATH $DVIPS_PATH
+             $STASH $SERVICE $CONTEXT $CONSTANTS @PRELOAD );
+
+$VERSION   = 2.75;
+$DEBUG     = 0 unless defined $DEBUG;
+$ERROR     = '';
+$CONTEXT   = 'Template::Context';
+$FILTERS   = 'Template::Filters';
+$ITERATOR  = 'Template::Iterator';
+$PARSER    = 'Template::Parser';
+$PLUGINS   = 'Template::Plugins';
+$PROVIDER  = 'Template::Provider';
+$SERVICE   = 'Template::Service';
+$STASH     = 'Template::Stash';
+$CONSTANTS = 'Template::Namespace::Constants';
+# modules loaded by preload(), ahead of any named in its arguments
+@PRELOAD   = ( $CONTEXT, $FILTERS, $ITERATOR, $PARSER,
+               $PLUGINS, $PROVIDER, $SERVICE, $STASH );
+
+# the following is set at installation time by the Makefile.PL 
+$INSTDIR  = '';
+
+
+#========================================================================
+#                       --- CLASS METHODS ---
+#========================================================================
+
+#------------------------------------------------------------------------
+# preload($module, $module, ...)
+#
+# Loads every module listed in @PRELOAD followed by any named in the
+# arguments.  Stops and returns at the first load() failure, else 1.
+#------------------------------------------------------------------------
+
+sub preload {
+    my ($class, @extra) = @_;
+
+    for my $module (@PRELOAD, @extra) {
+        return unless $class->load($module);
+    }
+    return 1;
+}
+
+
+#------------------------------------------------------------------------
+# load($module)
+#
+# Loads a module via require(), first turning its name into a file path:
+# each '::' becomes '/' and '.pm' is appended.  Returns 1 on success or
+# the result of $class->error() on failure, which records the message.
+#------------------------------------------------------------------------
+
+sub load {
+    my ($class, $module) = @_;
+    (my $file = $module) =~ s[::][/]g;
+    $file .= '.pm';
+    eval { require $file };
+    return $@ ? $class->error("failed to load $file: $@") : 1;
+}
+
+
+#------------------------------------------------------------------------
+# parser(\%params)
+#
+# Factory for the parser class named by the $PARSER package variable
+# (default: Template::Parser).  Accepts a hash reference of parameters or
+# a list of name => value pairs.  Returns the new parser object, or undef
+# if the class fails to load or its constructor returns false; the
+# message can then be read by calling the class error() method.
+#------------------------------------------------------------------------
+
+sub parser {
+    my ($class, @args) = @_;
+    my $params = (defined $args[0] && ref($args[0]) eq 'HASH')
+        ? $args[0] : { @args };
+
+    $class->load($PARSER) or return undef;
+    return $PARSER->new($params)
+        || $class->error("failed to create parser: ", $PARSER->error);
+}
+
+
+#------------------------------------------------------------------------
+# provider(\%params)
+#
+# Factory for the $PROVIDER class (default: Template::Provider); takes a
+# hash ref or name => value list.  Returns the object, or undef on error.
+#------------------------------------------------------------------------
+
+sub provider {
+    my ($class, @args) = @_;
+    my $params = (defined $args[0] && ref($args[0]) eq 'HASH')
+        ? $args[0] : { @args };
+
+    $class->load($PROVIDER) or return undef;
+    return $PROVIDER->new($params)
+        || $class->error("failed to create template provider: ",
+                         $PROVIDER->error);
+}
+
+
+#------------------------------------------------------------------------
+# plugins(\%params)
+#
+# Factory for the $PLUGINS class (default: Template::Plugins); takes a
+# hash ref or name => value list.  Returns the object, or undef on error.
+#------------------------------------------------------------------------
+
+sub plugins {
+    my ($class, @args) = @_;
+    my $params = (defined $args[0] && ref($args[0]) eq 'HASH')
+        ? $args[0] : { @args };
+
+    $class->load($PLUGINS) or return undef;
+    return $PLUGINS->new($params)
+        || $class->error("failed to create plugin provider: ",
+                         $PLUGINS->error);
+}
+
+
+#------------------------------------------------------------------------
+# filters(\%params)
+#
+# Factory for the $FILTERS class (default: Template::Filters); takes a
+# hash ref or name => value list.  Returns the object, or undef on error.
+#------------------------------------------------------------------------
+
+sub filters {
+    my ($class, @args) = @_;
+    my $params = (defined $args[0] && ref($args[0]) eq 'HASH')
+        ? $args[0] : { @args };
+
+    $class->load($FILTERS) or return undef;
+    return $FILTERS->new($params)
+        || $class->error("failed to create filter provider: ",
+                         $FILTERS->error);
+}
+
+
+#------------------------------------------------------------------------
+# iterator(\@list)
+#
+# Factory for the $ITERATOR class (default: Template::Iterator).
+# Returns the new object, or undef if loading or construction fails.
+#------------------------------------------------------------------------
+
+sub iterator {
+    my ($class, $list, @rest) = @_;
+
+    # the list comes first; any further arguments are passed through
+    $class->load($ITERATOR) or return undef;
+    return $ITERATOR->new($list, @rest)
+        || $class->error("failed to create iterator: ", $ITERATOR->error);
+}
+
+
+#------------------------------------------------------------------------
+# stash(\%vars)
+#
+# Factory for the $STASH class (default: Template::Stash); takes a hash
+# ref or name => value list.  Returns the object, or undef on error.
+#------------------------------------------------------------------------
+
+sub stash {
+    my ($class, @args) = @_;
+    my $params = (defined $args[0] && ref($args[0]) eq 'HASH')
+        ? $args[0] : { @args };
+
+    $class->load($STASH) or return undef;
+    return $STASH->new($params)
+        || $class->error("failed to create stash: ", $STASH->error);
+}
+
+
+#------------------------------------------------------------------------
+# context(\%params)
+#
+# Factory for the $CONTEXT class (default: Template::Context); takes a
+# hash ref or name => value list.  Returns the object, or undef on error.
+#------------------------------------------------------------------------
+
+sub context {
+    my ($class, @args) = @_;
+    my $params = (defined $args[0] && ref($args[0]) eq 'HASH')
+        ? $args[0] : { @args };
+
+    $class->load($CONTEXT) or return undef;
+    return $CONTEXT->new($params)
+        || $class->error("failed to create context: ", $CONTEXT->error);
+}
+
+
+#------------------------------------------------------------------------
+# service(\%params)
+#
+# Instantiate a new template service object (default: Template::Service)
+# from a hash ref or name => value list.  Returns object or undef on error.
+#------------------------------------------------------------------------
+
+sub service {
+    my $class  = shift;
+    my $params = defined($_[0]) && ref($_[0]) eq 'HASH' 
+               ? shift : { @_ };
+
+    return undef unless $class->load($SERVICE);
+    return $SERVICE->new($params) 
+        || $class->error("failed to create service: ", $SERVICE->error);
+}
+
+
+#------------------------------------------------------------------------
+# constants(\%params)
+#
+# Instantiate a new namespace handler for compile time constant folding
+# (default: Template::Namespace::Constants). 
+# Returns object or undef, as above.
+#------------------------------------------------------------------------
+
+sub constants {
+    my $class = shift;
+    # use a leading hash reference as-is, otherwise fold the list into one
+    my $params = (defined($_[0]) && ref($_[0]) eq 'HASH') ? shift : { @_ };
+
+    $class->load($CONSTANTS) or return undef;
+    my $handler = $CONSTANTS->new($params);
+    return $handler
+        || $class->error("failed to create constants namespace: ", $CONSTANTS->error);
+}
+
+
+#------------------------------------------------------------------------
+# instdir($dir)
+#
+# Returns the root installation directory appended with any local 
+# component directory passed as an argument.
+#------------------------------------------------------------------------
+
+sub instdir {
+    my ($class, $subdir) = @_;
+    return $class->error("no installation directory")
+        unless $INSTDIR;
+    # work on a copy of the configured root with a trailing slash removed
+    (my $root = $INSTDIR) =~ s[/$][]g;
+    return $subdir ? "$root/$subdir" : $root;
+}
+
+
+#========================================================================
+# This should probably be moved somewhere else in the long term, but for
+# now it ensures that Template::TieString is available even if the 
+# Template::Directive module hasn't been loaded, as is the case when 
+# using compiled templates and Template::Parser hasn't yet been loaded
+# on demand.
+#========================================================================
+
+#------------------------------------------------------------------------
+# simple package for tying $output variable to STDOUT, used by perl()
+#------------------------------------------------------------------------
+
+package Template::TieString;
+
+# tie constructor: the scalar reference passed in becomes the tied object
+sub TIEHANDLE { return bless $_[1], $_[0] }
+
+# append everything printed to the handle onto the underlying scalar
+sub PRINT {
+    my ($buffer, @chunks) = @_;
+    return $$buffer .= join('', @chunks);
+}
+
+
+
+1;
+
+__END__
+
+=head1 NAME
+
+Template::Config - Factory module for instantiating other TT2 modules
+
+=head1 SYNOPSIS
+
+    use Template::Config;
+
+=head1 DESCRIPTION
+
+This module implements various methods for loading and instantiating
+other modules that comprise the Template Toolkit.  It provides a consistent
+way to create toolkit components and allows custom modules to be used in 
+place of the regular ones.
+
+Package variables such as C<$STASH>, C<$SERVICE>, C<$CONTEXT>, etc., contain
+the default module/package name for each component (L<Template::Stash>,
+L<Template::Service> and L<Template::Context>, respectively) and are used by
+the various factory methods (L<stash()>, L<service()> and L<context()>) to
+load the appropriate module. Changing these package variables will cause
+subsequent calls to the relevant factory method to load and instantiate an
+object from the new class.
+
+=head1 PUBLIC METHODS
+
+=head2 load($module)
+
+Load a module using Perl's L<require()>. Any occurrences of 'C<::>' in the module
+name are converted to 'C</>', and 'C<.pm>' is appended. Returns 1 on success or
+undef on error.  Use C<$class-E<gt>error()> to examine the error string.
+
+=head2 preload()
+
+This method preloads all the other C<Template::*> modules that are likely to
+be used. It is called automatically by the L<Template> module when running
+under mod_perl (C<$ENV{MOD_PERL}> is set).
+
+=head2 parser(\%config)
+
+Instantiate a new parser object of the class whose name is denoted by
+the package variable C<$PARSER> (default: L<Template::Parser>).  Returns
+a reference to a newly instantiated parser object or undef on error.
+
+=head2 provider(\%config)
+
+Instantiate a new template provider object (default: L<Template::Provider>).
+Returns an object reference or undef on error, as above.
+
+=head2 plugins(\%config)
+
+Instantiate a new plugins provider object (default: L<Template::Plugins>).
+Returns an object reference or undef on error, as above.
+
+=head2 filters(\%config)
+
+Instantiate a new filter provider object (default: L<Template::Filters>).
+Returns an object reference or undef on error, as above.
+
+=head2 stash(\%vars)
+
+Instantiate a new stash object (L<Template::Stash> or L<Template::Stash::XS>
+depending on the default set at installation time) using the contents of the
+optional hash array passed by parameter as initial variable definitions.
+Returns an object reference or undef on error, as above.
+
+=head2 context(\%config)
+
+Instantiate a new template context object (default: L<Template::Context>).
+Returns an object reference or undef on error, as above.
+
+=head2 service(\%config)
+
+Instantiate a new template service object (default: L<Template::Service>).
+Returns an object reference or undef on error, as above.
+
+=head2 iterator(\@list)
+
+Instantiate a new template iterator object (default: L<Template::Iterator>).
+Returns an object reference or undef on error, as above.
+
+=head2 constants(\%config)
+
+Instantiate a new namespace handler for compile time constant folding
+(default: L<Template::Namespace::Constants>). Returns an object reference or
+undef on error, as above.
+
+=head2 instdir($dir)
+
+Returns the root directory of the Template Toolkit installation under
+which optional components are installed.  Any relative directory specified
+as an argument will be appended to the returned directory.
+
+    # e.g. returns '/usr/local/tt2'
+    my $ttroot = Template::Config->instdir()
+        || die "$Template::Config::ERROR\n";
+
+    # e.g. returns '/usr/local/tt2/templates'
+    my $template = Template::Config->instdir('templates')
+        || die "$Template::Config::ERROR\n";
+
+Returns C<undef> and sets C<$Template::Config::ERROR> appropriately if the 
+optional components of the Template Toolkit have not been installed.
+
+=head1 AUTHOR
+
+Andy Wardley E<lt>abw at wardley.orgE<gt> L<http://wardley.org/>
+
+=head1 COPYRIGHT
+
+Copyright (C) 1996-2007 Andy Wardley.  All Rights Reserved.
+
+This module is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=head1 SEE ALSO
+
+L<Template>
+
+=cut
+
+# Local Variables:
+# mode: perl
+# perl-indent-level: 4
+# indent-tabs-mode: nil
+# End:
+#
+# vim: expandtab shiftwidth=4:
diff --git a/bench/perl/Template/Constants.pm b/bench/perl/Template/Constants.pm
new file mode 100644
index 0000000..788ab5a
--- /dev/null
+++ b/bench/perl/Template/Constants.pm
@@ -0,0 +1,265 @@
+#============================================================= -*-Perl-*-
+#
+# Template::Constants.pm
+#
+# DESCRIPTION
+#   Definition of constants for the Template Toolkit.
+#
+# AUTHOR
+#   Andy Wardley   <abw at wardley.org>
+#
+# COPYRIGHT
+#   Copyright (C) 1996-2007 Andy Wardley.  All Rights Reserved.
+#
+#   This module is free software; you can redistribute it and/or
+#   modify it under the same terms as Perl itself.
+#
+#============================================================================
+ 
+package Template::Constants;
+
+require Exporter;
+use strict;
+use warnings;
+use Exporter;
+# Perl::MinimumVersion seems to think this is a Perl 5.008ism...
+# use base qw( Exporter );
+use vars qw( @EXPORT_OK %EXPORT_TAGS );
+use vars qw( $DEBUG_OPTIONS @STATUS @ERROR @CHOMP @DEBUG @ISA );
+# ... so we'll do it the Old Skool way just to keep it quiet
+@ISA = qw( Exporter );
+
+our $VERSION = 2.75;
+
+
+#========================================================================
+#                         ----- EXPORTER -----
+#========================================================================
+
+# STATUS constants returned by directives
+use constant STATUS_OK       =>   0;      # ok
+use constant STATUS_RETURN   =>   1;      # ok, block ended by RETURN
+use constant STATUS_STOP     =>   2;      # ok, stopped by STOP
+use constant STATUS_DONE     =>   3;      # ok, iterator done
+use constant STATUS_DECLINED =>   4;      # ok, declined to service request
+use constant STATUS_ERROR    => 255;      # error condition
+
+# ERROR constants for indicating exception types
+use constant ERROR_RETURN    =>  'return'; # return a status code
+use constant ERROR_FILE      =>  'file';   # file error: I/O, parse, recursion
+use constant ERROR_VIEW      =>  'view';   # view error
+use constant ERROR_UNDEF     =>  'undef';  # undefined variable value used
+use constant ERROR_PERL      =>  'perl';   # error in [% PERL %] block
+use constant ERROR_FILTER    =>  'filter'; # filter error
+use constant ERROR_PLUGIN    =>  'plugin'; # plugin error
+
+# CHOMP constants for PRE_CHOMP and POST_CHOMP
+use constant CHOMP_NONE      => 0; # do not remove whitespace
+use constant CHOMP_ALL       => 1; # remove whitespace up to newline
+use constant CHOMP_ONE       => 1; # new name for CHOMP_ALL
+use constant CHOMP_COLLAPSE  => 2; # collapse whitespace to a single space
+use constant CHOMP_GREEDY    => 3; # remove all whitespace including newlines
+
+# DEBUG constants to enable various debugging options (one bit per option)
+use constant DEBUG_OFF       =>    0; # do nothing
+use constant DEBUG_ON        =>    1; # basic debugging flag
+use constant DEBUG_UNDEF     =>    2; # throw undef on undefined variables
+use constant DEBUG_VARS      =>    4; # general variable debugging
+use constant DEBUG_DIRS      =>    8; # directive debugging
+use constant DEBUG_STASH     =>   16; # general stash debugging
+use constant DEBUG_CONTEXT   =>   32; # context debugging
+use constant DEBUG_PARSER    =>   64; # parser debugging
+use constant DEBUG_PROVIDER  =>  128; # provider debugging
+use constant DEBUG_PLUGINS   =>  256; # plugins debugging
+use constant DEBUG_FILTERS   =>  512; # filters debugging
+use constant DEBUG_SERVICE   => 1024; # service debugging
+use constant DEBUG_ALL       => 2047; # everything (all bits from 1 to 1024)
+
+# extra debugging flags
+use constant DEBUG_CALLER    => 4096; # add caller file/line
+use constant DEBUG_FLAGS     => 4096; # bitmask to extract flags (same value as DEBUG_CALLER)
+
+$DEBUG_OPTIONS  = {   # two-way map: flag value => name and name => flag value
+    &DEBUG_OFF      => off      => off      => &DEBUG_OFF,
+    &DEBUG_ON       => on       => on       => &DEBUG_ON,
+    &DEBUG_UNDEF    => undef    => undef    => &DEBUG_UNDEF,
+    &DEBUG_VARS     => vars     => vars     => &DEBUG_VARS,
+    &DEBUG_DIRS     => dirs     => dirs     => &DEBUG_DIRS,
+    &DEBUG_STASH    => stash    => stash    => &DEBUG_STASH,
+    &DEBUG_CONTEXT  => context  => context  => &DEBUG_CONTEXT,
+    &DEBUG_PARSER   => parser   => parser   => &DEBUG_PARSER,
+    &DEBUG_PROVIDER => provider => provider => &DEBUG_PROVIDER,
+    &DEBUG_PLUGINS  => plugins  => plugins  => &DEBUG_PLUGINS,
+    &DEBUG_FILTERS  => filters  => filters  => &DEBUG_FILTERS,
+    &DEBUG_SERVICE  => service  => service  => &DEBUG_SERVICE,
+    &DEBUG_ALL      => all      => all      => &DEBUG_ALL,
+    &DEBUG_CALLER   => caller   => caller   => &DEBUG_CALLER,
+};
+
+@STATUS  = qw( STATUS_OK STATUS_RETURN STATUS_STOP STATUS_DONE
+               STATUS_DECLINED STATUS_ERROR );
+@ERROR   = qw( ERROR_FILE ERROR_VIEW ERROR_UNDEF ERROR_PERL 
+               ERROR_RETURN ERROR_FILTER ERROR_PLUGIN );
+@CHOMP   = qw( CHOMP_NONE CHOMP_ALL CHOMP_ONE CHOMP_COLLAPSE CHOMP_GREEDY );
+@DEBUG   = qw( DEBUG_OFF DEBUG_ON DEBUG_UNDEF DEBUG_VARS 
+               DEBUG_DIRS DEBUG_STASH DEBUG_CONTEXT DEBUG_PARSER
+               DEBUG_PROVIDER DEBUG_PLUGINS DEBUG_FILTERS DEBUG_SERVICE
+               DEBUG_ALL DEBUG_CALLER DEBUG_FLAGS );
+
+@EXPORT_OK   = ( @STATUS, @ERROR, @CHOMP, @DEBUG );
+%EXPORT_TAGS = (
+    'all'      => [ @EXPORT_OK ],
+    'status'   => [ @STATUS    ],
+    'error'    => [ @ERROR     ],
+    'chomp'    => [ @CHOMP     ],
+    'debug'    => [ @DEBUG     ],
+);
+
+
+sub debug_flags {
+    my ($self, $debug) = @_;
+    my (@flags, $value);
+    # a lone non-reference argument (plain function call) is the debug value
+    $debug = $self unless defined($debug) || ref($self);
+
+    if ($debug =~ /^\d+$/) {
+        foreach my $const (@DEBUG) {
+            next if $const =~ /^DEBUG_(OFF|ALL|FLAGS)$/;
+
+            # derive the option name from a copy: the loop variable aliases the
+            # @DEBUG element, so editing it in place would corrupt later calls
+            (my $name = lc $const) =~ s/^debug_//;
+            my $bit = $DEBUG_OPTIONS->{ $name };
+            return $self->error("no value for flag: $name")
+                unless defined $bit;
+
+            # skip flags whose bit is not set in the numeric mask
+            next unless $debug & $bit;
+            $value = $DEBUG_OPTIONS->{ $bit };
+            return $self->error("no value for flag: $bit") unless defined $value;
+            push(@flags, $value);
+        }
+        return wantarray ? @flags : join(', ', @flags);
+    }
+    else {
+        @flags = split(/\W+/, $debug);
+        $debug = 0;
+        foreach my $flag (@flags) {
+            $value = $DEBUG_OPTIONS->{ $flag };
+            return $self->error("unknown debug flag: $flag") unless defined $value;
+            $debug |= $value;
+        }
+        return $debug;
+    }
+}
+
+
+1;
+
+__END__
+
+=head1 NAME
+
+Template::Constants - Defines constants for the Template Toolkit
+
+=head1 SYNOPSIS
+
+    use Template::Constants qw( :status :error :all );
+
+=head1 DESCRIPTION
+
+The C<Template::Constants> module defines, and optionally exports into the
+caller's namespace, a number of constants used by the L<Template> package.
+
+Constants may be used by specifying the C<Template::Constants> package 
+explicitly:
+
+    use Template::Constants;
+    print Template::Constants::STATUS_DECLINED;
+
+Constants may be imported into the caller's namespace by naming them as 
+options to the C<use Template::Constants> statement:
+
+    use Template::Constants qw( STATUS_DECLINED );
+    print STATUS_DECLINED;
+
+Alternatively, one of the following tagset identifiers may be specified to import
+sets of constants: 'C<:status>', 'C<:error>', 'C<:chomp>', 'C<:debug>', 'C<:all>'.
+
+    use Template::Constants qw( :status );
+    print STATUS_DECLINED;
+
+Consult the documentation for the C<Exporter> module for more information 
+on exporting variables.
+
+=head1 EXPORTABLE TAG SETS
+
+The following tag sets and associated constants are defined: 
+
+    :status
+        STATUS_OK             # no problem, continue
+        STATUS_RETURN         # ended current block then continue (ok)
+        STATUS_STOP           # controlled stop (ok) 
+        STATUS_DONE           # iterator is all done (ok)
+        STATUS_DECLINED       # provider declined to service request (ok)
+        STATUS_ERROR          # general error condition (not ok)
+
+    :error
+        ERROR_RETURN          # return a status code (e.g. 'stop')
+        ERROR_FILE            # file error: I/O, parse, recursion
+        ERROR_UNDEF           # undefined variable value used
+        ERROR_PERL            # error in [% PERL %] block
+        ERROR_FILTER          # filter error
+        ERROR_PLUGIN          # plugin error
+
+    :chomp                  # for PRE_CHOMP and POST_CHOMP
+        CHOMP_NONE            # do not remove whitespace
+        CHOMP_ONE             # remove whitespace to newline
+        CHOMP_ALL             # old name for CHOMP_ONE (deprecated)
+        CHOMP_COLLAPSE        # collapse whitespace to a single space
+        CHOMP_GREEDY          # remove all whitespace including newlines
+
+    :debug
+        DEBUG_OFF             # do nothing
+        DEBUG_ON              # basic debugging flag
+        DEBUG_UNDEF           # throw undef on undefined variables
+        DEBUG_VARS            # general variable debugging
+        DEBUG_DIRS            # directive debugging
+        DEBUG_STASH           # general stash debugging
+        DEBUG_CONTEXT         # context debugging
+        DEBUG_PARSER          # parser debugging
+        DEBUG_PROVIDER        # provider debugging
+        DEBUG_PLUGINS         # plugins debugging
+        DEBUG_FILTERS         # filters debugging
+        DEBUG_SERVICE         # service debugging
+        DEBUG_ALL             # everything
+        DEBUG_CALLER          # add caller file/line info
+        DEBUG_FLAGS           # bitmap used internally
+
+    :all
+        All the above constants.
+
+=head1 AUTHOR
+
+Andy Wardley E<lt>abw at wardley.orgE<gt> L<http://wardley.org/>
+
+=head1 COPYRIGHT
+
+Copyright (C) 1996-2007 Andy Wardley.  All Rights Reserved.
+
+This module is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=head1 SEE ALSO
+
+L<Template>, C<Exporter>
+
+=cut
+
+# Local Variables:
+# mode: perl
+# perl-indent-level: 4
+# indent-tabs-mode: nil
+# End:
+#
+# vim: expandtab shiftwidth=4:
diff --git a/bench/perl/Template/Context.pm b/bench/perl/Template/Context.pm
new file mode 100644
index 0000000..c3de7d9
--- /dev/null
+++ b/bench/perl/Template/Context.pm
@@ -0,0 +1,1477 @@
+#============================================================= -*-Perl-*-
+#
+# Template::Context
+#
+# DESCRIPTION
+#   Module defining a context in which a template document is processed.
+#   This is the runtime processing interface through which templates 
+#   can access the functionality of the Template Toolkit.
+#
+# AUTHOR
+#   Andy Wardley   <abw at wardley.org>
+#
+# COPYRIGHT
+#   Copyright (C) 1996-2007 Andy Wardley.  All Rights Reserved.
+#
+#   This module is free software; you can redistribute it and/or
+#   modify it under the same terms as Perl itself.
+# 
+#============================================================================
+
+package Template::Context;
+
+use strict;
+use warnings;
+use base 'Template::Base';
+
+use Template::Base;
+use Template::Config;
+use Template::Constants;
+use Template::Exception;
+use Scalar::Util 'blessed';
+
+use constant DOCUMENT         => 'Template::Document';
+use constant EXCEPTION        => 'Template::Exception';
+use constant BADGER_EXCEPTION => 'Badger::Exception';
+
+our $VERSION = 2.98;
+our $DEBUG   = 0 unless defined $DEBUG;
+our $DEBUG_FORMAT = "\n## \$file line \$line : [% \$text %] ##\n";
+our $VIEW_CLASS   = 'Template::View';
+our $AUTOLOAD;
+
+#========================================================================
+#                     -----  PUBLIC METHODS -----
+#========================================================================
+
+#------------------------------------------------------------------------
+# template($name) 
+#
+# General purpose method to fetch a template and return it in compiled 
+# form.  In the usual case, the $name parameter will be a simple string
+# containing the name of a template (e.g. 'header').  It may also be 
+# a reference to Template::Document object (or sub-class) or a Perl 
+# sub-routine.  These are considered to be compiled templates and are
+# returned intact.  Finally, it may be a reference to any other kind 
+# of valid input source accepted by Template::Provider (e.g. scalar
+# ref, glob, IO handle, etc).
+#
+# Templates may be cached at one of 3 different levels.  The internal
+# BLOCKS member is a local cache which holds references to all
+# template blocks used or imported via PROCESS since the context's
+# reset() method was last called.  This is checked first and if the
+# template is not found, the method then walks down the BLOCKSTACK
+# list.  This contains references to the block definition tables in
+# any enclosing Template::Documents that we're visiting (e.g. we've
+# been called via an INCLUDE and we want to access a BLOCK defined in
+# the template that INCLUDE'd us).  If nothing is defined, then we
+# iterate through the LOAD_TEMPLATES providers list as a 'chain of 
+# responsibility' (see Design Patterns) asking each object to fetch() 
+# the template if it can.
+#
+# Returns the compiled template.  On error, undef is returned and 
+# the internal ERROR value (read via error()) is set to contain an
+# error message of the form "$name: $error".
+#------------------------------------------------------------------------
+
+sub template {
+    my ($self, $name) = @_;
+    my ($prefix, $blocks, $defblocks, $provider, $template, $error);
+    my ($shortname, $blockname, $providers);
+
+    $self->debug("template($name)") if $self->{ DEBUG };
+
+    # references to Template::Document (or sub-class) objects, or
+    # CODE references are assumed to be pre-compiled templates and are
+    # returned intact
+    return $name
+        if (blessed($name) && $name->isa(DOCUMENT))
+        || ref($name) eq 'CODE';
+
+    $shortname = $name;
+
+    unless (ref $name) {
+        
+        $self->debug("looking for block [$name]") if $self->{ DEBUG };
+
+        # we first look in the BLOCKS hash for a BLOCK that may have 
+        # been imported from a template (via PROCESS)
+        return $template
+            if ($template = $self->{ BLOCKS }->{ $name });
+        
+        # then we iterate through the BLKSTACK list to see if any of the
+        # Template::Documents we're visiting define this BLOCK
+        foreach $blocks (@{ $self->{ BLKSTACK } }) {
+            return $template
+                if $blocks && ($template = $blocks->{ $name });
+        }
+        
+        # now it's time to ask the providers, so we look to see if any 
+        # prefix is specified to indicate the desired provider set.
+        if ($^O eq 'MSWin32') {
+            # prefix needs 2+ characters here so that C:/foo passes through
+            $prefix = $1 if $shortname =~ s/^(\w{2,})://o;
+        }
+        else {
+            $prefix = $1 if $shortname =~ s/^(\w+)://;
+        }
+        
+        if (defined $prefix) {
+            $providers = $self->{ PREFIX_MAP }->{ $prefix } 
+            || return $self->throw( Template::Constants::ERROR_FILE,
+                                    "no providers for template prefix '$prefix'");
+        }
+    }
+    $providers = $self->{ PREFIX_MAP }->{ default }
+        || $self->{ LOAD_TEMPLATES }
+            unless $providers;
+
+
+    # Finally we try the regular template providers which will handle
+    # references to files, text, etc., as well as templates referenced by
+    # name.  If nothing is found and EXPOSE_BLOCKS is set, the last path
+    # element is moved from $shortname to $blockname and the search repeats.
+    $blockname = '';
+    while ($shortname) {
+        $self->debug("asking providers for [$shortname] [$blockname]") 
+            if $self->{ DEBUG };
+
+        foreach my $provider (@$providers) {
+            ($template, $error) = $provider->fetch($shortname, $prefix);
+            if ($error) {
+                if ($error == Template::Constants::STATUS_ERROR) {
+                    # $template contains exception object
+                    if (blessed($template) && $template->isa(EXCEPTION)
+                        && $template->type eq Template::Constants::ERROR_FILE) {
+                        $self->throw($template);
+                    }
+                    else {
+                        $self->throw( Template::Constants::ERROR_FILE, $template );
+                    }
+                }
+                # any other error status is ok, carry on with the next provider
+            }
+            elsif (length $blockname) {
+                return $template 
+                    if $template = $template->blocks->{ $blockname };
+            }
+            else {
+                return $template;
+            }
+        }
+        
+        last if ref $shortname || ! $self->{ EXPOSE_BLOCKS };
+        $shortname =~ s{/([^/]+)$}{} || last;
+        $blockname = length $blockname ? "$1/$blockname" : $1;
+    }
+        
+    $self->throw(Template::Constants::ERROR_FILE, "$name: not found");
+}
+
+
+#------------------------------------------------------------------------
+# plugin($name, \@args)
+#
+# Calls on each of the LOAD_PLUGINS providers in turn to fetch() (i.e. load
+# and instantiate) a plugin of the specified name.  Additional parameters 
+# passed are propagated to the new() constructor for the plugin.  
+# Returns a reference to a new plugin object or other reference.  On 
+# error, undef is returned and the appropriate error message is set for
+# subsequent retrieval via error().
+#------------------------------------------------------------------------
+
+sub plugin {
+    my ($self, $name, $args) = @_;
+
+    if ($self->{ DEBUG }) {
+        $self->debug("plugin($name, ", defined $args ? @$args : '[ ]', ')');
+    }
+
+    # walk the LOAD_PLUGINS chain until a provider delivers the plugin
+    for my $source (@{ $self->{ LOAD_PLUGINS } }) {
+        my ($plugin, $status) = $source->fetch($name, $args, $self);
+        return $plugin unless $status;
+
+        # any status other than STATUS_ERROR means "try the next provider"
+        next unless $status == Template::Constants::STATUS_ERROR;
+        $self->throw($plugin) if ref $plugin;
+        $self->throw(Template::Constants::ERROR_PLUGIN, $plugin);
+    }
+    $self->throw(Template::Constants::ERROR_PLUGIN, "$name: plugin not found");
+}
+
+
+#------------------------------------------------------------------------
+# filter($name, \@args, $alias)
+#
+# Similar to plugin() above, but querying the LOAD_FILTERS providers to 
+# return filter instances.  An alias may be provided which is used to
+# save the returned filter in a local cache.
+#------------------------------------------------------------------------
+
+sub filter {
+    my ($self, $name, $args, $alias) = @_;
+    my ($filter, $error);
+
+    if ($self->{ DEBUG }) {
+        $self->debug("filter($name, ", 
+                     defined $args  ? @$args : '[ ]', 
+                     defined $alias ? $alias : '<no alias>', ')');
+    }
+
+    # a filter requested by name and without arguments may already be
+    # sitting in the local cache, in which case that copy is returned
+    unless ($args || ref($name)) {
+        $filter = $self->{ FILTER_CACHE }->{ $name };
+        return $filter if $filter;
+    }
+
+    # otherwise work along the LOAD_FILTERS provider chain; the first
+    # provider to answer without an error status ends the search
+    for my $source (@{ $self->{ LOAD_FILTERS } }) {
+        ($filter, $error) = $source->fetch($name, $args, $self);
+        last unless $error;
+
+        # only STATUS_ERROR is fatal; any other status moves on to the
+        # next provider in the chain
+        next unless $error == Template::Constants::STATUS_ERROR;
+        $self->throw($filter) if ref $filter;
+        $self->throw(Template::Constants::ERROR_FILTER, $filter);
+    }
+
+    $filter
+        or return $self->error("$name: filter not found");
+
+    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+    # NOTE: the filter is deliberately NOT cached under its own name when
+    # no alias is given.  Automatic aliasing was disabled (abw, 19 Nov
+    # 2001) because a plugin such as xmlstyle may call define_filter()
+    # repeatedly to re-define a filter, and a cached copy would hide the
+    # new definition.  Filters supplied with an explicit alias are still
+    # cached below.
+    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+    # remember the filter under its alias, when a true alias was given
+    $self->{ FILTER_CACHE }->{ $alias } = $filter
+        if $alias;
+    return $filter;
+}
+
+
+#------------------------------------------------------------------------
+# view(\%config)
+# 
+# Create a new Template::View bound to this context.
+#------------------------------------------------------------------------
+
+sub view {
+    my $self = shift;
+    require Template::View;
+    my $view = $VIEW_CLASS->new($self, @_);
+    # a failed constructor is reported by throwing an ERROR_VIEW exception
+    return $view || $self->throw(Template::Constants::ERROR_VIEW(), $VIEW_CLASS->error);
+}
+
+
+#------------------------------------------------------------------------
+# process($template, \%params)         [% PROCESS template var=val ... %]
+# process($template, \%params, $local) [% INCLUDE template var=val ... %]
+#
+# Processes the template named or referenced by the first parameter.
+# The optional second parameter may reference a hash array of variable
+# definitions.  These are set before the template is processed by
+# calling update() on the stash.  Note that, unless the third parameter
+# is true, the context is not localised and these, and any other
+# variables set in the template will retain their new values after this
+# method returns.  The third parameter is in place so that this method
+# can handle INCLUDE calls: the stash will be localized.
+#
+# Returns the output of processing the template.  Errors are thrown
+# as Template::Exception objects via die().  
+#------------------------------------------------------------------------
+
+sub process {
+    my ($self, $template, $params, $localize) = @_;
+    my ($trim, $blocks) = @$self{ qw( TRIM BLOCKS ) };
+    my (@compiled, $name, $compiled);
+    my ($stash, $component, $tblocks, $error, $tmpout);
+    my $output = '';
+
+    # a single template is handled as a one-element list
+    $template = [ $template ] unless ref $template eq 'ARRAY';
+
+    $self->debug("process([ ", join(', ', @$template), ' ], ', 
+                 defined $params ? $params : '<no params>', ', ', 
+                 $localize ? '<localized>' : '<unlocalized>', ')')
+        if $self->{ DEBUG };
+
+    # fetch compiled template for each name specified
+    foreach $name (@$template) {
+        push(@compiled, $self->template($name));
+    }
+
+    if ($localize) {
+        # localise the variable stash with any parameters passed
+        $stash = $self->{ STASH } = $self->{ STASH }->clone($params);
+    } else {
+        # update stash with any new parameters passed
+        $self->{ STASH }->update($params);
+        $stash = $self->{ STASH };
+    }
+
+    eval {
+        # save current component
+        eval { $component = $stash->get('component') };
+
+        foreach $name (@$template) {
+            $compiled = shift @compiled;
+            my $element = ref $compiled eq 'CODE' 
+                ? { (name => (ref $name ? '' : $name), modtime => time()) }
+                : $compiled;
+
+            if (blessed($component) && $component->isa(DOCUMENT)) {
+                $element->{ caller } = $component->{ name };
+                $element->{ callers } = $component->{ callers } || [];
+                push(@{$element->{ callers }}, $element->{ caller });
+            }
+
+            $stash->set('component', $element);
+
+            unless ($localize) {
+                # merge any local blocks defined in the Template::Document
+                # into our local BLOCKS cache
+                @$blocks{ keys %$tblocks } = values %$tblocks
+                    if (blessed($compiled) && $compiled->isa(DOCUMENT))
+                    && ($tblocks = $compiled->blocks);
+            }
+
+            if (ref $compiled eq 'CODE') {
+                $tmpout = &$compiled($self);
+            }
+            elsif (ref $compiled) {
+                $tmpout = $compiled->process($self);
+            }
+            else {
+                $self->throw('file', 
+                             "invalid template reference: $compiled");
+            }
+
+            if ($trim) {
+                for ($tmpout) {
+                    s/^\s+//;
+                    s/\s+$//;
+                }
+            }
+            $output .= $tmpout;
+
+            # pop last item from callers.  
+            # NOTE - this will not be called if template throws an 
+            # error.  The whole issue of caller and callers should be 
+            # revisited to try and avoid putting this info directly into
+            # the component data structure.  Perhaps use a local element
+            # instead?
+
+            pop(@{$element->{ callers }})
+                if (blessed($component) && $component->isa(DOCUMENT));
+        }
+        $stash->set('component', $component);
+    };
+    $error = $@;
+
+    if ($localize) {
+        # ensure stash is delocalised before dying
+        $self->{ STASH } = $self->{ STASH }->declone();
+    }
+
+    # re-throw: exception objects as they are, plain text as a file error
+    $self->throw(ref $error 
+                 ? $error : (Template::Constants::ERROR_FILE, $error))
+        if $error;
+    return $output;
+}
+
+
+#------------------------------------------------------------------------
+# include($template, \%params)    [% INCLUDE template   var = val, ... %]
+#
+# Similar to process() above but processing the template in a local 
+# context.  Any variables passed by reference to a hash as the second
+# parameter will be set before the template is processed and then 
+# revert to their original values before the method returns.  Similarly,
+# any changes made to non-global variables within the template will 
+# persist only until the template is processed.
+#
+# Returns the output of processing the template.  Errors are thrown
+# as Template::Exception objects via die().  
+#------------------------------------------------------------------------
+
+sub include {
+    my $self = shift;
+    my ($template, $params) = @_;
+
+    # Delegate to process(); the extra true third argument is what makes
+    # this an INCLUDE rather than a PROCESS, i.e. the template is run in
+    # a local context as described in the header comment above.
+    return $self->process($template, $params, 'localize me!');
+}
+
+#------------------------------------------------------------------------
+# insert($file)
+#
+# Insert the contents of a file without parsing.
+#------------------------------------------------------------------------
+
+sub insert {
+    my ($self, $file) = @_;
+    my ($prefix, $providers, $text, $error);
+    my $output = '';
+
+    # Returns the concatenated, unparsed contents of $file, which may be a
+    # single file name or a reference to a list of file names.  Each name
+    # may carry a 'prefix:' selecting an entry in PREFIX_MAP.  Throws a
+    # 'file' exception (via throw()) if a prefix has no providers, if a
+    # provider reports an error, or if no provider can load the file.
+
+    my $files = ref $file eq 'ARRAY' ? $file : [ $file ];
+
+    # the file list must be passed *to* join(), not alongside it
+    $self->debug("insert([ ", join(', ', @$files), " ])") 
+        if $self->{ DEBUG };
+
+
+    FILE: foreach $file (@$files) {
+        my $name = $file;
+
+        # $prefix is shared across iterations, so forget whatever the
+        # previous file set; otherwise an unprefixed file that follows a
+        # prefixed one would be looked up via the wrong providers
+        undef $prefix;
+
+        if ($^O eq 'MSWin32') {
+            # let C:/foo through
+            $prefix = $1 if $name =~ s/^(\w{2,})://o;
+        }
+        else {
+            $prefix = $1 if $name =~ s/^(\w+)://;
+        }
+
+        if (defined $prefix) {
+            $providers = $self->{ PREFIX_MAP }->{ $prefix } 
+                || return $self->throw(Template::Constants::ERROR_FILE,
+                    "no providers for file prefix '$prefix'");
+        }
+        else {
+            $providers = $self->{ PREFIX_MAP }->{ default }
+                || $self->{ LOAD_TEMPLATES };
+        }
+
+        foreach my $provider (@$providers) {
+            ($text, $error) = $provider->load($name, $prefix);
+
+            # no error: $text holds the file contents, which the
+            # continue block below appends to the output
+            next FILE unless $error;
+
+            # a hard error is rethrown; any other status falls through
+            # so that the next provider gets a chance
+            if ($error == Template::Constants::STATUS_ERROR) {
+                $self->throw($text) if ref $text;
+                $self->throw(Template::Constants::ERROR_FILE, $text);
+            }
+        }
+        $self->throw(Template::Constants::ERROR_FILE, "$file: not found");
+    }
+    continue {
+        $output .= $text;
+    }
+    return $output;
+}
+
+
+#------------------------------------------------------------------------
+# throw($type, $info, \$output)          [% THROW errtype "Error info" %]
+#
+# Throws a Template::Exception object by calling die().  This method
+# may be passed a reference to an existing Template::Exception object;
+# a single value containing an error message which is used to
+# instantiate a Template::Exception of type 'undef'; or a pair of
+# values representing the exception type and info from which a
+# Template::Exception object is instantiated.  e.g.
+#
+#   $context->throw($exception);
+#   $context->throw("I'm sorry Dave, I can't do that");
+#   $context->throw('denied', "I'm sorry Dave, I can't do that");
+#
+# An optional third parameter can be supplied in the last case which 
+# is a reference to the current output buffer containing the results
+# of processing the template up to the point at which the exception 
+# was thrown.  The RETURN and STOP directives, for example, use this 
+# to propagate output back to the user, but it can safely be ignored
+# in most cases.
+# 
+# This method rides on a one-way ticket to die() oblivion.  It does not 
+# return in any real sense of the word, but should get caught by a 
+# surrounding eval { } block (e.g. a BLOCK or TRY) and handled 
+# accordingly, or returned to the caller as an uncaught exception.
+#------------------------------------------------------------------------
+
+sub throw {
+    my ($self, $error, $info, $output) = @_;
+
+    # any list interpolated into a string during this call is joined
+    # with ', '
+    local $" = ', ';
+
+    if (blessed($error)) {
+        # an existing Template::Exception is rethrown untouched
+        die $error
+            if $error->isa(EXCEPTION);
+
+        # convert a Badger::Exception to a Template::Exception so that
+        # things continue to work during the transition to Badger
+        die EXCEPTION->new($error->type, $error->info)
+            if $error->isa(BADGER_EXCEPTION);
+    }
+
+    # (type, info) pair, with optional output buffer reference
+    die EXCEPTION->new($error, $info, $output)
+        if defined $info;
+
+    # a lone message (or nothing at all) becomes an 'undef' exception
+    die EXCEPTION->new('undef', $error || '', $output);
+
+    # not reached
+}
+
+
+#------------------------------------------------------------------------
+# catch($error, \$output)
+#
+# Called by various directives after catching an error thrown via die()
+# from within an eval { } block.  The first parameter contains the error
+# which may be a sanitized reference to a Template::Exception object
+# (such as that raised by the throw() method above, a plugin object, 
+# and so on) or an error message thrown via die from somewhere in user
+# code.  The latter are coerced into 'undef' Template::Exception objects.
+# Like throw() above, a reference to a scalar may be passed as an
+# additional parameter to represent the current output buffer
+# localised within the eval block.  As exceptions are thrown upwards
+# and outwards from nested blocks, the catch() method reconstructs the
+# correct output buffer from these fragments, storing it in the
+# exception object for passing further onwards and upwards.
+#
+# Returns a reference to a Template::Exception object..
+#------------------------------------------------------------------------
+
+sub catch {
+    my ($self, $error, $output) = @_;
+
+    # anything that is not already an exception object is wrapped in a
+    # new exception of type 'undef'
+    return EXCEPTION->new('undef', $error, $output)
+        unless blessed($error)
+            && ( $error->isa(EXCEPTION) || $error->isa(BADGER_EXCEPTION) );
+
+    # existing exception: hand it the output buffer (if one was given)
+    # and pass the same object back
+    $error->text($output) if $output;
+    return $error;
+}
+
+
+#------------------------------------------------------------------------
+# localise(\%params)
+# delocalise()
+#
+# The localise() method creates a local copy of the current stash,
+# allowing the existing state of variables to be saved and later 
+# restored via delocalise().
+#
+# A reference to a hash array may be passed containing local variable 
+# definitions which should be added to the cloned namespace.  These 
+# values persist until delocalisation.
+#------------------------------------------------------------------------
+
+sub localise {
+    my $self  = shift;
+
+    # clone the current stash (passing on any local variable definitions)
+    # and make the clone the active stash
+    my $clone = $self->{ STASH }->clone(@_);
+    return $self->{ STASH } = $clone;
+}
+
+sub delocalise {
+    my ($self) = @_;
+
+    # replace the active stash with whatever its declone() hands back
+    my $restored = $self->{ STASH }->declone();
+    return $self->{ STASH } = $restored;
+}
+
+
+#------------------------------------------------------------------------
+# visit($document, $blocks)
+#
+# Each Template::Document calls the visit() method on the context
+# before processing itself.  It passes a reference to the hash array
+# of named BLOCKs defined within the document, allowing them to be 
+# added to the internal BLKSTACK list which is subsequently used by
+# template() to resolve templates.
+# from a provider.
+#------------------------------------------------------------------------
+
+sub visit {
+    my $self   = shift;
+    my $blocks = $_[1];     # $_[0] is the visiting document; unused here
+
+    # newest block hash goes on the front of the stack
+    return unshift @{ $self->{ BLKSTACK } }, $blocks;
+}
+
+
+#------------------------------------------------------------------------
+# leave()
+#
+# The leave() method is called when the document has finished
+# processing itself.  This removes the entry from the BLKSTACK list
+# that was added by visit() above.  For persistence of BLOCK definitions,
+# the process() method (i.e. the PROCESS directive) does some extra
+# magic to copy BLOCKs into a shared hash.
+#------------------------------------------------------------------------
+
+sub leave {
+    my ($self) = @_;
+
+    # drop (and return) the front entry of the block stack
+    return shift @{ $self->{ BLKSTACK } };
+}
+
+
+#------------------------------------------------------------------------
+# define_block($name, $block)
+#
+# Adds a new BLOCK definition to the local BLOCKS cache.  $block may
+# be specified as a reference to a sub-routine or Template::Document
+# object or as text which is compiled into a template.  Returns a true
+# value (the $block reference or compiled block reference) if
+# successful or undef on failure.  Call error() to retrieve the
+# relevant error message (i.e. compilation failure).
+#------------------------------------------------------------------------
+
+sub define_block {
+    my ($self, $name, $block) = @_;
+
+    # anything that is not already a reference is treated as template
+    # text and handed to template() by reference; a false result from
+    # template() makes this method return undef
+    unless (ref $block) {
+        my $compiled = $self->template(\$block);
+        return undef unless $compiled;
+        $block = $compiled;
+    }
+
+    # register the block and return it
+    return $self->{ BLOCKS }->{ $name } = $block;
+}
+
+
+#------------------------------------------------------------------------
+# define_filter($name, $filter, $is_dynamic)
+#
+# Adds a new FILTER definition to the local FILTER_CACHE.
+#------------------------------------------------------------------------
+
+sub define_filter {
+    my ($self, $name, $filter, $is_dynamic) = @_;
+
+    # a dynamic filter is handed to the providers as a [ $filter, 1 ] pair
+    $filter = [ $filter, 1 ] if $is_dynamic;
+
+    # offer the filter to each LOAD_FILTERS provider in turn
+    for my $provider (@{ $self->{ LOAD_FILTERS } }) {
+        my ($result, $error) = $provider->store($name, $filter);
+
+        # first provider to accept it wins
+        return 1 unless $error;
+
+        # any status other than a hard error: try the next provider
+        next unless $error == Template::Constants::STATUS_ERROR();
+
+        $self->throw(Template::Constants::ERROR_FILTER(), $result);
+    }
+
+    # nobody took it
+    $self->throw(Template::Constants::ERROR_FILTER(),
+                 "FILTER providers declined to store filter $name");
+}
+
+sub define_view {
+    my ($self, $name, $params) = @_;
+
+    # Creates a view from $params via view(), calls seal() on it and
+    # stores it in the stash under $name.
+    #
+    # If $params->{ base } is defined it is looked up in the stash and
+    # must yield a $VIEW_CLASS object, otherwise a 'view' exception is
+    # thrown.  NOTE: the resolved object then replaces the original value
+    # of $params->{ base }, i.e. the caller's hash is modified.
+
+    if (defined $params->{ base }) {
+        my $base = $self->{ STASH }->get($params->{ base });
+
+        return $self->throw(
+            &Template::Constants::ERROR_VIEW, 
+            "view base is not defined: $params->{ base }"
+        ) unless $base;
+
+        return $self->throw(
+            &Template::Constants::ERROR_VIEW, 
+            "view base is not a $VIEW_CLASS object: $params->{ base } => $base"
+        ) unless blessed($base) && $base->isa($VIEW_CLASS);
+        
+        $params->{ base } = $base;
+    }
+    my $view = $self->view($params);
+    $view->seal();
+    $self->{ STASH }->set($name, $view);
+}
+
+sub define_views {
+    my ($self, $views) = @_;
+    
+    # Defines each view in $views by calling define_view() with
+    # successive (name, params) pairs.
+    #
+    # a list reference is better because the order is deterministic (and so
+    # allows an earlier VIEW to be the base for a later VIEW), but we'll 
+    # accept a hash reference and assume that the user knows the order of
+    # processing is undefined
+    $views = [ %$views ] 
+        if ref $views eq 'HASH';
+    
+    # make a copy so we don't destroy the original list reference
+    my @items = @$views;
+    
+    # consume the copy two items at a time: name, then parameter hash
+    while (@items) {
+        $self->define_view(splice(@items, 0, 2));
+    }
+}
+
+
+#------------------------------------------------------------------------
+# reset()
+# 
+# Reset the state of the internal BLOCKS hash to clear any BLOCK 
+# definitions imported via the PROCESS directive.  Any original 
+# BLOCKS definitions passed to the constructor will be restored.
+#------------------------------------------------------------------------
+
+sub reset {
+    my $self = shift;
+
+    # forget every block hash pushed by visit()
+    $self->{ BLKSTACK } = [ ];
+
+    # start again from a fresh shallow copy of the initial BLOCKS so that
+    # later definitions never leak into INIT_BLOCKS
+    my %initial = %{ $self->{ INIT_BLOCKS } };
+    return $self->{ BLOCKS } = \%initial;
+}
+
+
+#------------------------------------------------------------------------
+# stash()
+#
+# Simple accessor methods to return the STASH values.  This is likely
+# to be called quite often so we provide a direct method rather than
+# relying on the slower AUTOLOAD.
+#------------------------------------------------------------------------
+
+sub stash {
+    # reads the invocant straight from @_ (no copy), in keeping with the
+    # header comment's note that this accessor is called often
+    return $_[0]{ STASH };
+}
+
+
+#------------------------------------------------------------------------
+# define_vmethod($type, $name, \&sub)
+#
+# Passes $type, $name, and &sub on to stash->define_vmethod().
+#------------------------------------------------------------------------
+sub define_vmethod {
+    my $self  = shift;
+    my $stash = $self->stash;
+
+    # remaining arguments are passed through to the stash unchanged
+    return $stash->define_vmethod(@_);
+}
+
+
+#------------------------------------------------------------------------
+# debugging($command, @args, \%params)
+#
+# Method for controlling the debugging status of the context.  The first
+# argument can be 'on' or 'off' to enable/disable debugging, 'format'
+# to define the format of the debug message, or 'msg' to generate a 
+# debugging message reporting the file, line, message text, etc., 
+# according to the current debug format.
+#------------------------------------------------------------------------
+
+sub debugging {
+    my $self = shift;
+    my $hash = ref $_[-1] eq 'HASH' ? pop : { };
+    my @args = @_;
+
+    # Arguments: an optional leading 'on'/'1' or 'off'/'0' switch, then an
+    # optional command ('msg' or 'format'), with an optional trailing hash
+    # reference of values for the 'msg' format.
+    #
+    # Returns the expanded debug format for 'msg' (or nothing at all when
+    # DEBUG_DIRS is off), and an empty string in every other case.
+
+#    print "*** debug(@args)\n";
+    if (@args) {
+        # The alternation must be grouped: an ungrouped /^on|1$/ means
+        # "starts with 'on' OR ends with '1'", which would also match
+        # arguments such as 'online' or 'format1'.
+        if ($args[0] =~ /^(?:on|1)$/i) {
+            $self->{ DEBUG_DIRS } = 1;
+            shift(@args);
+        }
+        elsif ($args[0] =~ /^(?:off|0)$/i) {
+            $self->{ DEBUG_DIRS } = 0;
+            shift(@args);
+        }
+    }
+
+    if (@args) {
+        if ($args[0] =~ /^msg$/i) {
+            return unless $self->{ DEBUG_DIRS };
+
+            # per-context format if one is set, else the package default
+            my $format = $self->{ DEBUG_FORMAT };
+            $format = $DEBUG_FORMAT unless defined $format;
+
+            # expand each $name in the format from the trailing hash
+            $format =~ s/\$(\w+)/$hash->{ $1 }/ge;
+            return $format;
+        }
+        elsif ($args[0] =~ /^format$/i) {
+            $self->{ DEBUG_FORMAT } = $args[1];
+        }
+        # else ignore
+    }
+
+    return '';
+}
+
+
+#------------------------------------------------------------------------
+# AUTOLOAD
+#
+# Provides pseudo-methods for read-only access to various internal 
+# members.  For example, templates(), plugins(), filters(),
+# eval_perl(), load_perl(), etc.  These aren't called very often, or
+# may never be called at all.
+#------------------------------------------------------------------------
+
+sub AUTOLOAD {
+    my $self = shift;
+
+    # reduce the fully qualified $AUTOLOAD name to the bare method name
+    (my $method = $AUTOLOAD) =~ s/.*:://;
+    return if $method eq 'DESTROY';
+
+    # the method name, upper-cased, is the key of the member to return
+    my $value = $self->{ uc $method };
+
+    warn "no such context method/member: $method\n"
+        unless defined $value;
+
+    return $value;
+}
+
+
+#------------------------------------------------------------------------
+# DESTROY
+#
+# Stash may contain references back to the Context via macro closures,
+# etc.  This breaks the circular references. 
+#------------------------------------------------------------------------
+
+sub DESTROY {
+    my ($self) = @_;
+
+    # let go of the stash so that it no longer keeps this context alive
+    # (see the note on circular references in the header comment above)
+    $self->{ STASH } = undef;
+}
+
+
+
+#========================================================================
+#                     -- PRIVATE METHODS --
+#========================================================================
+
+#------------------------------------------------------------------------
+# _init(\%config)
+#
+# Initialisation method called by Template::Base::new()
+#------------------------------------------------------------------------
+
+sub _init {
+    # Populates $self from the configuration hash $config and returns
+    # $self.  On failure it returns early with the result of
+    # $self->error(...) or with undef (see the individual steps below).
+    my ($self, $config) = @_;
+    my ($name, $item, $method, $block, $blocks);
+
+    # config item name => Template::Config factory method used to build a
+    # default when the item is not supplied
+    my @itemlut = ( 
+        LOAD_TEMPLATES => 'provider',
+        LOAD_PLUGINS   => 'plugins',
+        LOAD_FILTERS   => 'filters' 
+    );
+
+    # LOAD_TEMPLATES, LOAD_PLUGINS, LOAD_FILTERS - lists of providers
+    # (a single item is wrapped in a list reference)
+    while (($name, $method) = splice(@itemlut, 0, 2)) {
+        $item = $config->{ $name } 
+            || Template::Config->$method($config)
+            || return $self->error($Template::Config::ERROR);
+        $self->{ $name } = ref $item eq 'ARRAY' ? $item : [ $item ];
+    }
+
+    # PREFIX_MAP - normalise every value to a list reference: an ARRAY
+    # reference is left alone, any other reference is wrapped in a list,
+    # and a plain string is split on non-digits into indices which select
+    # entries from the LOAD_TEMPLATES list
+    my $providers  = $self->{ LOAD_TEMPLATES };
+    my $prefix_map = $self->{ PREFIX_MAP } = $config->{ PREFIX_MAP } || { };
+    while (my ($key, $val) = each %$prefix_map) {
+        $prefix_map->{ $key } = [ ref $val ? $val : 
+                                  map { $providers->[$_] } split(/\D+/, $val) ]
+                                  unless ref $val eq 'ARRAY';
+    }
+
+    # STASH - use the one supplied or build one from VARIABLES/PRE_DEFINE.
+    # NOTE: $predefs is the caller's own hash when one was supplied, so
+    # the _DEBUG and _STRICT entries below are written into it.
+    $self->{ STASH } = $config->{ STASH } || do {
+        my $predefs  = $config->{ VARIABLES } 
+            || $config->{ PRE_DEFINE } 
+            || { };
+
+        # hack to get stash to know about debug mode
+        # (an existing _DEBUG entry is respected; _STRICT is always set)
+        $predefs->{ _DEBUG } = ( ($config->{ DEBUG } || 0)
+                                 & &Template::Constants::DEBUG_UNDEF ) ? 1 : 0
+                                 unless defined $predefs->{ _DEBUG };
+        $predefs->{ _STRICT } = $config->{ STRICT };
+        
+        # this return leaves _init() itself, not just the do { } block
+        Template::Config->stash($predefs)
+            || return $self->error($Template::Config::ERROR);
+    };
+    
+    # compile any template BLOCKS specified as text
+    # INIT_BLOCKS and BLOCKS start out as the same hash; reset() later
+    # replaces BLOCKS with a copy of INIT_BLOCKS.  The 'return undef'
+    # inside the map returns from _init() when template() yields a false
+    # value.
+    $blocks = $config->{ BLOCKS } || { };
+    $self->{ INIT_BLOCKS } = $self->{ BLOCKS } = { 
+        map {
+            $block = $blocks->{ $_ };
+            $block = $self->template(\$block)
+                || return undef
+                unless ref $block;
+            ($_ => $block);
+        } 
+        keys %$blocks
+    };
+
+    # define any VIEWS
+    $self->define_views( $config->{ VIEWS } )
+        if $config->{ VIEWS };
+
+    # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+    # RECURSION - flag indicating if recursion into templates is supported
+    # EVAL_PERL - flag indicating if PERL blocks should be processed
+    # TRIM      - flag to remove leading and trailing whitespace from output
+    # BLKSTACK  - list of hashes of BLOCKs defined in current template(s)
+    # CONFIG    - original configuration hash
+    # EXPOSE_BLOCKS - make blocks visible as pseudo-files
+    # DEBUG_FORMAT  - format for generating template runtime debugging messages
+    # DEBUG_DIRS    - the DEBUG_DIRS bit of the DEBUG configuration value
+    # DEBUG         - the DEBUG configuration value masked to its 
+    #                 DEBUG_CONTEXT and DEBUG_FLAGS bits, or the package 
+    #                 $DEBUG default when DEBUG is not configured
+
+    $self->{ RECURSION } = $config->{ RECURSION } || 0;
+    $self->{ EVAL_PERL } = $config->{ EVAL_PERL } || 0;
+    $self->{ TRIM      } = $config->{ TRIM } || 0;
+    $self->{ BLKSTACK  } = [ ];
+    $self->{ CONFIG    } = $config;
+    $self->{ EXPOSE_BLOCKS } = defined $config->{ EXPOSE_BLOCKS }
+                                     ? $config->{ EXPOSE_BLOCKS } 
+                                     : 0;
+
+    $self->{ DEBUG_FORMAT  } =  $config->{ DEBUG_FORMAT };
+    $self->{ DEBUG_DIRS    } = ($config->{ DEBUG } || 0) 
+                               & Template::Constants::DEBUG_DIRS;
+    $self->{ DEBUG } = defined $config->{ DEBUG } 
+        ? $config->{ DEBUG } & ( Template::Constants::DEBUG_CONTEXT
+                               | Template::Constants::DEBUG_FLAGS )
+        : $DEBUG;
+
+    return $self;
+}
+
+
+#------------------------------------------------------------------------
+# _dump()
+#
+# Debug method which returns a string representing the internal state
+# of the context object.
+#------------------------------------------------------------------------
+
+sub _dump {
+    my $self   = shift;
+    my $layout = "    %-16s => %s\n";
+    my $text   = "[Template::Context] {\n";
+
+    # simple scalar settings, one per line
+    for my $flag (qw( RECURSION EVAL_PERL TRIM )) {
+        $text .= sprintf($layout, $flag, $self->{ $flag });
+    }
+
+    # each provider list is rendered as a bracketed, indented block made
+    # up of the providers' own _dump() output
+    for my $slot (qw( LOAD_TEMPLATES LOAD_PLUGINS LOAD_FILTERS )) {
+        my $listing = "[\n";
+        foreach my $provider (@{ $self->{ $slot } }) {
+            $listing .= $provider->_dump();
+        }
+
+        # indent nested lines, trim the tail, then close the bracket
+        $listing =~ s/\n/\n        /g;
+        $listing =~ s/\s+$//;
+        $listing .= ",\n    ]";
+
+        $text .= sprintf($layout, $slot, $listing);
+    }
+
+    # finally the stash, using its own _dump()
+    $text .= sprintf($layout, 'STASH', $self->{ STASH }->_dump());
+
+    return $text . '}';
+}
+
+
+1;
+
+__END__
+
+=head1 NAME
+
+Template::Context - Runtime context in which templates are processed
+
+=head1 SYNOPSIS
+
+    use Template::Context;
+    
+    # constructor
+    $context = Template::Context->new(\%config)
+        || die $Template::Context::ERROR;
+    
+    # fetch (load and compile) a template
+    $template = $context->template($template_name);
+    
+    # fetch (load and instantiate) a plugin object
+    $plugin = $context->plugin($name, \@args);
+    
+    # fetch (return or create) a filter subroutine
+    $filter = $context->filter($name, \@args, $alias);
+    
+    # process/include a template, errors are thrown via die()
+    $output = $context->process($template, \%vars);
+    $output = $context->include($template, \%vars);
+    
+    # raise an exception via die()
+    $context->throw($error_type, $error_message, \$output_buffer);
+    
+    # catch an exception, clean it up and fix output buffer
+    $exception = $context->catch($exception, \$output_buffer);
+    
+    # save/restore the stash to effect variable localisation
+    $new_stash = $context->localise(\%vars);
+    $old_stash = $context->delocalise();
+    
+    # add new BLOCK or FILTER definitions
+    $context->define_block($name, $block);
+    $context->define_filter($name, \&filtersub, $is_dynamic);
+    
+    # reset context, clearing any imported BLOCK definitions
+    $context->reset();
+    
+    # methods for accessing internal items
+    $stash     = $context->stash();
+    $tflag     = $context->trim();
+    $epflag    = $context->eval_perl();
+    $providers = $context->templates();
+    $providers = $context->plugins();
+    $providers = $context->filters();
+    ...
+
+=head1 DESCRIPTION
+
+The C<Template::Context> module defines an object class for representing
+a runtime context in which templates are processed.  It provides an
+interface to the fundamental operations of the Template Toolkit
+processing engine through which compiled templates (i.e. Perl code
+constructed from the template source) can process templates, load
+plugins and filters, raise exceptions and so on.
+
+A default C<Template::Context> object is created by the L<Template> module.
+Any C<Template::Context> options may be passed to the L<Template>
+L<new()|Template#new()> constructor method and will be forwarded to the
+C<Template::Context> constructor.
+
+    use Template;
+    
+    my $template = Template->new({
+        TRIM      => 1,
+        EVAL_PERL => 1,
+        BLOCKS    => {
+            header => 'This is the header',
+            footer => 'This is the footer',
+        },
+    });
+
+Similarly, the C<Template::Context> constructor will forward all configuration
+parameters onto other default objects (e.g. L<Template::Provider>,
+L<Template::Plugins>, L<Template::Filters>, etc.) that it may need to
+instantiate.
+
+    $context = Template::Context->new({
+        INCLUDE_PATH => '/home/abw/templates', # provider option
+        TAG_STYLE    => 'html',                # parser option
+    });
+
+A C<Template::Context> object (or subclass) can be explicitly instantiated and
+passed to the L<Template> L<new()|Template#new()> constructor method as the
+C<CONTEXT> configuration item.
+
+    use Template;
+    use Template::Context;
+    
+    my $context  = Template::Context->new({ TRIM => 1 });
+    my $template = Template->new({ CONTEXT => $context });
+
+The L<Template> module uses the L<Template::Config>
+L<context()|Template::Config#context()> factory method to create a default
+context object when required. The C<$Template::Config::CONTEXT> package
+variable may be set to specify an alternate context module. This will be
+loaded automatically and its L<new()> constructor method called by the
+L<context()|Template::Config#context()> factory method when a default context
+object is required.
+
+    use Template;
+    
+    $Template::Config::CONTEXT = 'MyOrg::Template::Context';
+    
+    my $template = Template->new({
+        EVAL_PERL   => 1,
+        EXTRA_MAGIC => 'red hot',  # your extra config items
+        ...
+    });
+
+=head1 METHODS
+
+=head2 new(\%params) 
+
+The C<new()> constructor method is called to instantiate a
+C<Template::Context> object. Configuration parameters may be specified as a
+HASH reference or as a list of C<name =E<gt> value> pairs.
+
+    my $context = Template::Context->new({
+        INCLUDE_PATH => 'header',
+        POST_PROCESS => 'footer',
+    });
+    
+    my $context = Template::Context->new( EVAL_PERL => 1 );
+
+The C<new()> method returns a C<Template::Context> object or C<undef> on
+error. In the latter case, a relevant error message can be retrieved by the
+L<error()|Template::Base#error()> class method or directly from the
+C<$Template::Context::ERROR> package variable.
+
+    my $context = Template::Context->new(\%config)
+        || die Template::Context->error();
+    
+    my $context = Template::Context->new(\%config)
+        || die $Template::Context::ERROR;
+
+The following configuration items may be specified.  Please see 
+L<Template::Manual::Config> for further details.
+
+=head3 VARIABLES
+
+The L<VARIABLES|Template::Manual::Config#VARIABLES> option can be used to
+specify a hash array of template variables.
+
+    my $context = Template::Context->new({
+        VARIABLES => {
+            title   => 'A Demo Page',
+            author  => 'Joe Random Hacker',
+            version => 3.14,
+        },
+    });
+
+=head3 BLOCKS
+
+The L<BLOCKS|Template::Manual::Config#BLOCKS> option can be used to pre-define
+a default set of template blocks.
+
+    my $context = Template::Context->new({
+        BLOCKS => {
+            header  => 'The Header.  [% title %]',
+            footer  => sub { return $some_output_text },
+            another => Template::Document->new({ ... }),
+        },
+    }); 
+
+=head3 VIEWS
+
+The L<VIEWS|Template::Manual::Config#VIEWS> option can be used to pre-define 
+one or more L<Template::View> objects.
+
+    my $context = Template::Context->new({
+        VIEWS => [
+            bottom => { prefix => 'bottom/' },
+            middle => { prefix => 'middle/', base => 'bottom' },
+            top    => { prefix => 'top/',    base => 'middle' },
+        ],
+    });
+
+=head3 TRIM
+
+The L<TRIM|Template::Manual::Config#TRIM> option can be set to have any
+leading and trailing whitespace automatically removed from the output of all
+template files and C<BLOCK>s.
+
+example:
+
+    [% BLOCK foo %]
+    
+    Line 1 of foo
+    
+    [% END %]
+    
+    before 
+    [% INCLUDE foo %]
+    after
+
+output:
+
+    before
+    Line 1 of foo
+    after
+
+=head3 EVAL_PERL
+
+The L<EVAL_PERL|Template::Manual::Config#EVAL_PERL> option is used to indicate if
+C<PERL> and/or C<RAWPERL> blocks should be evaluated. It is disabled by
+default.
+
+=head3 RECURSION
+
+The L<RECURSION|Template::Manual::Config#RECURSION> option can be set to 
+allow templates to recursively process themselves, either directly
+(e.g. template C<foo> calls C<INCLUDE foo>) or indirectly (e.g. 
+C<foo> calls C<INCLUDE bar> which calls C<INCLUDE foo>).
+
+=head3 LOAD_TEMPLATES
+
+The L<LOAD_TEMPLATES|Template::Manual::Config#LOAD_TEMPLATES> option can be
+used to provide a reference to a list of L<Template::Provider> objects or
+sub-classes thereof which will take responsibility for loading and compiling
+templates.
+
+    my $context = Template::Context->new({
+        LOAD_TEMPLATES => [
+            MyOrg::Template::Provider->new({ ... }),
+            Template::Provider->new({ ... }),
+        ],
+    });
+
+=head3 LOAD_PLUGINS
+
+The L<LOAD_PLUGINS|Template::Manual::Config#LOAD_PLUGINS> option can be used
+to specify a list of provider objects responsible for loading and
+instantiating template plugin objects.
+
+    my $context = Template::Context->new({
+        LOAD_PLUGINS => [
+            MyOrg::Template::Plugins->new({ ... }),
+            Template::Plugins->new({ ... }),
+        ],
+    });
+
+=head3 LOAD_FILTERS
+
+The L<LOAD_FILTERS|Template::Manual::Config#LOAD_FILTERS> option can be used
+to specify a list of provider objects for returning and/or creating filter
+subroutines.
+
+    my $context = Template::Context->new({
+        LOAD_FILTERS => [
+            MyTemplate::Filters->new(),
+            Template::Filters->new(),
+        ],
+    });
+
+=head3 STASH
+
+The L<STASH|Template::Manual::Config#STASH> option can be used to 
+specify a L<Template::Stash> object or sub-class which will take
+responsibility for managing template variables.  
+
+    my $stash = MyOrg::Template::Stash->new({ ... });
+    my $context = Template::Context->new({
+        STASH => $stash,
+    });
+
+=head3 DEBUG
+
+The L<DEBUG|Template::Manual::Config#DEBUG> option can be used to enable
+various debugging features of the L<Template::Context> module.
+
+    use Template::Constants qw( :debug );
+    
+    my $template = Template->new({
+        DEBUG => DEBUG_CONTEXT | DEBUG_DIRS,
+    });
+
+=head2 template($name) 
+
+Returns a compiled template by querying each of the L<LOAD_TEMPLATES> providers
+(instances of L<Template::Provider>, or sub-class) in turn.  
+
+    $template = $context->template('header');
+
+On error, a L<Template::Exception> object of type 'C<file>' is thrown via
+C<die()>.  This can be caught by enclosing the call to C<template()> in an
+C<eval> block and examining C<$@>.
+
+    eval { $template = $context->template('header') };
+    if ($@) {
+        print "failed to fetch template: $@\n";
+    }
+
+=head2 plugin($name, \@args)
+
+Instantiates a plugin object by querying each of the L<LOAD_PLUGINS>
+providers. The default L<LOAD_PLUGINS> provider is a L<Template::Plugins>
+object which attempts to load plugin modules, according to the various
+configuration items such as L<PLUGIN_BASE|Template::Plugins#PLUGIN_BASE>,
+L<LOAD_PERL|Template::Plugins#LOAD_PERL>, etc., and then instantiate an object
+via L<new()|Template::Plugin#new()>. A reference to a list of constructor
+arguments may be passed as the second parameter. These are forwarded to the
+plugin constructor.
+
+Returns a reference to a plugin (which is generally an object, but
+doesn't have to be).  Errors are thrown as L<Template::Exception> objects
+with the type set to 'C<plugin>'.
+
+    $plugin = $context->plugin('DBI', [ 'dbi:msql:mydbname' ]);
+
+=head2 filter($name, \@args, $alias)
+
+Instantiates a filter subroutine by querying the L<LOAD_FILTERS> providers.
+The default L<LOAD_FILTERS> provider is a L<Template::Filters> object.
+
+Additional arguments may be passed by list reference along with an optional
+alias under which the filter will be cached for subsequent use. The filter is
+cached under its own C<$name> if C<$alias> is undefined. Subsequent calls to
+C<filter($name)> will return the cached entry, if defined. Specifying arguments
+bypasses the caching mechanism and always creates a new filter. Errors are
+thrown as L<Template::Exception> objects with the type set to 'C<filter>'.
+
+    # static filter (no args)
+    $filter = $context->filter('html');
+    
+    # dynamic filter (args) aliased to 'padright'
+    $filter = $context->filter('format', [ '%60s' ], 'padright');
+    
+    # retrieve previous filter via 'padright' alias
+    $filter = $context->filter('padright');
+
+=head2 process($template, \%vars)
+
+Processes a template named or referenced by the first parameter and returns
+the output generated.  An optional reference to a hash array may be passed
+as the second parameter, containing variable definitions which will be set
+before the template is processed.  The template is processed in the current
+context, with no localisation of variables performed.   Errors are thrown
+as L<Template::Exception> objects via C<die()>.  
+
+    $output = $context->process('header', { title => 'Hello World' });
+
+=head2 include($template, \%vars)
+
+Similar to L<process()>, but using localised variables.  Changes made to
+any variables will only persist until the C<include()> method completes.
+
+    $output = $context->include('header', { title => 'Hello World' });
+
+=head2 throw($error_type, $error_message, \$output)
+
+Raises an exception in the form of a L<Template::Exception> object by calling
+C<die()>. This method may be passed a reference to an existing
+L<Template::Exception> object; a single value containing an error message
+which is used to instantiate a L<Template::Exception> of type 'C<undef>'; or a
+pair of values representing the exception C<type> and C<info> from which a
+L<Template::Exception> object is instantiated. e.g.
+
+    $context->throw($exception);
+    $context->throw("I'm sorry Dave, I can't do that");
+    $context->throw('denied', "I'm sorry Dave, I can't do that");
+
+The optional third parameter may be a reference to the current output
+buffer.  This is then stored in the exception object when created,
+allowing the catcher to examine and use the output up to the point at
+which the exception was raised.
+
+    $output .= 'blah blah blah';
+    $output .= 'more rhubarb';
+    $context->throw('yack', 'Too much yacking', \$output);
+
+=head2 catch($exception, \$output)
+
+Catches an exception thrown, either as a reference to a L<Template::Exception>
+object or some other value. In the latter case, the error string is promoted
+to a L<Template::Exception> object of 'C<undef>' type. This method also
+accepts a reference to the current output buffer which is passed to the
+L<Template::Exception> constructor, or is appended to the output buffer stored
+in an existing L<Template::Exception> object, if unique (i.e. not the same
+reference). By this process, the correct state of the output buffer can be
+reconstructed for simple or nested throws.
+
+=head2 define_block($name, $block)
+
+Adds a new block definition to the internal L<BLOCKS> cache.  The first 
+argument should contain the name of the block and the second a reference
+to a L<Template::Document> object or template sub-routine, or template text
+which is automatically compiled into a template sub-routine.  
+
+Returns a true value (the sub-routine or L<Template::Document> reference) on
+success or undef on failure. The relevant error message can be retrieved by
+calling the L<error()|Template::Base#error()> method.
+
+=head2 define_filter($name, \&filter, $is_dynamic)
+
+Adds a new filter definition by calling the
+L<store()|Template::Filters#store()> method on each of the L<LOAD_FILTERS>
+providers until accepted (in the usual case, this is accepted straight away by
+the one and only L<Template::Filters> provider). The first argument should
+contain the name of the filter and the second a reference to a filter
+subroutine. The optional third argument can be set to any true value to
+indicate that the subroutine is a dynamic filter factory. 
+
+Returns a true value or throws a 'C<filter>' exception on error.
+
+=head2 define_view($name, \%params)
+
+This method allows you to define a named L<view|Template::View>.
+
+    $context->define_view( 
+        my_view => { 
+            prefix => 'my_templates/' 
+        } 
+    );
+
+The view is then accessible as a template variable.
+
+    [% my_view.print(some_data) %]
+
+=head2 define_views($views)
+
+This method allows you to define multiple named L<views|Template::View>.
+A reference to a hash array or list reference should be passed as an argument.
+
+    $context->define_views({    # hash reference
+        my_view_one => { 
+            prefix => 'my_templates_one/' 
+        },
+        my_view_two => { 
+            prefix => 'my_templates_two/' 
+        } 
+    });
+
+If you're defining multiple views of which one or more are based on other 
+views in the same definition then you should pass them as a list reference.
+This ensures that they get created in the right order (Perl does not preserve
+the order of items defined in a hash reference so you can't guarantee that
+your base class view will be defined before your subclass view).
+
+    $context->define_views([    # list reference
+        my_view_one => {
+            prefix => 'my_templates_one/' 
+        },
+        my_view_two => { 
+            prefix => 'my_templates_two/' ,
+            base   => 'my_view_one',
+        } 
+    ]);
+
+The views are then accessible as template variables.
+
+    [% my_view_one.print(some_data) %]
+    [% my_view_two.print(some_data) %]
+
+See also the L<VIEWS> option.
+
+=head2 localise(\%vars)
+
+Clones the stash to create a context with localised variables.  Returns a 
+reference to the newly cloned stash object which is also stored
+internally.
+
+    $stash = $context->localise();
+
+=head2 delocalise()
+
+Restore the stash to its state prior to localisation.
+
+    $stash = $context->delocalise();
+
+=head2 visit(\%blocks)
+
+This method is called by L<Template::Document> objects immediately before
+they process their content.  It is called to register any local C<BLOCK>
+definitions with the context object so that they may be subsequently
+delivered on request.
+
+=head2 leave()
+
+Complement to the L<visit()> method. Called by L<Template::Document> objects
+immediately after they process their content.
+
+=head2 reset()
+
+Clears the local L<BLOCKS> cache of any C<BLOCK> definitions.  Any initial set of
+L<BLOCKS> specified as a configuration item to the constructor will be reinstated.
+
+=head2 AUTOLOAD
+
+An C<AUTOLOAD> method provides access to context configuration items.
+
+    $stash     = $context->stash();
+    $tflag     = $context->trim();
+    $epflag    = $context->eval_perl();
+    ...
+
+=head1 AUTHOR
+
+Andy Wardley E<lt>abw at wardley.orgE<gt> L<http://wardley.org/>
+
+=head1 COPYRIGHT
+
+Copyright (C) 1996-2007 Andy Wardley.  All Rights Reserved.
+
+This module is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=head1 SEE ALSO
+
+L<Template>, L<Template::Document>, L<Template::Exception>,
+L<Template::Filters>, L<Template::Plugins>, L<Template::Provider>,
+L<Template::Service>, L<Template::Stash>
+
+=cut
+
+# Local Variables:
+# mode: perl
+# perl-indent-level: 4
+# indent-tabs-mode: nil
+# End:
+#
+# vim: expandtab shiftwidth=4:
diff --git a/bench/perl/Template/Directive.pm b/bench/perl/Template/Directive.pm
new file mode 100644
index 0000000..07a9593
--- /dev/null
+++ b/bench/perl/Template/Directive.pm
@@ -0,0 +1,1040 @@
+#================================================================= -*-Perl-*- 
+#
+# Template::Directive
+#
+# DESCRIPTION
+#   Factory module for constructing templates from Perl code.
+#
+# AUTHOR
+#   Andy Wardley   <abw at wardley.org>
+#
+# WARNING
+#   Much of this module is hairy, even furry in places.  It needs
+#   a lot of tidying up and may even be moved into a different place 
+#   altogether.  The generator code is often inefficient, particularly in
+#   being very anal about pretty-printing the Perl code all neatly, but 
+#   at the moment, that's still high priority for the sake of easier
+#   debugging.
+#
+# COPYRIGHT
+#   Copyright (C) 1996-2007 Andy Wardley.  All Rights Reserved.
+#
+#   This module is free software; you can redistribute it and/or
+#   modify it under the same terms as Perl itself.
+#
+#============================================================================
+
+package Template::Directive;
+
+use strict;
+use warnings;
+use base 'Template::Base';
+use Template::Constants;
+use Template::Exception;
+
+our $VERSION   = 2.20;
+our $DEBUG     = 0 unless defined $DEBUG;
+our $WHILE_MAX = 1000 unless defined $WHILE_MAX;
+our $PRETTY    = 0 unless defined $PRETTY;
+our $OUTPUT    = '$output .= ';
+
+
+sub _init {
+    my ($self, $config) = @_;
+    $self->{ NAMESPACE } = $config->{ NAMESPACE };
+    return $self;
+}
+
+
+sub pad {
+    my ($text, $pad) = @_;
+    $pad = ' ' x ($pad * 4);
+    $text =~ s/^(?!#line)/$pad/gm;
+    $text;
+}
+
+#========================================================================
+# FACTORY METHODS
+#
+# These methods are called by the parser to construct directive instances.
+#========================================================================
+
+#------------------------------------------------------------------------
+# template($block)
+#------------------------------------------------------------------------
+
+sub template {
+    my ($class, $block) = @_;
+    $block = pad($block, 2) if $PRETTY;
+
+    return "sub { return '' }" unless $block =~ /\S/;
+
+    return <<EOF;
+sub {
+    my \$context = shift || die "template sub called without context\\n";
+    my \$stash   = \$context->stash;
+    my \$output  = '';
+    my \$_tt_error;
+    
+    eval { BLOCK: {
+$block
+    } };
+    if (\$@) {
+        \$_tt_error = \$context->catch(\$@, \\\$output);
+        die \$_tt_error unless \$_tt_error->type eq 'return';
+    }
+
+    return \$output;
+}
+EOF
+}
+
+
+#------------------------------------------------------------------------
+# anon_block($block)                            [% BLOCK %] ... [% END %]
+#------------------------------------------------------------------------
+
+sub anon_block {
+    my ($class, $block) = @_;
+    $block = pad($block, 2) if $PRETTY;
+
+    return <<EOF;
+
+# BLOCK
+$OUTPUT do {
+    my \$output  = '';
+    my \$_tt_error;
+    
+    eval { BLOCK: {
+$block
+    } };
+    if (\$@) {
+        \$_tt_error = \$context->catch(\$@, \\\$output);
+        die \$_tt_error unless \$_tt_error->type eq 'return';
+    }
+
+    \$output;
+};
+EOF
+}
+
+
+#------------------------------------------------------------------------
+# block($blocktext)
+#------------------------------------------------------------------------
+
+sub block {
+    my ($class, $block) = @_;
+    return join("\n", @{ $block || [] });
+}
+
+
+#------------------------------------------------------------------------
+# textblock($text)
+#------------------------------------------------------------------------
+
+sub textblock {
+    my ($class, $text) = @_;
+    return "$OUTPUT " . &text($class, $text) . ';';
+}
+
+
+#------------------------------------------------------------------------
+# text($text)
+#------------------------------------------------------------------------
+
+sub text {
+    my ($class, $text) = @_;
+    for ($text) {
+        s/(["\$\@\\])/\\$1/g;
+        s/\n/\\n/g;
+    }
+    return '"' . $text . '"';
+}
+
+
+#------------------------------------------------------------------------
+# quoted(\@items)                                               "foo$bar"
+#------------------------------------------------------------------------
+
+sub quoted {
+    my ($class, $items) = @_;
+    return '' unless @$items;
+    return ("('' . " . $items->[0] . ')') if scalar @$items == 1;
+    return '(' . join(' . ', @$items) . ')';
+#    my $r = '(' . join(' . ', @$items) . ' . "")';
+#    print STDERR "[$r]\n";
+#    return $r;
+}
+
+
+#------------------------------------------------------------------------
+# ident(\@ident)                                             foo.bar(baz)
+#------------------------------------------------------------------------
+
+sub ident {
+    my ($class, $ident) = @_;
+    return "''" unless @$ident;
+    my $ns;
+
+    # does the first element of the identifier have a NAMESPACE
+    # handler defined?
+    if (ref $class && @$ident > 2 && ($ns = $class->{ NAMESPACE })) {
+        my $key = $ident->[0];
+        $key =~ s/^'(.+)'$/$1/s;
+        if ($ns = $ns->{ $key }) {
+            return $ns->ident($ident);
+        }
+    }
+        
+    if (scalar @$ident <= 2 && ! $ident->[1]) {
+        $ident = $ident->[0];
+    }
+    else {
+        $ident = '[' . join(', ', @$ident) . ']';
+    }
+    return "\$stash->get($ident)";
+}
+
+#------------------------------------------------------------------------
+# identref(\@ident)                                         \foo.bar(baz)
+#------------------------------------------------------------------------
+
+sub identref {
+    my ($class, $ident) = @_;
+    return "''" unless @$ident;
+    if (scalar @$ident <= 2 && ! $ident->[1]) {
+        $ident = $ident->[0];
+    }
+    else {
+        $ident = '[' . join(', ', @$ident) . ']';
+    }
+    return "\$stash->getref($ident)";
+}
+
+
+#------------------------------------------------------------------------
+# assign(\@ident, $value, $default)                             foo = bar
+#------------------------------------------------------------------------
+
+sub assign {
+    my ($class, $var, $val, $default) = @_;
+
+    if (ref $var) {
+        if (scalar @$var == 2 && ! $var->[1]) {
+            $var = $var->[0];
+        }
+        else {
+            $var = '[' . join(', ', @$var) . ']';
+        }
+    }
+    $val .= ', 1' if $default;
+    return "\$stash->set($var, $val)";
+}
+
+
+#------------------------------------------------------------------------
+# args(\@args)                                        foo, bar, baz = qux
+#------------------------------------------------------------------------
+
+sub args {
+    my ($class, $args) = @_;
+    my $hash = shift @$args;
+    push(@$args, '{ ' . join(', ', @$hash) . ' }')
+        if @$hash;
+
+    return '0' unless @$args;
+    return '[ ' . join(', ', @$args) . ' ]';
+}
+
+#------------------------------------------------------------------------
+# filenames(\@names)
+#------------------------------------------------------------------------
+
+sub filenames {
+    my ($class, $names) = @_;
+    if (@$names > 1) {
+        $names = '[ ' . join(', ', @$names) . ' ]';
+    }
+    else {
+        $names = shift @$names;
+    }
+    return $names;
+}
+
+
+#------------------------------------------------------------------------
+# get($expr)                                                    [% foo %]
+#------------------------------------------------------------------------
+
+sub get {
+    my ($class, $expr) = @_;  
+    return "$OUTPUT $expr;";
+}
+
+
+#------------------------------------------------------------------------
+# call($expr)                                              [% CALL bar %]
+#------------------------------------------------------------------------
+
+sub call {
+    my ($class, $expr) = @_;  
+    $expr .= ';';
+    return $expr;
+}
+
+
+#------------------------------------------------------------------------
+# set(\@setlist)                               [% foo = bar, baz = qux %]
+#------------------------------------------------------------------------
+
+sub set {
+    my ($class, $setlist) = @_;
+    my $output;
+    while (my ($var, $val) = splice(@$setlist, 0, 2)) {
+        $output .= &assign($class, $var, $val) . ";\n";
+    }
+    chomp $output;
+    return $output;
+}
+
+
+#------------------------------------------------------------------------
+# default(\@setlist)                   [% DEFAULT foo = bar, baz = qux %]
+#------------------------------------------------------------------------
+
+sub default {
+    my ($class, $setlist) = @_;  
+    my $output;
+    while (my ($var, $val) = splice(@$setlist, 0, 2)) {
+        $output .= &assign($class, $var, $val, 1) . ";\n";
+    }
+    chomp $output;
+    return $output;
+}
+
+
+#------------------------------------------------------------------------
+# insert(\@nameargs)                                    [% INSERT file %] 
+#         # => [ [ $file, ... ], \@args ]
+#------------------------------------------------------------------------
+
+sub insert {
+    my ($class, $nameargs) = @_;
+    my ($file, $args) = @$nameargs;
+    $file = $class->filenames($file);
+    return "$OUTPUT \$context->insert($file);"; 
+}
+
+
+#------------------------------------------------------------------------
+# include(\@nameargs)                    [% INCLUDE template foo = bar %] 
+#          # => [ [ $file, ... ], \@args ]    
+#------------------------------------------------------------------------
+
+sub include {
+    my ($class, $nameargs) = @_;
+    my ($file, $args) = @$nameargs;
+    my $hash = shift @$args;
+    $file = $class->filenames($file);
+    $file .= @$hash ? ', { ' . join(', ', @$hash) . ' }' : '';
+    return "$OUTPUT \$context->include($file);"; 
+}
+
+
+#------------------------------------------------------------------------
+# process(\@nameargs)                    [% PROCESS template foo = bar %] 
+#         # => [ [ $file, ... ], \@args ]
+#------------------------------------------------------------------------
+
+sub process {
+    my ($class, $nameargs) = @_;
+    my ($file, $args) = @$nameargs;
+    my $hash = shift @$args;
+    $file = $class->filenames($file);
+    $file .= @$hash ? ', { ' . join(', ', @$hash) . ' }' : '';
+    return "$OUTPUT \$context->process($file);"; 
+}
+
+
+#------------------------------------------------------------------------
+# if($expr, $block, $else)                             [% IF foo < bar %]
+#                                                         ...
+#                                                      [% ELSE %]
+#                                                         ...
+#                                                      [% END %]
+#------------------------------------------------------------------------
+
+sub if {
+    my ($class, $expr, $block, $else) = @_;
+    my @else = $else ? @$else : ();
+    $else = pop @else;
+    $block = pad($block, 1) if $PRETTY;
+
+    my $output = "if ($expr) {\n$block\n}\n";
+
+    foreach my $elsif (@else) {
+        ($expr, $block) = @$elsif;
+        $block = pad($block, 1) if $PRETTY;
+        $output .= "elsif ($expr) {\n$block\n}\n";
+    }
+    if (defined $else) {
+        $else = pad($else, 1) if $PRETTY;
+        $output .= "else {\n$else\n}\n";
+    }
+
+    return $output;
+}
+
+
+#------------------------------------------------------------------------
+# foreach($target, $list, $args, $block)    [% FOREACH x = [ foo bar ] %]
+#                                              ...
+#                                           [% END %]
+#------------------------------------------------------------------------
+
+sub foreach {
+    my ($class, $target, $list, $args, $block, $label) = @_;
+    $args  = shift @$args;
+    $args  = @$args ? ', { ' . join(', ', @$args) . ' }' : '';
+    $label ||= 'LOOP';
+
+    my ($loop_save, $loop_set, $loop_restore, $setiter);
+    if ($target) {
+        $loop_save    = 'eval { $_tt_oldloop = ' . &ident($class, ["'loop'"]) . ' }';
+        $loop_set     = "\$stash->{'$target'} = \$_tt_value";
+        $loop_restore = "\$stash->set('loop', \$_tt_oldloop)";
+    }
+    else {
+        $loop_save    = '$stash = $context->localise()';
+#       $loop_set     = "\$stash->set('import', \$_tt_value) "
+#                       . "if ref \$value eq 'HASH'";
+        $loop_set     = "\$stash->get(['import', [\$_tt_value]]) "
+                        . "if ref \$_tt_value eq 'HASH'";
+        $loop_restore = '$stash = $context->delocalise()';
+    }
+    $block = pad($block, 3) if $PRETTY;
+
+    return <<EOF;
+
+# FOREACH 
+do {
+    my (\$_tt_value, \$_tt_error, \$_tt_oldloop);
+    my \$_tt_list = $list;
+    
+    unless (UNIVERSAL::isa(\$_tt_list, 'Template::Iterator')) {
+        \$_tt_list = Template::Config->iterator(\$_tt_list)
+            || die \$Template::Config::ERROR, "\\n"; 
+    }
+
+    (\$_tt_value, \$_tt_error) = \$_tt_list->get_first();
+    $loop_save;
+    \$stash->set('loop', \$_tt_list);
+    eval {
+$label:   while (! \$_tt_error) {
+            $loop_set;
+$block;
+            (\$_tt_value, \$_tt_error) = \$_tt_list->get_next();
+        }
+    };
+    $loop_restore;
+    die \$@ if \$@;
+    \$_tt_error = 0 if \$_tt_error && \$_tt_error eq Template::Constants::STATUS_DONE;
+    die \$_tt_error if \$_tt_error;
+};
+EOF
+}
+
+#------------------------------------------------------------------------
+# next()                                                       [% NEXT %]
+#
+# Next iteration of a FOREACH loop (experimental)
+#------------------------------------------------------------------------
+
+sub next {
+    my ($class, $label) = @_;
+    $label ||= 'LOOP';
+    return <<EOF;
+(\$_tt_value, \$_tt_error) = \$_tt_list->get_next();
+next $label;
+EOF
+}
+
+
+#------------------------------------------------------------------------
+# wrapper(\@nameargs, $block)            [% WRAPPER template foo = bar %] 
+#          # => [ [$file,...], \@args ]    
+#------------------------------------------------------------------------
+
+sub wrapper {
+    my ($class, $nameargs, $block) = @_;
+    my ($file, $args) = @$nameargs;
+    my $hash = shift @$args;
+
+    local $" = ', ';
+#    print STDERR "wrapper([@$file], { @$hash })\n";
+
+    return $class->multi_wrapper($file, $hash, $block)
+        if @$file > 1;
+    $file = shift @$file;
+
+    $block = pad($block, 1) if $PRETTY;
+    push(@$hash, "'content'", '$output');
+    $file .= @$hash ? ', { ' . join(', ', @$hash) . ' }' : '';
+
+    return <<EOF;
+
+# WRAPPER
+$OUTPUT do {
+    my \$output = '';
+$block
+    \$context->include($file); 
+};
+EOF
+}
+
+
+sub multi_wrapper {
+    my ($class, $file, $hash, $block) = @_;
+    $block = pad($block, 1) if $PRETTY;
+
+    push(@$hash, "'content'", '$output');
+    $hash = @$hash ? ', { ' . join(', ', @$hash) . ' }' : '';
+
+    $file = join(', ', reverse @$file);
+#    print STDERR "multi wrapper: $file\n";
+
+    return <<EOF;
+
+# WRAPPER
+$OUTPUT do {
+    my \$output = '';
+$block
+    foreach ($file) {
+        \$output = \$context->include(\$_$hash); 
+    }
+    \$output;
+};
+EOF
+}
+
+
+#------------------------------------------------------------------------
+# while($expr, $block)                                 [% WHILE x < 10 %]
+#                                                         ...
+#                                                      [% END %]
+#------------------------------------------------------------------------
+
+sub while {
+    my ($class, $expr, $block, $label) = @_;
+    $block = pad($block, 2) if $PRETTY;
+    $label ||= 'LOOP';
+
+    return <<EOF;
+
+# WHILE
+do {
+    my \$_tt_failsafe = $WHILE_MAX;
+$label:
+    while (--\$_tt_failsafe && ($expr)) {
+$block
+    }
+    die "WHILE loop terminated (> $WHILE_MAX iterations)\\n"
+        unless \$_tt_failsafe;
+};
+EOF
+}
+
+
+#------------------------------------------------------------------------
+# switch($expr, \@case)                                    [% SWITCH %]
+#                                                          [% CASE foo %]
+#                                                             ...
+#                                                          [% END %]
+#------------------------------------------------------------------------
+
+sub switch {
+    my ($class, $expr, $case) = @_;
+    my @case = @$case;
+    my ($match, $block, $default);
+    my $caseblock = '';
+
+    $default = pop @case;
+
+    foreach $case (@case) {
+        $match = $case->[0];
+        $block = $case->[1];
+        $block = pad($block, 1) if $PRETTY;
+        $caseblock .= <<EOF;
+\$_tt_match = $match;
+\$_tt_match = [ \$_tt_match ] unless ref \$_tt_match eq 'ARRAY';
+if (grep(/^\\Q\$_tt_result\\E\$/, \@\$_tt_match)) {
+$block
+    last SWITCH;
+}
+EOF
+    }
+
+    $caseblock .= $default
+        if defined $default;
+    $caseblock = pad($caseblock, 2) if $PRETTY;
+
+return <<EOF;
+
+# SWITCH
+do {
+    my \$_tt_result = $expr;
+    my \$_tt_match;
+    SWITCH: {
+$caseblock
+    }
+};
+EOF
+}
+
+
+#------------------------------------------------------------------------
+# try($block, \@catch)                                        [% TRY %]
+#                                                                ...
+#                                                             [% CATCH %] 
+#                                                                ...
+#                                                             [% END %]
+#------------------------------------------------------------------------
+
+sub try {
+    my ($class, $block, $catch) = @_;
+    my @catch = @$catch;
+    my ($match, $mblock, $default, $final, $n);
+    my $catchblock = '';
+    my $handlers = [];
+
+    $block = pad($block, 2) if $PRETTY;
+    $final = pop @catch;
+    $final = "# FINAL\n" . ($final ? "$final\n" : '')
+           . 'die $_tt_error if $_tt_error;' . "\n" . '$output;';
+    $final = pad($final, 1) if $PRETTY;
+
+    $n = 0;
+    foreach $catch (@catch) {
+        $match = $catch->[0] || do {
+            $default ||= $catch->[1];
+            next;
+        };
+        $mblock = $catch->[1];
+        $mblock = pad($mblock, 1) if $PRETTY;
+        push(@$handlers, "'$match'");
+        $catchblock .= $n++ 
+            ? "elsif (\$_tt_handler eq '$match') {\n$mblock\n}\n" 
+               : "if (\$_tt_handler eq '$match') {\n$mblock\n}\n";
+    }
+    $catchblock .= "\$_tt_error = 0;";
+    $catchblock = pad($catchblock, 3) if $PRETTY;
+    if ($default) {
+        $default = pad($default, 1) if $PRETTY;
+        $default = "else {\n    # DEFAULT\n$default\n    \$_tt_error = '';\n}";
+    }
+    else {
+        $default = '# NO DEFAULT';
+    }
+    $default = pad($default, 2) if $PRETTY;
+
+    $handlers = join(', ', @$handlers);
+return <<EOF;
+
+# TRY
+$OUTPUT do {
+    my \$output = '';
+    my (\$_tt_error, \$_tt_handler);
+    eval {
+$block
+    };
+    if (\$@) {
+        \$_tt_error = \$context->catch(\$@, \\\$output);
+        die \$_tt_error if \$_tt_error->type =~ /^return|stop\$/;
+        \$stash->set('error', \$_tt_error);
+        \$stash->set('e', \$_tt_error);
+        if (defined (\$_tt_handler = \$_tt_error->select_handler($handlers))) {
+$catchblock
+        }
+$default
+    }
+$final
+};
+EOF
+}
+
+
+#------------------------------------------------------------------------
+# throw(\@nameargs)                           [% THROW foo "bar error" %]
+#       # => [ [$type], \@args ]
+#------------------------------------------------------------------------
+
+sub throw {
+    my ($class, $nameargs) = @_;
+    my ($type, $args) = @$nameargs;
+    my $hash = shift(@$args);
+    my $info = shift(@$args);
+    $type = shift @$type;           # uses same parser production as INCLUDE
+                                    # etc., which allow multiple names
+                                    # e.g. INCLUDE foo+bar+baz
+
+    if (! $info) {
+        $args = "$type, undef";
+    }
+    elsif (@$hash || @$args) {
+        local $" = ', ';
+        my $i = 0;
+        $args = "$type, { args => [ " 
+              . join(', ', $info, @$args) 
+              . ' ], '
+              . join(', ', 
+                     (map { "'" . $i++ . "' => $_" } ($info, @$args)),
+                     @$hash)
+              . ' }';
+    }
+    else {
+        $args = "$type, $info";
+    }
+    
+    return "\$context->throw($args, \\\$output);";
+}
+
+
+#------------------------------------------------------------------------
+# clear()                                                     [% CLEAR %]
+#
+# NOTE: this is redundant, being hard-coded (for now) into Parser.yp
+#------------------------------------------------------------------------
+
+sub clear {
+    return "\$output = '';";
+}
+
+#------------------------------------------------------------------------
+# break()                                                     [% BREAK %]
+#
+# NOTE: this is redundant, being hard-coded (for now) into Parser.yp
+#------------------------------------------------------------------------
+
+sub OLD_break {
+    return 'last LOOP;';
+}
+
+#------------------------------------------------------------------------
+# return()                                                   [% RETURN %]
+#------------------------------------------------------------------------
+
+sub return {
+    return "\$context->throw('return', '', \\\$output);";
+}
+
+#------------------------------------------------------------------------
+# stop()                                                       [% STOP %]
+#------------------------------------------------------------------------
+
+sub stop {
+    return "\$context->throw('stop', '', \\\$output);";
+}
+
+
+#------------------------------------------------------------------------
+# use(\@lnameargs)                         [% USE alias = plugin(args) %]
+#     # => [ [$file, ...], \@args, $alias ]
+#------------------------------------------------------------------------
+
+sub use {
+    my ($class, $lnameargs) = @_;
+    my ($file, $args, $alias) = @$lnameargs;
+    $file = shift @$file;       # same production rule as INCLUDE
+    $alias ||= $file;
+    $args = &args($class, $args);
+    $file .= ", $args" if $args;
+#    my $set = &assign($class, $alias, '$plugin'); 
+    return "# USE\n"
+         . "\$stash->set($alias,\n"
+         . "            \$context->plugin($file));";
+}
+
+#------------------------------------------------------------------------
+# view(\@nameargs, $block)                           [% VIEW name args %]
+#     # => [ [$file, ... ], \@args ]
+#------------------------------------------------------------------------
+
+sub view {
+    my ($class, $nameargs, $block, $defblocks) = @_;
+    my ($name, $args) = @$nameargs;
+    my $hash = shift @$args;
+    $name = shift @$name;       # same production rule as INCLUDE
+    $block = pad($block, 1) if $PRETTY;
+
+    if (%$defblocks) {
+        $defblocks = join(",\n", map { "'$_' => $defblocks->{ $_ }" }
+                                keys %$defblocks);
+        $defblocks = pad($defblocks, 1) if $PRETTY;
+        $defblocks = "{\n$defblocks\n}";
+        push(@$hash, "'blocks'", $defblocks);
+    }
+    $hash = @$hash ? '{ ' . join(', ', @$hash) . ' }' : '';
+
+    return <<EOF;
+# VIEW
+do {
+    my \$output = '';
+    my \$_tt_oldv = \$stash->get('view');
+    my \$_tt_view = \$context->view($hash);
+    \$stash->set($name, \$_tt_view);
+    \$stash->set('view', \$_tt_view);
+
+$block
+
+    \$stash->set('view', \$_tt_oldv);
+    \$_tt_view->seal();
+#    \$output;     # not used - commented out to avoid warning
+};
+EOF
+}
+
+
+#------------------------------------------------------------------------
+# perl($block)
+#------------------------------------------------------------------------
+
+sub perl {
+    my ($class, $block) = @_;
+    $block = pad($block, 1) if $PRETTY;
+
+    return <<EOF;
+
+# PERL
+\$context->throw('perl', 'EVAL_PERL not set')
+    unless \$context->eval_perl();
+
+$OUTPUT do {
+    my \$output = "package Template::Perl;\\n";
+
+$block
+
+    local(\$Template::Perl::context) = \$context;
+    local(\$Template::Perl::stash)   = \$stash;
+
+    my \$_tt_result = '';
+    tie *Template::Perl::PERLOUT, 'Template::TieString', \\\$_tt_result;
+    my \$_tt_save_stdout = select *Template::Perl::PERLOUT;
+
+    eval \$output;
+    select \$_tt_save_stdout;
+    \$context->throw(\$@) if \$@;
+    \$_tt_result;
+};
+EOF
+}
+
+
+#------------------------------------------------------------------------
+# no_perl()
+#------------------------------------------------------------------------
+
+sub no_perl {
+    my $class = shift;
+    return "\$context->throw('perl', 'EVAL_PERL not set');";
+}
+
+
+#------------------------------------------------------------------------
+# rawperl($block)
+#
+# NOTE: perhaps test context EVAL_PERL switch at compile time rather than
+# runtime?
+#------------------------------------------------------------------------
+
+sub rawperl {
+    my ($class, $block, $line) = @_;
+    for ($block) {
+        s/^\n+//;
+        s/\n+$//;
+    }
+    $block = pad($block, 1) if $PRETTY;
+    $line = $line ? " (starting line $line)" : '';
+
+    return <<EOF;
+# RAWPERL
+#line 1 "RAWPERL block$line"
+$block
+EOF
+}
+
+
+
+#------------------------------------------------------------------------
+# filter()
+#------------------------------------------------------------------------
+
+sub filter {
+    my ($class, $lnameargs, $block) = @_;
+    my ($name, $args, $alias) = @$lnameargs;
+    $name = shift @$name;
+    $args = &args($class, $args);
+    $args = $args ? "$args, $alias" : ", undef, $alias"
+        if $alias;
+    $name .= ", $args" if $args;
+    $block = pad($block, 1) if $PRETTY;
+ 
+    return <<EOF;
+
+# FILTER
+$OUTPUT do {
+    my \$output = '';
+    my \$_tt_filter = \$context->filter($name)
+              || \$context->throw(\$context->error);
+
+$block
+    
+    &\$_tt_filter(\$output);
+};
+EOF
+}
+
+
+#------------------------------------------------------------------------
+# capture($name, $block)
+#------------------------------------------------------------------------
+
+sub capture {
+    my ($class, $name, $block) = @_;
+
+    if (ref $name) {
+        if (scalar @$name == 2 && ! $name->[1]) {
+            $name = $name->[0];
+        }
+        else {
+            $name = '[' . join(', ', @$name) . ']';
+        }
+    }
+    $block = pad($block, 1) if $PRETTY;
+
+    return <<EOF;
+
+# CAPTURE
+\$stash->set($name, do {
+    my \$output = '';
+$block
+    \$output;
+});
+EOF
+
+}
+
+
+#------------------------------------------------------------------------
+# macro($name, $block, \@args)
+#------------------------------------------------------------------------
+
+sub macro {
+    my ($class, $ident, $block, $args) = @_;
+    $block = pad($block, 2) if $PRETTY;
+
+    if ($args) {
+        my $nargs = scalar @$args;
+        $args = join(', ', map { "'$_'" } @$args);
+        $args = $nargs > 1 
+            ? "\@_tt_args{ $args } = splice(\@_, 0, $nargs)"
+            : "\$_tt_args{ $args } = shift";
+
+        return <<EOF;
+
+# MACRO
+\$stash->set('$ident', sub {
+    my \$output = '';
+    my (%_tt_args, \$_tt_params);
+    $args;
+    \$_tt_params = shift;
+    \$_tt_params = { } unless ref(\$_tt_params) eq 'HASH';
+    \$_tt_params = { \%_tt_args, %\$_tt_params };
+
+    my \$stash = \$context->localise(\$_tt_params);
+    eval {
+$block
+    };
+    \$stash = \$context->delocalise();
+    die \$@ if \$@;
+    return \$output;
+});
+EOF
+
+    }
+    else {
+        return <<EOF;
+
+# MACRO
+\$stash->set('$ident', sub {
+    my \$_tt_params = \$_[0] if ref(\$_[0]) eq 'HASH';
+    my \$output = '';
+
+    my \$stash = \$context->localise(\$_tt_params);
+    eval {
+$block
+    };
+    \$stash = \$context->delocalise();
+    die \$@ if \$@;
+    return \$output;
+});
+EOF
+    }
+}
+
+
+sub debug {
+    my ($class, $nameargs) = @_;
+    my ($file, $args) = @$nameargs;
+    my $hash = shift @$args;
+    $args  = join(', ', @$file, @$args);
+    $args .= @$hash ? ', { ' . join(', ', @$hash) . ' }' : '';
+    return "$OUTPUT \$context->debugging($args); ## DEBUG ##"; 
+}
+
+
+1;
+
+__END__
+
+=head1 NAME
+
+Template::Directive - Perl code generator for template directives
+
+=head1 SYNOPSIS
+
+    # no user serviceable parts inside
+
+=head1 DESCRIPTION
+
+The C<Template::Directive> module defines a number of methods that
+generate Perl code for the runtime representation of the various 
+Template Toolkit directives.
+
+It is used internally by the L<Template::Parser> module.
+
+=head1 AUTHOR
+
+Andy Wardley E<lt>abw at wardley.orgE<gt> L<http://wardley.org/>
+
+=head1 COPYRIGHT
+
+Copyright (C) 1996-2007 Andy Wardley.  All Rights Reserved.
+
+This module is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=head1 SEE ALSO
+
+L<Template::Parser>
+
+=cut
+
+# Local Variables:
+# mode: perl
+# perl-indent-level: 4
+# indent-tabs-mode: nil
+# End:
+#
+# vim: expandtab shiftwidth=4:
+
diff --git a/bench/perl/Template/Document.pm b/bench/perl/Template/Document.pm
new file mode 100644
index 0000000..79d26fc
--- /dev/null
+++ b/bench/perl/Template/Document.pm
@@ -0,0 +1,490 @@
+##============================================================= -*-Perl-*-
+#
+# Template::Document
+#
+# DESCRIPTION
+#   Module defining a class of objects which encapsulate compiled
+#   templates, storing additional block definitions and metadata 
+#   as well as the compiled Perl sub-routine representing the main
+#   template content.
+#
+# AUTHOR
+#   Andy Wardley   <abw at wardley.org>
+#
+# COPYRIGHT
+#   Copyright (C) 1996-2007 Andy Wardley.  All Rights Reserved.
+#
+#   This module is free software; you can redistribute it and/or
+#   modify it under the same terms as Perl itself.
+#
+#============================================================================
+
+package Template::Document;
+
+use strict;
+use warnings;
+use base 'Template::Base';
+use Template::Constants;
+
+our $VERSION = 2.79;
+our $DEBUG   = 0 unless defined $DEBUG;
+our $ERROR   = '';
+our ($COMPERR, $AUTOLOAD, $UNICODE);
+
+BEGIN {
+    # UNICODE is supported in versions of Perl from 5.008 onwards
+    if ($UNICODE = $] > 5.007 ? 1 : 0) {
+        if ($] > 5.008) {
+            # utf8::is_utf8() available from Perl 5.8.1 onwards
+            *is_utf8 = \&utf8::is_utf8;
+        }
+        elsif ($] == 5.008) {
+            # use Encode::is_utf8() for Perl 5.8.0
+            require Encode;
+            *is_utf8 = \&Encode::is_utf8;
+        }
+    }
+}
+
+
+#========================================================================
+#                     -----  PUBLIC METHODS -----
+#========================================================================
+
+#------------------------------------------------------------------------
+# new(\%document)
+#
+# Creates a new self-contained Template::Document object which 
+# encapsulates a compiled Perl sub-routine, $block, any additional 
+# BLOCKs defined within the document ($defblocks, also Perl sub-routines)
+# and additional $metadata about the document.
+#------------------------------------------------------------------------
+
+sub new {
+    my ($class, $doc) = @_;
+    my ($block, $defblocks, $metadata) = @$doc{ qw( BLOCK DEFBLOCKS METADATA ) };
+    $defblocks ||= { };
+    $metadata  ||= { };
+
+    # evaluate Perl code in $block to create sub-routine reference if necessary
+    unless (ref $block) {
+        local $SIG{__WARN__} = \&catch_warnings;
+        $COMPERR = '';
+
+        # DON'T LOOK NOW! - blindly untainting can make you go blind!
+        $block =~ /(.*)/s;
+        $block = $1;
+        
+        $block = eval $block;
+        return $class->error($@)
+            unless defined $block;
+    }
+
+    # same for any additional BLOCK definitions
+    @$defblocks{ keys %$defblocks } = 
+        # MORE BLIND UNTAINTING - turn away if you're squeamish
+        map { 
+            ref($_) 
+                ? $_ 
+                : ( /(.*)/s && eval($1) or return $class->error($@) )
+            } values %$defblocks;
+    
+    bless {
+        %$metadata,
+        _BLOCK     => $block,
+        _DEFBLOCKS => $defblocks,
+        _HOT       => 0,
+    }, $class;
+}
+
+
+#------------------------------------------------------------------------
+# block()
+#
+# Returns a reference to the internal sub-routine reference, _BLOCK, 
+# that constitutes the main document template.
+#------------------------------------------------------------------------
+
+sub block {
+    return shift->{ _BLOCK };       # main template sub-routine
+}
+
+
+#------------------------------------------------------------------------
+# blocks()
+#
+# Returns a reference to a hash array containing any BLOCK definitions
+# from the template.  The hash keys are the BLOCK names and the values
+# are the corresponding sub-routine references.  Returns a reference to
+# an empty hash if no blocks are defined.
+#------------------------------------------------------------------------
+
+sub blocks {
+    return shift->{ _DEFBLOCKS };   # hash of named BLOCK definitions
+}
+
+
+#------------------------------------------------------------------------
+# process($context)
+#
+# Process the document in a particular context.  Checks for recursion,
+# registers the document with the context via visit(), processes itself,
+# and then unwinds with a large gin and tonic.
+#------------------------------------------------------------------------
+
+sub process {
+    my ($self, $context) = @_;
+    my $result;
+
+    # refuse to re-enter a template that is already being processed,
+    # unless the context explicitly permits recursion
+    if ($self->{ _HOT } && ! $context->{ RECURSION }) {
+        return $context->throw(Template::Constants::ERROR_FILE,
+                               "recursion into '$self->{ name }'");
+    }
+
+    # register this document and its local BLOCKs with the context
+    $context->visit($self, $self->{ _DEFBLOCKS });
+
+    # flag the document as in-flight only while the main block runs
+    $self->{ _HOT } = 1;
+    eval {
+        $result = $self->{ _BLOCK }->($context);
+    };
+    $self->{ _HOT } = 0;
+
+    $context->leave();
+
+    # hand any error to the context's catch() and die with the result
+    die $context->catch($@) if $@;
+    return $result;
+}
+
+
+#------------------------------------------------------------------------
+# AUTOLOAD
+#
+# Provides pseudo-methods for read-only access to various internal 
+# members. 
+#------------------------------------------------------------------------
+
+sub AUTOLOAD {
+    my ($self) = @_;
+
+    # strip the package qualifier, leaving only the method name
+    (my $name = $AUTOLOAD) =~ s/.*:://;
+
+    # never treat the destructor as a metadata lookup
+    return if $name eq 'DESTROY';
+    return $self->{ $name };
+}
+
+
+#========================================================================
+#                     -----  PRIVATE METHODS -----
+#========================================================================
+
+
+#------------------------------------------------------------------------
+# _dump()
+#
+# Debug method which returns a string representing the internal state
+# of the object.
+#------------------------------------------------------------------------
+
+sub _dump {
+    my ($self) = @_;
+
+    # header line, then the main block followed by the named blocks
+    my $text = "$self : $self->{ name }\n"
+             . "BLOCK: $self->{ _BLOCK }\nDEFBLOCKS:\n";
+
+    # list each named block, if the document has a block table at all
+    if (my $defs = $self->{ _DEFBLOCKS }) {
+        $text .= "    $_: $defs->{ $_ }\n"
+            for keys %$defs;
+    }
+
+    return $text;
+}
+
+
+#========================================================================
+#                      ----- CLASS METHODS -----
+#========================================================================
+
+#------------------------------------------------------------------------
+# as_perl($content)
+#
+# This method expects a reference to a hash passed as the first argument
+# containing 3 items:
+#     METADATA   # a hash of template metadata
+#     BLOCK      # string containing Perl sub definition for main block
+#     DEFBLOCKS  # hash containing further subs for additional BLOCK defs
+# It returns a string containing Perl code which, when evaluated and 
+# executed, will instantiate a new Template::Document object with the 
+# above data.  On error, it returns undef with an appropriate error
+# message set in $ERROR.
+#------------------------------------------------------------------------
+
+sub as_perl {
+    my ($class, $content) = @_;
+    my ($block, $defblocks, $metadata) = @$content{ qw( BLOCK DEFBLOCKS METADATA ) };
+
+    $block =~ s/\n(?!#line)/\n    /g;
+    $block =~ s/\s+$//;
+
+    $defblocks = join('', map {
+        my $code = $defblocks->{ $_ };
+        $code =~ s/\n(?!#line)/\n        /g;
+        $code =~ s/\s*$//;
+        "        '$_' => $code,\n";
+    } keys %$defblocks);
+    $defblocks =~ s/\s+$//;
+
+    $metadata = join('', map { 
+        my $x = $metadata->{ $_ }; 
+        $x =~ s/(['\\])/\\$1/g; 
+        "        '$_' => '$x',\n";
+    } keys %$metadata);
+    $metadata =~ s/\s+$//;
+
+    return <<EOF
+#------------------------------------------------------------------------
+# Compiled template generated by the Template Toolkit version $Template::VERSION
+#------------------------------------------------------------------------
+
+$class->new({
+    METADATA => {
+$metadata
+    },
+    BLOCK => $block,
+    DEFBLOCKS => {
+$defblocks
+    },
+});
+EOF
+}
+
+
+#------------------------------------------------------------------------
+# write_perl_file($filename, \%content)
+#
+# This method calls as_perl() to generate the Perl code to represent a
+# compiled template with the content passed as the second argument.
+# It then writes this to the file denoted by the first argument.
+#
+# Returns 1 on success.  On error, sets the $ERROR package variable
+# to contain an error message and returns undef.
+#------------------------------------------------------------------------
+
+sub write_perl_file {
+    my ($class, $file, $content) = @_;
+    my ($fh, $tmpfile);
+    
+    return $class->error("invalid filename: $file")
+        unless $file =~ /^(.+)$/s;
+
+    eval {
+        require File::Temp;
+        require File::Basename;
+        ($fh, $tmpfile) = File::Temp::tempfile( 
+            DIR => File::Basename::dirname($file) 
+        );
+        my $perlcode = $class->as_perl($content) || die $!;
+        
+        if ($UNICODE && is_utf8($perlcode)) {
+            $perlcode = "use utf8;\n\n$perlcode";
+            binmode $fh, ":utf8";
+        }
+        print $fh $perlcode;
+        close($fh);
+    };
+    return $class->error($@) if $@;
+    return rename($tmpfile, $file)
+        || $class->error($!);
+}
+
+
+#------------------------------------------------------------------------
+# catch_warnings($msg)
+#
+# Installed as the $SIG{__WARN__} handler by new(); appends warnings to $COMPERR.
+#------------------------------------------------------------------------
+
+sub catch_warnings {
+    $COMPERR .= join('', @_); 
+}
+
+    
+1;
+
+__END__
+
+=head1 NAME
+
+Template::Document - Compiled template document object
+
+=head1 SYNOPSIS
+
+    use Template::Document;
+    
+    $doc = Template::Document->new({
+        BLOCK => sub { # some perl code; return $some_text },
+        DEFBLOCKS => {
+            header => sub { # more perl code; return $some_text },
+            footer => sub { # blah blah blah; return $some_text },
+        },
+        METADATA => {
+            author  => 'Andy Wardley',
+            version => 3.14,
+        }
+    }) || die $Template::Document::ERROR;
+    
+    print $doc->process($context);
+
+=head1 DESCRIPTION
+
+This module defines an object class whose instances represent compiled
+template documents.  The L<Template::Parser> module creates a
+C<Template::Document> instance to encapsulate a template as it is compiled
+into Perl code.
+
+The constructor method, L<new()>, expects a reference to a hash array
+containing the C<BLOCK>, C<DEFBLOCKS> and C<METADATA> items.  
+
+The C<BLOCK> item should contain a reference to a Perl subroutine or a textual
+representation of Perl code, as generated by the L<Template::Parser> module.
+This is then evaluated into a subroutine reference using C<eval()>. 
+
+The C<DEFBLOCKS> item should reference a hash array containing further named
+C<BLOCK>s which may be defined in the template. The keys represent C<BLOCK>
+names and the values should be subroutine references or text strings of Perl
+code as per the main C<BLOCK> item. 
+
+The C<METADATA> item should reference a hash array of metadata items relevant
+to the document.
+
+The L<process()> method can then be called on the instantiated
+C<Template::Document> object, passing a reference to a L<Template::Context>
+object as the first parameter. This will install any locally defined blocks
+(C<DEFBLOCKS>) in the C<BLOCKS> cache in the context (via a call to
+L<visit()|Template::Context#visit()>) so that they may be subsequently
+resolved by the context. The main C<BLOCK> subroutine is then executed,
+passing the context reference on as a parameter. The text returned from the
+template subroutine is then returned by the L<process()> method, after calling
+the context L<leave()|Template::Context#leave()> method to permit cleanup and
+de-registration of named C<BLOCKS> previously installed.
+
+An C<AUTOLOAD> method provides access to the C<METADATA> items for the
+document. The L<Template::Service> module installs a reference to the main
+C<Template::Document> object in the stash as the C<template> variable. This allows
+metadata items to be accessed from within templates, including C<PRE_PROCESS>
+templates.
+
+header:
+
+    <html>
+    <head>
+    <title>[% template.title %]
+    </head>
+    ...
+
+C<Template::Document> objects are usually created by the L<Template::Parser>
+but can be manually instantiated or sub-classed to provide custom
+template components.
+
+=head1 METHODS
+
+=head2 new(\%config)
+
+Constructor method which accepts a reference to a hash array containing the
+structure as shown in this example:
+
+    $doc = Template::Document->new({
+        BLOCK => sub { # some perl code; return $some_text },
+        DEFBLOCKS => {
+            header => sub { # more perl code; return $some_text },
+            footer => sub { # blah blah blah; return $some_text },
+        },
+        METADATA => {
+            author  => 'Andy Wardley',
+            version => 3.14,
+        }
+    }) || die $Template::Document::ERROR;
+
+C<BLOCK> and C<DEFBLOCKS> items may be expressed as references to Perl subroutines
+or as text strings containing Perl subroutine definitions, as is generated
+by the L<Template::Parser> module.  These are evaluated into subroutine references
+using C<eval()>.
+
+Returns a new C<Template::Document> object or C<undef> on error. The
+L<error()|Template::Base#error()> class method can be called, or the C<$ERROR>
+package variable inspected to retrieve the relevant error message.
+
+=head2 process($context)
+
+Main processing routine for the compiled template document. A reference to a
+L<Template::Context> object should be passed as the first parameter. The
+method installs any locally defined blocks via a call to the context
+L<visit()|Template::Context#visit()> method, processes its own template,
+(passing the context reference as a parameter) and then calls
+L<leave()|Template::Context#leave()> in the context to allow cleanup.
+
+    print $doc->process($context);
+
+Returns a text string representing the generated output for the template.
+Errors are thrown via C<die()>.
+
+=head2 block()
+
+Returns a reference to the main C<BLOCK> subroutine.
+
+=head2 blocks()
+
+Returns a reference to the hash array of named C<DEFBLOCKS> subroutines.
+
+=head2 AUTOLOAD
+
+An autoload method returns C<METADATA> items.
+
+    print $doc->author();
+
+=head1 PACKAGE SUB-ROUTINES
+
+=head2 write_perl_file($filename, \%content)
+
+This package subroutine is provided to effect persistence of compiled
+templates.  If the C<COMPILE_EXT> option (to indicate a file extension
+for saving compiled templates) is set then the L<Template::Parser> module
+calls this subroutine before calling the L<new()> constructor.  At this
+stage, the parser has a representation of the template as text strings
+containing Perl code.  We can write that to a file, enclosed in a
+small wrapper which will allow us to subsequently C<require()> the file
+and have Perl parse and compile it into a C<Template::Document>.  Thus we
+have persistence of compiled templates.
+
+=head1 AUTHOR
+
+Andy Wardley E<lt>abw at wardley.orgE<gt> L<http://wardley.org/>
+
+=head1 COPYRIGHT
+
+Copyright (C) 1996-2007 Andy Wardley.  All Rights Reserved.
+
+This module is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=head1 SEE ALSO
+
+L<Template>, L<Template::Parser>
+
+=cut
+
+# Local Variables:
+# mode: perl
+# perl-indent-level: 4
+# indent-tabs-mode: nil
+# End:
+#
+# vim: expandtab shiftwidth=4:
diff --git a/bench/perl/Template/Exception.pm b/bench/perl/Template/Exception.pm
new file mode 100644
index 0000000..5432d64
--- /dev/null
+++ b/bench/perl/Template/Exception.pm
@@ -0,0 +1,229 @@
+#============================================================= -*-Perl-*-
+#
+# Template::Exception
+#
+# DESCRIPTION
+#   Module implementing a generic exception class used for error handling
+#   in the Template Toolkit.
+#
+# AUTHOR
+#   Andy Wardley   <abw at wardley.org>
+#
+# COPYRIGHT
+#   Copyright (C) 1996-2007 Andy Wardley.  All Rights Reserved.
+#
+#   This module is free software; you can redistribute it and/or
+#   modify it under the same terms as Perl itself.
+#
+#========================================================================
+
+package Template::Exception;
+
+use strict;
+use warnings;
+use constant TYPE  => 0;
+use constant INFO  => 1;
+use constant TEXT  => 2;
+use overload q|""| => "as_string", fallback => 1;
+
+our $VERSION = 2.70;
+
+
+#------------------------------------------------------------------------
+# new($type, $info, \$text)
+#
+# Constructor method used to instantiate a new Template::Exception
+# object.  The first parameter should contain the exception type.  This
+# can be any arbitrary string of the caller's choice to represent a 
+# specific exception.  The second parameter should contain any 
+# information (i.e. error message or data reference) relevant to the 
+# specific exception event.  The third optional parameter may be a 
+# reference to a scalar containing output text from the template 
+# block up to the point where the exception was thrown.
+#------------------------------------------------------------------------
+
+sub new {
+    my ($class, @fields) = @_;
+    return bless [ @fields[ 0 .. 2 ] ], $class;   # TYPE, INFO, TEXT slots
+}
+
+
+#------------------------------------------------------------------------
+# type()
+# info()
+# type_info()
+#
+# Accessor methods to return the internal TYPE and INFO fields.
+#------------------------------------------------------------------------
+
+sub type {
+    return shift->[ TYPE ];
+}
+
+sub info {
+    return shift->[ INFO ];
+}
+
+sub type_info {
+    my ($self) = @_;
+    return @{ $self }[ TYPE, INFO ];
+}
+
+#------------------------------------------------------------------------
+# text()
+# text(\$pretext)
+#
+# Method to return the text referenced by the TEXT member.  A text 
+# reference may be passed as a parameter to supersede the existing
+# member.  The existing text is added to the *end* of the new text
+# before being stored.  This facility is provided for template blocks
+# to gracefully de-nest when an exception occurs and allows them to 
+# reconstruct their output in the correct order. 
+#------------------------------------------------------------------------
+
+sub text {
+    my ($self, $incoming) = @_;
+    my $current = $self->[ TEXT ];
+
+    # getter: no replacement supplied, so report whatever is stored
+    unless ($incoming) {
+        return $current ? $$current : '';
+    }
+
+    # setter: the old text goes on the *end* of the new text, unless
+    # both references already point at the same scalar
+    $$incoming .= $$current
+        if $current && $current ne $incoming;
+    $self->[ TEXT ] = $incoming;
+    return '';
+}
+
+
+#------------------------------------------------------------------------
+# as_string()
+#
+# Accessor method to return a string indicating the exception type and
+# information.
+#------------------------------------------------------------------------
+
+sub as_string {
+    my ($self) = @_;
+    return sprintf('%s error - %s', @$self[ TYPE, INFO ]);
+}
+
+
+#------------------------------------------------------------------------
+# select_handler(@types)
+# 
+# Selects the most appropriate handler for the exception TYPE, from 
+# the list of types passed in as parameters.  The method returns the
+# item which is an exact match for TYPE or the closest, more 
+# generic handler (e.g. foo being more generic than foo.bar, etc.)
+#------------------------------------------------------------------------
+
+sub select_handler {
+    my ($self, @candidates) = @_;
+
+    # lookup table of the handler names on offer
+    my %known = map { ($_ => 1) } @candidates;
+
+    # walk from the full type (e.g. foo.bar.baz) up through its more
+    # generic ancestors (foo.bar, then foo), dropping the last dotted
+    # element each time, and stop at the first one that has a handler
+    for (my $name = $self->[ TYPE ]; $name; $name =~ s/\.?[^\.]*$//) {
+        return $name if $known{ $name };
+    }
+
+    return undef;
+}
+    
+1;
+
+__END__
+
+=head1 NAME
+
+Template::Exception - Exception handling class module
+
+=head1 SYNOPSIS
+
+    use Template::Exception;
+    
+    my $exception = Template::Exception->new($type, $info);
+    $type = $exception->type;
+    $info = $exception->info;
+    ($type, $info) = $exception->type_info;
+    
+    print $exception->as_string();
+    
+    $handler = $exception->select_handler(@candidates);
+
+=head1 DESCRIPTION
+
+The C<Template::Exception> module defines an object class for
+representing exceptions within the template processing life cycle.
+Exceptions can be raised by modules within the Template Toolkit, or
+can be generated and returned by user code bound to template
+variables.
+
+Exceptions can be raised in a template using the C<THROW> directive,
+
+    [% THROW user.login 'no user id: please login' %]
+
+or by calling the L<throw()|Template::Context#throw()> method on the current
+L<Template::Context> object,
+
+    $context->throw('user.passwd', 'Incorrect Password');
+    $context->throw('Incorrect Password');    # type 'undef'
+
+or from Perl code by calling C<die()> with a C<Template::Exception> object,
+
+    die (Template::Exception->new('user.denied', 'Invalid User ID'));
+
+or by simply calling C<die()> with an error string.  This is
+automagically caught and converted to an  exception of 'C<undef>'
+type (that's the literal string 'C<undef>' rather than Perl's 
+undefined value) which can then be handled in the usual way.
+
+    die "I'm sorry Dave, I can't do that";
+
+Each exception is defined by its type and an information component
+(e.g. error message).  The type can be any identifying string and may
+contain dotted components (e.g. 'C<foo>', 'C<foo.bar>', 'C<foo.bar.baz>').
+Exception types are considered to be hierarchical such that 'C<foo.bar>'
+would be a specific type of the more general 'C<foo>' type.
+
+=head1 METHODS
+
+=head2 type()
+
+Returns the exception type.
+
+=head2 info()
+
+Returns the exception information.
+
+=head1 AUTHOR
+
+Andy Wardley E<lt>abw at wardley.orgE<gt> L<http://wardley.org/>
+
+=head1 COPYRIGHT
+
+Copyright (C) 1996-2007 Andy Wardley.  All Rights Reserved.
+
+This module is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=head1 SEE ALSO
+
+L<Template>, L<Template::Context>
+
+=cut
+
+# Local Variables:
+# mode: perl
+# perl-indent-level: 4
+# indent-tabs-mode: nil
+# End:
+#
+# vim: expandtab shiftwidth=4:
diff --git a/bench/perl/Template/Filters.pm b/bench/perl/Template/Filters.pm
new file mode 100644
index 0000000..380e6e9
--- /dev/null
+++ b/bench/perl/Template/Filters.pm
@@ -0,0 +1,811 @@
+#============================================================= -*-Perl-*-
+#
+# Template::Filters
+#
+# DESCRIPTION
+#   Defines filter plugins as used by the FILTER directive.
+#
+# AUTHORS
+#   Andy Wardley <abw at wardley.org>, with a number of filters contributed
+#   by Leslie Michael Orchard <deus_x at nijacode.com>
+#
+# COPYRIGHT
+#   Copyright (C) 1996-2007 Andy Wardley.  All Rights Reserved.
+#
+#   This module is free software; you can redistribute it and/or
+#   modify it under the same terms as Perl itself.
+#
+#============================================================================
+
+package Template::Filters;
+
+use strict;
+use warnings;
+use locale;
+use base 'Template::Base';
+use Template::Constants;
+use Scalar::Util 'blessed';
+
+our $VERSION         = 2.87;
+our $AVAILABLE       = { };
+our $TRUNCATE_LENGTH = 32;
+our $TRUNCATE_ADDON  = '...';
+
+
+#------------------------------------------------------------------------
+# standard filters, defined in one of the following forms:
+#   name =>   \&static_filter
+#   name => [ \&subref, $is_dynamic ]
+# If the $is_dynamic flag is set then the sub-routine reference 
+# is called to create a new filter each time it is requested;  if
+# not set, then it is a single, static sub-routine which is returned
+# for every filter request for that name.
+#------------------------------------------------------------------------
+
+our $FILTERS = {
+    # static filters 
+    'html'            => \&html_filter,
+    'html_para'       => \&html_paragraph,
+    'html_break'      => \&html_para_break,
+    'html_para_break' => \&html_para_break,
+    'html_line_break' => \&html_line_break,
+    'xml'             => \&xml_filter,
+    'uri'             => \&uri_filter,
+    'url'             => \&url_filter,
+    'upper'           => sub { uc $_[0] },
+    'lower'           => sub { lc $_[0] },
+    'ucfirst'         => sub { ucfirst $_[0] },
+    'lcfirst'         => sub { lcfirst $_[0] },
+    'stderr'          => sub { print STDERR @_; return '' },
+    'trim'            => sub { for ($_[0]) { s/^\s+//; s/\s+$// }; $_[0] },
+    'null'            => sub { return '' },
+    'collapse'        => sub { for ($_[0]) { s/^\s+//; s/\s+$//; s/\s+/ /g };
+                               $_[0] },
+
+    # dynamic filters
+    'html_entity' => [ \&html_entity_filter_factory, 1 ],
+    'indent'      => [ \&indent_filter_factory,      1 ],
+    'format'      => [ \&format_filter_factory,      1 ],
+    'truncate'    => [ \&truncate_filter_factory,    1 ],
+    'repeat'      => [ \&repeat_filter_factory,      1 ],
+    'replace'     => [ \&replace_filter_factory,     1 ],
+    'remove'      => [ \&remove_filter_factory,      1 ],
+    'eval'        => [ \&eval_filter_factory,        1 ],
+    'evaltt'      => [ \&eval_filter_factory,        1 ],  # alias
+    'perl'        => [ \&perl_filter_factory,        1 ],
+    'evalperl'    => [ \&perl_filter_factory,        1 ],  # alias
+    'redirect'    => [ \&redirect_filter_factory,    1 ],
+    'file'        => [ \&redirect_filter_factory,    1 ],  # alias
+    'stdout'      => [ \&stdout_filter_factory,      1 ],
+};
+
+# name of module implementing plugin filters
+our $PLUGIN_FILTER = 'Template::Plugin::Filter';
+
+
+
+#========================================================================
+#                         -- PUBLIC METHODS --
+#========================================================================
+
+#------------------------------------------------------------------------
+# fetch($name, \@args, $context)
+#
+# Attempts to instantiate or return a reference to a filter sub-routine 
+# named by the first parameter, $name, with additional constructor 
+# arguments passed by reference to a list as the second parameter, 
+# $args.  A reference to the calling Template::Context object is 
+# passed as the third parameter.
+#
+# Returns a reference to a filter sub-routine or a pair of values
+# (undef, STATUS_DECLINED) or ($error, STATUS_ERROR) to decline to
+# deliver the filter or to indicate an error.
+#------------------------------------------------------------------------
+
+sub fetch {
+    my ($self, $name, $args, $context) = @_;
+    my ($factory, $is_dynamic, $filter, $error);
+
+    $self->debug("fetch($name, ", 
+                 defined $args ? ('[ ', join(', ', @$args), ' ]') : '<no args>', ', ',
+                 defined $context ? $context : '<no context>', 
+                 ')') if $self->{ DEBUG };
+
+    # allow $name to be specified as a reference to 
+    # a plugin filter object;  any other ref is 
+    # assumed to be a coderef and hence already a filter;
+    # non-refs are assumed to be regular name lookups
+
+    if (ref $name) {
+        if (blessed($name) && $name->isa($PLUGIN_FILTER)) {
+            $factory = $name->factory()
+                || return $self->error($name->error());
+        }
+        else {
+            return $name;
+        }
+    }
+    else {
+        return (undef, Template::Constants::STATUS_DECLINED)
+            unless ($factory = $self->{ FILTERS }->{ $name }
+                    || $FILTERS->{ $name });
+    }
+
+    # factory can be an [ $code, $dynamic ] or just $code
+    if (ref $factory eq 'ARRAY') {
+        ($factory, $is_dynamic) = @$factory;
+    }
+    else {
+        $is_dynamic = 0;
+    }
+
+    if (ref $factory eq 'CODE') {
+        if ($is_dynamic) {
+            # if the dynamic flag is set then the sub-routine is a 
+            # factory which should be called to create the actual 
+            # filter...
+            eval {
+                ($filter, $error) = &$factory($context, $args ? @$args : ());
+            };
+            $error ||= $@;
+            $error = "invalid FILTER for '$name' (not a CODE ref)"
+                unless $error || ref($filter) eq 'CODE';
+        }
+        else {
+            # ...otherwise, it's a static filter sub-routine
+            $filter = $factory;
+        }
+    }
+    else {
+        $error = "invalid FILTER entry for '$name' (not a CODE ref)";
+    }
+
+    if ($error) {
+        return $self->{ TOLERANT } 
+               ? (undef,  Template::Constants::STATUS_DECLINED) 
+               : ($error, Template::Constants::STATUS_ERROR) ;
+    }
+    else {
+        return $filter;
+    }
+}
+
+
+#------------------------------------------------------------------------
+# store($name, \&filter)
+#
+# Stores a new filter in the internal FILTERS hash.  The first parameter
+# is the filter name, the second a reference to a subroutine or 
+# array, as per the standard $FILTERS entries.
+#------------------------------------------------------------------------
+
+sub store {
+    my ($self, $name, $filter) = @_;
+    if ($self->{ DEBUG }) {
+        $self->debug("store($name, $filter)");
+    }
+    $self->{ FILTERS }{ $name } = $filter;
+    return 1;
+}
+
+
+#========================================================================
+#                        -- PRIVATE METHODS --
+#========================================================================
+
+#------------------------------------------------------------------------
+# _init(\%config)
+#
+# Private initialisation method.
+#------------------------------------------------------------------------
+
+sub _init {
+    my ($self, $config) = @_;
+    my $debug = $config->{ DEBUG } || 0;
+    # filters registered on this object, plus the error-tolerance flag
+    $self->{ FILTERS  } = $config->{ FILTERS }  || { };
+    $self->{ TOLERANT } = $config->{ TOLERANT } || 0;
+
+    # keep only the filter-specific debugging bit(s)
+    $self->{ DEBUG    } = $debug & Template::Constants::DEBUG_FILTERS;
+    return $self;
+}
+
+
+
+#------------------------------------------------------------------------
+# _dump()
+# 
+# Debug method
+#------------------------------------------------------------------------
+
+sub _dump {
+    my $self = shift;
+    my $output = "[Template::Filters] {\n";
+    my $format = "    %-16s => %s\n";
+    my $key;
+
+    foreach $key (qw( TOLERANT )) {
+        my $val = $self->{ $key };
+        $val = '<undef>' unless defined $val;
+        $output .= sprintf($format, $key, $val);
+    }
+
+    my $filters = $self->{ FILTERS };
+    $filters = join('', map { 
+        sprintf("    $format", $_, $filters->{ $_ });
+    } keys %$filters);
+    $filters = "{\n$filters    }";
+    
+    $output .= sprintf($format, 'FILTERS (local)' => $filters);
+
+    $filters = $FILTERS;
+    $filters = join('', map { 
+        my $f = $filters->{ $_ };
+        my ($ref, $dynamic) = ref $f eq 'ARRAY' ? @$f : ($f, 0);
+        sprintf("    $format", $_, $dynamic ? 'dynamic' : 'static');
+    } sort keys %$filters);
+    $filters = "{\n$filters    }";
+    
+    $output .= sprintf($format, 'FILTERS (global)' => $filters);
+
+    $output .= '}';
+    return $output;
+}
+
+
+#========================================================================
+#                         -- STATIC FILTER SUBS --
+#========================================================================
+
+#------------------------------------------------------------------------
+# uri_filter()                                           [% FILTER uri %]
+#
+# URI escape a string.  This code is borrowed from Gisle Aas' URI::Escape
+# module, copyright 1995-2004.  See RFC2396 for details.
+#-----------------------------------------------------------------------
+
+# cache of escaped characters
+our $URI_ESCAPES;
+
+sub uri_filter {
+    my $text = shift;
+
+    # build the character => "%XX" lookup table the first time it is needed
+    unless ($URI_ESCAPES) {
+        my %table;
+        $table{ chr $_ } = sprintf("%%%02X", $_) for 0 .. 255;
+        $URI_ESCAPES = \%table;
+    }
+
+    # character strings are turned into UTF-8 octets before escaping
+    utf8::encode($text)
+        if $] >= 5.008 && utf8::is_utf8($text);
+
+    # everything outside the unreserved set is percent-escaped
+    $text =~ s/([^A-Za-z0-9\-_.!~*'()])/$URI_ESCAPES->{$1}/eg;
+    return $text;
+}
+
+#------------------------------------------------------------------------
+# url_filter()                                           [% FILTER url %]
+#
+# NOTE: the difference: url vs uri. 
+# This implements the old-style, non-strict behaviour of the uri filter 
+# which allows any valid URL characters to pass through so that 
+# http://example.com/blah.html does not get the ':' and '/' characters 
+# munged. 
+#-----------------------------------------------------------------------
+
+sub url_filter {
+    my $text = shift;
+
+    # build the character => "%XX" lookup table the first time it is needed
+    if (! $URI_ESCAPES) {
+        $URI_ESCAPES = { };
+        foreach my $code (0 .. 255) {
+            $URI_ESCAPES->{ chr $code } = sprintf("%%%02X", $code);
+        }
+    }
+
+    # character strings are turned into UTF-8 octets before escaping
+    if ($] >= 5.008) {
+        utf8::encode($text) if utf8::is_utf8($text);
+    }
+
+    # the reserved URL characters ; / ? : @ & = + $ , pass through untouched
+    $text =~ s/([^;\/?:@&=+\$,A-Za-z0-9\-_.!~*'()])/$URI_ESCAPES->{$1}/eg;
+    return $text;
+}
+
+
+#------------------------------------------------------------------------
+# html_filter()                                         [% FILTER html %]
+#
+# Convert any '<', '>', '&' or '"' characters to the HTML equivalents,
+# '&lt;', '&gt;', '&amp;' and '&quot;', respectively.
+#------------------------------------------------------------------------
+
+sub html_filter {
+    my $text = shift;
+    for ($text) {
+        # '&' has to go first, otherwise the ampersands introduced by the
+        # substitutions below would themselves be escaped a second time
+        s/&/&amp;/g;
+        s/</&lt;/g;
+        s/>/&gt;/g;
+        s/"/&quot;/g;
+    }
+    return $text;
+}
+
+
+#------------------------------------------------------------------------
+# xml_filter()                                           [% FILTER xml %]
+#
+# Same as the html filter, but adds the conversion of ' to &apos; which
+# is native to XML.
+#------------------------------------------------------------------------
+
+sub xml_filter {
+    my $text = shift;
+    for ($text) {
+        # '&' has to go first, otherwise the ampersands introduced by the
+        # substitutions below would themselves be escaped a second time
+        s/&/&amp;/g;
+        s/</&lt;/g;
+        s/>/&gt;/g;
+        s/"/&quot;/g;
+        s/'/&apos;/g;
+    }
+    return $text;
+}
+
+
+#------------------------------------------------------------------------
+# html_paragraph()                                 [% FILTER html_para %]
+#
+# Wrap each paragraph of text (delimited by two or more newlines) in the
+# <p>...</p> HTML tags.
+#------------------------------------------------------------------------
+
+sub html_paragraph  {
+    my $text = shift;
+
+    # a run of two or more line endings separates one paragraph from the next
+    my @paragraphs = split(/(?:\r?\n){2,}/, $text);
+
+    # close the previous paragraph and open the next between each pair
+    my $body = join("\n</p>\n\n<p>\n", @paragraphs);
+
+    return "<p>\n" . $body . "</p>\n";
+}
+
+
+#------------------------------------------------------------------------
+# html_para_break()                          [% FILTER html_para_break %]
+#                                               
+# Join each paragraph of text (delimited by two or more newlines) with
+# two <br /> HTML tags.
+#------------------------------------------------------------------------
+
+sub html_para_break  {
+    # each run of two or more line endings becomes two <br /> tags, laid
+    # out using the last line ending captured from that run
+    (my $broken = shift) =~ s|(\r?\n){2,}|$1<br />$1<br />$1|g;
+    return $broken;
+}
+
+#------------------------------------------------------------------------
+# html_line_break()                          [% FILTER html_line_break %]
+#
+# Inserts a <br /> HTML tag before each newline.
+#------------------------------------------------------------------------
+
+sub html_line_break  {
+    # the line ending itself is kept; the tag is placed in front of it
+    (my $broken = shift) =~ s|(\r?\n)|<br />$1|g;
+    return $broken;
+}
+
+#========================================================================
+#                    -- DYNAMIC FILTER FACTORIES --
+#========================================================================
+
+#------------------------------------------------------------------------
+# html_entity_filter_factory($context)           [% FILTER html_entity %]
+#
+# Dynamic version of the static html filter which attempts to locate the
+# Apache::Util or HTML::Entities modules to perform full entity encoding
+# of the text passed.  Returns an exception if neither of the modules
+# can be located.
+#------------------------------------------------------------------------
+
+sub use_html_entities {
+    require HTML::Entities;
+
+    # remember the encoder so that later factory calls can reuse it
+    my $encoder = \&HTML::Entities::encode_entities;
+    $AVAILABLE->{ HTML_ENTITY } = $encoder;
+    return $encoder;
+}
+
+sub use_apache_util {
+    require Apache::Util;
+
+    # Call the function once with an empty string before publishing it.
+    # NOTE(review): presumably a probe so that an unusable Apache::Util
+    # dies here rather than at first real use -- TODO confirm.
+    Apache::Util::escape_html('');
+
+    # remember the encoder so that later factory calls can reuse it
+    my $encoder = \&Apache::Util::escape_html;
+    $AVAILABLE->{ HTML_ENTITY } = $encoder;
+    return $encoder;
+}
+
+sub html_entity_filter_factory {
+    my $context = shift;
+
+    # Prefer an encoder that has already been selected, then try
+    # Apache::Util, then HTML::Entities.  Each attempt is wrapped in eval
+    # so that a missing module simply moves on to the next candidate.
+    my $encoder = $AVAILABLE->{ HTML_ENTITY };
+    $encoder ||= eval { use_apache_util()   };
+    $encoder ||= eval { use_html_entities() };
+    $encoder ||= -1;    # true, but not a code ref: "not available"
+
+    return $encoder
+        if ref $encoder eq 'CODE';
+
+    return (undef, Template::Exception->new(
+        html_entity => 'cannot locate Apache::Util or HTML::Entities' ));
+}
+
+
+#------------------------------------------------------------------------
+# indent_filter_factory($pad)                    [% FILTER indent(pad) %]
+#
+# Create a filter to indent text by a fixed pad string or when $pad is
+# numerical, a number of space. 
+#------------------------------------------------------------------------
+
+sub indent_filter_factory {
+    my ($context, $pad) = @_;
+
+    # no argument means four spaces; a plain number means that many spaces;
+    # anything else is used verbatim as the prefix
+    $pad = 4 unless defined $pad;
+    if ($pad =~ /^\d+$/) {
+        $pad = ' ' x $pad;
+    }
+
+    return sub {
+        my ($block) = @_;
+        $block = '' unless defined $block;
+        # /m lets ^ match at the start of every line in the block
+        $block =~ s/^/$pad/mg;
+        return $block;
+    };
+}
+
+#------------------------------------------------------------------------
+# format_filter_factory()                     [% FILTER format(format) %]
+#
+# Create a filter to format text according to a printf()-like format
+# string.
+#------------------------------------------------------------------------
+
+sub format_filter_factory {
+    my ($context, $format) = @_;
+    $format = '%s' unless defined $format;
+
+    return sub {
+        my ($block) = @_;
+        $block = '' unless defined $block;
+
+        # the format is applied to each line separately
+        my @formatted;
+        foreach my $line (split(/\n/, $block)) {
+            push @formatted, sprintf($format, $line);
+        }
+        return join("\n", @formatted);
+    };
+}
+
+
+#------------------------------------------------------------------------
+# repeat_filter_factory($n)                        [% FILTER repeat(n) %]
+#
+# Create a filter to repeat text n times.
+#------------------------------------------------------------------------
+
+# Returns a filter which repeats its input $iter times.  $iter defaults
+# to 1 when it is undefined or an empty string.  The copies are simply
+# concatenated; no separator is inserted between them.
+sub repeat_filter_factory {
+    my ($context, $iter) = @_;
+    $iter = 1 unless defined $iter and length $iter;
+
+    return sub {
+        my $text = shift;
+        $text = '' unless defined $text;
+        # string repetition: the text is not parenthesised, so this
+        # yields one concatenated string rather than a list of copies
+        return $text x $iter;
+    }
+}
+
+
+#------------------------------------------------------------------------
+# replace_filter_factory($s, $r)    [% FILTER replace(search, replace) %]
+#
+# Create a filter to replace 'search' text with 'replace'
+#------------------------------------------------------------------------
+
+sub replace_filter_factory {
+    my ($context, $search, $replace) = @_;
+
+    # a missing pattern or replacement is treated as the empty string
+    for ($search, $replace) {
+        $_ = '' unless defined $_;
+    }
+
+    return sub {
+        my ($subject) = @_;
+        $subject = '' unless defined $subject;
+        # $search is interpolated as a regular expression; $replace is
+        # inserted as a plain string
+        $subject =~ s/$search/$replace/g;
+        return $subject;
+    };
+}
+
+
+#------------------------------------------------------------------------
+# remove_filter_factory($text)                  [% FILTER remove(text) %]
+#
+# Create a filter to remove 'search' string from the input text.
+#------------------------------------------------------------------------
+
+# Returns a filter which deletes every match of the regular expression
+# $search from its input.  An undefined or empty $search removes nothing.
+sub remove_filter_factory {
+    my ($context, $search) = @_;
+    $search = '' unless defined $search;
+
+    return sub {
+        my $text = shift;
+        $text = '' unless defined $text;
+        # An empty pattern must not reach s///: Perl would silently reuse
+        # the last successfully matched regex instead of matching nothing.
+        return $text unless length $search;
+        $text =~ s/$search//g;
+        return $text;
+    }
+}
+
+
+#------------------------------------------------------------------------
+# truncate_filter_factory($n)                    [% FILTER truncate(n) %]
+#
+# Create a filter to truncate text after n characters.
+#------------------------------------------------------------------------
+
+sub truncate_filter_factory {
+    my ($context, $len, $char) = @_;
+    defined $len  or $len  = $TRUNCATE_LENGTH;
+    defined $char or $char = $TRUNCATE_ADDON;
+
+    # the add-on counts towards the total length, so it is itself cut
+    # down when it is longer than the requested length
+    my $addon_len = length $char;
+    if ($addon_len > $len) {
+        $char      = substr($char, 0, $len);
+        $addon_len = $len;
+    }
+
+    return sub {
+        my $text = shift;
+        # text that already fits is handed back untouched
+        return length($text) > $len
+            ? substr($text, 0, $len - $addon_len) . $char
+            : $text;
+    };
+}
+
+
+#------------------------------------------------------------------------
+# eval_filter_factory                                   [% FILTER eval %]
+# 
+# Create a filter to evaluate template text.
+#------------------------------------------------------------------------
+
+sub eval_filter_factory {
+    my ($context) = @_;
+
+    # the filtered text is handed to the context, by reference, to be
+    # processed as a template in its own right
+    return sub {
+        my ($source) = @_;
+        return $context->process(\$source);
+    };
+}
+
+
+#------------------------------------------------------------------------
+# perl_filter_factory                                   [% FILTER perl %]
+# 
+# Create a filter to process Perl text iff the context EVAL_PERL flag 
+# is set.
+#------------------------------------------------------------------------
+
+# Factory for the perl filter.  Returns (undef, exception) when the
+# context's eval_perl() is false; otherwise returns a closure which runs
+# the filtered text as Perl code via string eval and returns the value
+# that the code evaluates to.
+sub perl_filter_factory {
+    my $context = shift;
+    my $stash = $context->stash;
+
+    return (undef, Template::Exception->new('perl', 'EVAL_PERL is not set'))
+        unless $context->eval_perl();
+
+    return sub {
+        my $text = shift;
+        # expose the context and stash to the evaluated code as package
+        # variables of Template::Perl, for the duration of this call only
+        local($Template::Perl::context) = $context;
+        local($Template::Perl::stash)   = $stash;
+        # The heredoc interpolates $text (the code to run) right away; the
+        # backslash-escaped \$stash and \$context stay literal so that they
+        # name the Template::Perl package variables when the eval runs.
+        # NOTE: this executes the filtered text as arbitrary Perl.
+        my $out = eval <<EOF;
+package Template::Perl; 
+\$stash = \$context->stash(); 
+$text
+EOF
+        # compile or runtime failures are re-raised through the context
+        $context->throw($@) if $@;
+        return $out;
+    }
+}
+
+
+#------------------------------------------------------------------------
+# redirect_filter_factory($context, $file)    [% FILTER redirect(file) %]
+#
+# Create a filter to redirect the block text to a file.
+#------------------------------------------------------------------------
+
+sub redirect_filter_factory {
+    my ($context, $file, $options) = @_;
+
+    # redirection is only available when an OUTPUT_PATH is configured
+    unless ($context->config->{ OUTPUT_PATH }) {
+        return (undef, Template::Exception->new('redirect',
+                                                'OUTPUT_PATH is not set'));
+    }
+
+    # refuse any filename with a "../" component
+    if ($file =~ m{(^|/)\.\./}) {
+        $context->throw('redirect',
+                        "relative filenames are not supported: $file");
+    }
+
+    # a non-reference third argument is shorthand for the binmode option
+    $options = { binmode => $options } unless ref $options;
+
+    return sub {
+        my $text = shift;
+
+        # OUTPUT_PATH is looked up again on every call; when it is no
+        # longer set the block is silently discarded
+        my $target = $context->config->{ OUTPUT_PATH };
+        return '' unless $target;
+        $target .= "/$file";
+
+        my $error = Template::_output($target, \$text, $options);
+        die Template::Exception->new('redirect', $error)
+            if $error;
+
+        # the text went to the file, so nothing is left for the template
+        return '';
+    };
+}
+
+
+#------------------------------------------------------------------------
+# stdout_filter_factory($context, $binmode)    [% FILTER stdout(binmode) %]
+#
+# Create a filter to print a block to stdout, with an optional binmode.
+#------------------------------------------------------------------------
+
+sub stdout_filter_factory {
+    my ($context, $options) = @_;
+
+    # a non-reference argument is shorthand for the binmode option
+    ref $options
+        or $options = { binmode => $options };
+
+    return sub {
+        my ($text) = @_;
+        # the option is only tested for truth; no layer is passed on
+        binmode(STDOUT) if $options->{ binmode };
+        print STDOUT $text;
+        # the text went to STDOUT, so nothing is left for the template
+        return '';
+    };
+}
+
+
+1;
+
+__END__
+
+=head1 NAME
+
+Template::Filters - Post-processing filters for template blocks
+
+=head1 SYNOPSIS
+
+    use Template::Filters;
+    
+    $filters = Template::Filters->new(\%config);
+    
+    ($filter, $error) = $filters->fetch($name, \@args, $context);
+    
+    if ($filter) {
+        print &$filter("some text");
+    }
+    else {
+        print "Could not fetch $name filter: $error\n";
+    }
+
+=head1 DESCRIPTION
+
+The C<Template::Filters> module implements a provider for creating subroutines
+that implement the standard filters. Additional custom filters may be provided
+via the L<FILTERS> configuration option.
+
+=head1 METHODS
+
+=head2 new(\%params) 
+
+Constructor method which instantiates and returns a reference to a
+C<Template::Filters> object.  A reference to a hash array of configuration
+items may be passed as a parameter.  These are described below.  
+
+    my $filters = Template::Filters->new({
+        FILTERS => { ... },
+    });
+    
+    my $template = Template->new({
+        LOAD_FILTERS => [ $filters ],
+    });
+
+A default C<Template::Filters> module is created by the L<Template> module
+if the L<LOAD_FILTERS> option isn't specified.  All configuration parameters
+are forwarded to the constructor.
+
+    $template = Template->new({
+        FILTERS => { ... },
+    });
+
+=head2 fetch($name, \@args, $context)
+
+Called to request that a filter of a given name be provided.  The name
+of the filter should be specified as the first parameter.  This should
+be one of the standard filters or one specified in the L<FILTERS>
+configuration hash.  The second argument should be a reference to an
+array containing configuration parameters for the filter.  This may be
+specified as 0, or undef where no parameters are provided.  The third
+argument should be a reference to the current L<Template::Context>
+object.
+
+The method returns a reference to a filter sub-routine on success.  It
+may also return C<(undef, STATUS_DECLINED)> to decline the request, to allow
+delegation onto other filter providers in the L<LOAD_FILTERS> chain of 
+responsibility.  On error, C<($error, STATUS_ERROR)> is returned where $error
+is an error message or L<Template::Exception> object indicating the error
+that occurred. 
+
+When the C<TOLERANT> option is set, errors are automatically downgraded to
+a C<STATUS_DECLINED> response.
+
+=head2 use_html_entities()
+
+This class method can be called to configure the C<html_entity> filter to use
+the L<HTML::Entities> module. An error will be raised if it is not installed
+on your system.
+
+    use Template::Filters;
+    Template::Filters->use_html_entities();
+
+=head2 use_apache_util()
+
+This class method can be called to configure the C<html_entity> filter to use
+the L<Apache::Util> module. An error will be raised if it is not installed on
+your system.
+
+    use Template::Filters;
+    Template::Filters->use_apache_util();
+
+=head1 CONFIGURATION OPTIONS
+
+The following list summarises the configuration options that can be provided
+to the C<Template::Filters> L<new()> constructor. Please see
+L<Template::Manual::Config> for further information about each option.
+
+=head2 FILTERS
+
+The L<FILTERS|Template::Manual::Config#FILTERS> option can be used to specify
+custom filters which can then be used with the
+L<FILTER|Template::Manual::Directives#FILTER> directive like any other. These
+are added to the standard filters which are available by default.
+
+    $filters = Template::Filters->new({
+        FILTERS => {
+            'sfilt1' =>   \&static_filter,
+            'dfilt1' => [ \&dynamic_filter_factory, 1 ],
+        },
+    });
+
+=head2 TOLERANT
+
+The L<TOLERANT|Template::Manual::Config#TOLERANT> flag can be set to indicate
+that the C<Template::Filters> module should ignore any errors and instead
+return C<STATUS_DECLINED>.
+
+=head2 DEBUG
+
+The L<DEBUG|Template::Manual::Config#DEBUG> option can be used to enable
+debugging messages for the Template::Filters module by setting it to include
+the C<DEBUG_FILTERS> value.
+
+    use Template::Constants qw( :debug );
+    
+    my $template = Template->new({
+        DEBUG => DEBUG_FILTERS | DEBUG_PLUGINS,
+    });
+
+=head1 STANDARD FILTERS
+
+Please see L<Template::Manual::Filters> for a list of the filters provided
+with the Template Toolkit, complete with examples of use.
+
+=head1 AUTHOR
+
+Andy Wardley E<lt>abw at wardley.orgE<gt> L<http://wardley.org/>
+
+=head1 COPYRIGHT
+
+Copyright (C) 1996-2007 Andy Wardley.  All Rights Reserved.
+
+This module is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=head1 SEE ALSO
+
+L<Template::Manual::Filters>, L<Template>, L<Template::Context>
+
+=cut
+
+# Local Variables:
+# mode: perl
+# perl-indent-level: 4
+# indent-tabs-mode: nil
+# End:
+#
+# vim: expandtab shiftwidth=4:
diff --git a/bench/perl/Template/Grammar.pm b/bench/perl/Template/Grammar.pm
new file mode 100644
index 0000000..2ab287e
--- /dev/null
+++ b/bench/perl/Template/Grammar.pm
@@ -0,0 +1,6252 @@
+#============================================================= -*-Perl-*-
+#
+# Template::Grammar
+#
+# DESCRIPTION
+#   Grammar file for the Template Toolkit language containing token
+#   definitions and parser state/rules tables generated by Parse::Yapp.
+#
+# AUTHOR
+#   Andy Wardley   <abw at wardley.org>
+#
+# COPYRIGHT
+#   Copyright (C) 1996-2006 Andy Wardley.  All Rights Reserved.
+#   Copyright (C) 1998-2000 Canon Research Centre Europe Ltd.
+#
+#   This module is free software; you can redistribute it and/or
+#   modify it under the same terms as Perl itself.
+#
+# REVISION
+#   $Id: Grammar.pm 1243 2009-07-04 15:32:19Z abw $
+#
+# IMPORTANT NOTE
+#   This module is constructed from the parser/Grammar.pm.skel file by
+#   running the parser/yc script.  You only need to do this if you
+#   have modified the grammar in the parser/Parser.yp file and need
+#   to recompile it.  See the README in the 'parser' directory for
+#   more information (sub-directory of the Template distribution).
+#
+#========================================================================
+
+package Template::Grammar;
+
+use strict;
+use warnings;
+
+our $VERSION  = 2.25;
+
+my (@RESERVED, %CMPOP, $LEXTABLE, $RULES, $STATES);
+my ($factory, $rawstart);
+
+
+#========================================================================
+
+# Reserved words, comparison and binary operators
+#========================================================================
+
+# Reserved words of the template language.  Each of these is entered into
+# the lexer table further down as a token that maps onto itself.
+@RESERVED = qw( 
+	GET CALL SET DEFAULT INSERT INCLUDE PROCESS WRAPPER BLOCK END
+	USE PLUGIN FILTER MACRO PERL RAWPERL TO STEP AND OR NOT DIV MOD
+	IF UNLESS ELSE ELSIF FOR NEXT WHILE SWITCH CASE META IN
+	TRY THROW CATCH FINAL LAST RETURN STOP CLEAR VIEW DEBUG
+    );
+
+# for historical reasons, != and == are converted to ne and eq to perform 
+# stringwise comparison (mainly because it doesn't generate "non-numerical 
+# comparison" warnings which != and == can) but the others (e.g. < > <= >=)
+# are not converted to their stringwise equivalents.  I added 'gt' et al, 
+# briefly for v2.04d and then took them out again in 2.04e.
+
+
+# comparison operator as written in a template => Perl operator to use
+%CMPOP = (
+    '!=' => 'ne',
+    '==' => 'eq',
+    '<'  => '<',
+    '>'  => '>',
+    '>=' => '>=',
+    '<=' => '<=',
+);
+
+# To enable the eq, lt and gt operators, add these pairs to the list above:
+#    'eq' => 'eq',
+#    'lt' => 'lt',
+#    'gt' => 'gt',
+
+#========================================================================
+# Lexer Token Table
+#========================================================================
+
+# lookup table used by lexer is initialised with special-cases
+# Each key is a lexeme and each value is the token name it is reported
+# as; several spellings may share one token (e.g. '=' and '=>' are both
+# ASSIGN, 'FOREACH' is FOR).  Reserved words and the remaining operators
+# are added to this table by the block that follows it.
+$LEXTABLE = {
+    'FOREACH' => 'FOR',
+    'BREAK'   => 'LAST',
+    '&&'      => 'AND',
+    '||'      => 'OR',
+    '!'       => 'NOT',
+    '|'	      => 'FILTER',
+    '.'       => 'DOT',
+    '_'       => 'CAT',
+    '..'      => 'TO',
+#    ':'       => 'MACRO',
+    '='       => 'ASSIGN',
+    '=>'      => 'ASSIGN',
+#    '->'      => 'ARROW',
+    ','       => 'COMMA',
+    '\\'      => 'REF',
+    'and'     => 'AND',		# explicitly specified so that qw( and or
+    'or'      => 'OR',		# not ) can always be used in lower case, 
+    'not'     => 'NOT',		# regardless of ANYCASE flag
+    'mod'     => 'MOD',
+    'div'     => 'DIV',
+};
+
+# localise the temporary variables needed to complete lexer table
+{
+    # punctuation that is a token in its own right; note that '+' and '/'
+    # live here rather than among the generic binary operators below
+    my @punctuation = qw< ( ) [ ] { } ${ $ + / ; : ? >;
+    my @comparisons = keys %CMPOP;
+    my @arithmetic  = qw( - * % );
+
+    # reserved words and punctuation map onto themselves, while every
+    # comparison / arithmetic operator collapses into one generic token
+    $LEXTABLE->{ $_ } = $_      foreach @RESERVED;
+    $LEXTABLE->{ $_ } = 'CMPOP' foreach @comparisons;
+    $LEXTABLE->{ $_ } = 'BINOP' foreach @arithmetic;
+    $LEXTABLE->{ $_ } = $_      foreach @punctuation;
+}
+
+
+#========================================================================
+# CLASS METHODS
+#========================================================================
+
+sub new {
+    my ($class) = @_;
+
+    # every grammar object refers to the same file-level tables
+    my %tables = (
+        LEXTABLE => $LEXTABLE,
+        STATES   => $STATES,
+        RULES    => $RULES,
+    );
+    return bless \%tables, $class;
+}
+
+# update method to set package-scoped $factory lexical 
+sub install_factory {
+    my $self = shift;
+    # store the new factory and hand it back to the caller
+    return $factory = shift;
+}
+
+
+#========================================================================
+# States
+#========================================================================
+
+$STATES = [
+	{#State 0
+		ACTIONS => {
+			'SET' => 1,
+			'PERL' => 40,
+			'NOT' => 38,
+			'IDENT' => 2,
+			'CLEAR' => 41,
+			'UNLESS' => 3,
+			'IF' => 44,
+			"\$" => 43,
+			'STOP' => 6,
+			'CALL' => 45,
+			'THROW' => 8,
+			'GET' => 47,
+			"[" => 9,
+			'TRY' => 10,
+			'LAST' => 49,
+			'DEBUG' => 51,
+			'RAWPERL' => 13,
+			'META' => 15,
+			'INCLUDE' => 17,
+			"(" => 53,
+			'SWITCH' => 54,
+			'MACRO' => 18,
+			'WRAPPER' => 55,
+			";" => -18,
+			'FOR' => 21,
+			'LITERAL' => 57,
+			'NEXT' => 22,
+			"\"" => 60,
+			'TEXT' => 24,
+			'PROCESS' => 61,
+			'RETURN' => 64,
+			'FILTER' => 25,
+			'INSERT' => 65,
+			'NUMBER' => 26,
+			'REF' => 27,
+			'WHILE' => 67,
+			'BLOCK' => 28,
+			'DEFAULT' => 69,
+			"{" => 30,
+			'USE' => 32,
+			'VIEW' => 36,
+			"\${" => 37
+		},
+		DEFAULT => -3,
+		GOTOS => {
+			'item' => 39,
+			'loop' => 4,
+			'capture' => 42,
+			'statement' => 5,
+			'view' => 7,
+			'wrapper' => 46,
+			'atomexpr' => 48,
+			'chunk' => 11,
+			'atomdir' => 12,
+			'anonblock' => 50,
+			'template' => 52,
+			'defblockname' => 14,
+			'ident' => 16,
+			'assign' => 19,
+			'macro' => 20,
+			'lterm' => 56,
+			'node' => 23,
+			'term' => 58,
+			'rawperl' => 59,
+			'expr' => 62,
+			'use' => 63,
+			'defblock' => 66,
+			'filter' => 29,
+			'sterm' => 68,
+			'perl' => 31,
+			'chunks' => 33,
+			'setlist' => 70,
+			'try' => 35,
+			'switch' => 34,
+			'directive' => 71,
+			'block' => 72,
+			'condition' => 73
+		}
+	},
+	{#State 1
+		ACTIONS => {
+			"\$" => 43,
+			'LITERAL' => 75,
+			'IDENT' => 2,
+			"\${" => 37
+		},
+		GOTOS => {
+			'setlist' => 76,
+			'item' => 39,
+			'assign' => 19,
+			'node' => 23,
+			'ident' => 74
+		}
+	},
+	{#State 2
+		DEFAULT => -130
+	},
+	{#State 3
+		ACTIONS => {
+			'NOT' => 38,
+			"{" => 30,
+			'LITERAL' => 78,
+			'IDENT' => 2,
+			"\"" => 60,
+			"(" => 53,
+			"\$" => 43,
+			"[" => 9,
+			'NUMBER' => 26,
+			'REF' => 27,
+			"\${" => 37
+		},
+		GOTOS => {
+			'expr' => 79,
+			'sterm' => 68,
+			'item' => 39,
+			'node' => 23,
+			'ident' => 77,
+			'term' => 58,
+			'lterm' => 56
+		}
+	},
+	{#State 4
+		DEFAULT => -23
+	},
+	{#State 5
+		ACTIONS => {
+			";" => 80
+		}
+	},
+	{#State 6
+		DEFAULT => -37
+	},
+	{#State 7
+		DEFAULT => -14
+	},
+	{#State 8
+		ACTIONS => {
+			"\"" => 89,
+			"\$" => 86,
+			'LITERAL' => 88,
+			'FILENAME' => 83,
+			'IDENT' => 81,
+			'NUMBER' => 84
+		},
+		GOTOS => {
+			'filepart' => 87,
+			'names' => 91,
+			'nameargs' => 90,
+			'filename' => 85,
+			'name' => 82
+		}
+	},
+	{#State 9
+		ACTIONS => {
+			"{" => 30,
+			'LITERAL' => 78,
+			'IDENT' => 2,
+			"\"" => 60,
+			"\$" => 43,
+			"[" => 9,
+			'NUMBER' => 26,
+			'REF' => 27,
+			"]" => 94,
+			"\${" => 37
+		},
+		GOTOS => {
+			'sterm' => 96,
+			'item' => 39,
+			'range' => 93,
+			'node' => 23,
+			'ident' => 77,
+			'term' => 95,
+			'lterm' => 56,
+			'list' => 92
+		}
+	},
+	{#State 10
+		ACTIONS => {
+			";" => 97
+		}
+	},
+	{#State 11
+		DEFAULT => -5
+	},
+	{#State 12
+		ACTIONS => {
+			";" => -20
+		},
+		DEFAULT => -27
+	},
+	{#State 13
+		DEFAULT => -78,
+		GOTOS => {
+			'@5-1' => 98
+		}
+	},
+	{#State 14
+		ACTIONS => {
+			'IDENT' => 99
+		},
+		DEFAULT => -87,
+		GOTOS => {
+			'blockargs' => 102,
+			'metadata' => 101,
+			'meta' => 100
+		}
+	},
+	{#State 15
+		ACTIONS => {
+			'IDENT' => 99
+		},
+		GOTOS => {
+			'metadata' => 103,
+			'meta' => 100
+		}
+	},
+	{#State 16
+		ACTIONS => {
+			'DOT' => 104,
+			'ASSIGN' => 105
+		},
+		DEFAULT => -109
+	},
+	{#State 17
+		ACTIONS => {
+			"\"" => 89,
+			"\$" => 86,
+			'LITERAL' => 88,
+			'FILENAME' => 83,
+			'IDENT' => 81,
+			'NUMBER' => 84
+		},
+		GOTOS => {
+			'filepart' => 87,
+			'names' => 91,
+			'nameargs' => 106,
+			'filename' => 85,
+			'name' => 82
+		}
+	},
+	{#State 18
+		ACTIONS => {
+			'IDENT' => 107
+		}
+	},
+	{#State 19
+		DEFAULT => -149
+	},
+	{#State 20
+		DEFAULT => -12
+	},
+	{#State 21
+		ACTIONS => {
+			"{" => 30,
+			'LITERAL' => 78,
+			'IDENT' => 108,
+			"\"" => 60,
+			"\$" => 43,
+			"[" => 9,
+			'NUMBER' => 26,
+			'REF' => 27,
+			"\${" => 37
+		},
+		GOTOS => {
+			'sterm' => 68,
+			'item' => 39,
+			'loopvar' => 110,
+			'node' => 23,
+			'ident' => 77,
+			'term' => 109,
+			'lterm' => 56
+		}
+	},
+	{#State 22
+		DEFAULT => -40
+	},
+	{#State 23
+		DEFAULT => -127
+	},
+	{#State 24
+		DEFAULT => -6
+	},
+	{#State 25
+		ACTIONS => {
+			"\"" => 117,
+			"\$" => 114,
+			'LITERAL' => 116,
+			'FILENAME' => 83,
+			'IDENT' => 111,
+			'NUMBER' => 84,
+			"\${" => 37
+		},
+		GOTOS => {
+			'names' => 91,
+			'lvalue' => 112,
+			'item' => 113,
+			'name' => 82,
+			'filepart' => 87,
+			'filename' => 85,
+			'nameargs' => 118,
+			'lnameargs' => 115
+		}
+	},
+	{#State 26
+		DEFAULT => -113
+	},
+	{#State 27
+		ACTIONS => {
+			"\$" => 43,
+			'IDENT' => 2,
+			"\${" => 37
+		},
+		GOTOS => {
+			'item' => 39,
+			'node' => 23,
+			'ident' => 119
+		}
+	},
+	{#State 28
+		ACTIONS => {
+			'LITERAL' => 124,
+			'FILENAME' => 83,
+			'IDENT' => 120,
+			'NUMBER' => 84
+		},
+		DEFAULT => -87,
+		GOTOS => {
+			'blockargs' => 123,
+			'filepart' => 87,
+			'filename' => 122,
+			'blockname' => 121,
+			'metadata' => 101,
+			'meta' => 100
+		}
+	},
+	{#State 29
+		DEFAULT => -43
+	},
+	{#State 30
+		ACTIONS => {
+			"\$" => 43,
+			'LITERAL' => 129,
+			'IDENT' => 2,
+			"\${" => 37
+		},
+		DEFAULT => -119,
+		GOTOS => {
+			'params' => 128,
+			'hash' => 125,
+			'item' => 126,
+			'param' => 127
+		}
+	},
+	{#State 31
+		DEFAULT => -25
+	},
+	{#State 32
+		ACTIONS => {
+			"\"" => 117,
+			"\$" => 114,
+			'LITERAL' => 116,
+			'FILENAME' => 83,
+			'IDENT' => 111,
+			'NUMBER' => 84,
+			"\${" => 37
+		},
+		GOTOS => {
+			'names' => 91,
+			'lvalue' => 112,
+			'item' => 113,
+			'name' => 82,
+			'filepart' => 87,
+			'filename' => 85,
+			'nameargs' => 118,
+			'lnameargs' => 130
+		}
+	},
+	{#State 33
+		ACTIONS => {
+			'SET' => 1,
+			'PERL' => 40,
+			'NOT' => 38,
+			'IDENT' => 2,
+			'CLEAR' => 41,
+			'UNLESS' => 3,
+			'IF' => 44,
+			"\$" => 43,
+			'STOP' => 6,
+			'CALL' => 45,
+			'THROW' => 8,
+			'GET' => 47,
+			"[" => 9,
+			'TRY' => 10,
+			'LAST' => 49,
+			'DEBUG' => 51,
+			'RAWPERL' => 13,
+			'META' => 15,
+			'INCLUDE' => 17,
+			"(" => 53,
+			'SWITCH' => 54,
+			'MACRO' => 18,
+			'WRAPPER' => 55,
+			";" => -18,
+			'FOR' => 21,
+			'LITERAL' => 57,
+			'NEXT' => 22,
+			"\"" => 60,
+			'TEXT' => 24,
+			'PROCESS' => 61,
+			'RETURN' => 64,
+			'FILTER' => 25,
+			'INSERT' => 65,
+			'NUMBER' => 26,
+			'REF' => 27,
+			'WHILE' => 67,
+			'BLOCK' => 28,
+			'DEFAULT' => 69,
+			"{" => 30,
+			'USE' => 32,
+			'VIEW' => 36,
+			"\${" => 37
+		},
+		DEFAULT => -2,
+		GOTOS => {
+			'item' => 39,
+			'node' => 23,
+			'rawperl' => 59,
+			'term' => 58,
+			'loop' => 4,
+			'use' => 63,
+			'expr' => 62,
+			'capture' => 42,
+			'statement' => 5,
+			'view' => 7,
+			'wrapper' => 46,
+			'atomexpr' => 48,
+			'chunk' => 131,
+			'defblock' => 66,
+			'atomdir' => 12,
+			'anonblock' => 50,
+			'sterm' => 68,
+			'defblockname' => 14,
+			'filter' => 29,
+			'ident' => 16,
+			'perl' => 31,
+			'setlist' => 70,
+			'try' => 35,
+			'switch' => 34,
+			'assign' => 19,
+			'directive' => 71,
+			'macro' => 20,
+			'condition' => 73,
+			'lterm' => 56
+		}
+	},
+	{#State 34
+		DEFAULT => -22
+	},
+	{#State 35
+		DEFAULT => -24
+	},
+	{#State 36
+		ACTIONS => {
+			"\"" => 89,
+			"\$" => 86,
+			'LITERAL' => 88,
+			'FILENAME' => 83,
+			'IDENT' => 81,
+			'NUMBER' => 84
+		},
+		GOTOS => {
+			'filepart' => 87,
+			'names' => 91,
+			'nameargs' => 132,
+			'filename' => 85,
+			'name' => 82
+		}
+	},
+	{#State 37
+		ACTIONS => {
+			"\"" => 60,
+			"\$" => 43,
+			'LITERAL' => 78,
+			'IDENT' => 2,
+			'REF' => 27,
+			'NUMBER' => 26,
+			"\${" => 37
+		},
+		GOTOS => {
+			'sterm' => 133,
+			'item' => 39,
+			'node' => 23,
+			'ident' => 77
+		}
+	},
+	{#State 38
+		ACTIONS => {
+			'NOT' => 38,
+			"{" => 30,
+			'LITERAL' => 78,
+			'IDENT' => 2,
+			"\"" => 60,
+			"(" => 53,
+			"\$" => 43,
+			"[" => 9,
+			'NUMBER' => 26,
+			'REF' => 27,
+			"\${" => 37
+		},
+		GOTOS => {
+			'expr' => 134,
+			'sterm' => 68,
+			'item' => 39,
+			'node' => 23,
+			'ident' => 77,
+			'term' => 58,
+			'lterm' => 56
+		}
+	},
+	{#State 39
+		ACTIONS => {
+			"(" => 135
+		},
+		DEFAULT => -128
+	},
+	{#State 40
+		ACTIONS => {
+			";" => 136
+		}
+	},
+	{#State 41
+		DEFAULT => -38
+	},
+	{#State 42
+		DEFAULT => -11
+	},
+	{#State 43
+		ACTIONS => {
+			'IDENT' => 137
+		}
+	},
+	{#State 44
+		ACTIONS => {
+			'NOT' => 38,
+			"{" => 30,
+			'LITERAL' => 78,
+			'IDENT' => 2,
+			"\"" => 60,
+			"(" => 53,
+			"\$" => 43,
+			"[" => 9,
+			'NUMBER' => 26,
+			'REF' => 27,
+			"\${" => 37
+		},
+		GOTOS => {
+			'expr' => 138,
+			'sterm' => 68,
+			'item' => 39,
+			'node' => 23,
+			'ident' => 77,
+			'term' => 58,
+			'lterm' => 56
+		}
+	},
+	{#State 45
+		ACTIONS => {
+			'NOT' => 38,
+			"{" => 30,
+			'LITERAL' => 78,
+			'IDENT' => 2,
+			"\"" => 60,
+			"(" => 53,
+			"\$" => 43,
+			"[" => 9,
+			'NUMBER' => 26,
+			'REF' => 27,
+			"\${" => 37
+		},
+		GOTOS => {
+			'expr' => 139,
+			'sterm' => 68,
+			'item' => 39,
+			'node' => 23,
+			'ident' => 77,
+			'term' => 58,
+			'lterm' => 56
+		}
+	},
+	{#State 46
+		DEFAULT => -42
+	},
+	{#State 47
+		ACTIONS => {
+			'NOT' => 38,
+			"{" => 30,
+			'LITERAL' => 78,
+			'IDENT' => 2,
+			"\"" => 60,
+			"(" => 53,
+			"\$" => 43,
+			"[" => 9,
+			'NUMBER' => 26,
+			'REF' => 27,
+			"\${" => 37
+		},
+		GOTOS => {
+			'expr' => 140,
+			'sterm' => 68,
+			'item' => 39,
+			'node' => 23,
+			'ident' => 77,
+			'term' => 58,
+			'lterm' => 56
+		}
+	},
+	{#State 48
+		ACTIONS => {
+			'IF' => 144,
+			'FILTER' => 143,
+			'FOR' => 142,
+			'WHILE' => 146,
+			'WRAPPER' => 145,
+			'UNLESS' => 141
+		}
+	},
+	{#State 49
+		DEFAULT => -39
+	},
+	{#State 50
+		DEFAULT => -10
+	},
+	{#State 51
+		ACTIONS => {
+			"\"" => 89,
+			"\$" => 86,
+			'LITERAL' => 88,
+			'FILENAME' => 83,
+			'IDENT' => 81,
+			'NUMBER' => 84
+		},
+		GOTOS => {
+			'filepart' => 87,
+			'names' => 91,
+			'nameargs' => 147,
+			'filename' => 85,
+			'name' => 82
+		}
+	},
+	{#State 52
+		ACTIONS => {
+			'' => 148
+		}
+	},
+	{#State 53
+		ACTIONS => {
+			'NOT' => 38,
+			"{" => 30,
+			'LITERAL' => 57,
+			'IDENT' => 2,
+			"\"" => 60,
+			"(" => 53,
+			"\$" => 43,
+			"[" => 9,
+			'NUMBER' => 26,
+			'REF' => 27,
+			"\${" => 37
+		},
+		GOTOS => {
+			'expr' => 151,
+			'sterm' => 68,
+			'item' => 39,
+			'assign' => 150,
+			'node' => 23,
+			'ident' => 149,
+			'term' => 58,
+			'lterm' => 56
+		}
+	},
+	{#State 54
+		ACTIONS => {
+			'NOT' => 38,
+			"{" => 30,
+			'LITERAL' => 78,
+			'IDENT' => 2,
+			"\"" => 60,
+			"(" => 53,
+			"\$" => 43,
+			"[" => 9,
+			'NUMBER' => 26,
+			'REF' => 27,
+			"\${" => 37
+		},
+		GOTOS => {
+			'expr' => 152,
+			'sterm' => 68,
+			'item' => 39,
+			'node' => 23,
+			'ident' => 77,
+			'term' => 58,
+			'lterm' => 56
+		}
+	},
+	{#State 55
+		ACTIONS => {
+			"\"" => 89,
+			"\$" => 86,
+			'LITERAL' => 88,
+			'FILENAME' => 83,
+			'IDENT' => 81,
+			'NUMBER' => 84
+		},
+		GOTOS => {
+			'filepart' => 87,
+			'names' => 91,
+			'nameargs' => 153,
+			'filename' => 85,
+			'name' => 82
+		}
+	},
+	{#State 56
+		DEFAULT => -103
+	},
+	{#State 57
+		ACTIONS => {
+			'ASSIGN' => 154
+		},
+		DEFAULT => -112
+	},
+	{#State 58
+		DEFAULT => -146
+	},
+	{#State 59
+		DEFAULT => -15
+	},
+	{#State 60
+		DEFAULT => -176,
+		GOTOS => {
+			'quoted' => 155
+		}
+	},
+	{#State 61
+		ACTIONS => {
+			"\"" => 89,
+			"\$" => 86,
+			'LITERAL' => 88,
+			'FILENAME' => 83,
+			'IDENT' => 81,
+			'NUMBER' => 84
+		},
+		GOTOS => {
+			'filepart' => 87,
+			'names' => 91,
+			'nameargs' => 156,
+			'filename' => 85,
+			'name' => 82
+		}
+	},
+	{#State 62
+		ACTIONS => {
+			";" => -16,
+			"+" => 157,
+			'CAT' => 163,
+			'CMPOP' => 164,
+			"?" => 158,
+			'DIV' => 159,
+			'MOD' => 165,
+			"/" => 166,
+			'AND' => 160,
+			'BINOP' => 161,
+			'OR' => 162
+		},
+		DEFAULT => -26
+	},
+	{#State 63
+		DEFAULT => -13
+	},
+	{#State 64
+		DEFAULT => -36
+	},
+	{#State 65
+		ACTIONS => {
+			"\"" => 89,
+			"\$" => 86,
+			'LITERAL' => 88,
+			'FILENAME' => 83,
+			'IDENT' => 81,
+			'NUMBER' => 84
+		},
+		GOTOS => {
+			'filepart' => 87,
+			'names' => 91,
+			'nameargs' => 167,
+			'filename' => 85,
+			'name' => 82
+		}
+	},
+	{#State 66
+		DEFAULT => -9
+	},
+	{#State 67
+		ACTIONS => {
+			'NOT' => 38,
+			"{" => 30,
+			'LITERAL' => 78,
+			'IDENT' => 2,
+			"\"" => 60,
+			"(" => 53,
+			"\$" => 43,
+			"[" => 9,
+			'NUMBER' => 26,
+			'REF' => 27,
+			"\${" => 37
+		},
+		GOTOS => {
+			'expr' => 168,
+			'sterm' => 68,
+			'item' => 39,
+			'node' => 23,
+			'ident' => 77,
+			'term' => 58,
+			'lterm' => 56
+		}
+	},
+	{#State 68
+		DEFAULT => -104
+	},
+	{#State 69
+		ACTIONS => {
+			"\$" => 43,
+			'LITERAL' => 75,
+			'IDENT' => 2,
+			"\${" => 37
+		},
+		GOTOS => {
+			'setlist' => 169,
+			'item' => 39,
+			'assign' => 19,
+			'node' => 23,
+			'ident' => 74
+		}
+	},
+	{#State 70
+		ACTIONS => {
+			"\$" => 43,
+			'COMMA' => 171,
+			'LITERAL' => 75,
+			'IDENT' => 2,
+			"\${" => 37
+		},
+		DEFAULT => -19,
+		GOTOS => {
+			'item' => 39,
+			'assign' => 170,
+			'node' => 23,
+			'ident' => 74
+		}
+	},
+	{#State 71
+		DEFAULT => -8
+	},
+	{#State 72
+		DEFAULT => -1
+	},
+	{#State 73
+		DEFAULT => -21
+	},
+	{#State 74
+		ACTIONS => {
+			'ASSIGN' => 172,
+			'DOT' => 104
+		}
+	},
+	{#State 75
+		ACTIONS => {
+			'ASSIGN' => 154
+		}
+	},
+	{#State 76
+		ACTIONS => {
+			'COMMA' => 171,
+			'LITERAL' => 75,
+			'IDENT' => 2,
+			"\$" => 43,
+			"\${" => 37
+		},
+		DEFAULT => -30,
+		GOTOS => {
+			'item' => 39,
+			'assign' => 170,
+			'node' => 23,
+			'ident' => 74
+		}
+	},
+	{#State 77
+		ACTIONS => {
+			'DOT' => 104
+		},
+		DEFAULT => -109
+	},
+	{#State 78
+		DEFAULT => -112
+	},
+	{#State 79
+		ACTIONS => {
+			'CMPOP' => 164,
+			"?" => 158,
+			";" => 173,
+			"+" => 157,
+			'MOD' => 165,
+			'DIV' => 159,
+			"/" => 166,
+			'AND' => 160,
+			'CAT' => 163,
+			'BINOP' => 161,
+			'OR' => 162
+		}
+	},
+	{#State 80
+		DEFAULT => -7
+	},
+	{#State 81
+		DEFAULT => -173
+	},
+	{#State 82
+		DEFAULT => -166
+	},
+	{#State 83
+		DEFAULT => -172
+	},
+	{#State 84
+		DEFAULT => -174
+	},
+	{#State 85
+		ACTIONS => {
+			'DOT' => 174
+		},
+		DEFAULT => -168
+	},
+	{#State 86
+		ACTIONS => {
+			"\$" => 43,
+			'IDENT' => 2,
+			"\${" => 37
+		},
+		GOTOS => {
+			'item' => 39,
+			'node' => 23,
+			'ident' => 175
+		}
+	},
+	{#State 87
+		DEFAULT => -171
+	},
+	{#State 88
+		DEFAULT => -169
+	},
+	{#State 89
+		DEFAULT => -176,
+		GOTOS => {
+			'quoted' => 176
+		}
+	},
+	{#State 90
+		DEFAULT => -35
+	},
+	{#State 91
+		ACTIONS => {
+			"+" => 177,
+			"(" => 178
+		},
+		DEFAULT => -156,
+		GOTOS => {
+			'args' => 179
+		}
+	},
+	{#State 92
+		ACTIONS => {
+			"{" => 30,
+			'COMMA' => 182,
+			'LITERAL' => 78,
+			'IDENT' => 2,
+			"\"" => 60,
+			"\$" => 43,
+			"[" => 9,
+			'NUMBER' => 26,
+			'REF' => 27,
+			"]" => 180,
+			"\${" => 37
+		},
+		GOTOS => {
+			'sterm' => 68,
+			'item' => 39,
+			'node' => 23,
+			'ident' => 77,
+			'term' => 181,
+			'lterm' => 56
+		}
+	},
+	{#State 93
+		ACTIONS => {
+			"]" => 183
+		}
+	},
+	{#State 94
+		DEFAULT => -107
+	},
+	{#State 95
+		DEFAULT => -116
+	},
+	{#State 96
+		ACTIONS => {
+			'TO' => 184
+		},
+		DEFAULT => -104
+	},
+	{#State 97
+		ACTIONS => {
+			'SET' => 1,
+			'PERL' => 40,
+			'NOT' => 38,
+			'IDENT' => 2,
+			'CLEAR' => 41,
+			'UNLESS' => 3,
+			'IF' => 44,
+			"\$" => 43,
+			'STOP' => 6,
+			'CALL' => 45,
+			'THROW' => 8,
+			'GET' => 47,
+			"[" => 9,
+			'TRY' => 10,
+			'LAST' => 49,
+			'DEBUG' => 51,
+			'RAWPERL' => 13,
+			'META' => 15,
+			'INCLUDE' => 17,
+			"(" => 53,
+			'SWITCH' => 54,
+			'MACRO' => 18,
+			'WRAPPER' => 55,
+			";" => -18,
+			'FOR' => 21,
+			'LITERAL' => 57,
+			'NEXT' => 22,
+			"\"" => 60,
+			'TEXT' => 24,
+			'PROCESS' => 61,
+			'RETURN' => 64,
+			'FILTER' => 25,
+			'INSERT' => 65,
+			'NUMBER' => 26,
+			'REF' => 27,
+			'WHILE' => 67,
+			'BLOCK' => 28,
+			'DEFAULT' => 69,
+			"{" => 30,
+			'USE' => 32,
+			'VIEW' => 36,
+			"\${" => 37
+		},
+		DEFAULT => -3,
+		GOTOS => {
+			'item' => 39,
+			'node' => 23,
+			'rawperl' => 59,
+			'term' => 58,
+			'loop' => 4,
+			'use' => 63,
+			'expr' => 62,
+			'capture' => 42,
+			'statement' => 5,
+			'view' => 7,
+			'wrapper' => 46,
+			'atomexpr' => 48,
+			'chunk' => 11,
+			'defblock' => 66,
+			'atomdir' => 12,
+			'anonblock' => 50,
+			'sterm' => 68,
+			'defblockname' => 14,
+			'filter' => 29,
+			'ident' => 16,
+			'perl' => 31,
+			'setlist' => 70,
+			'chunks' => 33,
+			'try' => 35,
+			'switch' => 34,
+			'assign' => 19,
+			'block' => 185,
+			'directive' => 71,
+			'macro' => 20,
+			'condition' => 73,
+			'lterm' => 56
+		}
+	},
+	{#State 98
+		ACTIONS => {
+			";" => 186
+		}
+	},
+	{#State 99
+		ACTIONS => {
+			'ASSIGN' => 187
+		}
+	},
+	{#State 100
+		DEFAULT => -99
+	},
+	{#State 101
+		ACTIONS => {
+			'COMMA' => 189,
+			'IDENT' => 99
+		},
+		DEFAULT => -86,
+		GOTOS => {
+			'meta' => 188
+		}
+	},
+	{#State 102
+		ACTIONS => {
+			";" => 190
+		}
+	},
+	{#State 103
+		ACTIONS => {
+			'COMMA' => 189,
+			'IDENT' => 99
+		},
+		DEFAULT => -17,
+		GOTOS => {
+			'meta' => 188
+		}
+	},
+	{#State 104
+		ACTIONS => {
+			"\$" => 43,
+			'IDENT' => 2,
+			'NUMBER' => 192,
+			"\${" => 37
+		},
+		GOTOS => {
+			'item' => 39,
+			'node' => 191
+		}
+	},
+	{#State 105
+		ACTIONS => {
+			'SET' => 1,
+			'PERL' => 40,
+			'NOT' => 38,
+			'IDENT' => 2,
+			'CLEAR' => 41,
+			'UNLESS' => 3,
+			'IF' => 44,
+			"\$" => 43,
+			'STOP' => 6,
+			'CALL' => 45,
+			'THROW' => 8,
+			'GET' => 47,
+			"[" => 9,
+			'TRY' => 10,
+			'LAST' => 49,
+			'DEBUG' => 51,
+			'INCLUDE' => 17,
+			"(" => 53,
+			'SWITCH' => 54,
+			'WRAPPER' => 55,
+			'FOR' => 21,
+			'NEXT' => 22,
+			'LITERAL' => 57,
+			"\"" => 60,
+			'PROCESS' => 61,
+			'FILTER' => 25,
+			'RETURN' => 64,
+			'INSERT' => 65,
+			'NUMBER' => 26,
+			'REF' => 27,
+			'WHILE' => 67,
+			'BLOCK' => 193,
+			'DEFAULT' => 69,
+			"{" => 30,
+			"\${" => 37
+		},
+		GOTOS => {
+			'item' => 39,
+			'node' => 23,
+			'term' => 58,
+			'loop' => 4,
+			'expr' => 195,
+			'wrapper' => 46,
+			'atomexpr' => 48,
+			'atomdir' => 12,
+			'mdir' => 194,
+			'filter' => 29,
+			'sterm' => 68,
+			'ident' => 149,
+			'perl' => 31,
+			'setlist' => 70,
+			'try' => 35,
+			'switch' => 34,
+			'assign' => 19,
+			'directive' => 196,
+			'condition' => 73,
+			'lterm' => 56
+		}
+	},
+	{#State 106
+		DEFAULT => -33
+	},
+	{#State 107
+		ACTIONS => {
+			'SET' => 1,
+			'PERL' => 40,
+			'NOT' => 38,
+			'IDENT' => 2,
+			'CLEAR' => 41,
+			'UNLESS' => 3,
+			'IF' => 44,
+			"\$" => 43,
+			'STOP' => 6,
+			'CALL' => 45,
+			'THROW' => 8,
+			'GET' => 47,
+			"[" => 9,
+			'TRY' => 10,
+			'LAST' => 49,
+			'DEBUG' => 51,
+			'INCLUDE' => 17,
+			"(" => 198,
+			'SWITCH' => 54,
+			'WRAPPER' => 55,
+			'FOR' => 21,
+			'NEXT' => 22,
+			'LITERAL' => 57,
+			"\"" => 60,
+			'PROCESS' => 61,
+			'FILTER' => 25,
+			'RETURN' => 64,
+			'INSERT' => 65,
+			'NUMBER' => 26,
+			'REF' => 27,
+			'WHILE' => 67,
+			'BLOCK' => 193,
+			'DEFAULT' => 69,
+			"{" => 30,
+			"\${" => 37
+		},
+		GOTOS => {
+			'item' => 39,
+			'node' => 23,
+			'term' => 58,
+			'loop' => 4,
+			'expr' => 199,
+			'wrapper' => 46,
+			'atomexpr' => 48,
+			'atomdir' => 12,
+			'mdir' => 197,
+			'filter' => 29,
+			'sterm' => 68,
+			'ident' => 149,
+			'perl' => 31,
+			'setlist' => 70,
+			'try' => 35,
+			'switch' => 34,
+			'assign' => 19,
+			'directive' => 196,
+			'condition' => 73,
+			'lterm' => 56
+		}
+	},
+	{#State 108
+		ACTIONS => {
+			'IN' => 201,
+			'ASSIGN' => 200
+		},
+		DEFAULT => -130
+	},
+	{#State 109
+		DEFAULT => -156,
+		GOTOS => {
+			'args' => 202
+		}
+	},
+	{#State 110
+		ACTIONS => {
+			";" => 203
+		}
+	},
+	{#State 111
+		ACTIONS => {
+			'ASSIGN' => -130
+		},
+		DEFAULT => -173
+	},
+	{#State 112
+		ACTIONS => {
+			'ASSIGN' => 204
+		}
+	},
+	{#State 113
+		DEFAULT => -159
+	},
+	{#State 114
+		ACTIONS => {
+			"\$" => 43,
+			'IDENT' => 205,
+			"\${" => 37
+		},
+		GOTOS => {
+			'item' => 39,
+			'node' => 23,
+			'ident' => 175
+		}
+	},
+	{#State 115
+		ACTIONS => {
+			";" => 206
+		}
+	},
+	{#State 116
+		ACTIONS => {
+			'ASSIGN' => -161
+		},
+		DEFAULT => -169
+	},
+	{#State 117
+		DEFAULT => -176,
+		GOTOS => {
+			'quoted' => 207
+		}
+	},
+	{#State 118
+		DEFAULT => -158
+	},
+	{#State 119
+		ACTIONS => {
+			'DOT' => 104
+		},
+		DEFAULT => -110
+	},
+	{#State 120
+		ACTIONS => {
+			'ASSIGN' => 187
+		},
+		DEFAULT => -173
+	},
+	{#State 121
+		DEFAULT => -83
+	},
+	{#State 122
+		ACTIONS => {
+			'DOT' => 174
+		},
+		DEFAULT => -84
+	},
+	{#State 123
+		ACTIONS => {
+			";" => 208
+		}
+	},
+	{#State 124
+		DEFAULT => -85
+	},
+	{#State 125
+		ACTIONS => {
+			"}" => 209
+		}
+	},
+	{#State 126
+		ACTIONS => {
+			'ASSIGN' => 210
+		}
+	},
+	{#State 127
+		DEFAULT => -122
+	},
+	{#State 128
+		ACTIONS => {
+			"\$" => 43,
+			'COMMA' => 212,
+			'LITERAL' => 129,
+			'IDENT' => 2,
+			"\${" => 37
+		},
+		DEFAULT => -118,
+		GOTOS => {
+			'item' => 126,
+			'param' => 211
+		}
+	},
+	{#State 129
+		ACTIONS => {
+			'ASSIGN' => 213
+		}
+	},
+	{#State 130
+		DEFAULT => -73
+	},
+	{#State 131
+		DEFAULT => -4
+	},
+	{#State 132
+		ACTIONS => {
+			";" => 214
+		}
+	},
+	{#State 133
+		ACTIONS => {
+			"}" => 215
+		}
+	},
+	{#State 134
+		ACTIONS => {
+			'DIV' => 159,
+			'BINOP' => 161,
+			"+" => 157,
+			'CAT' => 163,
+			'CMPOP' => 164,
+			'MOD' => 165,
+			"/" => 166
+		},
+		DEFAULT => -142
+	},
+	{#State 135
+		DEFAULT => -156,
+		GOTOS => {
+			'args' => 216
+		}
+	},
+	{#State 136
+		DEFAULT => -76,
+		GOTOS => {
+			'@4-2' => 217
+		}
+	},
+	{#State 137
+		DEFAULT => -132
+	},
+	{#State 138
+		ACTIONS => {
+			'CMPOP' => 164,
+			"?" => 158,
+			";" => 218,
+			"+" => 157,
+			'MOD' => 165,
+			'DIV' => 159,
+			"/" => 166,
+			'AND' => 160,
+			'CAT' => 163,
+			'BINOP' => 161,
+			'OR' => 162
+		}
+	},
+	{#State 139
+		ACTIONS => {
+			"+" => 157,
+			'CAT' => 163,
+			'CMPOP' => 164,
+			"?" => 158,
+			'DIV' => 159,
+			'MOD' => 165,
+			"/" => 166,
+			'AND' => 160,
+			'BINOP' => 161,
+			'OR' => 162
+		},
+		DEFAULT => -29
+	},
+	{#State 140
+		ACTIONS => {
+			"+" => 157,
+			'CAT' => 163,
+			'CMPOP' => 164,
+			"?" => 158,
+			'DIV' => 159,
+			'MOD' => 165,
+			"/" => 166,
+			'AND' => 160,
+			'BINOP' => 161,
+			'OR' => 162
+		},
+		DEFAULT => -28
+	},
+	{#State 141
+		ACTIONS => {
+			'NOT' => 38,
+			"{" => 30,
+			'LITERAL' => 78,
+			'IDENT' => 2,
+			"\"" => 60,
+			"(" => 53,
+			"\$" => 43,
+			"[" => 9,
+			'NUMBER' => 26,
+			'REF' => 27,
+			"\${" => 37
+		},
+		GOTOS => {
+			'expr' => 219,
+			'sterm' => 68,
+			'item' => 39,
+			'node' => 23,
+			'ident' => 77,
+			'term' => 58,
+			'lterm' => 56
+		}
+	},
+	{#State 142
+		ACTIONS => {
+			"{" => 30,
+			'LITERAL' => 78,
+			'IDENT' => 108,
+			"\"" => 60,
+			"\$" => 43,
+			"[" => 9,
+			'NUMBER' => 26,
+			'REF' => 27,
+			"\${" => 37
+		},
+		GOTOS => {
+			'sterm' => 68,
+			'item' => 39,
+			'loopvar' => 220,
+			'node' => 23,
+			'ident' => 77,
+			'term' => 109,
+			'lterm' => 56
+		}
+	},
+	{#State 143
+		ACTIONS => {
+			"\"" => 117,
+			"\$" => 114,
+			'LITERAL' => 116,
+			'FILENAME' => 83,
+			'IDENT' => 111,
+			'NUMBER' => 84,
+			"\${" => 37
+		},
+		GOTOS => {
+			'names' => 91,
+			'lvalue' => 112,
+			'item' => 113,
+			'name' => 82,
+			'filepart' => 87,
+			'filename' => 85,
+			'nameargs' => 118,
+			'lnameargs' => 221
+		}
+	},
+	{#State 144
+		ACTIONS => {
+			'NOT' => 38,
+			"{" => 30,
+			'LITERAL' => 78,
+			'IDENT' => 2,
+			"\"" => 60,
+			"(" => 53,
+			"\$" => 43,
+			"[" => 9,
+			'NUMBER' => 26,
+			'REF' => 27,
+			"\${" => 37
+		},
+		GOTOS => {
+			'expr' => 222,
+			'sterm' => 68,
+			'item' => 39,
+			'node' => 23,
+			'ident' => 77,
+			'term' => 58,
+			'lterm' => 56
+		}
+	},
+	{#State 145
+		ACTIONS => {
+			"\"" => 89,
+			"\$" => 86,
+			'LITERAL' => 88,
+			'FILENAME' => 83,
+			'IDENT' => 81,
+			'NUMBER' => 84
+		},
+		GOTOS => {
+			'filepart' => 87,
+			'names' => 91,
+			'nameargs' => 223,
+			'filename' => 85,
+			'name' => 82
+		}
+	},
+	{#State 146
+		ACTIONS => {
+			'NOT' => 38,
+			"{" => 30,
+			'LITERAL' => 78,
+			'IDENT' => 2,
+			"\"" => 60,
+			"(" => 53,
+			"\$" => 43,
+			"[" => 9,
+			'NUMBER' => 26,
+			'REF' => 27,
+			"\${" => 37
+		},
+		GOTOS => {
+			'expr' => 224,
+			'sterm' => 68,
+			'item' => 39,
+			'node' => 23,
+			'ident' => 77,
+			'term' => 58,
+			'lterm' => 56
+		}
+	},
+	{#State 147
+		DEFAULT => -41
+	},
+	{#State 148
+		DEFAULT => 0
+	},
+	{#State 149
+		ACTIONS => {
+			'DOT' => 104,
+			'ASSIGN' => 172
+		},
+		DEFAULT => -109
+	},
+	{#State 150
+		ACTIONS => {
+			")" => 225
+		}
+	},
+	{#State 151
+		ACTIONS => {
+			'CMPOP' => 164,
+			"?" => 158,
+			"+" => 157,
+			'MOD' => 165,
+			'DIV' => 159,
+			"/" => 166,
+			'AND' => 160,
+			'CAT' => 163,
+			'BINOP' => 161,
+			")" => 226,
+			'OR' => 162
+		}
+	},
+	{#State 152
+		ACTIONS => {
+			'CMPOP' => 164,
+			"?" => 158,
+			";" => 227,
+			"+" => 157,
+			'MOD' => 165,
+			'DIV' => 159,
+			"/" => 166,
+			'AND' => 160,
+			'CAT' => 163,
+			'BINOP' => 161,
+			'OR' => 162
+		}
+	},
+	{#State 153
+		ACTIONS => {
+			";" => 228
+		}
+	},
+	{#State 154
+		ACTIONS => {
+			'NOT' => 38,
+			"{" => 30,
+			'LITERAL' => 78,
+			'IDENT' => 2,
+			"\"" => 60,
+			"(" => 53,
+			"\$" => 43,
+			"[" => 9,
+			'NUMBER' => 26,
+			'REF' => 27,
+			"\${" => 37
+		},
+		GOTOS => {
+			'expr' => 229,
+			'sterm' => 68,
+			'item' => 39,
+			'node' => 23,
+			'ident' => 77,
+			'term' => 58,
+			'lterm' => 56
+		}
+	},
+	{#State 155
+		ACTIONS => {
+			"\"" => 234,
+			'TEXT' => 231,
+			";" => 233,
+			"\$" => 43,
+			'IDENT' => 2,
+			"\${" => 37
+		},
+		GOTOS => {
+			'item' => 39,
+			'node' => 23,
+			'ident' => 230,
+			'quotable' => 232
+		}
+	},
+	{#State 156
+		DEFAULT => -34
+	},
+	{#State 157
+		ACTIONS => {
+			'NOT' => 38,
+			"{" => 30,
+			'LITERAL' => 78,
+			'IDENT' => 2,
+			"\"" => 60,
+			"(" => 53,
+			"\$" => 43,
+			"[" => 9,
+			'NUMBER' => 26,
+			'REF' => 27,
+			"\${" => 37
+		},
+		GOTOS => {
+			'expr' => 235,
+			'sterm' => 68,
+			'item' => 39,
+			'node' => 23,
+			'ident' => 77,
+			'term' => 58,
+			'lterm' => 56
+		}
+	},
+	{#State 158
+		ACTIONS => {
+			'NOT' => 38,
+			"{" => 30,
+			'LITERAL' => 78,
+			'IDENT' => 2,
+			"\"" => 60,
+			"(" => 53,
+			"\$" => 43,
+			"[" => 9,
+			'NUMBER' => 26,
+			'REF' => 27,
+			"\${" => 37
+		},
+		GOTOS => {
+			'expr' => 236,
+			'sterm' => 68,
+			'item' => 39,
+			'node' => 23,
+			'ident' => 77,
+			'term' => 58,
+			'lterm' => 56
+		}
+	},
+	{#State 159
+		ACTIONS => {
+			'NOT' => 38,
+			"{" => 30,
+			'LITERAL' => 78,
+			'IDENT' => 2,
+			"\"" => 60,
+			"(" => 53,
+			"\$" => 43,
+			"[" => 9,
+			'NUMBER' => 26,
+			'REF' => 27,
+			"\${" => 37
+		},
+		GOTOS => {
+			'expr' => 237,
+			'sterm' => 68,
+			'item' => 39,
+			'node' => 23,
+			'ident' => 77,
+			'term' => 58,
+			'lterm' => 56
+		}
+	},
+	{#State 160
+		ACTIONS => {
+			'NOT' => 38,
+			"{" => 30,
+			'LITERAL' => 78,
+			'IDENT' => 2,
+			"\"" => 60,
+			"(" => 53,
+			"\$" => 43,
+			"[" => 9,
+			'NUMBER' => 26,
+			'REF' => 27,
+			"\${" => 37
+		},
+		GOTOS => {
+			'expr' => 238,
+			'sterm' => 68,
+			'item' => 39,
+			'node' => 23,
+			'ident' => 77,
+			'term' => 58,
+			'lterm' => 56
+		}
+	},
+	{#State 161
+		ACTIONS => {
+			'NOT' => 38,
+			"{" => 30,
+			'LITERAL' => 78,
+			'IDENT' => 2,
+			"\"" => 60,
+			"(" => 53,
+			"\$" => 43,
+			"[" => 9,
+			'NUMBER' => 26,
+			'REF' => 27,
+			"\${" => 37
+		},
+		GOTOS => {
+			'expr' => 239,
+			'sterm' => 68,
+			'item' => 39,
+			'node' => 23,
+			'ident' => 77,
+			'term' => 58,
+			'lterm' => 56
+		}
+	},
+	{#State 162
+		ACTIONS => {
+			'NOT' => 38,
+			"{" => 30,
+			'LITERAL' => 78,
+			'IDENT' => 2,
+			"\"" => 60,
+			"(" => 53,
+			"\$" => 43,
+			"[" => 9,
+			'NUMBER' => 26,
+			'REF' => 27,
+			"\${" => 37
+		},
+		GOTOS => {
+			'expr' => 240,
+			'sterm' => 68,
+			'item' => 39,
+			'node' => 23,
+			'ident' => 77,
+			'term' => 58,
+			'lterm' => 56
+		}
+	},
+	{#State 163
+		ACTIONS => {
+			'NOT' => 38,
+			"{" => 30,
+			'LITERAL' => 78,
+			'IDENT' => 2,
+			"\"" => 60,
+			"(" => 53,
+			"\$" => 43,
+			"[" => 9,
+			'NUMBER' => 26,
+			'REF' => 27,
+			"\${" => 37
+		},
+		GOTOS => {
+			'expr' => 241,
+			'sterm' => 68,
+			'item' => 39,
+			'node' => 23,
+			'ident' => 77,
+			'term' => 58,
+			'lterm' => 56
+		}
+	},
+	{#State 164
+		ACTIONS => {
+			'NOT' => 38,
+			"{" => 30,
+			'LITERAL' => 78,
+			'IDENT' => 2,
+			"\"" => 60,
+			"(" => 53,
+			"\$" => 43,
+			"[" => 9,
+			'NUMBER' => 26,
+			'REF' => 27,
+			"\${" => 37
+		},
+		GOTOS => {
+			'expr' => 242,
+			'sterm' => 68,
+			'item' => 39,
+			'node' => 23,
+			'ident' => 77,
+			'term' => 58,
+			'lterm' => 56
+		}
+	},
+	{#State 165
+		ACTIONS => {
+			'NOT' => 38,
+			"{" => 30,
+			'LITERAL' => 78,
+			'IDENT' => 2,
+			"\"" => 60,
+			"(" => 53,
+			"\$" => 43,
+			"[" => 9,
+			'NUMBER' => 26,
+			'REF' => 27,
+			"\${" => 37
+		},
+		GOTOS => {
+			'expr' => 243,
+			'sterm' => 68,
+			'item' => 39,
+			'node' => 23,
+			'ident' => 77,
+			'term' => 58,
+			'lterm' => 56
+		}
+	},
+	{#State 166
+		ACTIONS => {
+			'NOT' => 38,
+			"{" => 30,
+			'LITERAL' => 78,
+			'IDENT' => 2,
+			"\"" => 60,
+			"(" => 53,
+			"\$" => 43,
+			"[" => 9,
+			'NUMBER' => 26,
+			'REF' => 27,
+			"\${" => 37
+		},
+		GOTOS => {
+			'expr' => 244,
+			'sterm' => 68,
+			'item' => 39,
+			'node' => 23,
+			'ident' => 77,
+			'term' => 58,
+			'lterm' => 56
+		}
+	},
+	{#State 167
+		DEFAULT => -32
+	},
+	{#State 168
+		ACTIONS => {
+			'CMPOP' => 164,
+			"?" => 158,
+			";" => 245,
+			"+" => 157,
+			'MOD' => 165,
+			'DIV' => 159,
+			"/" => 166,
+			'AND' => 160,
+			'CAT' => 163,
+			'BINOP' => 161,
+			'OR' => 162
+		}
+	},
+	{#State 169
+		ACTIONS => {
+			'COMMA' => 171,
+			'LITERAL' => 75,
+			'IDENT' => 2,
+			"\$" => 43,
+			"\${" => 37
+		},
+		DEFAULT => -31,
+		GOTOS => {
+			'item' => 39,
+			'assign' => 170,
+			'node' => 23,
+			'ident' => 74
+		}
+	},
+	{#State 170
+		DEFAULT => -147
+	},
+	{#State 171
+		DEFAULT => -148
+	},
+	{#State 172
+		ACTIONS => {
+			'NOT' => 38,
+			"{" => 30,
+			'LITERAL' => 78,
+			'IDENT' => 2,
+			"\"" => 60,
+			"(" => 53,
+			"\$" => 43,
+			"[" => 9,
+			'NUMBER' => 26,
+			'REF' => 27,
+			"\${" => 37
+		},
+		GOTOS => {
+			'expr' => 246,
+			'sterm' => 68,
+			'item' => 39,
+			'node' => 23,
+			'ident' => 77,
+			'term' => 58,
+			'lterm' => 56
+		}
+	},
+	{#State 173
+		ACTIONS => {
+			'SET' => 1,
+			'PERL' => 40,
+			'NOT' => 38,
+			'IDENT' => 2,
+			'CLEAR' => 41,
+			'UNLESS' => 3,
+			'IF' => 44,
+			"\$" => 43,
+			'STOP' => 6,
+			'CALL' => 45,
+			'THROW' => 8,
+			'GET' => 47,
+			"[" => 9,
+			'TRY' => 10,
+			'LAST' => 49,
+			'DEBUG' => 51,
+			'RAWPERL' => 13,
+			'META' => 15,
+			'INCLUDE' => 17,
+			"(" => 53,
+			'SWITCH' => 54,
+			'MACRO' => 18,
+			'WRAPPER' => 55,
+			";" => -18,
+			'FOR' => 21,
+			'LITERAL' => 57,
+			'NEXT' => 22,
+			"\"" => 60,
+			'TEXT' => 24,
+			'PROCESS' => 61,
+			'RETURN' => 64,
+			'FILTER' => 25,
+			'INSERT' => 65,
+			'NUMBER' => 26,
+			'REF' => 27,
+			'WHILE' => 67,
+			'BLOCK' => 28,
+			'DEFAULT' => 69,
+			"{" => 30,
+			'USE' => 32,
+			'VIEW' => 36,
+			"\${" => 37
+		},
+		DEFAULT => -3,
+		GOTOS => {
+			'item' => 39,
+			'node' => 23,
+			'rawperl' => 59,
+			'term' => 58,
+			'loop' => 4,
+			'use' => 63,
+			'expr' => 62,
+			'capture' => 42,
+			'statement' => 5,
+			'view' => 7,
+			'wrapper' => 46,
+			'atomexpr' => 48,
+			'chunk' => 11,
+			'defblock' => 66,
+			'atomdir' => 12,
+			'anonblock' => 50,
+			'sterm' => 68,
+			'defblockname' => 14,
+			'filter' => 29,
+			'ident' => 16,
+			'perl' => 31,
+			'setlist' => 70,
+			'chunks' => 33,
+			'try' => 35,
+			'switch' => 34,
+			'assign' => 19,
+			'block' => 247,
+			'directive' => 71,
+			'macro' => 20,
+			'condition' => 73,
+			'lterm' => 56
+		}
+	},
+	{#State 174
+		ACTIONS => {
+			'FILENAME' => 83,
+			'IDENT' => 81,
+			'NUMBER' => 84
+		},
+		GOTOS => {
+			'filepart' => 248
+		}
+	},
+	{#State 175
+		ACTIONS => {
+			'DOT' => 104
+		},
+		DEFAULT => -156,
+		GOTOS => {
+			'args' => 249
+		}
+	},
+	{#State 176
+		ACTIONS => {
+			"\"" => 250,
+			'TEXT' => 231,
+			";" => 233,
+			"\$" => 43,
+			'IDENT' => 2,
+			"\${" => 37
+		},
+		GOTOS => {
+			'item' => 39,
+			'node' => 23,
+			'ident' => 230,
+			'quotable' => 232
+		}
+	},
+	{#State 177
+		ACTIONS => {
+			"\"" => 89,
+			'LITERAL' => 88,
+			'FILENAME' => 83,
+			'IDENT' => 81,
+			'NUMBER' => 84
+		},
+		GOTOS => {
+			'filepart' => 87,
+			'filename' => 85,
+			'name' => 251
+		}
+	},
+	{#State 178
+		DEFAULT => -156,
+		GOTOS => {
+			'args' => 252
+		}
+	},
+	{#State 179
+		ACTIONS => {
+			'NOT' => 38,
+			'LITERAL' => 256,
+			'IDENT' => 2,
+			"\"" => 60,
+			"\$" => 43,
+			"[" => 9,
+			'NUMBER' => 26,
+			'REF' => 27,
+			"{" => 30,
+			'COMMA' => 258,
+			"(" => 53,
+			"\${" => 37
+		},
+		DEFAULT => -163,
+		GOTOS => {
+			'expr' => 257,
+			'sterm' => 68,
+			'item' => 254,
+			'param' => 255,
+			'node' => 23,
+			'ident' => 253,
+			'term' => 58,
+			'lterm' => 56
+		}
+	},
+	{#State 180
+		DEFAULT => -105
+	},
+	{#State 181
+		DEFAULT => -114
+	},
+	{#State 182
+		DEFAULT => -115
+	},
+	{#State 183
+		DEFAULT => -106
+	},
+	{#State 184
+		ACTIONS => {
+			"\"" => 60,
+			"\$" => 43,
+			'LITERAL' => 78,
+			'IDENT' => 2,
+			'REF' => 27,
+			'NUMBER' => 26,
+			"\${" => 37
+		},
+		GOTOS => {
+			'sterm' => 259,
+			'item' => 39,
+			'node' => 23,
+			'ident' => 77
+		}
+	},
+	{#State 185
+		ACTIONS => {
+			'FINAL' => 260,
+			'CATCH' => 262
+		},
+		DEFAULT => -72,
+		GOTOS => {
+			'final' => 261
+		}
+	},
+	{#State 186
+		ACTIONS => {
+			'TEXT' => 263
+		}
+	},
+	{#State 187
+		ACTIONS => {
+			"\"" => 266,
+			'LITERAL' => 265,
+			'NUMBER' => 264
+		}
+	},
+	{#State 188
+		DEFAULT => -97
+	},
+	{#State 189
+		DEFAULT => -98
+	},
+	{#State 190
+		ACTIONS => {
+			'SET' => 1,
+			'PERL' => 40,
+			'NOT' => 38,
+			'IDENT' => 2,
+			'CLEAR' => 41,
+			'UNLESS' => 3,
+			'IF' => 44,
+			"\$" => 43,
+			'STOP' => 6,
+			'CALL' => 45,
+			'THROW' => 8,
+			'GET' => 47,
+			"[" => 9,
+			'TRY' => 10,
+			'LAST' => 49,
+			'DEBUG' => 51,
+			'RAWPERL' => 13,
+			'META' => 15,
+			'INCLUDE' => 17,
+			"(" => 53,
+			'SWITCH' => 54,
+			'MACRO' => 18,
+			'WRAPPER' => 55,
+			";" => -18,
+			'FOR' => 21,
+			'LITERAL' => 57,
+			'NEXT' => 22,
+			"\"" => 60,
+			'TEXT' => 24,
+			'PROCESS' => 61,
+			'RETURN' => 64,
+			'FILTER' => 25,
+			'INSERT' => 65,
+			'NUMBER' => 26,
+			'REF' => 27,
+			'WHILE' => 67,
+			'BLOCK' => 28,
+			'DEFAULT' => 69,
+			"{" => 30,
+			'USE' => 32,
+			'VIEW' => 36,
+			"\${" => 37
+		},
+		DEFAULT => -3,
+		GOTOS => {
+			'item' => 39,
+			'loop' => 4,
+			'capture' => 42,
+			'statement' => 5,
+			'view' => 7,
+			'wrapper' => 46,
+			'atomexpr' => 48,
+			'chunk' => 11,
+			'atomdir' => 12,
+			'anonblock' => 50,
+			'template' => 267,
+			'defblockname' => 14,
+			'ident' => 16,
+			'assign' => 19,
+			'macro' => 20,
+			'lterm' => 56,
+			'node' => 23,
+			'term' => 58,
+			'rawperl' => 59,
+			'expr' => 62,
+			'use' => 63,
+			'defblock' => 66,
+			'filter' => 29,
+			'sterm' => 68,
+			'perl' => 31,
+			'chunks' => 33,
+			'setlist' => 70,
+			'switch' => 34,
+			'try' => 35,
+			'directive' => 71,
+			'block' => 72,
+			'condition' => 73
+		}
+	},
+	{#State 191
+		DEFAULT => -125
+	},
+	{#State 192
+		DEFAULT => -126
+	},
+	{#State 193
+		ACTIONS => {
+			";" => 268
+		}
+	},
+	{#State 194
+		DEFAULT => -89
+	},
+	{#State 195
+		ACTIONS => {
+			";" => -150,
+			"+" => 157,
+			'LITERAL' => -150,
+			'IDENT' => -150,
+			'CAT' => 163,
+			"\$" => -150,
+			'CMPOP' => 164,
+			"?" => 158,
+			'DIV' => 159,
+			'MOD' => 165,
+			'COMMA' => -150,
+			"/" => 166,
+			'AND' => 160,
+			'BINOP' => 161,
+			'OR' => 162,
+			"\${" => -150
+		},
+		DEFAULT => -26
+	},
+	{#State 196
+		DEFAULT => -92
+	},
+	{#State 197
+		DEFAULT => -91
+	},
+	{#State 198
+		ACTIONS => {
+			'NOT' => 38,
+			"{" => 30,
+			'LITERAL' => 57,
+			'IDENT' => 269,
+			"\"" => 60,
+			"(" => 53,
+			"\$" => 43,
+			"[" => 9,
+			'NUMBER' => 26,
+			'REF' => 27,
+			"\${" => 37
+		},
+		GOTOS => {
+			'expr' => 151,
+			'sterm' => 68,
+			'item' => 39,
+			'assign' => 150,
+			'margs' => 270,
+			'node' => 23,
+			'ident' => 149,
+			'term' => 58,
+			'lterm' => 56
+		}
+	},
+	{#State 199
+		ACTIONS => {
+			"+" => 157,
+			'CAT' => 163,
+			'CMPOP' => 164,
+			"?" => 158,
+			'DIV' => 159,
+			'MOD' => 165,
+			"/" => 166,
+			'AND' => 160,
+			'BINOP' => 161,
+			'OR' => 162
+		},
+		DEFAULT => -26
+	},
+	{#State 200
+		ACTIONS => {
+			"{" => 30,
+			'LITERAL' => 78,
+			'IDENT' => 2,
+			"\"" => 60,
+			"\$" => 43,
+			"[" => 9,
+			'NUMBER' => 26,
+			'REF' => 27,
+			"\${" => 37
+		},
+		GOTOS => {
+			'sterm' => 68,
+			'item' => 39,
+			'node' => 23,
+			'ident' => 77,
+			'term' => 271,
+			'lterm' => 56
+		}
+	},
+	{#State 201
+		ACTIONS => {
+			"{" => 30,
+			'LITERAL' => 78,
+			'IDENT' => 2,
+			"\"" => 60,
+			"\$" => 43,
+			"[" => 9,
+			'NUMBER' => 26,
+			'REF' => 27,
+			"\${" => 37
+		},
+		GOTOS => {
+			'sterm' => 68,
+			'item' => 39,
+			'node' => 23,
+			'ident' => 77,
+			'term' => 272,
+			'lterm' => 56
+		}
+	},
+	{#State 202
+		ACTIONS => {
+			'NOT' => 38,
+			"{" => 30,
+			'COMMA' => 258,
+			'LITERAL' => 256,
+			'IDENT' => 2,
+			"\"" => 60,
+			"(" => 53,
+			"\$" => 43,
+			"[" => 9,
+			'NUMBER' => 26,
+			'REF' => 27,
+			"\${" => 37
+		},
+		DEFAULT => -64,
+		GOTOS => {
+			'expr' => 257,
+			'sterm' => 68,
+			'item' => 254,
+			'param' => 255,
+			'node' => 23,
+			'ident' => 253,
+			'term' => 58,
+			'lterm' => 56
+		}
+	},
+	{#State 203
+		DEFAULT => -56,
+		GOTOS => {
+			'@1-3' => 273
+		}
+	},
+	{#State 204
+		ACTIONS => {
+			"\"" => 89,
+			"\$" => 86,
+			'LITERAL' => 88,
+			'FILENAME' => 83,
+			'IDENT' => 81,
+			'NUMBER' => 84
+		},
+		GOTOS => {
+			'filepart' => 87,
+			'names' => 91,
+			'nameargs' => 274,
+			'filename' => 85,
+			'name' => 82
+		}
+	},
+	{#State 205
+		ACTIONS => {
+			'ASSIGN' => -132
+		},
+		DEFAULT => -130
+	},
+	{#State 206
+		ACTIONS => {
+			'SET' => 1,
+			'PERL' => 40,
+			'NOT' => 38,
+			'IDENT' => 2,
+			'CLEAR' => 41,
+			'UNLESS' => 3,
+			'IF' => 44,
+			"\$" => 43,
+			'STOP' => 6,
+			'CALL' => 45,
+			'THROW' => 8,
+			'GET' => 47,
+			"[" => 9,
+			'TRY' => 10,
+			'LAST' => 49,
+			'DEBUG' => 51,
+			'RAWPERL' => 13,
+			'META' => 15,
+			'INCLUDE' => 17,
+			"(" => 53,
+			'SWITCH' => 54,
+			'MACRO' => 18,
+			'WRAPPER' => 55,
+			";" => -18,
+			'FOR' => 21,
+			'LITERAL' => 57,
+			'NEXT' => 22,
+			"\"" => 60,
+			'TEXT' => 24,
+			'PROCESS' => 61,
+			'RETURN' => 64,
+			'FILTER' => 25,
+			'INSERT' => 65,
+			'NUMBER' => 26,
+			'REF' => 27,
+			'WHILE' => 67,
+			'BLOCK' => 28,
+			'DEFAULT' => 69,
+			"{" => 30,
+			'USE' => 32,
+			'VIEW' => 36,
+			"\${" => 37
+		},
+		DEFAULT => -3,
+		GOTOS => {
+			'item' => 39,
+			'node' => 23,
+			'rawperl' => 59,
+			'term' => 58,
+			'loop' => 4,
+			'use' => 63,
+			'expr' => 62,
+			'capture' => 42,
+			'statement' => 5,
+			'view' => 7,
+			'wrapper' => 46,
+			'atomexpr' => 48,
+			'chunk' => 11,
+			'defblock' => 66,
+			'atomdir' => 12,
+			'anonblock' => 50,
+			'sterm' => 68,
+			'defblockname' => 14,
+			'filter' => 29,
+			'ident' => 16,
+			'perl' => 31,
+			'setlist' => 70,
+			'chunks' => 33,
+			'try' => 35,
+			'switch' => 34,
+			'assign' => 19,
+			'block' => 275,
+			'directive' => 71,
+			'macro' => 20,
+			'condition' => 73,
+			'lterm' => 56
+		}
+	},
+	{#State 207
+		ACTIONS => {
+			"\"" => 276,
+			'TEXT' => 231,
+			";" => 233,
+			"\$" => 43,
+			'IDENT' => 2,
+			"\${" => 37
+		},
+		GOTOS => {
+			'item' => 39,
+			'node' => 23,
+			'ident' => 230,
+			'quotable' => 232
+		}
+	},
+	{#State 208
+		ACTIONS => {
+			'SET' => 1,
+			'PERL' => 40,
+			'NOT' => 38,
+			'IDENT' => 2,
+			'CLEAR' => 41,
+			'UNLESS' => 3,
+			'IF' => 44,
+			"\$" => 43,
+			'STOP' => 6,
+			'CALL' => 45,
+			'THROW' => 8,
+			'GET' => 47,
+			"[" => 9,
+			'TRY' => 10,
+			'LAST' => 49,
+			'DEBUG' => 51,
+			'RAWPERL' => 13,
+			'META' => 15,
+			'INCLUDE' => 17,
+			"(" => 53,
+			'SWITCH' => 54,
+			'MACRO' => 18,
+			'WRAPPER' => 55,
+			";" => -18,
+			'FOR' => 21,
+			'LITERAL' => 57,
+			'NEXT' => 22,
+			"\"" => 60,
+			'TEXT' => 24,
+			'PROCESS' => 61,
+			'RETURN' => 64,
+			'FILTER' => 25,
+			'INSERT' => 65,
+			'NUMBER' => 26,
+			'REF' => 27,
+			'WHILE' => 67,
+			'BLOCK' => 28,
+			'DEFAULT' => 69,
+			"{" => 30,
+			'USE' => 32,
+			'VIEW' => 36,
+			"\${" => 37
+		},
+		DEFAULT => -3,
+		GOTOS => {
+			'item' => 39,
+			'node' => 23,
+			'rawperl' => 59,
+			'term' => 58,
+			'loop' => 4,
+			'use' => 63,
+			'expr' => 62,
+			'capture' => 42,
+			'statement' => 5,
+			'view' => 7,
+			'wrapper' => 46,
+			'atomexpr' => 48,
+			'chunk' => 11,
+			'defblock' => 66,
+			'atomdir' => 12,
+			'anonblock' => 50,
+			'sterm' => 68,
+			'defblockname' => 14,
+			'filter' => 29,
+			'ident' => 16,
+			'perl' => 31,
+			'setlist' => 70,
+			'chunks' => 33,
+			'try' => 35,
+			'switch' => 34,
+			'assign' => 19,
+			'block' => 277,
+			'directive' => 71,
+			'macro' => 20,
+			'condition' => 73,
+			'lterm' => 56
+		}
+	},
+	{#State 209
+		DEFAULT => -108
+	},
+	{#State 210
+		ACTIONS => {
+			'NOT' => 38,
+			"{" => 30,
+			'LITERAL' => 78,
+			'IDENT' => 2,
+			"\"" => 60,
+			"(" => 53,
+			"\$" => 43,
+			"[" => 9,
+			'NUMBER' => 26,
+			'REF' => 27,
+			"\${" => 37
+		},
+		GOTOS => {
+			'expr' => 278,
+			'sterm' => 68,
+			'item' => 39,
+			'node' => 23,
+			'ident' => 77,
+			'term' => 58,
+			'lterm' => 56
+		}
+	},
+	{#State 211
+		DEFAULT => -120
+	},
+	{#State 212
+		DEFAULT => -121
+	},
+	{#State 213
+		ACTIONS => {
+			'NOT' => 38,
+			"{" => 30,
+			'LITERAL' => 78,
+			'IDENT' => 2,
+			"\"" => 60,
+			"(" => 53,
+			"\$" => 43,
+			"[" => 9,
+			'NUMBER' => 26,
+			'REF' => 27,
+			"\${" => 37
+		},
+		GOTOS => {
+			'expr' => 279,
+			'sterm' => 68,
+			'item' => 39,
+			'node' => 23,
+			'ident' => 77,
+			'term' => 58,
+			'lterm' => 56
+		}
+	},
+	{#State 214
+		DEFAULT => -74,
+		GOTOS => {
+			'@3-3' => 280
+		}
+	},
+	{#State 215
+		DEFAULT => -131
+	},
+	{#State 216
+		ACTIONS => {
+			'NOT' => 38,
+			"{" => 30,
+			'COMMA' => 258,
+			'LITERAL' => 256,
+			'IDENT' => 2,
+			"\"" => 60,
+			"(" => 53,
+			"\$" => 43,
+			"[" => 9,
+			'NUMBER' => 26,
+			'REF' => 27,
+			")" => 281,
+			"\${" => 37
+		},
+		GOTOS => {
+			'expr' => 257,
+			'sterm' => 68,
+			'item' => 254,
+			'param' => 255,
+			'node' => 23,
+			'ident' => 253,
+			'term' => 58,
+			'lterm' => 56
+		}
+	},
+	{#State 217
+		ACTIONS => {
+			'SET' => 1,
+			'PERL' => 40,
+			'NOT' => 38,
+			'IDENT' => 2,
+			'CLEAR' => 41,
+			'UNLESS' => 3,
+			'IF' => 44,
+			"\$" => 43,
+			'STOP' => 6,
+			'CALL' => 45,
+			'THROW' => 8,
+			'GET' => 47,
+			"[" => 9,
+			'TRY' => 10,
+			'LAST' => 49,
+			'DEBUG' => 51,
+			'RAWPERL' => 13,
+			'META' => 15,
+			'INCLUDE' => 17,
+			"(" => 53,
+			'SWITCH' => 54,
+			'MACRO' => 18,
+			'WRAPPER' => 55,
+			";" => -18,
+			'FOR' => 21,
+			'LITERAL' => 57,
+			'NEXT' => 22,
+			"\"" => 60,
+			'TEXT' => 24,
+			'PROCESS' => 61,
+			'RETURN' => 64,
+			'FILTER' => 25,
+			'INSERT' => 65,
+			'NUMBER' => 26,
+			'REF' => 27,
+			'WHILE' => 67,
+			'BLOCK' => 28,
+			'DEFAULT' => 69,
+			"{" => 30,
+			'USE' => 32,
+			'VIEW' => 36,
+			"\${" => 37
+		},
+		DEFAULT => -3,
+		GOTOS => {
+			'item' => 39,
+			'node' => 23,
+			'rawperl' => 59,
+			'term' => 58,
+			'loop' => 4,
+			'use' => 63,
+			'expr' => 62,
+			'capture' => 42,
+			'statement' => 5,
+			'view' => 7,
+			'wrapper' => 46,
+			'atomexpr' => 48,
+			'chunk' => 11,
+			'defblock' => 66,
+			'atomdir' => 12,
+			'anonblock' => 50,
+			'sterm' => 68,
+			'defblockname' => 14,
+			'filter' => 29,
+			'ident' => 16,
+			'perl' => 31,
+			'setlist' => 70,
+			'chunks' => 33,
+			'try' => 35,
+			'switch' => 34,
+			'assign' => 19,
+			'block' => 282,
+			'directive' => 71,
+			'macro' => 20,
+			'condition' => 73,
+			'lterm' => 56
+		}
+	},
+	{#State 218
+		ACTIONS => {
+			'SET' => 1,
+			'PERL' => 40,
+			'NOT' => 38,
+			'IDENT' => 2,
+			'CLEAR' => 41,
+			'UNLESS' => 3,
+			'IF' => 44,
+			"\$" => 43,
+			'STOP' => 6,
+			'CALL' => 45,
+			'THROW' => 8,
+			'GET' => 47,
+			"[" => 9,
+			'TRY' => 10,
+			'LAST' => 49,
+			'DEBUG' => 51,
+			'RAWPERL' => 13,
+			'META' => 15,
+			'INCLUDE' => 17,
+			"(" => 53,
+			'SWITCH' => 54,
+			'MACRO' => 18,
+			'WRAPPER' => 55,
+			";" => -18,
+			'FOR' => 21,
+			'LITERAL' => 57,
+			'NEXT' => 22,
+			"\"" => 60,
+			'TEXT' => 24,
+			'PROCESS' => 61,
+			'RETURN' => 64,
+			'FILTER' => 25,
+			'INSERT' => 65,
+			'NUMBER' => 26,
+			'REF' => 27,
+			'WHILE' => 67,
+			'BLOCK' => 28,
+			'DEFAULT' => 69,
+			"{" => 30,
+			'USE' => 32,
+			'VIEW' => 36,
+			"\${" => 37
+		},
+		DEFAULT => -3,
+		GOTOS => {
+			'item' => 39,
+			'node' => 23,
+			'rawperl' => 59,
+			'term' => 58,
+			'loop' => 4,
+			'use' => 63,
+			'expr' => 62,
+			'capture' => 42,
+			'statement' => 5,
+			'view' => 7,
+			'wrapper' => 46,
+			'atomexpr' => 48,
+			'chunk' => 11,
+			'defblock' => 66,
+			'atomdir' => 12,
+			'anonblock' => 50,
+			'sterm' => 68,
+			'defblockname' => 14,
+			'filter' => 29,
+			'ident' => 16,
+			'perl' => 31,
+			'setlist' => 70,
+			'chunks' => 33,
+			'try' => 35,
+			'switch' => 34,
+			'assign' => 19,
+			'block' => 283,
+			'directive' => 71,
+			'macro' => 20,
+			'condition' => 73,
+			'lterm' => 56
+		}
+	},
+	{#State 219
+		ACTIONS => {
+			'CMPOP' => 164,
+			"?" => 158,
+			"+" => 157,
+			'MOD' => 165,
+			'DIV' => 159,
+			"/" => 166,
+			'AND' => 160,
+			'CAT' => 163,
+			'BINOP' => 161,
+			'OR' => 162
+		},
+		DEFAULT => -47
+	},
+	{#State 220
+		DEFAULT => -58
+	},
+	{#State 221
+		DEFAULT => -81
+	},
+	{#State 222
+		ACTIONS => {
+			'CMPOP' => 164,
+			"?" => 158,
+			"+" => 157,
+			'MOD' => 165,
+			'DIV' => 159,
+			"/" => 166,
+			'AND' => 160,
+			'CAT' => 163,
+			'BINOP' => 161,
+			'OR' => 162
+		},
+		DEFAULT => -45
+	},
+	{#State 223
+		DEFAULT => -66
+	},
+	{#State 224
+		ACTIONS => {
+			'CMPOP' => 164,
+			"?" => 158,
+			"+" => 157,
+			'MOD' => 165,
+			'DIV' => 159,
+			"/" => 166,
+			'AND' => 160,
+			'CAT' => 163,
+			'BINOP' => 161,
+			'OR' => 162
+		},
+		DEFAULT => -61
+	},
+	{#State 225
+		DEFAULT => -144
+	},
+	{#State 226
+		DEFAULT => -145
+	},
+	{#State 227
+		ACTIONS => {
+			'SET' => 1,
+			'PERL' => 40,
+			'NOT' => 38,
+			'IDENT' => 2,
+			'CLEAR' => 41,
+			'UNLESS' => 3,
+			'IF' => 44,
+			"\$" => 43,
+			'STOP' => 6,
+			'CALL' => 45,
+			'THROW' => 8,
+			'GET' => 47,
+			"[" => 9,
+			'TRY' => 10,
+			'LAST' => 49,
+			'DEBUG' => 51,
+			'RAWPERL' => 13,
+			'META' => 15,
+			'INCLUDE' => 17,
+			"(" => 53,
+			'SWITCH' => 54,
+			'MACRO' => 18,
+			'WRAPPER' => 55,
+			";" => -18,
+			'FOR' => 21,
+			'LITERAL' => 57,
+			'NEXT' => 22,
+			"\"" => 60,
+			'TEXT' => 24,
+			'PROCESS' => 61,
+			'RETURN' => 64,
+			'FILTER' => 25,
+			'INSERT' => 65,
+			'NUMBER' => 26,
+			'REF' => 27,
+			'WHILE' => 67,
+			'BLOCK' => 28,
+			'DEFAULT' => 69,
+			"{" => 30,
+			'USE' => 32,
+			'VIEW' => 36,
+			"\${" => 37
+		},
+		DEFAULT => -3,
+		GOTOS => {
+			'item' => 39,
+			'node' => 23,
+			'rawperl' => 59,
+			'term' => 58,
+			'loop' => 4,
+			'use' => 63,
+			'expr' => 62,
+			'capture' => 42,
+			'statement' => 5,
+			'view' => 7,
+			'wrapper' => 46,
+			'atomexpr' => 48,
+			'chunk' => 11,
+			'defblock' => 66,
+			'atomdir' => 12,
+			'anonblock' => 50,
+			'sterm' => 68,
+			'defblockname' => 14,
+			'filter' => 29,
+			'ident' => 16,
+			'perl' => 31,
+			'setlist' => 70,
+			'chunks' => 33,
+			'try' => 35,
+			'switch' => 34,
+			'assign' => 19,
+			'block' => 284,
+			'directive' => 71,
+			'macro' => 20,
+			'condition' => 73,
+			'lterm' => 56
+		}
+	},
+	{#State 228
+		ACTIONS => {
+			'SET' => 1,
+			'PERL' => 40,
+			'NOT' => 38,
+			'IDENT' => 2,
+			'CLEAR' => 41,
+			'UNLESS' => 3,
+			'IF' => 44,
+			"\$" => 43,
+			'STOP' => 6,
+			'CALL' => 45,
+			'THROW' => 8,
+			'GET' => 47,
+			"[" => 9,
+			'TRY' => 10,
+			'LAST' => 49,
+			'DEBUG' => 51,
+			'RAWPERL' => 13,
+			'META' => 15,
+			'INCLUDE' => 17,
+			"(" => 53,
+			'SWITCH' => 54,
+			'MACRO' => 18,
+			'WRAPPER' => 55,
+			";" => -18,
+			'FOR' => 21,
+			'LITERAL' => 57,
+			'NEXT' => 22,
+			"\"" => 60,
+			'TEXT' => 24,
+			'PROCESS' => 61,
+			'RETURN' => 64,
+			'FILTER' => 25,
+			'INSERT' => 65,
+			'NUMBER' => 26,
+			'REF' => 27,
+			'WHILE' => 67,
+			'BLOCK' => 28,
+			'DEFAULT' => 69,
+			"{" => 30,
+			'USE' => 32,
+			'VIEW' => 36,
+			"\${" => 37
+		},
+		DEFAULT => -3,
+		GOTOS => {
+			'item' => 39,
+			'node' => 23,
+			'rawperl' => 59,
+			'term' => 58,
+			'loop' => 4,
+			'use' => 63,
+			'expr' => 62,
+			'capture' => 42,
+			'statement' => 5,
+			'view' => 7,
+			'wrapper' => 46,
+			'atomexpr' => 48,
+			'chunk' => 11,
+			'defblock' => 66,
+			'atomdir' => 12,
+			'anonblock' => 50,
+			'sterm' => 68,
+			'defblockname' => 14,
+			'filter' => 29,
+			'ident' => 16,
+			'perl' => 31,
+			'setlist' => 70,
+			'chunks' => 33,
+			'try' => 35,
+			'switch' => 34,
+			'assign' => 19,
+			'block' => 285,
+			'directive' => 71,
+			'macro' => 20,
+			'condition' => 73,
+			'lterm' => 56
+		}
+	},
+	{#State 229
+		ACTIONS => {
+			"+" => 157,
+			'CAT' => 163,
+			'CMPOP' => 164,
+			"?" => 158,
+			'DIV' => 159,
+			'MOD' => 165,
+			"/" => 166,
+			'AND' => 160,
+			'BINOP' => 161,
+			'OR' => 162
+		},
+		DEFAULT => -151
+	},
+	{#State 230
+		ACTIONS => {
+			'DOT' => 104
+		},
+		DEFAULT => -177
+	},
+	{#State 231
+		DEFAULT => -178
+	},
+	{#State 232
+		DEFAULT => -175
+	},
+	{#State 233
+		DEFAULT => -179
+	},
+	{#State 234
+		DEFAULT => -111
+	},
+	{#State 235
+		ACTIONS => {
+			'DIV' => 159,
+			'MOD' => 165,
+			"/" => 166
+		},
+		DEFAULT => -135
+	},
+	{#State 236
+		ACTIONS => {
+			":" => 286,
+			'CMPOP' => 164,
+			"?" => 158,
+			"+" => 157,
+			'MOD' => 165,
+			'DIV' => 159,
+			"/" => 166,
+			'AND' => 160,
+			'CAT' => 163,
+			'BINOP' => 161,
+			'OR' => 162
+		}
+	},
+	{#State 237
+		ACTIONS => {
+			'MOD' => 165
+		},
+		DEFAULT => -136
+	},
+	{#State 238
+		ACTIONS => {
+			'DIV' => 159,
+			'BINOP' => 161,
+			"+" => 157,
+			'CAT' => 163,
+			'CMPOP' => 164,
+			'MOD' => 165,
+			"/" => 166
+		},
+		DEFAULT => -140
+	},
+	{#State 239
+		ACTIONS => {
+			'DIV' => 159,
+			"+" => 157,
+			'MOD' => 165,
+			"/" => 166
+		},
+		DEFAULT => -133
+	},
+	{#State 240
+		ACTIONS => {
+			'DIV' => 159,
+			'BINOP' => 161,
+			"+" => 157,
+			'CAT' => 163,
+			'CMPOP' => 164,
+			'MOD' => 165,
+			"/" => 166
+		},
+		DEFAULT => -141
+	},
+	{#State 241
+		ACTIONS => {
+			'DIV' => 159,
+			'BINOP' => 161,
+			"+" => 157,
+			'CMPOP' => 164,
+			'MOD' => 165,
+			"/" => 166
+		},
+		DEFAULT => -139
+	},
+	{#State 242
+		ACTIONS => {
+			'DIV' => 159,
+			'BINOP' => 161,
+			"+" => 157,
+			'MOD' => 165,
+			"/" => 166
+		},
+		DEFAULT => -138
+	},
+	{#State 243
+		DEFAULT => -137
+	},
+	{#State 244
+		ACTIONS => {
+			'DIV' => 159,
+			'MOD' => 165
+		},
+		DEFAULT => -134
+	},
+	{#State 245
+		DEFAULT => -59,
+		GOTOS => {
+			'@2-3' => 287
+		}
+	},
+	{#State 246
+		ACTIONS => {
+			"+" => 157,
+			'CAT' => 163,
+			'CMPOP' => 164,
+			"?" => 158,
+			'DIV' => 159,
+			'MOD' => 165,
+			"/" => 166,
+			'AND' => 160,
+			'BINOP' => 161,
+			'OR' => 162
+		},
+		DEFAULT => -150
+	},
+	{#State 247
+		ACTIONS => {
+			'ELSIF' => 290,
+			'ELSE' => 288
+		},
+		DEFAULT => -50,
+		GOTOS => {
+			'else' => 289
+		}
+	},
+	{#State 248
+		DEFAULT => -170
+	},
+	{#State 249
+		ACTIONS => {
+			'NOT' => 38,
+			'LITERAL' => 256,
+			'IDENT' => 2,
+			"\"" => 60,
+			"\$" => 43,
+			"[" => 9,
+			'NUMBER' => 26,
+			'REF' => 27,
+			"{" => 30,
+			'COMMA' => 258,
+			"(" => 53,
+			"\${" => 37
+		},
+		DEFAULT => -162,
+		GOTOS => {
+			'expr' => 257,
+			'sterm' => 68,
+			'item' => 254,
+			'param' => 255,
+			'node' => 23,
+			'ident' => 253,
+			'term' => 58,
+			'lterm' => 56
+		}
+	},
+	{#State 250
+		DEFAULT => -167
+	},
+	{#State 251
+		DEFAULT => -165
+	},
+	{#State 252
+		ACTIONS => {
+			'NOT' => 38,
+			"{" => 30,
+			'COMMA' => 258,
+			'LITERAL' => 256,
+			'IDENT' => 2,
+			"\"" => 60,
+			"(" => 53,
+			"\$" => 43,
+			"[" => 9,
+			'NUMBER' => 26,
+			'REF' => 27,
+			")" => 291,
+			"\${" => 37
+		},
+		GOTOS => {
+			'expr' => 257,
+			'sterm' => 68,
+			'item' => 254,
+			'param' => 255,
+			'node' => 23,
+			'ident' => 253,
+			'term' => 58,
+			'lterm' => 56
+		}
+	},
+	{#State 253
+		ACTIONS => {
+			'DOT' => 104,
+			'ASSIGN' => 292
+		},
+		DEFAULT => -109
+	},
+	{#State 254
+		ACTIONS => {
+			"(" => 135,
+			'ASSIGN' => 210
+		},
+		DEFAULT => -128
+	},
+	{#State 255
+		DEFAULT => -153
+	},
+	{#State 256
+		ACTIONS => {
+			'ASSIGN' => 213
+		},
+		DEFAULT => -112
+	},
+	{#State 257
+		ACTIONS => {
+			"+" => 157,
+			'CAT' => 163,
+			'CMPOP' => 164,
+			"?" => 158,
+			'DIV' => 159,
+			'MOD' => 165,
+			"/" => 166,
+			'AND' => 160,
+			'BINOP' => 161,
+			'OR' => 162
+		},
+		DEFAULT => -152
+	},
+	{#State 258
+		DEFAULT => -155
+	},
+	{#State 259
+		DEFAULT => -117
+	},
+	{#State 260
+		ACTIONS => {
+			";" => 293
+		}
+	},
+	{#State 261
+		ACTIONS => {
+			'END' => 294
+		}
+	},
+	{#State 262
+		ACTIONS => {
+			";" => 296,
+			'DEFAULT' => 297,
+			'FILENAME' => 83,
+			'IDENT' => 81,
+			'NUMBER' => 84
+		},
+		GOTOS => {
+			'filepart' => 87,
+			'filename' => 295
+		}
+	},
+	{#State 263
+		ACTIONS => {
+			'END' => 298
+		}
+	},
+	{#State 264
+		DEFAULT => -102
+	},
+	{#State 265
+		DEFAULT => -100
+	},
+	{#State 266
+		ACTIONS => {
+			'TEXT' => 299
+		}
+	},
+	{#State 267
+		ACTIONS => {
+			'END' => 300
+		}
+	},
+	{#State 268
+		ACTIONS => {
+			'SET' => 1,
+			'PERL' => 40,
+			'NOT' => 38,
+			'IDENT' => 2,
+			'CLEAR' => 41,
+			'UNLESS' => 3,
+			'IF' => 44,
+			"\$" => 43,
+			'STOP' => 6,
+			'CALL' => 45,
+			'THROW' => 8,
+			'GET' => 47,
+			"[" => 9,
+			'TRY' => 10,
+			'LAST' => 49,
+			'DEBUG' => 51,
+			'RAWPERL' => 13,
+			'META' => 15,
+			'INCLUDE' => 17,
+			"(" => 53,
+			'SWITCH' => 54,
+			'MACRO' => 18,
+			'WRAPPER' => 55,
+			";" => -18,
+			'FOR' => 21,
+			'LITERAL' => 57,
+			'NEXT' => 22,
+			"\"" => 60,
+			'TEXT' => 24,
+			'PROCESS' => 61,
+			'RETURN' => 64,
+			'FILTER' => 25,
+			'INSERT' => 65,
+			'NUMBER' => 26,
+			'REF' => 27,
+			'WHILE' => 67,
+			'BLOCK' => 28,
+			'DEFAULT' => 69,
+			"{" => 30,
+			'USE' => 32,
+			'VIEW' => 36,
+			"\${" => 37
+		},
+		DEFAULT => -3,
+		GOTOS => {
+			'item' => 39,
+			'node' => 23,
+			'rawperl' => 59,
+			'term' => 58,
+			'loop' => 4,
+			'use' => 63,
+			'expr' => 62,
+			'capture' => 42,
+			'statement' => 5,
+			'view' => 7,
+			'wrapper' => 46,
+			'atomexpr' => 48,
+			'chunk' => 11,
+			'defblock' => 66,
+			'atomdir' => 12,
+			'anonblock' => 50,
+			'sterm' => 68,
+			'defblockname' => 14,
+			'filter' => 29,
+			'ident' => 16,
+			'perl' => 31,
+			'setlist' => 70,
+			'chunks' => 33,
+			'try' => 35,
+			'switch' => 34,
+			'assign' => 19,
+			'block' => 301,
+			'directive' => 71,
+			'macro' => 20,
+			'condition' => 73,
+			'lterm' => 56
+		}
+	},
+	{#State 269
+		ACTIONS => {
+			'IDENT' => -96,
+			")" => -96,
+			'COMMA' => -96
+		},
+		DEFAULT => -130
+	},
+	{#State 270
+		ACTIONS => {
+			'COMMA' => 304,
+			'IDENT' => 302,
+			")" => 303
+		}
+	},
+	{#State 271
+		DEFAULT => -156,
+		GOTOS => {
+			'args' => 305
+		}
+	},
+	{#State 272
+		DEFAULT => -156,
+		GOTOS => {
+			'args' => 306
+		}
+	},
+	{#State 273
+		ACTIONS => {
+			'SET' => 1,
+			'PERL' => 40,
+			'NOT' => 38,
+			'IDENT' => 2,
+			'CLEAR' => 41,
+			'UNLESS' => 3,
+			'IF' => 44,
+			"\$" => 43,
+			'STOP' => 6,
+			'CALL' => 45,
+			'THROW' => 8,
+			'GET' => 47,
+			"[" => 9,
+			'TRY' => 10,
+			'LAST' => 49,
+			'DEBUG' => 51,
+			'RAWPERL' => 13,
+			'META' => 15,
+			'INCLUDE' => 17,
+			"(" => 53,
+			'SWITCH' => 54,
+			'MACRO' => 18,
+			'WRAPPER' => 55,
+			";" => -18,
+			'FOR' => 21,
+			'LITERAL' => 57,
+			'NEXT' => 22,
+			"\"" => 60,
+			'TEXT' => 24,
+			'PROCESS' => 61,
+			'RETURN' => 64,
+			'FILTER' => 25,
+			'INSERT' => 65,
+			'NUMBER' => 26,
+			'REF' => 27,
+			'WHILE' => 67,
+			'BLOCK' => 28,
+			'DEFAULT' => 69,
+			"{" => 30,
+			'USE' => 32,
+			'VIEW' => 36,
+			"\${" => 37
+		},
+		DEFAULT => -3,
+		GOTOS => {
+			'item' => 39,
+			'node' => 23,
+			'rawperl' => 59,
+			'term' => 58,
+			'loop' => 4,
+			'use' => 63,
+			'expr' => 62,
+			'capture' => 42,
+			'statement' => 5,
+			'view' => 7,
+			'wrapper' => 46,
+			'atomexpr' => 48,
+			'chunk' => 11,
+			'defblock' => 66,
+			'atomdir' => 12,
+			'anonblock' => 50,
+			'sterm' => 68,
+			'defblockname' => 14,
+			'filter' => 29,
+			'ident' => 16,
+			'perl' => 31,
+			'setlist' => 70,
+			'chunks' => 33,
+			'try' => 35,
+			'switch' => 34,
+			'assign' => 19,
+			'block' => 307,
+			'directive' => 71,
+			'macro' => 20,
+			'condition' => 73,
+			'lterm' => 56
+		}
+	},
+	{#State 274
+		DEFAULT => -157
+	},
+	{#State 275
+		ACTIONS => {
+			'END' => 308
+		}
+	},
+	{#State 276
+		ACTIONS => {
+			'ASSIGN' => -160
+		},
+		DEFAULT => -167
+	},
+	{#State 277
+		ACTIONS => {
+			'END' => 309
+		}
+	},
+	{#State 278
+		ACTIONS => {
+			'DIV' => 159,
+			'AND' => 160,
+			'BINOP' => 161,
+			'OR' => 162,
+			"+" => 157,
+			'CAT' => 163,
+			'CMPOP' => 164,
+			"?" => 158,
+			'MOD' => 165,
+			"/" => 166
+		},
+		DEFAULT => -124
+	},
+	{#State 279
+		ACTIONS => {
+			'DIV' => 159,
+			'AND' => 160,
+			'BINOP' => 161,
+			'OR' => 162,
+			"+" => 157,
+			'CAT' => 163,
+			'CMPOP' => 164,
+			"?" => 158,
+			'MOD' => 165,
+			"/" => 166
+		},
+		DEFAULT => -123
+	},
+	{#State 280
+		ACTIONS => {
+			'SET' => 1,
+			'PERL' => 40,
+			'NOT' => 38,
+			'IDENT' => 2,
+			'CLEAR' => 41,
+			'UNLESS' => 3,
+			'IF' => 44,
+			"\$" => 43,
+			'STOP' => 6,
+			'CALL' => 45,
+			'THROW' => 8,
+			'GET' => 47,
+			"[" => 9,
+			'TRY' => 10,
+			'LAST' => 49,
+			'DEBUG' => 51,
+			'RAWPERL' => 13,
+			'META' => 15,
+			'INCLUDE' => 17,
+			"(" => 53,
+			'SWITCH' => 54,
+			'MACRO' => 18,
+			'WRAPPER' => 55,
+			";" => -18,
+			'FOR' => 21,
+			'LITERAL' => 57,
+			'NEXT' => 22,
+			"\"" => 60,
+			'TEXT' => 24,
+			'PROCESS' => 61,
+			'RETURN' => 64,
+			'FILTER' => 25,
+			'INSERT' => 65,
+			'NUMBER' => 26,
+			'REF' => 27,
+			'WHILE' => 67,
+			'BLOCK' => 28,
+			'DEFAULT' => 69,
+			"{" => 30,
+			'USE' => 32,
+			'VIEW' => 36,
+			"\${" => 37
+		},
+		DEFAULT => -3,
+		GOTOS => {
+			'item' => 39,
+			'node' => 23,
+			'rawperl' => 59,
+			'term' => 58,
+			'loop' => 4,
+			'use' => 63,
+			'expr' => 62,
+			'capture' => 42,
+			'statement' => 5,
+			'view' => 7,
+			'wrapper' => 46,
+			'atomexpr' => 48,
+			'chunk' => 11,
+			'defblock' => 66,
+			'atomdir' => 12,
+			'anonblock' => 50,
+			'sterm' => 68,
+			'defblockname' => 14,
+			'filter' => 29,
+			'ident' => 16,
+			'perl' => 31,
+			'setlist' => 70,
+			'chunks' => 33,
+			'try' => 35,
+			'switch' => 34,
+			'assign' => 19,
+			'block' => 310,
+			'directive' => 71,
+			'macro' => 20,
+			'condition' => 73,
+			'lterm' => 56
+		}
+	},
+	{#State 281
+		DEFAULT => -129
+	},
+	{#State 282
+		ACTIONS => {
+			'END' => 311
+		}
+	},
+	{#State 283
+		ACTIONS => {
+			'ELSIF' => 290,
+			'ELSE' => 288
+		},
+		DEFAULT => -50,
+		GOTOS => {
+			'else' => 312
+		}
+	},
+	{#State 284
+		ACTIONS => {
+			'CASE' => 313
+		},
+		DEFAULT => -55,
+		GOTOS => {
+			'case' => 314
+		}
+	},
+	{#State 285
+		ACTIONS => {
+			'END' => 315
+		}
+	},
+	{#State 286
+		ACTIONS => {
+			'NOT' => 38,
+			"{" => 30,
+			'LITERAL' => 78,
+			'IDENT' => 2,
+			"\"" => 60,
+			"(" => 53,
+			"\$" => 43,
+			"[" => 9,
+			'NUMBER' => 26,
+			'REF' => 27,
+			"\${" => 37
+		},
+		GOTOS => {
+			'expr' => 316,
+			'sterm' => 68,
+			'item' => 39,
+			'node' => 23,
+			'ident' => 77,
+			'term' => 58,
+			'lterm' => 56
+		}
+	},
+	{#State 287
+		ACTIONS => {
+			'SET' => 1,
+			'PERL' => 40,
+			'NOT' => 38,
+			'IDENT' => 2,
+			'CLEAR' => 41,
+			'UNLESS' => 3,
+			'IF' => 44,
+			"\$" => 43,
+			'STOP' => 6,
+			'CALL' => 45,
+			'THROW' => 8,
+			'GET' => 47,
+			"[" => 9,
+			'TRY' => 10,
+			'LAST' => 49,
+			'DEBUG' => 51,
+			'RAWPERL' => 13,
+			'META' => 15,
+			'INCLUDE' => 17,
+			"(" => 53,
+			'SWITCH' => 54,
+			'MACRO' => 18,
+			'WRAPPER' => 55,
+			";" => -18,
+			'FOR' => 21,
+			'LITERAL' => 57,
+			'NEXT' => 22,
+			"\"" => 60,
+			'TEXT' => 24,
+			'PROCESS' => 61,
+			'RETURN' => 64,
+			'FILTER' => 25,
+			'INSERT' => 65,
+			'NUMBER' => 26,
+			'REF' => 27,
+			'WHILE' => 67,
+			'BLOCK' => 28,
+			'DEFAULT' => 69,
+			"{" => 30,
+			'USE' => 32,
+			'VIEW' => 36,
+			"\${" => 37
+		},
+		DEFAULT => -3,
+		GOTOS => {
+			'item' => 39,
+			'node' => 23,
+			'rawperl' => 59,
+			'term' => 58,
+			'loop' => 4,
+			'use' => 63,
+			'expr' => 62,
+			'capture' => 42,
+			'statement' => 5,
+			'view' => 7,
+			'wrapper' => 46,
+			'atomexpr' => 48,
+			'chunk' => 11,
+			'defblock' => 66,
+			'atomdir' => 12,
+			'anonblock' => 50,
+			'sterm' => 68,
+			'defblockname' => 14,
+			'filter' => 29,
+			'ident' => 16,
+			'perl' => 31,
+			'setlist' => 70,
+			'chunks' => 33,
+			'try' => 35,
+			'switch' => 34,
+			'assign' => 19,
+			'block' => 317,
+			'directive' => 71,
+			'macro' => 20,
+			'condition' => 73,
+			'lterm' => 56
+		}
+	},
+	{#State 288
+		ACTIONS => {
+			";" => 318
+		}
+	},
+	{#State 289
+		ACTIONS => {
+			'END' => 319
+		}
+	},
+	{#State 290
+		ACTIONS => {
+			'NOT' => 38,
+			"{" => 30,
+			'LITERAL' => 78,
+			'IDENT' => 2,
+			"\"" => 60,
+			"(" => 53,
+			"\$" => 43,
+			"[" => 9,
+			'NUMBER' => 26,
+			'REF' => 27,
+			"\${" => 37
+		},
+		GOTOS => {
+			'expr' => 320,
+			'sterm' => 68,
+			'item' => 39,
+			'node' => 23,
+			'ident' => 77,
+			'term' => 58,
+			'lterm' => 56
+		}
+	},
+	{#State 291
+		DEFAULT => -164
+	},
+	{#State 292
+		ACTIONS => {
+			'NOT' => 38,
+			"{" => 30,
+			'LITERAL' => 78,
+			'IDENT' => 2,
+			"\"" => 60,
+			"(" => 53,
+			"\$" => 43,
+			"[" => 9,
+			'NUMBER' => 26,
+			'REF' => 27,
+			"\${" => 37
+		},
+		GOTOS => {
+			'expr' => 321,
+			'sterm' => 68,
+			'item' => 39,
+			'node' => 23,
+			'ident' => 77,
+			'term' => 58,
+			'lterm' => 56
+		}
+	},
+	{#State 293
+		ACTIONS => {
+			'SET' => 1,
+			'PERL' => 40,
+			'NOT' => 38,
+			'IDENT' => 2,
+			'CLEAR' => 41,
+			'UNLESS' => 3,
+			'IF' => 44,
+			"\$" => 43,
+			'STOP' => 6,
+			'CALL' => 45,
+			'THROW' => 8,
+			'GET' => 47,
+			"[" => 9,
+			'TRY' => 10,
+			'LAST' => 49,
+			'DEBUG' => 51,
+			'RAWPERL' => 13,
+			'META' => 15,
+			'INCLUDE' => 17,
+			"(" => 53,
+			'SWITCH' => 54,
+			'MACRO' => 18,
+			'WRAPPER' => 55,
+			";" => -18,
+			'FOR' => 21,
+			'LITERAL' => 57,
+			'NEXT' => 22,
+			"\"" => 60,
+			'TEXT' => 24,
+			'PROCESS' => 61,
+			'RETURN' => 64,
+			'FILTER' => 25,
+			'INSERT' => 65,
+			'NUMBER' => 26,
+			'REF' => 27,
+			'WHILE' => 67,
+			'BLOCK' => 28,
+			'DEFAULT' => 69,
+			"{" => 30,
+			'USE' => 32,
+			'VIEW' => 36,
+			"\${" => 37
+		},
+		DEFAULT => -3,
+		GOTOS => {
+			'item' => 39,
+			'node' => 23,
+			'rawperl' => 59,
+			'term' => 58,
+			'loop' => 4,
+			'use' => 63,
+			'expr' => 62,
+			'capture' => 42,
+			'statement' => 5,
+			'view' => 7,
+			'wrapper' => 46,
+			'atomexpr' => 48,
+			'chunk' => 11,
+			'defblock' => 66,
+			'atomdir' => 12,
+			'anonblock' => 50,
+			'sterm' => 68,
+			'defblockname' => 14,
+			'filter' => 29,
+			'ident' => 16,
+			'perl' => 31,
+			'setlist' => 70,
+			'chunks' => 33,
+			'try' => 35,
+			'switch' => 34,
+			'assign' => 19,
+			'block' => 322,
+			'directive' => 71,
+			'macro' => 20,
+			'condition' => 73,
+			'lterm' => 56
+		}
+	},
+	{#State 294
+		DEFAULT => -67
+	},
+	{#State 295
+		ACTIONS => {
+			'DOT' => 174,
+			";" => 323
+		}
+	},
+	{#State 296
+		ACTIONS => {
+			'SET' => 1,
+			'PERL' => 40,
+			'NOT' => 38,
+			'IDENT' => 2,
+			'CLEAR' => 41,
+			'UNLESS' => 3,
+			'IF' => 44,
+			"\$" => 43,
+			'STOP' => 6,
+			'CALL' => 45,
+			'THROW' => 8,
+			'GET' => 47,
+			"[" => 9,
+			'TRY' => 10,
+			'LAST' => 49,
+			'DEBUG' => 51,
+			'RAWPERL' => 13,
+			'META' => 15,
+			'INCLUDE' => 17,
+			"(" => 53,
+			'SWITCH' => 54,
+			'MACRO' => 18,
+			'WRAPPER' => 55,
+			";" => -18,
+			'FOR' => 21,
+			'LITERAL' => 57,
+			'NEXT' => 22,
+			"\"" => 60,
+			'TEXT' => 24,
+			'PROCESS' => 61,
+			'RETURN' => 64,
+			'FILTER' => 25,
+			'INSERT' => 65,
+			'NUMBER' => 26,
+			'REF' => 27,
+			'WHILE' => 67,
+			'BLOCK' => 28,
+			'DEFAULT' => 69,
+			"{" => 30,
+			'USE' => 32,
+			'VIEW' => 36,
+			"\${" => 37
+		},
+		DEFAULT => -3,
+		GOTOS => {
+			'item' => 39,
+			'node' => 23,
+			'rawperl' => 59,
+			'term' => 58,
+			'loop' => 4,
+			'use' => 63,
+			'expr' => 62,
+			'capture' => 42,
+			'statement' => 5,
+			'view' => 7,
+			'wrapper' => 46,
+			'atomexpr' => 48,
+			'chunk' => 11,
+			'defblock' => 66,
+			'atomdir' => 12,
+			'anonblock' => 50,
+			'sterm' => 68,
+			'defblockname' => 14,
+			'filter' => 29,
+			'ident' => 16,
+			'perl' => 31,
+			'setlist' => 70,
+			'chunks' => 33,
+			'try' => 35,
+			'switch' => 34,
+			'assign' => 19,
+			'block' => 324,
+			'directive' => 71,
+			'macro' => 20,
+			'condition' => 73,
+			'lterm' => 56
+		}
+	},
+	{#State 297
+		ACTIONS => {
+			";" => 325
+		}
+	},
+	{#State 298
+		DEFAULT => -79
+	},
+	{#State 299
+		ACTIONS => {
+			"\"" => 326
+		}
+	},
+	{#State 300
+		DEFAULT => -82
+	},
+	{#State 301
+		ACTIONS => {
+			'END' => 327
+		}
+	},
+	{#State 302
+		DEFAULT => -94
+	},
+	{#State 303
+		ACTIONS => {
+			'SET' => 1,
+			'PERL' => 40,
+			'NOT' => 38,
+			'IDENT' => 2,
+			'CLEAR' => 41,
+			'UNLESS' => 3,
+			'IF' => 44,
+			"\$" => 43,
+			'STOP' => 6,
+			'CALL' => 45,
+			'THROW' => 8,
+			'GET' => 47,
+			"[" => 9,
+			'TRY' => 10,
+			'LAST' => 49,
+			'DEBUG' => 51,
+			'INCLUDE' => 17,
+			"(" => 53,
+			'SWITCH' => 54,
+			'WRAPPER' => 55,
+			'FOR' => 21,
+			'NEXT' => 22,
+			'LITERAL' => 57,
+			"\"" => 60,
+			'PROCESS' => 61,
+			'FILTER' => 25,
+			'RETURN' => 64,
+			'INSERT' => 65,
+			'NUMBER' => 26,
+			'REF' => 27,
+			'WHILE' => 67,
+			'BLOCK' => 193,
+			'DEFAULT' => 69,
+			"{" => 30,
+			"\${" => 37
+		},
+		GOTOS => {
+			'item' => 39,
+			'node' => 23,
+			'term' => 58,
+			'loop' => 4,
+			'expr' => 199,
+			'wrapper' => 46,
+			'atomexpr' => 48,
+			'atomdir' => 12,
+			'mdir' => 328,
+			'filter' => 29,
+			'sterm' => 68,
+			'ident' => 149,
+			'perl' => 31,
+			'setlist' => 70,
+			'try' => 35,
+			'switch' => 34,
+			'assign' => 19,
+			'directive' => 196,
+			'condition' => 73,
+			'lterm' => 56
+		}
+	},
+	{#State 304
+		DEFAULT => -95
+	},
+	{#State 305
+		ACTIONS => {
+			'NOT' => 38,
+			"{" => 30,
+			'COMMA' => 258,
+			'LITERAL' => 256,
+			'IDENT' => 2,
+			"\"" => 60,
+			"(" => 53,
+			"\$" => 43,
+			"[" => 9,
+			'NUMBER' => 26,
+			'REF' => 27,
+			"\${" => 37
+		},
+		DEFAULT => -62,
+		GOTOS => {
+			'expr' => 257,
+			'sterm' => 68,
+			'item' => 254,
+			'param' => 255,
+			'node' => 23,
+			'ident' => 253,
+			'term' => 58,
+			'lterm' => 56
+		}
+	},
+	{#State 306
+		ACTIONS => {
+			'NOT' => 38,
+			"{" => 30,
+			'COMMA' => 258,
+			'LITERAL' => 256,
+			'IDENT' => 2,
+			"\"" => 60,
+			"(" => 53,
+			"\$" => 43,
+			"[" => 9,
+			'NUMBER' => 26,
+			'REF' => 27,
+			"\${" => 37
+		},
+		DEFAULT => -63,
+		GOTOS => {
+			'expr' => 257,
+			'sterm' => 68,
+			'item' => 254,
+			'param' => 255,
+			'node' => 23,
+			'ident' => 253,
+			'term' => 58,
+			'lterm' => 56
+		}
+	},
+	{#State 307
+		ACTIONS => {
+			'END' => 329
+		}
+	},
+	{#State 308
+		DEFAULT => -80
+	},
+	{#State 309
+		DEFAULT => -88
+	},
+	{#State 310
+		ACTIONS => {
+			'END' => 330
+		}
+	},
+	{#State 311
+		DEFAULT => -77
+	},
+	{#State 312
+		ACTIONS => {
+			'END' => 331
+		}
+	},
+	{#State 313
+		ACTIONS => {
+			";" => 332,
+			'DEFAULT' => 334,
+			"{" => 30,
+			'LITERAL' => 78,
+			'IDENT' => 2,
+			"\"" => 60,
+			"\$" => 43,
+			"[" => 9,
+			'NUMBER' => 26,
+			'REF' => 27,
+			"\${" => 37
+		},
+		GOTOS => {
+			'sterm' => 68,
+			'item' => 39,
+			'node' => 23,
+			'ident' => 77,
+			'term' => 333,
+			'lterm' => 56
+		}
+	},
+	{#State 314
+		ACTIONS => {
+			'END' => 335
+		}
+	},
+	{#State 315
+		DEFAULT => -65
+	},
+	{#State 316
+		ACTIONS => {
+			'DIV' => 159,
+			'AND' => 160,
+			'BINOP' => 161,
+			'OR' => 162,
+			"+" => 157,
+			'CAT' => 163,
+			'CMPOP' => 164,
+			"?" => 158,
+			'MOD' => 165,
+			"/" => 166
+		},
+		DEFAULT => -143
+	},
+	{#State 317
+		ACTIONS => {
+			'END' => 336
+		}
+	},
+	{#State 318
+		ACTIONS => {
+			'SET' => 1,
+			'PERL' => 40,
+			'NOT' => 38,
+			'IDENT' => 2,
+			'CLEAR' => 41,
+			'UNLESS' => 3,
+			'IF' => 44,
+			"\$" => 43,
+			'STOP' => 6,
+			'CALL' => 45,
+			'THROW' => 8,
+			'GET' => 47,
+			"[" => 9,
+			'TRY' => 10,
+			'LAST' => 49,
+			'DEBUG' => 51,
+			'RAWPERL' => 13,
+			'META' => 15,
+			'INCLUDE' => 17,
+			"(" => 53,
+			'SWITCH' => 54,
+			'MACRO' => 18,
+			'WRAPPER' => 55,
+			";" => -18,
+			'FOR' => 21,
+			'LITERAL' => 57,
+			'NEXT' => 22,
+			"\"" => 60,
+			'TEXT' => 24,
+			'PROCESS' => 61,
+			'RETURN' => 64,
+			'FILTER' => 25,
+			'INSERT' => 65,
+			'NUMBER' => 26,
+			'REF' => 27,
+			'WHILE' => 67,
+			'BLOCK' => 28,
+			'DEFAULT' => 69,
+			"{" => 30,
+			'USE' => 32,
+			'VIEW' => 36,
+			"\${" => 37
+		},
+		DEFAULT => -3,
+		GOTOS => {
+			'item' => 39,
+			'node' => 23,
+			'rawperl' => 59,
+			'term' => 58,
+			'loop' => 4,
+			'use' => 63,
+			'expr' => 62,
+			'capture' => 42,
+			'statement' => 5,
+			'view' => 7,
+			'wrapper' => 46,
+			'atomexpr' => 48,
+			'chunk' => 11,
+			'defblock' => 66,
+			'atomdir' => 12,
+			'anonblock' => 50,
+			'sterm' => 68,
+			'defblockname' => 14,
+			'filter' => 29,
+			'ident' => 16,
+			'perl' => 31,
+			'setlist' => 70,
+			'chunks' => 33,
+			'try' => 35,
+			'switch' => 34,
+			'assign' => 19,
+			'block' => 337,
+			'directive' => 71,
+			'macro' => 20,
+			'condition' => 73,
+			'lterm' => 56
+		}
+	},
+	{#State 319
+		DEFAULT => -46
+	},
+	{#State 320
+		ACTIONS => {
+			'CMPOP' => 164,
+			"?" => 158,
+			";" => 338,
+			"+" => 157,
+			'MOD' => 165,
+			'DIV' => 159,
+			"/" => 166,
+			'AND' => 160,
+			'CAT' => 163,
+			'BINOP' => 161,
+			'OR' => 162
+		}
+	},
+	{#State 321
+		ACTIONS => {
+			"+" => 157,
+			'CAT' => 163,
+			'CMPOP' => 164,
+			"?" => 158,
+			'DIV' => 159,
+			'MOD' => 165,
+			"/" => 166,
+			'AND' => 160,
+			'BINOP' => 161,
+			'OR' => 162
+		},
+		DEFAULT => -154
+	},
+	{#State 322
+		DEFAULT => -71
+	},
+	{#State 323
+		ACTIONS => {
+			'SET' => 1,
+			'PERL' => 40,
+			'NOT' => 38,
+			'IDENT' => 2,
+			'CLEAR' => 41,
+			'UNLESS' => 3,
+			'IF' => 44,
+			"\$" => 43,
+			'STOP' => 6,
+			'CALL' => 45,
+			'THROW' => 8,
+			'GET' => 47,
+			"[" => 9,
+			'TRY' => 10,
+			'LAST' => 49,
+			'DEBUG' => 51,
+			'RAWPERL' => 13,
+			'META' => 15,
+			'INCLUDE' => 17,
+			"(" => 53,
+			'SWITCH' => 54,
+			'MACRO' => 18,
+			'WRAPPER' => 55,
+			";" => -18,
+			'FOR' => 21,
+			'LITERAL' => 57,
+			'NEXT' => 22,
+			"\"" => 60,
+			'TEXT' => 24,
+			'PROCESS' => 61,
+			'RETURN' => 64,
+			'FILTER' => 25,
+			'INSERT' => 65,
+			'NUMBER' => 26,
+			'REF' => 27,
+			'WHILE' => 67,
+			'BLOCK' => 28,
+			'DEFAULT' => 69,
+			"{" => 30,
+			'USE' => 32,
+			'VIEW' => 36,
+			"\${" => 37
+		},
+		DEFAULT => -3,
+		GOTOS => {
+			'item' => 39,
+			'node' => 23,
+			'rawperl' => 59,
+			'term' => 58,
+			'loop' => 4,
+			'use' => 63,
+			'expr' => 62,
+			'capture' => 42,
+			'statement' => 5,
+			'view' => 7,
+			'wrapper' => 46,
+			'atomexpr' => 48,
+			'chunk' => 11,
+			'defblock' => 66,
+			'atomdir' => 12,
+			'anonblock' => 50,
+			'sterm' => 68,
+			'defblockname' => 14,
+			'filter' => 29,
+			'ident' => 16,
+			'perl' => 31,
+			'setlist' => 70,
+			'chunks' => 33,
+			'try' => 35,
+			'switch' => 34,
+			'assign' => 19,
+			'block' => 339,
+			'directive' => 71,
+			'macro' => 20,
+			'condition' => 73,
+			'lterm' => 56
+		}
+	},
+	{#State 324
+		ACTIONS => {
+			'FINAL' => 260,
+			'CATCH' => 262
+		},
+		DEFAULT => -72,
+		GOTOS => {
+			'final' => 340
+		}
+	},
+	{#State 325
+		ACTIONS => {
+			'SET' => 1,
+			'PERL' => 40,
+			'NOT' => 38,
+			'IDENT' => 2,
+			'CLEAR' => 41,
+			'UNLESS' => 3,
+			'IF' => 44,
+			"\$" => 43,
+			'STOP' => 6,
+			'CALL' => 45,
+			'THROW' => 8,
+			'GET' => 47,
+			"[" => 9,
+			'TRY' => 10,
+			'LAST' => 49,
+			'DEBUG' => 51,
+			'RAWPERL' => 13,
+			'META' => 15,
+			'INCLUDE' => 17,
+			"(" => 53,
+			'SWITCH' => 54,
+			'MACRO' => 18,
+			'WRAPPER' => 55,
+			";" => -18,
+			'FOR' => 21,
+			'LITERAL' => 57,
+			'NEXT' => 22,
+			"\"" => 60,
+			'TEXT' => 24,
+			'PROCESS' => 61,
+			'RETURN' => 64,
+			'FILTER' => 25,
+			'INSERT' => 65,
+			'NUMBER' => 26,
+			'REF' => 27,
+			'WHILE' => 67,
+			'BLOCK' => 28,
+			'DEFAULT' => 69,
+			"{" => 30,
+			'USE' => 32,
+			'VIEW' => 36,
+			"\${" => 37
+		},
+		DEFAULT => -3,
+		GOTOS => {
+			'item' => 39,
+			'node' => 23,
+			'rawperl' => 59,
+			'term' => 58,
+			'loop' => 4,
+			'use' => 63,
+			'expr' => 62,
+			'capture' => 42,
+			'statement' => 5,
+			'view' => 7,
+			'wrapper' => 46,
+			'atomexpr' => 48,
+			'chunk' => 11,
+			'defblock' => 66,
+			'atomdir' => 12,
+			'anonblock' => 50,
+			'sterm' => 68,
+			'defblockname' => 14,
+			'filter' => 29,
+			'ident' => 16,
+			'perl' => 31,
+			'setlist' => 70,
+			'chunks' => 33,
+			'try' => 35,
+			'switch' => 34,
+			'assign' => 19,
+			'block' => 341,
+			'directive' => 71,
+			'macro' => 20,
+			'condition' => 73,
+			'lterm' => 56
+		}
+	},
+	{#State 326
+		DEFAULT => -101
+	},
+	{#State 327
+		DEFAULT => -93
+	},
+	{#State 328
+		DEFAULT => -90
+	},
+	{#State 329
+		DEFAULT => -57
+	},
+	{#State 330
+		DEFAULT => -75
+	},
+	{#State 331
+		DEFAULT => -44
+	},
+	{#State 332
+		ACTIONS => {
+			'SET' => 1,
+			'PERL' => 40,
+			'NOT' => 38,
+			'IDENT' => 2,
+			'CLEAR' => 41,
+			'UNLESS' => 3,
+			'IF' => 44,
+			"\$" => 43,
+			'STOP' => 6,
+			'CALL' => 45,
+			'THROW' => 8,
+			'GET' => 47,
+			"[" => 9,
+			'TRY' => 10,
+			'LAST' => 49,
+			'DEBUG' => 51,
+			'RAWPERL' => 13,
+			'META' => 15,
+			'INCLUDE' => 17,
+			"(" => 53,
+			'SWITCH' => 54,
+			'MACRO' => 18,
+			'WRAPPER' => 55,
+			";" => -18,
+			'FOR' => 21,
+			'LITERAL' => 57,
+			'NEXT' => 22,
+			"\"" => 60,
+			'TEXT' => 24,
+			'PROCESS' => 61,
+			'RETURN' => 64,
+			'FILTER' => 25,
+			'INSERT' => 65,
+			'NUMBER' => 26,
+			'REF' => 27,
+			'WHILE' => 67,
+			'BLOCK' => 28,
+			'DEFAULT' => 69,
+			"{" => 30,
+			'USE' => 32,
+			'VIEW' => 36,
+			"\${" => 37
+		},
+		DEFAULT => -3,
+		GOTOS => {
+			'item' => 39,
+			'node' => 23,
+			'rawperl' => 59,
+			'term' => 58,
+			'loop' => 4,
+			'use' => 63,
+			'expr' => 62,
+			'capture' => 42,
+			'statement' => 5,
+			'view' => 7,
+			'wrapper' => 46,
+			'atomexpr' => 48,
+			'chunk' => 11,
+			'defblock' => 66,
+			'atomdir' => 12,
+			'anonblock' => 50,
+			'sterm' => 68,
+			'defblockname' => 14,
+			'filter' => 29,
+			'ident' => 16,
+			'perl' => 31,
+			'setlist' => 70,
+			'chunks' => 33,
+			'try' => 35,
+			'switch' => 34,
+			'assign' => 19,
+			'block' => 342,
+			'directive' => 71,
+			'macro' => 20,
+			'condition' => 73,
+			'lterm' => 56
+		}
+	},
+	{#State 333
+		ACTIONS => {
+			";" => 343
+		}
+	},
+	{#State 334
+		ACTIONS => {
+			";" => 344
+		}
+	},
+	{#State 335
+		DEFAULT => -51
+	},
+	{#State 336
+		DEFAULT => -60
+	},
+	{#State 337
+		DEFAULT => -49
+	},
+	{#State 338
+		ACTIONS => {
+			'SET' => 1,
+			'PERL' => 40,
+			'NOT' => 38,
+			'IDENT' => 2,
+			'CLEAR' => 41,
+			'UNLESS' => 3,
+			'IF' => 44,
+			"\$" => 43,
+			'STOP' => 6,
+			'CALL' => 45,
+			'THROW' => 8,
+			'GET' => 47,
+			"[" => 9,
+			'TRY' => 10,
+			'LAST' => 49,
+			'DEBUG' => 51,
+			'RAWPERL' => 13,
+			'META' => 15,
+			'INCLUDE' => 17,
+			"(" => 53,
+			'SWITCH' => 54,
+			'MACRO' => 18,
+			'WRAPPER' => 55,
+			";" => -18,
+			'FOR' => 21,
+			'LITERAL' => 57,
+			'NEXT' => 22,
+			"\"" => 60,
+			'TEXT' => 24,
+			'PROCESS' => 61,
+			'RETURN' => 64,
+			'FILTER' => 25,
+			'INSERT' => 65,
+			'NUMBER' => 26,
+			'REF' => 27,
+			'WHILE' => 67,
+			'BLOCK' => 28,
+			'DEFAULT' => 69,
+			"{" => 30,
+			'USE' => 32,
+			'VIEW' => 36,
+			"\${" => 37
+		},
+		DEFAULT => -3,
+		GOTOS => {
+			'item' => 39,
+			'node' => 23,
+			'rawperl' => 59,
+			'term' => 58,
+			'loop' => 4,
+			'use' => 63,
+			'expr' => 62,
+			'capture' => 42,
+			'statement' => 5,
+			'view' => 7,
+			'wrapper' => 46,
+			'atomexpr' => 48,
+			'chunk' => 11,
+			'defblock' => 66,
+			'atomdir' => 12,
+			'anonblock' => 50,
+			'sterm' => 68,
+			'defblockname' => 14,
+			'filter' => 29,
+			'ident' => 16,
+			'perl' => 31,
+			'setlist' => 70,
+			'chunks' => 33,
+			'try' => 35,
+			'switch' => 34,
+			'assign' => 19,
+			'block' => 345,
+			'directive' => 71,
+			'macro' => 20,
+			'condition' => 73,
+			'lterm' => 56
+		}
+	},
+	{#State 339
+		ACTIONS => {
+			'FINAL' => 260,
+			'CATCH' => 262
+		},
+		DEFAULT => -72,
+		GOTOS => {
+			'final' => 346
+		}
+	},
+	{#State 340
+		DEFAULT => -70
+	},
+	{#State 341
+		ACTIONS => {
+			'FINAL' => 260,
+			'CATCH' => 262
+		},
+		DEFAULT => -72,
+		GOTOS => {
+			'final' => 347
+		}
+	},
+	{#State 342
+		DEFAULT => -54
+	},
+	{#State 343
+		ACTIONS => {
+			'SET' => 1,
+			'PERL' => 40,
+			'NOT' => 38,
+			'IDENT' => 2,
+			'CLEAR' => 41,
+			'UNLESS' => 3,
+			'IF' => 44,
+			"\$" => 43,
+			'STOP' => 6,
+			'CALL' => 45,
+			'THROW' => 8,
+			'GET' => 47,
+			"[" => 9,
+			'TRY' => 10,
+			'LAST' => 49,
+			'DEBUG' => 51,
+			'RAWPERL' => 13,
+			'META' => 15,
+			'INCLUDE' => 17,
+			"(" => 53,
+			'SWITCH' => 54,
+			'MACRO' => 18,
+			'WRAPPER' => 55,
+			";" => -18,
+			'FOR' => 21,
+			'LITERAL' => 57,
+			'NEXT' => 22,
+			"\"" => 60,
+			'TEXT' => 24,
+			'PROCESS' => 61,
+			'RETURN' => 64,
+			'FILTER' => 25,
+			'INSERT' => 65,
+			'NUMBER' => 26,
+			'REF' => 27,
+			'WHILE' => 67,
+			'BLOCK' => 28,
+			'DEFAULT' => 69,
+			"{" => 30,
+			'USE' => 32,
+			'VIEW' => 36,
+			"\${" => 37
+		},
+		DEFAULT => -3,
+		GOTOS => {
+			'item' => 39,
+			'node' => 23,
+			'rawperl' => 59,
+			'term' => 58,
+			'loop' => 4,
+			'use' => 63,
+			'expr' => 62,
+			'capture' => 42,
+			'statement' => 5,
+			'view' => 7,
+			'wrapper' => 46,
+			'atomexpr' => 48,
+			'chunk' => 11,
+			'defblock' => 66,
+			'atomdir' => 12,
+			'anonblock' => 50,
+			'sterm' => 68,
+			'defblockname' => 14,
+			'filter' => 29,
+			'ident' => 16,
+			'perl' => 31,
+			'setlist' => 70,
+			'chunks' => 33,
+			'try' => 35,
+			'switch' => 34,
+			'assign' => 19,
+			'block' => 348,
+			'directive' => 71,
+			'macro' => 20,
+			'condition' => 73,
+			'lterm' => 56
+		}
+	},
+	{#State 344
+		ACTIONS => {
+			'SET' => 1,
+			'PERL' => 40,
+			'NOT' => 38,
+			'IDENT' => 2,
+			'CLEAR' => 41,
+			'UNLESS' => 3,
+			'IF' => 44,
+			"\$" => 43,
+			'STOP' => 6,
+			'CALL' => 45,
+			'THROW' => 8,
+			'GET' => 47,
+			"[" => 9,
+			'TRY' => 10,
+			'LAST' => 49,
+			'DEBUG' => 51,
+			'RAWPERL' => 13,
+			'META' => 15,
+			'INCLUDE' => 17,
+			"(" => 53,
+			'SWITCH' => 54,
+			'MACRO' => 18,
+			'WRAPPER' => 55,
+			";" => -18,
+			'FOR' => 21,
+			'LITERAL' => 57,
+			'NEXT' => 22,
+			"\"" => 60,
+			'TEXT' => 24,
+			'PROCESS' => 61,
+			'RETURN' => 64,
+			'FILTER' => 25,
+			'INSERT' => 65,
+			'NUMBER' => 26,
+			'REF' => 27,
+			'WHILE' => 67,
+			'BLOCK' => 28,
+			'DEFAULT' => 69,
+			"{" => 30,
+			'USE' => 32,
+			'VIEW' => 36,
+			"\${" => 37
+		},
+		DEFAULT => -3,
+		GOTOS => {
+			'item' => 39,
+			'node' => 23,
+			'rawperl' => 59,
+			'term' => 58,
+			'loop' => 4,
+			'use' => 63,
+			'expr' => 62,
+			'capture' => 42,
+			'statement' => 5,
+			'view' => 7,
+			'wrapper' => 46,
+			'atomexpr' => 48,
+			'chunk' => 11,
+			'defblock' => 66,
+			'atomdir' => 12,
+			'anonblock' => 50,
+			'sterm' => 68,
+			'defblockname' => 14,
+			'filter' => 29,
+			'ident' => 16,
+			'perl' => 31,
+			'setlist' => 70,
+			'chunks' => 33,
+			'try' => 35,
+			'switch' => 34,
+			'assign' => 19,
+			'block' => 349,
+			'directive' => 71,
+			'macro' => 20,
+			'condition' => 73,
+			'lterm' => 56
+		}
+	},
+	{#State 345
+		ACTIONS => {
+			'ELSIF' => 290,
+			'ELSE' => 288
+		},
+		DEFAULT => -50,
+		GOTOS => {
+			'else' => 350
+		}
+	},
+	{#State 346
+		DEFAULT => -68
+	},
+	{#State 347
+		DEFAULT => -69
+	},
+	{#State 348
+		ACTIONS => {
+			'CASE' => 313
+		},
+		DEFAULT => -55,
+		GOTOS => {
+			'case' => 351
+		}
+	},
+	{#State 349
+		DEFAULT => -53
+	},
+	{#State 350
+		DEFAULT => -48
+	},
+	{#State 351
+		DEFAULT => -52
+	}
+]; 
+
+
+#========================================================================
+# Rules: one [ LHS nonterminal, RHS symbol count, action sub or undef ] per production
+#========================================================================
+
+$RULES = [
+	[#Rule 0
+		 '$start', 2, undef
+	],
+	[#Rule 1
+		 'template', 1,
+sub
+#line 64 "Parser.yp"
+{ $factory->template($_[1])           }
+	],
+	[#Rule 2
+		 'block', 1,
+sub
+#line 67 "Parser.yp"
+{ $factory->block($_[1])              }
+	],
+	[#Rule 3
+		 'block', 0,
+sub
+#line 68 "Parser.yp"
+{ $factory->block()                   }
+	],
+	[#Rule 4
+		 'chunks', 2,
+sub
+#line 71 "Parser.yp"
+{ push(@{$_[1]}, $_[2]) 
+                                        if defined $_[2]; $_[1]           }
+	],
+	[#Rule 5
+		 'chunks', 1,
+sub
+#line 73 "Parser.yp"
+{ defined $_[1] ? [ $_[1] ] : [ ]     }
+	],
+	[#Rule 6
+		 'chunk', 1,
+sub
+#line 76 "Parser.yp"
+{ $factory->textblock($_[1])          }
+	],
+	[#Rule 7
+		 'chunk', 2,
+sub
+#line 77 "Parser.yp"
+{ return '' unless $_[1];
+                                      $_[0]->location() . $_[1];
+                                    }
+	],
+	[#Rule 8
+		 'statement', 1, undef
+	],
+	[#Rule 9
+		 'statement', 1, undef
+	],
+	[#Rule 10
+		 'statement', 1, undef
+	],
+	[#Rule 11
+		 'statement', 1, undef
+	],
+	[#Rule 12
+		 'statement', 1, undef
+	],
+	[#Rule 13
+		 'statement', 1, undef
+	],
+	[#Rule 14
+		 'statement', 1, undef
+	],
+	[#Rule 15
+		 'statement', 1, undef
+	],
+	[#Rule 16
+		 'statement', 1,
+sub
+#line 90 "Parser.yp"
+{ $factory->get($_[1])                }
+	],
+	[#Rule 17
+		 'statement', 2,
+sub
+#line 91 "Parser.yp"
+{ $_[0]->add_metadata($_[2]);         }
+	],
+	[#Rule 18
+		 'statement', 0, undef
+	],
+	[#Rule 19
+		 'directive', 1,
+sub
+#line 95 "Parser.yp"
+{ $factory->set($_[1])                }
+	],
+	[#Rule 20
+		 'directive', 1, undef
+	],
+	[#Rule 21
+		 'directive', 1, undef
+	],
+	[#Rule 22
+		 'directive', 1, undef
+	],
+	[#Rule 23
+		 'directive', 1, undef
+	],
+	[#Rule 24
+		 'directive', 1, undef
+	],
+	[#Rule 25
+		 'directive', 1, undef
+	],
+	[#Rule 26
+		 'atomexpr', 1,
+sub
+#line 109 "Parser.yp"
+{ $factory->get($_[1])                }
+	],
+	[#Rule 27
+		 'atomexpr', 1, undef
+	],
+	[#Rule 28
+		 'atomdir', 2,
+sub
+#line 113 "Parser.yp"
+{ $factory->get($_[2])                }
+	],
+	[#Rule 29
+		 'atomdir', 2,
+sub
+#line 114 "Parser.yp"
+{ $factory->call($_[2])               }
+	],
+	[#Rule 30
+		 'atomdir', 2,
+sub
+#line 115 "Parser.yp"
+{ $factory->set($_[2])                }
+	],
+	[#Rule 31
+		 'atomdir', 2,
+sub
+#line 116 "Parser.yp"
+{ $factory->default($_[2])            }
+	],
+	[#Rule 32
+		 'atomdir', 2,
+sub
+#line 117 "Parser.yp"
+{ $factory->insert($_[2])             }
+	],
+	[#Rule 33
+		 'atomdir', 2,
+sub
+#line 118 "Parser.yp"
+{ $factory->include($_[2])            }
+	],
+	[#Rule 34
+		 'atomdir', 2,
+sub
+#line 119 "Parser.yp"
+{ $factory->process($_[2])            }
+	],
+	[#Rule 35
+		 'atomdir', 2,
+sub
+#line 120 "Parser.yp"
+{ $factory->throw($_[2])              }
+	],
+	[#Rule 36
+		 'atomdir', 1,
+sub
+#line 121 "Parser.yp"
+{ $factory->return()                  }
+	],
+	[#Rule 37
+		 'atomdir', 1,
+sub
+#line 122 "Parser.yp"
+{ $factory->stop()                    }
+	],
+	[#Rule 38
+		 'atomdir', 1,
+sub
+#line 123 "Parser.yp"
+{ "\$output = '';";                   }
+	],
+	[#Rule 39
+		 'atomdir', 1,
+sub
+#line 124 "Parser.yp"
+{ $_[0]->block_label('last ', ';')    }
+	],
+	[#Rule 40
+		 'atomdir', 1,
+sub
+#line 125 "Parser.yp"
+{ $_[0]->in_block('FOR')
+                                        ? $factory->next($_[0]->block_label)
+                                        : $_[0]->block_label('next ', ';') }
+	],
+	[#Rule 41
+		 'atomdir', 2,
+sub
+#line 128 "Parser.yp"
+{ if ($_[2]->[0]->[0] =~ /^'(on|off)'$/) {
+                                          $_[0]->{ DEBUG_DIRS } = ($1 eq 'on');
+                                          $factory->debug($_[2]);
+                                      }
+                                      else {
+                                          $_[0]->{ DEBUG_DIRS } ? $factory->debug($_[2]) : '';
+                                      }
+                                    }
+	],
+	[#Rule 42
+		 'atomdir', 1, undef
+	],
+	[#Rule 43
+		 'atomdir', 1, undef
+	],
+	[#Rule 44
+		 'condition', 6,
+sub
+#line 141 "Parser.yp"
+{ $factory->if(@_[2, 4, 5])           }
+	],
+	[#Rule 45
+		 'condition', 3,
+sub
+#line 142 "Parser.yp"
+{ $factory->if(@_[3, 1])              }
+	],
+	[#Rule 46
+		 'condition', 6,
+sub
+#line 144 "Parser.yp"
+{ $factory->if("!($_[2])", @_[4, 5])  }
+	],
+	[#Rule 47
+		 'condition', 3,
+sub
+#line 145 "Parser.yp"
+{ $factory->if("!($_[3])", $_[1])     }
+	],
+	[#Rule 48
+		 'else', 5,
+sub
+#line 149 "Parser.yp"
+{ unshift(@{$_[5]}, [ @_[2, 4] ]);
+                                      $_[5];                              }
+	],
+	[#Rule 49
+		 'else', 3,
+sub
+#line 151 "Parser.yp"
+{ [ $_[3] ]                           }
+	],
+	[#Rule 50
+		 'else', 0,
+sub
+#line 152 "Parser.yp"
+{ [ undef ]                           }
+	],
+	[#Rule 51
+		 'switch', 6,
+sub
+#line 156 "Parser.yp"
+{ $factory->switch(@_[2, 5])          }
+	],
+	[#Rule 52
+		 'case', 5,
+sub
+#line 160 "Parser.yp"
+{ unshift(@{$_[5]}, [ @_[2, 4] ]); 
+                                      $_[5];                              }
+	],
+	[#Rule 53
+		 'case', 4,
+sub
+#line 162 "Parser.yp"
+{ [ $_[4] ]                           }
+	],
+	[#Rule 54
+		 'case', 3,
+sub
+#line 163 "Parser.yp"
+{ [ $_[3] ]                           }
+	],
+	[#Rule 55
+		 'case', 0,
+sub
+#line 164 "Parser.yp"
+{ [ undef ]                           }
+	],
+	[#Rule 56
+		 '@1-3', 0,
+sub
+#line 167 "Parser.yp"
+{ $_[0]->enter_block('FOR')           }
+	],
+	[#Rule 57
+		 'loop', 6,
+sub
+#line 168 "Parser.yp"
+{ $factory->foreach(@{$_[2]}, $_[5], $_[0]->leave_block)  }
+	],
+	[#Rule 58
+		 'loop', 3,
+sub
+#line 169 "Parser.yp"
+{ $factory->foreach(@{$_[3]}, $_[1])  }
+	],
+	[#Rule 59
+		 '@2-3', 0,
+sub
+#line 170 "Parser.yp"
+{ $_[0]->enter_block('WHILE')         }
+	],
+	[#Rule 60
+		 'loop', 6,
+sub
+#line 171 "Parser.yp"
+{ $factory->while(@_[2, 5], $_[0]->leave_block) }
+	],
+	[#Rule 61
+		 'loop', 3,
+sub
+#line 172 "Parser.yp"
+{ $factory->while(@_[3, 1]) }
+	],
+	[#Rule 62
+		 'loopvar', 4,
+sub
+#line 175 "Parser.yp"
+{ [ @_[1, 3, 4] ]                     }
+	],
+	[#Rule 63
+		 'loopvar', 4,
+sub
+#line 176 "Parser.yp"
+{ [ @_[1, 3, 4] ]                     }
+	],
+	[#Rule 64
+		 'loopvar', 2,
+sub
+#line 177 "Parser.yp"
+{ [ 0, @_[1, 2] ]                     }
+	],
+	[#Rule 65
+		 'wrapper', 5,
+sub
+#line 181 "Parser.yp"
+{ $factory->wrapper(@_[2, 4])         }
+	],
+	[#Rule 66
+		 'wrapper', 3,
+sub
+#line 183 "Parser.yp"
+{ $factory->wrapper(@_[3, 1])         }
+	],
+	[#Rule 67
+		 'try', 5,
+sub
+#line 187 "Parser.yp"
+{ $factory->try(@_[3, 4])             }
+	],
+	[#Rule 68
+		 'final', 5,
+sub
+#line 191 "Parser.yp"
+{ unshift(@{$_[5]}, [ @_[2,4] ]);
+                                      $_[5];                              }
+	],
+	[#Rule 69
+		 'final', 5,
+sub
+#line 194 "Parser.yp"
+{ unshift(@{$_[5]}, [ undef, $_[4] ]);
+                                      $_[5];                              }
+	],
+	[#Rule 70
+		 'final', 4,
+sub
+#line 197 "Parser.yp"
+{ unshift(@{$_[4]}, [ undef, $_[3] ]);
+                                      $_[4];                              }
+	],
+	[#Rule 71
+		 'final', 3,
+sub
+#line 199 "Parser.yp"
+{ [ $_[3] ]                           }
+	],
+	[#Rule 72
+		 'final', 0,
+sub
+#line 200 "Parser.yp"
+{ [ 0 ] }
+	],
+	[#Rule 73
+		 'use', 2,
+sub
+#line 203 "Parser.yp"
+{ $factory->use($_[2])                }
+	],
+	[#Rule 74
+		 '@3-3', 0,
+sub
+#line 206 "Parser.yp"
+{ $_[0]->push_defblock();             }
+	],
+	[#Rule 75
+		 'view', 6,
+sub
+#line 207 "Parser.yp"
+{ $factory->view(@_[2,5], 
+                                                     $_[0]->pop_defblock) }
+	],
+	[#Rule 76
+		 '@4-2', 0,
+sub
+#line 211 "Parser.yp"
+{ ${$_[0]->{ INPERL }}++;             }
+	],
+	[#Rule 77
+		 'perl', 5,
+sub
+#line 212 "Parser.yp"
+{ ${$_[0]->{ INPERL }}--;
+                                      $_[0]->{ EVAL_PERL } 
+                                      ? $factory->perl($_[4])             
+                                      : $factory->no_perl();              }
+	],
+	[#Rule 78
+		 '@5-1', 0,
+sub
+#line 218 "Parser.yp"
+{ ${$_[0]->{ INPERL }}++; 
+                                      $rawstart = ${$_[0]->{'LINE'}};     }
+	],
+	[#Rule 79
+		 'rawperl', 5,
+sub
+#line 220 "Parser.yp"
+{ ${$_[0]->{ INPERL }}--;
+                                      $_[0]->{ EVAL_PERL } 
+                                      ? $factory->rawperl($_[4], $rawstart)
+                                      : $factory->no_perl();              }
+	],
+	[#Rule 80
+		 'filter', 5,
+sub
+#line 227 "Parser.yp"
+{ $factory->filter(@_[2,4])           }
+	],
+	[#Rule 81
+		 'filter', 3,
+sub
+#line 229 "Parser.yp"
+{ $factory->filter(@_[3,1])           }
+	],
+	[#Rule 82
+		 'defblock', 5,
+sub
+#line 234 "Parser.yp"
+{ my $name = join('/', @{ $_[0]->{ DEFBLOCKS } });
+                                      pop(@{ $_[0]->{ DEFBLOCKS } });
+                                      $_[0]->define_block($name, $_[4]); 
+                                      undef
+                                    }
+	],
+	[#Rule 83
+		 'defblockname', 2,
+sub
+#line 241 "Parser.yp"
+{ push(@{ $_[0]->{ DEFBLOCKS } }, $_[2]);
+                                      $_[2];
+                                    }
+	],
+	[#Rule 84
+		 'blockname', 1, undef
+	],
+	[#Rule 85
+		 'blockname', 1,
+sub
+#line 247 "Parser.yp"
+{ $_[1] =~ s/^'(.*)'$/$1/; $_[1]      }
+	],
+	[#Rule 86
+		 'blockargs', 1, undef
+	],
+	[#Rule 87
+		 'blockargs', 0, undef
+	],
+	[#Rule 88
+		 'anonblock', 5,
+sub
+#line 255 "Parser.yp"
+{ local $" = ', ';
+                                      print STDERR "experimental block args: [@{ $_[2] }]\n"
+                                          if $_[2];
+                                      $factory->anon_block($_[4])         }
+	],
+	[#Rule 89
+		 'capture', 3,
+sub
+#line 261 "Parser.yp"
+{ $factory->capture(@_[1, 3])         }
+	],
+	[#Rule 90
+		 'macro', 6,
+sub
+#line 265 "Parser.yp"
+{ $factory->macro(@_[2, 6, 4])        }
+	],
+	[#Rule 91
+		 'macro', 3,
+sub
+#line 266 "Parser.yp"
+{ $factory->macro(@_[2, 3])           }
+	],
+	[#Rule 92
+		 'mdir', 1, undef
+	],
+	[#Rule 93
+		 'mdir', 4,
+sub
+#line 270 "Parser.yp"
+{ $_[3]                               }
+	],
+	[#Rule 94
+		 'margs', 2,
+sub
+#line 273 "Parser.yp"
+{ push(@{$_[1]}, $_[2]); $_[1]        }
+	],
+	[#Rule 95
+		 'margs', 2,
+sub
+#line 274 "Parser.yp"
+{ $_[1]                               }
+	],
+	[#Rule 96
+		 'margs', 1,
+sub
+#line 275 "Parser.yp"
+{ [ $_[1] ]                           }
+	],
+	[#Rule 97
+		 'metadata', 2,
+sub
+#line 278 "Parser.yp"
+{ push(@{$_[1]}, @{$_[2]}); $_[1]     }
+	],
+	[#Rule 98
+		 'metadata', 2, undef
+	],
+	[#Rule 99
+		 'metadata', 1, undef
+	],
+	[#Rule 100
+		 'meta', 3,
+sub
+#line 283 "Parser.yp"
+{ for ($_[3]) { s/^'//; s/'$//; 
+                                                       s/\\'/'/g  }; 
+                                         [ @_[1,3] ] }
+	],
+	[#Rule 101
+		 'meta', 5,
+sub
+#line 286 "Parser.yp"
+{ [ @_[1,4] ] }
+	],
+	[#Rule 102
+		 'meta', 3,
+sub
+#line 287 "Parser.yp"
+{ [ @_[1,3] ] }
+	],
+	[#Rule 103
+		 'term', 1, undef
+	],
+	[#Rule 104
+		 'term', 1, undef
+	],
+	[#Rule 105
+		 'lterm', 3,
+sub
+#line 299 "Parser.yp"
+{ "[ $_[2] ]"                         }
+	],
+	[#Rule 106
+		 'lterm', 3,
+sub
+#line 300 "Parser.yp"
+{ "[ $_[2] ]"                         }
+	],
+	[#Rule 107
+		 'lterm', 2,
+sub
+#line 301 "Parser.yp"
+{ "[ ]"                               }
+	],
+	[#Rule 108
+		 'lterm', 3,
+sub
+#line 302 "Parser.yp"
+{ "{ $_[2]  }"                        }
+	],
+	[#Rule 109
+		 'sterm', 1,
+sub
+#line 305 "Parser.yp"
+{ $factory->ident($_[1])              }
+	],
+	[#Rule 110
+		 'sterm', 2,
+sub
+#line 306 "Parser.yp"
+{ $factory->identref($_[2])           }
+	],
+	[#Rule 111
+		 'sterm', 3,
+sub
+#line 307 "Parser.yp"
+{ $factory->quoted($_[2])             }
+	],
+	[#Rule 112
+		 'sterm', 1, undef
+	],
+	[#Rule 113
+		 'sterm', 1, undef
+	],
+	[#Rule 114
+		 'list', 2,
+sub
+#line 312 "Parser.yp"
+{ "$_[1], $_[2]"                      }
+	],
+	[#Rule 115
+		 'list', 2, undef
+	],
+	[#Rule 116
+		 'list', 1, undef
+	],
+	[#Rule 117
+		 'range', 3,
+sub
+#line 317 "Parser.yp"
+{ $_[1] . '..' . $_[3]                }
+	],
+	[#Rule 118
+		 'hash', 1, undef
+	],
+	[#Rule 119
+		 'hash', 0,
+sub
+#line 322 "Parser.yp"
+{ "" }
+	],
+	[#Rule 120
+		 'params', 2,
+sub
+#line 325 "Parser.yp"
+{ "$_[1], $_[2]"                      }
+	],
+	[#Rule 121
+		 'params', 2, undef
+	],
+	[#Rule 122
+		 'params', 1, undef
+	],
+	[#Rule 123
+		 'param', 3,
+sub
+#line 330 "Parser.yp"
+{ "$_[1] => $_[3]"                    }
+	],
+	[#Rule 124
+		 'param', 3,
+sub
+#line 331 "Parser.yp"
+{ "$_[1] => $_[3]"                    }
+	],
+	[#Rule 125
+		 'ident', 3,
+sub
+#line 334 "Parser.yp"
+{ push(@{$_[1]}, @{$_[3]}); $_[1]     }
+	],
+	[#Rule 126
+		 'ident', 3,
+sub
+#line 335 "Parser.yp"
+{ push(@{$_[1]}, 
+                                           map {($_, 0)} split(/\./, $_[3]));
+                                      $_[1];                              }
+	],
+	[#Rule 127
+		 'ident', 1, undef
+	],
+	[#Rule 128
+		 'node', 1,
+sub
+#line 341 "Parser.yp"
+{ [ $_[1], 0 ]                        }
+	],
+	[#Rule 129
+		 'node', 4,
+sub
+#line 342 "Parser.yp"
+{ [ $_[1], $factory->args($_[3]) ]    }
+	],
+	[#Rule 130
+		 'item', 1,
+sub
+#line 345 "Parser.yp"
+{ "'$_[1]'"                           }
+	],
+	[#Rule 131
+		 'item', 3,
+sub
+#line 346 "Parser.yp"
+{ $_[2]                               }
+	],
+	[#Rule 132
+		 'item', 2,
+sub
+#line 347 "Parser.yp"
+{ $_[0]->{ V1DOLLAR }
+                                       ? "'$_[2]'" 
+                                       : $factory->ident(["'$_[2]'", 0])  }
+	],
+	[#Rule 133
+		 'expr', 3,
+sub
+#line 352 "Parser.yp"
+{ "$_[1] $_[2] $_[3]"                 }
+	],
+	[#Rule 134
+		 'expr', 3,
+sub
+#line 353 "Parser.yp"
+{ "$_[1] $_[2] $_[3]"                 }
+	],
+	[#Rule 135
+		 'expr', 3,
+sub
+#line 354 "Parser.yp"
+{ "$_[1] $_[2] $_[3]"                 }
+	],
+	[#Rule 136
+		 'expr', 3,
+sub
+#line 355 "Parser.yp"
+{ "int($_[1] / $_[3])"                }
+	],
+	[#Rule 137
+		 'expr', 3,
+sub
+#line 356 "Parser.yp"
+{ "$_[1] % $_[3]"                     }
+	],
+	[#Rule 138
+		 'expr', 3,
+sub
+#line 357 "Parser.yp"
+{ "$_[1] $CMPOP{ $_[2] } $_[3]"       }
+	],
+	[#Rule 139
+		 'expr', 3,
+sub
+#line 358 "Parser.yp"
+{ "$_[1]  . $_[3]"                    }
+	],
+	[#Rule 140
+		 'expr', 3,
+sub
+#line 359 "Parser.yp"
+{ "$_[1] && $_[3]"                    }
+	],
+	[#Rule 141
+		 'expr', 3,
+sub
+#line 360 "Parser.yp"
+{ "$_[1] || $_[3]"                    }
+	],
+	[#Rule 142
+		 'expr', 2,
+sub
+#line 361 "Parser.yp"
+{ "! $_[2]"                           }
+	],
+	[#Rule 143
+		 'expr', 5,
+sub
+#line 362 "Parser.yp"
+{ "$_[1] ? $_[3] : $_[5]"             }
+	],
+	[#Rule 144
+		 'expr', 3,
+sub
+#line 363 "Parser.yp"
+{ $factory->assign(@{$_[2]})          }
+	],
+	[#Rule 145
+		 'expr', 3,
+sub
+#line 364 "Parser.yp"
+{ "($_[2])"                           }
+	],
+	[#Rule 146
+		 'expr', 1, undef
+	],
+	[#Rule 147
+		 'setlist', 2,
+sub
+#line 368 "Parser.yp"
+{ push(@{$_[1]}, @{$_[2]}); $_[1]     }
+	],
+	[#Rule 148
+		 'setlist', 2, undef
+	],
+	[#Rule 149
+		 'setlist', 1, undef
+	],
+	[#Rule 150
+		 'assign', 3,
+sub
+#line 374 "Parser.yp"
+{ [ $_[1], $_[3] ]                    }
+	],
+	[#Rule 151
+		 'assign', 3,
+sub
+#line 375 "Parser.yp"
+{ [ @_[1,3] ]                         }
+	],
+	[#Rule 152
+		 'args', 2,
+sub
+#line 382 "Parser.yp"
+{ push(@{$_[1]}, $_[2]); $_[1]        }
+	],
+	[#Rule 153
+		 'args', 2,
+sub
+#line 383 "Parser.yp"
+{ push(@{$_[1]->[0]}, $_[2]); $_[1]   }
+	],
+	[#Rule 154
+		 'args', 4,
+sub
+#line 384 "Parser.yp"
+{ push(@{$_[1]->[0]}, "'', " . 
+                                      $factory->assign(@_[2,4])); $_[1]  }
+	],
+	[#Rule 155
+		 'args', 2,
+sub
+#line 386 "Parser.yp"
+{ $_[1]                               }
+	],
+	[#Rule 156
+		 'args', 0,
+sub
+#line 387 "Parser.yp"
+{ [ [ ] ]                             }
+	],
+	[#Rule 157
+		 'lnameargs', 3,
+sub
+#line 397 "Parser.yp"
+{ push(@{$_[3]}, $_[1]); $_[3]        }
+	],
+	[#Rule 158
+		 'lnameargs', 1, undef
+	],
+	[#Rule 159
+		 'lvalue', 1, undef
+	],
+	[#Rule 160
+		 'lvalue', 3,
+sub
+#line 402 "Parser.yp"
+{ $factory->quoted($_[2])             }
+	],
+	[#Rule 161
+		 'lvalue', 1, undef
+	],
+	[#Rule 162
+		 'nameargs', 3,
+sub
+#line 406 "Parser.yp"
+{ [ [$factory->ident($_[2])], $_[3] ]   }
+	],
+	[#Rule 163
+		 'nameargs', 2,
+sub
+#line 407 "Parser.yp"
+{ [ @_[1,2] ] }
+	],
+	[#Rule 164
+		 'nameargs', 4,
+sub
+#line 408 "Parser.yp"
+{ [ @_[1,3] ] }
+	],
+	[#Rule 165
+		 'names', 3,
+sub
+#line 411 "Parser.yp"
+{ push(@{$_[1]}, $_[3]); $_[1] }
+	],
+	[#Rule 166
+		 'names', 1,
+sub
+#line 412 "Parser.yp"
+{ [ $_[1] ]                    }
+	],
+	[#Rule 167
+		 'name', 3,
+sub
+#line 415 "Parser.yp"
+{ $factory->quoted($_[2])  }
+	],
+	[#Rule 168
+		 'name', 1,
+sub
+#line 416 "Parser.yp"
+{ "'$_[1]'" }
+	],
+	[#Rule 169
+		 'name', 1, undef
+	],
+	[#Rule 170
+		 'filename', 3,
+sub
+#line 420 "Parser.yp"
+{ "$_[1].$_[3]" }
+	],
+	[#Rule 171
+		 'filename', 1, undef
+	],
+	[#Rule 172
+		 'filepart', 1, undef
+	],
+	[#Rule 173
+		 'filepart', 1, undef
+	],
+	[#Rule 174
+		 'filepart', 1, undef
+	],
+	[#Rule 175
+		 'quoted', 2,
+sub
+#line 434 "Parser.yp"
+{ push(@{$_[1]}, $_[2]) 
+                                          if defined $_[2]; $_[1]         }
+	],
+	[#Rule 176
+		 'quoted', 0,
+sub
+#line 436 "Parser.yp"
+{ [ ]                                 }
+	],
+	[#Rule 177
+		 'quotable', 1,
+sub
+#line 439 "Parser.yp"
+{ $factory->ident($_[1])              }
+	],
+	[#Rule 178
+		 'quotable', 1,
+sub
+#line 440 "Parser.yp"
+{ $factory->text($_[1])               }
+	],
+	[#Rule 179
+		 'quotable', 1,
+sub
+#line 441 "Parser.yp"
+{ undef                               }
+	]
+];
+
+
+
+1;
+
+__END__
+
+=head1 NAME
+
+Template::Grammar - Parser state/rule tables for the TT grammar
+
+=head1 SYNOPSIS
+
+    # no user serviceable parts inside
+
+=head1 DESCRIPTION
+
+This module defines the state and rule tables that the L<Template::Parser>
+module uses to parse templates.  It is generated from a YACC-like grammar
+using the C<Parse::Yapp> module.  The F<parser> sub-directory of the 
+Template Toolkit source distribution contains the grammar and other 
+files required to generate this module.
+
+But you don't need to worry about any of that unless you're planning to 
+modify the Template Toolkit language.
+
+=head1 AUTHOR
+
+Andy Wardley E<lt>abw at wardley.orgE<gt>
+
+L<http://wardley.org/>
+
+=head1 COPYRIGHT
+
+Copyright (C) 1996-2007 Andy Wardley.  All Rights Reserved.
+
+This module is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=head1 SEE ALSO
+
+L<Template::Parser>
+
+=cut
+
+# Local Variables:
+# mode: perl
+# perl-indent-level: 4
+# indent-tabs-mode: nil
+# End:
+#
+# vim: expandtab shiftwidth=4:
+
+
+
+
+
+
+
+
+
+
diff --git a/bench/perl/Template/Iterator.pm b/bench/perl/Template/Iterator.pm
new file mode 100644
index 0000000..0b55c5c
--- /dev/null
+++ b/bench/perl/Template/Iterator.pm
@@ -0,0 +1,493 @@
+#============================================================= -*-Perl-*-
+#
+# Template::Iterator
+#
+# DESCRIPTION
+#
+#   Module defining an iterator class which is used by the FOREACH
+#   directive for iterating through data sets.  This may be
+#   sub-classed to define more specific iterator types.
+#
+# AUTHOR
+#   Andy Wardley   <abw at wardley.org>
+#
+# COPYRIGHT
+#   Copyright (C) 1996-2007 Andy Wardley.  All Rights Reserved.
+#
+#   This module is free software; you can redistribute it and/or
+#   modify it under the same terms as Perl itself.
+#
+#============================================================================
+
+package Template::Iterator;
+
+use strict;
+use warnings;
+use base 'Template::Base';
+use Template::Constants;
+use Template::Exception;
+use Scalar::Util qw(blessed);
+
+use constant ODD  => 'odd';     # the two strings returned by parity()
+use constant EVEN => 'even';
+
+our $VERSION = 2.68;
+our $DEBUG   = 0 unless defined $DEBUG;
+our $AUTOLOAD;                  # name of the called method; read in AUTOLOAD()
+
+#========================================================================
+#                      -----  CLASS METHODS -----
+#========================================================================
+
+#------------------------------------------------------------------------
+# new(\@target, \%options)
+#
+# Constructor method which creates and returns a reference to a new 
+# Template::Iterator object.  A reference to the target data (array
+# or hash) may be passed for the object to iterate through.
+#------------------------------------------------------------------------
+
+sub new {
+    my ($class, $data, $params) = @_;
+    $data   ||= [ ];    # missing or false data becomes an empty list
+    $params ||= { };    # accepted, but not consulted by this constructor
+    my $type = ref $data;
+    if ($type eq 'HASH') {
+        # expand a hash into { key => ..., value => ... } records,
+        # one per key, in sorted key order
+        my @records;
+        foreach my $key (sort keys %$data) {
+            push @records, { key => $key, value => $data->{ $key } };
+        }
+        $data = \@records;
+    }
+    elsif (blessed($data) && $data->can('as_list')) {
+        # an object can supply its own list through as_list()
+        $data = $data->as_list();
+    }
+    elsif ($type ne 'ARRAY') {
+        # coerce any other kind of value into a one-item list
+        $data = [ $data ];
+    }
+    return bless { _DATA => $data, _ERROR => '' }, $class;
+}
+
+
+#========================================================================
+#                   -----  PUBLIC OBJECT METHODS -----
+#========================================================================
+
+#------------------------------------------------------------------------
+# get_first()
+#
+# Initialises the object for iterating through the target data set.  The 
+# first record is returned, if defined, along with the STATUS_OK value.
+# If there is no target data, or the data is an empty set, then undef 
+# is returned with the STATUS_DONE value.  
+#------------------------------------------------------------------------
+
+sub get_first {
+    my $self = shift;
+    my $list = $self->{ _DATASET } = $self->{ _DATA };
+    my $size = scalar @$list;
+    # an empty data set is finished before it has started
+    return (undef, Template::Constants::STATUS_DONE) unless $size;
+    # reset the position counters and boundary flags for the first item
+    $self->{ SIZE  } = $size;
+    $self->{ MAX   } = $size - 1;
+    $self->{ INDEX } = 0;
+    $self->{ COUNT } = 1;
+    $self->{ FIRST } = 1;
+    $self->{ LAST  } = $size > 1 ? 0 : 1;
+    $self->{ PREV  } = undef;
+    $self->{ NEXT  } = $list->[1];     # undef when there is only one item
+    return $list->[0];
+}
+
+
+
+#------------------------------------------------------------------------
+# get_next()
+#
+# Called repeatedly to access successive elements in the data set.
+# Should only be called after calling get_first() or a warning will 
+# be raised and (undef, STATUS_DONE) returned.
+#------------------------------------------------------------------------
+
+sub get_next {
+    my $self = shift;
+    my $pos  = $self->{ INDEX };
+    my $last = $self->{ MAX };
+    my $list = $self->{ _DATASET };
+
+    # INDEX is undefined until get_first() has run: warn about the misuse
+    if (! defined $pos) {
+        my ($pack, $file, $line) = caller();
+        warn("iterator get_next() called before get_first() at $file line $line\n");
+        return (undef, Template::Constants::STATUS_DONE);   ## RETURN ##
+    }
+
+    # nothing is left once the current position has reached the final index
+    return (undef, Template::Constants::STATUS_DONE)        ## RETURN ##
+        unless $pos < $last;
+
+    # step forward, then refresh the counters, flags and neighbours
+    $pos++;
+    @$self{ qw( INDEX COUNT ) } = ( $pos, $pos + 1 );
+    @$self{ qw( FIRST LAST  ) } = ( 0, $pos == $last ? 1 : 0 );
+    $self->{ PREV } = $list->[ $pos - 1 ];
+    $self->{ NEXT } = $list->[ $pos + 1 ];
+    return $list->[ $pos ];                                 ## RETURN ##
+}
+
+
+#------------------------------------------------------------------------
+# get_all()
+#
+# Method which returns all remaining items in the iterator as a Perl list
+# reference.  May be called at any time in the life-cycle of the iterator.
+# The get_first() method will be called automatically if necessary, and
+# then subsequent get_next() calls are made, storing each returned 
+# result until the list is exhausted.  
+#------------------------------------------------------------------------
+
+sub get_all {
+    my $self = shift;
+    my ($max, $index) = @$self{ qw( MAX INDEX ) };
+    my @data;
+
+    # INDEX stays undefined until get_first() has run, so start the
+    # iteration here and keep the first item as part of the result.
+    unless (defined $index) {
+        my ($first, $status) = $self->get_first;
+
+        # an empty data set has nothing to return
+        return (undef, Template::Constants::STATUS_DONE)    ## RETURN ##
+            if $status && $status == Template::Constants::STATUS_DONE;
+
+        # pick up the MAX and INDEX values that get_first() has just set
+        ($max, $index) = @$self{ qw( MAX INDEX ) };
+        push @data, $first;
+
+        # a single-item list is complete; get_first() has set all the flags
+        return \@data unless $index < $max;                 ## RETURN ##
+    }
+
+    # the iterator is already on the final item: nothing remains
+    return (undef, Template::Constants::STATUS_DONE)        ## RETURN ##
+        unless $index < $max;
+
+    # collect every item after the current one
+    my $dataset = $self->{ _DATASET };
+    push @data, @$dataset[ $index + 1 .. $max ];
+
+    # Leave the iterator on the final item.  PREV and NEXT are refreshed as
+    # well, in the way get_next() does it, so that they no longer describe
+    # the neighbours of the item that was current before this call.
+    @$self{ qw( INDEX COUNT FIRST LAST ) }
+        = ( $max, $max + 1, 0, 1 );
+    @$self{ qw( PREV NEXT ) }
+        = ( $dataset->[ $max - 1 ], undef );
+
+    return \@data;                                          ## RETURN ##
+}
+
+# odd(): returns 1 when the current COUNT is odd, otherwise 0.
+# get_first() starts COUNT at 1, so the first item counts as odd.
+sub odd { return $_[0]->{ COUNT } % 2 ? 1 : 0 }
+
+# even(): returns 1 when the current COUNT is even, otherwise 0.
+# get_first() starts COUNT at 1, so the second item is the first even one.
+sub even { return $_[0]->{ COUNT } % 2 ? 0 : 1 }
+
+# parity(): returns the ODD constant when the current COUNT is odd and
+# the EVEN constant when it is even.
+sub parity { return $_[0]->{ COUNT } % 2 ? ODD : EVEN }
+
+
+#------------------------------------------------------------------------
+# AUTOLOAD
+#
+# Provides access to internal fields (e.g. size, first, last, max, etc)
+#------------------------------------------------------------------------
+
+sub AUTOLOAD {
+    my $self = shift;
+    # reduce the fully qualified name in $AUTOLOAD to the bare method name
+    (my $field = $AUTOLOAD) =~ s/.*:://;
+
+    # object destruction is routed here as well; there is nothing to do
+    return if $field eq 'DESTROY';
+    # any name containing "number" (in either case) is a backwards
+    # compatible alias for COUNT; other names are upper-cased to form the key
+    return $self->{ $field =~ /NUMBER/i ? 'COUNT' : uc $field };
+}
+
+
+#========================================================================
+#                   -----  PRIVATE DEBUG METHODS -----
+#========================================================================
+
+#------------------------------------------------------------------------
+# _dump()
+#
+# Debug method which returns a string detailing the internal state of 
+# the iterator object.
+#------------------------------------------------------------------------
+
+sub _dump {
+    my $self = shift;       # the counters are undefined until get_first() has run
+    return join('',
+         "  Data: ", $self->{ _DATA  }, "\n",
+         " Index: ", $self->{ INDEX  }, "\n",
+         "Number: ", $self->{ COUNT  }, "\n",   # stored as COUNT, not NUMBER
+         "   Max: ", $self->{ MAX    }, "\n",
+         "  Size: ", $self->{ SIZE   }, "\n",
+         " First: ", $self->{ FIRST  }, "\n",
+         "  Last: ", $self->{ LAST   }, "\n",
+         "\n"
+     );
+}
+
+
+1;
+
+__END__
+
+=head1 NAME
+
+Template::Iterator - Data iterator used by the FOREACH directive
+
+=head1 SYNOPSIS
+
+    my $iter = Template::Iterator->new(\@data, \%options);
+
+=head1 DESCRIPTION
+
+The C<Template::Iterator> module defines a generic data iterator for use 
+by the C<FOREACH> directive.  
+
+It may be used as the base class for custom iterators.
+
+=head1 PUBLIC METHODS
+
+=head2 new($data) 
+
+Constructor method.  A reference to a list of values is passed as the
+first parameter.  Subsequent calls to L<get_first()> and L<get_next()>
+will return each element from the list.
+
+    my $iter = Template::Iterator->new([ 'foo', 'bar', 'baz' ]);
+
+The constructor will also accept a reference to a hash array and will 
+expand it into a list in which each entry is a hash array containing
+a 'C<key>' and 'C<value>' item, sorted according to the hash keys.
+
+    my $iter = Template::Iterator->new({ 
+        foo => 'Foo Item',
+        bar => 'Bar Item',
+    });
+
+This is equivalent to:
+
+    my $iter = Template::Iterator->new([
+        { key => 'bar', value => 'Bar Item' },
+        { key => 'foo', value => 'Foo Item' },
+    ]);
+
+When passed a single item which is not an array reference, the constructor
+will automatically create a list containing that single item.
+
+    my $iter = Template::Iterator->new('foo');
+
+This is equivalent to:
+
+    my $iter = Template::Iterator->new([ 'foo' ]);
+
+Note that a single item which is an object based on a blessed ARRAY 
+reference will NOT be treated as an array and will be folded into 
+a list containing that one object reference.
+
+    my $list = bless [ 'foo', 'bar' ], 'MyListClass';
+    my $iter = Template::Iterator->new($list);
+
+equivalent to:
+
+    my $iter = Template::Iterator->new([ $list ]);
+
+If the object provides an C<as_list()> method then the L<Template::Iterator>
+constructor will call it to get the list of data.  First, a class without one:
+
+    package MyListObject;
+    
+    sub new {
+        my $class = shift;
+        bless [ @_ ], $class;
+    }
+
+    package main;
+    
+    my $list = MyListObject->new('foo', 'bar');
+    my $iter = Template::Iterator->new($list);
+
+This is then functionally equivalent to:
+
+    my $iter = Template::Iterator->new([ $list ]);
+
+The iterator will return only one item, a reference to the C<MyListObject>
+object, C<$list>.
+
+By adding an C<as_list()> method to the C<MyListObject> class, we can force
+the C<Template::Iterator> constructor to treat the object as a list and 
+use the data contained within.
+
+    package MyListObject;
+    
+    ...
+    
+    sub as_list {
+        my $self = shift;
+        return $self;
+    }
+    
+    package main;
+    
+    my $list = MyListObject->new('foo', 'bar');
+    my $iter = Template::Iterator->new($list);
+
+The iterator will now return the two items, 'C<foo>' and 'C<bar>', which the 
+C<MyListObject> encapsulates.
+
+=head2 get_first()
+
+Returns a C<($value, $error)> pair for the first item in the iterator set.
+The C<$error> returned may be zero or undefined to indicate a valid datum
+was successfully returned.  Returns an error of C<STATUS_DONE> if the list 
+is empty.
+
+=head2 get_next()
+
+Returns a C<($value, $error)> pair for the next item in the iterator set.
+Returns an error of C<STATUS_DONE> if all items in the list have been 
+visited.
+
+=head2 get_all()
+
+Returns a C<(\@values, $error)> pair for all remaining items in the iterator 
+set.  Returns an error of C<STATUS_DONE> if all items in the list have been 
+visited.
+
+=head2 size()
+
+Returns the size of the data set or undef if unknown.
+
+=head2 max()
+
+Returns the maximum index number (i.e. the index of the last element) 
+which is equivalent to L<size()> - C<1>.
+
+=head2 index()
+
+Returns the current index number which is in the range C<0> to L<max()>.
+
+=head2 count()
+
+Returns the current iteration count in the range C<1> to L<size()>.  This is
+equivalent to L<index()> + C<1>.  
+
+=head2 first()
+
+Returns a boolean value to indicate if the iterator is currently on 
+the first iteration of the set.
+
+=head2 last()
+
+Returns a boolean value to indicate if the iterator is currently on
+the last iteration of the set.
+
+=head2 prev()
+
+Returns the previous item in the data set, or C<undef> if the iterator is
+on the first item.
+
+=head2 next()
+
+Returns the next item in the data set or C<undef> if the iterator is on the 
+last item.
+
+=head2 parity()
+
+Returns the text string C<even> or C<odd> to indicate the parity of the 
+current iteration count (starting at 1).  This is typically used to create
+striped I<zebra tables>.
+
+    <table>
+    [% FOREACH name IN ['Arthur', 'Ford', 'Trillian'] -%]
+      <tr class="[% loop.parity %]">
+        <td>[% name %]</td>
+      </tr>
+    [% END %]
+    </table>
+
+This will produce the following output:
+
+    <table>
+      <tr class="odd">
+        <td>Arthur</td>
+      </tr>
+      <tr class="even">
+        <td>Ford</td>
+      </tr>
+      <tr class="odd">
+        <td>Trillian</td>
+      </tr>
+    </table>
+
+You can then style the C<tr.odd> and C<tr.even> elements using CSS:
+
+    tr.odd td {
+        background-color: black;
+        color: white;
+    }
+    
+    tr.even td {
+        background-color: white;
+        color: black;
+    }
+
+=head2 odd()
+
+Returns a boolean (0/1) value to indicate if the current iterator count
+(starting at 1) is an odd number. In other words, this will return a true
+value for the first iterator, the third, fifth, and so on.
+
+=head2 even()
+
+Returns a boolean (0/1) value to indicate if the current iterator count
+(starting at 1) is an even number. In other words, this will return a true
+value for the second iteration, the fourth, sixth, and so on.
+
+=head1 AUTHOR
+
+Andy Wardley E<lt>abw at wardley.orgE<gt> L<http://wardley.org/>
+
+=head1 COPYRIGHT
+
+Copyright (C) 1996-2007 Andy Wardley.  All Rights Reserved.
+
+This module is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=head1 SEE ALSO
+
+L<Template>
+
+=cut
+
+# Local Variables:
+# mode: perl
+# perl-indent-level: 4
+# indent-tabs-mode: nil
+# End:
+#
+# vim: expandtab shiftwidth=4:
diff --git a/bench/perl/Template/Namespace/Constants.pm b/bench/perl/Template/Namespace/Constants.pm
new file mode 100644
index 0000000..aeb0166
--- /dev/null
+++ b/bench/perl/Template/Namespace/Constants.pm
@@ -0,0 +1,176 @@
+#================================================================= -*-Perl-*- 
+#
+# Template::Namespace::Constants
+#
+# DESCRIPTION
+#   Plugin compiler module for performing constant folding at compile time
+#   on variables in a particular namespace.
+#
+# AUTHOR
+#   Andy Wardley   <abw at wardley.org>
+#
+# COPYRIGHT
+#   Copyright (C) 1996-2007 Andy Wardley.  All Rights Reserved.
+#
+#   This module is free software; you can redistribute it and/or
+#   modify it under the same terms as Perl itself.
+#
+#============================================================================
+
+package Template::Namespace::Constants;
+
+use strict;
+use warnings;
+use base 'Template::Base';
+use Template::Config;
+use Template::Directive;
+use Template::Exception;
+
+our $VERSION = 1.27;
+our $DEBUG   = 0 unless defined $DEBUG;     # true enables the tracing in ident()
+
+
+sub _init {
+    my ($self, $config) = @_;
+    # create the stash for the constants, or pass on Template::Config's error
+    my $stash = Template::Config->stash($config)
+        || return $self->error(Template::Config->error());
+    $self->{ STASH } = $stash;
+    return $self;
+}
+
+#------------------------------------------------------------------------
+# ident(\@ident)                                             foo.bar(baz)
+# Returns the constant as a single-quoted literal when it can be resolved
+# now; otherwise returns the result of Template::Directive->ident().
+#------------------------------------------------------------------------
+
+sub ident {
+    my ($self, $ident) = @_;
+    my @save = @$ident;         # untouched copy, passed on if we defer
+
+    # discard first node indicating constants namespace (alters caller's list)
+    splice(@$ident, 0, 2);
+
+    my $nelems = @$ident / 2;   # the list holds (name, args) pairs
+    my ($e, $result);
+    local $" = ', ';            # separator used when @$ident is interpolated
+
+    print STDERR "constant ident [ @$ident ] " if $DEBUG;
+
+    foreach $e (0..$nelems-1) {
+        # node name must be a single-quoted constant, which is unquoted in place
+        unless ($ident->[$e * 2] =~ s/^'(.+)'$/$1/s) {
+            $self->DEBUG(" * deferred (non-constant item: ", $ident->[$e * 2], ")\n")
+                if $DEBUG;
+            return Template::Directive->ident(\@save);
+        }
+
+        # if args is non-zero then it must be eval'ed 
+        if ($ident->[$e * 2 + 1]) {
+            my $args = $ident->[$e * 2 + 1];
+            my $comp = eval "$args";    # NOTE(review): string eval of the args text
+            if ($@) {
+                $self->DEBUG(" * deferred (non-constant args: $args)\n") if $DEBUG;
+                return Template::Directive->ident(\@save);
+            }
+            $self->DEBUG("($args) ") if $comp && $DEBUG;
+            $ident->[$e * 2 + 1] = $comp;
+        }
+    }
+
+    $result = $self->{ STASH }->get($ident);
+
+    if (! length $result || ref $result) {  # nothing, or a ref: cannot be folded
+        my $reason = length $result ? 'reference' : 'no result';
+        $self->DEBUG(" * deferred ($reason)\n") if $DEBUG;
+        return Template::Directive->ident(\@save);
+    }
+
+    $result =~ s/'/\\'/g;       # escape quotes for the single-quoted literal
+    $self->DEBUG(" * resolved => '$result'\n") if $DEBUG;
+
+    return "'$result'";
+}
+
+1;
+
+__END__
+
+=head1 NAME
+
+Template::Namespace::Constants - Compile time constant folding
+
+=head1 SYNOPSIS
+
+    # easy way to define constants
+    use Template;
+    
+    my $tt = Template->new({
+        CONSTANTS => {
+            pi => 3.14,
+            e  => 2.718,
+        },
+    });
+
+    # nitty-gritty, hands-dirty way
+    use Template::Namespace::Constants;
+    
+    my $tt = Template->new({
+        NAMESPACE => {
+            constants => Template::Namespace::Constants->new({
+                pi => 3.14,
+                e  => 2.718,
+            }),
+        },
+    });
+
+=head1 DESCRIPTION
+
+The C<Template::Namespace::Constants> module implements a namespace handler
+which is plugged into the C<Template::Directive> compiler module.  This then
+performs compile time constant folding of variables in a particular namespace.
+
+=head1 METHODS
+
+=head2 new(\%constants)
+
+The new() constructor method creates and returns a reference to a new
+Template::Namespace::Constants object.  This creates an internal stash
+to store the constant variable definitions passed as arguments.
+
+    my $handler = Template::Namespace::Constants->new({
+        pi => 3.14,
+        e  => 2.718,
+    });
+
+=head2 ident(\@ident)
+
+Method called to resolve a variable identifier into a compiled form.  In this
+case, the method fetches the corresponding constant value from its internal
+stash and returns it.
+
+=head1 AUTHOR
+
+Andy Wardley E<lt>abw at wardley.orgE<gt> L<http://wardley.org/>
+
+=head1 COPYRIGHT
+
+Copyright (C) 1996-2007 Andy Wardley.  All Rights Reserved.
+
+This module is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=head1 SEE ALSO
+
+L<Template::Directive>
+
+=cut
+
+# Local Variables:
+# mode: perl
+# perl-indent-level: 4
+# indent-tabs-mode: nil
+# End:
+#
+# vim: expandtab shiftwidth=4:
diff --git a/bench/perl/Template/Parser.pm b/bench/perl/Template/Parser.pm
new file mode 100644
index 0000000..225a3b3
--- /dev/null
+++ b/bench/perl/Template/Parser.pm
@@ -0,0 +1,1131 @@
+#============================================================= -*-Perl-*-
+#
+# Template::Parser
+#
+# DESCRIPTION
+#   This module implements a LALR(1) parser and associated support 
+#   methods to parse template documents into the appropriate "compiled"
+#   format.  Much of the parser DFA code (see _parse() method) is based 
+#   on Francois Desarmenien's Parse::Yapp module.  Kudos to him.
+# 
+# AUTHOR
+#   Andy Wardley <abw at wardley.org>
+#
+# COPYRIGHT
+#   Copyright (C) 1996-2007 Andy Wardley.  All Rights Reserved.
+#
+#   This module is free software; you can redistribute it and/or
+#   modify it under the same terms as Perl itself.
+#
+#   The following copyright notice appears in the Parse::Yapp 
+#   documentation.  
+#
+#      The Parse::Yapp module and its related modules and shell
+#      scripts are copyright (c) 1998 Francois Desarmenien,
+#      France. All rights reserved.
+#
+#      You may use and distribute them under the terms of either
+#      the GNU General Public License or the Artistic License, as
+#      specified in the Perl README file.
+# 
+#============================================================================
+
+package Template::Parser;
+
+use strict;
+use warnings;
+use base 'Template::Base';
+
+use Template::Constants qw( :status :chomp );
+use Template::Directive;
+use Template::Grammar;
+
+# parser state constants
+use constant CONTINUE => 0;
+use constant ACCEPT   => 1;
+use constant ERROR    => 2;
+use constant ABORT    => 3;
+
+our $VERSION = 2.89;
+our $DEBUG   = 0 unless defined $DEBUG;    # 1 = parser debugging, else a bitmask (see new())
+our $ERROR   = '';
+
+
+#========================================================================
+#                        -- COMMON TAG STYLES --
+#========================================================================
+
+our $TAG_STYLE   = {                     # style name => [ start, end ] regex source
+    'default'   => [ '\[%',    '%\]'    ],
+    'template1' => [ '[\[%]%', '%[\]%]' ],
+    'metatext'  => [ '%%',     '%%'     ],
+    'html'      => [ '<!--',   '-->'    ],
+    'mason'     => [ '<%',     '>'      ],
+    'asp'       => [ '<%',     '%>'     ],
+    'php'       => [ '<\?',    '\?>'    ],
+    'star'      => [ '\[\*',   '\*\]'   ],
+};
+$TAG_STYLE->{ template } = $TAG_STYLE->{ tt2 } = $TAG_STYLE->{ default };
+
+# settings that new_style() starts from when no style has been stacked yet
+our $DEFAULT_STYLE = {
+    START_TAG   => $TAG_STYLE->{ default }->[0],
+    END_TAG     => $TAG_STYLE->{ default }->[1],
+#    TAG_STYLE   => 'default',
+    ANYCASE     => 0,
+    INTERPOLATE => 0,
+    PRE_CHOMP   => 0,
+    POST_CHOMP  => 0,
+    V1DOLLAR    => 0,
+    EVAL_PERL   => 0,
+};
+
+our $QUOTED_ESCAPES = {                  # escape letter => character it stands for
+        n => "\n",
+        r => "\r",
+        t => "\t",
+};
+
+# note that '-' must come first so Perl doesn't think it denotes a range
+our $CHOMP_FLAGS  = qr/[-=~+]/;          # chomp flag characters, see split_text()
+
+
+
+#========================================================================
+#                      -----  PUBLIC METHODS -----
+#========================================================================
+
+#------------------------------------------------------------------------
+# new(\%config)
+#
+# Constructor method. 
+#------------------------------------------------------------------------
+
+sub new {
+    my $class  = shift;
+    my $config = $_[0] && ref($_[0]) eq 'HASH' ? shift(@_) : { @_ };
+
+    # default settings; FACTORY defaults to the Template::Directive class name
+    my $self = bless { 
+        START_TAG   => undef,
+        END_TAG     => undef,
+        TAG_STYLE   => 'default',
+        ANYCASE     => 0,
+        INTERPOLATE => 0,
+        PRE_CHOMP   => 0,
+        POST_CHOMP  => 0,
+        V1DOLLAR    => 0,
+        EVAL_PERL   => 0,
+        FILE_INFO   => 1,
+        GRAMMAR     => undef,
+        _ERROR      => '',
+        IN_BLOCK    => [ ],
+        FACTORY     => $config->{ FACTORY } || 'Template::Directive',
+    }, $class;
+
+    # any defined config value replaces the default of the same name
+    foreach my $name (keys %$self) {
+        next unless defined $config->{ $name };
+        $self->{ $name } = $config->{ $name };
+    }
+    $self->{ FILEINFO } = [ ];
+
+    # Work out the debug flags.  A DEBUG config item is treated as a
+    # bitmask, and so is the $DEBUG package variable, except that a value
+    # of 1 selects parser debugging alone to support previous behaviour.
+    my $debug = $config->{ DEBUG };
+    if (! defined $debug && $DEBUG == 1) {
+        $self->{ DEBUG }      = Template::Constants::DEBUG_PARSER;
+        $self->{ DEBUG_DIRS } = 0;
+    }
+    else {
+        $debug = $DEBUG unless defined $debug;
+        $self->{ DEBUG }      = $debug & ( Template::Constants::DEBUG_PARSER
+                                         | Template::Constants::DEBUG_FLAGS );
+        $self->{ DEBUG_DIRS } = $debug & Template::Constants::DEBUG_DIRS;
+    }
+
+    # use the grammar given in the config, or create a default one
+    my $grammar = $self->{ GRAMMAR };
+    unless ($grammar) {
+        require Template::Grammar;
+        $grammar = $self->{ GRAMMAR } = Template::Grammar->new();
+    }
+
+    # When NAMESPACE definitions are given and FACTORY is still a class
+    # name, create a factory object so that it includes those definitions.
+    if ($config->{ NAMESPACE } && ! ref $self->{ FACTORY }) {
+        my $fclass = $self->{ FACTORY };
+        $self->{ FACTORY } = $fclass->new( NAMESPACE => $config->{ NAMESPACE } )
+            || return $class->error($fclass->error());
+    }
+
+    # load grammar rules, states and lex table
+    my @tables = qw( LEXTABLE STATES RULES );
+    @$self{ @tables } = @$grammar{ @tables };
+
+    # install the first parser style, built from the config
+    $self->new_style($config)
+        || return $class->error($self->error());
+    return $self;
+}
+
+#-----------------------------------------------------------------------
+# These methods are used to track nested FOR and WHILE blocks.  Each 
+# generated for/while block is given a label indicating the directive 
+# type and nesting depth, e.g. FOR1, WHILE2, FOR3, WHILE4, etc.  The
+# NEXT and LAST directives use the innermost label, e.g. last WHILE4;
+#-----------------------------------------------------------------------
+
+sub enter_block {
+    my ($self, $name) = @_;
+    # open a block: add its name to the stack; returns the new stack depth
+    return push(@{ $self->{ IN_BLOCK } }, $name);
+}
+
+sub leave_block {
+    my ($self) = @_;
+    my $label = $self->block_label;     # the label is read before the pop
+    pop @{ $self->{ IN_BLOCK } };       # close the innermost block
+    return $label;
+}
+
+sub in_block {         # is the innermost open block the one called $name?
+    my ($self, $name) = @_;
+    my @open = @{ $self->{ IN_BLOCK } } or return 0;   # no block is open
+    return $open[-1] eq $name;
+}
+
+sub block_label {
+    my ($self, $prefix, $suffix) = @_;
+    my @open = @{ $self->{ IN_BLOCK } };
+    # innermost block name followed by the nesting depth, e.g. WHILE3
+    my $label;
+    $label = $open[-1] . scalar(@open) if @open;
+    return join('', grep { defined $_ } $prefix, $label, $suffix);
+}
+
+
+
+#------------------------------------------------------------------------
+# new_style(\%config)
+# 
+# Install a new (stacked) parser style.  This feature is currently 
+# experimental but should mimic the previous behaviour with regard to 
+# TAG_STYLE, START_TAG, END_TAG, etc.
+#------------------------------------------------------------------------
+
+sub new_style {
+    my ($self, $config) = @_;
+    my $styles = $self->{ STYLE } ||= [ ];
+
+    # Work on a shallow copy so that the TAG_STYLE expansion below does not
+    # write START_TAG/END_TAG back into the hash that the caller passed in.
+    my %params = %{ $config || { } };
+
+    # explicit START_TAG/END_TAG values take priority over the TAG_STYLE pair
+    if (my $tagstyle = $params{ TAG_STYLE }) {
+        my $tags = $TAG_STYLE->{ $tagstyle };
+        return $self->error("Invalid tag style: $tagstyle") unless defined $tags;
+        $params{ START_TAG } ||= $tags->[0];
+        $params{   END_TAG } ||= $tags->[1];
+    }
+    # clone the current style (or the defaults), then apply defined values
+    my $style = { %{ $styles->[-1] || $DEFAULT_STYLE } };
+    foreach my $key (keys %$DEFAULT_STYLE) {
+        $style->{ $key } = $params{ $key } if defined $params{ $key };
+    }
+    push(@$styles, $style);
+    return $style;
+}
+
+
+#------------------------------------------------------------------------
+# old_style()
+#
+# Pop the current parser style and revert to the previous one.  See 
+# new_style().   ** experimental **
+#------------------------------------------------------------------------
+
+sub old_style {
+    my ($self) = @_;
+    my $stack = $self->{ STYLE };
+    # the last remaining style is never removed
+    return $self->error('only 1 parser style remaining') if @$stack <= 1;
+    pop @$stack;
+    return $stack->[-1];        # the style that is now current
+}
+
+
+#------------------------------------------------------------------------
+# parse($text, $data)
+#
+# Parses the text string, $text and returns a hash array representing
+# the compiled template block(s) as Perl code, in the format expected
+# by Template::Document.
+#------------------------------------------------------------------------
+
+sub parse {
+    my ($self, $text, $info) = @_;
+
+    # a DEBUG value supplied by the caller wins over the DEBUG_DIRS default
+    $info->{ DEBUG } = $self->{ DEBUG_DIRS }
+        unless defined $info->{ DEBUG };
+
+    # reset the per-template state: named blocks (see define_block()),
+    # metadata items, the DEFBLOCKS list and any previous error message
+    my $defblock = $self->{ DEFBLOCK } = { };
+    my $metadata = $self->{ METADATA } = [ ];
+    $self->{ DEFBLOCKS } = [ ];
+    $self->{ _ERROR }    = '';
+
+    # split file into TEXT/DIRECTIVE chunks
+    my $tokens = $self->split_text($text);
+    return undef unless $tokens;                            ## RETURN ##
+
+    # parse the chunks while $info is on top of the FILEINFO stack
+    push(@{ $self->{ FILEINFO } }, $info);
+    my $block = $self->_parse($tokens, $info);
+    pop(@{ $self->{ FILEINFO } });
+
+    return undef unless $block;                             ## RETURN ##
+
+    # report the generated code when parser debugging is enabled
+    if ($self->{ DEBUG } & Template::Constants::DEBUG_PARSER) {
+        $self->debug("compiled main template document block:\n$block");
+    }
+
+    # the METADATA list is expanded into a hash for the result
+    my %result = (
+        BLOCK     => $block,
+        DEFBLOCKS => $defblock,
+        METADATA  => { @$metadata },
+    );
+    return \%result;
+}
+
+
+
+#------------------------------------------------------------------------
+# split_text($text)
+#
+# Split input template text into directives and raw text chunks.
+#------------------------------------------------------------------------
+
+sub split_text {
+    my ($self, $text) = @_;
+    my ($pre, $dir, $prelines, $dirlines, $postlines, $chomp, $tags, @tags);
+    my $style = $self->{ STYLE }->[-1];
+    my ($start, $end, $prechomp, $postchomp, $interp ) = 
+        @$style{ qw( START_TAG END_TAG PRE_CHOMP POST_CHOMP INTERPOLATE ) };
+    my $tags_dir = $self->{ANYCASE} ? qr<TAGS>i : qr<TAGS>;
+
+    my @tokens = ();
+    my $line = 1;
+
+    return \@tokens                                         ## RETURN ##
+        unless defined $text && length $text;
+
+    # extract all directives from the text
+    while ($text =~ s/
+           ^(.*?)               # $1 - start of line up to directive
+           (?:
+            $start          # start of tag
+            (.*?)           # $2 - tag contents
+            $end            # end of tag
+            )
+           //sx) {
+        
+        ($pre, $dir) = ($1, $2);
+        $pre = '' unless defined $pre;
+        $dir = '' unless defined $dir;
+        
+        $prelines  = ($pre =~ tr/\n//);  # newlines in preceeding text
+        $dirlines  = ($dir =~ tr/\n//);  # newlines in directive tag
+        $postlines = 0;                  # newlines chomped after tag
+        
+        for ($dir) {
+            if (/^\#/) {
+                # comment out entire directive except for any end chomp flag
+                $dir = ($dir =~ /($CHOMP_FLAGS)$/o) ? $1 : '';
+            }
+            else {
+                s/^($CHOMP_FLAGS)?\s*//so;
+                # PRE_CHOMP: process whitespace before tag
+                $chomp = $1 ? $1 : $prechomp;
+                $chomp =~ tr/-=~+/1230/;
+                if ($chomp && $pre) {
+                    # chomp off whitespace and newline preceding directive
+                    if ($chomp == CHOMP_ALL) { 
+                        $pre =~ s{ (\r?\n|^) [^\S\n]* \z }{}mx;
+                    }
+                    elsif ($chomp == CHOMP_COLLAPSE) { 
+                        $pre =~ s{ (\s+) \z }{ }x;
+                    }
+                    elsif ($chomp == CHOMP_GREEDY) { 
+                        $pre =~ s{ (\s+) \z }{}x;
+                    }
+                }
+            }
+            
+            # POST_CHOMP: process whitespace after tag
+            s/\s*($CHOMP_FLAGS)?\s*$//so;
+            $chomp = $1 ? $1 : $postchomp;
+            $chomp =~ tr/-=~+/1230/;
+            if ($chomp) {
+                if ($chomp == CHOMP_ALL) { 
+                    $text =~ s{ ^ ([^\S\n]* \n) }{}x  
+                        && $postlines++;
+                }
+                elsif ($chomp == CHOMP_COLLAPSE) { 
+                    $text =~ s{ ^ (\s+) }{ }x  
+                        && ($postlines += $1=~y/\n//);
+                }
+                # any trailing whitespace
+                elsif ($chomp == CHOMP_GREEDY) { 
+                    $text =~ s{ ^ (\s+) }{}x  
+                        && ($postlines += $1=~y/\n//);
+                }
+            }
+        }
+            
+        # any text preceding the directive can now be added
+        if (length $pre) {
+            push(@tokens, $interp
+                 ? [ $pre, $line, 'ITEXT' ]
+                 : ('TEXT', $pre) );
+        }
+        $line += $prelines;
+            
+        # and now the directive, along with line number information
+        if (length $dir) {
+            # the TAGS directive is a compile-time switch
+            if ($dir =~ /^$tags_dir\s+(.*)/) {
+                my @tags = split(/\s+/, $1);
+                if (scalar @tags > 1) {
+                    ($start, $end) = map { quotemeta($_) } @tags;
+                }
+                elsif ($tags = $TAG_STYLE->{ $tags[0] }) {
+                    ($start, $end) = @$tags;
+                }
+                else {
+                    warn "invalid TAGS style: $tags[0]\n";
+                }
+            }
+            else {
+                # DIRECTIVE is pushed as:
+                #   [ $dirtext, $line_no(s), \@tokens ]
+                push(@tokens, 
+                     [ $dir, 
+                       ($dirlines 
+                        ? sprintf("%d-%d", $line, $line + $dirlines)
+                        : $line),
+                       $self->tokenise_directive($dir) ]);
+            }
+        }
+            
+        # update line counter to include directive lines and any extra
+        # newline chomped off the start of the following text
+        $line += $dirlines + $postlines;
+    }
+        
+    # anything remaining in the string is plain text 
+    push(@tokens, $interp 
+         ? [ $text, $line, 'ITEXT' ]
+         : ( 'TEXT', $text) )
+        if length $text;
+        
+    return \@tokens;                                        ## RETURN ##
+}
+    
+
+
+#------------------------------------------------------------------------
+# interpolate_text($text, $line)
+#
+# Examines $text looking for any variable references embedded like
+# $this or like ${ this }.
+#------------------------------------------------------------------------
+
+sub interpolate_text {
+    my ($self, $text, $line) = @_;
+    my @tokens  = ();
+    my ($pre, $var, $dir);
+    # Returns \@tokens: flat 'TEXT', $string pairs for literal text and
+    # [ $reference, $line, \@directive_tokens ] entries for variables.
+   while ($text =~
+           /
+           ( (?: \\. | [^\$] ){1,3000} ) # escaped or non-'$' character [$1]
+           |
+           ( \$ (?:                 # embedded variable            [$2]
+             (?: \{ ([^\}]*) \} )   # ${ ... }                     [$3]
+             |
+             ([\w\.]+)              # $word                        [$4]
+             )
+           )
+        /gx) {
+        # each match is either a run of text ($1) or one '$' reference ($2)
+        ($pre, $var, $dir) = ($1, $3 || $4, $2);
+        # ($3 || $4 leaves $var false for an empty or '0' variable name)
+        # preceding text
+        if (defined($pre) && length($pre)) {
+            $line += $pre =~ tr/\n//;
+            $pre =~ s/\\\$/\$/g;    # unescape \$ to a literal $
+            push(@tokens, 'TEXT', $pre);
+        }
+        # $variable reference
+        if ($var) {
+            $line += $dir =~ tr/\n/ /;    # tr also turns newlines in $dir into spaces
+            push(@tokens, [ $dir, $line, $self->tokenise_directive($var) ]);
+        }
+        # other '$' reference - treated as text
+        elsif ($dir) {
+            $line += $dir =~ tr/\n//;
+            push(@tokens, 'TEXT', $dir);
+        }
+    }
+    # NOTE(review): a bare '$' matching neither form seems to be dropped; confirm
+    return \@tokens;
+}
+
+
+
+#------------------------------------------------------------------------
+# tokenise_directive($text)
+#
+# Called by the split_text() and interpolate_text() methods for each
+# directive or embedded variable reference they find.  The directive
+# text is passed by parameter.
+#
+# The method splits the directive into individual tokens as recognised
+# by the parser grammar (see Template::Grammar for details).  It
+# constructs a list of tokens each represented by 2 elements, as per
+# split_text() et al.  The first element contains the token type, the
+# second the token itself.
+#
+# The method tokenises the string using a complex (but fast) regex.
+# For a deeper understanding of the regex magic at work here, see
+# Jeffrey Friedl's excellent book "Mastering Regular Expressions",
+# from O'Reilly, ISBN 1-56592-257-3
+#
+# Returns a reference to the list of chunks (each one being 2 elements) 
+# identified in the directive text.  NOTE(review): no error path is
+# visible in this method; it always returns the token list reference.
+#------------------------------------------------------------------------
+
+sub tokenise_directive {
+    my ($self, $text, $line) = @_;    # $line is accepted but not used below
+    my ($token, $uctoken, $type, $lookup);
+    my $lextable = $self->{ LEXTABLE };
+    my $style    = $self->{ STYLE }->[-1];
+    my ($anycase, $start, $end) = @$style{ qw( ANYCASE START_TAG END_TAG ) };
+    my @tokens = ( );
+    # $1 comment, $2 quote char, $3 string, $4 number, $5 file, $6 ident, $7 other
+    while ($text =~ 
+            / 
+                # strip out any comments
+                (\#[^\n]*)
+           |
+                # a quoted phrase matches in $3
+                (["'])                   # $2 - opening quote, ' or "
+                (                        # $3 - quoted text buffer
+                    (?:                  # repeat group (no backreference)
+                        \\\\             # an escaped backslash \\
+                    |                    # ...or...
+                        \\\2             # an escaped quote \" or \' (match $1)
+                    |                    # ...or...
+                        .                # any other character
+                    |   \n
+                    )*?                  # non-greedy repeat
+                )                        # end of $3
+                \2                       # match opening quote
+            |
+                # an unquoted number matches in $4
+                (-?\d+(?:\.\d+)?)       # numbers
+            |
+                # filename matches in $5
+                ( \/?\w+(?:(?:\/|::?)\w*)+ | \/\w+)
+            |
+                # an identifier matches in $6
+                (\w+)                    # variable identifier
+            |   
+                # an unquoted word or symbol matches in $7
+                (   [(){}\[\]:;,\/\\]    # misc parenthesis and symbols
+#               |   \->                  # arrow operator (for future?)
+                |   [+\-*]               # math operations
+                |   \$\{?                # dollar with option left brace
+                |   =>                   # like '='
+                |   [=!<>]?= | [!<>]     # eqality tests
+                |   &&? | \|\|?          # boolean ops
+                |   \.\.?                # n..n sequence
+                |   \S+                  # something unquoted
+                )                        # end of $7
+            /gmxo) {
+
+        # ignore comments to EOL
+        next if $1;
+
+        # quoted string
+        if (defined ($token = $3)) {
+            # double-quoted string may include $variable references
+            if ($2 eq '"') {
+                if ($token =~ /[\$\\]/) {
+                    $type = 'QUOTED';
+                    # unescape " and \ but leave \$ escaped so that 
+                        # interpolate_text() doesn't incorrectly treat it
+                    # as a variable reference
+#                   $token =~ s/\\([\\"])/$1/g;
+                        for ($token) {
+                                s/\\([^\$nrt])/$1/g;
+                                s/\\([nrt])/$QUOTED_ESCAPES->{ $1 }/ge;
+                        }
+                    push(@tokens, ('"') x 2,
+                                  @{ $self->interpolate_text($token) },
+                                  ('"') x 2);
+                    next;
+                }
+                else {
+                    $type = 'LITERAL';
+                    $token =~ s['][\\']g;    # re-wrapped in '...' below, so escape any '
+                    $token = "'$token'";
+                }
+            } 
+            else {
+                $type = 'LITERAL';
+                $token = "'$token'";
+            }
+        }
+        # number
+        elsif (defined ($token = $4)) {
+            $type = 'NUMBER';
+        }
+        elsif (defined($token = $5)) {
+            $type = 'FILENAME';
+        }
+        elsif (defined($token = $6)) {
+            # Fold potential keywords to UPPER CASE if the ANYCASE option is
+            # set, unless (we've got some preceding tokens and) the previous
+            # token is a DOT op.  This prevents the 'last' in 'data.last'
+            # from being interpreted as the LAST keyword.
+            $uctoken = 
+                ($anycase && (! @tokens || $tokens[-2] ne 'DOT'))
+                    ? uc $token
+                    :    $token;
+            if (defined ($type = $lextable->{ $uctoken })) {
+                $token = $uctoken;
+            }
+            else {
+                $type = 'IDENT';
+            }
+        }
+        elsif (defined ($token = $7)) {
+            # reserved words may be in lower case unless case sensitive
+            $uctoken = $anycase ? uc $token : $token;
+            unless (defined ($type = $lextable->{ $uctoken })) {
+                $type = 'UNQUOTED';
+            }
+        }
+        # tokens are stored flat, as consecutive (type, value) pairs
+        push(@tokens, $type, $token);
+
+#       print(STDERR " +[ $type, $token ]\n")
+#           if $DEBUG;
+    }
+
+#    print STDERR "tokenise directive() returning:\n  [ @tokens ]\n"
+#       if $DEBUG;
+
+    return \@tokens;                                        ## RETURN ##
+}
+
+
+#------------------------------------------------------------------------
+# define_block($name, $block)
+#
+# Called by the parser 'defblock' rule when a BLOCK definition is 
+# encountered in the template.  The name of the block is passed in the 
+# first parameter and a reference to the compiled block is passed in
+# the second.  This method stores the block in the $self->{ DEFBLOCK }
+# hash which has been initialised by parse() and will later be used 
+# by the same method to call the store() method on the calling cache
+# to define the block "externally".
+#------------------------------------------------------------------------
+
+sub define_block {
+    my ($self, $name, $block) = @_;
+
+    # without an active DEFBLOCK table there is nowhere to record the block
+    my $table = $self->{ DEFBLOCK };
+    return undef unless $table;
+    if ($self->{ DEBUG } & Template::Constants::DEBUG_PARSER) {
+        $self->debug("compiled block '$name':\n$block");
+    }
+    $table->{ $name } = $block;
+    return undef;
+}
+
+sub push_defblock {
+    my ($self) = @_;
+    # save the active block table (stack is created on first use) ...
+    $self->{ DEFBLOCK_STACK } ||= [];
+    push @{ $self->{ DEFBLOCK_STACK } }, $self->{ DEFBLOCK };
+    $self->{ DEFBLOCK } = { };    # ... and start a fresh, empty one
+}
+
+sub pop_defblock {
+    my ($self)  = @_;
+    my $current = $self->{ DEFBLOCK };
+    my $saved   = $self->{ DEFBLOCK_STACK };
+    # reinstate the previously saved table only if one was actually pushed
+    $self->{ DEFBLOCK } = pop @$saved
+        if $saved && @$saved;
+    return $current;
+}
+
+
+#------------------------------------------------------------------------
+# add_metadata(\@setlist)
+#------------------------------------------------------------------------
+
+sub add_metadata {
+    my ($self, $setlist) = @_;
+
+    # items are appended only while a METADATA list exists on the parser
+    my $collected = $self->{ METADATA };
+    push(@$collected, @$setlist)
+        if $collected;
+    return undef;
+}
+
+
+#------------------------------------------------------------------------
+# location()
+#
+# Return Perl comment indicating current parser file and line
+#------------------------------------------------------------------------
+
+sub location {
+    my ($self) = @_;
+    # with FILE_INFO disabled, emit a bare newline instead of a #line comment
+    $self->{ FILE_INFO } or return "\n";
+    # LINE is a scalar ref; its value may be a range 'n-n', so keep the start
+    (my $line = ${ $self->{ LINE } }) =~ s/\-.*$//;
+    my $top  = $self->{ FILEINFO }->[-1];
+    my $file = $top->{ path } || $top->{ name } || '(unknown template)';
+    $line = 1 unless $line;
+    return "#line $line \"$file\"\n";
+}
+
+
+#========================================================================
+#                     -----  PRIVATE METHODS -----
+#========================================================================
+
+#------------------------------------------------------------------------
+# _parse(\@tokens, \%info)
+#
+# Parses the list of input tokens passed by reference and returns a 
+# Template::Directive::Block object which contains the compiled 
+# representation of the template. 
+#
+# This is the main parser DFA loop.  See embedded comments for 
+# further details.
+#
+# On error, undef is returned and the internal _ERROR field is set to 
+# indicate the error.  This can be retrieved by calling the error() 
+# method.
+#------------------------------------------------------------------------
+
+sub _parse {
+    my ($self, $tokens, $info) = @_;
+    my ($token, $value, $text, $line, $inperl);
+    my ($state, $stateno, $status, $action, $lookup, $coderet, @codevars);
+    my ($lhs, $len, $code);         # rule contents
+    my $stack = [ [ 0, undef ] ];   # DFA stack
+
+# DEBUG
+#   local $" = ', ';
+
+    # retrieve internal rule and state tables
+    my ($states, $rules) = @$self{ qw( STATES RULES ) };
+
+    # call the grammar install_factory method to install emitter factory
+    $self->{ GRAMMAR }->install_factory($self->{ FACTORY });
+
+    $line = $inperl = 0;
+    $self->{ LINE   } = \$line;
+    $self->{ FILE   } = $info->{ name };
+    $self->{ INPERL } = \$inperl;
+
+    $status = CONTINUE;
+    my $in_string = 0;
+    # NOTE: 'redo' (after a shift) skips the continue block; a reduce runs it
+    while(1) {
+        # get state number and state
+        $stateno =  $stack->[-1]->[0];
+        $state   = $states->[$stateno];
+
+        # see if any lookaheads exist for the current state
+        if (exists $state->{'ACTIONS'}) {
+
+            # get next token and expand any directives (i.e. token is an 
+            # array ref) onto the front of the token list
+            while (! defined $token && @$tokens) {
+                $token = shift(@$tokens);
+                if (ref $token) {
+                    ($text, $line, $token) = @$token;
+                    if (ref $token) {
+                        if ($info->{ DEBUG } && ! $in_string) {
+                            # - - - - - - - - - - - - - - - - - - - - - - - - -
+                            # This is gnarly.  Look away now if you're easily
+                            # frightened.  We're pushing parse tokens onto the
+                            # pending list to simulate a DEBUG directive like so:
+                            # [% DEBUG msg line='20' text='INCLUDE foo' %]
+                            # - - - - - - - - - - - - - - - - - - - - - - - - -
+                            my $dtext = $text;
+                            $dtext =~ s[(['\\])][\\$1]g;
+                            unshift(@$tokens, 
+                                    DEBUG   => 'DEBUG',
+                                    IDENT   => 'msg',
+                                    IDENT   => 'line',
+                                    ASSIGN  => '=',
+                                    LITERAL => "'$line'",
+                                    IDENT   => 'text',
+                                    ASSIGN  => '=',
+                                    LITERAL => "'$dtext'",
+                                    IDENT   => 'file',
+                                    ASSIGN  => '=',
+                                    LITERAL => "'$info->{ name }'",
+                                    (';') x 2,
+                                    @$token, 
+                                    (';') x 2);
+                        }
+                        else {
+                            unshift(@$tokens, @$token, (';') x 2);
+                        }
+                        $token = undef;  # force redo
+                    }
+                    elsif ($token eq 'ITEXT') {
+                        if ($inperl) {
+                            # don't perform interpolation in PERL blocks
+                            $token = 'TEXT';
+                            $value = $text;
+                        }
+                        else {
+                            unshift(@$tokens, 
+                                    @{ $self->interpolate_text($text, $line) });
+                            $token = undef; # force redo
+                        }
+                    }
+                }
+                else {
+                    # toggle string flag to indicate if we're crossing
+                    # a string boundary
+                    $in_string = ! $in_string if $token eq '"';
+                    $value = shift(@$tokens);
+                }
+            };
+            # clear undefined token to avoid 'undefined variable blah blah'
+            # warnings and let the parser logic pick it up in a minute
+            $token = '' unless defined $token;
+
+            # get the next state for the current lookahead token
+            $action = defined ($lookup = $state->{'ACTIONS'}->{ $token })
+                      ? $lookup
+                      : defined ($lookup = $state->{'DEFAULT'})
+                        ? $lookup
+                        : undef;
+        }
+        else {
+            # no lookahead actions
+            $action = $state->{'DEFAULT'};
+        }
+
+        # ERROR: no ACTION
+        last unless defined $action;
+
+        # - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+        # shift (+ive ACTION): push the new state and consume the token
+        # - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+        if ($action > 0) {
+            push(@$stack, [ $action, $value ]);
+            $token = $value = undef;
+            redo;
+        };
+
+        # - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+        # reduce (-ive ACTION): apply rule number -$action
+        # - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+        ($lhs, $len, $code) = @{ $rules->[ -$action ] };
+
+        # an action of 0 (i.e. rule 0) implies ACCEPTance
+        $action
+            or $status = ACCEPT;
+
+        # use dummy sub if code ref doesn't exist
+        $code = sub { $_[1] }
+            unless $code;
+
+        @codevars = $len
+                ?   map { $_->[1] } @$stack[ -$len .. -1 ]
+                :   ();
+
+        eval {
+            $coderet = &$code( $self, @codevars );
+        };
+        if ($@) {
+            my $err = $@;
+            chomp $err;
+            return $self->_parse_error($err);
+        }
+
+        # reduce stack by $len
+        splice(@$stack, -$len, $len);
+
+        # ACCEPT
+        return $coderet                                     ## RETURN ##
+            if $status == ACCEPT;
+
+        # ABORT  (NOTE(review): nothing visible in this sub sets ABORT or ERROR)
+        return undef                                        ## RETURN ##
+            if $status == ABORT;
+
+        # ERROR
+        last 
+            if $status == ERROR;
+    }
+    continue {    # reached after a reduce: push the GOTO state for $lhs
+        push(@$stack, [ $states->[ $stack->[-1][0] ]->{'GOTOS'}->{ $lhs }, 
+              $coderet ]), 
+    }
+
+    # ERROR                                                 ## RETURN ##
+    return $self->_parse_error('unexpected end of input')
+        unless defined $value;
+
+    # munge text of last directive to make it readable
+#    $text =~ s/\n/\\n/g;
+
+    return $self->_parse_error("unexpected end of directive", $text)
+        if $value eq ';';   # end of directive SEPARATOR
+
+    return $self->_parse_error("unexpected token ($value)", $text);
+}
+
+
+
+#------------------------------------------------------------------------
+# _parse_error($msg, $dirtext)
+#
+# Method used to handle errors encountered during the parse process
+# in the _parse() method.  
+#------------------------------------------------------------------------
+
+sub _parse_error {
+    my ($self, $msg, $text) = @_;
+
+    # LINE holds either a plain line value or a reference to one
+    my $where = $self->{ LINE };
+    $where = $$where if ref $where;
+    $where ||= 'unknown';
+    # show the offending directive text, when supplied, under the message
+    $msg .= "\n  [% $text %]" if defined $text;
+    return $self->error("line $where: $msg");
+}
+
+
+#------------------------------------------------------------------------
+# _dump()
+# 
+# Debug method returns a string representing the internal state of the 
+# object.
+#------------------------------------------------------------------------
+
+sub _dump {
+    my ($self) = @_;
+    my @options = qw( START_TAG END_TAG TAG_STYLE ANYCASE INTERPOLATE 
+                      PRE_CHOMP POST_CHOMP V1DOLLAR );
+
+    # render one "name => value" row per option, spelling out unset
+    # options as '<undef>' rather than leaving them blank
+    my @rows = map {
+        my $setting = $self->{ $_ };
+        $setting = '<undef>' unless defined $setting;
+        sprintf("    %-16s => %s\n", $_, $setting);
+    } @options;
+
+    # wrap the rows in a labelled pair of braces (no trailing newline)
+    return join('', "[Template::Parser] {\n", @rows, '}');
+}
+
+
+1;
+
+__END__
+
+=head1 NAME
+
+Template::Parser - LALR(1) parser for compiling template documents
+
+=head1 SYNOPSIS
+
+    use Template::Parser;
+    
+    $parser   = Template::Parser->new(\%config);
+    $template = $parser->parse($text)
+        || die $parser->error(), "\n";
+
+=head1 DESCRIPTION
+
+The C<Template::Parser> module implements a LALR(1) parser and associated
+methods for parsing template documents into Perl code.
+
+=head1 PUBLIC METHODS
+
+=head2 new(\%params)
+
+The C<new()> constructor creates and returns a reference to a new 
+C<Template::Parser> object.  
+
+A reference to a hash may be supplied as a parameter to provide configuration values.  
+See L<CONFIGURATION OPTIONS> below for a summary of these options and 
+L<Template::Manual::Config> for full details.
+
+    my $parser = Template::Parser->new({
+        START_TAG => quotemeta('<+'),
+        END_TAG   => quotemeta('+>'),
+    });
+
+=head2 parse($text)
+
+The C<parse()> method parses the text passed in the first parameter and
+returns a reference to a hash array of data defining the compiled
+representation of the template text, suitable for passing to the
+L<Template::Document> L<new()|Template::Document#new()> constructor method. On
+error, undef is returned.
+
+    $data = $parser->parse($text)
+        || die $parser->error();
+
+The C<$data> hash reference returned contains a C<BLOCK> item containing the
+compiled Perl code for the template, a C<DEFBLOCKS> item containing a
+reference to a hash array of sub-template C<BLOCK>s defined within the
+template, and a C<METADATA> item containing a reference to a hash array
+of metadata values defined in C<META> tags.
+
+=head1 CONFIGURATION OPTIONS
+
+The C<Template::Parser> module accepts the following configuration 
+options.  Please see L<Template::Manual::Config> for further details
+on each option.
+
+=head2 START_TAG, END_TAG
+
+The L<START_TAG|Template::Manual::Config#START_TAG_END_TAG> and
+L<END_TAG|Template::Manual::Config#START_TAG_END_TAG> options are used to
+specify character sequences or regular expressions that mark the start and end
+of a template directive.
+
+    my $parser = Template::Parser->new({ 
+        START_TAG => quotemeta('<+'),
+        END_TAG   => quotemeta('+>'),
+    });
+
+=head2 TAG_STYLE
+
+The L<TAG_STYLE|Template::Manual::Config#TAG_STYLE> option can be used to set
+both L<START_TAG> and L<END_TAG> according to pre-defined tag styles.
+
+    my $parser = Template::Parser->new({ 
+        TAG_STYLE => 'star',     # [* ... *]
+    });
+
+=head2 PRE_CHOMP, POST_CHOMP
+
+The L<PRE_CHOMP|Template::Manual::Config#PRE_CHOMP_POST_CHOMP> and
+L<POST_CHOMP|Template::Manual::Config#PRE_CHOMP_POST_CHOMP> options can be set to remove
+any whitespace before or after a directive tag, respectively.
+
+    my $parser = Template::Parser->new({
+        PRE_CHOMP  => 1,
+        POST_CHOMP => 1,
+    });
+
+=head2 INTERPOLATE
+
+The L<INTERPOLATE|Template::Manual::Config#INTERPOLATE> flag can be set
+to allow variables to be embedded in plain text blocks.
+
+    my $parser = Template::Parser->new({ 
+        INTERPOLATE => 1,
+    });
+
+Variables should be prefixed by a C<$> to identify them, using curly braces
+to explicitly scope the variable name where necessary.
+
+    Hello ${name},
+    
+    The day today is ${day.today}.
+
+=head2 ANYCASE
+
+The L<ANYCASE|Template::Manual::Config#ANYCASE> option can be set
+to allow directive keywords to be specified in any case.
+
+    # with ANYCASE set to 1
+    [% INCLUDE foobar %]    # OK
+    [% include foobar %]    # OK
+    [% include = 10   %]    # ERROR, 'include' is a reserved word
+
+=head2 GRAMMAR
+
+The L<GRAMMAR|Template::Manual::Config#GRAMMAR> configuration item can be used
+to specify an alternate grammar for the parser. This allows a modified or
+entirely new template language to be constructed and used by the Template
+Toolkit.
+
+    use MyOrg::Template::Grammar;
+    
+    my $parser = Template::Parser->new({ 
+        GRAMMAR => MyOrg::Template::Grammar->new(),
+    });
+
+By default, an instance of the default L<Template::Grammar> will be
+created and used automatically if a C<GRAMMAR> item isn't specified.
+
+=head2 DEBUG
+
+The L<DEBUG|Template::Manual::Config#DEBUG> option can be used to enable
+various debugging features of the C<Template::Parser> module.
+
+    use Template::Constants qw( :debug );
+    
+    my $template = Template->new({
+        DEBUG => DEBUG_PARSER | DEBUG_DIRS,
+    });
+
+=head1 AUTHOR
+
+Andy Wardley E<lt>abw at wardley.orgE<gt> L<http://wardley.org/>
+
+=head1 COPYRIGHT
+
+Copyright (C) 1996-2007 Andy Wardley.  All Rights Reserved.
+
+This module is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+The main parsing loop of the C<Template::Parser> module was derived from a
+standalone parser generated by version 0.16 of the C<Parse::Yapp> module. The
+following copyright notice appears in the C<Parse::Yapp> documentation.
+
+    The Parse::Yapp module and its related modules and shell
+    scripts are copyright (c) 1998 Francois Desarmenien,
+    France. All rights reserved.
+    
+    You may use and distribute them under the terms of either
+    the GNU General Public License or the Artistic License, as
+    specified in the Perl README file.
+
+=head1 SEE ALSO
+
+L<Template>, L<Template::Grammar>, L<Template::Directive>
+
diff --git a/bench/perl/Template/Plugin.pm b/bench/perl/Template/Plugin.pm
new file mode 100644
index 0000000..6b65cd2
--- /dev/null
+++ b/bench/perl/Template/Plugin.pm
@@ -0,0 +1,369 @@
+#============================================================= -*-Perl-*-
+#
+# Template::Plugin
+#
+# DESCRIPTION
+#
+#   Module defining a base class for a plugin object which can be loaded
+#   and instantiated via the USE directive.
+#
+# AUTHOR
+#   Andy Wardley   <abw at wardley.org>
+#
+# COPYRIGHT
+#   Copyright (C) 1996-2007 Andy Wardley.  All Rights Reserved.
+#
+#   This module is free software; you can redistribute it and/or
+#   modify it under the same terms as Perl itself.
+#
+#============================================================================
+
+package Template::Plugin;
+
+use strict;
+use warnings;
+use base 'Template::Base';
+
+our $VERSION = 2.70;
+our $DEBUG   = 0 unless defined $DEBUG;
+our $ERROR   = '';
+our $AUTOLOAD;
+
+
+#========================================================================
+#                      -----  CLASS METHODS -----
+#========================================================================
+
+#------------------------------------------------------------------------
+# load()
+#
+# Class method called when the plugin module is first loaded.  It
+# returns the name of a class (by default, its own class) or a prototype
+# object which will be used to instantiate new objects.  The new()
+# method is then called against the class name (class method) or
+# prototype object (object method) to create new instances of the
+# object.
+#------------------------------------------------------------------------
+
+sub load {
+    return $_[0];    # the invocant itself: the class name by default
+}
+
+
+#------------------------------------------------------------------------
+# new()
+#
+# Object constructor.  This base class implementation takes the class
+# name it is invoked on, ignores any further arguments and returns a
+# new, empty hash blessed into that class.
+#
+# NOTE: an earlier form of this header described a constructor taking
+# ($context, $delegate, @params) which loaded and delegated to another
+# Perl module.  That behaviour is implemented by old_new() below, not
+# by this method.
+#
+# Plugin subclasses override new() to store whatever they need.
+#------------------------------------------------------------------------
+
+sub new {
+    my $class = shift;
+    bless {
+    }, $class;
+}
+
+sub old_new {    # delegating constructor: ($class, $context, $delegate, @params)
+    my ($class, $context, $delclass, @params) = @_;
+    my ($delegate, $delmod);
+
+    return $class->error("no context passed to $class constructor\n")
+        unless defined $context;
+
+    if (ref $delclass) {
+        # $delclass contains a reference to a delegate object
+        $delegate = $delclass;
+    }
+    else {
+        # delclass is the name of a module to load and instantiate
+        ($delmod = $delclass) =~ s|::|/|g;    # Foo::Bar => Foo/Bar for require
+
+        eval {
+            require "$delmod.pm";
+            $delegate = $delclass->new(@params)
+                || die "failed to instantiate $delclass object\n";
+        };
+        return $class->error($@) if $@;    # load or instantiation failed
+    }
+
+    bless {
+        _CONTEXT  => $context, 
+        _DELEGATE => $delegate,
+        _PARAMS   => \@params,
+    }, $class;
+}
+
+
+#------------------------------------------------------------------------
+# fail($error)
+# 
+# Version 1 error reporting function, now replaced by error() inherited
+# from Template::Base.  Raises a "deprecated function" warning and then
+# calls error().
+#------------------------------------------------------------------------
+
+sub fail {
+    my $class = shift;
+    my ($pkg, $file, $line) = caller();    # caller's location, for the warning
+    warn "Template::Plugin::fail() is deprecated at $file line $line.  Please use error()\n";
+    $class->error(@_);    # still record the error; result of error() is returned
+}
+
+
+#========================================================================
+#                      -----  OBJECT METHODS -----
+#========================================================================
+
+#------------------------------------------------------------------------
+# OLD_AUTOLOAD
+#
+# Catch-all method which delegates calls to the _DELEGATE object.  It is
+# named OLD_AUTOLOAD rather than AUTOLOAD, so Perl never invokes it itself.
+#------------------------------------------------------------------------
+
+sub OLD_AUTOLOAD {
+    my $self     = shift;
+    my $method   = $AUTOLOAD;
+
+    $method =~ s/.*:://;    # strip package qualifier, leaving the method name
+    return if $method eq 'DESTROY';
+
+    if (ref $self eq 'HASH') {    # ref() of a blessed object is its class name
+        my $delegate = $self->{ _DELEGATE } || return;
+        return $delegate->$method(@_);
+    }
+    my ($pkg, $file, $line) = caller();    # only used by the disabled warning
+#    warn "no such '$method' method called on $self at $file line $line\n";
+    return undef;
+}
+
+
+1;
+
+__END__
+
+=head1 NAME
+
+Template::Plugin - Base class for Template Toolkit plugins
+
+=head1 SYNOPSIS
+
+    package MyOrg::Template::Plugin::MyPlugin;
+    use base qw( Template::Plugin );
+    use Template::Plugin;
+    use MyModule;
+    
+    sub new {
+        my $class   = shift;
+        my $context = shift;
+        bless {
+            ...
+        }, $class;
+    }
+
+=head1 DESCRIPTION
+
+A "plugin" for the Template Toolkit is simply a Perl module which 
+exists in a known package location (e.g. C<Template::Plugin::*>) and 
+conforms to a regular standard, allowing it to be loaded and used 
+automatically.
+
+The C<Template::Plugin> module defines a base class from which other 
+plugin modules can be derived.  A plugin does not have to be derived
+from Template::Plugin but should at least conform to its object-oriented
+interface.
+
+It is recommended that you create plugins in your own package namespace
+to avoid conflict with toolkit plugins.  e.g. 
+
+    package MyOrg::Template::Plugin::FooBar;
+
+Use the L<PLUGIN_BASE|Template::Manual::Config#PLUGIN_BASE> option to specify
+the namespace that you use. e.g.
+
+    use Template;
+    my $template = Template->new({ 
+        PLUGIN_BASE => 'MyOrg::Template::Plugin',
+    });
+
+=head1 METHODS
+
+The following methods form the basic interface between the Template
+Toolkit and plugin modules.
+
+=head2 load($context)
+
+This method is called by the Template Toolkit when the plugin module
+is first loaded.  It is called as a package method and thus implicitly
+receives the package name as the first parameter.  A reference to the
+L<Template::Context> object loading the plugin is also passed.  The
+default behaviour for the C<load()> method is to simply return the class
+name.  The calling context then uses this class name to call the C<new()>
+package method.
+
+    package MyPlugin;
+    
+    sub load {               # called as MyPlugin->load($context)
+        my ($class, $context) = @_;
+        return $class;       # returns 'MyPlugin'
+    }
+
+=head2 new($context, @params)
+
+This method is called to instantiate a new plugin object for the C<USE>
+directive. It is called as a package method against the class name returned by
+L<load()>. A reference to the L<Template::Context> object creating the plugin
+is passed, along with any additional parameters specified in the C<USE>
+directive.
+
+    sub new {                # called as MyPlugin->new($context)
+        my ($class, $context, @params) = @_;
+        bless {
+            _CONTEXT => $context,
+        }, $class;           # returns blessed MyPlugin object
+    }
+
+=head2 error($error)
+
+This method, inherited from the L<Template::Base> module, is used for 
+reporting and returning errors.   It can be called as a package method
+to set/return the C<$ERROR> package variable, or as an object method to 
+set/return the object C<_ERROR> member.  When called with an argument, it
+sets the relevant variable and returns C<undef>.  When called without an
+argument, it returns the value of the variable.
+
+    package MyPlugin;
+    use base 'Template::Plugin';
+    
+    sub new {
+        my ($class, $context, $dsn) = @_;
+        
+        return $class->error('No data source specified')
+            unless $dsn;
+        
+        bless {
+            _DSN => $dsn,
+        }, $class;
+    }
+
+    package main;
+    
+    my $something = MyPlugin->new()
+        || die MyPlugin->error(), "\n";
+        
+    $something->do_something()
+        || die $something->error(), "\n";
+
+=head1 DEEPER MAGIC
+
+The L<Template::Context> object that handles the loading and use of plugins
+calls the L<new()> and L<error()> methods against the package name returned by
+the L<load()> method. In pseudo-code terms, it looks something like this:
+
+    $class  = MyPlugin->load($context);       # returns 'MyPlugin'
+    
+    $object = $class->new($context, @params)  # MyPlugin->new(...)
+        || die $class->error();               # MyPlugin->error()
+
+The L<load()> method may alternately return a blessed reference to an
+object instance.  In this case, L<new()> and L<error()> are then called as
+I<object> methods against that prototype instance.
+
+    package YourPlugin;
+    
+    sub load {
+        my ($class, $context) = @_;
+        bless {
+            _CONTEXT => $context,
+        }, $class;
+    }
+    
+    sub new {
+        my ($self, $context, @params) = @_;
+        return $self;
+    }
+
+In this example, we have implemented a 'Singleton' plugin.  One object 
+gets created when L<load()> is called and this simply returns itself for
+each call to L<new()>.
+
+Another implementation might require individual objects to be created
+for every call to L<new()>, but with each object sharing a reference to
+some other object to maintain cached data, database handles, etc.
+This pseudo-code example demonstrates the principle.
+
+    package MyServer;
+    
+    sub load {
+        my ($class, $context) = @_;
+        bless {
+            _CONTEXT => $context,
+            _CACHE   => { },
+        }, $class;
+    }
+    
+    sub new {
+        my ($self, $context, @params) = @_;
+        MyClient->new($self, @params);
+    }
+    
+    sub add_to_cache   { ... }
+    
+    sub get_from_cache { ... }
+
+    package MyClient;
+    
+    sub new {
+        my ($class, $server, $blah) = @_;
+        bless {
+            _SERVER => $server,
+            _BLAH   => $blah,
+        }, $class;
+    }
+    
+    sub get {
+        my $self = shift;
+        $self->{ _SERVER }->get_from_cache(@_);
+    }
+    
+    sub put {
+        my $self = shift;
+        $self->{ _SERVER }->add_to_cache(@_);
+    }
+
+When the plugin is loaded, a C<MyServer> instance is created. The L<new()>
+method is called against this object which instantiates and returns a C<MyClient>
+object, primed to communicate with the creating C<MyServer>.
+
+=head1 AUTHOR
+
+Andy Wardley E<lt>abw at wardley.orgE<gt> L<http://wardley.org/>
+
+=head1 COPYRIGHT
+
+Copyright (C) 1996-2007 Andy Wardley.  All Rights Reserved.
+
+This module is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=head1 SEE ALSO
+
+L<Template>, L<Template::Plugins>, L<Template::Context>
+
+=cut
+
+# Local Variables:
+# mode: perl
+# perl-indent-level: 4
+# indent-tabs-mode: nil
+# End:
+#
+# vim: expandtab shiftwidth=4:
diff --git a/bench/perl/Template/Plugin/Assert.pm b/bench/perl/Template/Plugin/Assert.pm
new file mode 100644
index 0000000..e35c920
--- /dev/null
+++ b/bench/perl/Template/Plugin/Assert.pm
@@ -0,0 +1,155 @@
+#============================================================= -*-Perl-*-
+#
+# Template::Plugin::Assert
+#
+# DESCRIPTION
+#   Template Toolkit plugin module which allows you to assert that
+#   items fetched from the stash are defined.
+#
+# AUTHOR
+#   Andy Wardley   <abw at wardley.org>
+#
+# COPYRIGHT
+#   Copyright (C) 2008 Andy Wardley.  All Rights Reserved.
+#
+#   This module is free software; you can redistribute it and/or
+#   modify it under the same terms as Perl itself.
+#
+#============================================================================
+
+package Template::Plugin::Assert;
+use base 'Template::Plugin';
+use strict;
+use warnings;
+use Template::Exception;
+
+our $VERSION   = 1.00;
+our $MONAD     = 'Template::Monad::Assert';
+our $EXCEPTION = 'Template::Exception';
+our $AUTOLOAD;
+
+sub load {    # ($class, $context): register the .assert vmethods, return $class
+    my $class   = shift;
+    my $context = shift;
+    my $stash   = $context->stash;
+    my $vmethod = sub {
+        $MONAD->new($stash, shift);    # wrap the vmethod's first argument
+    };
+
+    # define .assert vmethods for hash and list objects
+    $context->define_vmethod( hash => assert => $vmethod );
+    $context->define_vmethod( list => assert => $vmethod );
+
+    return $class;
+}
+
+sub new {
+    my ($class, $context, @args) = @_;    # @args is accepted but not used
+    # create an assert plugin object which will handle simple variable
+    # lookups.
+    return bless { _CONTEXT => $context }, $class;
+}
+
+sub AUTOLOAD {    # any method call is a variable lookup that must be defined
+    my ($self, @args) = @_;
+    my $item = $AUTOLOAD;
+    $item =~ s/.*:://;               # strip package qualifier
+    return if $item eq 'DESTROY';    # object destruction is not a lookup
+    
+    # look up the named item, using the stash itself as the root
+    my $stash = $self->{ _CONTEXT }->stash;
+    my $value = $stash->dotop($stash, $item, \@args);
+
+    if (! defined $value) {
+        die $EXCEPTION->new( assert => "undefined value for $item" );
+    }
+    return $value;
+}
+
+
+package Template::Monad::Assert;
+
+our $EXCEPTION = 'Template::Exception';
+our $AUTOLOAD;
+
+sub new {
+    my ($class, $stash, $this) = @_;
+    bless [$stash, $this], $class;    # object is a two-element array: [ stash, item ]
+}
+
+sub AUTOLOAD {    # any method call is a lookup on the wrapped item that must be defined
+    my ($self, @args) = @_;
+    my ($stash, $this) = @$self;
+    my $item = $AUTOLOAD;
+    $item =~ s/.*:://;               # strip package qualifier
+    return if $item eq 'DESTROY';    # object destruction is not a lookup
+    # resolve $item against the wrapped value ($this), not the stash root
+    my $value = $stash->dotop($this, $item, \@args);
+
+    if (! defined $value) {
+        die $EXCEPTION->new( assert => "undefined value for $item" );
+    }
+    return $value;
+}
+
+1;
+
+__END__
+
+=head1 NAME
+
+Template::Plugin::Assert - trap undefined values
+
+=head1 SYNOPSIS
+
+    [% USE assert %]
+    
+    # throws error if any undefined values are returned
+    [% object.assert.method %]
+    [% hash.assert.key %]
+    [% list.assert.item %]
+
+=head1 DESCRIPTION
+
+This plugin defines the C<assert> virtual method that can be used
+to automatically throw errors when undefined values are used.
+
+For example, consider this dotop:
+
+    [% user.name %]
+
+If C<user.name> is an undefined value then TT will silently ignore the 
+fact and print nothing.  If you C<USE> the C<assert> plugin then you
+can add the C<assert> vmethod between the C<user> and C<name> elements,
+like so:
+
+    [% user.assert.name %]
+
+Now, if C<user.name> is an undefined value, an exception will be thrown:
+
+    assert error - undefined value for name
+
+=head1 AUTHOR
+
+Andy Wardley E<lt>abw at wardley.orgE<gt> L<http://wardley.org/>
+
+=head1 COPYRIGHT
+
+Copyright (C) 2008 Andy Wardley.  All Rights Reserved.
+
+This module is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=head1 SEE ALSO
+
+L<Template::Plugin>
+
+=cut
+
+# Local Variables:
+# mode: perl
+# perl-indent-level: 4
+# indent-tabs-mode: nil
+# End:
+#
+# vim: expandtab shiftwidth=4:
diff --git a/bench/perl/Template/Plugin/CGI.pm b/bench/perl/Template/Plugin/CGI.pm
new file mode 100644
index 0000000..0fd933f
--- /dev/null
+++ b/bench/perl/Template/Plugin/CGI.pm
@@ -0,0 +1,135 @@
+#============================================================= -*-Perl-*-
+#
+# Template::Plugin::CGI
+#
+# DESCRIPTION
+#   Simple Template Toolkit plugin interfacing to the CGI.pm module.
+#
+# AUTHOR
+#   Andy Wardley   <abw at wardley.org>
+#
+# COPYRIGHT
+#   Copyright (C) 1996-2007 Andy Wardley.  All Rights Reserved.
+#
+#   This module is free software; you can redistribute it and/or
+#   modify it under the same terms as Perl itself.
+#
+#============================================================================
+
+package Template::Plugin::CGI;
+
+use strict;
+use warnings;
+use base 'Template::Plugin';
+use CGI;
+
+our $VERSION = 2.70;
+
+sub new {
+    my $class   = shift;
+    my $context = shift;    # discarded: only the remaining arguments are used
+    CGI->new(@_);           # returns a plain CGI object, not a plugin instance
+}
+
+# monkeypatch CGI::params() method to Do The Right Thing in TT land
+
+sub CGI::params {
+    my $self = shift;
+    local $" = ', ';    # NOTE(review): no array is interpolated in this sub
+
+    return $self->{ _TT_PARAMS } ||= do {    # built once, cached on the object
+        # must call Vars() in a list context to receive
+        # plain list of key/vals rather than a tied hash
+        my $params = { $self->Vars() };
+
+        # convert any null separated values into lists
+        @$params{ keys %$params } = map { 
+            /\0/ ? [ split /\0/ ] : $_ 
+        } values %$params;
+
+        $params;
+    };
+}
+
+1;
+
+__END__
+
+=head1 NAME
+
+Template::Plugin::CGI - Interface to the CGI module
+
+=head1 SYNOPSIS
+
+    [% USE CGI %]
+    [% CGI.param('parameter') %]
+    
+    [% USE things = CGI %]
+    [% things.param('name') %]
+    
+    # see CGI docs for other methods provided by the CGI object
+
+=head1 DESCRIPTION
+
+This is a very simple Template Toolkit Plugin interface to the C<CGI> module.
+A C<CGI> object will be instantiated via the following directive:
+
+    [% USE CGI %]
+
+C<CGI> methods may then be called as follows:
+
+    [% CGI.header %]
+    [% CGI.param('parameter') %]
+
+An alias can be used to provide an alternate name by which the object should
+be identified.
+
+    [% USE mycgi = CGI %]
+    [% mycgi.start_form %]
+    [% mycgi.popup_menu({ Name   => 'Color'
+                          Values => [ 'Green' 'Black' 'Brown' ] }) %]
+
+Parenthesised parameters to the C<USE> directive will be passed to the plugin 
+constructor:
+
+    [% USE cgiprm = CGI('uid=abw&name=Andy+Wardley') %]
+    [% cgiprm.param('uid') %]
+
+=head1 METHODS
+
+In addition to all the methods supported by the C<CGI> module, this
+plugin defines the following.
+
+=head2 params()
+
+This method returns a reference to a hash of all the C<CGI> parameters.
+Any parameters that have multiple values will be returned as lists.
+
+    [% USE CGI('user=abw&item=foo&item=bar') %]
+    [% CGI.params.user %]            # abw
+    [% CGI.params.item.join(', ') %] # foo, bar
+
+=head1 AUTHOR
+
+Andy Wardley E<lt>abw at wardley.orgE<gt> L<http://wardley.org/>
+
+=head1 COPYRIGHT
+
+Copyright (C) 1996-2007 Andy Wardley.  All Rights Reserved.
+
+This module is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=head1 SEE ALSO
+
+L<Template::Plugin>, L<CGI>
+
+=cut
+
+# Local Variables:
+# mode: perl
+# perl-indent-level: 4
+# indent-tabs-mode: nil
+# End:
+#
+# vim: expandtab shiftwidth=4:
diff --git a/bench/perl/Template/Plugin/Datafile.pm b/bench/perl/Template/Plugin/Datafile.pm
new file mode 100644
index 0000000..a92b696
--- /dev/null
+++ b/bench/perl/Template/Plugin/Datafile.pm
@@ -0,0 +1,166 @@
+#============================================================= -*-Perl-*-
+#
+# Template::Plugin::Datafile
+#
+# DESCRIPTION
+#   Template Toolkit Plugin which reads a datafile and constructs a 
+#   list object containing hashes representing records in the file.
+#
+# AUTHOR
+#   Andy Wardley   <abw at wardley.org>
+#
+# COPYRIGHT
+#   Copyright (C) 1996-2007 Andy Wardley.  All Rights Reserved.
+#
+#   This module is free software; you can redistribute it and/or
+#   modify it under the same terms as Perl itself.
+#
+#============================================================================
+
+package Template::Plugin::Datafile;
+
+use strict;
+use warnings;
+use base 'Template::Plugin';
+
+our $VERSION = 2.72;
+
+sub new {    # ($class, $context, $filename, \%params) => blessed list of records
+    my ($class, $context, $filename, $params) = @_;
+    my ($delim, $line, @fields, @data);
+    my $self = [ ];
+    local *FD;                  # localised so the handle does not outlive new()
+    local $/ = "\n";
+    # field delimiter defaults to ':' and is always matched literally
+    $params ||= { };
+    $delim = $params->{'delim'} || ':';
+    $delim = quotemeta($delim);
+
+    return $class->error("No filename specified")
+        unless $filename;
+
+    open(FD, '<', $filename)    # 3-arg open: name is never read as a mode or pipe
+        || return $class->error("$filename: $!");
+
+    # first line of file should contain field definitions
+    while (! $line || $line =~ /^#/) {
+        defined($line = <FD>) || last;    # EOF: stop rather than loop forever
+        chomp $line;
+        $line =~ s/\r$//;
+    }
+
+    (defined $line && (@fields = split(/\s*$delim\s*/, $line)))
+        || return $class->error("first line of file must contain field names");
+
+    # read each line of the file
+    while (<FD>) {
+        chomp;
+        s/\r$//;
+
+        # ignore comments and blank lines
+        next if /^#/ || /^\s*$/;
+
+        # split line into fields
+        @data = split(/\s*$delim\s*/);
+
+        # create hash record to represent data
+        my %record;
+        @record{ @fields } = @data;
+
+        push(@$self, \%record);
+    }
+
+    # the list of records is itself the plugin object (undef on any error)
+    bless $self, $class;
+}
+
+
+sub as_list {
+    return $_[0];
+}
+
+
+1;
+
+__END__
+
+=head1 NAME
+
+Template::Plugin::Datafile - Plugin to construct records from a simple data file
+
+=head1 SYNOPSIS
+
+    [% USE mydata = datafile('/path/to/datafile') %]
+    [% USE mydata = datafile('/path/to/datafile', delim = '|') %]
+    
+    [% FOREACH record = mydata %]
+       [% record.this %]  [% record.that %]
+    [% END %]
+
+=head1 DESCRIPTION
+
+This plugin provides a simple facility to construct a list of hash 
+references, each of which represents a data record of known structure,
+from a data file.
+
+    [% USE datafile(filename) %]
+
+An absolute filename must be specified (for this initial implementation at 
+least - in a future version it might also use the C<INCLUDE_PATH>).  An 
+optional C<delim> parameter may also be provided to specify an alternate
+delimiter character.
+
+    [% USE userlist = datafile('/path/to/file/users')     %]
+    [% USE things   = datafile('items', delim = '|') %]
+
+The format of the file is intentionally simple.  The first line
+defines the field names, delimited by colons with optional surrounding
+whitespace.  Subsequent lines then define records containing data
+items, also delimited by colons.  e.g.
+
+    id : name : email : tel
+    abw : Andy Wardley : abw at tt2.org : 555-1234
+    sam : Simon Matthews : sam at tt2.org : 555-9876
+
+Each line is read, split into composite fields, and then used to 
+initialise a hash array containing the field names as relevant keys.
+The plugin returns a blessed list reference containing the hash 
+references in the order as defined in the file.
+
+    [% FOREACH user = userlist %]
+       [% user.id %]: [% user.name %]
+    [% END %]
+
+The first line of the file B<must> contain the field definitions.
+After the first line, blank lines will be ignored, along with comment
+lines which start with a 'C<#>'.
+
+=head1 BUGS
+
+Should handle file names relative to C<INCLUDE_PATH>.
+Doesn't permit use of 'C<:>' in a field.  Some escaping mechanism is required.
+
+=head1 AUTHOR
+
+Andy Wardley E<lt>abw at wardley.orgE<gt> L<http://wardley.org/>
+
+=head1 COPYRIGHT
+
+Copyright (C) 1996-2007 Andy Wardley.  All Rights Reserved.
+
+This module is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=head1 SEE ALSO
+
+L<Template::Plugin>
+
+=cut
+
+# Local Variables:
+# mode: perl
+# perl-indent-level: 4
+# indent-tabs-mode: nil
+# End:
+#
+# vim: expandtab shiftwidth=4:
diff --git a/bench/perl/Template/Plugin/Date.pm b/bench/perl/Template/Plugin/Date.pm
new file mode 100644
index 0000000..8b82e8d
--- /dev/null
+++ b/bench/perl/Template/Plugin/Date.pm
@@ -0,0 +1,355 @@
+#============================================================= -*-Perl-*-
+#
+# Template::Plugin::Date
+#
+# DESCRIPTION
+#
+#   Plugin to generate formatted date strings.
+#
+# AUTHORS
+#   Thierry-Michel Barral  <kktos at electron-libre.com>
+#   Andy Wardley           <abw at wardley.org>
+#
+# COPYRIGHT
+#   Copyright (C) 2000-2007 Thierry-Michel Barral, Andy Wardley.
+#
+#   This module is free software; you can redistribute it and/or
+#   modify it under the same terms as Perl itself.
+#
+#============================================================================
+
+package Template::Plugin::Date;
+
+use strict;
+use warnings;
+use base 'Template::Plugin';
+
+use POSIX ();
+
+our $VERSION = 2.78;
+our $FORMAT  = '%H:%M:%S %d-%b-%Y';    # default strftime() format
+our @LOCALE_SUFFIX = qw( .ISO8859-1 .ISO_8859-15 .US-ASCII .UTF-8 );
+
+
+#------------------------------------------------------------------------
+# new(\%options)
+#------------------------------------------------------------------------
+
+sub new {
+    my ($class, $context, $params) = @_;    # $context is not stored
+    bless {
+        $params ? %$params : ()    # copy of the option hash; format() reads its defaults here
+    }, $class;
+}
+
+
+#------------------------------------------------------------------------
+# now()
+# 
+# Call time() to return the current system time in seconds since the epoch.
+#------------------------------------------------------------------------
+
+sub now {
+    return time();    # invocant and any arguments are ignored
+}
+
+
+#------------------------------------------------------------------------
+# format()
+# format($time)
+# format($time, $format)
+# format($time, $format, $locale)
+# format($time, $format, $locale, $gmt_flag)
+# format(\%named_params);
+#
+# Returns a formatted time/date string for the specified time, $time
+# (or the current system time if unspecified), using the $format, $locale
+# and $gmt values specified as arguments (or the internal values defined
+# at construction time).  Specifying a Perl-true value for $gmt will
+# override the local time zone and force the output to be for GMT.
+# Any or all of the arguments may be specified as named parameters which
+# get passed as a hash array reference as the final argument.
+# ------------------------------------------------------------------------
+
+sub format {
+    my $self   = shift;
+    my $params = ref($_[$#_]) eq 'HASH' ? pop(@_) : { };
+    my $time   = shift(@_) || $params->{ time } || $self->{ time } 
+                           || $self->now();
+    my $format = @_ ? shift(@_) 
+                    : ($params->{ format } || $self->{ format } || $FORMAT);
+    my $locale = @_ ? shift(@_)
+                    : ($params->{ locale } || $self->{ locale });
+    my $gmt = @_ ? shift(@_)
+            : ($params->{ gmt } || $self->{ gmt });
+    my (@date, $datestr);
+
+    if ($time =~ /^\d+$/) {
+        # $time is all digits: treat it as seconds since the epoch
+        if ($gmt) {
+            @date = (gmtime($time))[0..6];
+        }
+        else {
+            @date = (localtime($time))[0..6];
+        }
+    }
+    else {
+        # $time is not a plain number, so try to parse it as either a
+        # 'Y:M:D H:M:S' or a 'H:M:S D:M:Y' string; any of the characters
+        # / - : or space may separate the fields
+
+        my @parts = (split(/(?:\/| |:|-)/, $time));
+
+        if (@parts >= 6) {
+            if (length($parts[0]) == 4) {
+                # year is first; assume 'Y:M:D H:M:S'
+                @date = @parts[reverse 0..5];    # => (s, m, h, D, M, Y)
+            }
+            else {
+                # year is last; assume 'H:M:S D:M:Y'
+                @date = @parts[2,1,0,3..5];      # => (s, m, h, D, M, Y)
+            }
+        }
+
+        if (!@date) {
+            return (undef, Template::Exception->new('date',
+                   "bad time/date string:  " .
+                   "expects 'h:m:s d:m:y'  got: '$time'"));
+        }
+        $date[4] -= 1;     # correct month number 1-12 to range 0-11
+        $date[5] -= 1900;  # convert absolute year to years since 1900
+        $time = &POSIX::mktime(@date);    # NOTE(review): $time is not read again below
+    }
+    
+    if ($locale) {
+        # format the date in a specific locale, saving and subsequently
+        # restoring the current locale.
+        my $old_locale = &POSIX::setlocale(&POSIX::LC_ALL);
+
+        # some systems expect locales to have a particular suffix
+        for my $suffix ('', @LOCALE_SUFFIX) {
+            my $try_locale = $locale.$suffix;
+            my $setlocale = &POSIX::setlocale(&POSIX::LC_ALL, $try_locale);
+            if (defined $setlocale && $try_locale eq $setlocale) {    # exact match only
+                $locale = $try_locale;
+                last;
+            }
+        }
+        $datestr = &POSIX::strftime($format, @date);
+        &POSIX::setlocale(&POSIX::LC_ALL, $old_locale);
+    }
+    else {
+        $datestr = &POSIX::strftime($format, @date);
+    }
+
+    return $datestr;
+}
+
+sub calc {
+    my $self = shift;
+    eval { require "Date/Calc.pm" };    # loaded on demand; failure reported below
+    $self->throw("failed to load Date::Calc: $@") if $@;
+    return Template::Plugin::Date::Calc->new('no context');    # placeholder argument
+}
+
+sub manip {
+    my $self = shift;
+    eval { require "Date/Manip.pm" };    # loaded on demand; failure reported below
+    $self->throw("failed to load Date::Manip: $@") if $@;
+    return Template::Plugin::Date::Manip->new('no context');    # placeholder argument
+}
+
+
+sub throw {
+    my $self = shift;
+    die (Template::Exception->new('date', join(', ', @_)));    # 'date' exception; messages joined with ', '
+}
+
+
+package Template::Plugin::Date::Calc;
+use base qw( Template::Plugin );
+use vars qw( $AUTOLOAD );
+*throw = \&Template::Plugin::Date::throw;
+
+sub AUTOLOAD {    # forward any method call to the Date::Calc function of that name
+    my $self = shift;
+    my $method = $AUTOLOAD;
+
+    $method =~ s/.*:://;               # strip package qualifier
+    return if $method eq 'DESTROY';
+    # can() returns undef for an unknown name, so the check below can fire
+    my $sub = Date::Calc->can($method);
+    $self->throw("no such Date::Calc method: $method")
+        unless $sub;
+
+    &$sub(@_);                         # plain function call: $self is not passed
+}
+
+package Template::Plugin::Date::Manip;
+use base qw( Template::Plugin );
+use vars qw( $AUTOLOAD );
+*throw = \&Template::Plugin::Date::throw;
+
+sub AUTOLOAD {    # forward any method call to the Date::Manip function of that name
+    my $self = shift;
+    my $method = $AUTOLOAD;
+
+    $method =~ s/.*:://;               # strip package qualifier
+    return if $method eq 'DESTROY';
+    # can() returns undef for an unknown name, so the check below can fire
+    my $sub = Date::Manip->can($method);
+    $self->throw("no such Date::Manip method: $method")
+        unless $sub;
+
+    &$sub(@_);                         # plain function call: $self is not passed
+}
+    
+    
+1;
+
+__END__
+
+=head1 NAME
+
+Template::Plugin::Date - Plugin to generate formatted date strings
+
+=head1 SYNOPSIS
+
+    [% USE date %]
+    
+    # use current time and default format
+    [% date.format %]
+    
+    # specify time as seconds since epoch
+    # or as a 'h:m:s d-m-y' or 'y-m-d h:m:s' string
+    [% date.format(960973980) %]
+    [% date.format('4:20:36 21/12/2000') %]
+    [% date.format('2000/12/21 4:20:36') %]
+    
+    # specify format
+    [% date.format(mytime, '%H:%M:%S') %]
+    
+    # specify locale
+    [% date.format(date.now, '%a %d %b %y', 'en_GB') %]
+    
+    # named parameters 
+    [% date.format(mytime, format = '%H:%M:%S') %]
+    [% date.format(locale = 'en_GB') %]
+    [% date.format(time   = date.now, 
+                   format = '%H:%M:%S', 
+                   locale = 'en_GB') %]
+    
+    # specify default format to plugin
+    [% USE date(format = '%H:%M:%S', locale = 'de_DE') %]
+    
+    [% date.format %]
+    ...
+
+=head1 DESCRIPTION
+
+The C<Date> plugin provides an easy way to generate formatted time and date
+strings by delegating to the C<POSIX> C<strftime()> routine.
+
+The plugin can be loaded via the familiar USE directive.
+
+    [% USE date %]
+
+This creates a plugin object with the default name of 'C<date>'.  An alternate
+name can be specified as such:
+
+    [% USE myname = date %]
+
+The plugin provides the C<format()> method which accepts a time value, a
+format string and a locale name.  All of these parameters are optional
+with the current system time, default format ('C<%H:%M:%S %d-%b-%Y>') and
+current locale being used respectively, if undefined.  Default values
+for the time, format and/or locale may be specified as named parameters 
+in the C<USE> directive.
+
+    [% USE date(format = '%a %d-%b-%Y', locale = 'fr_FR') %]
+
+When called without any parameters, the C<format()> method returns a string
+representing the current system time, formatted by C<strftime()> according 
+to the default format and for the default locale (which may not be the
+current one, if locale is set in the C<USE> directive).
+
+    [% date.format %]
+
+The plugin allows a time/date to be specified as seconds since the epoch,
+as is returned by C<time()>.
+
+    File last modified: [% date.format(filemod_time) %]
+
+The time/date can also be specified as a string of the form C<h:m:s d/m/y>
+or C<y/m/d h:m:s>.  Any of the characters : / - or space may be used to
+delimit fields.
+
+    [% USE day = date(format => '%A', locale => 'en_GB') %]
+    [% day.format('4:20:00 9-13-2000') %]  
+
+Output:
+
+    Tuesday
+
+A format string can also be passed to the C<format()> method, and a locale
+specification may follow that.
+
+    [% date.format(filemod, '%d-%b-%Y') %]
+    [% date.format(filemod, '%d-%b-%Y', 'en_GB') %]
+
+A fourth parameter allows you to force output in GMT, in the case of 
+seconds-since-the-epoch input:
+
+    [% date.format(filemod, '%d-%b-%Y', 'en_GB', 1) %]
+
+Note that in this case, if the local time is not GMT, then also specifying
+'C<%Z>' (time zone) in the format parameter will lead to an extremely 
+misleading result.
+
+Any or all of these parameters may be named.  Positional parameters
+should always be in the order C<($time, $format, $locale)>.
+
+    [% date.format(format => '%H:%M:%S') %]
+    [% date.format(time => filemod, format => '%H:%M:%S') %]
+    [% date.format(mytime, format => '%H:%M:%S') %]
+    [% date.format(mytime, format => '%H:%M:%S', locale => 'fr_FR') %]
+    [% date.format(mytime, format => '%H:%M:%S', gmt => 1) %]
+    ...etc...
+
+The C<now()> method returns the current system time in seconds since the 
+epoch.  
+
+    [% date.format(date.now, '%A') %]
+
+The C<calc()> method can be used to create an interface to the C<Date::Calc>
+module (if installed on your system).
+
+    [% calc = date.calc %]
+    [% calc.Monday_of_Week(22, 2001).join('/') %]
+
+The C<manip()> method can be used to create an interface to the C<Date::Manip>
+module (if installed on your system).
+
+    [% manip = date.manip %]
+    [% manip.UnixDate("Noon Yesterday","%Y %b %d %H:%M") %]
+
+=head1 AUTHORS
+
+Thierry-Michel Barral wrote the original plugin.
+
+Andy Wardley provided some minor
+fixups/enhancements, a test script and documentation.
+
+Mark D. Mills cloned C<Date::Manip> from the C<Date::Calc> sub-plugin.
+
+=head1 COPYRIGHT
+
+Copyright (C) 2000-2007 Thierry-Michel Barral, Andy Wardley.
+
+This module is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=head1 SEE ALSO
+
+L<Template::Plugin>, L<POSIX>
+
diff --git a/bench/perl/Template/Plugin/Directory.pm b/bench/perl/Template/Plugin/Directory.pm
new file mode 100644
index 0000000..fb05e37
--- /dev/null
+++ b/bench/perl/Template/Plugin/Directory.pm
@@ -0,0 +1,386 @@
+#============================================================= -*-Perl-*-
+#
+# Template::Plugin::Directory
+#
+# DESCRIPTION
+#   Plugin for encapsulating information about a file system directory.
+#
+# AUTHORS
+#   Michael Stevens <michael at etla.org>, with some mutilations from 
+#   Andy Wardley <abw at wardley.org>.
+#
+# COPYRIGHT
+#   Copyright (C) 2000-2007 Michael Stevens, Andy Wardley.
+#
+#   This module is free software; you can redistribute it and/or
+#   modify it under the same terms as Perl itself.
+#
+#============================================================================
+
+package Template::Plugin::Directory;
+
+use strict;
+use warnings;
+use Cwd;
+use File::Spec;
+use Template::Plugin::File;
+use base 'Template::Plugin::File';
+
+our $VERSION = 2.70;
+
+
+#------------------------------------------------------------------------
+# new(\%config)
+#
+# Constructor method.
+#------------------------------------------------------------------------
+
+sub new {
+    # arguments are ($context, $path) plus an optional trailing \%config
+    my $config = ref($_[-1]) eq 'HASH' ? pop(@_) : { };
+    my ($class, $context, $path) = @_;
+    return $class->throw('no directory specified')
+        unless defined $path and length $path;
+
+    # the File base class constructor creates the object itself
+    my $self = $class->SUPER::new($context, $path, $config);
+    $self->{ files } = [ ];     # filled by scan(): File objects
+    $self->{ dirs  } = [ ];     # filled by scan(): Directory objects
+    $self->{ list  } = [ ];     # filled by scan(): both, in sorted order
+    $self->{ _dir  } = { };     # filled by scan(): entry name => object
+
+    # don't read directory if 'nostat' or 'noscan' set
+    return $self if $config->{ nostat } || $config->{ noscan };
+
+    # 'isdir' is set by the base class constructor when it stats the path
+    $self->throw("$path: not a directory")
+        unless $self->{ isdir };
+    $self->scan($config);
+
+    return $self;
+}
+
+
+#------------------------------------------------------------------------
+# scan(\%config)
+#
+# Scan directory for files and sub-directories.
+#------------------------------------------------------------------------
+
+sub scan {
+    my ($self, $config) = @_;
+    $config ||= { };
+
+    # set 'noscan' in config if recurse isn't set, to ensure Directories
+    # created don't try to scan deeper (this modifies the caller's hash)
+    $config->{ noscan } = 1 unless $config->{ recurse };
+
+    my $dir = $self->{ abs };
+    opendir(my $dh, $dir) or return $self->throw("$dir: $!");
+    my @names = readdir $dh;
+    closedir($dh)
+        or return $self->throw("$dir close: $!");
+
+    # empty the existing containers in place; the name index is emptied
+    # too, so that a re-scan drops entries which have since disappeared
+    my ($path, $files, $dirs, $list) = @$self{ qw( path files dirs list ) };
+    @$files = @$dirs = @$list = ();
+    %{ $self->{ _dir } } = ();
+
+    foreach my $name (sort @names) {
+        next if $name =~ /^\./;     # skips '.', '..' and all dot files
+        my $abs = File::Spec->catfile($dir, $name);
+        my $rel = File::Spec->catfile($path, $name);
+        my $item;
+        if (-d $abs) {
+            $item = Template::Plugin::Directory->new(undef, $rel, $config);
+            push(@$dirs, $item);
+        }
+        else {
+            $item = Template::Plugin::File->new(undef, $rel, $config);
+            push(@$files, $item);
+        }
+        push(@$list, $item);
+        $self->{ _dir }->{ $name } = $item;
+    }
+
+    return '';      # always the empty string on success
+}
+
+
+#------------------------------------------------------------------------
+# file($filename)
+#
+# Fetch a named file from this directory.
+#------------------------------------------------------------------------
+
+sub file {
+    my ($dir, $entry) = @_;     # directory object, bare entry name
+    return $dir->{ _dir }{ $entry };    # item indexed by scan(), or undef
+}
+
+
+#------------------------------------------------------------------------
+# present($view)
+#
+# Present self to a Template::View
+#------------------------------------------------------------------------
+
+sub present {
+    my ($dir, $view) = @_;
+    return $view->view_directory($dir);     # the view renders the directory
+}
+
+
+#------------------------------------------------------------------------
+# content($view)
+# 
+# Present directory content to a Template::View.
+#------------------------------------------------------------------------
+
+sub content {
+    my ($dir, $view) = @_;
+    # without a view, hand back the raw list of File/Directory items
+    return $dir->{ list } unless $view;
+
+    my $text = '';
+    $text .= $_->present($view) for @{ $dir->{ list } };
+    return $text;
+}
+
+
+#------------------------------------------------------------------------
+# throw($msg)
+#
+# Throw a 'Directory' exception.
+#------------------------------------------------------------------------
+
+sub throw {
+    my (undef, $message) = @_;      # the invocant itself is not used
+    die Template::Exception->new('Directory', $message);
+}
+
+1;
+
+__END__
+
+=head1 NAME
+
+Template::Plugin::Directory - Plugin for generating directory listings
+
+=head1 SYNOPSIS
+
+    [% USE dir = Directory(dirpath) %]
+    
+    # files returns list of regular files
+    [% FOREACH file = dir.files %]
+       [% file.name %] [% file.path %] ...
+    [% END %]
+    
+    # dirs returns list of sub-directories
+    [% FOREACH subdir = dir.dirs %]
+       [% subdir.name %] [% subdir.path %] ...
+    [% END %]
+    
+    # list returns both interleaved in order
+    [% FOREACH item = dir.list %]
+       [% IF item.isdir %]
+          Directory: [% item.name %]
+       [% ELSE %]
+          File: [% item.name %]
+       [% END %]
+    [% END %]
+    
+    # define a VIEW to display dirs/files
+    [% VIEW myview %]
+       [% BLOCK file %]
+       File: [% item.name %]
+       [% END %]
+       
+       [% BLOCK directory %]
+       Directory: [% item.name %] 
+       [% item.content(myview) | indent -%]
+       [% END %]
+    [% END %]
+    
+    # display directory content using view
+    [% myview.print(dir) %]
+
+=head1 DESCRIPTION
+
+This Template Toolkit plugin provides a simple interface to directory
+listings.  It is derived from the L<Template::Plugin::File> module and
+uses L<Template::Plugin::File> object instances to represent files within
+a directory.  Sub-directories within a directory are represented by
+further C<Template::Plugin::Directory> instances.
+
+The constructor expects a directory name as an argument.
+
+    [% USE dir = Directory('/tmp') %]
+
+It then provides access to the files and sub-directories contained within 
+the directory.
+
+    # regular files (not directories)
+    [% FOREACH file IN dir.files %]
+       [% file.name %]
+    [% END %]
+
+    # directories only
+    [% FOREACH file IN dir.dirs %]
+       [% file.name %]
+    [% END %]
+
+    # files and/or directories
+    [% FOREACH file IN dir.list %]
+       [% file.name %] ([% file.isdir ? 'directory' : 'file' %])
+    [% END %]
+
+The plugin constructor will throw a C<Directory> error if the specified
+path does not exist, is not a directory or fails to C<stat()> (see
+L<Template::Plugin::File>).  Otherwise, it will scan the directory and
+create lists named 'C<files>' containing files, 'C<dirs>' containing
+directories and 'C<list>' containing both files and directories combined.
+The C<nostat> option can be set to disable all file/directory checks
+and directory scanning.
+
+Each file in the directory will be represented by a
+L<Template::Plugin::File> object instance, and each directory by another
+C<Template::Plugin::Directory>.  If the C<recurse> flag is set, then those
+directories will contain further nested entries, and so on.  With the
+C<recurse> flag unset, as it is by default, then each is just a place
+marker for the directory and does not contain any further content
+unless its C<scan()> method is explicitly called.  The C<isdir> flag can
+be tested against files and/or directories, returning true if the item
+is a directory or false if it is a regular file.
+
+    [% FOREACH file = dir.list %]
+       [% IF file.isdir %]
+          * Directory: [% file.name %]
+       [% ELSE %]
+          * File: [% file.name %]
+       [% END %]
+    [% END %]
+
+This example shows how you might walk down a directory tree, displaying 
+content as you go.  With the recurse flag disabled, as is the default, 
+we need to explicitly call the C<scan()> method on each directory, to force
+it to lookup files and further sub-directories contained within. 
+
+    [% USE dir = Directory(dirpath) %]
+    * [% dir.path %]
+    [% INCLUDE showdir %]
+    
+    [% BLOCK showdir -%]
+      [% FOREACH file = dir.list -%]
+        [% IF file.isdir -%]
+        * [% file.name %]
+          [% file.scan -%]
+          [% INCLUDE showdir dir=file FILTER indent(4) -%]
+        [% ELSE -%]
+        - [% file.name %]
+        [% END -%]
+      [% END -%]
+     [% END %]
+
+This example is adapted (with some re-formatting for clarity) from
+a test in F<t/directry.t> which produces the following output:
+
+    * test/dir
+        - file1
+        - file2
+        * sub_one
+            - bar
+            - foo
+        * sub_two
+            - waz.html
+            - wiz.html
+        - xyzfile
+
+The C<recurse> flag can be set (disabled by default) to cause the
+constructor to automatically recurse down into all sub-directories,
+creating a new C<Template::Plugin::Directory> object for each one and 
+filling it with any further content.  In this case there is no need
+to explicitly call the C<scan()> method.
+
+    [% USE dir = Directory(dirpath, recurse=1) %]
+       ...
+       
+        [% IF file.isdir -%]
+        * [% file.name %]
+          [% INCLUDE showdir dir=file FILTER indent(4) -%]
+        [% ELSE -%]
+           ...
+
+The directory plugin also provides support for views. A view can be defined as
+a C<VIEW ... END> block and should contain C<BLOCK> definitions for files
+('C<file>') and directories ('C<directory>').
+
+    [% VIEW myview %]
+    [% BLOCK file %]
+       - [% item.name %]
+    [% END %]
+    
+    [% BLOCK directory %]
+       * [% item.name %]
+         [% item.content(myview) FILTER indent %]
+    [% END %]
+    [% END %]
+
+The view C<print()> method can then be called, passing the
+C<Directory> object as an argument.
+
+    [% USE dir = Directory(dirpath, recurse=1) %]
+    [% myview.print(dir) %]
+
+When a directory is presented to a view, either as C<[% myview.print(dir) %]>
+or C<[% dir.present(view) %]>, then the C<directory> C<BLOCK> within the
+C<myview> C<VIEW> is processed. The C<item> variable will be set to alias the
+C<Directory> object.
+
+    [% BLOCK directory %]
+       * [% item.name %]
+         [% item.content(myview) FILTER indent %]
+    [% END %]
+
+In this example, the directory name is first printed and the content(view)
+method is then called to present each item within the directory to the view.
+Further directories will be mapped to the C<directory> block, and files will be
+mapped to the C<file> block.
+
+With the recurse option disabled, as it is by default, the C<directory>
+block should explicitly call a C<scan()> on each directory.
+
+    [% VIEW myview %]
+    [% BLOCK file %]
+       - [% item.name %]
+    [% END %]
+    
+    [% BLOCK directory %]
+       * [% item.name %]
+         [% item.scan %]
+         [% item.content(myview) FILTER indent %]
+    [% END %]
+    [% END %]
+    
+    [% USE dir = Directory(dirpath) %]
+    [% myview.print(dir) %]
+
+=head1 AUTHORS
+
+Michael Stevens wrote the original Directory plugin on which this is based.
+Andy Wardley split it into separate L<File|Template::Plugin::File> and
+L<Directory|Template::Plugin::Directory> plugins, added some extra code and
+documentation for C<VIEW> support, and made a few other minor tweaks.
+
+=head1 COPYRIGHT
+
+Copyright (C) 2000-2007 Michael Stevens, Andy Wardley.
+
+This module is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=head1 SEE ALSO
+
+L<Template::Plugin>, L<Template::Plugin::File>, L<Template::View>
+
diff --git a/bench/perl/Template/Plugin/Dumper.pm b/bench/perl/Template/Plugin/Dumper.pm
new file mode 100644
index 0000000..f1e0e8d
--- /dev/null
+++ b/bench/perl/Template/Plugin/Dumper.pm
@@ -0,0 +1,152 @@
+#==============================================================================
+# 
+# Template::Plugin::Dumper
+#
+# DESCRIPTION
+#
+# A Template Plugin to provide a Template Interface to Data::Dumper
+#
+# AUTHOR
+#   Simon Matthews <sam at tt2.org>
+#
+# COPYRIGHT
+#   Copyright (C) 2000 Simon Matthews.  All Rights Reserved
+#
+#   This module is free software; you can redistribute it and/or
+#   modify it under the same terms as Perl itself.
+#
+#==============================================================================
+
+package Template::Plugin::Dumper;
+
+use strict;
+use warnings;
+use base 'Template::Plugin';
+use Data::Dumper;
+
+our $VERSION = 2.70;
+our $DEBUG   = 0 unless defined $DEBUG;
+our @DUMPER_ARGS = qw( Indent Pad Varname Purity Useqq Terse Freezer
+                       Toaster Deepcopy Quotekeys Bless Maxdepth );
+our $AUTOLOAD;
+
+#==============================================================================
+#                      -----  CLASS METHODS -----
+#==============================================================================
+
+#------------------------------------------------------------------------
+# new($context, \%params)
+#------------------------------------------------------------------------
+
+sub new {
+    my ($class, $context, $params) = @_;
+    $params ||= { };
+
+    # Copy each recognised option, given in lower case or in its exact
+    # Data::Dumper spelling (lower case is tried first), into the package
+    # variable $Data::Dumper::<Option>.  Those variables are global, so a
+    # setting remains in effect after this constructor returns.
+    foreach my $option (@DUMPER_ARGS) {
+        no strict 'refs';       # as before, covers the lookups as well
+        my $value = $params->{ lc $option };
+        $value = $params->{ $option } unless defined $value;
+        next unless defined $value;
+        ${"Data\::Dumper\::$option"} = $value;
+    }
+
+    return bless { _CONTEXT => $context }, $class;
+}
+
+sub dump {
+    my ($self, @values) = @_;
+    # scalar context makes Dumper() return one string for all the values
+    return scalar Dumper(@values);
+}
+
+
+sub dump_html {
+    my $self = shift;
+    my $content = Dumper @_;        # scalar context: one string for all
+    for ($content) {
+        s/&/&amp;/g;                # first, so new entities stay intact
+        s/</&lt;/g;
+        s/>/&gt;/g;
+        s/\n/<br>\n/g;              # keep the newline after each <br>
+    }
+    return $content;
+}
+
+1;
+
+__END__
+
+=head1 NAME
+
+Template::Plugin::Dumper - Plugin interface to Data::Dumper
+
+=head1 SYNOPSIS
+
+    [% USE Dumper %]
+    
+    [% Dumper.dump(variable) %]
+    [% Dumper.dump_html(variable) %]
+
+=head1 DESCRIPTION
+
+This is a very simple Template Toolkit Plugin Interface to the L<Data::Dumper>
+module.  A C<Dumper> object will be instantiated via the following directive:
+
+    [% USE Dumper %]
+
+As a standard plugin, you can also specify its name in lower case:
+
+    [% USE dumper %]
+
+The C<Data::Dumper> C<Pad>, C<Indent> and C<Varname> options are supported
+as constructor arguments to affect the output generated.  See L<Data::Dumper>
+for further details.
+
+    [% USE dumper(Indent=0, Pad="<br>") %]
+
+These options can also be specified in lower case.
+
+    [% USE dumper(indent=0, pad="<br>") %]
+
+=head1 METHODS
+
+There are two methods supported by the C<Dumper> object.  Each will
+output into the template the contents of the variables passed to the
+object method.
+
+=head2 dump()
+
+Generates a raw text dump of the data structure(s) passed
+
+    [% USE Dumper %]
+    [% Dumper.dump(myvar) %]
+    [% Dumper.dump(myvar, yourvar) %]
+
+=head2 dump_html()
+
+Generates a dump of the data structures, as per L<dump()>, but with the 
+characters E<lt>, E<gt> and E<amp> converted to their equivalent HTML
+entities and newlines converted to E<lt>brE<gt>.
+
+    [% USE Dumper %]
+    [% Dumper.dump_html(myvar) %]
+
+=head1 AUTHOR
+
+Simon Matthews E<lt>sam at tt2.orgE<gt>
+
+=head1 COPYRIGHT
+
+Copyright (C) 2000 Simon Matthews.  All Rights Reserved.
+
+This module is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=head1 SEE ALSO
+
+L<Template::Plugin>, L<Data::Dumper>
+
diff --git a/bench/perl/Template/Plugin/File.pm b/bench/perl/Template/Plugin/File.pm
new file mode 100644
index 0000000..3519972
--- /dev/null
+++ b/bench/perl/Template/Plugin/File.pm
@@ -0,0 +1,391 @@
+#============================================================= -*-Perl-*-
+#
+# Template::Plugin::File
+#
+# DESCRIPTION
+#  Plugin for encapsulating information about a system file.
+#
+# AUTHOR
+#   Originally written by Michael Stevens <michael at etla.org> as the
+#   Directory plugin, then mutilated by Andy Wardley <abw at kfs.org> 
+#   into separate File and Directory plugins, with some additional 
+#   code for working with views, etc.
+#
+# COPYRIGHT
+#   Copyright 2000-2007 Michael Stevens, Andy Wardley.
+#
+#   This module is free software; you can redistribute it and/or
+#   modify it under the same terms as Perl itself.
+#
+#============================================================================
+
+package Template::Plugin::File;
+
+use strict;
+use warnings;
+use Cwd;
+use File::Spec;
+use File::Basename;
+use base 'Template::Plugin';
+
+our $VERSION = 2.71;
+
+our @STAT_KEYS = qw( dev ino mode nlink uid gid rdev size 
+                     atime mtime ctime blksize blocks );
+
+
+#------------------------------------------------------------------------
+# new($context, $file, \%config)
+#
+# Create a new File object.  Takes the pathname of the file as
+# the argument following the context and an optional 
+# hash reference of configuration parameters.
+#------------------------------------------------------------------------
+
+sub new {
+    my $config = ref($_[-1]) eq 'HASH' ? pop(@_) : { };
+    my ($class, $context, $path) = @_;
+    my ($root, $home, @stat, $abs);
+    # arguments: ($context, $path) plus an optional trailing \%config
+    return $class->throw('no file specified')
+        unless defined $path and length $path;
+
+    # root: '' for an absolute path, otherwise the 'root' option (when
+    # it has a true value; it is joined to the path to give 'abs' below)
+    if (File::Spec->file_name_is_absolute($path)) {
+        $root = '';
+    }
+    elsif (($root = $config->{ root })) {
+        # strip any trailing '/' from root
+        $root =~ s[/$][];
+    }
+    else {
+        $root = '';
+    }
+    # split off the directory and any final '.word' suffix of the path
+    my ($name, $dir, $ext) = fileparse($path, '\.\w+');
+    # fixup: no trailing '/' on dir, '.' => '', name regains its suffix
+    $dir  =~ s[/$][];
+    $dir  = '' if $dir eq '.';
+    $name = $name . $ext;
+    $ext  =~ s/^\.//g;      # ext is stored without its leading dot
+    # home: one '..' per directory in dir (a leading empty field dropped)
+    my @fields = File::Spec->splitdir($dir);
+    shift @fields if @fields && ! length $fields[0];
+    $home = join('/', ('..') x @fields);
+    $abs = File::Spec->catfile($root ? $root : (), $path);
+    # 'stat' is the explicit option if defined, else the inverse of 'nostat'
+    my $self = { 
+        path  => $path,
+        name  => $name,
+        root  => $root,
+        home  => $home,
+        dir   => $dir,
+        ext   => $ext,
+        abs   => $abs,
+        user  => '',
+        group => '',
+        isdir => '',
+        stat  => defined $config->{ stat } 
+                       ? $config->{ stat } 
+                       : ! $config->{ nostat },
+        map { ($_ => '') } @STAT_KEYS,
+    };
+
+    if ($self->{ stat }) {
+        (@stat = stat( $abs ))
+            || return $class->throw("$abs: $!");
+        # stat() fields are stored under the names listed in @STAT_KEYS
+        @$self{ @STAT_KEYS } = @stat;
+        # names for uid/gid, else the ids; eval in case the lookups die
+        unless ($config->{ noid }) {
+            $self->{ user  } = eval { getpwuid( $self->{ uid }) || $self->{ uid } };
+            $self->{ group } = eval { getgrgid( $self->{ gid }) || $self->{ gid } };
+        }
+        $self->{ isdir } = -d $abs;
+    }
+
+    bless $self, $class;
+}
+
+
+#-------------------------------------------------------------------------
+# rel($file)
+#
+# Generate a relative filename for some other file relative to this one.
+#------------------------------------------------------------------------
+
+sub rel {
+    my ($from, $to) = @_;
+    # another object of the same class stands for its path (same root assumed)
+    $to = $to->{ path } if ref $to eq ref $from;
+    # absolute targets, or an empty 'home', leave the target unchanged
+    return $to if $to =~ m[^/] or not $from->{ home };
+    return join('/', $from->{ home }, $to);
+}
+
+
+#------------------------------------------------------------------------
+# present($view)
+#
+# Present self to a Template::View.
+#------------------------------------------------------------------------
+
+sub present {
+    my ($file, $view) = @_;
+    return $view->view_file($file);     # the view renders the file
+}
+
+
+sub throw {
+    my (undef, $message) = @_;      # the invocant itself is not used
+    die Template::Exception->new('File', $message);
+}
+
+1;
+
+__END__
+
+=head1 NAME
+
+Template::Plugin::File - Plugin providing information about files
+
+=head1 SYNOPSIS
+
+    [% USE File(filepath) %]
+    [% File.path %]         # full path
+    [% File.name %]         # filename
+    [% File.dir %]          # directory
+
+=head1 DESCRIPTION
+
+This plugin provides an abstraction of a file.  It can be used to 
+fetch details about files from the file system, or to represent abstract
+files (e.g. when creating an index page) that may or may not exist on 
+a file system.
+
+A file name or path should be specified as a constructor argument.  e.g.
+
+    [% USE File('foo.html') %]
+    [% USE File('foo/bar/baz.html') %]
+    [% USE File('/foo/bar/baz.html') %]
+
+The file should exist on the current file system (unless C<nostat>
+option set, see below) as an absolute file when specified with a
+leading 'C</>' as per 'C</foo/bar/baz.html>', or otherwise as one relative
+to the current working directory.  The constructor performs a C<stat()>
+on the file and makes the 13 elements returned available as the plugin
+items:
+
+    dev ino mode nlink uid gid rdev size 
+    atime mtime ctime blksize blocks
+
+e.g.
+
+    [% USE File('/foo/bar/baz.html') %]
+    
+    [% File.mtime %]
+    [% File.mode %]
+    ...
+
+In addition, the C<user> and C<group> items are set to contain the user
+and group names as returned by calls to C<getpwuid()> and C<getgrgid()> for
+the file C<uid> and C<gid> elements, respectively.  On Win32 platforms
+on which C<getpwuid()> and C<getgrgid()> are not available, these values are
+undefined.
+
+    [% USE File('/tmp/foo.html') %]
+    [% File.uid %]      # e.g. 500
+    [% File.user %]     # e.g. abw
+
+This user/group lookup can be disabled by setting the C<noid> option.
+
+    [% USE File('/tmp/foo.html', noid=1) %]
+    [% File.uid %]      # e.g. 500
+    [% File.user %]     # nothing
+
+The C<isdir> flag will be set if the file is a directory.
+
+    [% USE File('/tmp') %]
+    [% File.isdir %]    # 1
+
+If the C<stat()> on the file fails (e.g. file doesn't exist, bad
+permission, etc) then the constructor will throw a C<File> exception.
+This can be caught within a C<TRY...CATCH> block.
+
+    [% TRY %]
+       [% USE File('/tmp/myfile') %]
+       File exists!
+    [% CATCH File %]
+       File error: [% error.info %]
+    [% END %]
+
+Note the capitalisation of the exception type, 'C<File>', to indicate an
+error thrown by the C<File> plugin, to distinguish it from a regular
+C<file> exception thrown by the Template Toolkit.
+
+Note that the C<File> plugin can also be referenced by the lower case
+name 'C<file>'.  However, exceptions are always thrown of the C<File>
+type, regardless of the capitalisation of the plugin name used.
+
+    [% USE file('foo.html') %]
+    [% file.mtime %]
+
+As with any other Template Toolkit plugin, an alternate name can be 
+specified for the object created.
+
+    [% USE foo = file('foo.html') %]
+    [% foo.mtime %]
+
+The C<nostat> option can be specified to prevent the plugin constructor
+from performing a C<stat()> on the file specified.  In this case, the
+file does not have to exist in the file system, no attempt will be made
+to verify that it does, and no error will be thrown if it doesn't.
+The entries for the items usually returned by C<stat()> will be set 
+empty.
+
+    [% USE file('/some/where/over/the/rainbow.html', nostat=1) %]
+    [% file.mtime %]     # nothing
+
+=head1 METHODS
+
+All C<File> plugins, regardless of the C<nostat> option, have set a number
+of items relating to the original path specified.
+
+=head2 path
+
+The full, original file path specified to the constructor.
+
+    [% USE file('/foo/bar.html') %]
+    [% file.path %]     # /foo/bar.html
+
+=head2 name
+
+The name of the file without any leading directories.
+
+    [% USE file('/foo/bar.html') %]
+    [% file.name %]     # bar.html
+
+=head2 dir
+
+The directory element of the path with the filename removed.
+
+    [% USE file('/foo/bar.html') %]
+    [% file.dir %]      # /foo
+
+=head2 ext
+
+The file extension, if any, appearing at the end of the path following 
+a 'C<.>' (not included in the extension).
+
+    [% USE file('/foo/bar.html') %]
+    [% file.ext %]      # html
+
+=head2 home
+
+This contains a string of the form 'C<../..>' to represent the upward path
+from a file to its root directory.
+
+    [% USE file('bar.html') %]
+    [% file.home %]     # nothing
+    
+    [% USE file('foo/bar.html') %]
+    [% file.home %]     # ..
+    
+    [% USE file('foo/bar/baz.html') %]
+    [% file.home %]     # ../..
+
+=head2 root
+
+The C<root> item can be specified as a constructor argument, indicating
+a root directory in which the named file resides.  This is otherwise
+set empty.
+
+    [% USE file('foo/bar.html', root='/tmp') %]
+    [% file.root %]     # /tmp
+
+=head2 abs
+
+This returns the absolute file path by constructing a path from the 
+C<root> and C<path> options.
+
+    [% USE file('foo/bar.html', root='/tmp') %]
+    [% file.path %]     # foo/bar.html
+    [% file.root %]     # /tmp
+    [% file.abs %]      # /tmp/foo/bar.html
+
+=head2 rel(path)
+
+This returns a relative path from the current file to another path specified
+as an argument.  It is constructed by appending the path to the 'C<home>' 
+item.
+
+    [% USE file('foo/bar/baz.html') %]
+    [% file.rel('wiz/waz.html') %]      # ../../wiz/waz.html
+
+=head1 EXAMPLES
+
+    [% USE file('/foo/bar/baz.html') %]
+    
+    [% file.path  %]      # /foo/bar/baz.html
+    [% file.dir   %]      # /foo/bar
+    [% file.name  %]      # baz.html
+    [% file.home  %]      # ../..
+    [% file.root  %]      # ''
+    [% file.abs   %]      # /foo/bar/baz.html
+    [% file.ext   %]      # html
+    [% file.mtime %]      # 987654321
+    [% file.atime %]      # 987654321
+    [% file.uid   %]      # 500
+    [% file.user  %]      # abw
+
+    [% USE file('foo.html') %]
+    
+    [% file.path %]           # foo.html
+    [% file.dir  %]       # ''
+    [% file.name %]           # foo.html
+    [% file.root %]       # ''
+    [% file.home %]       # ''
+    [% file.abs  %]       # foo.html
+
+    [% USE file('foo/bar/baz.html') %]
+    
+    [% file.path %]           # foo/bar/baz.html
+    [% file.dir  %]       # foo/bar
+    [% file.name %]           # baz.html
+    [% file.root %]       # ''
+    [% file.home %]       # ../..
+    [% file.abs  %]       # foo/bar/baz.html
+
+    [% USE file('foo/bar/baz.html', root='/tmp') %]
+    
+    [% file.path %]           # foo/bar/baz.html
+    [% file.dir  %]       # foo/bar
+    [% file.name %]           # baz.html
+    [% file.root %]       # /tmp
+    [% file.home %]       # ../..
+    [% file.abs  %]       # /tmp/foo/bar/baz.html
+
+    # calculate other file paths relative to this file and its root
+    [% USE file('foo/bar/baz.html', root => '/tmp/tt2') %]
+    
+    [% file.rel('baz/qux.html') %]          # ../../baz/qux.html
+    [% file.rel('wiz/woz.html') %]      # ../../wiz/woz.html
+
+=head1 AUTHORS
+
+Michael Stevens wrote the original C<Directory> plugin on which this is based.
+Andy Wardley split it into separate C<File> and C<Directory> plugins, added
+some extra code and documentation for C<VIEW> support, and made a few other
+minor tweaks.
+
+=head1 COPYRIGHT
+
+Copyright 2000-2007 Michael Stevens, Andy Wardley.
+
+This module is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=head1 SEE ALSO
+
+L<Template::Plugin>, L<Template::Plugin::Directory>, L<Template::View>
+
diff --git a/bench/perl/Template/Plugin/Filter.pm b/bench/perl/Template/Plugin/Filter.pm
new file mode 100644
index 0000000..420cc94
--- /dev/null
+++ b/bench/perl/Template/Plugin/Filter.pm
@@ -0,0 +1,411 @@
+#============================================================= -*-Perl-*-
+#
+# Template::Plugin::Filter
+#
+# DESCRIPTION
+#   Template Toolkit module implementing a base class plugin
+#   object which acts like a filter and can be used with the 
+#   FILTER directive.
+#
+# AUTHOR
+#   Andy Wardley   <abw at wardley.org>
+#
+# COPYRIGHT
+#   Copyright (C) 2001-2009 Andy Wardley.  All Rights Reserved.
+#
+#   This module is free software; you can redistribute it and/or
+#   modify it under the same terms as Perl itself.
+#
+#============================================================================
+
+package Template::Plugin::Filter;
+
+use strict;
+use warnings;
+use base 'Template::Plugin';
+use Scalar::Util 'weaken';
+
+
+our $VERSION = 1.38;
+our $DYNAMIC = 0 unless defined $DYNAMIC;
+
+
+sub new {
+    my ($class, $context, @args) = @_;
+
+    # A trailing hash reference holds the named parameters from the USE
+    # directive; whatever is left in @args are the positional ones.
+    my $config = (@args && ref $args[-1] eq 'HASH') ? pop @args : { };
+
+    # A subclass can define its own $DYNAMIC package variable; when it
+    # does not, the value declared by this base class is used instead.
+    my $dynamic = do { no strict 'refs'; ${"$class\::DYNAMIC"} };
+    $dynamic = $DYNAMIC if !defined $dynamic;
+
+    my %fields = (
+        _CONTEXT => $context,
+        _DYNAMIC => $dynamic,
+        _ARGS    => \@args,
+        _CONFIG  => $config,
+    );
+    my $self = bless \%fields, $class;
+
+    return $self->init($config) || $class->error($self->error());
+}
+
+
+sub init {    # default initialiser: hands the object back unchanged
+    my ($self) = @_;
+    $self;
+}
+
+
+sub factory {
+    my ($self) = @_;
+    my $this   = $self;
+
+    # The filter closures below hold a strong reference to the plugin via
+    # $this.  It used to be weakened, but that caused problems, see
+    # https://rt.cpan.org/Ticket/Display.html?id=46691 : if the plugin is
+    # loaded twice in different templates (one INCLUDEd into another) the
+    # filter seems to get garbage collected when the inner template ends.
+    # So the weaken() call stays commented out until someone complains
+    # that this module is leaking memory.
+
+    # weaken($this);
+
+    # static filter: one closure, created on first use and then cached
+    unless ($self->{ _DYNAMIC }) {
+        return $self->{ _STATIC_FILTER } ||= sub {
+            $this->filter(shift);
+        };
+    }
+
+    # dynamic filter: a cached [ factory, 1 ] pair; the factory receives the
+    # FILTER arguments and returns a closure that passes them to filter()
+    return $self->{ _DYNAMIC_FILTER } ||= [ sub {
+        my ($context, @args) = @_;
+        my $config = ref $args[-1] eq 'HASH' ? pop(@args) : { };
+        return sub { $this->filter(shift, \@args, $config) };
+    }, 1 ];
+}
+
+sub filter {    # identity filter; subclasses override this to change the text
+    my (undef, $text) = @_;
+    $text;
+}
+
+
+sub merge_config {
+    # returns the stored _CONFIG itself when there is nothing to merge,
+    # otherwise a new hash in which the $newcfg entries take precedence
+    my ($self, $newcfg) = @_;
+    $newcfg ? +{ %{ $self->{ _CONFIG } }, %$newcfg } : $self->{ _CONFIG };
+}
+
+
+sub merge_args {
+    # returns the stored _ARGS itself when there is nothing to merge,
+    # otherwise a new list with the $newargs items appended
+    my ($self, $newargs) = @_;
+    $newargs ? [ @{ $self->{ _ARGS } }, @$newargs ] : $self->{ _ARGS };
+}
+
+
+sub install_filter {    # define a named filter in the context; returns $self
+    my ($self, $name) = @_;
+    $self->{ _CONTEXT }->define_filter($name, $self->factory);
+    $self;
+}
+
+
+
+1;
+
+__END__
+
+=head1 NAME
+
+Template::Plugin::Filter - Base class for plugin filters
+
+=head1 SYNOPSIS
+
+    package MyOrg::Template::Plugin::MyFilter;
+    
+    use Template::Plugin::Filter;
+    use base qw( Template::Plugin::Filter );
+    
+    sub filter {
+        my ($self, $text) = @_;
+        
+        # ...mungify $text...
+        
+        return $text;
+    }
+
+    # now load it...
+    [% USE MyFilter %]
+    
+    # ...and use the returned object as a filter
+    [% FILTER $MyFilter %]
+      ...
+    [% END %]
+
+=head1 DESCRIPTION
+
+This module implements a base class for plugin filters.  It hides
+the underlying complexity involved in creating and using filters
+that get defined and made available by loading a plugin.
+
+To use the module, simply create your own plugin module that is 
+inherited from the C<Template::Plugin::Filter> class.
+
+    package MyOrg::Template::Plugin::MyFilter;
+    
+    use Template::Plugin::Filter;
+    use base qw( Template::Plugin::Filter );
+
+Then simply define your C<filter()> method.  When called, you get
+passed a reference to your plugin object (C<$self>) and the text
+to be filtered.
+
+    sub filter {
+        my ($self, $text) = @_;
+        
+        # ...mungify $text...
+        
+        return $text;
+    }
+
+To use your custom plugin, you have to make sure that the Template
+Toolkit knows about your plugin namespace.
+
+    my $tt2 = Template->new({
+        PLUGIN_BASE => 'MyOrg::Template::Plugin',
+    });
+
+Or for individual plugins you can do it like this:
+
+    my $tt2 = Template->new({
+        PLUGINS => {
+            MyFilter => 'MyOrg::Template::Plugin::MyFilter',
+        },
+    });
+
+Then you C<USE> your plugin in the normal way.
+
+    [% USE MyFilter %]
+
+The object returned is stored in the variable of the same name,
+'C<MyFilter>'.  When you come to use it as a C<FILTER>, you should add
+a dollar prefix.  This indicates that you want to use the filter 
+stored in the variable 'C<MyFilter>' rather than the filter named 
+'C<MyFilter>', which is an entirely different thing (see later for 
+information on defining filters by name).
+
+    [% FILTER $MyFilter %]
+       ...text to be filtered...
+    [% END %]
+
+You can, of course, assign it to a different variable.
+
+    [% USE blat = MyFilter %]
+    
+    [% FILTER $blat %]
+       ...text to be filtered...
+    [% END %]
+
+Any configuration parameters passed to the plugin constructor from the
+C<USE> directive are stored internally in the object for inspection by
+the C<filter()> method (or indeed any other method).  Positional
+arguments are stored as a reference to a list in the C<_ARGS> item while
+named configuration parameters are stored as a reference to a hash
+array in the C<_CONFIG> item.
+
+For example, loading a plugin as shown here:
+
+    [% USE blat = MyFilter 'foo' 'bar' baz = 'blam' %]
+
+would allow the C<filter()> method to do something like this:
+
+    sub filter {
+        my ($self, $text) = @_;
+        
+        my $args = $self->{ _ARGS   };  # [ 'foo', 'bar' ]
+        my $conf = $self->{ _CONFIG };  # { baz => 'blam' }
+        
+        # ...munge $text...
+        
+        return $text;
+    }
+
+By default, plugins derived from this module will create static
+filters.  A static filter is created once when the plugin gets 
+loaded via the C<USE> directive and re-used for all subsequent
+C<FILTER> operations.  That means that any arguments specified with
+the C<FILTER> directive are ignored.
+
+Dynamic filters, on the other hand, are re-created each time 
+they are used by a C<FILTER> directive.  This allows them to act
+on any parameters passed from the C<FILTER> directive and modify
+their behaviour accordingly.  
+
+There are two ways to create a dynamic filter.  The first is to
+define a C<$DYNAMIC> class variable set to a true value.
+
+    package MyOrg::Template::Plugin::MyFilter;
+    use base 'Template::Plugin::Filter';
+    our $DYNAMIC = 1;
+
+The other way is to set the internal C<_DYNAMIC> value within the C<init()>
+method which gets called by the C<new()> constructor.
+
+    sub init {
+        my $self = shift;
+        $self->{ _DYNAMIC } = 1;
+        return $self;
+    }
+
+When this is set to a true value, the plugin will automatically
+create a dynamic filter.  The outcome is that the C<filter()> method
+will now also get passed a reference to an array of positional
+arguments and a reference to a hash array of named parameters.
+
+So, using a plugin filter like this:
+
+    [% FILTER $blat 'foo' 'bar' baz = 'blam' %]
+
+would allow the C<filter()> method to work like this:
+
+    sub filter {
+        my ($self, $text, $args, $conf) = @_;
+        
+        # $args = [ 'foo', 'bar' ]
+        # $conf = { baz => 'blam' }
+    }
+
+In this case you can pass parameters to both the USE and FILTER directives,
+so your filter() method should probably take that into account.  
+
+    [% USE MyFilter 'foo' wiz => 'waz' %]
+    
+    [% FILTER $MyFilter 'bar' biz => 'baz' %]
+       ...
+    [% END %]
+
+You can use the C<merge_args()> and C<merge_config()> methods to do a quick
+and easy job of merging the local (e.g. C<FILTER>) parameters with the
+internal (e.g. C<USE>) values and returning new sets of conglomerated
+data.
+
+    sub filter {
+        my ($self, $text, $args, $conf) = @_;
+        
+        $args = $self->merge_args($args); 
+        $conf = $self->merge_config($conf);
+        
+        # $args = [ 'foo', 'bar' ]      
+        # $conf = { wiz => 'waz', biz => 'baz' }        
+        ...
+    }
+
+You can also have your plugin install itself as a named filter by
+calling the C<install_filter()> method from the C<init()> method.  You 
+should provide a name for the filter, something that you might 
+like to make a configuration option.
+
+    sub init {
+        my $self = shift;
+        my $name = $self->{ _CONFIG }->{ name } || 'myfilter';
+        $self->install_filter($name);
+        return $self;
+    }
+
+This allows the plugin filter to be used as follows:
+
+    [% USE MyFilter %]
+    
+    [% FILTER myfilter %] 
+       ... 
+    [% END %]
+
+or
+
+    [% USE MyFilter name = 'swipe' %]
+        
+    [% FILTER swipe %] 
+       ... 
+    [% END %]
+
+Alternately, you can allow a filter name to be specified as the 
+first positional argument.
+
+    sub init {
+        my $self = shift;
+        my $name = $self->{ _ARGS }->[0] || 'myfilter';
+        $self->install_filter($name);
+        return $self;
+    }
+
+    [% USE MyFilter 'swipe' %]
+    
+    [% FILTER swipe %]
+       ...
+    [% END %]
+
+=head1 EXAMPLE
+
+Here's a complete example of a plugin filter module.
+
+    package My::Template::Plugin::Change;
+    use Template::Plugin::Filter;
+    use base qw( Template::Plugin::Filter );
+    
+    sub init {
+        my $self = shift;
+        
+        $self->{ _DYNAMIC } = 1;
+        
+        # first arg can specify filter name
+        $self->install_filter($self->{ _ARGS }->[0] || 'change');
+        
+        return $self;
+    }
+    
+    sub filter {
+        my ($self, $text, $args, $config) = @_;
+        
+        $config = $self->merge_config($config);
+        my $regex = join('|', keys %$config);
+        
+        $text =~ s/($regex)/$config->{ $1 }/ge;
+        
+        return $text;
+    }
+    
+    1;
+
+=head1 AUTHOR
+
+Andy Wardley E<lt>abw at wardley.orgE<gt> L<http://wardley.org/>
+
+=head1 COPYRIGHT
+
+Copyright (C) 1996-2007 Andy Wardley.  All Rights Reserved.
+
+This module is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=head1 SEE ALSO
+
+L<Template::Plugin>, L<Template::Filters>, L<Template::Manual::Filters>
+
+=cut
+
+# Local Variables:
+# mode: perl
+# perl-indent-level: 4
+# indent-tabs-mode: nil
+# End:
+#
+# vim: expandtab shiftwidth=4:
diff --git a/bench/perl/Template/Plugin/Format.pm b/bench/perl/Template/Plugin/Format.pm
new file mode 100644
index 0000000..3c00ce4
--- /dev/null
+++ b/bench/perl/Template/Plugin/Format.pm
@@ -0,0 +1,93 @@
+#============================================================= -*-Perl-*-
+#
+# Template::Plugin::Format
+#
+# DESCRIPTION
+#
+#   Simple Template Toolkit Plugin which creates formatting functions.
+#
+# AUTHOR
+#   Andy Wardley   <abw at wardley.org>
+#
+# COPYRIGHT
+#   Copyright (C) 1996-2007 Andy Wardley.  All Rights Reserved.
+#
+#   This module is free software; you can redistribute it and/or
+#   modify it under the same terms as Perl itself.
+#
+#============================================================================
+
+package Template::Plugin::Format;
+
+use strict;
+use warnings;
+use base 'Template::Plugin';
+
+our $VERSION = 2.70;
+
+
+sub new {
+    # with a format: return a formatter; without: return the factory itself
+    my (undef, undef, $format) = @_;
+    return \&make_formatter unless defined $format;
+    make_formatter($format);
+}
+
+
+sub make_formatter {
+    # returns a closure that sprintf()s its arguments through $format
+    my ($format) = @_;
+    $format = '%s' if !defined $format;
+    # a call without arguments formats a single empty string
+    return sub {
+        sprintf($format, @_ ? @_ : '');
+    };
+}
+
+
+1;
+
+__END__
+
+=head1 NAME
+
+Template::Plugin::Format - Plugin to create formatting functions
+
+=head1 SYNOPSIS
+
+    [% USE format %]
+    [% commented = format('# %s') %]
+    [% commented('The cat sat on the mat') %]
+    
+    [% USE bold = format('<b>%s</b>') %]
+    [% bold('Hello') %]
+
+=head1 DESCRIPTION
+
+The format plugin constructs sub-routines which format text according to
+a C<printf()>-like format string.
+
+=head1 AUTHOR
+
+Andy Wardley E<lt>abw at wardley.orgE<gt> L<http://wardley.org/>
+
+=head1 COPYRIGHT
+
+Copyright (C) 1996-2007 Andy Wardley.  All Rights Reserved.
+
+This module is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=head1 SEE ALSO
+
+L<Template::Plugin>
+
+=cut
+
+# Local Variables:
+# mode: perl
+# perl-indent-level: 4
+# indent-tabs-mode: nil
+# End:
+#
+# vim: expandtab shiftwidth=4:
diff --git a/bench/perl/Template/Plugin/HTML.pm b/bench/perl/Template/Plugin/HTML.pm
new file mode 100644
index 0000000..61dd69f
--- /dev/null
+++ b/bench/perl/Template/Plugin/HTML.pm
@@ -0,0 +1,163 @@
+#============================================================= -*-Perl-*-
+#
+# Template::Plugin::HTML
+#
+# DESCRIPTION
+#   Template Toolkit plugin providing useful functionality for generating
+#   HTML.
+#
+# AUTHOR
+#   Andy Wardley   <abw at wardley.org>
+#
+# COPYRIGHT
+#   Copyright (C) 1996-2007 Andy Wardley.  All Rights Reserved.
+#
+#   This module is free software; you can redistribute it and/or
+#   modify it under the same terms as Perl itself.
+#
+#============================================================================
+
+package Template::Plugin::HTML;
+
+use strict;
+use warnings;
+use base 'Template::Plugin';
+
+our $VERSION = 2.62;
+
+sub new {
+    # only the trailing hash of named parameters is consulted ('sorted')
+    my ($class, $context, @args) = @_;
+    my $params = ref $args[-1] eq 'HASH' ? $args[-1] : { };
+    my $self   = { _SORTED => $params->{ sorted } || 0 };
+    return bless $self, $class;
+}
+
+sub element {
+    my ($self, $name, $attr) = @_;
+    # a single hash argument is unpacked as (name => attributes)
+    ($name, $attr) = %$name if ref $name eq 'HASH';
+    return '' if !defined $name || !length $name;
+    my $attrs = $self->attributes($attr);
+    return $attrs ? "<$name $attrs>" : "<$name$attrs>";
+}
+
+sub attributes {
+    my ($self, $hash) = @_;
+    return '' if ref $hash ne 'HASH';
+    # key order is whatever the hash yields unless the object asks for sorting
+    my @names = $self->{ _SORTED } ? sort(keys %$hash) : keys %$hash;
+    my @pairs;
+    foreach my $name (@names) {
+        push @pairs, $name . '="' . $self->escape($hash->{ $name }) . '"';
+    }
+    return join(' ', @pairs);
+}
+
+sub escape {    # escape(text): return a copy with & < > " as HTML entities
+    my ($self, $text) = @_;
+    for ($text) {
+        s/&/&amp;/g;     # ampersands first, so the entities below stay intact
+        s/</&lt;/g;
+        s/>/&gt;/g;
+        s/"/&quot;/g;
+    }
+    $text;
+}
+
+sub url {    # percent-encode every character outside [a-zA-Z0-9_.-]
+    my (undef, $text) = @_;
+    return undef if !defined $text;
+    $text =~ s/([^a-zA-Z0-9_.-])/sprintf('%%%02X', ord($1))/eg;
+    return $text;
+}
+
+
+1;
+
+__END__
+
+=head1 NAME
+
+Template::Plugin::HTML - Plugin to create HTML elements
+
+=head1 SYNOPSIS
+
+    [% USE HTML %]
+    
+    [% HTML.escape("if (a < b && c > d) ...") %]
+    
+    [% HTML.element(table => { border => 1, cellpadding => 2 }) %]
+    
+    [% HTML.attributes(border => 1, cellpadding => 2) %]
+
+=head1 DESCRIPTION
+
+The C<HTML> plugin is a very basic plugin, implementing a few useful
+methods for generating HTML.  
+
+=head1 METHODS
+
+=head2 escape(text)
+
+Returns the source text with any HTML reserved characters such as 
+C<E<lt>>, C<E<gt>>, etc., correctly escaped to their entity equivalents.
+
+=head2 attributes(hash)
+
+Returns the elements of the hash array passed by reference correctly
+formatted (e.g. values quoted and correctly escaped) as attributes for
+an HTML element.
+
+=head2 element(type, attributes)
+
+Generates an HTML element of the specified type and with the attributes
+provided as an optional hash array reference as the second argument or
+as named arguments.
+
+    [% HTML.element(table => { border => 1, cellpadding => 2 }) %]
+    [% HTML.element('table', border=1, cellpadding=2) %]
+    [% HTML.element(table => attribs) %]
+
+=head1 DEBUGGING
+
+The HTML plugin accepts a C<sorted> option as a constructor argument
+which, when set to any true value, causes the attributes generated by
+the C<attributes()> method (either directly or via C<element()>) to be
+returned in sorted order.  Order of attributes isn't important in
+HTML, but this is provided mainly for the purposes of debugging where
+it is useful to have attributes generated in a deterministic order
+rather than whatever order the hash happened to feel like returning
+the keys in.
+
+    [% USE HTML(sorted=1) %]
+    [% HTML.element( foo => { charlie => 1, bravo => 2, alpha => 3 } ) %]
+
+generates:
+
+    <foo alpha="3" bravo="2" charlie="1">
+
+=head1 AUTHOR
+
+Andy Wardley E<lt>abw at wardley.orgE<gt> L<http://wardley.org/>
+
+=head1 COPYRIGHT
+
+Copyright (C) 1996-2007 Andy Wardley.  All Rights Reserved.
+
+This module is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=head1 SEE ALSO
+
+L<Template::Plugin>
+
+=cut
+
+# Local Variables:
+# mode: perl
+# perl-indent-level: 4
+# indent-tabs-mode: nil
+# End:
+#
+# vim: expandtab shiftwidth=4:
diff --git a/bench/perl/Template/Plugin/Image.pm b/bench/perl/Template/Plugin/Image.pm
new file mode 100644
index 0000000..6109fdd
--- /dev/null
+++ b/bench/perl/Template/Plugin/Image.pm
@@ -0,0 +1,436 @@
+#============================================================= -*-Perl-*-
+#
+# Template::Plugin::Image
+#
+# DESCRIPTION
+#  Plugin for encapsulating information about an image.
+#
+# AUTHOR
+#   Andy Wardley <abw at wardley.org>
+#
+# COPYRIGHT
+#   This module is free software; you can redistribute it and/or
+#   modify it under the same terms as Perl itself.
+#
+#============================================================================
+
+package Template::Plugin::Image;
+
+use strict;
+use warnings;
+use base 'Template::Plugin';
+use Template::Exception;
+use File::Spec;
+
+our $VERSION = 1.21;
+our $AUTOLOAD;
+
+BEGIN {    # choose the img_info() backend once, at compile time
+    if (eval { require Image::Info; }) {    # tried first
+        *img_info = \&Image::Info::image_info;
+    }
+    elsif (eval { require Image::Size; }) {    # fallback backend
+        *img_info = sub {    # wrap imgsize() so it returns a hash reference
+            my $file = shift;
+            my @stuff = Image::Size::imgsize($file);
+            return { "width"  => $stuff[0],
+                     "height" => $stuff[1],
+                     "error"  =>
+                        # imgsize returns either a three letter file type
+                        # or an error message as third value
+                        (defined($stuff[2]) && length($stuff[2]) > 3
+                            ? $stuff[2]
+                            : undef),
+                   };
+        }
+    }
+    else {    # neither module could be loaded
+        die(Template::Exception->new("image",
+            "Couldn't load Image::Info or Image::Size: $@"));
+    }
+
+}
+
+#------------------------------------------------------------------------
+# new($context, $name, \%config)
+#
+# Create a new Image object.  Takes the pathname of the file as
+# the argument following the context and an optional 
+# hash reference of configuration parameters.
+#------------------------------------------------------------------------
+
+sub new {
+    # a trailing hash reference carries the named parameters
+    my $config = ref($_[-1]) eq 'HASH' ? pop(@_) : { };
+    my ($class, $context, $name) = @_;
+
+    # the image name may be given positionally or as name => ...
+    $name = $config->{ name } if !defined $name;
+
+    return $class->throw('no image file specified')
+        if !defined $name || !length $name;
+
+    # The file that gets inspected is root/name when a root is configured,
+    # otherwise an explicit file => ... parameter, otherwise the name itself.
+    my $root = $config->{ root };
+    my $file = $root                     ? File::Spec->catfile($root, $name)
+             : defined $config->{ file } ? $config->{ file }
+             :                             $name;
+
+    # Make a note of whether we are using Image::Size or
+    # Image::Info -- at least for the test suite
+    my $type = $INC{"Image/Size.pm"} ? "Image::Size" : "Image::Info";
+
+    # tag() always emits an alt attribute, so default it to the empty
+    # string (this writes into the hash reference that was passed in)
+    $config->{ alt } = '' if !defined $config->{ alt };
+
+    # Nothing is read from disk here; init() inspects the file on demand.
+    my %self = (
+        %$config,
+        name => $name,
+        file => $file,
+        root => $root,
+        type => $type,
+    );
+
+    return bless \%self, $class;
+}
+
+#------------------------------------------------------------------------
+# init()
+#
+# Calls image_info on $self->{ file }
+#------------------------------------------------------------------------
+
+sub init {    # inspect the image file on first use; no-op once size is set
+    my $self = shift;
+    return $self if $self->{ size };
+
+    my $image = img_info($self->{ file });
+    return $self->throw($image->{ error }) if defined $image->{ error };
+
+    # copy every field reported by the backend into the object
+    @$self{ keys %$image } = values %$image;
+    $self->{ size } = [ $image->{ width }, $image->{ height } ];
+    # stat() element 9 is mtime; element 10 is ctime (inode change time)
+    $self->{ modtime } = (stat $self->{ file })[9];
+    return $self;
+}
+
+#------------------------------------------------------------------------
+# attr()
+#
+# Return the width and height as HTML/XML attributes.
+#------------------------------------------------------------------------
+
+sub attr {
+    my ($self) = @_;
+    my $dims = $self->size();    # [ width, height ]
+    return 'width="' . $dims->[0] . '" height="' . $dims->[1] . '"';
+}
+
+
+#------------------------------------------------------------------------
+# modtime()
+#
+# Return last modification time as a time_t:
+#
+#   [% date.format(image.modtime, "%Y/%m/%d") %]
+#------------------------------------------------------------------------
+
+sub modtime {
+    my ($self) = @_;
+    $self->init;    # fills in modtime unless the image was already inspected
+    $self->{ modtime };
+}
+
+
+#------------------------------------------------------------------------
+# tag(\%options)
+#
+# Return an XHTML img tag.
+#------------------------------------------------------------------------
+
+sub tag {
+    my $self = shift;
+
+    # extra attributes arrive as a hash reference or as a key/value list
+    my $options = ref $_[0] eq 'HASH' ? $_[0] : { @_ };
+
+    # the src and the width/height attributes always come first
+    my $tag = '<img src="' . $self->name() . '" ' . $self->attr();
+
+    # XHTML spec says that the alt attribute is mandatory, so fall back
+    # to the alt value held by the object when none was passed in
+    $options->{ alt } = $self->{ alt } if !defined $options->{ alt };
+
+    # remaining attributes follow in hash order, values run through escape()
+    while (my ($attr_name, $value) = each %$options) {
+        my $escaped = escape($value);
+        $tag .= qq{ $attr_name="$escaped"};
+    }
+
+    $tag .= ' />';
+
+    return $tag;
+}
+
+sub escape {    # escape(text): return a copy with & < > " as HTML entities
+    my ($text) = @_;
+    for ($text) {
+        s/&/&amp;/g;     # ampersands first, so the entities below stay intact
+        s/</&lt;/g;
+        s/>/&gt;/g;
+        s/"/&quot;/g;
+    }
+    $text;
+}
+
+sub throw {    # die with a Template::Exception of type 'Image'
+    my (undef, $error) = @_;
+    die Template::Exception->new('Image', $error);
+}
+
+sub AUTOLOAD {    # any other method name is served from the object's fields
+    my $self = shift;
+    (my $method = $AUTOLOAD) =~ s/.*:://;
+    return if $method eq 'DESTROY';    # never inspect the file during destruction
+    $self->init;
+    return $self->{ $method };
+}
+
+1;
+
+__END__
+
+=head1 NAME
+
+Template::Plugin::Image - Plugin access to image sizes
+
+=head1 SYNOPSIS
+
+    [% USE Image(filename) %]
+    [% Image.width %]
+    [% Image.height %]
+    [% Image.size.join(', ') %]
+    [% Image.attr %]
+    [% Image.tag %]
+
+=head1 DESCRIPTION
+
+This plugin provides an interface to the L<Image::Info> or L<Image::Size>
+modules for determining the size of image files.
+
+You can specify the plugin name as either 'C<Image>' or 'C<image>'.  The
+plugin object created will then have the same name.  The file name of
+the image should be specified as a positional or named argument.
+
+    [% # all these are valid, take your pick %]
+    [% USE Image('foo.gif') %]
+    [% USE image('bar.gif') %]
+    [% USE Image 'ping.gif' %]
+    [% USE image(name='baz.gif') %]
+    [% USE Image name='pong.gif' %]
+
+A C<root> parameter can be used to specify the location of the image file:
+
+    [% USE Image(root='/path/to/root', name='images/home.png') %]
+    # image path: /path/to/root/images/home.png
+    # img src: images/home.png
+
+In cases where the image path and image url do not match up, specify the
+file name directly:
+
+    [% USE Image(file='/path/to/home.png', name='/images/home.png') %]
+
+The C<alt> parameter can be used to specify an alternate name for the
+image, for use in constructing an XHTML element (see the C<tag()> method
+below).
+
+    [% USE Image('home.png', alt="Home") %]
+
+You can also provide an alternate name for an C<Image> plugin object.
+
+    [% USE img1 = image 'foo.gif' %]
+    [% USE img2 = image 'bar.gif' %]
+
+The C<name> method returns the image file name.
+
+    [% img1.name %]     # foo.gif
+
+The C<width> and C<height> methods return the width and height of the
+image, respectively.  The C<size> method returns a reference to a 2
+element list containing the width and height.
+
+    [% USE image 'foo.gif' %]
+    width: [% image.width %]
+    height: [% image.height %]
+    size: [% image.size.join(', ') %]
+
+The C<modtime> method returns the modification time of the file in question,
+suitable for use with the L<Date|Template::Plugin::Date> plugin, for example:
+
+    [% USE image 'foo.gif' %]
+    [% USE date %]
+    [% date.format(image.modtime, "%B, %e %Y") %]
+
+The C<attr> method returns the height and width as HTML/XML attributes.
+
+    [% USE image 'foo.gif' %]
+    [% image.attr %]
+
+Typical output:
+
+    width="60" height="20"
+
+The C<tag> method returns a complete XHTML tag referencing the image.
+
+    [% USE image 'foo.gif' %]
+    [% image.tag %]
+
+Typical output:
+
+    <img src="foo.gif" width="60" height="20" alt="" />
+
+You can provide any additional attributes that should be added to the 
+XHTML tag.
+
+    [% USE image 'foo.gif' %]
+    [% image.tag(class="logo" alt="Logo") %]
+
+Typical output:
+
+    <img src="foo.gif" width="60" height="20" alt="Logo" class="logo" />
+
+Note that the C<alt> attribute is mandatory in a strict XHTML C<img>
+element (even if it's empty) so it is always added even if you don't
+explicitly provide a value for it.  You can do so as an argument to 
+the C<tag> method, as shown in the previous example, or as an argument to C<USE>:
+
+    [% USE image('foo.gif', alt='Logo') %]
+
+=head1 CATCHING ERRORS
+
+If the image file cannot be found then the above methods will throw an
+C<Image> error.  You can enclose calls to these methods in a
+C<TRY...CATCH> block to catch any potential errors.
+
+    [% TRY;
+         image.width;
+       CATCH;
+         error;      # print error
+       END
+    %]
+
+=head1 USING Image::Info
+
+At run time, the plugin tries to load L<Image::Info> in preference to
+L<Image::Size>. If L<Image::Info> is found, then some additional methods are
+available, in addition to C<size>, C<width>, C<height>, C<attr>, and C<tag>.
+These additional methods are named after the elements that L<Image::Info>
+retrieves from the image itself. The types of methods available depend on the
+type of image (see L<Image::Info> for more details). These additional methods
+will always include the following:
+
+=head2 file_media_type
+
+This is the MIME type that is appropriate for the given file format.
+The corresponding value is a string like: "C<image/png>" or "C<image/jpeg>".
+
+=head2 file_ext
+
+This is the suggested file name extension for a file of the given
+file format.  The value is a 3 letter, lowercase string like
+"C<png>", "C<jpg>".
+
+=head2 color_type
+
+The value is a short string describing what kind of values the pixels
+encode.  The value can be one of the following:
+
+    Gray
+    GrayA
+    RGB
+    RGBA
+    CMYK
+    YCbCr
+    CIELab
+
+These names can also be prefixed by "C<Indexed->" if the image is
+composed of indexes into a palette.  Of these, only "C<Indexed-RGB>" is
+likely to occur.
+
+(It is similar to the TIFF field PhotometricInterpretation, but this
+name was found to be too long, so we used the PNG inspired term
+instead.)
+
+=head2 resolution
+
+The value of this field normally gives the physical size of the image
+on screen or paper. When the unit specifier is missing then this field
+denotes the squareness of pixels in the image.
+
+The syntax of this field is:
+
+   <res> <unit>
+   <xres> "/" <yres> <unit>
+   <xres> "/" <yres>
+
+The C<E<lt>resE<gt>>, C<E<lt>xresE<gt>> and C<E<lt>yresE<gt>> fields are
+numbers.  The C<E<lt>unitE<gt>> is a string like C<dpi>, C<dpm> or
+C<dpcm> (denoting "dots per inch/meter/cm").
+
+=head2 SamplesPerPixel
+
+This says how many channels there are in the image.  For some image
+formats this number might be higher than the number implied from the
+C<color_type>.
+
+=head2 BitsPerSample
+
+This says how many bits are used to encode each of the samples.  The value
+is a reference to an array containing numbers. The number of elements
+in the array should be the same as C<SamplesPerPixel>.
+
+=head2 Comment
+
+Textual comments found in the file.  The value is a reference to an
+array if there are multiple comments found.
+
+=head2 Interlace
+
+If the image is interlaced, then this returns the interlace type.
+
+=head2 Compression
+
+This returns the name of the compression algorithm used.
+
+=head2 Gamma
+
+A number indicating the gamma curve of the image (e.g. 2.2)
+
+=head1 AUTHOR
+
+Andy Wardley E<lt>abw at wardley.orgE<gt> L<http://wardley.org/>
+
+=head1 COPYRIGHT
+
+Copyright (C) 1996-2007 Andy Wardley.  All Rights Reserved.
+
+This module is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=head1 SEE ALSO
+
+L<Template::Plugin>, L<Image::Info>
+
+=cut
+
+# Local Variables:
+# mode: perl
+# perl-indent-level: 4
+# indent-tabs-mode: nil
+# End:
+#
+# vim: expandtab shiftwidth=4:
diff --git a/bench/perl/Template/Plugin/Iterator.pm b/bench/perl/Template/Plugin/Iterator.pm
new file mode 100644
index 0000000..fb6bd48
--- /dev/null
+++ b/bench/perl/Template/Plugin/Iterator.pm
@@ -0,0 +1,88 @@
+#============================================================= -*-Perl-*-
+#
+# Template::Plugin::Iterator
+#
+# DESCRIPTION
+#
+#   Plugin to create a Template::Iterator from a list of items and optional
+#   configuration parameters.
+#
+# AUTHOR
+#   Andy Wardley   <abw at wardley.org>
+#
+# COPYRIGHT
+#   Copyright (C) 2000-2007 Andy Wardley.  All Rights Reserved.
+#
+#   This module is free software; you can redistribute it and/or
+#   modify it under the same terms as Perl itself.
+#
+#============================================================================
+
+package Template::Plugin::Iterator;
+
+use strict;
+use warnings;
+use base 'Template::Plugin';
+use Template::Iterator;
+
+our $VERSION = 2.68;
+
+#------------------------------------------------------------------------
+# new($context, \@data, \%args)
+#------------------------------------------------------------------------
+
+sub new {
+    # drop the class and context; the rest goes to Template::Iterator
+    splice(@_, 0, 2);
+    Template::Iterator->new(@_);
+}
+
+1;
+
+__END__
+
+=head1 NAME
+
+Template::Plugin::Iterator - Plugin to create iterators (Template::Iterator)
+
+=head1 SYNOPSIS
+
+    [% USE iterator(list, args) %]
+    
+    [% FOREACH item = iterator %]
+       [% '<ul>' IF iterator.first %]
+       <li>[% item %]
+       [% '</ul>' IF iterator.last %]
+    [% END %]
+
+=head1 DESCRIPTION
+
+The iterator plugin provides a way to create a L<Template::Iterator> object 
+to iterate over a data set.  An iterator is created implicitly by the
+C<FOREACH> directive.  This plugin allows the iterator to be explicitly created
+with a given name.
+
+=head1 AUTHOR
+
+Andy Wardley E<lt>abw at wardley.orgE<gt> L<http://wardley.org/>
+
+=head1 COPYRIGHT
+
+Copyright (C) 1996-2007 Andy Wardley.  All Rights Reserved.
+
+This module is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=head1 SEE ALSO
+
+L<Template::Plugin>, L<Template::Iterator>
+
+=cut
+
+# Local Variables:
+# mode: perl
+# perl-indent-level: 4
+# indent-tabs-mode: nil
+# End:
+#
+# vim: expandtab shiftwidth=4:
diff --git a/bench/perl/Template/Plugin/Math.pm b/bench/perl/Template/Plugin/Math.pm
new file mode 100644
index 0000000..ff40004
--- /dev/null
+++ b/bench/perl/Template/Plugin/Math.pm
@@ -0,0 +1,242 @@
+#============================================================= -*-Perl-*-
+#
+# Template::Plugin::Math
+#
+# DESCRIPTION
+#   Plugin implementing numerous mathematical functions.
+#
+# AUTHORS
+#   Andy Wardley   <abw at wardley.org>
+#
+# COPYRIGHT
+#   Copyright (C) 2002-2007 Andy Wardley.  All Rights Reserved.
+#
+#   This module is free software; you can redistribute it and/or
+#   modify it under the same terms as Perl itself.
+#
+#============================================================================
+
+package Template::Plugin::Math;
+
+use strict;
+use warnings;
+use base 'Template::Plugin';
+
+our $VERSION = 1.16;
+our $AUTOLOAD;
+
+
+#------------------------------------------------------------------------
+# new($context, \%config)
+#
+# This constructor method creates a simple, empty object to act as a 
+# receiver for future object calls.  No doubt there are many interesting
+# configuration options that might be passed, but I'll leave that for 
+# someone more knowledgeable in these areas to contribute...
+#------------------------------------------------------------------------
+
+sub new {
+    # Constructor: returns an object of $class holding a shallow copy of the
+    # optional \%config hash; $context is accepted but not stored.
+    my $class   = shift;
+    my $context = shift;
+    my $options = shift || { };
+    return bless { %$options }, $class;
+}
+
+sub abs   { shift; CORE::abs($_[0]);          } # the leading shift drops the plugin object
+sub atan2 { shift; CORE::atan2($_[0], $_[1]); } # prototyped (ugg)
+sub cos   { shift; CORE::cos($_[0]);          }
+sub exp   { shift; CORE::exp($_[0]);          }
+sub hex   { shift; CORE::hex($_[0]);          }
+sub int   { shift; CORE::int($_[0]);          }
+sub log   { shift; CORE::log($_[0]);          }
+sub oct   { shift; CORE::oct($_[0]);          }
+sub rand  { shift; @_ ? CORE::rand($_[0])  : CORE::rand();  } # no argument: value in [0, 1)
+sub sin   { shift; CORE::sin($_[0]);          }
+sub sqrt  { shift; CORE::sqrt($_[0]);         }
+sub srand { shift; @_ ? CORE::srand($_[0]) : CORE::srand(); } # no argument: Perl chooses the seed
+
+# Use the Math::TrulyRandom module
+# XXX This is *sloooooooowwwwwwww*
+sub truly_random {
+    my $loaded = eval { require Math::TrulyRandom; };   # loaded on demand
+    die(Template::Exception->new("plugin", "Can't load Math::TrulyRandom"))
+        unless $loaded;
+    return Math::TrulyRandom::truly_random_value();
+}
+
+eval {    # best effort: nothing is installed if Math::Trig cannot be loaded
+    require Math::Trig;
+    no strict qw(refs);
+    foreach my $name (@Math::Trig::EXPORT) {
+        my $code = Math::Trig->can($name);
+        *{$name} = sub { shift; $code->(@_) };
+    }
+};
+
+# Catch-all: a call to any method not defined above (e.g. a trig function when Math::Trig failed to load) returns nothing
+sub AUTOLOAD { return; }
+
+1;
+
+__END__
+
+=head1 NAME
+
+Template::Plugin::Math - Plugin providing mathematical functions
+
+=head1 SYNOPSIS
+
+    [% USE Math %]
+
+    [% Math.sqrt(9) %]
+
+=head1 DESCRIPTION
+
+The Math plugin provides numerous mathematical functions for use
+within templates.
+
+=head1 METHODS
+
+C<Template::Plugin::Math> makes available the following functions from
+the Perl core:
+
+=over 4
+
+=item abs
+
+=item atan2
+
+=item cos
+
+=item exp
+
+=item hex
+
+=item int
+
+=item log
+
+=item oct
+
+=item rand
+
+=item sin
+
+=item sqrt
+
+=item srand
+
+=back
+
+In addition, if the L<Math::Trig> module can be loaded, the following
+functions are also available:
+
+=over 4
+
+=item pi
+
+=item tan
+
+=item csc
+
+=item cosec
+
+=item sec
+
+=item cot
+
+=item cotan
+
+=item asin
+
+=item acos
+
+=item atan
+
+=item acsc
+
+=item acosec
+
+=item asec
+
+=item acot
+
+=item acotan
+
+=item sinh
+
+=item cosh
+
+=item tanh
+
+=item csch
+
+=item cosech
+
+=item sech
+
+=item coth
+
+=item cotanh
+
+=item asinh
+
+=item acosh
+
+=item atanh
+
+=item acsch
+
+=item acosech
+
+=item asech
+
+=item acoth
+
+=item acotanh
+
+=item rad2deg
+
+=item rad2grad
+
+=item deg2rad
+
+=item deg2grad
+
+=item grad2rad
+
+=item grad2deg
+
+=back
+
+If the L<Math::TrulyRandom> module is available, and you've got the time
+to wait, the C<truly_random> method is available:
+
+    [% Math.truly_random %]
+
+=head1 AUTHOR
+
+Andy Wardley E<lt>abw at wardley.orgE<gt> L<http://wardley.org/>
+
+=head1 COPYRIGHT
+
+Copyright (C) 1996-2007 Andy Wardley.  All Rights Reserved.
+
+This module is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=head1 SEE ALSO
+
+L<Template::Plugin>
+
+=cut
+
+# Local Variables:
+# mode: perl
+# perl-indent-level: 4
+# indent-tabs-mode: nil
+# End:
+#
+# vim: expandtab shiftwidth=4:
diff --git a/bench/perl/Template/Plugin/Pod.pm b/bench/perl/Template/Plugin/Pod.pm
new file mode 100644
index 0000000..15d8038
--- /dev/null
+++ b/bench/perl/Template/Plugin/Pod.pm
@@ -0,0 +1,87 @@
+#==============================================================================
+# 
+# Template::Plugin::Pod
+#
+# DESCRIPTION
+#  Pod parser and object model.
+#
+# AUTHOR
+#   Andy Wardley   <abw at wardley.org>
+#
+# COPYRIGHT
+#   Copyright (C) 2000-2007 Andy Wardley.  All Rights Reserved.
+#
+#   This module is free software; you can redistribute it and/or
+#   modify it under the same terms as Perl itself.
+#
+#============================================================================
+
+package Template::Plugin::Pod;
+
+use strict;
+use warnings;
+use base 'Template::Plugin';
+use Pod::POM;
+
+
+our $VERSION = 2.69;
+
+#------------------------------------------------------------------------
+# new($context, \%config)
+#------------------------------------------------------------------------
+
+sub new {
+    my ($class, $context, @pom_args) = @_;
+
+    # neither the plugin class nor the context is used; build a Pod::POM object
+    return Pod::POM->new(@pom_args);
+}
+
+
+1;
+
+__END__
+
+=head1 NAME
+
+Template::Plugin::Pod - Plugin interface to Pod::POM (Pod Object Model)
+
+=head1 SYNOPSIS
+
+    [% USE Pod(podfile) %]
+    
+    [% FOREACH head1 = Pod.head1;
+         FOREACH head2 = head1/head2;
+           ...
+         END;
+       END
+    %]
+
+=head1 DESCRIPTION
+
+This plugin is an interface to the L<Pod::POM> module.
+
+=head1 AUTHOR
+
+Andy Wardley E<lt>abw at wardley.orgE<gt> L<http://wardley.org/>
+
+=head1 COPYRIGHT
+
+Copyright (C) 1996-2007 Andy Wardley.  All Rights Reserved.
+
+This module is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=head1 SEE ALSO
+
+L<Template::Plugin>, L<Pod::POM>
+
+=cut
+
+# Local Variables:
+# mode: perl
+# perl-indent-level: 4
+# indent-tabs-mode: nil
+# End:
+#
+# vim: expandtab shiftwidth=4:
diff --git a/bench/perl/Template/Plugin/Procedural.pm b/bench/perl/Template/Plugin/Procedural.pm
new file mode 100644
index 0000000..3477591
--- /dev/null
+++ b/bench/perl/Template/Plugin/Procedural.pm
@@ -0,0 +1,133 @@
+#==============================================================================
+# 
+# Template::Plugin::Procedural
+#
+# DESCRIPTION
+#   Base class for Template Toolkit plugins whose subroutines are called procedurally
+#
+# AUTHOR
+#   Mark Fowler <mark at twoshortplanks.com>
+#
+# COPYRIGHT
+#   Copyright (C) 2002 Mark Fowler.  All Rights Reserved
+#
+#   This module is free software; you can redistribute it and/or
+#   modify it under the same terms as Perl itself.
+#
+#==============================================================================
+
+package Template::Plugin::Procedural;
+
+use strict;
+use warnings;
+use base 'Template::Plugin';
+
+our $VERSION = 1.17;
+our $DEBUG   = 0 unless defined $DEBUG;
+our $AUTOLOAD;
+
+#------------------------------------------------------------------------
+# load
+#------------------------------------------------------------------------
+
+sub load {    # called as $class->load($context); returns the name of a generated proxy package
+    my ($class, $context) = @_;
+
+    # name of the proxy package that plugin objects will be blessed into
+    my $proxy = "Template::Plugin::" . $class;
+
+    # install an AUTOLOAD in the proxy package that forwards every method
+    # call to the subroutine of the same name in (or inherited by) $class
+    no strict "refs";
+    *{ $proxy . "::AUTOLOAD" } = sub {
+        # strip the package qualifier to get the bare method name
+        $AUTOLOAD =~ s!^.*::!!;
+
+        print STDERR "Calling '$AUTOLOAD' in '$class'\n"
+            if $DEBUG;
+
+        # can() finds the sub in $class or in any of its base classes
+        my $uboat = $class->can($AUTOLOAD);
+
+        # if found, drop the invocant and call it as a plain subroutine
+        if ($uboat) {
+            shift @_;
+            return $uboat->(@_);
+        }
+
+        print STDERR "Eeek, no such method '$AUTOLOAD'\n"
+            if $DEBUG;
+
+        return "";    # unknown methods yield an empty string rather than an error
+    };
+
+    # install a constructor in the proxy package that returns a blessed
+    # reference to an otherwise unused scalar
+    *{ $proxy . "::new" } = sub {
+        my $this;
+        return bless \$this, $_[0];
+    };
+
+    return $proxy;
+}
+
+1;
+
+__END__
+
+=head1 NAME
+
+Template::Plugin::Procedural - Base class for procedural plugins
+
+=head1 SYNOPSIS
+
+    package Template::Plugin::LWPSimple;
+    use base qw(Template::Plugin::Procedural);
+    use LWP::Simple;  # exports 'get'
+    1;
+
+    [% USE LWPSimple %]
+    [% LWPSimple.get("http://www.tt2.org/") %]
+
+=head1 DESCRIPTION
+
+C<Template::Plugin::Procedural> is a base class for Template Toolkit
+plugins that causes defined subroutines to be called directly rather
+than as a method.  Essentially this means that subroutines will not
+receive the class name or object as its first argument.
+
+This is most useful when creating plugins for modules that normally
+work by exporting subroutines that do not expect such additional
+arguments.
+
+Despite the fact that subroutines will not be called in an OO manner,
+inheritance still functions as normal.  A class that uses
+C<Template::Plugin::Procedural> can be subclassed and both subroutines
+defined in the subclass and subroutines defined in the original class
+will be available to the Template Toolkit and will be called without
+the class/object argument.
+
+=head1 AUTHOR
+
+Mark Fowler E<lt>mark at twoshortplanks.comE<gt> L<http://www.twoshortplanks.com>
+
+=head1 COPYRIGHT
+
+Copyright (C) 2002 Mark Fowler E<lt>mark at twoshortplanks.comE<gt>
+
+This module is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=head1 SEE ALSO
+
+L<Template>, L<Template::Plugin>
+
+=cut
+
+# Local Variables:
+# mode: perl
+# perl-indent-level: 4
+# indent-tabs-mode: nil
+# End:
+#
+# vim: expandtab shiftwidth=4:
diff --git a/bench/perl/Template/Plugin/Scalar.pm b/bench/perl/Template/Plugin/Scalar.pm
new file mode 100644
index 0000000..0bd0a0f
--- /dev/null
+++ b/bench/perl/Template/Plugin/Scalar.pm
@@ -0,0 +1,163 @@
+#============================================================= -*-Perl-*-
+#
+# Template::Plugin::Scalar
+#
+# DESCRIPTION
+#   Template Toolkit plugin module which allows you to call object methods
+#   in scalar context.
+#
+# AUTHOR
+#   Andy Wardley   <abw at wardley.org>
+#
+# COPYRIGHT
+#   Copyright (C) 2008 Andy Wardley.  All Rights Reserved.
+#
+#   This module is free software; you can redistribute it and/or
+#   modify it under the same terms as Perl itself.
+#
+#============================================================================
+
+package Template::Plugin::Scalar;
+use base 'Template::Plugin';
+use strict;
+use warnings;
+use Template::Exception;
+use Scalar::Util qw();
+
+our $VERSION   = 1.00;
+our $MONAD     = 'Template::Monad::Scalar';
+our $EXCEPTION = 'Template::Exception';
+our $AUTOLOAD;
+
+sub load {
+    my ($class, $context) = @_;
+
+    # register a 'scalar' virtual method for both hash- and list-based
+    # values; each one wraps its target in a scalar-context monad
+    foreach my $type (qw( hash list )) {
+        $context->define_vmethod( $type => scalar => \&scalar_monad );
+    }
+    return $class;
+}
+
+sub scalar_monad {
+    # vmethod body for '.scalar': wrap the hash- or list-based target in a
+    # $MONAD object, which delegates method calls back to the target in scalar
+    # context (foo.scalar.bar becomes $MONAD->new($foo)->bar)
+    my ($target) = @_;
+    return $MONAD->new($target);
+}
+
+sub new {
+    # Build the plugin object.  It only remembers the context so that
+    # AUTOLOAD can later look up a variable and call it in scalar context
+    # (e.g. scalar.foo calls foo() in scalar context); extra args are ignored.
+    my $class   = shift;
+    my $context = shift;
+    my %state   = ( _CONTEXT => $context );
+    return bless \%state, $class;
+}
+
+sub AUTOLOAD {
+    my $self = shift;
+
+    # the variable to look up is the unqualified name of the called method
+    (my $name = $AUTOLOAD) =~ s/.*:://;
+    return if $name eq 'DESTROY';
+
+    # fetch the value straight from the context's stash
+    my $found = $self->{ _CONTEXT }->stash->{ $name };
+
+    die $EXCEPTION->new( scalar => "undefined value for scalar call: $name" )
+        unless defined $found;
+
+    # code references are called in scalar context with the remaining
+    # arguments; any other value is returned as it is
+    return scalar $found->(@_) if ref $found eq 'CODE';
+    return $found;
+}
+
+
+package Template::Monad::Scalar;
+
+our $EXCEPTION = 'Template::Exception';
+our $AUTOLOAD;
+
+sub new {
+    my ($class, $wrapped) = @_;
+    return bless \$wrapped, $class;    # scalar ref holding the wrapped value
+}
+
+sub AUTOLOAD {
+    # Delegate any method call made on the monad to the wrapped object,
+    # forcing scalar context.  Throws a 'scalar' exception when the wrapped
+    # value is not an object or does not provide the requested method.
+    my $self = shift;
+    my $this = $$self;
+    my $item = $AUTOLOAD;
+    $item =~ s/.*:://;
+    return if $item eq 'DESTROY';
+
+    die $EXCEPTION->new( scalar => "invalid object method: $item" )
+        unless Scalar::Util::blessed($this);
+
+    # lookup the method; can() returns undef when it does not exist, and
+    # calling through undef would die with a bare Perl error instead
+    my $method = $this->can($item)
+        || die $EXCEPTION->new( scalar => "invalid object method: $item" );
+
+    # ...and call it in scalar context
+    return scalar $method->($this, @_);
+}
+
+1;
+
+__END__
+
+=head1 NAME
+
+Template::Plugin::Scalar - call object methods in scalar context
+
+=head1 SYNOPSIS
+
+    [% USE scalar %]
+    
+    # TT2 calls object methods in array context by default
+    [% object.method %]
+    
+    # force it to use scalar context
+    [% object.scalar.method %]
+    
+    # also works with subroutine references
+    [% scalar.my_sub_ref %]
+
+=head1 DESCRIPTION
+
+The Template Toolkit calls user-defined subroutines and object methods
+using Perl's array context by default.  This plugin module provides a way 
+for you to call subroutines and methods in scalar context.
+
+=head1 AUTHOR
+
+Andy Wardley E<lt>abw at wardley.orgE<gt> L<http://wardley.org/>
+
+=head1 COPYRIGHT
+
+Copyright (C) 2008 Andy Wardley.  All Rights Reserved.
+
+This module is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=head1 SEE ALSO
+
+L<Template::Plugin>
+
+=cut
+
+# Local Variables:
+# mode: perl
+# perl-indent-level: 4
+# indent-tabs-mode: nil
+# End:
+#
+# vim: expandtab shiftwidth=4:
diff --git a/bench/perl/Template/Plugin/String.pm b/bench/perl/Template/Plugin/String.pm
new file mode 100644
index 0000000..68f3a3b
--- /dev/null
+++ b/bench/perl/Template/Plugin/String.pm
@@ -0,0 +1,761 @@
+#============================================================= -*-Perl-*-
+#
+# Template::Plugin::String
+#
+# DESCRIPTION
+#   Template Toolkit plugin to implement a basic String object.
+#
+# AUTHOR
+#   Andy Wardley   <abw at wardley.org>
+#
+# COPYRIGHT
+#   Copyright (C) 2001-2007 Andy Wardley.  All Rights Reserved.
+#
+#   This module is free software; you can redistribute it and/or
+#   modify it under the same terms as Perl itself.
+#
+#============================================================================
+
+package Template::Plugin::String;
+
+use strict;
+use warnings;
+use base 'Template::Plugin';
+use Template::Exception;
+
+use overload q|""| => "text",
+             fallback => 1;
+
+our $VERSION = 2.40;
+our $ERROR   = '';
+
+*centre  = \*center;     # centre() is an alias for center()
+*append  = \*push;       # append() is an alias for push()
+*prepend = \*unshift;    # prepend() is an alias for unshift()
+
+#------------------------------------------------------------------------
+
+sub new {
+    # Constructor.  Called as a class method by the USE directive (the context
+    # is then the first argument) or as an object method on an existing String.
+    # The text comes from the 'text' key of a trailing hash reference or else
+    # from the first positional argument, and defaults to ''.
+    my ($class, @args) = @_;
+
+    # a String used as a prototype hands its context on, so that filter() and
+    # output filters keep working on strings made by String.new and copy()
+    my $context = ref $class ? $class->{ _CONTEXT } : shift(@args);
+    my $config = @args && ref $args[-1] eq 'HASH' ? pop(@args) : { };
+    $class = ref($class) || $class;
+
+    my $text = defined $config->{ text } 
+        ? $config->{ text }
+        : (@args ? shift(@args) : '');
+
+    my $self = bless {
+        text     => $text,
+        filters  => [ ],
+        _CONTEXT => $context,
+    }, $class;
+
+    # install any output filters specified as 'filter' or 'filters' option
+    my $filter = $config->{ filter } || $config->{ filters };
+    $self->output_filter($filter) if $filter;
+    return $self;
+}
+
+
+sub text {
+    # Return the text after running it through every installed output
+    # filter, in order; the stored text itself is not modified.
+    my ($self) = @_;
+    my $output  = $self->{ text };
+    my $context = $self->{ _CONTEXT };
+
+    for my $entry (@{ $self->{ filters } }) {
+        my ($name, $args) = @$entry;
+        my $filter = $context->filter($name, $args)
+            || $self->throw($context->error());
+        $output = $filter->($output);
+    }
+    return $output;
+}
+
+
+sub copy {
+    my ($self) = @_;
+    return $self->new($self->{ text });    # new object built from the current raw text
+}
+
+
+sub throw {
+    my ($self, @parts) = @_;
+    my $message = join('', @parts);    # all arguments form one message
+    die Template::Exception->new('String', $message);
+}
+
+
+#------------------------------------------------------------------------
+# output_filter($filter)
+#
+# Install automatic output filter(s) for the string.  $filter can be a list:
+# [ 'name1', 'name2' => [ ..args.. ], name3 => { ..args.. } ] or a hash
+# { name1 => '', name2 => [ args ], name3 => { args } }
+#------------------------------------------------------------------------
+
+sub output_filter {
+    # Append automatic output filters to the string.  $filter may be
+    #   * a list reference: [ 'name1', name2 => [ args ], name3 => { args } ]
+    #   * a hash reference: { name1 => '', name2 => [ args ] }
+    #   * a plain string of filter names separated by non-word characters
+    # A filter's argument, if any, follows its name as a reference; an empty
+    # string means "no arguments".  Always returns an empty string.
+    my ($self, $filter) = @_;
+    my $filters = $self->{ filters };
+
+    # Work on a private copy of the specification: shifting items off the
+    # caller's own array would empty any list that was passed in by
+    # reference.
+    my @spec = ref $filter eq 'HASH'  ? %$filter
+             : ref $filter eq 'ARRAY' ? @$filter
+             :                          split(/\s*\W+\s*/, $filter);
+
+    while (@spec) {
+        my $name = shift @spec;
+        my $args = [ ];
+
+        # args may follow as a reference (or empty string, e.g. { foo => '' })
+        if (@spec && (ref($spec[0]) || ! length $spec[0])) {
+            my $given = shift @spec;
+
+            # a single non-list argument is wrapped in a one-element list;
+            # an empty string leaves the argument list empty
+            if ($given) {
+                $args = ref $given eq 'ARRAY' ? $given : [ $given ];
+            }
+        }
+
+        # each entry is stored as a [ name, \@args ] pair
+        push(@$filters, [ $name, $args ]);
+    }
+
+    return '';
+}
+
+
+#------------------------------------------------------------------------
+
+sub push {
+    my ($self, @pieces) = @_;
+    $self->{ text } .= join('', @pieces);    # all arguments are appended in order
+    return $self;                            # the object itself, for chaining
+}
+
+
+sub unshift {
+    my ($self, @pieces) = @_;
+    $self->{ text } = join('', @pieces) . $self->{ text };    # prefix, in argument order
+    return $self;                                             # the object itself, for chaining
+}
+
+
+sub pop {
+    my ($self, $suffix) = @_;
+    # NOTE(review): $suffix is interpolated as a regex, not matched literally
+    $self->{ text } =~ s/$suffix$// if $suffix;
+    return $self;
+}
+
+
+sub shift {
+    my ($self, $prefix) = @_;
+    # NOTE(review): $prefix is interpolated as a regex, not matched literally
+    $self->{ text } =~ s/^$prefix// if $prefix;
+    return $self;
+}
+
+#------------------------------------------------------------------------
+
+sub center {
+    # Pad the text with spaces on both sides so that it is centred in a
+    # field of $width characters; when the padding cannot be split evenly
+    # the extra space goes on the right.  Longer text is left unchanged.
+    my ($self, $width) = @_;
+    my $spare = ($width || 0) - CORE::length($self->{ text });
+
+    if ($spare > 0) {
+        my $before = int($spare / 2);
+        my $after  = $spare - $before;
+        $self->{ text } = (' ' x $before) . $self->{ text } . (' ' x $after);
+    }
+    return $self;
+}
+
+
+sub left {
+    # Left-justify: append spaces until the text is $width characters long.
+    # Text that is already at least that long is left unchanged.
+    my ($self, $width) = @_;
+    my $missing = ($width || 0) - CORE::length($self->{ text });
+    if ($missing > 0) {
+        $self->{ text } .= ' ' x $missing;
+    }
+    return $self;
+}
+
+
+sub right {
+    # Right-justify: put spaces in front of the text until it is $width
+    # characters long.  Text already at least that long is left unchanged.
+    my ($self, $width) = @_;
+    my $missing = ($width || 0) - CORE::length($self->{ text });
+    if ($missing > 0) {
+        $self->{ text } = (' ' x $missing) . $self->{ text };
+    }
+    return $self;
+}
+
+
+sub format {
+    my ($self, $format) = @_;
+    my $template = defined $format ? $format : '%s';    # default keeps the text as it is
+    $self->{ text } = sprintf($template, $self->{ text });
+    return $self;
+}
+
+
+sub filter {
+    # Run the text through the named filter of the context and return the
+    # result; the stored text is not changed.  Failure to get a filter is thrown.
+    my ($self, $name, @filter_args) = @_;
+    my $context = $self->{ _CONTEXT };
+    my $handler = $context->filter($name, \@filter_args)
+        || $self->throw($context->error());
+    return $handler->($self->{ text });
+}
+
+
+#------------------------------------------------------------------------
+
+sub upper {
+    my ($self) = @_;
+    $self->{ text } = uc($self->{ text });    # upper-cased in place
+    return $self;
+}
+
+
+sub lower {
+    my ($self) = @_;
+    $self->{ text } = lc($self->{ text });    # lower-cased in place
+    return $self;
+}
+
+
+sub capital {
+    my ($self) = @_;
+    $self->{ text } =~ s{^(.)}{\U$1};    # only the first character is changed
+    return $self;
+}
+
+#------------------------------------------------------------------------
+
+sub chop {
+    my ($self) = @_;
+    CORE::chop($self->{ text });    # drops the last character, whatever it is
+    return $self;
+}
+
+
+sub chomp {
+    my ($self) = @_;
+    CORE::chomp($self->{ text });    # drops a trailing line ending, if there is one
+    return $self;
+}
+
+
+sub trim {
+    # Strip leading and trailing whitespace from the text, in place, and
+    # return the object itself.
+    my ($self) = @_;
+    $self->{ text } =~ s/^\s+//;
+    $self->{ text } =~ s/\s+$//;
+    return $self;
+}
+
+
+sub collapse {
+    # Trim both ends of the text, then squeeze every remaining run of
+    # whitespace down to a single space; the text is modified in place.
+    my ($self) = @_;
+    $self->{ text } =~ s/^\s+//;
+    $self->{ text } =~ s/\s+$//;
+    $self->{ text } =~ s/\s+/ /g;
+
+    return $self;
+}
+
+#------------------------------------------------------------------------
+
+sub length {
+    my ($self) = @_;
+    return CORE::length($self->{ text });    # length of the stored (unfiltered) text
+}
+
+
+sub truncate {    # cut the text down to $length characters, the last of which are $suffix
+    my ($self, $length, $suffix) = @_;
+    $suffix ||= '';
+    return $self if !defined($length) || CORE::length($self->{ text }) <= $length;
+    # a suffix wider than $length is itself cut down so the result never exceeds $length
+    $suffix = CORE::substr($suffix, 0, $length) if $length >= 0 && CORE::length($suffix) > $length;
+    $self->{ text } = CORE::substr($self->{ text }, 0, $length - CORE::length($suffix)) . $suffix;
+    return $self;
+}
+
+
+sub substr {
+    # Return the part of the text starting at $offset (default 0) that is
+    # $length characters long (default: up to the end).  If $replacement is
+    # given too, that part of the stored text is overwritten with it and
+    # the text it replaced is returned.
+    my ($self, $offset, $length, $replacement) = @_;
+    $offset ||= 0;
+
+    return CORE::substr( $self->{text}, $offset )
+        unless defined $length;
+    return CORE::substr( $self->{text}, $offset, $length )
+        unless defined $replacement;
+
+    # remember what is about to be overwritten, then assign through substr()
+    my $removed = CORE::substr( $self->{text}, $offset, $length );
+    CORE::substr( $self->{text}, $offset, $length ) = $replacement;
+    return $removed;
+}
+
+
+sub repeat {
+    my ($self, $count) = @_;
+    # without a count the text is left as it is
+    $self->{ text } x= $count if defined $count;
+    return $self;
+}
+
+
+sub replace {
+    # every match of the pattern $search is replaced by the string $replace (default '')
+    my ($self, $search, $replace) = @_;
+    my $with = defined $replace ? $replace : '';
+    $self->{ text } =~ s/$search/$with/g if defined $search;
+    return $self;
+}
+
+
+sub remove {
+    my ($self, $search) = @_;
+    my $pattern = defined $search ? $search : '';    # undef is treated as ''
+    $self->{ text } =~ s/$pattern//g;                 # delete every match
+    return $self;
+}
+
+
+sub split {
+    # Split the text on the pattern $split (default: runs of whitespace) and
+    # return a list reference; $limit (default 0) is passed on to split().
+    my ($self, $split, $limit) = @_;
+    my $pattern = defined $split ? $split : '\s+';
+    return [ CORE::split($pattern, $self->{ text }, $limit || 0) ];
+}
+
+
+sub search {
+    my ($self, $regex) = @_;    # $regex is interpolated as a Perl pattern
+    return $self->{ text } =~ m{$regex};
+}
+
+
+sub equals {
+    my ($self, $other) = @_;    # compared as strings against the stored text
+    return $self->{ text } eq $other;
+}
+
+
+1;
+
+__END__
+
+=head1 NAME
+
+Template::Plugin::String - Object oriented interface for string manipulation
+
+=head1 SYNOPSIS
+
+    # create String objects via USE directive
+    [% USE String %]
+    [% USE String 'initial text' %]
+    [% USE String text => 'initial text' %]
+
+    # or from an existing String via new()
+    [% newstring = String.new %]
+    [% newstring = String.new('newstring text') %]
+    [% newstring = String.new( text => 'newstring text' ) %]
+
+    # or from an existing String via copy()
+    [% newstring = String.copy %]
+
+    # append text to string
+    [% String.append('text to append') %]
+
+    # format left, right or center/centre padded
+    [% String.left(20) %]
+    [% String.right(20) %]
+    [% String.center(20) %]   # American spelling
+    [% String.centre(20) %]   # European spelling
+
+    # and various other methods...
+
+=head1 DESCRIPTION
+
+This module implements a C<String> class for doing stringy things to
+text in an object-oriented way. 
+
+You can create a C<String> object via the C<USE> directive, adding any 
+initial text value as an argument or as the named parameter C<text>.
+
+    [% USE String %]
+    [% USE String 'initial text' %]
+    [% USE String text='initial text' %]
+
+The object created will be referenced as C<String> by default, but you
+can provide a different variable name for the object to be assigned
+to:
+
+    [% USE greeting = String 'Hello World' %]
+
+Once you've got a C<String> object, you can use it as a prototype to 
+create other C<String> objects with the C<new()> method.
+
+    [% USE String %]
+    [% greeting = String.new('Hello World') %]
+
+The C<new()> method also accepts an initial text string as an argument
+or the named parameter C<text>.
+
+    [% greeting = String.new( text => 'Hello World' ) %]
+
+You can also call C<copy()> to create a new C<String> as a copy of the 
+original.
+
+    [% greet2 = greeting.copy %]
+
+The C<String> object has a C<text()> method to return the content of the 
+string.
+
+    [% greeting.text %]
+
+However, it is sufficient to simply print the string and let the
+overloaded stringification operator call the C<text()> method
+automatically for you.
+
+    [% greeting %]
+
+Thus, you can treat C<String> objects pretty much like any regular piece
+of text, interpolating it into other strings, for example:
+
+    [% msg = "It printed '$greeting' and then dumped core\n" %]
+
+You also have the benefit of numerous other methods for manipulating
+the string.  
+
+    [% msg.append("PS  Don't eat the yellow snow") %]
+
+Note that all methods operate on and mutate the contents of the string
+itself.  If you want to operate on a copy of the string then simply
+take a copy first:
+
+    [% msg.copy.append("PS  Don't eat the yellow snow") %]
+
+These methods return a reference to the C<String> object itself.  This
+allows you to chain multiple methods together.
+
+    [% msg.copy.append('foo').right(72) %]
+
+It also means that in the above examples, the C<String> is returned which
+causes the C<text()> method to be called, which results in the new value of
+the string being printed.  To suppress printing of the string, you can
+use the C<CALL> directive.
+
+    [% foo = String.new('foo') %]
+    
+    [% foo.append('bar') %]         # prints "foobar"
+    
+    [% CALL foo.append('bar') %]    # nothing
+
+=head1 CONSTRUCTOR METHODS
+
+These methods are used to create new C<String> objects.
+
+=head2 new()
+
+Creates a new string using an initial value passed as a positional
+argument or the named parameter C<text>.
+
+    [% USE String %]
+    [% msg = String.new('Hello World') %]
+    [% msg = String.new( text => 'Hello World' ) %]
+
+=head2 copy()
+
+Creates a new C<String> object which contains a copy of the original string.
+
+    [% msg2 = msg.copy %]
+
+=head1 INSPECTOR METHODS
+
+These methods are used to examine the string.
+
+=head2 text()
+
+Returns the internal text value of the string.  The stringification
+operator is overloaded to call this method.  Thus the following are
+equivalent:
+
+    [% msg.text %]
+    [% msg %]
+
+=head2 length()
+
+Returns the length of the string.
+
+    [% USE String("foo") %]
+    [% String.length %]   # => 3
+
+=head2 search($pattern)
+
+Searches the string for the regular expression specified in C<$pattern>
+returning true if found or false otherwise.
+
+    [% item = String.new('foo bar baz wiz waz woz') %]
+    [% item.search('wiz') ? 'WIZZY! :-)' : 'not wizzy :-(' %]
+
+=head2 split($pattern, $limit)
+
+Splits the string based on the delimiter C<$pattern> and optional C<$limit>.
+Delegates to Perl's internal C<split()> so the parameters are exactly the same.
+
+    [% FOREACH item.split %]
+         ...
+    [% END %]
+
+    [% FOREACH item.split('baz|waz') %]
+         ...
+    [% END %]
+
+=head1 MUTATOR METHODS
+
+These methods modify the internal value of the string.  For example:
+
+    [% USE str=String('foobar') %]
+    [% str.append('.html') %]   # str => 'foobar.html'
+
+The value of C<str> is now 'C<foobar.html>'.  If you don't want
+to modify the string then simply take a copy first.
+
+    [% str.copy.append('.html') %]
+
+These methods all return a reference to the C<String> object itself.  This
+has two important benefits.  The first is that when used as above, the 
+C<String> object 'C<str>' returned by the C<append()> method will be stringified
+with a call to its C<text()> method.  This will return the newly modified 
+string content.  In other words, a directive like:
+
+    [% str.append('.html') %]
+
+will update the string and also print the new value.  If you just want
+to update the string but not print the new value then use C<CALL>.
+
+    [% CALL str.append('.html') %]
+
+The other benefit of these methods returning a reference to the C<String>
+is that you can chain as many different method calls together as you
+like.  For example:
+
+    [% String.append('.html').trim.format(href) %]
+
+Here are the methods:
+
+=head2 push($suffix, ...) / append($suffix, ...)
+
+Appends all arguments to the end of the string.  The 
+C<append()> method is provided as an alias for C<push()>.
+
+    [% msg.push('foo', 'bar') %]
+    [% msg.append('foo', 'bar') %]
+
+=head2 pop($suffix)
+
+Removes the suffix passed as an argument from the end of the String.
+
+    [% USE String 'foo bar' %]
+    [% String.pop(' bar')   %]   # => 'foo'
+
+=head2 unshift($prefix, ...) / prepend($prefix, ...)
+
+Prepends all arguments to the beginning of the string.  The
+C<prepend()> method is provided as an alias for C<unshift()>.
+
+    [% msg.unshift('foo ', 'bar ') %]
+    [% msg.prepend('foo ', 'bar ') %]
+
+=head2 shift($prefix)
+
+Removes the prefix passed as an argument from the start of the String.
+
+    [% USE String 'foo bar' %]
+    [% String.shift('foo ') %]   # => 'bar'
+
+=head2 left($pad)
+
+If the length of the string is less than C<$pad> then the string is left
+formatted and padded with spaces to C<$pad> length.
+
+    [% msg.left(20) %]
+
+=head2 right($pad)
+
+As per L<left()> but right-justifying the C<String>: spaces are added on the left up to a length of C<$pad>.
+
+    [% msg.right(20) %]
+
+=head2 center($pad) / centre($pad)
+
+As per L<left()> and L<right()> but formatting the C<String> to be centered within 
+a space padded string of length C<$pad>.  The C<centre()> method is provided as 
+an alias for C<center()>.
+
+    [% msg.center(20) %]    # American spelling
+    [% msg.centre(20) %]    # European spelling
+
+=head2 format($format)
+
+Apply a format in the style of C<sprintf()> to the string.
+
+    [% USE String("world") %]
+    [% String.format("Hello %s\n") %]  # => "Hello world\n"
+
+=head2 upper()
+
+Converts the string to upper case.
+
+    [% USE String("foo") %]
+    [% String.upper %]  # => 'FOO'
+
+=head2 lower()
+
+Converts the string to lower case
+
+    [% USE String("FOO") %]
+    [% String.lower %]  # => 'foo'
+
+=head2 capital()
+
+Converts the first character of the string to upper case.  
+
+    [% USE String("foo") %]
+    [% String.capital %]  # => 'Foo'
+
+The remainder of the string is left untouched.  To force the string to
+be all lower case with only the first letter capitalised, you can do 
+something like this:
+
+    [% USE String("FOO") %]
+    [% String.lower.capital %]  # => 'Foo'
+
+=head2 chop()
+
+Removes the last character from the string.
+
+    [% USE String("foop") %]
+    [% String.chop %]   # => 'foo'
+
+=head2 chomp()
+
+Removes the trailing newline from the string.
+
+    [% USE String("foo\n") %]
+    [% String.chomp %]  # => 'foo'
+
+=head2 trim()
+
+Removes all leading and trailing whitespace from the string
+
+    [% USE String("   foo   \n\n ") %]
+    [% String.trim %]   # => 'foo'
+
+=head2 collapse()
+
+Removes all leading and trailing whitespace and collapses any sequences
+of multiple whitespace to a single space.
+
+    [% USE String(" \n\r  \t  foo   \n \n bar  \n") %]
+    [% String.collapse %]   # => "foo bar"
+
+=head2 truncate($length, $suffix)
+
+Truncates the string to C<$length> characters.
+
+    [% USE String('long string') %]
+    [% String.truncate(4) %]  # => 'long'
+
+If C<$suffix> is specified then it will be appended to the truncated
+string.  In this case, the string will be further shortened by the 
+length of the suffix to ensure that the newly constructed string
+complete with suffix is exactly C<$length> characters long.
+
+    [% USE msg = String('Hello World') %]
+    [% msg.truncate(8, '...') %]   # => 'Hello...'
+
+=head2 replace($search, $replace)
+
+Replaces all occurrences of C<$search> in the string with C<$replace>.
+
+    [% USE String('foo bar foo baz') %]
+    [% String.replace('foo', 'wiz')  %]  # => 'wiz bar wiz baz'
+
+=head2 remove($search)
+
+Remove all occurrences of C<$search> in the string.
+
+    [% USE String('foo bar foo baz') %]
+    [% String.remove('foo ')  %]  # => 'bar baz'
+
+=head2 repeat($count)
+
+Repeats the string C<$count> times.
+
+    [% USE String('foo ') %]
+    [% String.repeat(3)  %]  # => 'foo foo foo '
+
+=head1 AUTHOR
+
+Andy Wardley E<lt>abw at wardley.orgE<gt> L<http://wardley.org/>
+
+=head1 COPYRIGHT
+
+Copyright (C) 1996-2007 Andy Wardley.  All Rights Reserved.
+
+This module is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=head1 SEE ALSO
+
+L<Template::Plugin>
+
+=cut
+
+# Local Variables:
+# mode: perl
+# perl-indent-level: 4
+# indent-tabs-mode: nil
+# End:
+#
+# vim: expandtab shiftwidth=4:
diff --git a/bench/perl/Template/Plugin/Table.pm b/bench/perl/Template/Plugin/Table.pm
new file mode 100644
index 0000000..dabc890
--- /dev/null
+++ b/bench/perl/Template/Plugin/Table.pm
@@ -0,0 +1,441 @@
+#============================================================= -*-Perl-*-
+#
+# Template::Plugin::Table
+#
+# DESCRIPTION
+#   Plugin to order a linear data set into a virtual 2-dimensional table
+#   from which row and column permutations can be fetched.
+#
+# AUTHOR
+#   Andy Wardley   <abw at wardley.org>
+#
+# COPYRIGHT
+#   Copyright (C) 2000-2007 Andy Wardley.  All Rights Reserved.
+#
+#   This module is free software; you can redistribute it and/or
+#   modify it under the same terms as Perl itself.
+#
+#============================================================================
+
+package Template::Plugin::Table;
+
+use strict;
+use warnings;
+use base 'Template::Plugin';
+use Scalar::Util 'blessed';
+
+our $VERSION = 2.71;
+our $AUTOLOAD;
+
+
+#------------------------------------------------------------------------
+# new($context, \@data, \%args)
+#
+# Constructor.  The first parameter is a reference to a list of data
+# items, or a Template::Iterator from which all items are extracted.
+# The table shape is calculated from the ROWS or COLS parameter in the
+# $args hash reference.  OVERLAP gives the number of items shared between
+# subsequent columns; with ROWS it must be smaller than ROWS.  Invalid
+# arguments are reported via $class->error().
+#------------------------------------------------------------------------
+
+sub new {
+    my ($class, $context, $data, $params) = @_;
+    my ($size, $rows, $cols, $coloff, $overlap, $error);
+
+    # if the data item is a reference to a Template::Iterator object,
+    # or subclass thereof, we call its get_all() method to extract all
+    # the data it contains
+    if (blessed($data) && $data->isa('Template::Iterator')) {
+        ($data, $error) = $data->get_all();
+        return $class->error("iterator failed to provide data for table: ", $error)
+            if $error;
+    }
+
+    return $class->error('invalid table data, expecting a list')
+        unless ref $data eq 'ARRAY';
+
+    $params ||= { };
+    return $class->error('invalid table parameters, expecting a hash')
+        unless ref $params eq 'HASH';
+
+    # ensure keys are folded to upper case
+    @$params{ map { uc } keys %$params } = values %$params;
+
+    $size = scalar @$data;
+    $overlap = $params->{ OVERLAP } || 0;
+
+    # calculate number of columns based on a specified number of rows
+    if ($rows = $params->{ ROWS }) {
+        if ($size < $rows) {
+            $rows = $size;   # pad?
+            $cols = 1;
+            $coloff = 0;
+        }
+        else {
+            $coloff = $rows - $overlap;
+            return $class->error('invalid table overlap, expecting less than rows')
+                if $coloff < 1;     # would divide by zero (or step backwards) below
+            $cols = int($size / $coloff) + ($size % $coloff > $overlap ? 1 : 0);
+        }
+    }
+    # calculate number of rows based on a specified number of columns
+    elsif ($cols = $params->{ COLS }) {
+        if ($size < $cols) {
+            $cols = $size;
+            $rows = 1;
+            $coloff = 1;
+        }
+        else {
+            $coloff = int ($size / $cols)
+                + ($size % $cols > $overlap ? 1 : 0);
+            $rows = $coloff + $overlap;
+        }
+    }
+    else {
+        $rows = $size;
+        $cols = 1;
+        $coloff = 0;
+    }
+
+    bless {
+        _DATA    => $data,
+        _SIZE    => $size,
+        _NROWS   => $rows,
+        _NCOLS   => $cols,
+        _COLOFF  => $coloff,
+        _OVERLAP => $overlap,
+        _PAD     => defined $params->{ PAD } ? $params->{ PAD } : 1,
+    }, $class;
+}
+
+
+#------------------------------------------------------------------------
+# row($n)
+#
+# Returns a reference to a list of the items in row $n (counting from 0);
+# cells past the end of the data are undef if PAD is set, else omitted.
+# An out-of-range $n returns an empty list; an undefined $n calls rows().
+#------------------------------------------------------------------------
+
+sub row {
+    my ($self, $row) = @_;
+    my ($data, $cols, $offset, $size, $pad) 
+        = @$self{ qw( _DATA _NCOLS _COLOFF _SIZE _PAD) };
+    my @set;
+
+    # return all rows if row number not specified
+    return $self->rows()
+        unless defined $row;
+
+    return () if $row >= $self->{ _NROWS } || $row < 0;
+    
+    my $index = $row;           # start at data index $row (the first column)
+
+    for (my $c = 0; $c < $cols; $c++) {
+        push(@set, $index < $size 
+             ? $data->[$index] 
+             : ($pad ? undef : ()));
+        $index += $offset;      # advance by the column offset to the next column
+    }
+    return \@set;
+}
+
+
+#------------------------------------------------------------------------
+# col($n)
+#
+# Returns a reference to a list of the items in column $n (counting from
+# 0); a short column is filled out with undef values if PAD is set.  An
+# out-of-range $n returns an empty list; an undefined $n calls cols().
+#------------------------------------------------------------------------
+
+sub col {
+    my ($self, $col) = @_;
+    my ($data, $size) = @$self{ qw( _DATA _SIZE ) };
+    my ($start, $end);
+    my $blanks = 0;
+
+    # return all cols if column number not specified
+    return $self->cols()
+        unless defined $col;
+
+    return () if $col >= $self->{ _NCOLS } || $col < 0;
+
+    $start = $self->{ _COLOFF } * $col;
+    $end = $start + $self->{ _NROWS } - 1;
+    $end = $start if $end < $start;         # never let the range run backwards
+    if ($end >= $size) {
+        $blanks = ($end - $size) + 1;       # cells that fall past the end of the data
+        $end = $size - 1;
+    }
+    return () if $start >= $size;
+    return [ @$data[$start..$end], 
+             $self->{ _PAD } ? ((undef) x $blanks) : () ];
+}
+
+
+#------------------------------------------------------------------------
+# rows()
+#
+# Returns a reference to a list of all rows, each as returned by row($n).
+#------------------------------------------------------------------------
+
+sub rows {
+    my $self = shift;
+    return [ map { $self->row($_) } (0..$self->{ _NROWS }-1) ];
+}
+
+
+#------------------------------------------------------------------------
+# cols()
+#
+# Returns a reference to a list of all columns, each as returned by col($n).
+#------------------------------------------------------------------------
+
+sub cols {
+    my $self = shift;
+    return [ map { $self->col($_) } (0..$self->{ _NCOLS }-1) ];
+}
+
+
+#------------------------------------------------------------------------
+# AUTOLOAD
+#
+# Read accessors for the data, size, nrows, ncols, overlap and pad members.
+#------------------------------------------------------------------------
+
+sub AUTOLOAD {
+    my $self = shift;
+    my $item = $AUTOLOAD;
+    $item =~ s/.*:://;
+    return if $item eq 'DESTROY';
+
+    if ($item =~ /^(?:data|size|nrows|ncols|overlap|pad)$/) {
+        return $self->{ '_' . uc $item };   # new() stores e.g. nrows as _NROWS
+    }
+    else {
+        return (undef, "no such table method: $item");
+    }
+}
+
+
+
+1;
+
+__END__
+
+=head1 NAME
+
+Template::Plugin::Table - Plugin to present data in a table
+
+=head1 SYNOPSIS
+
+    [% USE table(list, rows=n, cols=n, overlap=n, pad=0) %]
+    
+    [% FOREACH item IN table.row(n) %]
+       [% item %]
+    [% END %]
+    
+    [% FOREACH item IN table.col(n) %]
+       [% item %]
+    [% END %]
+    
+    [% FOREACH row IN table.rows %]
+       [% FOREACH item IN row %]
+          [% item %]
+       [% END %]
+    [% END %]
+    
+    [% FOREACH col IN table.cols %]
+       [% col.first %] - [% col.last %] ([% col.size %] entries)
+    [% END %]
+
+=head1 DESCRIPTION
+
+The C<Table> plugin allows you to format a list of data items into a 
+virtual table.  When you create a C<Table> plugin via the C<USE> directive,
+simply pass a list reference as the first parameter and then specify 
+a fixed number of rows or columns.
+
+    [% USE Table(list, rows=5) %]
+    [% USE table(list, cols=5) %]
+
+The C<Table> plugin name can also be specified in lower case as shown
+in the second example above.  You can also specify an alternative variable
+name for the plugin as per regular Template Toolkit syntax.
+
+    [% USE mydata = table(list, rows=5) %]
+
+The plugin then presents a table based view on the data set.  The data
+isn't actually reorganised in any way but is available via the C<row()>,
+C<col()>, C<rows()> and C<cols()> as if formatted into a simple two dimensional
+table of C<n> rows x C<n> columns.  
+
+So if we had a sample C<alphabet> list containing the letters 'C<a>' to 'C<z>',
+the above C<USE> directives would create plugins that represented the following 
+views of the alphabet.
+
+    [% USE table(alphabet, ...) %]
+    
+    rows=5                  cols=5
+    a  f  k  p  u  z        a  g  m  s  y
+    b  g  l  q  v           b  h  n  t  z
+    c  h  m  r  w           c  i  o  u
+    d  i  n  s  x           d  j  p  v
+    e  j  o  t  y           e  k  q  w
+                            f  l  r  x
+
+We can request a particular row or column using the C<row()> and C<col()>
+methods.
+
+    [% USE table(alphabet, rows=5) %]
+    [% FOREACH item = table.row(0) %]
+       # [% item %] set to each of [ a f k p u z ] in turn
+    [% END %]
+    
+    [% FOREACH item = table.col(2) %]
+       # [% item %] set to each of [ k l m n o ] in turn
+    [% END %]
+
+Data in rows is returned from left to right, columns from top to
+bottom.  The first row/column is 0.  By default, rows or columns that
+contain empty values will be padded with the undefined value to fill
+it to the same size as all other rows or columns.  
+
+For example, the last row (row 4) in the first example would contain the
+values C<[ e j o t y undef ]>. The Template Toolkit will safely accept these
+undefined values and print an empty string. You can also use the IF directive
+to test if the value is set.
+
+   [% FOREACH item = table.row(4) %]
+      [% IF item %]
+         Item: [% item %]
+      [% END %]
+   [% END %]
+
+You can explicitly disable the C<pad> option when creating the plugin to 
+return shortened rows/columns where the data is empty.
+
+   [% USE table(alphabet, cols=5, pad=0) %]
+   [% FOREACH item = table.col(4) %]
+      # [% item %] set to each of 'y z'
+   [% END %]
+
+The C<rows()> method returns all rows in the table as a reference
+to a list of rows (themselves list references).  The C<row()> method,
+when called without any arguments, calls C<rows()> to return all rows in
+the table.
+
+Ditto for C<cols()> and C<col()>.
+
+    [% USE table(alphabet, cols=5) %]
+    [% FOREACH row = table.rows %]
+       [% FOREACH item = row %]
+          [% item %]
+       [% END %]
+    [% END %]
+
+The Template Toolkit provides the C<first>, C<last> and C<size> virtual
+methods that can be called on list references to return the first/last entry
+or the number of entries in a list. The following example shows how we might
+use this to provide an alphabetical index split into 3 even parts.
+
+    [% USE table(alphabet, cols=3, pad=0) %]
+    [% FOREACH group = table.col %]
+       [ [% group.first %] - [% group.last %] ([% group.size %] letters) ]
+    [% END %]
+
+This produces the following output:
+
+    [ a - i (9 letters) ]
+    [ j - r (9 letters) ]
+    [ s - z (8 letters) ]
+
+We can also use the general purpose C<join> virtual method which joins 
+the items of the list using the connecting string specified.
+
+    [% USE table(alphabet, cols=5) %]
+    [% FOREACH row = table.rows %]
+       [% row.join(' - ') %]
+    [% END %]
+
+Data in the table is ordered downwards rather than across but can easily
+be transformed on output.  For example, to format our data in 5 columns
+with data ordered across rather than down, we specify C<rows=5> to order
+the data as such:
+
+    a  f  .  .
+    b  g  .
+    c  h
+    d  i
+    e  j
+
+and then iterate down through each column (a-e, f-j, etc.) printing
+the data across.
+
+    a  b  c  d  e
+    f  g  h  i  j
+    .  .
+    .
+
+Example code to do so would be much like the following:
+
+    [% USE table(alphabet, rows=3) %]
+    [% FOREACH cols = table.cols %]
+      [% FOREACH item = cols %]
+        [% item %]
+      [% END %]
+    [% END %]
+
+Output:
+
+    a  b  c
+    d  e  f
+    g  h  i
+    j  .  .
+    .
+
+In addition to a list reference, the C<Table> plugin constructor may be passed
+a reference to a L<Template::Iterator> object or subclass thereof. The
+L<Template::Iterator> L<get_all()|Template::Iterator#get_all()> method is
+first called on the iterator to return all remaining items. These are then
+available via the usual Table interface.
+
+    [% USE DBI(dsn,user,pass) -%]
+    
+    # query() returns an iterator
+    [% results = DBI.query('SELECT * FROM alphabet ORDER BY letter') %]
+
+    # pass into Table plugin
+    [% USE table(results, rows=8 overlap=1 pad=0) -%]
+    
+    [% FOREACH row = table.cols -%]
+       [% row.first.letter %] - [% row.last.letter %]:
+          [% row.join(', ') %]
+    [% END %]
+
+=head1 AUTHOR
+
+Andy Wardley E<lt>abw at wardley.orgE<gt> L<http://wardley.org/>
+
+=head1 COPYRIGHT
+
+Copyright (C) 1996-2007 Andy Wardley.  All Rights Reserved.
+
+This module is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=head1 SEE ALSO
+
+L<Template::Plugin>
+
+=cut
+
+# Local Variables:
+# mode: perl
+# perl-indent-level: 4
+# indent-tabs-mode: nil
+# End:
+#
+# vim: expandtab shiftwidth=4:
diff --git a/bench/perl/Template/Plugin/URL.pm b/bench/perl/Template/Plugin/URL.pm
new file mode 100644
index 0000000..c3474c1
--- /dev/null
+++ b/bench/perl/Template/Plugin/URL.pm
@@ -0,0 +1,203 @@
+#============================================================= -*-Perl-*-
+#
+# Template::Plugin::URL
+#
+# DESCRIPTION
+#   Template Toolkit Plugin for constructing URL's from a base stem 
+#   and adaptable parameters.
+#
+# AUTHOR
+#   Andy Wardley   <abw at wardley.org>
+#
+# COPYRIGHT
+#   Copyright (C) 2000-2007 Andy Wardley.  All Rights Reserved.
+#
+#   This module is free software; you can redistribute it and/or
+#   modify it under the same terms as Perl itself.
+#
+#============================================================================
+
+package Template::Plugin::URL;
+
+use strict;
+use warnings;
+use base 'Template::Plugin';
+
+our $VERSION = 2.74;
+our $JOINT   = '&';
+
+
+#------------------------------------------------------------------------
+# new($context, $baseurl, \%url_params)
+#
+# Constructor method which returns a subroutine closure for constructing
+# complex URLs from a base part and hash of additional parameters.
+#------------------------------------------------------------------------
+
+sub new {
+    my ($class, $context, $base, $args) = @_;
+    $args ||= { };
+
+    return sub {
+        my $newbase = ref $_[0] eq 'HASH' ? undef : shift;  # no conditional 'my'
+        my $newargs = shift || { };
+        my $combo   = { %$args, %$newargs };    # call-time args override defaults
+        my $urlargs = join($JOINT,
+                           map  { args($_, $combo->{ $_ }) }
+                           grep { defined $combo->{ $_ } && length $combo->{ $_ } }
+                           sort keys %$combo);
+
+        my $query = $newbase || $base || '';
+        $query .= '?' if length $query && length $urlargs;
+        $query .= $urlargs if length $urlargs;
+
+        return $query;
+    };
+}
+
+
+sub args {
+    my ($key, $val) = @_;
+    $key = escape($key);
+    # one escaped "key=value" string per value; a list ref yields several
+    return map {
+        "$key=" . escape($_);
+    } ref $val eq 'ARRAY' ? @$val : $val;
+    
+}
+
+#------------------------------------------------------------------------
+# escape($url)
+#
+# URL-encode data (character strings are encoded as their UTF-8 octets).
+# Borrowed with minor modifications from CGI.pm.  Kudos to Lincoln Stein.
+#------------------------------------------------------------------------
+sub escape {
+    my $toencode = shift;
+    return undef unless defined($toencode);
+    utf8::encode($toencode) if utf8::is_utf8($toencode);   # wide chars => octets
+    $toencode=~s/([^a-zA-Z0-9_.-])/uc sprintf("%%%02x",ord($1))/eg;
+    return $toencode;
+}
+
+1;
+
+__END__
+
+=head1 NAME
+
+Template::Plugin::URL - Plugin to construct complex URLs
+
+=head1 SYNOPSIS
+
+    [% USE url('/cgi-bin/foo.pl') %]
+    
+    [% url(debug = 1, id = 123) %]
+       # ==> /cgi-bin/foo.pl?debug=1&id=123
+
+    [% USE mycgi = url('/cgi-bin/bar.pl', mode='browse', debug=1) %]
+    
+    [% mycgi %]
+       # ==> /cgi-bin/bar.pl?debug=1&mode=browse
+    
+    [% mycgi(mode='submit') %]
+       # ==> /cgi-bin/bar.pl?debug=1&mode=submit
+    
+    [% mycgi(debug='d2 p0', id='D4-2k[4]') %]
+       # ==> /cgi-bin/bar.pl?debug=d2%20p0&id=D4-2k%5B4%5D&mode=browse
+
+=head1 DESCRIPTION
+
+The C<URL> plugin can be used to construct complex URLs from a base stem 
+and a hash array of additional query parameters.
+
+The constructor should be passed a base URL and optionally, a hash array
+reference of default parameters and values.  Used from within a template,
+it would look something like the following:
+
+    [% USE url('http://www.somewhere.com/cgi-bin/foo.pl') %]
+    [% USE url('/cgi-bin/bar.pl', mode='browse') %]
+    [% USE url('/cgi-bin/baz.pl', mode='browse', debug=1) %]
+
+When the plugin is then called without any arguments, the default base
+and parameters are returned as a formatted query string.  
+
+    [% url %]
+
+For the above three examples, these will produce the following outputs:
+
+    http://www.somewhere.com/cgi-bin/foo.pl
+    /cgi-bin/bar.pl?mode=browse
+    /cgi-bin/baz.pl?debug=1&mode=browse
+
+Note that additional parameters are separated by 'C<&>' rather than
+simply 'C<&>'.  This is the correct behaviour for HTML pages but is,
+unfortunately, incorrect when creating URLs that do not need to be
+encoded safely for HTML.  This is likely to be corrected in a future
+version of the plugin (most probably with TT3).  In the mean time, you
+can set C<$Template::Plugin::URL::JOINT> to C<&> to get the correct
+behaviour.
+
+Additional parameters may also be specified to the URL:
+
+    [% url(mode='submit', id='wiz') %]
+
+Which, for the same three examples, produces:
+
+    http://www.somewhere.com/cgi-bin/foo.pl?id=wiz&mode=submit
+    /cgi-bin/bar.pl?id=wiz&mode=submit
+    /cgi-bin/baz.pl?debug=1&id=wiz&mode=submit
+
+A new base URL may also be specified as the first option:
+
+    [% url('/cgi-bin/waz.pl', test=1) %]
+
+producing
+
+    /cgi-bin/waz.pl?test=1
+    /cgi-bin/waz.pl?mode=browse&test=1
+    /cgi-bin/waz.pl?debug=1&mode=browse&test=1
+
+The parameters are always output in sorted key order, as Perl's hashes
+themselves are unordered.  Not that it matters greatly, as the
+ordering of CGI parameters is insignificant (to the best of my knowledge).
+All values will be properly escaped thanks to some code borrowed from
+Lincoln Stein's C<CGI> module.  e.g.
+
+    [% USE url('/cgi-bin/woz.pl') %]
+    [% url(name="Elrich von Benjy d'Weiro") %]
+
+Here the spaces and "C<'>" character are escaped in the output:
+
+    /cgi-bin/woz.pl?name=Elrich%20von%20Benjy%20d%27Weiro
+
+An alternate name may be provided for the plugin at construction time
+as per regular Template Toolkit syntax.
+
+    [% USE mycgi = url('cgi-bin/min.pl') %]
+    [% mycgi(debug=1) %]
+
+=head1 AUTHOR
+
+Andy Wardley E<lt>abw at wardley.orgE<gt> L<http://wardley.org/>
+
+=head1 COPYRIGHT
+
+Copyright (C) 1996-2007 Andy Wardley.  All Rights Reserved.
+
+This module is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=head1 SEE ALSO
+
+L<Template::Plugin>
+
+=cut
+
+# Local Variables:
+# mode: perl
+# perl-indent-level: 4
+# indent-tabs-mode: nil
+# End:
+#
+# vim: expandtab shiftwidth=4:
diff --git a/bench/perl/Template/Plugin/View.pm b/bench/perl/Template/Plugin/View.pm
new file mode 100644
index 0000000..5dbbfb6
--- /dev/null
+++ b/bench/perl/Template/Plugin/View.pm
@@ -0,0 +1,97 @@
+#============================================================= -*-Perl-*-
+#
+# Template::Plugin::View
+#
+# DESCRIPTION
+#   A user-definable view based on templates.  Similar to the concept of
+#   a "Skin".
+#
+# AUTHOR
+#   Andy Wardley   <abw at wardley.org>
+#
+# COPYRIGHT
+#   Copyright (C) 2000-2007 Andy Wardley.  All Rights Reserved.
+#
+#   This module is free software; you can redistribute it and/or
+#   modify it under the same terms as Perl itself.
+#
+#============================================================================
+
+package Template::Plugin::View;
+
+use strict;
+use warnings;
+use base 'Template::Plugin';
+
+our $VERSION = 2.68;
+
+use Template::View;
+
+#------------------------------------------------------------------------
+# new($context, \%config)
+#------------------------------------------------------------------------
+
+sub new {
+    my $class = shift;
+    my $context = shift;
+    my $view = Template::View->new($context, @_)    # remaining args passed through
+        || return $class->error($Template::View::ERROR);
+    $view->seal();
+    return $view;           # the Template::View itself, not a plugin wrapper
+}
+
+1;
+
+__END__
+
+
+=head1 NAME
+
+Template::Plugin::View - Plugin to create views (Template::View)
+
+=head1 SYNOPSIS
+
+    [% USE view(
+            prefix = 'splash/'          # template prefix/suffix
+            suffix = '.tt2'             
+            bgcol  = '#ffffff'          # and any other variables you 
+            style  = 'Fancy HTML'       # care to define as view metadata,
+            items  = [ foo, bar.baz ]   # including complex data and
+            foo    = bar ? baz : x.y.z  # expressions
+    %]
+    
+    [% view.title %]                    # access view metadata
+    
+    [% view.header(title = 'Foo!') %]   # view "methods" process blocks or
+    [% view.footer %]                   # templates with prefix/suffix added
+
+=head1 DESCRIPTION
+
+This plugin module creates L<Template::View> objects.  Views are an
+experimental feature and are subject to change in the near future.
+In the mean time, please consult L<Template::View> for further info.
+
+=head1 AUTHOR
+
+Andy Wardley E<lt>abw at wardley.orgE<gt> L<http://wardley.org/>
+
+=head1 COPYRIGHT
+
+Copyright (C) 1996-2007 Andy Wardley.  All Rights Reserved.
+
+This module is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=head1 SEE ALSO
+
+L<Template::Plugin>, L<Template::View>, L<Template::Manual::Views>
+
+=cut
+
+# Local Variables:
+# mode: perl
+# perl-indent-level: 4
+# indent-tabs-mode: nil
+# End:
+#
+# vim: expandtab shiftwidth=4:
diff --git a/bench/perl/Template/Plugin/Wrap.pm b/bench/perl/Template/Plugin/Wrap.pm
new file mode 100644
index 0000000..bb68042
--- /dev/null
+++ b/bench/perl/Template/Plugin/Wrap.pm
@@ -0,0 +1,142 @@
+#============================================================= -*-Perl-*-
+#
+# Template::Plugin::Wrap
+#
+# DESCRIPTION
+#   Plugin for wrapping text via the Text::Wrap module.
+#
+# AUTHOR
+#   Andy Wardley   <abw at wardley.org>
+#
+# COPYRIGHT
+#   Copyright (C) 1996-2007 Andy Wardley.  All Rights Reserved.
+#
+#   This module is free software; you can redistribute it and/or
+#   modify it under the same terms as Perl itself.
+#
+#============================================================================
+
+package Template::Plugin::Wrap;
+
+use strict;
+use warnings;
+use base 'Template::Plugin';
+use Text::Wrap;
+
+our $VERSION = 2.68;
+
+sub new {    # registers the 'wrap' filter and returns the wrap subroutine
+    my ($class, $context) = @_;
+    $context->define_filter('wrap', [ \&wrap_filter_factory => 1 ]);
+    return \&tt_wrap;
+}
+
+sub tt_wrap {
+    # tt_wrap($text, $width, $initial_tab, $subsequent_tab): $width
+    # defaults to 72 and both tabs default to the empty string
+    my ($text, $width, $itab, $ntab) = @_;
+    $itab = '' unless defined $itab;
+    $ntab = '' unless defined $ntab;
+    # localised so the caller's own Text::Wrap width is left untouched
+    local $Text::Wrap::columns = $width || 72;
+    Text::Wrap::wrap($itab, $ntab, $text);
+}
+
+sub wrap_filter_factory {
+    my ($context, @args) = @_;      # @args: the filter's (width, itab, ntab)
+    return sub {
+        my $text = shift;
+        tt_wrap($text, @args);      # the filtered text is the closure's result
+    }
+}
+
+
+1;
+
+__END__
+
+=head1 NAME
+
+Template::Plugin::Wrap - Plugin interface to Text::Wrap
+
+=head1 SYNOPSIS
+
+    [% USE wrap %]
+    
+    # call wrap subroutine
+    [% wrap(mytext, width, initial_tab,  subsequent_tab) %]
+    
+    # or use wrap FILTER
+    [% mytext FILTER wrap(width, initial_tab, subsequent_tab) %]
+
+=head1 DESCRIPTION
+
+This plugin provides an interface to the L<Text::Wrap> module which 
+provides simple paragraph formatting.
+
+It defines a C<wrap> subroutine which can be called, passing the input
+text and further optional parameters to specify the page width (default:
+72), and tab characters for the first and subsequent lines (no defaults).
+
+    [% USE wrap %]
+    
+    [% text = BLOCK %]
+    First, attach the transmutex multiplier to the cross-wired 
+    quantum homogeniser.
+    [% END %]
+    
+    [% wrap(text, 40, '* ', '  ') %]
+
+Output:
+
+    * First, attach the transmutex
+      multiplier to the cross-wired quantum
+      homogeniser.
+
+It also registers a C<wrap> filter which accepts the same three optional 
+arguments but takes the input text directly via the filter input.
+
+Example 1:
+
+    [% FILTER bullet = wrap(40, '* ', '  ') -%]
+    First, attach the transmutex multiplier to the cross-wired quantum
+    homogeniser.
+    [%- END %]
+
+Output:
+
+    * First, attach the transmutex
+      multiplier to the cross-wired quantum
+      homogeniser.
+
+Example 2:
+
+    [% FILTER bullet -%]
+    Then remodulate the shield to match the harmonic frequency, taking 
+    care to correct the phase difference.
+    [% END %]
+
+Output:
+
+    * Then remodulate the shield to match
+      the harmonic frequency, taking 
+      care to correct the phase difference.
+
+=head1 AUTHOR
+
+Andy Wardley E<lt>abw at wardley.orgE<gt> L<http://wardley.org/>
+
+The L<Text::Wrap> module was written by David Muir Sharnoff
+with help from Tim Pierce and many others.
+
+=head1 COPYRIGHT
+
+Copyright (C) 1996-2007 Andy Wardley.  All Rights Reserved.
+
+This module is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=head1 SEE ALSO
+
+L<Template::Plugin>, L<Text::Wrap>
+
diff --git a/bench/perl/Template/Plugins.pm b/bench/perl/Template/Plugins.pm
new file mode 100644
index 0000000..41c5b2c
--- /dev/null
+++ b/bench/perl/Template/Plugins.pm
@@ -0,0 +1,466 @@
+#============================================================= -*-Perl-*-
+#
+# Template::Plugins
+#
+# DESCRIPTION
+#   Plugin provider which handles the loading of plugin modules and 
+#   instantiation of plugin objects.
+#
+# AUTHORS
+#   Andy Wardley <abw at wardley.org>
+#
+# COPYRIGHT
+#   Copyright (C) 1996-2006 Andy Wardley.  All Rights Reserved.
+#   Copyright (C) 1998-2000 Canon Research Centre Europe Ltd.
+#
+#   This module is free software; you can redistribute it and/or
+#   modify it under the same terms as Perl itself.
+#
+# REVISION
+#   $Id: Plugins.pm 1179 2008-12-09 19:29:21Z abw $
+#
+#============================================================================
+
+package Template::Plugins;
+
+use strict;
+use warnings;
+use base 'Template::Base';
+use Template::Constants;
+
+our $VERSION = 2.77;
+our $DEBUG   = 0 unless defined $DEBUG;    # keeps any value that was set beforehand
+our $PLUGIN_BASE = 'Template::Plugin';     # default namespace; _init() appends it to PLUGIN_BASE
+our $STD_PLUGINS = {                       # name => module map; _init() merges PLUGINS over it
+    'assert'     => 'Template::Plugin::Assert',
+    'autoformat' => 'Template::Plugin::Autoformat',
+    'cgi'        => 'Template::Plugin::CGI',
+    'datafile'   => 'Template::Plugin::Datafile',
+    'date'       => 'Template::Plugin::Date',
+    'debug'      => 'Template::Plugin::Debug',
+    'directory'  => 'Template::Plugin::Directory',
+    'dbi'        => 'Template::Plugin::DBI',
+    'dumper'     => 'Template::Plugin::Dumper',
+    'file'       => 'Template::Plugin::File',
+    'format'     => 'Template::Plugin::Format',
+    'html'       => 'Template::Plugin::HTML',
+    'image'      => 'Template::Plugin::Image',
+    'iterator'   => 'Template::Plugin::Iterator',
+    'latex'      => 'Template::Plugin::Latex',
+    'pod'        => 'Template::Plugin::Pod',
+    'scalar'     => 'Template::Plugin::Scalar',
+    'table'      => 'Template::Plugin::Table',
+    'url'        => 'Template::Plugin::URL',
+    'view'       => 'Template::Plugin::View',
+    'wrap'       => 'Template::Plugin::Wrap',
+    'xml'        => 'Template::Plugin::XML',
+    'xmlstyle'   => 'Template::Plugin::XML::Style',
+};
+
+
+#========================================================================
+#                         -- PUBLIC METHODS --
+#========================================================================
+
+#------------------------------------------------------------------------
+# fetch($name, \@args, $context)
+#
+# General purpose method for requesting instantiation of a plugin
+# object.  The name of the plugin is passed as the first parameter.
+# The internal FACTORY lookup table is consulted to retrieve the
+# appropriate factory object or class name.  If undefined, the _load()
+# method is called to attempt to load the module and return a factory
+# class/object which is then cached for subsequent use.  A reference
+# to the calling context should be passed as the third parameter.
+# This is passed to the _load() class method.  The new() method is
+# then called against the factory class name or prototype object to
+# instantiate a new plugin object, passing any arguments specified by
+# list reference as the second parameter.  e.g. where $factory is the
+# class name 'MyClass', the new() method is called as a class method,
+# $factory->new(...), equivalent to MyClass->new(...) .  Where
+# $factory is a prototype object, the new() method is called as an
+# object method, $myobject->new(...).  This latter approach allows
+# plugins to act as Singletons, cache shared data, etc.  
+#
+# Returns a reference to a plugin, (undef, STATUS_DECLINED) to decline
+# the request or ($error, STATUS_ERROR) on error.
+#------------------------------------------------------------------------
+
+sub fetch {
+    my ($self, $name, $args, $context) = @_;
+    my ($factory, $plugin, $error);
+
+    $self->debug("fetch($name, ", 
+                 defined $args ? ('[ ', join(', ', @$args), ' ]') : '<no args>', ', ',
+                 defined $context ? $context : '<no context>', 
+                 ')') if $self->{ DEBUG };
+
+    # NOTE:
+    # the $context ref gets passed as the first parameter to all regular
+    # plugins, but not to those loaded via LOAD_PERL;  so $context is
+    # always prepended to the argument list here, and the factory closure
+    # that _load() builds for the LOAD_PERL case shifts it off again
+    # before calling the module's new() constructor
+
+    $args ||= [ ];
+    unshift @$args, $context;       # NB: this changes the caller's list in place
+
+    $factory = $self->{ FACTORY }->{ $name } ||= do {   # cached factory, or load one now
+        ($factory, $error) = $self->_load($name, $context);
+        return ($factory, $error) if $error;			## RETURN
+        $factory;
+    };
+
+    # call the new() method on the factory object or class name
+    eval {
+        if (ref $factory eq 'CODE') {       # e.g. the closure _load() builds for LOAD_PERL
+            defined( $plugin = &$factory(@$args) )
+                || die "$name plugin failed\n";
+        }
+        else {
+            defined( $plugin = $factory->new(@$args) )
+                || die "$name plugin failed: ", $factory->error(), "\n";
+        }
+    };
+    if ($error = $@) {                      # construction died or returned undef
+#	chomp $error;
+        return $self->{ TOLERANT } 
+	       ? (undef,  Template::Constants::STATUS_DECLINED)
+	       : ($error, Template::Constants::STATUS_ERROR);
+    }
+
+    return $plugin;
+}
+
+
+
+#========================================================================
+#                        -- PRIVATE METHODS --
+#========================================================================
+
+#------------------------------------------------------------------------
+# _init(\%config)
+#
+# Private initialisation method.
+#------------------------------------------------------------------------
+
+sub _init {
+    my ($self, $params) = @_;
+    my ($bases, $custom, $factory) =
+        @$params{ qw( PLUGIN_BASE PLUGINS PLUGIN_FACTORY ) };
+
+    # PLUGIN_BASE may be undefined, a single name or a list reference;
+    # a list reference is used as is, so the push() below appends to it
+    if (! defined $bases) {
+        $bases = [ ];
+    }
+    elsif (ref($bases) ne 'ARRAY') {
+        $bases = [ $bases ];
+    }
+    push(@$bases, $PLUGIN_BASE) if $PLUGIN_BASE;    # default base goes last
+
+    $self->{ PLUGIN_BASE } = $bases;
+    $self->{ PLUGINS     } = { %$STD_PLUGINS, %{ $custom || { } } };
+    $self->{ FACTORY     } = $factory || { };
+    $self->{ TOLERANT    } = $params->{ TOLERANT }  || 0;
+    $self->{ LOAD_PERL   } = $params->{ LOAD_PERL } || 0;
+    $self->{ DEBUG       } = ( $params->{ DEBUG } || 0 )
+                             & Template::Constants::DEBUG_PLUGINS;
+    return $self;
+}
+
+
+
+#------------------------------------------------------------------------
+# _load($name, $context)
+#
+# Private method which attempts to load a plugin module and determine the 
+# correct factory name or object by calling the load() class method in
+# the loaded module.
+#------------------------------------------------------------------------
+
+sub _load {
+    my ($self, $name, $context) = @_;
+    my ($factory, $module, $base, $pkg, $file, $ok, $error);
+
+    if ($module = $self->{ PLUGINS }->{ $name } || $self->{ PLUGINS }->{ lc $name }) {
+        # plugin module name is explicitly stated in the PLUGINS map, which
+        # is consulted with the name as given and then in lower case
+        $pkg = $module;
+        ($file = $module) =~ s|::|/|g;      # Foo::Bar => Foo/Bar
+        $self->debug("loading $module.pm (PLUGIN_NAME)")
+            if $self->{ DEBUG };
+        $ok = eval { require "$file.pm" };
+        $error = $@;
+    }
+    else {
+        # try each of the PLUGIN_BASE values to build module name
+        ($module = $name) =~ s/\./::/g;     # foo.bar => foo::bar
+        
+        foreach $base (@{ $self->{ PLUGIN_BASE } }) {
+            $pkg = $base . '::' . $module;
+            ($file = $pkg) =~ s|::|/|g;
+            
+            $self->debug("loading $file.pm (PLUGIN_BASE)")
+                if $self->{ DEBUG };
+            
+            $ok = eval { require "$file.pm" };
+            last unless $@;
+            # keep real failures only; \Q..\E stops $file acting as a pattern
+            $error .= "$@\n" 
+                unless ($@ =~ /^Can\'t locate \Q$file\E\.pm/);
+        }
+    }
+    
+    if ($ok) {
+        $self->debug("calling $pkg->load()") if $self->{ DEBUG };
+
+        $factory = eval { $pkg->load($context) };
+        $error   = '';      # drop messages collected from earlier attempts
+        if ($@ || ! $factory) {
+            $error = $@ || 'load() returned a false value';
+        }
+    }
+    elsif ($self->{ LOAD_PERL }) {
+        # fallback - is it a regular Perl module?
+        ($file = $module) =~ s|::|/|g;
+        eval { require "$file.pm" };
+        if ($@) {
+            $error = $@;
+        }
+        else {
+            # this is a regular Perl module so the new() constructor
+            # isn't expecting a $context reference as the first argument;
+            # so we construct a closure which removes it before calling
+            # $module->new(@_);
+            $factory = sub {
+                shift;
+                $module->new(@_);
+            };
+            $error   = '';
+        }
+    }
+    
+    if ($factory) {
+        $self->debug("$name => $factory") if $self->{ DEBUG };
+        return $factory;
+    }
+    elsif ($error) {
+        # a recorded failure is an error, or a decline when TOLERANT is set
+        return $self->{ TOLERANT } 
+            ? (undef,  Template::Constants::STATUS_DECLINED) 
+            : ($error, Template::Constants::STATUS_ERROR);
+    }
+    else {
+        return (undef, Template::Constants::STATUS_DECLINED);
+    }
+}
+
+
+#------------------------------------------------------------------------
+# _dump()
+# 
+# Debug method which constructs and returns text representing the current
+# state of the object.
+#------------------------------------------------------------------------
+
+sub _dump {
+    my $self = shift;
+    my $row  = "    %-16s => %s\n";
+    my $map  = $self->{ PLUGINS };
+
+    # scalar flags come first
+    my $text = "[Template::Plugins] {\n";
+    $text .= sprintf($row, $_, $self->{ $_ })
+        for qw( TOLERANT LOAD_PERL );
+
+    # list-like settings are flattened to comma separated strings
+    my $made  = join(", ", keys %{ $self->{ FACTORY } });
+    my $bases = join(', ', @{ $self->{ PLUGIN_BASE } });
+
+    # each plugin mapping gets a row of its own, indented one level deeper
+    my $rows = '';
+    foreach my $pname (keys %$map) {
+        $rows .= sprintf("    $row", $pname, $map->{ $pname });
+    }
+
+    $text .= sprintf($row, 'PLUGIN_BASE', "[ $bases ]");
+    $text .= sprintf($row, 'PLUGINS', "{\n$rows    }");
+    return $text . sprintf($row, 'FACTORY', $made) . '}';
+}
+
+
+1;
+
+__END__
+
+=head1 NAME
+
+Template::Plugins - Plugin provider module
+
+=head1 SYNOPSIS
+
+    use Template::Plugins;
+    
+    $plugin_provider = Template::Plugins->new(\%options);
+    
+    ($plugin, $error) = $plugin_provider->fetch($name, @args);
+
+=head1 DESCRIPTION
+
+The C<Template::Plugins> module defines a provider class which can be used
+to load and instantiate Template Toolkit plugin modules.
+
+=head1 METHODS
+
+=head2 new(\%params) 
+
+Constructor method which instantiates and returns a reference to a
+C<Template::Plugins> object.  A reference to a hash array of configuration
+items may be passed as a parameter.  These are described below.  
+
+Note that the L<Template> front-end module creates a C<Template::Plugins>
+provider, passing all configuration items.  Thus, the examples shown
+below in the form:
+
+    $plugprov = Template::Plugins->new({
+        PLUGIN_BASE => 'MyTemplate::Plugin',
+        LOAD_PERL   => 1,
+        ...
+    });
+
+can also be used via the L<Template> module as:
+
+    $ttengine = Template->new({
+        PLUGIN_BASE => 'MyTemplate::Plugin',
+        LOAD_PERL   => 1,
+        ...
+    });
+
+as well as the more explicit form of:
+
+    $plugprov = Template::Plugins->new({
+        PLUGIN_BASE => 'MyTemplate::Plugin',
+        LOAD_PERL   => 1,
+        ...
+    });
+    
+    $ttengine = Template->new({
+        LOAD_PLUGINS => [ $plugprov ],
+    });
+
+=head2 fetch($name, @args)
+
+Called to request that a plugin of a given name be provided. The relevant
+module is first loaded (if necessary) and the
+L<load()|Template::Plugin#load()> class method called to return the factory
+class name (usually the same package name) or a factory object (a prototype).
+The L<new()|Template::Plugin#new()> method is then called as a class or object
+method against the factory, passing all remaining parameters.
+
+Returns a reference to a new plugin object or C<($error, STATUS_ERROR)>
+on error.  May also return C<(undef, STATUS_DECLINED)> to decline to
+serve the request.  If C<TOLERANT> is set then all errors will be
+returned as declines.
+
+=head1 CONFIGURATION OPTIONS
+
+The following list summarises the configuration options that can be provided
+to the C<Template::Plugins> L<new()> constructor.  Please consult 
+L<Template::Manual::Config> for further details and examples of each 
+configuration option in use.
+
+=head2 PLUGINS
+
+The L<PLUGINS|Template::Manual::Config#PLUGINS> option can be used to provide
+a reference to a hash array that maps plugin names to Perl module names.
+
+    my $plugins = Template::Plugins->new({
+        PLUGINS => {
+            cgi => 'MyOrg::Template::Plugin::CGI',
+            foo => 'MyOrg::Template::Plugin::Foo',
+            bar => 'MyOrg::Template::Plugin::Bar',
+        },  
+    }); 
+
+=head2 PLUGIN_BASE
+
+If a plugin is not defined in the L<PLUGINS|Template::Manual::Config#PLUGINS>
+hash then the L<PLUGIN_BASE|Template::Manual::Config#PLUGIN_BASE> is used to
+attempt to construct a correct Perl module name which can be successfully
+loaded.
+
+    # single value PLUGIN_BASE
+    my $plugins = Template::Plugins->new({
+        PLUGIN_BASE => 'MyOrg::Template::Plugin',
+    });
+
+    # multiple value PLUGIN_BASE
+    my $plugins = Template::Plugins->new({
+        PLUGIN_BASE => [   'MyOrg::Template::Plugin',
+                           'YourOrg::Template::Plugin'  ],
+    });
+
+=head2 LOAD_PERL
+
+The L<LOAD_PERL|Template::Manual::Config#LOAD_PERL> option can be set to allow
+you to load regular Perl modules (i.e. those that don't reside in the
+C<Template::Plugin> or another user-defined namespace) as plugins.
+
+If a plugin cannot be loaded using the
+L<PLUGINS|Template::Manual::Config#PLUGINS> or
+L<PLUGIN_BASE|Template::Manual::Config#PLUGIN_BASE> approaches then,
+if the L<LOAD_PERL|Template::Manual::Config#LOAD_PERL> is set, the
+provider will make a final attempt to load the module without prepending any
+prefix to the module path. 
+
+Unlike regular plugins, modules loaded using L<LOAD_PERL|Template::Manual::Config#LOAD_PERL>
+do not receive a L<Template::Context> reference as the first argument to the 
+C<new()> constructor method.
+
+=head2 TOLERANT
+
+The L<TOLERANT|Template::Manual::Config#TOLERANT> flag can be set to indicate
+that the C<Template::Plugins> module should ignore any errors encountered while
+loading a plugin and instead return C<STATUS_DECLINED>.
+
+=head2 DEBUG
+
+The L<DEBUG|Template::Manual::Config#DEBUG> option can be used to enable
+debugging messages for the C<Template::Plugins> module by setting it to
+include the C<DEBUG_PLUGINS> value.
+
+    use Template::Constants qw( :debug );
+    
+    my $template = Template->new({
+        DEBUG => DEBUG_FILTERS | DEBUG_PLUGINS,
+    });
+
+=head1 TEMPLATE TOOLKIT PLUGINS
+
+Please see L<Template::Manual::Plugins> for a complete list of all the plugin 
+modules distributed with the Template Toolkit.
+
+=head1 AUTHOR
+
+Andy Wardley E<lt>abw at wardley.orgE<gt> L<http://wardley.org/>
+
+=head1 COPYRIGHT
+
+Copyright (C) 1996-2007 Andy Wardley.  All Rights Reserved.
+
+This module is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=head1 SEE ALSO
+
+L<Template::Manual::Plugins>, L<Template::Plugin>, L<Template::Context>, L<Template>.
+
+=cut
+
+# Local Variables:
+# mode: perl
+# perl-indent-level: 4
+# indent-tabs-mode: nil
+# End:
+#
+# vim: expandtab shiftwidth=4:
diff --git a/bench/perl/Template/Provider.pm b/bench/perl/Template/Provider.pm
new file mode 100644
index 0000000..4551b20
--- /dev/null
+++ b/bench/perl/Template/Provider.pm
@@ -0,0 +1,1391 @@
+#============================================================= -*-Perl-*-
+#
+# Template::Provider
+#
+# DESCRIPTION
+#   This module implements a class which handles the loading, compiling
+#   and caching of templates.  Multiple Template::Provider objects can
+#   be stacked and queried in turn to effect a Chain-of-Command between
+#   them.  A provider will attempt to return the requested template,
+#   an error (STATUS_ERROR) or decline to provide the template
+#   (STATUS_DECLINED), allowing subsequent providers to attempt to
+#   deliver it.   See 'Design Patterns' for further details.
+#
+# AUTHORS
+#   Andy Wardley <abw at wardley.org>
+#
+#   Refactored by Bill Moseley for v2.19 to add negative caching (i.e. 
+#   tracking templates that are NOTFOUND so that we can decline quickly)
+#   and to provide better support for subclassing the provider.
+#
+# COPYRIGHT
+#   Copyright (C) 1996-2007 Andy Wardley.  All Rights Reserved.
+#
+#   This module is free software; you can redistribute it and/or
+#   modify it under the same terms as Perl itself.
+#
+# WARNING:
+#   This code is ugly and contorted and is being totally re-written for TT3.
+#   In particular, we'll be throwing errors rather than messing around 
+#   returning (value, status) pairs.  With the benefit of hindsight, that 
+#   was a really bad design decision on my part. I deserve to be knocked
+#   to the ground and kicked around a bit by hordes of angry TT developers
+#   for that one.  Bill's refactoring has made the module easier to subclass, 
+#   (so you can ease off the kicking now), but it really needs to be totally
+#   redesigned and rebuilt from the ground up along with the bits of TT that
+#   use it.                                           -- abw 2007/04/27
+#============================================================================
+
+package Template::Provider;
+
+use strict;
+use warnings;
+use base 'Template::Base';
+use Template::Config;
+use Template::Constants;
+use Template::Document;
+use File::Basename;
+use File::Spec;
+
+use constant PREV   => 0;   # link to previous item in cache linked list
+use constant NAME   => 1;   # template name -- indexed by this name in LOOKUP
+use constant DATA   => 2;   # Compiled template
+use constant LOAD   => 3;   # mtime of template
+use constant NEXT   => 4;   # link to next item in cache linked list
+use constant STAT   => 5;   # Time last stat()ed
+
+our $VERSION = 2.94;
+our $DEBUG   = 0 unless defined $DEBUG;
+our $ERROR   = '';
+
+# name of document class
+our $DOCUMENT = 'Template::Document' unless defined $DOCUMENT;
+
+# maximum time (in seconds) between performing stat() on file to check staleness
+our $STAT_TTL = 1 unless defined $STAT_TTL;
+
+# maximum number of directories in an INCLUDE_PATH, to prevent runaways
+our $MAX_DIRS = 64 unless defined $MAX_DIRS;
+
+# UNICODE is supported in versions of Perl from 5.007 onwards
+our $UNICODE = $] > 5.007 ? 1 : 0;
+
+my $boms = [                # ordered list of encoding name => byte order mark pairs
+    'UTF-8'    => "\x{ef}\x{bb}\x{bf}",
+    'UTF-32BE' => "\x{0}\x{0}\x{fe}\x{ff}",
+    'UTF-32LE' => "\x{ff}\x{fe}\x{0}\x{0}",    # begins with the UTF-16LE mark, listed first
+    'UTF-16BE' => "\x{fe}\x{ff}",
+    'UTF-16LE' => "\x{ff}\x{fe}",
+];
+
+# regex to match relative paths: one or more dots then '/', at the start or after a '/'
+our $RELATIVE_PATH = qr[(?:^|/)\.+/];
+
+
+# hack so that 'use bytes' will compile on versions of Perl earlier than
+# 5.6, even though we never call _decode_unicode() on those systems
+BEGIN {
+    if ($] < 5.006) {
+        package bytes;
+        $INC{'bytes.pm'} = 1;   # recorded in %INC so 'use bytes' loads no file
+    }
+}
+
+
+#========================================================================
+#                         -- PUBLIC METHODS --
+#========================================================================
+
+#------------------------------------------------------------------------
+# fetch($name)
+#
+# Returns a compiled template for the name specified by parameter.
+# The template is returned from the internal cache if it exists, or
+# loaded and then subsequently cached.  The ABSOLUTE and RELATIVE
+# configuration flags determine if absolute (e.g. '/something...')
+# and/or relative (e.g. './something') paths should be honoured.  The
+# INCLUDE_PATH is otherwise used to find the named file. $name may
+# also be a reference to a text string containing the template text,
+# or a file handle from which the content is read.  The compiled
+# template is not cached in these latter cases given that there is no
+# filename to cache under.  A subsequent call to store($name,
+# $compiled) can be made to cache the compiled template for future
+# fetch() calls, if necessary.
+#
+# Returns a compiled template or (undef, STATUS_DECLINED) if the
+# template could not be found.  On error (e.g. the file was found
+# but couldn't be read or parsed), the pair ($error, STATUS_ERROR)
+# is returned.  The TOLERANT configuration option can be set to
+# downgrade any errors to STATUS_DECLINED.
+#------------------------------------------------------------------------
+
+sub fetch {
+    my ($self, $name) = @_;
+    my ($data, $error);
+
+    # each branch below leaves its result in ($data, $error); both stay
+    # undefined unless the branch taken assigns them
+    if (ref $name) {
+        # a reference (scalar, GLOB or file handle) is loaded and compiled
+        # directly, and the document is extracted from the compiled result
+        ($data, $error) = $self->_load($name);
+        ($data, $error) = $self->_compile($data) unless $error;
+        $data = $data->{ data } unless $error;
+    }
+    elsif (File::Spec->file_name_is_absolute($name)) {
+        # absolute paths (starting '/') are honoured only if ABSOLUTE is set
+        if    ($self->{ ABSOLUTE }) { ($data, $error) = $self->_fetch($name) }
+        elsif ($self->{ TOLERANT }) { $error = Template::Constants::STATUS_DECLINED }
+        else {
+            $data  = "$name: absolute paths are not allowed (set ABSOLUTE option)";
+            $error = Template::Constants::STATUS_ERROR;
+        }
+    }
+    elsif ($name =~ m/$RELATIVE_PATH/o) {
+        # names with a "./" style component are honoured only if RELATIVE is set
+        if    ($self->{ RELATIVE }) { ($data, $error) = $self->_fetch($name) }
+        elsif ($self->{ TOLERANT }) { $error = Template::Constants::STATUS_DECLINED }
+        else {
+            $data  = "$name: relative paths are not allowed (set RELATIVE option)";
+            $error = Template::Constants::STATUS_ERROR;
+        }
+    }
+    elsif ($self->{ INCLUDE_PATH }) {
+        # any other name is taken as a file name relative to one of the
+        # INCLUDE_PATH directories
+        ($data, $error) = $self->_fetch_path($name);
+    }
+    else {
+        # there is no INCLUDE_PATH to search, so the request is declined
+        $error = Template::Constants::STATUS_DECLINED;
+    }
+
+    return ($data, $error);
+}
+
+
+#------------------------------------------------------------------------
+# store($name, $data)
+#
+# Store a compiled template ($data) in the cache as $name.
+# Returns compiled template
+#------------------------------------------------------------------------
+
+sub store {
+    my ($self, $name, $data) = @_;
+    # wrap the compiled template in the record passed on to _store(),
+    # with its load time recorded as 0
+    my %record = (data => $data, load => 0);
+    return $self->_store($name, \%record);
+}
+
+
+#------------------------------------------------------------------------
+# load($name)
+#
+# Load a template without parsing/compiling it, suitable for use with
+# the INSERT directive.  There's some duplication with fetch() and at
+# some point this could be reworked to integrate them a little closer.
+#------------------------------------------------------------------------
+
+sub load {
+    my ($self, $name) = @_;
+    my ($data, $error);
+    my $path = $name;
+
+    if (File::Spec->file_name_is_absolute($name)) {
+        # absolute paths (starting '/') are refused unless ABSOLUTE is set
+        unless ($self->{ ABSOLUTE }) {
+            $error = "$name: absolute paths are not allowed (set ABSOLUTE option)";
+        }
+    }
+    elsif ($name =~ m[$RELATIVE_PATH]o) {
+        # names with a "./" style component are refused unless RELATIVE is set
+        unless ($self->{ RELATIVE }) {
+            $error = "$name: relative paths are not allowed (set RELATIVE option)";
+        }
+    }
+    else {
+        # otherwise try the name under each INCLUDE_PATH directory in turn and
+        # keep the first candidate that _template_modified() returns true for
+        my $dirs = $self->paths()
+            || return ($self->error(), Template::Constants::STATUS_ERROR);
+
+        undef $path;        # stays undefined when nothing is found
+        foreach my $dir (@$dirs) {
+            my $candidate = File::Spec->catfile($dir, $name);
+            next unless $self->_template_modified($candidate);
+            $path = $candidate;
+            last;
+        }
+    }
+
+    # read the content only when a path is known and nothing was refused
+    if (defined $path && ! $error) {
+        ($data, $error) = $self->_template_content($path);
+    }
+
+    # outcome: an error (downgraded to a decline if TOLERANT is set),
+    # a decline when no file was found, or the content with STATUS_OK
+    if ($error) {
+        return (undef, Template::Constants::STATUS_DECLINED) if $self->{ TOLERANT };
+        return ($error, Template::Constants::STATUS_ERROR);
+    }
+    return (undef, Template::Constants::STATUS_DECLINED) unless defined $path;
+    return ($data, Template::Constants::STATUS_OK);
+}
+
+
+
+#------------------------------------------------------------------------
+# include_path(\@newpath)
+#
+# Accessor method for the INCLUDE_PATH setting.  If called with an
+# argument, this method will replace the existing INCLUDE_PATH with
+# the new value.
+#------------------------------------------------------------------------
+
+sub include_path {
+     my ($self, $replacement) = @_;    # false or missing => plain read access
+     $replacement and $self->{ INCLUDE_PATH } = $replacement;
+     return $self->{ INCLUDE_PATH };
+}
+
+
+#------------------------------------------------------------------------
+# paths()
+#
+# Evaluates the INCLUDE_PATH list, ignoring any blank entries, and
+# calling any subroutine or object references to return dynamically
+# generated path lists.  Returns a reference to a new list of paths
+# or undef on error.
+#------------------------------------------------------------------------
+
+sub paths {
+    my $self   = shift;
+    my @ipaths = @{ $self->{ INCLUDE_PATH } };   # work on a copy of the list
+    my (@opaths, $dpaths, $dir);
+    my $count = $MAX_DIRS;                       # entries that may still be examined
+    # post-decrement: exactly $MAX_DIRS entries (blank/dynamic ones included) are allowed
+    while (@ipaths && $count--) {
+        $dir = shift @ipaths || next;            # blank entries are skipped
+
+        # $dir can be a sub or object ref which returns a reference
+        # to a dynamically generated list of search paths.
+
+        if (ref $dir eq 'CODE') {
+            eval { $dpaths = &$dir() };
+            if ($@) {
+                chomp $@;
+                return $self->error($@);
+            }
+            unshift(@ipaths, @$dpaths);          # expanded in place, then examined in turn
+            next;
+        }
+        elsif (ref($dir) && UNIVERSAL::can($dir, 'paths')) {
+            $dpaths = $dir->paths()
+                || return $self->error($dir->error());
+            unshift(@ipaths, @$dpaths);
+            next;
+        }
+        else {
+            push(@opaths, $dir);
+        }
+    }
+    return $self->error("INCLUDE_PATH exceeds $MAX_DIRS directories")
+        if @ipaths;                              # entries left over => limit exceeded
+
+    return \@opaths;
+}
+
+
+#------------------------------------------------------------------------
+# DESTROY
+#
+# The provider cache is implemented as a doubly linked list which Perl
+# cannot free by itself due to the circular references between NEXT <=>
+# PREV items.  This cleanup method walks the list deleting all the NEXT/PREV
+# references, allowing the proper cleanup to occur and memory to be
+# repooled.
+#------------------------------------------------------------------------
+
+sub DESTROY {
+    my $self = shift;
+
+    # walk the slot list from the head, cutting both links of every slot
+    # so that no slot is kept alive by a reference from its neighbour
+    for (my $slot = $self->{ HEAD }; $slot; ) {
+        my $following = $slot->[ NEXT ];
+        undef $slot->[ PREV ];
+        undef $slot->[ NEXT ];
+        $slot = $following;
+    }
+    undef $self->{ HEAD };
+    undef $self->{ TAIL };
+}
+
+
+
+
+#========================================================================
+#                        -- PRIVATE METHODS --
+#========================================================================
+
+#------------------------------------------------------------------------
+# _init()
+#
+# Initialise the cache.
+#------------------------------------------------------------------------
+
+sub _init {
+    my ($self, $params) = @_;
+    my $size = $params->{ CACHE_SIZE   };           # undefined is reported as 'unlimited' below
+    my $path = $params->{ INCLUDE_PATH } || '.';    # defaults to the current directory
+    my $cdir = $params->{ COMPILE_DIR  } || '';
+    my $dlim = $params->{ DELIMITER    };
+    my $debug;
+
+    # tweak delim to ignore C:/
+    unless (defined $dlim) {
+        $dlim = ($^O eq 'MSWin32') ? ':(?!\\/)' : ':';
+    }
+
+    # coerce INCLUDE_PATH to an array ref, if not already so
+    $path = [ split(/$dlim/, $path) ]
+        unless ref $path eq 'ARRAY';
+
+    # don't allow a CACHE_SIZE 1 because it breaks things and the
+    # additional checking isn't worth it
+    $size = 2
+        if defined $size && ($size == 1 || $size < 0);     # negative sizes are bumped to 2 as well
+
+    if (defined ($debug = $params->{ DEBUG })) {
+        # keep only the provider and flag bits of the DEBUG value
+        $self->{ DEBUG } = $debug & ( Template::Constants::DEBUG_PROVIDER
+                                    | Template::Constants::DEBUG_FLAGS );
+    }
+    else {
+        $self->{ DEBUG } = $DEBUG;      # fall back to the package variable
+    }
+
+    if ($self->{ DEBUG }) {
+        local $" = ', ';
+        $self->debug("creating cache of ",
+                     defined $size ? $size : 'unlimited',
+                     " slots for [ @$path ]");
+    }
+
+    # create COMPILE_DIR and sub-directories representing each INCLUDE_PATH
+    # element in which to store compiled files
+    if ($cdir) {
+        require File::Path;
+        foreach my $dir (@$path) {
+            next if ref $dir;           # reference entries get no directory
+            my $wdir = $dir;
+            $wdir =~ s[:][]g if $^O eq 'MSWin32';   # drop every ':' from the name on Windows
+            $wdir =~ /(.*)/;  # untaint
+            $wdir = "$1";     # quotes work around bug in Strawberry Perl
+            $wdir = File::Spec->catfile($cdir, $wdir);
+            File::Path::mkpath($wdir) unless -d $wdir;
+        }
+    }
+
+    $self->{ LOOKUP       } = { };  # cache slots, indexed by template name
+    $self->{ NOTFOUND     } = { };  # Tracks templates *not* found.
+    $self->{ SLOTS        } = 0;
+    $self->{ SIZE         } = $size;
+    $self->{ INCLUDE_PATH } = $path;
+    $self->{ DELIMITER    } = $dlim;
+    $self->{ COMPILE_DIR  } = $cdir;
+    $self->{ COMPILE_EXT  } = $params->{ COMPILE_EXT } || '';
+    $self->{ ABSOLUTE     } = $params->{ ABSOLUTE } || 0;
+    $self->{ RELATIVE     } = $params->{ RELATIVE } || 0;
+    $self->{ TOLERANT     } = $params->{ TOLERANT } || 0;
+    $self->{ DOCUMENT     } = $params->{ DOCUMENT } || $DOCUMENT;
+    $self->{ PARSER       } = $params->{ PARSER   };
+    $self->{ DEFAULT      } = $params->{ DEFAULT  };
+    $self->{ ENCODING     } = $params->{ ENCODING };
+#   $self->{ PREFIX       } = $params->{ PREFIX   };
+    $self->{ STAT_TTL     } = $params->{ STAT_TTL } || $STAT_TTL;  # NB: 0 also selects the default
+    $self->{ PARAMS       } = $params;
+
+    # look for user-provided UNICODE parameter or use default from package var
+    $self->{ UNICODE      } = defined $params->{ UNICODE }
+                                    ? $params->{ UNICODE } : $UNICODE;
+
+    return $self;
+}
+
+
+#------------------------------------------------------------------------
+# _fetch($name, $t_name)
+#
+# Fetch a file from cache or disk by specification of an absolute or
+# relative filename.  No search of the INCLUDE_PATH is made.  If the
+# file is found and loaded, it is compiled and cached.
+# Call with:
+#   $name       = path to search (possibly prefixed by INCLUDE_PATH)
+#   $t_name     = template name
+#------------------------------------------------------------------------
+
+sub _fetch {
+    my ($self, $name, $t_name) = @_;
+    my $stat_ttl = $self->{ STAT_TTL };     # lifetime, in seconds, of a NOTFOUND entry
+
+    $self->debug("_fetch($name)") if $self->{ DEBUG };
+
+    # First see if the named template is in the memory cache
+    if ((my $slot = $self->{ LOOKUP }->{ $name })) {
+        # Test if cache is fresh, and reload/compile if not.
+        my ($data, $error) = $self->_refresh($slot);
+
+        return $error
+            ? ( $data, $error )     # $data may contain error text
+            : $slot->[ DATA ];      # returned document object
+    }
+
+    # Otherwise, see if we already know the template is not found
+    if (my $last_stat_time = $self->{ NOTFOUND }->{ $name }) {
+        my $expires_in = $last_stat_time + $stat_ttl - time;
+        if ($expires_in > 0) {
+            $self->debug(" file [$name] in negative cache.  Expires in $expires_in seconds")
+                if $self->{ DEBUG };
+            return (undef, Template::Constants::STATUS_DECLINED);
+        }
+        else {
+            delete $self->{ NOTFOUND }->{ $name };  # entry expired: look for the file again
+        }
+    }
+
+    # Is there an up-to-date compiled version on disk?
+    if ($self->_compiled_is_current($name)) {
+        # require() the compiled template.
+        my $compiled_template = $self->_load_compiled( $self->_compiled_filename($name) );
+
+        # Store and return the compiled template
+        return $self->store( $name, $compiled_template ) if $compiled_template;
+
+        # Problem loading compiled template:
+        # warn and continue to fetch source template
+        warn($self->error(), "\n");
+    }
+
+    # load template from source
+    my ($template, $error) = $self->_load($name, $t_name);
+
+    if ($error) {
+        # Template could not be fetched.  Add to the negative/notfound cache.
+        $self->{ NOTFOUND }->{ $name } = time;  # NB: done for any true $error from _load()
+        return ( $template, $error );
+    }
+
+    # compile template source
+    ($template, $error) = $self->_compile($template, $self->_compiled_filename($name) );
+
+    if ($error) {
+        # return any compile time error
+        return ($template, $error);
+    }
+    else {
+        # Store compiled template and return it
+        return $self->store($name, $template->{data}) ;    # only the 'data' item is cached
+    }
+}
+
+
+#------------------------------------------------------------------------
+# _fetch_path($name)
+#
+# Fetch a template identified either by a plain cache name (e.g.
+# 'header') or by a filename relative to one of the INCLUDE_PATH
+# directories.  The memory cache is consulted under the plain name
+# first; otherwise each INCLUDE_PATH directory is tried in turn via
+# _fetch(), which caches a hit under the full filename.  If nothing is
+# found and a DEFAULT template other than $name is configured, that is
+# fetched instead.
+# Returns ($data, $status); $status is false on success.
+#------------------------------------------------------------------------
+
+sub _fetch_path {
+    my ($self, $name) = @_;
+
+    $self->debug("_fetch_path($name)") if $self->{ DEBUG };
+
+    # a template may have been stored under a non-filename name, so
+    # the plain name is checked in the cache before touching the disk
+    my $slot = $self->{ LOOKUP }->{ $name };
+    if ($slot) {
+        my ($data, $status) = $self->_refresh($slot);
+
+        return ( $data, $status )
+            if $status;
+        return ( $slot->[ DATA ], $status );
+    }
+
+    my $paths = $self->paths;
+    return ( $self->error, Template::Constants::STATUS_ERROR )
+        unless $paths;
+
+    # walk the INCLUDE_PATH; a decline moves on to the next directory,
+    # success or a hard error ends the search
+    for my $dir (@$paths) {
+        my $path = File::Spec->catfile($dir, $name);
+
+        $self->debug("searching path: $path\n") if $self->{ DEBUG };
+
+        my ($data, $status) = $self->_fetch( $path, $name );
+
+        next if $status && $status != Template::Constants::STATUS_ERROR;
+
+        return ( $data, $status );
+    }
+
+    # nothing in the INCLUDE_PATH: fall back to the DEFAULT template,
+    # unless that is the very name we just failed to find
+    my $fallback = $self->{DEFAULT};
+    if (defined $fallback && $name ne $fallback) {
+        return $self->_fetch_path( $fallback );
+    }
+
+    # this provider cannot serve the template
+    return (undef, Template::Constants::STATUS_DECLINED);
+}
+
+# Map a template filename to the name of its compiled counterpart, built
+# from the COMPILE_EXT suffix and/or the COMPILE_DIR root.  Returns undef
+# when neither option is set; dies if $file is empty.
+sub _compiled_filename {
+    my ($self, $file) = @_;
+    my $compext = $self->{ COMPILE_EXT };
+    my $compdir = $self->{ COMPILE_DIR };
+
+    return undef
+        if !$compext && !$compdir;
+
+    die "invalid filename: $file"
+        unless $file =~ /^(.+)$/s;
+
+    # colons (drive letters) are dropped from the name on Windows
+    my $stem = $file;
+    $stem =~ s[:][]g if $^O eq 'MSWin32';
+
+    my $compiled = $stem . $compext;
+
+    return length($compdir)
+        ? File::Spec->catfile($compdir, $compiled)
+        : $compiled;
+}
+
+#------------------------------------------------------------------------
+# _load_compiled($file)
+#
+# Load a compiled template from $file via require() and return whatever
+# the file evaluates to.  On failure the error is recorded through
+# $self->error() and the result of that call is returned.
+#------------------------------------------------------------------------
+
+sub _load_compiled {
+    my ($self, $file) = @_;
+    my $compiled;
+
+    # load compiled template via require();  we zap any
+    # %INC entry to ensure it is reloaded (we don't
+    # want 1 returned by require() to say it's in memory)
+    delete $INC{ $file };
+
+    # judge success by the eval's own return value rather than by $@
+    # alone, so the outcome does not depend on $@ surviving intact
+    my $loaded = eval { $compiled = require $file; 1 };
+    return $compiled if $loaded;
+
+    # report the file that failed: $compiled is never set when require()
+    # dies, so it cannot be used to identify the template
+    my $reason = $@ || 'unknown error';
+    return $self->error("compiled template $file: $reason");
+}
+
+#------------------------------------------------------------------------
+# _load($name, $alias)
+#
+# Load template text from a string ($name = scalar ref), from a GLOB or
+# file handle ($name = any other ref), or from a file ($name = scalar).
+# On success returns a hash reference containing:
+#   name    $alias if defined (it defaults to a plain-scalar $name),
+#           otherwise 'input text' or 'input file handle'
+#   path    $name for files, otherwise the same value as 'name'
+#   text    template text
+#   time    modification time of file, or current time for handles/strings
+#   load    time file was loaded (now!), or 0 for handles/strings
+#
+# On error, returns ($error, STATUS_ERROR), or (undef, STATUS_DECLINED)
+# if TOLERANT is set or the template does not exist.
+#------------------------------------------------------------------------
+
+sub _load {
+    my ($self, $name, $alias) = @_;
+    my $tolerant = $self->{ TOLERANT };
+    my $now      = time;
+    my $kind     = ref $name;
+
+    # a plain filename doubles as the alias when none was given
+    if (!defined $alias && !$kind) {
+        $alias = $name;
+    }
+
+    $self->debug("_load($name, ", defined $alias ? $alias : '<no alias>',
+                 ')') if $self->{ DEBUG };
+
+    # a SCALAR reference carries the template text itself
+    if ($kind eq 'SCALAR') {
+        my $label = defined $alias ? $alias : 'input text';
+        return {
+            name => $label,
+            path => $label,
+            text => $$name,
+            time => $now,
+            load => 0,
+        };
+    }
+
+    # any other reference is treated as a file handle and slurped whole
+    if ($kind) {
+        my $label = defined $alias ? $alias : 'input file handle';
+        local $/;
+        my $text = <$name>;
+        $text = $self->_decode_unicode($text) if $self->{ UNICODE };
+        return {
+            name => $label,
+            path => $label,
+            text => $text,
+            time => $now,
+            load => 0,
+        };
+    }
+
+    # otherwise $name is a filename: read it if the template exists
+    if ( $self->_template_modified( $name ) ) {
+        my ($text, $failure, $mtime) = $self->_template_content( $name );
+
+        if ($failure) {
+            # a read failure is fatal unless we are being tolerant
+            return ( "$alias: $!", Template::Constants::STATUS_ERROR )
+                unless $tolerant;
+        }
+        else {
+            $text = $self->_decode_unicode($text) if $self->{ UNICODE };
+            return {
+                name => $alias,
+                path => $name,
+                text => $text,
+                time => $mtime,
+                load => $now,
+            };
+        }
+    }
+
+    # Unable to process template, pass onto the next Provider.
+    return (undef, Template::Constants::STATUS_DECLINED);
+}
+
+
+#------------------------------------------------------------------------
+# _refresh(\@slot)
+#
+# Private method called to mark a cache slot as most recently used.
+# A reference to the slot array should be passed by parameter.  The
+# slot is relocated to the head of the linked list.  If the file from
+# which the data was loaded has been updated since it was compiled, then
+# it is re-loaded from disk and re-compiled.
+#
+# The source is only stat()ed again once STAT_TTL seconds have passed
+# since the time recorded in the slot's STAT entry.
+#
+# Returns ($data, $error).  Both are undef when no reload was attempted.
+# After a successful reload $data is the hash returned by _compile() and
+# $error is false; after a failed reload they are whatever _load() or
+# _compile() returned.  The slot is moved to the head of the list in
+# every case.
+#------------------------------------------------------------------------
+
+sub _refresh {
+    my ($self, $slot) = @_;
+    my $stat_ttl = $self->{ STAT_TTL };
+    my ($head, $file, $data, $error);
+
+    $self->debug("_refresh([ ",
+                 join(', ', map { defined $_ ? $_ : '<undef>' } @$slot),
+                 '])') if $self->{ DEBUG };
+
+    # if it's more than $STAT_TTL seconds since we last performed a
+    # stat() on the file then we need to do it again and see if the file
+    # time has changed
+    my $now = time;
+    my $expires_in_sec = $slot->[ STAT ] + $stat_ttl - $now;
+
+    if ( $expires_in_sec <= 0 ) {  # Time to check!
+        # record this check so the next one waits a full STAT_TTL
+        $slot->[ STAT ] = $now;
+
+        # Grab mtime of template.
+        # Seems like this should be abstracted to compare to
+        # just ask for a newer compiled template (if it's newer)
+        # and let that check for a newer template source.
+        my $template_mtime = $self->_template_modified( $slot->[ NAME ] );
+        # reload when the source has vanished (undef mtime) or its mtime
+        # differs in either direction from the one recorded at load time
+        if ( ! defined $template_mtime || ( $template_mtime != $slot->[ LOAD ] )) {
+            $self->debug("refreshing cache file ", $slot->[ NAME ])
+                if $self->{ DEBUG };
+
+            # reload under the name held by the cached document; no compiled
+            # filename is passed, so nothing is written to disk here
+            ($data, $error) = $self->_load($slot->[ NAME ], $slot->[ DATA ]->{ name });
+            ($data, $error) = $self->_compile($data)
+                unless $error;
+
+            if ($error) {
+                # if the template failed to load/compile then we wipe out the
+                # STAT entry.  This forces the provider to try and reload it
+                # each time instead of using the previously cached version
+                # until $STAT_TTL is next up
+                $slot->[ STAT ] = 0;
+            }
+            else {
+                # swap in the new document and remember the source mtime
+                $slot->[ DATA ] = $data->{ data };
+                $slot->[ LOAD ] = $data->{ time };
+            }
+        }
+
+    } elsif ( $self->{ DEBUG } ) {
+        $self->debug( sprintf('STAT_TTL not met for file [%s].  Expires in %d seconds',
+                        $slot->[ NAME ], $expires_in_sec ) );
+    }
+
+    # Move this slot to the head of the list
+    unless( $self->{ HEAD } == $slot ) {
+        # remove existing slot from usage chain...
+        if ($slot->[ PREV ]) {
+            $slot->[ PREV ]->[ NEXT ] = $slot->[ NEXT ];
+        }
+        else {
+            $self->{ HEAD } = $slot->[ NEXT ];
+        }
+        if ($slot->[ NEXT ]) {
+            $slot->[ NEXT ]->[ PREV ] = $slot->[ PREV ];
+        }
+        else {
+            # the slot was the tail, so its predecessor becomes the tail
+            $self->{ TAIL } = $slot->[ PREV ];
+        }
+
+        # ..and add to start of list
+        $head = $self->{ HEAD };
+        $head->[ PREV ] = $slot if $head;
+        $slot->[ PREV ] = undef;
+        $slot->[ NEXT ] = $head;
+        $self->{ HEAD } = $slot;
+    }
+
+    return ($data, $error);
+}
+
+
+
+#------------------------------------------------------------------------
+# _store($name, $data, $compfile)
+#
+# Private method called to add a data item to the cache.  $data is a
+# hash whose 'data' item holds the compiled template.  If the cache
+# size limit has been reached then the oldest entry at the tail of the
+# list is removed and its slot relocated to the head of the list and
+# reused for the new data item.  If the cache is under the size limit,
+# or if no size limit is defined, then the item is added to the head
+# of the list.  $compfile is accepted but not used here.
+# Returns compiled template
+#------------------------------------------------------------------------
+
+sub _store {
+    my ($self, $name, $data, $compfile) = @_;
+    my $size = $self->{ SIZE };
+    my ($slot, $head);
+
+    # Return if memory cache disabled.  (overriding code should also check)
+    # $$$ What's the expected behaviour of store()?  Can't tell from the
+    # docs if you can call store() when SIZE = 0.
+    return $data->{data} if defined $size and !$size;
+
+    # extract the compiled template from the data hash
+    $data = $data->{ data };
+    $self->debug("_store($name, $data)") if $self->{ DEBUG };
+
+    # check the modification time -- extra stat here
+    my $load = $self->_modified($name);
+
+    if (defined $size && $self->{ SLOTS } >= $size) {
+        # cache has reached size limit, so reuse oldest entry
+        $self->debug("reusing oldest cache entry (size limit reached: $size)\nslots: $self->{ SLOTS }") if $self->{ DEBUG };
+
+        $slot = $self->{ TAIL };
+
+        if (my $prev = $slot->[ PREV ]) {
+            # remove entry from tail of list...
+            $prev->[ NEXT ] = undef;
+            $self->{ TAIL } = $prev;
+
+            # ...and link it in front of the current head
+            $head = $self->{ HEAD };
+            $head->[ PREV ] = $slot if $head;
+        }
+        # Otherwise the tail is the only slot in the list.  It stays as
+        # both HEAD and TAIL and gets no successor ($head remains undef).
+        # Dereferencing its undefined PREV would autovivify a bogus node
+        # that then became TAIL, and the slot would be linked to itself.
+
+        # remove name lookup for old node
+        delete $self->{ LOOKUP }->{ $slot->[ NAME ] };
+
+        # overwrite the slot in place with the new entry and make it head
+        @$slot = ( undef, $name, $data, $load, $head, time );
+        $self->{ HEAD } = $slot;
+
+        # add name lookup for new node
+        $self->{ LOOKUP }->{ $name } = $slot;
+    }
+    else {
+        # cache is under size limit, or none is defined
+
+        $self->debug("adding new cache entry") if $self->{ DEBUG };
+
+        # add new node to head of list
+        $head = $self->{ HEAD };
+        $slot = [ undef, $name, $data, $load, $head, time ];
+        $head->[ PREV ] = $slot if $head;
+        $self->{ HEAD } = $slot;
+        $self->{ TAIL } = $slot unless $self->{ TAIL };
+
+        # add lookup from name to slot and increment nslots
+        $self->{ LOOKUP }->{ $name } = $slot;
+        $self->{ SLOTS }++;
+    }
+
+    return $data;
+}
+
+
+#------------------------------------------------------------------------
+# _compile($data, $compfile)
+#
+# Private method called to parse the template text and compile it into
+# a runtime form.  Uses the parser held in PARSER, creating one via
+# Template::Config->parser() on first use.  The 'text' item is removed
+# from the $data hash before parsing.  On success, the compiled
+# template is stored in the 'data' item of the $data hash and $data is
+# returned.  On error, ($error, STATUS_ERROR) is returned, or
+# (undef, STATUS_DECLINED) if the TOLERANT flag is set.
+#
+# The optional $compfile parameter may be passed to specify the name of
+# a compiled template file to which the generated Perl code should be
+# written.  A failure to create its directory or to write the file is
+# reported like any other compile error.  If $data->{ time } is defined
+# the file's atime and mtime are set to it; an unusable filename or time
+# value returns ($error, STATUS_ERROR) regardless of TOLERANT.
+#------------------------------------------------------------------------
+
+sub _compile {
+    my ($self, $data, $compfile) = @_;
+    my $text = $data->{ text };
+    my ($parsedoc, $error);
+
+    $self->debug("_compile($data, ",
+                 defined $compfile ? $compfile : '<no compfile>', ')')
+        if $self->{ DEBUG };
+
+    # reuse the cached parser, or create and cache one on first use
+    my $parser = $self->{ PARSER }
+        ||= Template::Config->parser($self->{ PARAMS })
+        ||  return (Template::Config->error(), Template::Constants::STATUS_ERROR);
+
+    # discard the template text - we don't need it any more
+    delete $data->{ text };
+
+    # call parser to compile template into Perl code
+    if ($parsedoc = $parser->parse($text, $data)) {
+
+        # name and modtime act as defaults: metadata set by the parsed
+        # document comes later in the list and so takes precedence
+        $parsedoc->{ METADATA } = {
+            'name'    => $data->{ name },
+            'modtime' => $data->{ time },
+            %{ $parsedoc->{ METADATA } },
+        };
+
+        # write the Perl code to the file $compfile, if defined
+        if ($compfile) {
+            # pass the directory name through a capture (untaints it)
+            my $basedir = &File::Basename::dirname($compfile);
+            $basedir =~ /(.*)/;
+            $basedir = $1;
+
+            unless (-d $basedir) {
+                eval { File::Path::mkpath($basedir) };
+                $error = "failed to create compiled templates directory: $basedir ($@)"
+                    if ($@);
+            }
+
+            unless ($error) {
+                my $docclass = $self->{ DOCUMENT };
+                $error = 'cache failed to write '
+                    . &File::Basename::basename($compfile)
+                    . ': ' . $docclass->error()
+                    unless $docclass->write_perl_file($compfile, $parsedoc);
+            }
+
+            # set atime and mtime of newly compiled file, don't bother
+            # if time is undef
+            if (!defined($error) && defined $data->{ time }) {
+                my ($cfile) = $compfile =~ /^(.+)$/s or do {
+                    return("invalid filename: $compfile",
+                           Template::Constants::STATUS_ERROR);
+                };
+
+                # $ctime is undef when the time is not a plain run of
+                # digits.  Test definedness (0 is a valid time) and
+                # report the rejected value rather than the undef capture.
+                my ($ctime) = $data->{ time } =~ /^(\d+)$/;
+                unless (defined $ctime) {
+                    return("invalid time: $data->{ time }",
+                           Template::Constants::STATUS_ERROR);
+                }
+                # best effort: the result of utime() is not checked
+                utime($ctime, $ctime, $cfile);
+
+                $self->debug(" cached compiled template to file [$compfile]")
+                    if $self->{ DEBUG };
+            }
+        }
+
+        unless ($error) {
+            # NOTE(review): the document is built from the package-level
+            # $DOCUMENT class while the file above is written through
+            # $self->{ DOCUMENT } - confirm the difference is intended.
+            return $data                                        ## RETURN ##
+                if $data->{ data } = $DOCUMENT->new($parsedoc);
+            $error = $Template::Document::ERROR;
+        }
+    }
+    else {
+        $error = Template::Exception->new( 'parse', "$data->{ name } " .
+                                           $parser->error() );
+    }
+
+    # return STATUS_ERROR, or STATUS_DECLINED if we're being tolerant
+    return $self->{ TOLERANT }
+        ? (undef, Template::Constants::STATUS_DECLINED)
+        : ($error,  Template::Constants::STATUS_ERROR)
+}
+
+#------------------------------------------------------------------------
+# _compiled_is_current( $template_name )
+#
+# Returns true if compiled templates are enabled, both $template_name
+# and its compiled file have a modification time, and the two times
+# are equal.  Returns nothing (empty list / undef) otherwise.
+#------------------------------------------------------------------------
+
+sub _compiled_is_current {
+    my ( $self, $template_name ) = @_;
+
+    my $compiled_file = $self->_compiled_filename($template_name);
+    return unless $compiled_file;
+
+    my $compiled_stamp = (stat($compiled_file))[9];
+    return unless $compiled_stamp;
+
+    my $source_stamp = $self->_template_modified( $template_name );
+    return unless $source_stamp;
+
+    # This was >= in the 2.15, but meant that downgrading
+    # a source template would not get picked up.
+    return $compiled_stamp == $source_stamp;
+}
+
+
+#------------------------------------------------------------------------
+# _template_modified($path)
+#
+# Returns the last modified time of $path as reported by stat().
+# Returns nothing if $path is false or cannot be stat()ed.
+# Override if templates are not on disk, for example
+#------------------------------------------------------------------------
+
+sub _template_modified {
+    my ($self, $template) = @_;
+
+    return unless $template;
+
+    # element 9 of the stat() list is the mtime
+    return (stat $template)[9];
+}
+
+#------------------------------------------------------------------------
+# _template_content($path)
+#
+# Fetches content pointed to by $path.
+# Returns the content in scalar context (undef if it cannot be read).
+# Returns ($data, $error, $mtime) in list context where
+#   $data       - content
+#   $error      - error string if there was an error, otherwise undef
+#   $mtime      - last modified time of the file that was read
+#------------------------------------------------------------------------
+
+sub _template_content {
+    my ($self, $path) = @_;
+
+    # Honour the scalar-context contract here too: a bare list return
+    # would hand the error string back as if it were the content.
+    unless ($path) {
+        return wantarray
+            ? ( undef, "No path specified to fetch content from " )
+            : undef;
+    }
+
+    my ($data, $error, $mod_date);
+
+    # Three-argument open with a lexical handle: the path is taken
+    # literally, so surrounding whitespace is kept and names such as
+    # '-' or '&STDIN' are not given special meaning.
+    if (open(my $fh, '<', $path)) {
+        local $/;               # slurp the whole file
+        binmode($fh);
+        $data = <$fh>;
+        # stat the open handle so the mtime belongs to the file just read
+        $mod_date = (stat($fh))[9];
+        close($fh);
+    }
+    else {
+        $error = "$path: $!";
+    }
+
+    return wantarray
+        ? ( $data, $error, $mod_date )
+        : $data;
+}
+
+
+#------------------------------------------------------------------------
+# _modified($name)
+# _modified($name, $time)
+#
+# With one argument, returns the modification time of the named
+# template, or 0 if it has none.  With a true $time as well, returns
+# true if the template has been modified since $time, and 1 if it has
+# no modification time.
+#------------------------------------------------------------------------
+
+sub _modified {
+    my ($self, $name, $time) = @_;
+    my $mtime = $self->_template_modified($name);
+
+    # no modification time available for the template
+    unless ($mtime) {
+        return $time ? 1 : 0;
+    }
+
+    return $mtime unless $time;
+    return $mtime > $time;
+}
+
+#------------------------------------------------------------------------
+# _dump()
+#
+# Debug method which returns a string representing the internal object
+# state: the INCLUDE_PATH, cache size, a fixed set of configuration
+# items, the parser's own dump and the contents of the LOOKUP table.
+#------------------------------------------------------------------------
+
+sub _dump {
+    my $self = shift;
+    my $size = $self->{ SIZE };
+    my $parser = $self->{ PARSER };
+    # delegate to the parser's own _dump(), indenting its continuation lines
+    $parser = $parser ? $parser->_dump() : '<no parser>';
+    $parser =~ s/\n/\n    /gm;
+    $size = 'unlimited' unless defined $size;
+
+    my $output = "[Template::Provider] {\n";
+    # one "key => value" line per item, keys padded to 16 columns
+    my $format = "    %-16s => %s\n";
+    my $key;
+
+    $output .= sprintf($format, 'INCLUDE_PATH',
+                       '[ ' . join(', ', @{ $self->{ INCLUDE_PATH } }) . ' ]');
+    $output .= sprintf($format, 'CACHE_SIZE', $size);
+
+    foreach $key (qw( ABSOLUTE RELATIVE TOLERANT DELIMITER
+                      COMPILE_EXT COMPILE_DIR )) {
+        $output .= sprintf($format, $key, $self->{ $key });
+    }
+    $output .= sprintf($format, 'PARSER', $parser);
+
+
+    # NOTE(review): $" is localised here but no array is interpolated
+    # into a string later in this sub - it looks unused; confirm.
+    local $" = ', ';
+    my $lookup = $self->{ LOOKUP };
+    # one doubly-indented line per cached name showing the raw slot
+    # contents; the inner map re-binds $_ to each slot element in turn
+    $lookup = join('', map {
+        sprintf("    $format", $_, defined $lookup->{ $_ }
+                ? ('[ ' . join(', ', map { defined $_ ? $_ : '<undef>' }
+                               @{ $lookup->{ $_ } }) . ' ]') : '<undef>');
+    } sort keys %$lookup);
+    $lookup = "{\n$lookup    }";
+
+    $output .= sprintf($format, LOOKUP => $lookup);
+
+    $output .= '}';
+    return $output;
+}
+
+
+#------------------------------------------------------------------------
+# _dump_cache()
+#
+# Debug method which prints the current state of the cache to STDERR:
+# the names at the head and tail of the list, then one line per slot,
+# in list order, with its neighbours shown as slot numbers.  Prints
+# nothing when the cache is empty.
+#------------------------------------------------------------------------
+
+sub _dump_cache {
+    my $self = shift;
+    my $lut;
+
+    if (my $first = $self->{ HEAD }) {
+        # first pass: number the slots in list order, keyed by reference
+        my $count = 0;
+        for (my $walk = $first; $walk; $walk = $walk->[ NEXT ]) {
+            $lut->{ $walk } = $count++;
+        }
+
+        print STDERR "CACHE STATE:\n";
+        print STDERR "  HEAD: ", $self->{ HEAD }->[ NAME ], "\n";
+        print STDERR "  TAIL: ", $self->{ TAIL }->[ NAME ], "\n";
+
+        # second pass: print each slot with its numbered neighbours
+        my $node = $self->{ HEAD };
+        while ($node) {
+            my ($prev, $name, $data, $load, $next) = @$node;
+            $prev = $prev ? "#$lut->{ $prev }<-": '<undef>';
+            $next = $next ? "->#$lut->{ $next }": '<undef>';
+            print STDERR "   #$lut->{ $node } : [ $prev, $name, $data, $load, $next ]\n";
+            $node = $node->[ NEXT ];
+        }
+    }
+}
+
+#------------------------------------------------------------------------
+# _decode_unicode
+#
+# Turns encoded unicode text into perl's internal representation.
+# Text already flagged as UTF-8 is returned unchanged.  Text starting
+# with one of the BOMs listed in $boms is decoded with the matching
+# encoding, with the BOM stripped.  Anything else is decoded using the
+# ENCODING option if one is set, or returned as it is.
+#------------------------------------------------------------------------
+
+sub _decode_unicode {
+    my ($self, $string) = @_;
+    return undef unless defined $string;
+
+    use bytes;
+    require Encode;
+
+    return $string if Encode::is_utf8( $string );
+
+    # $boms is a flat list of (encoding, BOM) pairs.  They are tried in
+    # order, which is important: 32bit BOMs look like 16bit BOMs.
+    for (my $i = 0; $i < @{ $boms }; $i += 2) {
+        my ($enc, $bom) = @{ $boms }[ $i, $i + 1 ];
+        my $bom_length  = length($bom);
+
+        # does the string start with the bom?
+        next unless $bom eq substr($string, 0, $bom_length);
+
+        # decode what follows the bom and hand it back
+        return Encode::decode($enc, substr($string, $bom_length), 1);
+    }
+
+    # no BOM found: fall back to the configured default encoding
+    my $encoding = $self->{ ENCODING };
+    return $encoding
+        ? Encode::decode( $encoding, $string )
+        : $string;
+}
+
+
+1;
+
+__END__
+
+=head1 NAME
+
+Template::Provider - Provider module for loading/compiling templates
+
+=head1 SYNOPSIS
+
+    $provider = Template::Provider->new(\%options);
+    
+    ($template, $error) = $provider->fetch($name);
+
+=head1 DESCRIPTION
+
+The L<Template::Provider> is used to load, parse, compile and cache template
+documents. This object may be sub-classed to provide more specific facilities
+for loading, or otherwise providing access to templates.
+
+The L<Template::Context> objects maintain a list of L<Template::Provider>
+objects which are polled in turn (via L<fetch()|Template::Context#fetch()>) to
+return a requested template. Each may return a compiled template, raise an
+error, or decline to serve the request, giving subsequent providers a chance
+to do so.
+
+The L<Template::Provider> can also be subclassed to provide templates from
+a different source, e.g. a database. See L<SUBCLASSING> below.
+
+This documentation needs work.
+
+=head1 PUBLIC METHODS
+
+=head2 new(\%options) 
+
+Constructor method which instantiates and returns a new C<Template::Provider>
+object.  A reference to a hash array of configuration options may be passed.
+
+See L<CONFIGURATION OPTIONS> below for a summary of configuration options
+and L<Template::Manual::Config> for full details.
+
+=head2 fetch($name)
+
+Returns a compiled template for the name specified. If the template cannot be
+found then C<(undef, STATUS_DECLINED)> is returned. If an error occurs (e.g.
+read error, parse error) then C<($error, STATUS_ERROR)> is returned, where
+C<$error> is the error message generated. If the L<TOLERANT> option is set then
+the method returns C<(undef, STATUS_DECLINED)> instead of returning an error.
+
+=head2 store($name, $template)
+
+Stores the compiled template, C<$template>, in the cache under the name, 
+C<$name>.  Subsequent calls to C<fetch($name)> will return this template in
+preference to any disk-based file.
+
+=head2 include_path(\@newpath)
+
+Accessor method for the C<INCLUDE_PATH> setting.  If called with an
+argument, this method will replace the existing C<INCLUDE_PATH> with
+the new value.
+
+=head2 paths()
+
+This method generates a copy of the C<INCLUDE_PATH> list.  Any elements in the
+list which are dynamic generators (e.g. references to subroutines or objects
+implementing a C<paths()> method) will be called and the list of directories 
+returned merged into the output list.
+
+It is possible to provide a generator which returns itself, thus sending
+this method into an infinite loop.  To detect and prevent this from happening,
+the C<$MAX_DIRS> package variable, set to C<64> by default, limits the maximum
+number of paths that can be added to, or generated for the output list.  If
+this number is exceeded then the method will immediately return an error 
+reporting as much.
+
+=head1 CONFIGURATION OPTIONS
+
+The following list summarises the configuration options that can be provided
+to the C<Template::Provider> L<new()> constructor. Please consult
+L<Template::Manual::Config> for further details and examples of each
+configuration option in use.
+
+=head2 INCLUDE_PATH
+
+The L<INCLUDE_PATH|Template::Manual::Config#INCLUDE_PATH> option is used to
+specify one or more directories in which template files are located.
+
+    # single path
+    my $provider = Template::Provider->new({
+        INCLUDE_PATH => '/usr/local/templates',
+    });
+
+    # multiple paths
+    my $provider = Template::Provider->new({
+        INCLUDE_PATH => [ '/usr/local/templates', 
+                          '/tmp/my/templates' ],
+    });
+
+=head2 ABSOLUTE
+
+The L<ABSOLUTE|Template::Manual::Config#ABSOLUTE> flag is used to indicate if
+templates specified with absolute filenames (e.g. 'C</foo/bar>') should be
+processed. It is disabled by default and any attempt to load a template by
+such a name will cause a 'C<file>' exception to be raised.
+
+    my $provider = Template::Provider->new({
+        ABSOLUTE => 1,
+    });
+
+=head2 RELATIVE
+
+The L<RELATIVE|Template::Manual::Config#RELATIVE> flag is used to indicate if
+templates specified with filenames relative to the current directory (e.g.
+C<./foo/bar> or C<../../some/where/else>) should be loaded. It is also disabled
+by default, and will raise a C<file> error if such template names are
+encountered.
+
+    my $provider = Template::Provider->new({
+        RELATIVE => 1,
+    });
+
+=head2 DEFAULT
+
+The L<DEFAULT|Template::Manual::Config#DEFAULT> option can be used to specify
+a default template which should be used whenever a specified template can't be
+found in the L<INCLUDE_PATH>.
+
+    my $provider = Template::Provider->new({
+        DEFAULT => 'notfound.html',
+    });
+
+If a non-existent template is requested through the L<Template>
+L<process()|Template#process()> method, or by an C<INCLUDE>, C<PROCESS> or
+C<WRAPPER> directive, then the C<DEFAULT> template will instead be processed, if
+defined. Note that the C<DEFAULT> template is not used when templates are
+specified with absolute or relative filenames, or as a reference to an input
+file handle or text string.
+
+=head2 ENCODING
+
+The Template Toolkit will automatically decode Unicode templates that
+have a Byte Order Marker (BOM) at the start of the file.  This option
+can be used to set the default encoding for templates that don't define
+a BOM.
+
+    my $provider = Template::Provider->new({
+        ENCODING => 'utf8',
+    });
+
+See L<Encode> for further information.
+
+=head2 CACHE_SIZE
+
+The L<CACHE_SIZE|Template::Manual::Config#CACHE_SIZE> option can be used to
+limit the number of compiled templates that the module should cache. By
+default, the L<CACHE_SIZE|Template::Manual::Config#CACHE_SIZE> is undefined
+and all compiled templates are cached.
+
+    my $provider = Template::Provider->new({
+        CACHE_SIZE => 64,   # only cache 64 compiled templates
+    });
+
+
+=head2 STAT_TTL
+
+The L<STAT_TTL|Template::Manual::Config#STAT_TTL> value can be set to control
+how long the C<Template::Provider> will keep a template cached in memory
+before checking to see if the source template has changed.
+
+    my $provider = Template::Provider->new({
+        STAT_TTL => 60,  # one minute
+    });
+
+=head2 COMPILE_EXT
+
+The L<COMPILE_EXT|Template::Manual::Config#COMPILE_EXT> option can be
+provided to specify a filename extension for compiled template files.
+It is undefined by default and no attempt will be made to read or write 
+any compiled template files.
+
+    my $provider = Template::Provider->new({
+        COMPILE_EXT => '.ttc',
+    });
+
+=head2 COMPILE_DIR
+
+The L<COMPILE_DIR|Template::Manual::Config#COMPILE_DIR> option is used to
+specify an alternate directory root under which compiled template files should
+be saved.
+
+    my $provider = Template::Provider->new({
+        COMPILE_DIR => '/tmp/ttc',
+    });
+
+=head2 TOLERANT
+
+The L<TOLERANT|Template::Manual::Config#TOLERANT> flag can be set to indicate
+that the C<Template::Provider> module should ignore any errors encountered while
+loading a template and instead return C<STATUS_DECLINED>.
+
+=head2 PARSER
+
+The L<PARSER|Template::Manual::Config#PARSER> option can be used to define
+a parser module other than the default of L<Template::Parser>.
+
+    my $provider = Template::Provider->new({
+        PARSER => MyOrg::Template::Parser->new({ ... }),
+    });
+
+=head2 DEBUG
+
+The L<DEBUG|Template::Manual::Config#DEBUG> option can be used to enable
+debugging messages from the L<Template::Provider> module by setting it to include
+the C<DEBUG_PROVIDER> value.
+
+    use Template::Constants qw( :debug );
+    
+    my $template = Template->new({
+        DEBUG => DEBUG_PROVIDER,
+    });
+
+=head1 SUBCLASSING
+
+The C<Template::Provider> module can be subclassed to provide templates from a 
+different source (e.g. a database).  In most cases you'll just need to provide
+custom implementations of the C<_template_modified()> and C<_template_content()>
+methods.  If your provider requires any custom initialisation then you'll also
+need to implement a new C<_init()> method.
+
+Caching in memory and on disk will still be applied (if enabled)
+when overriding these methods.
+
+=head2 _template_modified($path)
+
+Returns a timestamp of the C<$path> passed in by calling C<stat()>.
+This can be overridden, for example, to return a last modified value from
+a database.  The value returned should be a timestamp value (as returned by C<time()>),
+although a sequence number should work as well.
+
+=head2 _template_content($path)
+
+This method returns the content of the template for all C<INCLUDE>, C<PROCESS>,
+and C<INSERT> directives.
+
+When called in scalar context, the method returns the content of the template
+located at C<$path>, or C<undef> if C<$path> is not found.
+
+When called in list context it returns C<($content, $error, $mtime)>,
+where C<$content> is the template content, C<$error> is an error string
+(e.g. "C<$path: File not found>"), and C<$mtime> is the template modification
+time.
+
+=head1 AUTHOR
+
+Andy Wardley E<lt>abw at wardley.orgE<gt> L<http://wardley.org/>
+
+=head1 COPYRIGHT
+
+Copyright (C) 1996-2007 Andy Wardley.  All Rights Reserved.
+
+This module is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=head1 SEE ALSO
+
+L<Template>, L<Template::Parser>, L<Template::Context>
+
+=cut
+
+# Local Variables:
+# mode: perl
+# perl-indent-level: 4
+# indent-tabs-mode: nil
+# End:
+#
+# vim: expandtab shiftwidth=4:
diff --git a/bench/perl/Template/Service.pm b/bench/perl/Template/Service.pm
new file mode 100644
index 0000000..16ed854
--- /dev/null
+++ b/bench/perl/Template/Service.pm
@@ -0,0 +1,573 @@
+#============================================================= -*-Perl-*-
+#
+# Template::Service
+#
+# DESCRIPTION
+#   Module implementing a template processing service which wraps a
+#   template within PRE_PROCESS and POST_PROCESS templates and offers 
+#   ERROR recovery.
+#
+# AUTHOR
+#   Andy Wardley   <abw at wardley.org>
+#
+# COPYRIGHT
+#   Copyright (C) 1996-2007 Andy Wardley.  All Rights Reserved.
+#
+#   This module is free software; you can redistribute it and/or
+#   modify it under the same terms as Perl itself.
+# 
+#============================================================================
+
+package Template::Service;
+
+use strict;
+use warnings;
+use base 'Template::Base';
+use Template::Config;
+use Template::Exception;
+use Template::Constants;
+use Scalar::Util 'blessed';
+
+use constant EXCEPTION => 'Template::Exception';
+
+our $VERSION = 2.80;
+our $DEBUG   = 0 unless defined $DEBUG;
+our $ERROR   = '';
+
+
+#========================================================================
+#                     -----  PUBLIC METHODS -----
+#========================================================================
+
+#------------------------------------------------------------------------
+# process($template, \%params)
+#
+# Process a template within a service framework.  A service may encompass
+# PRE_PROCESS and POST_PROCESS templates and an ERROR hash which names
+# templates to be substituted for the main template document in case of
+# error.  Each service invocation begins by resetting the state of the 
+# context object via a call to reset().  The AUTO_RESET option may be set 
+# to 0 (default: 1) to bypass this step.
+#------------------------------------------------------------------------
+
+sub process {
+    my ($self, $template, $params) = @_;
+    my $context = $self->{ CONTEXT };
+    my ($name, $output, $procout, $error);    # $procout stays undef until the PROCESS phase yields output
+    $output = '';
+
+    $self->debug("process($template, ", 
+                 defined $params ? $params : '<no params>',
+                 ')') if $self->{ DEBUG };
+
+    $context->reset()
+        if $self->{ AUTO_RESET };
+
+    # pre-request compiled template from context so that we can alias it 
+    # in the stash for pre-processed templates to reference
+    eval { $template = $context->template($template) };
+    return $self->error($@)                   # template lookup/compile failed: report via error()
+        if $@;
+
+    # localise the variable stash with any parameters passed
+    # and set the 'template' variable
+    $params ||= { };
+    # TODO: change this to C<||=> so we can use a template parameter
+    $params->{ template } = $template 
+        unless ref $template eq 'CODE';
+    $context->localise($params);
+
+    SERVICE: {                                # labelled bare block: 'last SERVICE' skips to the cleanup below
+        # PRE_PROCESS
+        eval {
+            foreach $name (@{ $self->{ PRE_PROCESS } }) {
+                $self->debug("PRE_PROCESS: $name") if $self->{ DEBUG };
+                $output .= $context->process($name);
+            }
+        };
+        last SERVICE if ($error = $@);        # PRE_PROCESS errors are never passed to _recover()
+
+        # PROCESS
+        eval {
+            foreach $name (@{ $self->{ PROCESS } || [ $template ] }) {    # configured PROCESS list replaces the main template
+                $self->debug("PROCESS: $name") if $self->{ DEBUG };
+                $procout .= $context->process($name);
+            }
+        };
+        if ($error = $@) {
+            last SERVICE
+                unless defined ($procout = $self->_recover(\$error));    # recovered output replaces $procout
+        }
+        
+        if (defined $procout) {
+            # WRAPPER
+            eval {
+                foreach $name (reverse @{ $self->{ WRAPPER } }) {        # reversed: first listed wrapper ends up outermost
+                    $self->debug("WRAPPER: $name") if $self->{ DEBUG };
+                    $procout = $context->process($name, { content => $procout });
+                }
+            };
+            last SERVICE if ($error = $@);    # also resets $error to '' after a successful recovery
+            $output .= $procout;
+        }
+        
+        # POST_PROCESS
+        eval {
+            foreach $name (@{ $self->{ POST_PROCESS } }) {
+                $self->debug("POST_PROCESS: $name") if $self->{ DEBUG };
+                $output .= $context->process($name);
+            }
+        };
+        last SERVICE if ($error = $@);
+    }
+
+    $context->delocalise();                   # undo the localise() call made above
+    delete $params->{ template };             # drop the 'template' key from $params (the caller's hash, if one was passed)
+
+    if ($error) {
+    #   $error = $error->as_string if ref $error;
+        return $self->error($error);
+    }
+
+    return $output;
+}
+
+
+#------------------------------------------------------------------------
+# context()
+# 
+# Returns the internal CONTEXT reference.
+#------------------------------------------------------------------------
+
+sub context {
+    return $_[0]->{ CONTEXT };    # $_[0] is the invocant; no arguments are read
+}
+
+
+#========================================================================
+#                     -- PRIVATE METHODS --
+#========================================================================
+
+sub _init {    # populate $self from \%config; returns $self, or the result of error() on failure
+    my ($self, $config) = @_;
+    my ($item, $data);
+    my $delim = $config->{ DELIMITER };
+    $delim = ':' unless defined $delim;
+
+    # coerce PRE_PROCESS, PROCESS, POST_PROCESS and WRAPPER to array refs if
+    # necessary, by splitting string values on DELIMITER (default ':')
+    foreach $item (qw( PRE_PROCESS PROCESS POST_PROCESS WRAPPER )) {
+        $data = $config->{ $item };
+        $self->{ $item } = [ ], next unless (defined $data);    # option absent: empty list
+        $data = [ split($delim, $data || '') ]
+            unless ref $data eq 'ARRAY';
+        $self->{ $item } = $data;
+    }
+    # unset PROCESS option unless explicitly specified in config
+    $self->{ PROCESS } = undef
+        unless defined $config->{ PROCESS };
+    
+    $self->{ ERROR      } = $config->{ ERROR } || $config->{ ERRORS };    # ERRORS is accepted as an alias
+    $self->{ AUTO_RESET } = defined $config->{ AUTO_RESET }
+                            ? $config->{ AUTO_RESET } : 1;                # enabled unless explicitly set
+    $self->{ DEBUG      } = ( $config->{ DEBUG } || 0 )
+                            & Template::Constants::DEBUG_SERVICE;         # keep only the service debug bit
+    
+    $self->{ CONTEXT } = $config->{ CONTEXT }                             # supplied context, else a default one
+        || Template::Config->context($config)
+        || return $self->error(Template::Config->error);
+    
+    return $self;
+}
+
+
+#------------------------------------------------------------------------
+# _recover(\$exception)
+#
+# Examines the internal ERROR hash array to find a handler suitable 
+# for the exception object passed by reference.  Selecting the handler
+# is done by delegation to the exception's select_handler() method, 
+# passing the set of handler keys as arguments.  A 'default' handler 
+# may also be provided.  The handler value represents the name of a 
+# template which should be processed. 
+#------------------------------------------------------------------------
+
+sub _recover {
+    my ($self, $error) = @_;                # $error is a reference to the caller's exception variable
+    my $context = $self->{ CONTEXT };
+    my ($hkey, $handler, $output);
+
+    # there shouldn't ever be a non-exception object received at this
+    # point... unless a module like CGI::Carp messes around with the 
+    # DIE handler. 
+    return undef
+        unless blessed($$error) && $$error->isa(EXCEPTION);
+
+    # a 'stop' exception is thrown by [% STOP %] - we return the output
+    # buffer stored in the exception object
+    return $$error->text()
+        if $$error->type() eq 'stop';
+
+    my $handlers = $self->{ ERROR }
+        || return undef;                    ## RETURN
+
+    if (ref $handlers eq 'HASH') {
+        if ($hkey = $$error->select_handler(keys %$handlers)) {
+            $handler = $handlers->{ $hkey };
+            $self->debug("using error handler for $hkey") if $self->{ DEBUG };
+        }
+        elsif ($handler = $handlers->{ default }) {
+            # use default handler
+            $self->debug("using default error handler") if $self->{ DEBUG };
+        }
+        else {
+            return undef;                   ## RETURN
+        }
+    }
+    else {
+        $handler = $handlers;               # non-hash ERROR value: one handler for every exception
+        $self->debug("using default error handler") if $self->{ DEBUG };
+    }
+    
+    eval { $handler = $context->template($handler) };
+    if ($@) {
+        $$error = $@;                       # hand the new failure back through the reference
+        return undef;                       ## RETURN
+    };
+    
+    $context->stash->set('error', $$error); # expose the exception to the handler template as 'error'
+    eval {
+        $output .= $context->process($handler);
+    };
+    if ($@) {
+        $$error = $@;                       # the handler itself failed: report that error instead
+        return undef;                       ## RETURN
+    }
+
+    return $output;
+}
+
+
+
+#------------------------------------------------------------------------
+# _dump()
+#
+# Debug method which returns a string representing the internal object
+# state. 
+#------------------------------------------------------------------------
+
+sub _dump {
+    my $self = shift;
+    my $context = $self->{ CONTEXT }->_dump();
+    $context =~ s/\n/\n    /gm;     # indent continuation lines of the nested context dump
+
+    my $error = $self->{ ERROR };
+    $error = join('', 
+          "{\n",
+          (map { "    $_ => $error->{ $_ }\n" }
+           keys %$error),
+          "}\n")
+    if ref $error;                  # expand key/value pairs when ERROR is a reference (a hash ref is assumed)
+    
+    local $" = ', ';                # list separator used for the array interpolations in the heredoc below
+    return <<EOF;
+$self
+PRE_PROCESS  => [ @{ $self->{ PRE_PROCESS } } ]
+POST_PROCESS => [ @{ $self->{ POST_PROCESS } } ]
+ERROR        => $error
+CONTEXT      => $context
+EOF
+}
+
+
+1;
+
+__END__
+
+=head1 NAME
+
+Template::Service - General purpose template processing service
+
+=head1 SYNOPSIS
+
+    use Template::Service;
+    
+    my $service = Template::Service->new({
+        PRE_PROCESS  => [ 'config', 'header' ],
+        POST_PROCESS => 'footer',
+        ERROR        => {
+            user     => 'user/index.html', 
+            dbi      => 'error/database',
+            default  => 'error/default',
+        },
+    });
+    
+    my $output = $service->process($template_name, \%replace)
+        || die $service->error(), "\n";
+
+=head1 DESCRIPTION
+
+The C<Template::Service> module implements an object class for providing
+a consistent template processing service. 
+
+Standard header (L<PRE_PROCESS|PRE_PROCESS_POST_PROCESS>) and footer
+(L<POST_PROCESS|PRE_PROCESS_POST_PROCESS>) templates may be specified which
+are prepended and appended to all templates processed by the service (but not
+any other templates or blocks C<INCLUDE>d or C<PROCESS>ed from within). An
+L<ERROR> hash may be specified which redirects the service to an alternate
+template file in the case of uncaught exceptions being thrown. This allows
+errors to be automatically handled by the service and a guaranteed valid
+response to be generated regardless of any processing problems encountered.
+
+A default C<Template::Service> object is created by the L<Template> module.
+Any C<Template::Service> options may be passed to the L<Template>
+L<new()|Template#new()> constructor method and will be forwarded to the
+L<Template::Service> constructor.
+
+    use Template;
+    
+    my $template = Template->new({
+        PRE_PROCESS  => 'header',
+        POST_PROCESS => 'footer',
+    });
+
+Similarly, the C<Template::Service> constructor will forward all configuration
+parameters onto other default objects (e.g. L<Template::Context>) that it may
+need to instantiate.
+
+A C<Template::Service> object (or subclass) can be explicitly instantiated and
+passed to the L<Template> L<new()|Template#new()> constructor method as the
+L<SERVICE> item.
+
+    use Template;
+    use Template::Service;
+    
+    my $service = Template::Service->new({
+        PRE_PROCESS  => 'header',
+        POST_PROCESS => 'footer',
+    });
+    
+    my $template = Template->new({
+        SERVICE => $service,
+    });
+
+The C<Template::Service> module can be sub-classed to create custom service
+handlers.
+
+    use Template;
+    use MyOrg::Template::Service;
+    
+    my $service = MyOrg::Template::Service->new({
+        PRE_PROCESS  => 'header',
+        POST_PROCESS => 'footer',
+        COOL_OPTION  => 'enabled in spades',
+    });
+    
+    my $template = Template->new({
+        SERVICE => $service,
+    });
+
+The L<Template> module uses the L<Template::Config>
+L<service()|Template::Config#service()> factory method to create a default
+service object when required. The C<$Template::Config::SERVICE> package
+variable may be set to specify an alternate service module. This will be
+loaded automatically and its L<new()> constructor method called by the
+L<service()|Template::Config#service()> factory method when a default service
+object is required. Thus the previous example could be written as:
+
+    use Template;
+    
+    $Template::Config::SERVICE = 'MyOrg::Template::Service';
+    
+    my $template = Template->new({
+        PRE_PROCESS  => 'header',
+        POST_PROCESS => 'footer',
+        COOL_OPTION  => 'enabled in spades',
+    });
+
+=head1 METHODS
+
+=head2 new(\%config)
+
+The C<new()> constructor method is called to instantiate a C<Template::Service>
+object.  Configuration parameters may be specified as a HASH reference or
+as a list of C<name =E<gt> value> pairs.
+
+    my $service1 = Template::Service->new({
+        PRE_PROCESS  => 'header',
+        POST_PROCESS => 'footer',
+    });
+    
+    my $service2 = Template::Service->new( ERROR => 'error.html' );
+
+The C<new()> method returns a C<Template::Service> object or C<undef> on
+error. In the latter case, a relevant error message can be retrieved by the
+L<error()|Template::Base#error()> class method or directly from the
+C<$Template::Service::ERROR> package variable.
+
+    my $service = Template::Service->new(\%config)
+        || die Template::Service->error();
+        
+    my $service = Template::Service->new(\%config)
+        || die $Template::Service::ERROR;
+
+=head2 process($input, \%replace)
+
+The C<process()> method is called to process a template specified as the first
+parameter, C<$input>. This may be a file name, file handle (e.g. C<GLOB> or
+C<IO::Handle>) or a reference to a text string containing the template text. An
+additional hash reference may be passed containing template variable
+definitions.
+
+The method processes the template, adding any
+L<PRE_PROCESS|PRE_PROCESS_POST_PROCESS> or
+L<POST_PROCESS|PRE_PROCESS_POST_PROCESS> templates defined, and returns the
+output text. An uncaught exception thrown by the template will be handled by a
+relevant L<ERROR> handler if defined. Errors that occur in the
+L<PRE_PROCESS|PRE_PROCESS_POST_PROCESS> or
+L<POST_PROCESS|PRE_PROCESS_POST_PROCESS> templates, or those that occur in the
+main input template and aren't handled, cause the method to return C<undef> to
+indicate failure. The appropriate error message can be retrieved via the
+L<error()|Template::Base#error()> method.
+
+    $service->process('myfile.html', { title => 'My Test File' })
+        || die $service->error();
+
+=head2 context()
+
+Returns a reference to the internal context object which is, by default, an
+instance of the L<Template::Context> class.
+
+=head1 CONFIGURATION OPTIONS
+
+The following list summarises the configuration options that can be provided
+to the C<Template::Service> L<new()> constructor. Please consult
+L<Template::Manual::Config> for further details and examples of each
+configuration option in use.
+
+=head2 PRE_PROCESS, POST_PROCESS
+
+The L<PRE_PROCESS|Template::Manual::Config#PRE_PROCESS_POST_PROCESS> and
+L<POST_PROCESS|Template::Manual::Config#PRE_PROCESS_POST_PROCESS> options may
+be set to contain the name(s) of template files which should be processed
+immediately before and/or after each template. These do not get added to
+templates processed into a document via directives such as C<INCLUDE>,
+C<PROCESS>, C<WRAPPER>, etc.
+
+    my $service = Template::Service->new({
+        PRE_PROCESS  => 'header',
+        POST_PROCESS => 'footer',
+    });
+
+Multiple templates may be specified as a reference to a list.  Each is 
+processed in the order defined.
+
+    my $service = Template::Service->new({
+        PRE_PROCESS  => [ 'config', 'header' ],
+        POST_PROCESS => 'footer',
+    });
+
+=head2 PROCESS
+
+The L<PROCESS|Template::Manual::Config#PROCESS> option may be set to contain
+the name(s) of template files which should be processed instead of the main
+template passed to the C<Template::Service> L<process()> method. This can be used to
+apply consistent wrappers around all templates, similar to the use of
+L<PRE_PROCESS|PRE_PROCESS_POST_PROCESS> and 
+L<POST_PROCESS|PRE_PROCESS_POST_PROCESS> templates.
+
+    my $service = Template::Service->new({
+        PROCESS  => 'content',
+    });
+    
+    # processes 'content' instead of 'foo.html'
+    $service->process('foo.html');
+
+A reference to the original template is available in the C<template>
+variable.  Metadata items can be inspected and the template can be
+processed by specifying it as a variable reference (i.e. prefixed by
+'C<$>') to an C<INCLUDE>, C<PROCESS> or C<WRAPPER> directive.
+
+Example C<PROCESS> template:
+
+    <html>
+      <head>
+        <title>[% template.title %]</title>
+      </head>
+      <body>
+      [% PROCESS $template %]
+      </body>
+    </html>
+
+=head2 ERROR
+
+The L<ERROR|Template::Manual::Config#ERROR> (or C<ERRORS> if you prefer)
+configuration item can be used to name a single template or specify a hash
+array mapping exception types to templates which should be used for error
+handling. If an uncaught exception is raised from within a template then the
+appropriate error template will instead be processed.
+
+If specified as a single value then that template will be processed 
+for all uncaught exceptions. 
+
+    my $service = Template::Service->new({
+        ERROR => 'error.html'
+    });
+
+If the L<ERROR/ERRORS|Template::Manual::Config#ERROR> item is a hash reference
+the keys are assumed to be exception types and the relevant template for a
+given exception will be selected. A C<default> template may be provided for
+the general case.
+
+    my $service = Template::Service->new({
+        ERRORS => {
+            user     => 'user/index.html',
+            dbi      => 'error/database',
+            default  => 'error/default',
+        },
+    });
+
+=head2 AUTO_RESET
+
+The L<AUTO_RESET|Template::Manual::Config#AUTO_RESET> option is set by default
+and causes the local C<BLOCKS> cache for the L<Template::Context> object to be
+reset on each call to the L<Template> L<process()|Template#process()> method.
+This ensures that any C<BLOCK>s defined within a template will only persist until
+that template is finished processing. 
+
+=head2 DEBUG
+
+The L<DEBUG|Template::Manual::Config#DEBUG> option can be used to enable
+debugging messages from the C<Template::Service> module by setting it to include
+the C<DEBUG_SERVICE> value.
+
+    use Template::Constants qw( :debug );
+    
+    my $template = Template->new({
+        DEBUG => DEBUG_SERVICE,
+    });
+
+=head1 AUTHOR
+
+Andy Wardley E<lt>abw at wardley.orgE<gt> L<http://wardley.org/>
+
+=head1 COPYRIGHT
+
+Copyright (C) 1996-2007 Andy Wardley.  All Rights Reserved.
+
+This module is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=head1 SEE ALSO
+
+L<Template>, L<Template::Context>
+
+=cut
+
+# Local Variables:
+# mode: perl
+# perl-indent-level: 4
+# indent-tabs-mode: nil
+# End:
+#
+# vim: expandtab shiftwidth=4:
diff --git a/bench/perl/Template/Stash.pm b/bench/perl/Template/Stash.pm
new file mode 100644
index 0000000..24f3fe0
--- /dev/null
+++ b/bench/perl/Template/Stash.pm
@@ -0,0 +1,839 @@
+#============================================================= -*-Perl-*-
+#
+# Template::Stash
+#
+# DESCRIPTION
+#   Definition of an object class which stores and manages access to 
+#   variables for the Template Toolkit. 
+#
+# AUTHOR
+#   Andy Wardley   <abw at wardley.org>
+#
+# COPYRIGHT
+#   Copyright (C) 1996-2007 Andy Wardley.  All Rights Reserved.
+#
+#   This module is free software; you can redistribute it and/or
+#   modify it under the same terms as Perl itself.
+#
+#============================================================================
+
+package Template::Stash;
+
+use strict;
+use warnings;
+use Template::VMethods;
+use Template::Exception;
+use Scalar::Util qw( blessed reftype );
+
+our $VERSION    = 2.91;
+our $DEBUG      = 0 unless defined $DEBUG;
+our $PRIVATE    = qr/^[_.]/;
+our $UNDEF_TYPE = 'var.undef';
+our $UNDEF_INFO = 'undefined variable: %s';
+
+# alias _dotop() to dotop() so that we have a consistent method name
+# between the Perl and XS stash implementations
+*dotop = \&_dotop;
+
+
+#------------------------------------------------------------------------
+# Virtual Methods
+#
+# If any of $ROOT_OPS, $SCALAR_OPS, $HASH_OPS or $LIST_OPS are already
+# defined then we merge their contents with the default virtual methods
+# defined by Template::VMethods.  Otherwise we can directly alias the 
+# corresponding Template::VMethods package vars.
+#------------------------------------------------------------------------
+
+our $ROOT_OPS = defined $ROOT_OPS 
+    ? { %{$Template::VMethods::ROOT_VMETHODS}, %$ROOT_OPS }      # defaults first, so pre-set entries win
+    : $Template::VMethods::ROOT_VMETHODS;                        # shares the Template::VMethods hash (no copy)
+
+our $SCALAR_OPS = defined $SCALAR_OPS 
+    ? { %{$Template::VMethods::TEXT_VMETHODS}, %$SCALAR_OPS }    # scalar ops map to the TEXT vmethods
+    : $Template::VMethods::TEXT_VMETHODS;
+
+our $HASH_OPS = defined $HASH_OPS 
+    ? { %{$Template::VMethods::HASH_VMETHODS}, %$HASH_OPS }      # same merge for hash vmethods
+    : $Template::VMethods::HASH_VMETHODS;
+
+our $LIST_OPS = defined $LIST_OPS 
+    ? { %{$Template::VMethods::LIST_VMETHODS}, %$LIST_OPS }      # same merge for list vmethods
+    : $Template::VMethods::LIST_VMETHODS;
+
+
+#------------------------------------------------------------------------
+# define_vmethod($type, $name, \&sub)
+#
+# Defines a virtual method of type $type (SCALAR, HASH, or LIST), with
+# name $name, that invokes &sub when called.  It is expected that &sub
+# be able to handle the type that it will be called upon.
+#------------------------------------------------------------------------
+
+sub define_vmethod {
+    my ($class, $type, $name, $sub) = @_;
+    my $op;
+    $type = lc $type;                        # type names are matched case-insensitively
+
+    if ($type =~ /^(?:scalar|item)$/) {      # group the alternation so both anchors apply to each name
+        $op = $SCALAR_OPS;
+    }
+    elsif ($type eq 'hash') {
+        $op = $HASH_OPS;
+    }
+    elsif ($type =~ /^(?:list|array)$/) {    # previously /^list|array$/ also accepted e.g. 'listfoo', 'xarray'
+        $op = $LIST_OPS;
+    }
+    else {
+        die "invalid vmethod type: $type\n";
+    }
+
+    $op->{ $name } = $sub;                   # installs into (or replaces in) the package-level table
+
+    return 1;                                # always true on success; failure is signalled by die above
+}
+
+
+#========================================================================
+#                      -----  CLASS METHODS -----
+#========================================================================
+
+#------------------------------------------------------------------------
+# new(\%params)
+#
+# Constructor method which creates a new Template::Stash object.
+# An optional hash reference may be passed containing variable 
+# definitions that will be used to initialise the stash.
+#
+# Returns a reference to a newly created Template::Stash.
+#------------------------------------------------------------------------
+
+sub new {
+    my $class  = shift;
+    my $params = ref $_[0] eq 'HASH' ? shift(@_) : { @_ };    # accept a hash ref or a flat key/value list
+
+    my $self   = {
+        global  => { },         # default 'global' hash; a 'global' key in $params replaces it
+        %$params,
+        %$ROOT_OPS,             # listed after params, so root vmethods override same-named params
+        '_PARENT' => undef,     # a freshly constructed stash has no parent
+    };
+
+    bless $self, $class;        # the blessed hash is the return value
+}
+
+
+#========================================================================
+#                   -----  PUBLIC OBJECT METHODS -----
+#========================================================================
+
+#------------------------------------------------------------------------
+# clone(\%params)
+#
+# Creates a copy of the current stash object to effect localisation 
+# of variables.  The new stash is blessed into the same class as the 
+# parent (which may be a derived class) and has a '_PARENT' member added
+# which contains a reference to the parent stash that created it
+# ($self).  This member is used in a successive declone() method call to
+# return the reference to the parent.
+# 
+# A parameter may be provided which should reference a hash of 
+# variable/values which should be defined in the new stash.  The 
+# update() method is called to define these new variables in the cloned
+# stash.
+#
+# Returns a reference to a cloned Template::Stash.
+#------------------------------------------------------------------------
+
+sub clone {
+    my ($self, $params) = @_;
+    $params ||= { };
+
+    # look out for magical 'import' argument which imports another hash
+    my $import = $params->{ import };
+    if (defined $import && ref $import eq 'HASH') {
+        delete $params->{ import };     # NOTE: removes the key from the hash the caller passed in
+    }
+    else {
+        undef $import;                  # non-hash 'import' is not magical; it stays in $params as a plain variable
+    }
+
+    my $clone = bless { 
+        %$self,         # copy all parent members
+        %$params,       # copy all new data
+        '_PARENT' => $self,     # link to parent
+    }, ref $self;
+    
+    # perform hash import if defined
+    &{ $HASH_OPS->{ import } }($clone, $import)    # delegates to the hash 'import' vmethod
+        if defined $import;
+
+    return $clone;
+}
+
+    
+#------------------------------------------------------------------------
+# declone($export) 
+#
+# Returns a reference to the PARENT stash.  When called in the following
+# manner:
+#    $stash = $stash->declone();
+# the reference count on the current stash will drop to 0 and be "freed"
+# and the caller will be left with a reference to the parent.  This 
+# contains the state of the stash before it was cloned.  
+#------------------------------------------------------------------------
+
+sub declone {
+    my $self = shift;
+    $self->{ _PARENT } || $self;    # parent stash, or $self itself when no parent is set
+}
+
+
+#------------------------------------------------------------------------
+# get($ident)
+# 
+# Returns the value for a variable stored in the stash.  The variable
+# may be specified as a simple string, e.g. 'foo', or as an array 
+# reference representing compound variables.  In the latter case, each
+# pair of successive elements in the list represent a node in the 
+# compound variable.  The first is the variable name, the second a 
+# list reference of arguments or 0 if undefined.  So, the compound 
+# variable [% foo.bar('foo').baz %] would be represented as the list
+# [ 'foo', 0, 'bar', ['foo'], 'baz', 0 ].  Returns the value of the
+# identifier or an empty string if undefined.  Errors are thrown via
+# die().
+#------------------------------------------------------------------------
+
+sub get {
+    my ($self, $ident, $args) = @_;
+    my ($root, $result);
+    $root = $self;
+
+    if (ref $ident eq 'ARRAY'
+        || ($ident =~ /\./) 
+        && ($ident = [ map { s/\(.*$//; ($_, 0) } split(/\./, $ident) ])) {    # dotted string -> [name, 0, ...], '(...)' suffixes stripped
+        my $size = $#$ident;
+
+        # if $ident is a list reference, then we evaluate each item in the 
+        # identifier against the previous result, using the root stash 
+        # ($self) as the first implicit 'result'...
+        
+        foreach (my $i = 0; $i <= $size; $i += 2) {                 # step over (name, args) pairs
+            $result = $self->_dotop($root, @$ident[$i, $i+1]);
+            last unless defined $result;                            # stop at the first undefined node
+            $root = $result;
+        }
+    }
+    else {
+        $result = $self->_dotop($root, $ident, $args);
+    }
+
+    return defined $result 
+        ? $result 
+        : $self->undefined($ident, $args);                          # undefined() decides what an undefined value yields
+}
+
+
+#------------------------------------------------------------------------
+# set($ident, $value, $default)
+#
+# Updates the value for a variable in the stash.  The first parameter
+# should be the variable name or array, as per get().  The second 
+# parameter should be the intended value for the variable.  The third,
+# optional parameter is a flag which may be set to indicate 'default'
+# mode.  When set true, the variable will only be updated if it is
+# currently undefined or has a false value.  The magical 'IMPORT'
+# variable identifier may be used to indicate that $value is a hash
+# reference whose values should be imported.  Returns the value set,
+# or an empty string if not set (e.g. default mode).  In the case of 
+# IMPORT, returns the number of items imported from the hash.
+#------------------------------------------------------------------------
+
+sub set {
+    my ($self, $ident, $value, $default) = @_;
+    my ($root, $result, $error);
+
+    $root = $self;
+
+    ELEMENT: {                                                      # labelled block so the loop below can bail out early
+        if (ref $ident eq 'ARRAY'
+            || ($ident =~ /\./) 
+            && ($ident = [ map { s/\(.*$//; ($_, 0) }
+                           split(/\./, $ident) ])) {
+            
+            # a compound identifier may contain multiple elements (e.g. 
+            # foo.bar.baz) and we must first resolve all but the last, 
+            # using _dotop() with the $lvalue flag set which will create 
+            # intermediate hashes if necessary...
+            my $size = $#$ident;
+            foreach (my $i = 0; $i < $size - 2; $i += 2) {          # every (name, args) pair except the last
+                $result = $self->_dotop($root, @$ident[$i, $i+1], 1);
+                last ELEMENT unless defined $result;                # unresolved node: nothing assigned, '' returned below
+                $root = $result;
+            }
+            
+            # then we call _assign() to assign the value to the last element
+            $result = $self->_assign($root, @$ident[$size-1, $size], 
+                                     $value, $default);
+        }
+        else {
+            $result = $self->_assign($root, $ident, 0, $value, $default);    # simple name: args slot is 0
+        }
+    }
+    
+    return defined $result ? $result : '';
+}
+
+
+#------------------------------------------------------------------------
+# getref($ident)
+# 
+# Returns a "reference" to a particular item.  This is represented as a 
+# closure which will return the actual stash item when called.  
+# WARNING: still experimental!
+#------------------------------------------------------------------------
+
+sub getref {
+    my ($self, $ident, $args) = @_;
+    my ($root, $item, $result);
+    $root = $self;
+
+    if (ref $ident eq 'ARRAY') {
+        my $size = $#$ident;
+        
+        foreach (my $i = 0; $i <= $size; $i += 2) {
+            ($item, $args) = @$ident[$i, $i + 1];     # overwrites any $args passed by the caller
+            last if $i >= $size - 2;  # don't evaluate last node
+            last unless defined 
+                ($root = $self->_dotop($root, $item, $args));
+        }
+    }
+    else {
+        $item = $ident;
+    }
+    
+    if (defined $root) {
+        return sub { my @args = (@{$args||[]}, @_);   # captured args first, then call-time args
+                     $self->_dotop($root, $item, \@args);
+                 }
+    }
+    else {
+        return sub { '' };                            # an intermediate node was undefined: closure yields ''
+    }
+}
+
+
+
+
+#------------------------------------------------------------------------
+# update(\%params)
+#
+# Update multiple variables en masse.  Simple variable names only.  The
+# only magic is the 'import' key: a hash ref value is merged into the stash.
+#------------------------------------------------------------------------
+
+sub update {
+    my ($self, $params) = @_;
+
+    # look out for magical 'import' argument to import another hash
+    my $import = $params->{ import };
+    if (defined $import && ref $import eq 'HASH') {
+        @$self{ keys %$import } = values %$import;    # hash-slice assignment copies every imported pair
+        delete $params->{ import };                   # NOTE: removes the key from the caller's hash
+    }
+
+    @$self{ keys %$params } = values %$params;        # applied last, so explicit params override imported values
+}
+
+
+#------------------------------------------------------------------------
+# undefined($ident, $args)
+#
+# Method called when a get() returns an undefined value.  Can be redefined
+# in a subclass to implement alternate handling.
+#------------------------------------------------------------------------
+
+sub undefined {
+    my ($self, $ident, $args) = @_;    # $args is accepted but not used here
+
+    if ($self->{ _STRICT }) {
+        # Sorry, but we can't provide a sensible source file and line without
+        # re-designing the whole architecture of TT (see TT3)
+        die Template::Exception->new(
+            $UNDEF_TYPE, 
+            sprintf(
+                $UNDEF_INFO, 
+                $self->_reconstruct_ident($ident)
+            )
+        );                             # strict mode: always throws, never returns
+    }
+    else {
+        # There was a time when I thought this was a good idea. But it's not.
+        return '';
+    }
+}
+
+sub _reconstruct_ident {
+    my ($self, $ident) = @_;
+    my ($name, $args, @output);
+    my @input = ref $ident eq 'ARRAY' ? @$ident : ($ident);    # work on a copy; a plain name becomes a one-item list
+
+    while (@input) {
+        $name = shift @input;
+        $args = shift @input || 0;                             # second half of each (name, args) pair
+        $name .= '(' . join(', ', map { /^\d+$/ ? $_ : "'$_'" } @$args) . ')'
+            if $args && ref $args eq 'ARRAY';                  # all-digit args bare, everything else single-quoted
+        push(@output, $name);
+    }
+    
+    return join('.', @output);                                 # dotted form, e.g. foo.bar('x').baz
+}
+
+
+#========================================================================
+#                  -----  PRIVATE OBJECT METHODS -----
+#========================================================================
+
+#------------------------------------------------------------------------
+# _dotop($root, $item, \@args, $lvalue)
+#
+# This is the core 'dot' operation method which evaluates elements of 
+# variables against their root.  All variables have an implicit root 
+# which is the stash object itself (a hash).  Thus, a non-compound 
+# variable 'foo' is actually '(stash.)foo', the compound 'foo.bar' is
+# '(stash.)foo.bar'.  The first parameter is a reference to the current
+# root, initially the stash itself.  The second parameter contains the 
+# name of the variable element, e.g. 'foo'.  The third optional
+# parameter is a reference to a list of any parenthesised arguments 
+# specified for the variable, which are passed to sub-routines, object 
+# methods, etc.  The final parameter is an optional flag to indicate 
+# if this variable is being evaluated on the left side of an assignment
+# (e.g. foo.bar.baz = 10).  When set true, intermediate hashes will 
+# be created (e.g. bar) if necessary.  
+#
+# Returns the result of evaluating the item against the root, having
+# performed any variable "magic".  The value returned can then be used
+# as the root of the next _dotop() in a compound sequence.  Returns
+# undef if the variable is undefined.
+#------------------------------------------------------------------------
+
+sub _dotop {
+    # Evaluate $item against $root and return the resulting value, or undef
+    # if nothing is found.  Dispatch is by the kind of $root: stash/plain
+    # hash, plain array, blessed object, and finally a plain scalar.  Code
+    # references found along the way are called with @$args.
+    my ($self, $root, $item, $args, $lvalue) = @_;
+    my $rootref = ref $root;
+    # true when $root is a blessed object of this stash's class (or a subclass)
+    my $atroot  = (blessed $root && $root->isa(ref $self));
+    my ($value, @result);
+
+    $args ||= [ ];
+    $lvalue ||= 0;
+
+#    print STDERR "_dotop(root=$root, item=$item, args=[@$args])\n"
+#   if $DEBUG;
+
+    # return undef without an error if either side of the dot is unviable
+    return undef unless defined($root) and defined($item);
+
+    # or if an attempt is made to access a private member, starting _ or .
+    return undef if $PRIVATE && $item =~ /$PRIVATE/;
+
+    if ($atroot || $rootref eq 'HASH') {
+        # if $root is a regular HASH or a Template::Stash kinda HASH (the 
+        # *real* root of everything).  We first lookup the named key 
+        # in the hash, or create an empty hash in its place if undefined
+        # and the $lvalue flag is set.  Otherwise, we check the HASH_OPS
+        # pseudo-methods table, calling the code if found, or return undef.
+        
+        if (defined($value = $root->{ $item })) {
+            return $value unless ref $value eq 'CODE';      ## RETURN
+            @result = &$value(@$args);                      ## @result
+        }
+        elsif ($lvalue) {
+            # we create an intermediate hash if this is an lvalue
+            return $root->{ $item } = { };                  ## RETURN
+        }
+        # ugly hack: only allow import vmeth to be called on root stash
+        # NOTE: && binds tighter than ||, so the condition below reads as
+        # (hash op found && not at root) || $item is 'import'.  The lookup
+        # is evaluated first, so $value is already set in the 'import' case.
+        elsif (($value = $HASH_OPS->{ $item })
+               && ! $atroot || $item eq 'import') {
+            @result = &$value($root, @$args);               ## @result
+        }
+        elsif ( ref $item eq 'ARRAY' ) {
+            # hash slice
+            return [@$root{@$item}];                        ## RETURN
+        }
+    }
+    elsif ($rootref eq 'ARRAY') {    
+        # if root is an ARRAY then we check for a LIST_OPS pseudo-method 
+        # or return the numerical index into the array, or undef
+        if ($value = $LIST_OPS->{ $item }) {
+            @result = &$value($root, @$args);               ## @result
+        }
+        elsif ($item =~ /^-?\d+$/) {
+            $value = $root->[$item];
+            return $value unless ref $value eq 'CODE';      ## RETURN
+            @result = &$value(@$args);                      ## @result
+        }
+        elsif ( ref $item eq 'ARRAY' ) {
+            # array slice
+            return [@$root[@$item]];                        ## RETURN
+        }
+    }
+    
+    # NOTE: we do the can-can because UNIVERSAL::isa($something, 'UNIVERSAL')
+    # doesn't appear to work with CGI, returning true for the first call
+    # and false for all subsequent calls. 
+    
+    # UPDATE: that doesn't appear to be the case any more
+    
+    elsif (blessed($root) && $root->can('can')) {
+
+        # if $root is a blessed reference (i.e. inherits from the 
+        # UNIVERSAL object base class) then we call the item as a method.
+        # If that fails then we try to fallback on HASH behaviour if 
+        # possible.
+        eval { @result = $root->$item(@$args); };       
+        
+        if ($@) {
+            # temporary hack - required to propagate errors thrown
+            # by views; if $@ is a ref (e.g. Template::Exception
+            # object then we assume it's a real error that needs
+            # real throwing
+
+            # anything other than "no such method on this class" is re-thrown
+            my $class = ref($root) || $root;
+            die $@ if ref($@) || ($@ !~ /Can't locate object method "\Q$item\E" via package "\Q$class\E"/);
+
+            # failed to call object method, so try some fallbacks
+            if (reftype $root eq 'HASH') {
+                if( defined($value = $root->{ $item })) {
+                    return $value unless ref $value eq 'CODE';      ## RETURN
+                    @result = &$value(@$args);
+                }
+                elsif ($value = $HASH_OPS->{ $item }) {
+                    @result = &$value($root, @$args);
+                }
+                elsif ($value = $LIST_OPS->{ $item }) {
+                    # treat the object as a one-element list
+                    @result = &$value([$root], @$args);
+                }
+            }
+            elsif (reftype $root eq 'ARRAY') {
+                if( $value = $LIST_OPS->{ $item }) {
+                   @result = &$value($root, @$args);
+                }
+                elsif( $item =~ /^-?\d+$/ ) {
+                   $value = $root->[$item];
+                   return $value unless ref $value eq 'CODE';      ## RETURN
+                   @result = &$value(@$args);                      ## @result
+                }
+                elsif ( ref $item eq 'ARRAY' ) {
+                    # array slice
+                    return [@$root[@$item]];                        ## RETURN
+                }
+            }
+            elsif ($value = $SCALAR_OPS->{ $item }) {
+                @result = &$value($root, @$args);
+            }
+            elsif ($value = $LIST_OPS->{ $item }) {
+                @result = &$value([$root], @$args);
+            }
+            elsif ($self->{ _DEBUG }) {
+                # undef first item + message: re-thrown by the fold below
+                @result = (undef, $@);
+            }
+        }
+    }
+    elsif (($value = $SCALAR_OPS->{ $item }) && ! $lvalue) {
+        # at this point, it doesn't look like we've got a reference to
+        # anything we know about, so we try the SCALAR_OPS pseudo-methods
+        # table (but not for l-values)
+        @result = &$value($root, @$args);           ## @result
+    }
+    elsif (($value = $LIST_OPS->{ $item }) && ! $lvalue) {
+        # last-ditch: can we promote a scalar to a one-element
+        # list and apply a LIST_OPS virtual method?
+        @result = &$value([$root], @$args);
+    }
+    elsif ($self->{ _DEBUG }) {
+        die "don't know how to access [ $root ].$item\n";   ## DIE
+    }
+    else {
+        @result = ();
+    }
+
+    # fold multiple return items into a list unless first item is undef
+    if (defined $result[0]) {
+        return                              ## RETURN
+        scalar @result > 1 ? [ @result ] : $result[0];
+    }
+    elsif (defined $result[1]) {
+        # (undef, $error) convention: the second item is thrown as the error
+        die $result[1];                     ## DIE
+    }
+    elsif ($self->{ _DEBUG }) {
+        die "$item is undefined\n";         ## DIE
+    }
+
+    return undef;
+}
+
+
+#------------------------------------------------------------------------
+# _assign($root, $item, \@args, $value, $default)
+#
+# Similar to _dotop() above, but assigns a value to the given variable
+# instead of simply returning it.  The first three parameters are the
+# root item, the item and arguments, as per _dotop(), followed by the 
+# value to which the variable should be set and an optional $default
+# flag.  If set true, the variable will only be set if currently false
+# (undefined/zero)
+#------------------------------------------------------------------------
+
+sub _assign {
+    # Assign $value to $item within $root.
+    #
+    #   $root    - hash ref, array ref or blessed object to assign into
+    #   $item    - hash key, array index or method name
+    #   $args    - optional list ref of arguments passed to an object method
+    #   $value   - the value to assign
+    #   $default - if true, assign only when the current value is false
+    #
+    # Returns the assigned value (or the method's return value for objects),
+    # or undef if nothing was assigned.  Dies if $root is of a kind that
+    # cannot be assigned to.
+    my ($self, $root, $item, $args, $value, $default) = @_;
+    my $rootref = ref $root;
+    my $atroot  = ($root eq $self);
+    $args ||= [ ];
+    $default ||= 0;
+
+    # return undef without an error if either side of the dot is unviable
+    return undef unless $root and defined $item;
+
+    # or if an attempt is made to update a private member, starting _ or .
+    return undef if $PRIVATE && $item =~ /$PRIVATE/;
+    
+    if ($rootref eq 'HASH' || $atroot) {
+        # if the root is a hash we set the named key
+        return ($root->{ $item } = $value)          ## RETURN
+            unless $default && $root->{ $item };
+    }
+    elsif ($rootref eq 'ARRAY' && $item =~ /^-?\d+$/) {
+        # or set a list item by index number; the current value must be
+        # read with an array subscript, since a hash subscript on an array
+        # reference dies with "Not a HASH reference"
+        return ($root->[$item] = $value)            ## RETURN
+            unless $default && $root->[$item];
+    }
+    elsif (blessed($root)) {
+        # try to call the item as a method of an object
+        return $root->$item(@$args, $value)         ## RETURN
+            unless $default && $root->$item();
+
+        # TODO: two known issues remain here:
+        #   - the method call should be wrapped in eval { }
+        #   - we should fall back on hash assignment if the object
+        #     method is not found
+    }
+    else {
+        die "don't know how to assign to [$root].[$item]\n";    ## DIE
+    }
+
+    return undef;
+}
+
+
+#------------------------------------------------------------------------
+# _dump()
+#
+# Debug method which returns a string representing the internal state
+# of the object.  The method calls itself recursively to dump sub-hashes.
+#------------------------------------------------------------------------
+
+sub _dump {
+    my ($self) = @_;
+
+    # render the stash contents, starting at indent level 2
+    my $frame = $self->_dump_frame(2);
+    return "[Template::Stash] " . $frame;
+}
+
+sub _dump_frame {
+    # Return a textual dump of the hash in $self, one "key => value" line
+    # per entry, indented by $indent levels (default 1).  Keys starting
+    # with '.' are skipped, undefined values are shown as '<undef>', array
+    # references are shown inline and nested hashes are dumped recursively.
+    # Recursion is cut off beyond 32 levels.
+    my ($self, $indent) = @_;
+    $indent ||= 1;
+    my $buffer = '    ';
+    my $pad    = $buffer x $indent;
+    my $text   = "{\n";
+    local $" = ', ';
+
+    return $text . "...excessive recursion, terminating\n"
+        if $indent > 32;
+    
+    foreach my $key (keys %$self) {
+        next if $key =~ /^\./;
+
+        my $value = $self->{ $key };
+        $value = '<undef>' unless defined $value;
+
+        if (ref($value) eq 'ARRAY') {
+            $value = '[ ' . join(', ', map { defined $_ ? $_ : '<undef>' }
+                                 @$value) . ' ]';
+        }
+        elsif (ref $value eq 'HASH') {
+            $value = _dump_frame($value, $indent + 1);
+        }
+        
+        # pass the value as an argument rather than interpolating it into
+        # the format, so that a '%' in the data is printed literally
+        $text .= sprintf("%s%-16s => %s\n", $pad, $key, $value);
+    }
+    $text .= $buffer x ($indent - 1) . '}';
+    return $text;
+}
+
+
+1;
+
+__END__
+
+=head1 NAME
+
+Template::Stash - Magical storage for template variables
+
+=head1 SYNOPSIS
+
+    use Template::Stash;
+    
+    my $stash = Template::Stash->new(\%vars);
+    
+    # get variable values
+    $value = $stash->get($variable);
+    $value = $stash->get(\@compound);
+    
+    # set variable value
+    $stash->set($variable, $value);
+    $stash->set(\@compound, $value);
+    
+    # default variable value
+    $stash->set($variable, $value, 1);
+    $stash->set(\@compound, $value, 1);
+    
+    # set variable values en masse
+    $stash->update(\%new_vars);
+    
+    # methods for (de-)localising variables
+    $stash = $stash->clone(\%new_vars);
+    $stash = $stash->declone();
+
+=head1 DESCRIPTION
+
+The C<Template::Stash> module defines an object class which is used to store
+variable values for the runtime use of the template processor.  Variable
+values are stored internally in a hash reference (which itself is blessed 
+to create the object) and are accessible via the L<get()> and L<set()> methods.
+
+Variables may reference hash arrays, lists, subroutines and objects
+as well as simple values.  The stash automatically performs the right
+magic when dealing with variables, calling code or object methods,
+indexing into lists, hashes, etc.
+
+The stash has L<clone()> and L<declone()> methods which are used by the
+template processor to make temporary copies of the stash for
+localising changes made to variables.
+
+=head1 PUBLIC METHODS
+
+=head2 new(\%params)
+
+The C<new()> constructor method creates and returns a reference to a new
+C<Template::Stash> object.  
+
+    my $stash = Template::Stash->new();
+
+A hash reference may be passed to provide variables and values which
+should be used to initialise the stash.
+
+    my $stash = Template::Stash->new({ var1 => 'value1', 
+                                       var2 => 'value2' });
+
+=head2 get($variable)
+
+The C<get()> method retrieves the variable named by the first parameter.
+
+    $value = $stash->get('var1');
+
+Dotted compound variables can be retrieved by specifying the variable
+elements by reference to a list.  Each node in the variable occupies
+two entries in the list.  The first gives the name of the variable
+element, the second is a reference to a list of arguments for that 
+element, or C<0> if none.
+
+    [% foo.bar(10).baz(20) %]
+    
+    $stash->get([ 'foo', 0, 'bar', [ 10 ], 'baz', [ 20 ] ]);
+
+=head2 set($variable, $value, $default)
+
+The C<set()> method sets the variable name in the first parameter to the 
+value specified in the second.
+
+    $stash->set('var1', 'value1');
+
+If the third parameter evaluates to a true value, the variable is
+set only if it did not have a true value before.
+
+    $stash->set('var2', 'default_value', 1);
+
+Dotted compound variables may be specified as per L<get()> above.
+
+    [% foo.bar = 30 %]
+    
+    $stash->set([ 'foo', 0, 'bar', 0 ], 30);
+
+The magical variable 'C<IMPORT>' can be specified whose corresponding
+value should be a hash reference.  The contents of the hash array are
+copied (i.e. imported) into the current namespace.
+
+    # foo.bar = baz, foo.wiz = waz
+    $stash->set('foo', { 'bar' => 'baz', 'wiz' => 'waz' });
+    
+    # import 'foo' into main namespace: bar = baz, wiz = waz
+    $stash->set('IMPORT', $stash->get('foo'));
+
+=head2 clone(\%params)
+
+The C<clone()> method creates and returns a new C<Template::Stash> object
+which represents a localised copy of the parent stash. Variables can be freely
+updated in the cloned stash and when L<declone()> is called, the original stash
+is returned with all its members intact and in the same state as they were
+before C<clone()> was called.
+
+For convenience, a hash of parameters may be passed into C<clone()> which 
+is used to update any simple variables (i.e. those that don't contain any 
+namespace elements, like C<foo> and C<bar> but not C<foo.bar>) while 
+cloning the stash.  For adding and updating complex variables, the L<set()> 
+method should be used after calling C<clone()>.  This will correctly resolve
+and/or create any necessary namespace hashes.
+
+A cloned stash maintains a reference to the stash that it was copied 
+from in its C<_PARENT> member.
+
+=head2 declone()
+
+The C<declone()> method returns the C<_PARENT> reference and can be used to
+restore the state of a stash as described above.
+
+=head1 AUTHOR
+
+Andy Wardley E<lt>abw at wardley.orgE<gt> L<http://wardley.org/>
+
+=head1 COPYRIGHT
+
+Copyright (C) 1996-2007 Andy Wardley.  All Rights Reserved.
+
+This module is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=head1 SEE ALSO
+
+L<Template>, L<Template::Context>
+
+=cut
+
+# Local Variables:
+# mode: perl
+# perl-indent-level: 4
+# indent-tabs-mode: nil
+# End:
+#
+# vim: expandtab shiftwidth=4:
diff --git a/bench/perl/Template/Stash/Context.pm b/bench/perl/Template/Stash/Context.pm
new file mode 100644
index 0000000..49ae83d
--- /dev/null
+++ b/bench/perl/Template/Stash/Context.pm
@@ -0,0 +1,773 @@
+#============================================================= -*-Perl-*-
+#
+# Template::Stash::Context
+#
+# DESCRIPTION
+#   This is an alternate stash object which includes a patch from 
+#   Craig Barratt to implement various new virtual methods to allow
+#   dotted template variable to denote if object methods and subroutines
+#   should be called in scalar or list context.  It adds a little overhead
+#   to each stash call and I'm a little wary of doing that.  So for now,
+#   it's implemented as a separate stash module which will allow us to 
+#   test it out, benchmark it and switch it in or out as we require.
+#
+#   This is what Craig has to say about it:
+#   
+#   Here's a better set of features for the core.  Attached is a new version
+#   of Stash.pm (based on TT2.02) that:
+#   
+#     - supports the special op "scalar" that forces scalar context on
+#       function calls, eg:
+#   
+#           cgi.param("foo").scalar
+#   
+#       calls cgi.param("foo") in scalar context (unlike my wimpy
+#       scalar op from last night).  Array context is the default.
+#   
+#       With non-function operands, scalar behaves like the perl
+#       version (eg: no-op for scalar, size for arrays, etc).
+#   
+#     - supports the special op "ref" that behaves like the perl ref.
+#       If applied to a function the function is not called.  Eg:
+#   
+#           cgi.param("foo").ref
+#   
+#       does *not* call cgi.param and evaluates to "CODE".  Similarly,
+#       HASH.ref, ARRAY.ref return what you expect.
+#   
+#     - adds a new scalar and list op called "array" that is a no-op for
+#       arrays and promotes scalars to one-element arrays.
+#   
+#     - allows scalar ops to be applied to arrays and hashes in place,
+#       eg: ARRAY.repeat(3) repeats each element in place.
+#   
+#     - allows list ops to be applied to scalars by promoting the scalars
+#       to one-element arrays (like an implicit "array").  So you can
+#       do things like SCALAR.size, SCALAR.join and get a useful result.
+#   
+#       This also means you can now use x.0 to safely get the first element
+#       whether x is an array or scalar.
+#   
+#   The new Stash.pm passes the TT2.02 test suite.  But I haven't tested the
+#   new features very much.  One nagging implementation problem is that the
+#   "scalar" and "ref" ops have higher precedence than user variable names.
+#   
+# AUTHORS
+#   Andy Wardley  <abw at kfs.org>
+#   Craig Barratt <craig at arraycomm.com>
+#
+# COPYRIGHT
+#   Copyright (C) 1996-2001 Andy Wardley.  All Rights Reserved.
+#   Copyright (C) 1998-2001 Canon Research Centre Europe Ltd.
+#
+#   This module is free software; you can redistribute it and/or
+#   modify it under the same terms as Perl itself.
+#
+#============================================================================
+
+package Template::Stash::Context;
+
+use strict;
+use warnings;
+use base 'Template::Stash';
+
+our $VERSION = 1.63;
+our $DEBUG   = 0 unless defined $DEBUG;
+
+
+#========================================================================
+#                    -- PACKAGE VARIABLES AND SUBS --
+#========================================================================
+
+#------------------------------------------------------------------------
+# copy virtual methods from those in the regular Template::Stash
+#------------------------------------------------------------------------
+
+# Each table below is built as a fresh hash.  Later entries in a hash
+# constructor override earlier ones with the same key, so the order is:
+# the parent Template::Stash table first, then any ops added here, and
+# finally whatever this package's own table already held (if defined),
+# which therefore takes precedence over everything else.
+
+# virtual methods available on the root stash
+our $ROOT_OPS = { 
+    %$Template::Stash::ROOT_OPS,
+    defined $ROOT_OPS ? %$ROOT_OPS : (),
+};
+
+# scalar virtual methods; 'array' wraps a scalar in a one-element list
+our $SCALAR_OPS = { 
+    %$Template::Stash::SCALAR_OPS,
+    'array' => sub { return [$_[0]] },
+    defined $SCALAR_OPS ? %$SCALAR_OPS : (),
+};
+
+# list virtual methods; 'array' returns the list unchanged
+our $LIST_OPS = { 
+    %$Template::Stash::LIST_OPS,
+    'array' => sub { return $_[0] },
+    defined $LIST_OPS ? %$LIST_OPS : (),
+};
+                    
+# hash virtual methods
+our $HASH_OPS = { 
+    %$Template::Stash::HASH_OPS,
+    defined $HASH_OPS ? %$HASH_OPS : (),
+};
+ 
+
+
+#========================================================================
+#                      -----  CLASS METHODS -----
+#========================================================================
+
+#------------------------------------------------------------------------
+# new(\%params)
+#
+# Constructor method which creates a new Template::Stash object.
+# An optional hash reference may be passed containing variable 
+# definitions that will be used to initialise the stash.
+#
+# Returns a reference to a newly created Template::Stash.
+#------------------------------------------------------------------------
+
+sub new {
+    my ($class, @rest) = @_;
+
+    # accept either a single hash reference or a flat list of key/value pairs
+    my $params = ref $rest[0] eq 'HASH' ? shift(@rest) : { @rest };
+
+    # Later entries win: the root ops override user parameters of the same
+    # name, and the bookkeeping members are always set last.
+    my %members = (
+        global    => { },
+        %$params,
+        %$ROOT_OPS,
+        '_PARENT' => undef,
+        '_CLASS'  => $class,
+    );
+
+    return bless \%members, $class;
+}
+
+
+#========================================================================
+#                   -----  PUBLIC OBJECT METHODS -----
+#========================================================================
+
+#------------------------------------------------------------------------
+# clone(\%params)
+#
+# Creates a copy of the current stash object to effect localisation 
+# of variables.  The new stash is blessed into the same class as the 
+# parent (which may be a derived class) and has a '_PARENT' member added
+# which contains a reference to the parent stash that created it
+# ($self).  This member is used in a successive declone() method call to
+# return the reference to the parent.
+# 
+# A parameter may be provided which should reference a hash of 
+# variable/values which should be defined in the new stash.  The 
+# update() method is called to define these new variables in the cloned
+# stash.
+#
+# Returns a reference to a cloned Template::Stash.
+#------------------------------------------------------------------------
+
+sub clone {
+    my ($self, $params) = @_;
+    $params ||= { };
+
+    # The magical 'import' argument is honoured only when it is a hash (or
+    # hash-based object); it is then removed from $params so that it is not
+    # copied into the clone as an ordinary variable.
+    my $import;
+    my $candidate = $params->{ import };
+    if (defined $candidate && UNIVERSAL::isa($candidate, 'HASH')) {
+        $import = delete $params->{ import };
+    }
+
+    # parent members first, new data second, parent link last
+    my %members = (
+        %$self,
+        %$params,
+        '_PARENT' => $self,
+    );
+    my $clone = bless \%members, ref $self;
+
+    # perform hash import if defined
+    $HASH_OPS->{ import }->($clone, $import)
+        if defined $import;
+
+    return $clone;
+}
+
+        
+#------------------------------------------------------------------------
+# declone($export) 
+#
+# Returns a reference to the PARENT stash.  When called in the following
+# manner:
+#    $stash = $stash->declone();
+# the reference count on the current stash will drop to 0 and be "freed"
+# and the caller will be left with a reference to the parent.  This 
+# contains the state of the stash before it was cloned.  
+#------------------------------------------------------------------------
+
+sub declone {
+    my ($self) = @_;
+
+    # hand back the parent stash, or this stash itself if it has no parent
+    my $parent = $self->{ _PARENT };
+    return $parent ? $parent : $self;
+}
+
+
+#------------------------------------------------------------------------
+# get($ident)
+# 
+# Returns the value for a variable stored in the stash.  The variable
+# may be specified as a simple string, e.g. 'foo', or as an array 
+# reference representing compound variables.  In the latter case, each
+# pair of successive elements in the list represent a node in the 
+# compound variable.  The first is the variable name, the second a 
+# list reference of arguments or 0 if undefined.  So, the compound 
+# variable [% foo.bar('foo').baz %] would be represented as the list
+# [ 'foo', 0, 'bar', ['foo'], 'baz', 0 ].  Returns the value of the
+# identifier or an empty string if undefined.  Errors are thrown via
+# die().
+#------------------------------------------------------------------------
+
+sub get {
+    my ($self, $ident, $args) = @_;
+    my $root = $self;
+    my $result;
+
+    # a dotted string (e.g. 'foo.bar(10).baz') is converted to the list
+    # form [ name, 0, name, 0, ... ], discarding any parenthesised text
+    if (ref $ident ne 'ARRAY' && $ident =~ /\./) {
+        $ident = [ map { s/\(.*$//; ($_, 0) } split(/\./, $ident) ];
+    }
+
+    if (ref $ident eq 'ARRAY') {
+        # evaluate each name/args pair against the previous result, using
+        # the root stash ($self) as the first implicit 'result'...
+        my $last = $#$ident;
+        my $i    = 0;
+
+        while ($i <= $last) {
+            # a following 'scalar' or 'ref' item is not a node of its own;
+            # it is passed to _dotop() as a modifier for the current node
+            my $modified = $i + 2 <= $last
+                && ($ident->[$i+2] eq "scalar" || $ident->[$i+2] eq "ref");
+
+            if ($modified) {
+                $result = $self->_dotop($root, @{$ident}[$i, $i+1], 0,
+                                        $ident->[$i+2]);
+                $i += 2;        # skip over the modifier
+            }
+            else {
+                $result = $self->_dotop($root, @{$ident}[$i, $i+1]);
+            }
+
+            last unless defined $result;
+            $root = $result;
+            $i += 2;
+        }
+    }
+    else {
+        $result = $self->_dotop($root, $ident, $args);
+    }
+
+    return $result if defined $result;
+    return $self->undefined($ident, $args);
+}
+
+
+#------------------------------------------------------------------------
+# set($ident, $value, $default)
+#
+# Updates the value for a variable in the stash.  The first parameter
+# should be the variable name or array, as per get().  The second 
+# parameter should be the intended value for the variable.  The third,
+# optional parameter is a flag which may be set to indicate 'default'
+# mode.  When set true, the variable will only be updated if it is
+# currently undefined or has a false value.  The magical 'IMPORT'
+# variable identifier may be used to indicate that $value is a hash
+# reference whose values should be imported.  Returns the value set,
+# or an empty string if not set (e.g. default mode).  In the case of 
+# IMPORT, returns the number of items imported from the hash.
+#------------------------------------------------------------------------
+
+sub set {
+    my ($self, $ident, $value, $default) = @_;
+    my $root = $self;
+    my $result;
+
+    # a dotted string is converted to the list form [ name, 0, ... ]
+    if (ref $ident ne 'ARRAY' && $ident =~ /\./) {
+        $ident = [ map { s/\(.*$//; ($_, 0) }
+                   split(/\./, $ident) ];
+    }
+
+    if (ref $ident eq 'ARRAY') {
+        # a compound identifier may contain multiple elements (e.g. 
+        # foo.bar.baz) and we must first resolve all but the last, 
+        # using _dotop() with the $lvalue flag set which will create 
+        # intermediate hashes if necessary...
+        my $last     = $#$ident;
+        my $resolved = 1;
+
+        for (my $i = 0; $i < $last - 2; $i += 2) {
+            $result = $self->_dotop($root, @{$ident}[$i, $i+1], 1);
+            unless (defined $result) {
+                # an intermediate element is undefined: give up
+                $resolved = 0;
+                last;
+            }
+            $root = $result;
+        }
+
+        # then we call _assign() to assign the value to the last element
+        $result = $self->_assign($root, @{$ident}[$last-1, $last],
+                                 $value, $default)
+            if $resolved;
+    }
+    else {
+        $result = $self->_assign($root, $ident, 0, $value, $default);
+    }
+
+    return $result if defined $result;
+    return '';
+}
+
+
+#------------------------------------------------------------------------
+# getref($ident)
+# 
+# Returns a "reference" to a particular item.  This is represented as a 
+# closure which will return the actual stash item when called.  
+# WARNING: still experimental!
+#------------------------------------------------------------------------
+
+sub getref {
+    my ($self, $ident, $args) = @_;
+    my $root = $self;
+    my $item;
+
+    if (ref $ident ne 'ARRAY') {
+        $item = $ident;
+    }
+    else {
+        # walk the name/args pairs, resolving every node except the last,
+        # which is left for the returned closure to evaluate
+        my $last = $#$ident;
+
+        for (my $i = 0; $i <= $last; $i += 2) {
+            ($item, $args) = @{$ident}[$i, $i + 1];
+            last if $i >= $last - 2;  # don't evaluate last node
+            $root = $self->_dotop($root, $item, $args);
+            last unless defined $root;
+        }
+    }
+
+    # nothing to refer to: hand back a closure yielding an empty string
+    return sub { '' } unless defined $root;
+
+    # arguments given at call time are appended to those captured here
+    return sub {
+        my @args = (@{ $args || [] }, @_);
+        $self->_dotop($root, $item, \@args);
+    };
+}
+
+
+
+
+#------------------------------------------------------------------------
+# update(\%params)
+#
+# Update multiple variables en masse.  No magic is performed.  Simple
+# variable names only.
+#------------------------------------------------------------------------
+
+sub update {
+    my ($self, $params) = @_;
+
+    # A hash (or hash-based object) under the magical 'import' key is
+    # merged into the stash first and then dropped from $params, so the
+    # explicit parameters copied below override any imported values.
+    my $imported = $params->{ import };
+
+    if (defined $imported && UNIVERSAL::isa($imported, 'HASH')) {
+        foreach my $name (keys %$imported) {
+            $self->{ $name } = $imported->{ $name };
+        }
+        delete $params->{ import };
+    }
+
+    # plain key => value copy; no variable magic is applied
+    @{ $self }{ keys %$params } = values %$params;
+}
+
+
+#========================================================================
+#                  -----  PRIVATE OBJECT METHODS -----
+#========================================================================
+
+#------------------------------------------------------------------------
+# _dotop($root, $item, \@args, $lvalue, $nextItem)
+#
+# This is the core 'dot' operation method which evaluates elements of 
+# variables against their root.  All variables have an implicit root 
+# which is the stash object itself (a hash).  Thus, a non-compound 
+# variable 'foo' is actually '(stash.)foo', the compound 'foo.bar' is
+# '(stash.)foo.bar'.  The first parameter is a reference to the current
+# root, initially the stash itself.  The second parameter contains the 
+# name of the variable element, e.g. 'foo'.  The third optional
+# parameter is a reference to a list of any parenthesised arguments 
+# specified for the variable, which are passed to sub-routines, object 
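+# methods, etc.  The fourth parameter is an optional flag to indicate 
+# if this variable is being evaluated on the left side of an assignment
+# (e.g. foo.bar.baz = 10).  When set true, intermediate hashes will 
+# be created (e.g. bar) if necessary.  The fifth, optional parameter is
+# the name of the following item when that item is "scalar" or "ref",
+# as passed by get().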
+#
+# Returns the result of evaluating the item against the root, having
+# performed any variable "magic".  The value returned can then be used
+# as the root of the next _dotop() in a compound sequence.  Returns
+# undef if the variable is undefined.
+#------------------------------------------------------------------------
+
+sub _dotop {
+    my ($self, $root, $item, $args, $lvalue, $nextItem) = @_;
+    my $rootref = ref $root;
+    my ($value, @result, $ret, $retVal);
+    $nextItem ||= "";
+    # NB: plain assignments - "my $x = 1 if COND" is undefined per perlsyn
+    my $scalarContext = ( $nextItem eq "scalar" );
+    my $returnRef     = ( $nextItem eq "ref" );
+    $args ||= [ ];
+    $lvalue ||= 0;
+
+#    print STDERR "_dotop(root=$root, item=$item, args=[@$args])\n"
+#       if $DEBUG;
+
+    # return undef without an error if either side of the dot is unviable
+    # or if an attempt is made to access a private member, starting _ or .
+    return undef
+        unless defined($root) and defined($item) and $item !~ /^[\._]/;
+
+    if (ref(\$root) eq "SCALAR" && !$lvalue &&
+            (($value = $LIST_OPS->{ $item }) || $item =~ /^-?\d+$/) ) {
+        #
+        # Promote scalar to one element list, to be processed below.
+        #
+        $rootref = 'ARRAY';
+        $root = [$root];
+    }
+    if ($rootref eq $self->{_CLASS} || $rootref eq 'HASH') {
+
+        # if $root is a regular HASH or a Template::Stash kinda HASH (the 
+        # *real* root of everything).  We first lookup the named key 
+        # in the hash, or create an empty hash in its place if undefined
+        # and the $lvalue flag is set.  Otherwise, we check the HASH_OPS
+        # pseudo-methods table, calling the code if found, or return undef.
+
+        if (defined($value = $root->{ $item })) {
+            ($ret, $retVal, @result) = _dotop_return($value, $args, $returnRef,
+                                                     $scalarContext);
+            return $retVal if ( $ret );                     ## RETURN
+        }
+        elsif ($lvalue) {
+            # we create an intermediate hash if this is an lvalue
+            return $root->{ $item } = { };                  ## RETURN
+        }
+        elsif ($value = $HASH_OPS->{ $item }) {
+            @result = &$value($root, @$args);               ## @result
+        }
+        elsif (ref $item eq 'ARRAY') {
+            # hash slice
+            return [@$root{@$item}];                       ## RETURN
+        }
+        elsif ($value = $SCALAR_OPS->{ $item }) {
+            #
+            # Apply scalar ops to every hash element, in place.
+            #
+            foreach my $key ( keys %$root ) {
+                $root->{$key} = &$value($root->{$key}, @$args);
+            }
+        }
+    }
+    elsif ($rootref eq 'ARRAY') {
+
+        # if root is an ARRAY then we check for a LIST_OPS pseudo-method 
+        # (except for l-values for which it doesn't make any sense)
+        # or return the numerical index into the array, or undef
+
+        if (($value = $LIST_OPS->{ $item }) && ! $lvalue) {
+            @result = &$value($root, @$args);               ## @result
+        }
+        elsif (($value = $SCALAR_OPS->{ $item }) && ! $lvalue) {
+            #
+            # Apply scalar ops to every array element, in place.
+            #
+            for ( my $i = 0 ; $i < @$root ; $i++ ) {
+                $root->[$i] = &$value($root->[$i], @$args); ## @result
+            }
+        }
+        elsif ($item =~ /^-?\d+$/) {
+            $value = $root->[$item];
+            ($ret, $retVal, @result) = _dotop_return($value, $args, $returnRef,
+                                                     $scalarContext);
+            return $retVal if ( $ret );                     ## RETURN
+        }
+        elsif (ref $item eq 'ARRAY' ) {
+            # array slice
+            return [@$root[@$item]];                        ## RETURN
+        }
+    }
+
+    # NOTE: we do the can-can because UNIVERSAL::isa($something, 'UNIVERSAL')
+    # doesn't appear to work with CGI, returning true for the first call
+    # and false for all subsequent calls. 
+
+    elsif (ref($root) && UNIVERSAL::can($root, 'can')) {
+
+        # if $root is a blessed reference (i.e. inherits from the 
+        # UNIVERSAL object base class) then we call the item as a method.
+        # If that fails then we try to fallback on HASH behaviour if 
+        # possible.
+        return ref $root->can($item) if ( $returnRef );       ## RETURN
+        eval {
+            @result = $scalarContext ? scalar $root->$item(@$args)
+                                     : $root->$item(@$args);  ## @result
+        };
+
+        if ($@) {
+            # failed to call object method, so try some fallbacks
+            if (UNIVERSAL::isa($root, 'HASH')
+                    && defined($value = $root->{ $item })) {
+                ($ret, $retVal, @result) = _dotop_return($value, $args,
+                                                    $returnRef, $scalarContext);
+                return $retVal if ( $ret );                     ## RETURN
+            }
+            elsif (UNIVERSAL::isa($root, 'ARRAY') 
+                   && ($value = $LIST_OPS->{ $item })) {
+                @result = &$value($root, @$args);
+            }
+            else {
+                @result = (undef, $@);
+            }
+        }
+    }
+    elsif (($value = $SCALAR_OPS->{ $item }) && ! $lvalue) {
+
+        # at this point, it doesn't look like we've got a reference to
+        # anything we know about, so we try the SCALAR_OPS pseudo-methods
+        # table (but not for l-values)
+
+        @result = &$value($root, @$args);                   ## @result
+    }
+    elsif ($self->{ _DEBUG }) {
+        die "don't know how to access [ $root ].$item\n";   ## DIE
+    }
+    else {
+        @result = ();
+    }
+
+    # fold multiple return items into a list unless first item is undef
+    if (defined $result[0]) {
+        return ref(@result > 1 ? [ @result ] : $result[0])
+                                            if ( $returnRef );  ## RETURN
+        if ( $scalarContext ) {
+            return scalar @result if ( @result > 1 );           ## RETURN
+            return scalar(@{$result[0]}) if ( ref $result[0] eq "ARRAY" );
+            return scalar(%{$result[0]}) if ( ref $result[0] eq "HASH" );
+            return $result[0];                                  ## RETURN
+        } else {
+            return @result > 1 ? [ @result ] : $result[0];      ## RETURN
+        }
+    }
+    elsif (defined $result[1]) {
+        die $result[1];                                     ## DIE
+    }
+    elsif ($self->{ _DEBUG }) {
+        die "$item is undefined\n";                         ## DIE
+    }
+
+    return undef;
+}
+
+#------------------------------------------------------------------------
+# ($ret, $retVal, @result) = _dotop_return($value, $args, $returnRef,
+#                                          $scalarContext);
+#
+# Handle the various return processing for _dotop
+#------------------------------------------------------------------------
+
+sub _dotop_return
+{
+    my ($value, $args, $returnRef, $scalarContext) = @_;
+    my $type = ref $value;
+
+    # the "ref" op only reports the reference type; nothing gets called
+    return (1, $type) if $returnRef;                              ## RETURN
+    if ($type eq 'CODE') {
+        # run the sub in the requested context; caller folds @result
+        my @result = $scalarContext ? scalar &$value(@$args)
+                                    : &$value(@$args);            ## @result
+        return (0, undef, @result);
+    }
+    return (1, $value)          unless $scalarContext;            ## RETURN
+    return (1, scalar(@$value)) if $type eq 'ARRAY';              ## RETURN
+    return (1, scalar(%$value)) if $type eq 'HASH';               ## RETURN
+    return (1, scalar($value));                                   ## RETURN
+}
+
+
+#------------------------------------------------------------------------
+# _assign($root, $item, \@args, $value, $default)
+#
+# Similar to _dotop() above, but assigns a value to the given variable
+# instead of simply returning it.  The first three parameters are the
+# root item, the item and arguments, as per _dotop(), followed by the 
+# value to which the variable should be set and an optional $default
+# flag.  If set true, the variable will only be set if currently false
+# (undefined/zero)
+#------------------------------------------------------------------------
+
+sub _assign {
+    my ($self, $root, $item, $args, $value, $default) = @_;
+    my $rootref = ref $root;
+    $args ||= [ ];
+    $default ||= 0;
+
+#    print(STDERR "_assign(root=$root, item=$item, args=[@$args], \n",
+#                         "value=$value, default=$default)\n")
+#       if $DEBUG;
+
+    # return undef without an error if either side of the dot is unviable
+    # or if an attempt is made to update a private member, starting _ or .
+    return undef                                                ## RETURN
+        unless $root and defined $item and $item !~ /^[\._]/;
+    
+    if ($rootref eq 'HASH' || $rootref eq $self->{_CLASS}) {
+        # if the root is a hash we set the named key, unless this is a
+        # DEFAULT assignment and the key already holds a true value
+        return ($root->{ $item } = $value)                      ## RETURN
+            unless $default && $root->{ $item };
+    }
+    elsif ($rootref eq 'ARRAY' && $item =~ /^-?\d+$/) {
+        # or set a list item by index number; the DEFAULT check must
+        # inspect the array slot itself, not treat the list as a hash
+        return ($root->[$item] = $value)                        ## RETURN
+            unless $default && $root->[$item];
+    }
+    elsif (UNIVERSAL::isa($root, 'UNIVERSAL')) {
+        # try to call the item as a method of an object
+        return $root->$item(@$args, $value);                    ## RETURN
+    }
+    else {
+        die "don't know how to assign to [$root].[$item]\n";    ## DIE
+    }
+
+    return undef;
+}
+
+
+#------------------------------------------------------------------------
+# _dump()
+#
+# Debug method which returns a string representing the internal state
+# of the object.  The method calls itself recursively to dump sub-hashes.
+#------------------------------------------------------------------------
+
+sub _dump {
+    my $self   = shift;
+    my $indent = shift || 1;
+    my $buffer = '    ';
+    my $pad    = $buffer x $indent;
+    my $text   = '';
+    local $" = ', ';          # separator used when list values interpolate
+
+    my ($key, $value);
+
+    # guard against cyclic structures: give up beyond 32 nested levels
+    return $text . "...excessive recursion, terminating\n"
+        if $indent > 32;
+
+    foreach $key (keys %$self) {
+
+        $value = $self->{ $key };
+        $value = '<undef>' unless defined $value;
+
+        if (ref($value) eq 'ARRAY') {
+            $value = "$value [@$value]";
+        }
+        # pass the value as an argument: a '%' in it must not act as a format
+        $text .= sprintf("%s%-8s => %s\n", $pad, $key, $value);
+        next if $key =~ /^\./;          # listed, but never descended into
+        if (UNIVERSAL::isa($value, 'HASH')) {
+            $text .= _dump($value, $indent + 1);
+        }
+    }
+    $text;
+}
+
+
+1;
+
+__END__
+
+=head1 NAME
+
+Template::Stash::Context - Experimental stash allowing list/scalar context definition
+
+=head1 SYNOPSIS
+
+    use Template;
+    use Template::Stash::Context;
+
+    my $stash = Template::Stash::Context->new(\%vars);
+    my $tt2   = Template->new({ STASH => $stash });
+
+=head1 DESCRIPTION
+
+This is an alternate stash object which includes a patch from 
+Craig Barratt to implement various new virtual methods to allow
+dotted template variable to denote if object methods and subroutines
+should be called in scalar or list context.  It adds a little overhead
+to each stash call and I'm a little wary of applying that to the core
+default stash without investigating the effects first. So for now,
+it's implemented as a separate stash module which will allow us to 
+test it out, benchmark it and switch it in or out as we require.
+
+This is what Craig has to say about it:
+
+Here's a better set of features for the core.  Attached is a new version
+of Stash.pm (based on TT2.02) that:
+
+* supports the special op "scalar" that forces scalar context on
+function calls, eg:
+
+    cgi.param("foo").scalar
+
+calls cgi.param("foo") in scalar context (unlike my wimpy
+scalar op from last night).  Array context is the default.
+
+With non-function operands, scalar behaves like the perl
+version (eg: no-op for scalar, size for arrays, etc).
+
+* supports the special op "ref" that behaves like the perl ref.
+If applied to a function the function is not called.  Eg:
+
+    cgi.param("foo").ref
+
+does *not* call cgi.param and evaluates to "CODE".  Similarly,
+HASH.ref, ARRAY.ref return what you expect.
+
+* adds a new scalar and list op called "array" that is a no-op for
+arrays and promotes scalars to one-element arrays.
+
+* allows scalar ops to be applied to arrays and hashes in place,
+eg: ARRAY.repeat(3) repeats each element in place.
+
+* allows list ops to be applied to scalars by promoting the scalars
+to one-element arrays (like an implicit "array").  So you can
+do things like SCALAR.size, SCALAR.join and get a useful result.
+
+This also means you can now use x.0 to safely get the first element
+whether x is an array or scalar.
+
+The new Stash.pm passes the TT2.02 test suite.  But I haven't tested the
+new features very much.  One nagging implementation problem is that the
+"scalar" and "ref" ops have higher precedence than user variable names.
+
+=head1 AUTHOR
+
+Andy Wardley E<lt>abw at wardley.orgE<gt>
+
+L<http://wardley.org/|http://wardley.org/>
+
+
+
+
+=head1 VERSION
+
+1.63, distributed as part of the
+Template Toolkit version 2.19, released on 27 April 2007.
+
+=head1 COPYRIGHT
+
+  Copyright (C) 1996-2007 Andy Wardley.  All Rights Reserved.
+
+
+This module is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=head1 SEE ALSO
+
+L<Template::Stash|Template::Stash>
+
+=cut
+
+# Local Variables:
+# mode: perl
+# perl-indent-level: 4
+# indent-tabs-mode: nil
+# End:
+#
+# vim: expandtab shiftwidth=4:
diff --git a/bench/perl/Template/Stash/XS.pm b/bench/perl/Template/Stash/XS.pm
new file mode 100644
index 0000000..1742d55
--- /dev/null
+++ b/bench/perl/Template/Stash/XS.pm
@@ -0,0 +1,137 @@
+#============================================================= -*-Perl-*-
+# 
+# Template::Stash::XS
+# 
+# DESCRIPTION
+#
+#   Perl bootstrap for XS module. Inherits methods from 
+#   Template::Stash when not implemented in the XS module.
+#
+#========================================================================
+
+package Template::Stash::XS;
+
+use strict;
+use warnings;
+use Template;
+use Template::Stash;
+
+our $AUTOLOAD;
+
+BEGIN {
+    require DynaLoader;
+    our @ISA = qw( DynaLoader Template::Stash );
+
+    # Load the compiled half of the module, asking for the build that
+    # matches $Template::VERSION.  Any failure is re-thrown with the
+    # requested version in front of the original error text.
+    eval { Template::Stash::XS->bootstrap($Template::VERSION) };
+    die "Couldn't load Template::Stash::XS $Template::VERSION:\n\n$@\n"
+        if $@;
+}
+
+sub DESTROY {
+    # deliberately empty; being defined keeps DESTROY out of AUTOLOAD
+    return 1;
+}
+
+
+# catch missing method calls here so perl doesn't barf 
+# trying to load *.al files 
+
+sub AUTOLOAD {
+    my ($self) = @_;
+    my (undef, $file, $line) = caller(0);
+
+    # reduce the fully qualified sub name to the bare method name
+    (my $method = $AUTOLOAD) =~ s/.*:://;
+    # drop everything from the first '=' of the stringified invocant
+    (my $class = $self) =~ s/=.*//;
+    die "Can't locate object method \"$method\"" .
+        " via package \"$class\" at $file line $line\n";
+}
+
+1;
+
+__END__
+
+=head1 NAME
+
+Template::Stash::XS - High-speed variable stash written in C
+
+=head1 SYNOPSIS
+
+    use Template;
+    use Template::Stash::XS;
+
+    my $stash = Template::Stash::XS->new(\%vars);
+    my $tt2   = Template->new({ STASH => $stash });
+
+=head1 DESCRIPTION
+
+The Template::Stash::XS module is an implementation of the
+Template::Stash written in C.  The "XS" in the name refers to Perl's
+XS extension system for interfacing Perl to C code.  It works just
+like the regular Perl implementation of Template::Stash but runs about
+twice as fast.
+
+The easiest way to use the XS stash is to configure the Template
+Toolkit to use it by default.  You can do this at installation time
+(when you run C<perl Makefile.PL>) by answering 'y' to the questions:
+
+    Do you want to build the XS Stash module?      y
+    Do you want to use the XS Stash by default?    y
+
+See the F<INSTALL> file distributed with the Template Toolkit for further
+details on installation.
+
+If you don't elect to use the XS stash by default then you should use
+the C<STASH> configuration item when you create a new Template object.
+This should reference an XS stash object that you have created
+manually.
+
+    use Template;
+    use Template::Stash::XS;
+
+    my $stash = Template::Stash::XS->new(\%vars);
+    my $tt2   = Template->new({ STASH => $stash });
+
+Alternately, you can set the C<$Template::Config::STASH> package
+variable like so:
+
+    use Template;
+    use Template::Config;
+
+    $Template::Config::STASH = 'Template::Stash::XS';
+
+    my $tt2 = Template->new();
+
+The XS stash will then be automatically used.  
+
+If you want to use the XS stash by default and don't want to
+re-install the Template Toolkit, then you can manually modify the
+C<Template/Config.pm> module near line 42 to read:
+
+    $STASH = 'Template::Stash::XS';
+
+=head1 BUGS
+
+Please report bugs to the Template Toolkit mailing list
+templates at template-toolkit.org
+
+=head1 AUTHORS
+
+Andy Wardley E<lt>abw at wardley.orgE<gt> L<http://wardley.org/>
+
+Doug Steinwand E<lt>dsteinwand at citysearch.comE<gt>
+
+=head1 COPYRIGHT
+
+Copyright (C) 1996-2009 Andy Wardley.  All Rights Reserved.
+
+This module is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=head1 SEE ALSO
+
+L<Template::Stash>
diff --git a/bench/perl/Template/Test.pm b/bench/perl/Template/Test.pm
new file mode 100644
index 0000000..c14eb31
--- /dev/null
+++ b/bench/perl/Template/Test.pm
@@ -0,0 +1,709 @@
+#============================================================= -*-Perl-*-
+#
+# Template::Test
+#
+# DESCRIPTION
+#   Module defining a test harness which processes template input and
+#   then compares the output against pre-defined expected output.
+#   Generates test output compatible with Test::Harness.  This was 
+#   originally the t/texpect.pl script.
+#
+# AUTHOR
+#   Andy Wardley   <abw at wardley.org>
+#
+# COPYRIGHT
+#   Copyright (C) 1996-2007 Andy Wardley.  All Rights Reserved.
+#
+#   This module is free software; you can redistribute it and/or
+#   modify it under the same terms as Perl itself.
+#
+#============================================================================
+
+package Template::Test;
+
+use strict;
+use warnings;
+use Template qw( :template );
+use Exporter;
+
+our $VERSION = 2.75;
+our $DEBUG   = 0;
+our @ISA     = qw( Exporter );
+our @EXPORT  = qw( ntests ok is match flush skip_all test_expect callsign banner );
+our @EXPORT_OK = ( 'assert' );
+our %EXPORT_TAGS = ( all => [ @EXPORT_OK, @EXPORT ] );
+$| = 1;              # unbuffered output
+
+our $REASON   = 'not applicable on this platform';
+our $NO_FLUSH = 0;
+our $EXTRA    = 0;   # any extra tests to come after test_expect()
+our $PRESERVE;       # don't mangle newlines in output/expect
+$PRESERVE = 0 unless defined $PRESERVE;   # an already-set value is kept
+
+our ($loaded, %callsign);
+
+# always set binmode on Win32 machines so that any output generated
+# is true to what we expect 
+$Template::BINMODE = ($^O eq 'MSWin32') ? 1 : 0;
+
+my @results = ();            # [ $ok, $msg ] pairs cached before ntests()
+my ($ntests, $ok_count);     # $ok_count stays false until ntests() runs
+*is = \&match;               # is() is an alias for match()
+
+END {
+    # ensure flush() is called to print any cached results 
+    flush();
+}
+
+
+#------------------------------------------------------------------------
+# ntests($n)
+#
+# Declare how many (more) tests are expected to come.  If ok() is called 
+# before ntests() then the results are cached instead of being printed
+# to STDOUT.  When ntests() is called, the total number of tests 
+# (including any cached) is known and the "1..$ntests" line can be
+# printed along with the cached results.  After that, calls to ok() 
+# generate printed output immediately.
+#------------------------------------------------------------------------
+
+sub ntests {
+    $ntests = shift;
+    # grand total = declared tests + pre-declared extras + cached results
+    $ntests += $EXTRA + @results;
+    $ok_count = 1;
+    # a plan of zero tests is reported as a skip, with the reason attached
+    my $plan = "1..$ntests";
+    $plan .= " # skip $REASON" unless $ntests;
+    print "$plan\n";
+    # replay, in order, the results cached before the plan was known
+    ok(@$_) for @results;
+}
+
+
+#------------------------------------------------------------------------
+# ok($truth, $msg)
+#
+# Tests the value passed for truth and generates an "ok $n" or "not ok $n"
+# line accordingly.  If ntests() hasn't been called then we cache 
+# results for later, instead.
+#------------------------------------------------------------------------
+
+sub ok {
+    my ($ok, $msg) = @_;
+
+    # cache results if ntests() not yet called
+    unless ($ok_count) {
+        push(@results, [ $ok, $msg ]);
+        return $ok;
+    }
+
+    # keep $msg untouched so "was a message supplied?" can still be asked
+    my $label = defined $msg ? " - $msg" : '';
+    if ($ok) {
+        print "ok ", $ok_count++, "$label\n";
+    }
+    else {
+        print STDERR "FAILED $ok_count: $label\n" if defined $msg;
+        print "not ok ", $ok_count++, "$label\n";
+    }
+}
+
+
+
+#------------------------------------------------------------------------
+# assert($truth, $error)
+#
+# Test value for truth, die if false.
+#------------------------------------------------------------------------
+
+sub assert {
+    my ($truth, $error) = @_;
+
+    unless ($truth) {
+        # count the failure first, then die naming the caller's file/line
+        my (undef, $file, $line) = caller();
+        ok(0);
+        die(($error || "assert failed") . " at $file line $line\n");
+    }
+    return ok(1);
+}
+
+#------------------------------------------------------------------------
+# match( $result, $expect )
+#------------------------------------------------------------------------
+
+sub match {
+    my ($result, $expect, $msg) = @_;
+
+    # stringify references first, avoiding 'no eq method' overload errors
+    $result = "$result" if ref $result;
+
+    return ok(1, $msg) if $result eq $expect;
+
+    # mismatch: report it under the number this test is about to be given,
+    # which is the next cached slot while ntests() has not yet been called
+    my $count = $ok_count || scalar(@results) + 1;
+    print STDERR "FAILED $count:\n  expect: [$expect]\n  result: [$result]\n";
+
+    return ok(0, $msg);
+}
+
+
+#------------------------------------------------------------------------
+# flush()
+#
+# Flush any tests results.
+#------------------------------------------------------------------------
+
+sub flush {
+    # nothing to do once ntests() has run or when flushing is disabled
+    $ok_count || $NO_FLUSH || ntests(0);
+}
+
+
+#------------------------------------------------------------------------
+# skip_all($reason)
+#
+# Skip all tests, setting $REASON to contain any message passed.  Calls
+# exit(0) which triggers flush() which generates a "1..0 # skip $REASON"
+# string to keep the test harness happy.
+#------------------------------------------------------------------------
+
+sub skip_all {
+    $REASON = join '', @_;    # printed by ntests(0) when END calls flush()
+    exit 0;
+}
+
+
+#------------------------------------------------------------------------
+# test_expect($input, $template, \%replace)
+#
+# This is the main testing sub-routine.  The $input parameter should be a 
+# text string or a filehandle reference (e.g. GLOB or IO::Handle) from
+# which the input text can be read.  The input should contain a number 
+# of tests which are split up and processed individually, comparing the 
+# generated output against the expected output.  Tests should be defined
+# as follows:
+#
+#   -- test --
+#   test input
+#   -- expect --
+#   expected output
+# 
+#   -- test --
+#    etc...
+#
+# The number of tests is determined and ntests() is called to generate 
+# the "1..$n" line compatible with Test::Harness.  Each test input is
+# then processed by the Template object passed as the second parameter,
+# $template.  This may also be a hash reference containing configuration
+# which are used to instantiate a Template object, or may be left 
+# undefined in which case a default Template object will be instantiated.
+# The third parameter, also optional, may be a reference to a hash array
+# defining template variables.  This is passed to the template process()
+# method.
+#------------------------------------------------------------------------
+
+sub test_expect {
+    my ($src, $tproc, $params) = @_;
+    my ($input, @tests);
+    my ($output, $expect, $match);
+    my $count = 0;
+    my $ttprocs;    # name => Template object map, set only for an ARRAY $tproc
+
+    # read input text: slurp a filehandle in one go, or use the string as is
+    eval {
+        local $/ = undef;
+        $input = ref $src ? <$src> : $src;
+    };
+    if ($@) {
+        ntests(1); ok(0);
+        warn "Cannot read input text from $src\n";
+        return undef;
+    }
+
+    # remove any comment lines
+    $input =~ s/^#.*?\n//gm;
+
+    # remove anything before '-- start --' and/or after '-- stop --'
+    $input = $' if $input =~ /\s*--\s*start\s*--\s*/;
+    $input = $` if $input =~ /\s*--\s*stop\s*--\s*/;
+
+    @tests = split(/^\s*--\s*test\s*--\s*\n/im, $input);
+
+    # if the first line of the file was '--test--' (optional) then the 
+    # first test will be empty and can be discarded
+    shift(@tests) if $tests[0] =~ /^\s*$/;
+
+    ntests(3 + scalar(@tests) * 2);    # 3 fixed tests + 2 per template
+
+    # first test is that Template loaded OK, which it did
+    ok(1, 'running test_expect()');
+
+    # optional second param may contain a Template reference or a HASH ref
+    # of constructor options, or may be undefined
+    if (ref($tproc) eq 'HASH') {
+        # create Template object using hash of config items
+        $tproc = Template->new($tproc)
+            || die Template->error(), "\n";
+    }
+    elsif (ref($tproc) eq 'ARRAY') {
+        # list of [ name => $tproc, name => $tproc ], use first $tproc
+        $ttprocs = { @$tproc };
+        $tproc   = $tproc->[1];
+    }
+    elsif (! ref $tproc) {
+        $tproc = Template->new()
+            || die Template->error(), "\n";
+    }
+    # otherwise, we assume it's a Template reference
+
+    # test: template processor created OK
+    ok($tproc, 'template processor is engaged');
+
+    # third test is that the input read ok, which it did
+    ok(1, 'input read and split into ' . scalar @tests . ' tests');
+
+    # the remaining tests are defined in @tests...
+    foreach $input (@tests) {
+        $count++;
+        my $name = '';
+        # an optional "-- name: ... --" line labels the test in the output
+        if ($input =~ s/^\s*-- name:? (.*?) --\s*\n//im) {
+            $name = $1; 
+        }
+        else {
+            $name = "template text $count";
+        }
+
+        # split input by a line like "-- expect --"
+        ($input, $expect) = 
+            split(/^\s*--\s*expect\s*--\s*\n/im, $input);
+        $expect = '' 
+            unless defined $expect;
+
+        $output = '';
+
+        # input text may be prefixed with "-- use name --" to indicate a
+        # Template object in the $ttprocs hash which we should use; the
+        # selection persists for later tests until another is chosen
+        if ($input =~ s/^\s*--\s*use\s+(\S+)\s*--\s*\n//im) {
+            my $ttname = $1;
+            my $ttlookup;
+            if ($ttlookup = $ttprocs->{ $ttname }) {
+                $tproc = $ttlookup;
+            }
+            else {
+                warn "no such template object to use: $ttname\n";
+            }
+        }
+
+        # process input text
+        $tproc->process(\$input, $params, \$output) || do {
+            warn "Template process failed: ", $tproc->error(), "\n";
+            # report failure and automatically fail the expect match
+            ok(0, "$name process FAILED: " . subtext($input));
+            ok(0, '(obviously did not match expected)');
+            next;
+        };
+
+        # processed OK
+        ok(1, "$name processed OK: " . subtext($input));
+
+        # another hack: if the '-- expect --' section starts with 
+        # '-- process --' then we process the expected output 
+        # before comparing it with the generated output.  This is
+        # slightly twisted but it makes it possible to run tests 
+        # where the expected output isn't static.  See t/date.t for
+        # an example.
+
+        if ($expect =~ s/^\s*--+\s*process\s*--+\s*\n//im) {
+            my $out;
+            $tproc->process(\$expect, $params, \$out) || do {
+                warn("Template process failed (expect): ", 
+                     $tproc->error(), "\n");
+                # report failure and automatically fail the expect match
+                ok(0, "failed to process expected output ["
+                   . subtext($expect) . ']');
+                next;
+            };
+            $expect = $out;
+        };      
+        
+        # strip any trailing blank lines from expected and real output
+        foreach ($expect, $output) {
+            s/[\n\r]*\Z//mg;
+        }
+        
+        $match = ($expect eq $output) ? 1 : 0;
+        if (! $match || $DEBUG) {
+            print "MATCH FAILED\n"
+                unless $match;
+            # show newlines as a literal \n unless $PRESERVE is set
+            my ($copyi, $copye, $copyo) = ($input, $expect, $output);
+            unless ($PRESERVE) {
+                foreach ($copyi, $copye, $copyo) {
+                    s/\n/\\n/g;
+                }
+            }
+            printf(" input: [%s]\nexpect: [%s]\noutput: [%s]\n", 
+                   $copyi, $copye, $copyo);
+        }
+        
+        ok($match, $match ? "$name matched expected" : "$name did not match expected");
+    };
+}
+
+#------------------------------------------------------------------------
+# callsign()
+#
+# Returns a hash array mapping lower a..z to their phonetic alphabet 
+# equivalent.
+#------------------------------------------------------------------------
+
+sub callsign {
+    my $letter = 'a';
+    # string auto-increment steps 'a'..'z' alongside the 26 phonetic words
+    return { map { ($letter++, $_) } qw( 
+        alpha bravo charlie delta echo foxtrot golf hotel india 
+        juliet kilo lima mike november oscar papa quebec romeo 
+        sierra tango umbrella victor whisky x-ray yankee zulu ) };
+}
+
+
+#------------------------------------------------------------------------
+# banner($text)
+# 
+# Prints a banner with the specified text if $DEBUG is set.
+#------------------------------------------------------------------------
+
+sub banner {
+    return unless $DEBUG;
+    my $rule = "-" x 72;
+    my $done = $ok_count ? $ok_count - 1 : scalar @results;
+    print $rule, "\n" . join('', @_) . " ($done tests completed)\n", $rule, "\n";
+}
+
+
+sub subtext {
+    (my $snippet = shift) =~ s/\s*$//sg;
+    # keep test labels short: first 32 characters plus an ellipsis
+    $snippet = substr($snippet, 0, 32) . '...' if length($snippet) > 32;
+    $snippet =~ s/\n/\\n/g;
+    return $snippet;
+}
+
+
+1;
+
+__END__
+
+=head1 NAME
+
+Template::Test - Module for automating TT2 test scripts
+
+=head1 SYNOPSIS
+
+    use Template::Test;
+   
+    $Template::Test::DEBUG = 0;   # set this true to see each test running
+    $Template::Test::EXTRA = 2;   # 2 extra tests follow test_expect()...
+    
+    # ok() can be called any number of times before test_expect
+    ok( $true_or_false )
+    
+    # test_expect() splits $input into individual tests, processes each 
+    # and compares generated output against expected output
+    test_expect($input, $template, \%replace );
+    
+    # $input is text or filehandle (e.g. DATA section after __END__)
+    test_expect( $text );
+    test_expect( \*DATA );
+    
+    # $template is a Template object or configuration hash
+    my $template_cfg = { ... };
+    test_expect( $input, $template_cfg );
+    my $template_obj = Template->new($template_cfg);
+    test_expect( $input, $template_obj );
+    
+    # $replace is a hash reference of template variables
+    my $replace = {
+        a => 'alpha',
+        b => 'bravo'
+    };
+    test_expect( $input, $template, $replace );
+    
+    # ok() called after test_expect should be declared in $EXTRA (2)
+    ok( $true_or_false )   
+    ok( $true_or_false )   
+
+=head1 DESCRIPTION
+
+The C<Template::Test> module defines the L<test_expect()> and other related
+subroutines which can be used to automate test scripts for the
+Template Toolkit.  See the numerous tests in the F<t> sub-directory of
+the distribution for examples of use.
+
+=head1 PACKAGE SUBROUTINES
+
+=head2 test_expect()
+
+The C<test_expect()> subroutine splits an input document into a number
+of separate tests, processes each one using the Template Toolkit and
+then compares the generated output against an expected output, also
+specified in the input document.  It generates the familiar 
+C<ok>/C<not ok> output compatible with C<Test::Harness>.
+
+The test input should be specified as a text string or a reference to
+a filehandle (e.g. C<GLOB> or C<IO::Handle>) from which it can be read.  In 
+particular, this allows the test input to be placed after the C<__END__>
+marker and read via the C<DATA> filehandle.
+
+    use Template::Test;
+    
+    test_expect(\*DATA);
+    
+    __END__
+    # this is the first test (this is a comment)
+    -- test --
+    blah blah blah [% foo %]
+    -- expect --
+    blah blah blah value_of_foo
+    
+    # here's the second test (no surprise, so is this)
+    -- test --
+    more blah blah [% bar %]
+    -- expect --
+    more blah blah value_of_bar
+
+Blank lines between test sections are generally ignored.  Any line starting
+with C<#> is treated as a comment and is ignored.
+
+The second and third parameters to C<test_expect()> are optional.  The second
+may be either a reference to a Template object which should be used to 
+process the template fragments, or a reference to a hash array containing
+configuration values which should be used to instantiate a new Template
+object.
+
+    # pass reference to config hash
+    my $config = {
+        INCLUDE_PATH => '/here/there:/every/where',
+        POST_CHOMP   => 1,
+    };
+    test_expect(\*DATA, $config);
+    
+    # or create Template object explicitly
+    my $template = Template->new($config);
+    test_expect(\*DATA, $template);
+
+The third parameter may be used to reference a hash array of template
+variables which should be defined when processing the tests.  This is
+passed to the L<Template> L<process()|Template#process()> method.
+
+    my $replace = {
+        a => 'alpha',
+        b => 'bravo',
+    };
+    
+    test_expect(\*DATA, $config, $replace);
+
+The second parameter may be left undefined to specify a default L<Template>
+configuration.
+
+    test_expect(\*DATA, undef, $replace);
+
+For testing the output of different L<Template> configurations, a
+reference to a list of named L<Template> objects also may be passed as
+the second parameter.
+
+    my $tt1 = Template->new({ ... });
+    my $tt2 = Template->new({ ... });
+    my $tts = [ one => $tt1, two => $tt2 ];
+
+The first object in the list is used by default.  Other objects may be 
+switched in with a 'C<-- use $name -->' marker.  This should immediately 
+follow a 'C<-- test -->' line.  That object will then be used for the rest 
+of the test, or until a different object is selected.
+
+    -- test --
+    -- use one --
+    [% blah %]
+    -- expect --
+    blah, blah
+    
+    -- test --
+    still using one...
+    -- expect --
+    ...
+    
+    -- test --
+    -- use two --
+    [% blah %]
+    -- expect --
+    blah, blah, more blah
+
+The C<test_expect()> sub counts the number of tests, and then calls L<ntests()> 
+to generate the familiar "C<1..$ntests\n>" test harness line.  Each 
+test defined generates two test numbers.  The first indicates 
+that the input was processed without error, and the second that the 
+output matches that expected. 
+
+Additional tests may be run before C<test_expect()> by calling L<ok()>. These
+test results are cached until L<ntests()> is called and the final number of
+tests can be calculated. Then, the "C<1..$ntests>" line is output, along with
+"C<ok $n>" / "C<not ok $n>" lines for each of the cached test results.
+Subsequent calls to L<ok()> then generate an output line immediately.
+
+    my $something = SomeObject->new();
+    ok( $something );
+    
+    my $other = AnotherThing->new();
+    ok( $other );
+    
+    test_expect(\*DATA);
+
+If any tests are to follow after C<test_expect()> is called then these 
+should be pre-declared by setting the C<$EXTRA> package variable.  This
+value (default: C<0>) is added to the grand total calculated by L<ntests()>.
+The results of the additional tests are also registered by calling L<ok()>.
+
+    $Template::Test::EXTRA = 2;
+    
+    # can call ok() any number of times before test_expect()
+    ok( $did_that_work );             
+    ok( $make_sure );
+    ok( $dead_certain ); 
+    
+    # <some> number of tests...
+    test_expect(\*DATA, $config, $replace);
+    
+    # here's those $EXTRA tests
+    ok( defined $some_result && ref $some_result eq 'ARRAY' );
+    ok( $some_result->[0] eq 'some expected value' );
+
+If you don't want to call C<test_expect()> at all then you can call
+C<ntests($n)> to declare the number of tests and generate the test 
+header line.  After that, simply call L<ok()> for each test, passing 
+a true or false value to indicate that the test passed or failed.
+
+    ntests(2);
+    ok(1);
+    ok(0);
+
+If you're really lazy, you can just call L<ok()> and not bother declaring
+the number of tests at all.  All test results will be cached until the
+end of the script and then printed in one go before the program exits.
+
+    ok( $x );
+    ok( $y );
+
+You can identify only a specific part of the input file for testing
+using the 'C<-- start -->' and 'C<-- stop -->' markers.  Anything before the 
+first 'C<-- start -->' is ignored, along with anything after the next 
+'C<-- stop -->' marker.
+
+    -- test --
+    this is test 1 (not performed)
+    -- expect --
+    this is test 1 (not performed)
+    
+    -- start --
+    
+    -- test --
+    this is test 2
+    -- expect --
+    this is test 2
+        
+    -- stop --
+    
+    ...
+
+=head2 ntests()
+
+Subroutine used to specify how many tests you're expecting to run.
+
+=head2 ok($test)
+
+Generates an "C<ok $n>" or "C<not ok $n>" message if C<$test> is true or false.
+
+=head2 not_ok($test)
+
+The logical inverse of L<ok()>. Prints an "C<ok $n>" message if C<$test> is
+I<false> and vice-versa.
+
+=head2 callsign()
+
+For historical reasons and general utility, the module also defines a
+C<callsign()> subroutine which returns a hash mapping the letters C<a>
+to C<z> to their phonetic alphabet equivalent (e.g. radio callsigns). 
+This is used by many of the test scripts as a known source of variable values.
+
+    test_expect(\*DATA, $config, callsign());
+
+=head2 banner()
+
+This subroutine prints a simple banner including any text passed as parameters.
+The C<$DEBUG> variable must be set for it to generate any output.
+
+    banner('Testing something-or-other');
+
+example output:
+
+    #------------------------------------------------------------
+    # Testing something-or-other (27 tests completed)
+    #------------------------------------------------------------
+
+=head1 PACKAGE VARIABLES
+
+=head2 $DEBUG
+
+The $DEBUG package variable can be set to enable debugging mode.
+
+=head2 $PRESERVE
+
+The $PRESERVE package variable can be set to stop test_expect()
+from converting newlines in the output and expected output into
+the literal strings '\n'. 
+
+=head1 HISTORY
+
+This module started its butt-ugly life as the C<t/texpect.pl> script.  It
+was cleaned up to become the C<Template::Test> module some time around
+version 0.29.  It underwent further cosmetic surgery for version 2.00
+but still retains some remarkable rear-end resemblances.
+
+Since then the C<Test::More> and related modules have appeared on CPAN
+making this module mostly, but not entirely, redundant.
+
+=head1 BUGS / KNOWN "FEATURES"
+
+Imports all methods by default.  This is generally a Bad Thing, but
+this module is only used in test scripts (i.e. at build time) so a) we
+don't really care and b) it saves typing.
+
+The line splitter may be a bit dumb, especially if it sees lines like
+C<-- this --> that aren't supposed to be special markers.  So don't do that.
+
+=head1 AUTHOR
+
+Andy Wardley E<lt>abw at wardley.orgE<gt> L<http://wardley.org/>
+
+=head1 COPYRIGHT
+
+Copyright (C) 1996-2007 Andy Wardley.  All Rights Reserved.
+
+This module is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=head1 SEE ALSO
+
+L<Template>
+
+=cut
+
+# Local Variables:
+# mode: perl
+# perl-indent-level: 4
+# indent-tabs-mode: nil
+# End:
+#
+# vim: expandtab shiftwidth=4:
diff --git a/bench/perl/Template/VMethods.pm b/bench/perl/Template/VMethods.pm
new file mode 100644
index 0000000..002289f
--- /dev/null
+++ b/bench/perl/Template/VMethods.pm
@@ -0,0 +1,587 @@
+#============================================================= -*-Perl-*-
+#
+# Template::VMethods
+#
+# DESCRIPTION
+#   Module defining virtual methods for the Template Toolkit
+#
+# AUTHOR
+#   Andy Wardley   <abw at wardley.org>
+#
+# COPYRIGHT
+#   Copyright (C) 1996-2006 Andy Wardley.  All Rights Reserved.
+#
+#   This module is free software; you can redistribute it and/or
+#   modify it under the same terms as Perl itself.
+#
+# REVISION
+#   $Id: VMethods.pm 1245 2009-07-04 17:02:52Z abw $
+#
+#============================================================================
+
+package Template::VMethods;
+
+use strict;
+use warnings;
+use Scalar::Util 'blessed';
+require Template::Stash;
+
+our $VERSION = 2.16;
+our $DEBUG   = 0 unless defined $DEBUG;        # keep any value assigned before this line runs
+our $PRIVATE = $Template::Stash::PRIVATE;      # pattern for hidden hash keys; consulted by hash_item()
+
+our $ROOT_VMETHODS = {         # vmethod name => code ref, for the root_* subs
+    inc     => \&root_inc,
+    dec     => \&root_dec,
+};
+
+our $TEXT_VMETHODS = {         # vmethod name => code ref, for the text_* subs
+    item    => \&text_item,
+    list    => \&text_list,
+    hash    => \&text_hash,
+    length  => \&text_length,
+    size    => \&text_size,
+    defined => \&text_defined,
+    match   => \&text_match,
+    search  => \&text_search,
+    repeat  => \&text_repeat,
+    replace => \&text_replace,
+    remove  => \&text_remove,
+    split   => \&text_split,
+    chunk   => \&text_chunk,
+    substr  => \&text_substr,
+};
+
+our $HASH_VMETHODS = {         # vmethod name => code ref, for the hash_* subs
+    item    => \&hash_item,
+    hash    => \&hash_hash,
+    size    => \&hash_size,
+    each    => \&hash_each,
+    keys    => \&hash_keys,
+    values  => \&hash_values,
+    items   => \&hash_items,
+    pairs   => \&hash_pairs,
+    list    => \&hash_list,
+    exists  => \&hash_exists,
+    defined => \&hash_defined,
+    delete  => \&hash_delete,
+    import  => \&hash_import,
+    sort    => \&hash_sort,
+    nsort   => \&hash_nsort,
+};
+
+our $LIST_VMETHODS = {         # vmethod name => code ref, for the list_* subs
+    item    => \&list_item,
+    list    => \&list_list,
+    hash    => \&list_hash,
+    push    => \&list_push,
+    pop     => \&list_pop,
+    unshift => \&list_unshift,
+    shift   => \&list_shift,
+    max     => \&list_max,
+    size    => \&list_size,
+    defined => \&list_defined,
+    first   => \&list_first,
+    last    => \&list_last,
+    reverse => \&list_reverse,
+    grep    => \&list_grep,
+    join    => \&list_join,
+    sort    => \&list_sort,
+    nsort   => \&list_nsort,
+    unique  => \&list_unique,
+    import  => \&list_import,
+    merge   => \&list_merge,
+    slice   => \&list_slice,
+    splice  => \&list_splice,
+};
+
+
+#========================================================================
+# root virtual methods
+#========================================================================
+
+sub root_inc {
+    no warnings;               # undefined or non-numeric input must not warn
+    my ($value) = @_;
+    return ++$value;           # incremented copy; the caller's value is untouched
+}
+
+sub root_dec {
+    no warnings;               # undefined or non-numeric input must not warn
+    my ($value) = @_;
+    return --$value;           # decremented copy; the caller's value is untouched
+}
+
+
+#========================================================================
+# text virtual methods
+#========================================================================
+
+sub text_item {                # a text value is returned as-is
+    return $_[0];
+}
+
+sub text_list {                # wrap the text value in a one-element list ref
+    return [ $_[0] ];
+}
+
+sub text_hash {                # wrap the text value in a hash ref under 'value'
+    return { value => $_[0] };
+}
+
+sub text_length {              # length of the text, as reported by length()
+    return length($_[0]);
+}
+
+sub text_size {                # always 1, whatever the text is
+    return 1;
+}
+
+sub text_defined {             # always 1, whatever arguments are passed
+    return 1;
+}
+
+sub text_match {
+    my ($text, $regex, $all) = @_;
+    return $text if !defined $text or !defined $regex;
+    my @found = $all ? ($text =~ /$regex/g)       # every match
+        : ($text =~ /$regex/);                    # first match only
+    return @found ? \@found : '';                 # '' signals "no match"
+}
+
+sub text_search {
+    my ($text, $regex) = @_;
+    return $text if !defined $text or !defined $regex;
+    return $text =~ /$regex/;   # result of the match in the caller's context
+}
+
+sub text_repeat {
+    my ($str, $count) = @_;
+    $str = '' unless defined $str;
+    # a missing, undefined or zero count yields the empty string, so no
+    # default count is ever needed beyond this point
+    return '' unless $count;
+    return $str x $count;
+}
+
+sub text_replace {
+    my ($text, $pattern, $replace, $global) = @_;
+    $text    = '' unless defined $text;
+    $pattern = '' unless defined $pattern;
+    $replace = '' unless defined $replace;
+    $global  = 1  unless defined $global;       # replace every match unless told otherwise
+
+    if ($replace =~ /\$\d+/) {
+        # replacement string may contain backrefs
+        my $expand = sub {                      # expands \\, \$ and $N inside $chunk
+            my ($chunk, $start, $end) = @_;
+            $chunk =~ s{ \\(\\|\$) | \$ (\d+) }{
+                $1 ? $1
+                    : ($2 > $#$start || $2 == 0) ? '' 
+                    : substr($text, $start->[$2], $end->[$2] - $start->[$2]);
+            }exg;
+            $chunk;
+        };
+        if ($global) {
+            $text =~ s{$pattern}{ &$expand($replace, [@-], [@+]) }eg;   # [@-], [@+]: copies of the match offsets
+        } 
+        else {
+            $text =~ s{$pattern}{ &$expand($replace, [@-], [@+]) }e;
+        }
+    }
+    else {
+        if ($global) {
+            $text =~ s/$pattern/$replace/g;     # $replace holds no $N here: plain substitution
+        } 
+        else {
+            $text =~ s/$pattern/$replace/;
+        }
+    }
+    return $text;
+}
+
+sub text_remove {
+    my ($text, $regex) = @_;
+    return $text if !defined $text or !defined $regex;
+    $text =~ s/$regex//g;      # strip every match from the local copy
+    return $text;
+}
+    
+sub text_split {               # split text into a list ref, with optional separator and limit
+    my ($str, $split, $limit) = @_;
+    $str = '' unless defined $str;
+    
+    # we have to be very careful about spelling out each possible 
+    # combination of arguments because split() is very sensitive
+    # to them, for example C<split(' ', ...)> behaves differently 
+    # to C<$space=' '; split($space, ...)>
+    
+    if (defined $limit) {
+        return [ defined $split 
+                 ? split($split, $str, $limit)
+                 : split(' ', $str, $limit) ];  # no separator given: literal ' ' form
+    }
+    else {
+        return [ defined $split 
+                 ? split($split, $str)
+                 : split(' ', $str) ];          # no separator given: literal ' ' form
+    }
+}
+
+sub text_chunk {
+    my ($string, $size) = @_;
+    my @list;
+    $size ||= 1;                # a missing or zero size means chunks of 1
+    if ($size < 0) {
+        # sexeger!  It's faster to reverse the string, search
+        # it from the front and then reverse the output than to 
+        # search it from the end, believe it or not!
+        $string = reverse $string;
+        $size = -$size;
+        unshift(@list, scalar reverse $1) 
+            while ($string =~ /((.{$size})|(.+))/g);
+    }
+    else {
+        push(@list, $1) while ($string =~ /((.{$size})|(.+))/g);
+    }
+    return \@list;
+}
+
+sub text_substr {
+    my ($text, $offset, $length, $replacement) = @_;
+    $offset ||= 0;
+
+    # no length: everything from $offset to the end of the text
+    return substr( $text, $offset )
+        unless defined $length;
+
+    # length but no replacement: extract that many characters
+    return substr( $text, $offset, $length )
+        unless defined $replacement;
+
+    # length and replacement: splice the replacement into the local copy
+    # and return the modified text rather than the part that was removed
+    substr( $text, $offset, $length, $replacement );
+    return $text;
+}
+
+
+#========================================================================
+# hash virtual methods
+#========================================================================
+
+
+sub hash_item {
+    my ($hash, $key) = @_;
+    $key = '' unless defined $key;
+    return if $PRIVATE && $key =~ /$PRIVATE/;   # keys matching $PRIVATE are never returned
+    return $hash->{ $key };
+}
+
+sub hash_hash {                # a hash is returned as-is (same reference)
+    return $_[0];
+}
+
+sub hash_size {                # number of keys in the hash
+    return scalar keys %{ $_[0] };
+}
+
+sub hash_each {
+    # this will be changed in TT3 to do what hash_pairs() does
+    return [ %{ $_[0] } ];     # flattened key, value, key, value, ... list
+}
+
+sub hash_keys {                # list ref of the keys, in keys() order
+    return [ keys %{ $_[0] } ];
+}
+
+sub hash_values {              # list ref of the values, in values() order
+    return [ values %{ $_[0] } ];
+}
+
+sub hash_items {               # flattened key, value, key, value, ... list ref
+    return [ %{ $_[0] } ];
+}
+
+sub hash_pairs {
+    my ($hash) = @_;
+    # one { key => ..., value => ... } record per entry, ordered by key
+    my @pairs = map { +{ key => $_, value => $hash->{ $_ } } }
+                sort keys %$hash;
+    return \@pairs;
+}
+
+sub hash_list {
+    my ($hash, $mode) = @_;
+    $mode ||= '';
+    return [   keys %$hash ] if $mode eq 'keys';
+    return [ values %$hash ] if $mode eq 'values';
+    return [        %$hash ] if $mode eq 'each';
+    # for now we do what pairs does but this will be changed
+    # in TT3 to return [ $hash ] by default
+    return [
+        map { +{ key => $_, value => $hash->{ $_ } } } sort keys %$hash
+    ];
+}
+
+sub hash_exists {              # true if the named key exists in the hash
+    return exists $_[0]->{ $_[1] };
+}
+
+sub hash_defined { 
+    # report whether the requested item is defined, or return 1 if no
+    # argument is given to indicate that the hash itself is defined
+    my $hash = shift;
+    return @_ ? defined $hash->{ $_[0] } : 1;
+}
+
+sub hash_delete {              # delete every key named in the remaining arguments
+    my $hash = shift; 
+    delete $hash->{ $_ } for @_;
+}
+
+sub hash_import {
+    my ($hash, $source) = @_;
+    $source = {} unless ref $source eq 'HASH';           # anything but a hash ref imports nothing
+    $hash->{ $_ } = $source->{ $_ } for keys %$source;   # entries from $source overwrite existing ones
+    return '';
+}
+
+sub hash_sort {                # keys ordered by their lowercased values (string compare)
+    my $hash = shift;
+    return [ sort { lc($hash->{$a}) cmp lc($hash->{$b}) } keys %$hash ];
+}
+
+sub hash_nsort {               # keys ordered by their values (numeric compare)
+    my $hash = shift;
+    return [ sort { $hash->{$a} <=> $hash->{$b} } keys %$hash ];
+}
+
+
+#========================================================================
+# list virtual methods
+#========================================================================
+
+
+sub list_item {                # element at the given index; index defaults to 0
+    return $_[0]->[ $_[1] || 0 ];
+}
+
+sub list_list {                # a list is returned as-is (same reference)
+    return $_[0];
+}
+
+sub list_hash {
+    my ($list, @args) = @_;
+    if (@args) {
+        my $index = $args[0] || 0;      # keys count upwards from this offset
+        return { map { ($index++, $_) } @$list };
+    }
+    no warnings;                        # silence warnings while building the hash
+    return { @$list };                  # no argument: items taken as key, value pairs
+}
+
+sub list_push {
+    my ($list, @items) = @_;
+    push @$list, @items;       # append in place
+    return '';
+}
+
+sub list_pop {                 # remove and return the last element
+    my ($list) = @_;
+    return pop @$list;
+}
+
+sub list_unshift {
+    my ($list, @items) = @_;
+    unshift @$list, @items;    # prepend in place
+    return '';
+}
+
+sub list_shift {               # remove and return the first element
+    my ($list) = @_;
+    return shift @$list;
+}
+
+sub list_max {
+    no warnings;
+    my ($list) = @_;
+    return $#$list;            # index of the last element
+}
+
+sub list_size {
+    no warnings;
+    my ($list) = @_;
+    return $#$list + 1;        # last index plus one
+}
+
+sub list_defined {
+    # report whether the requested item is defined, or return 1 if no
+    # argument is given to indicate that the list itself is defined
+    my $list = shift;
+    return @_ ? defined $list->[$_[0]] : 1;
+}
+
+sub list_first {
+    my ($list, @args) = @_;
+    return $list->[0] unless @args;             # no count: the first item itself
+    return [ @{$list}[ 0 .. $args[0] - 1 ] ];   # count: list ref of that many leading items
+}
+
+sub list_last {
+    my ($list, @args) = @_;
+    return $list->[-1] unless @args;            # no count: the last item itself
+    return [ @{$list}[ -$args[0] .. -1 ] ];     # count: list ref of that many trailing items
+}
+
+sub list_reverse {             # new list ref with the items in reverse order
+    my ($list) = @_;
+    return [ reverse @$list ];
+}
+
+sub list_grep {
+    my ($list, $regex) = @_;
+    $regex ||= '';
+    return [ grep /$regex/, @$list ];   # new list ref of the items matching the pattern
+}
+
+sub list_join {
+    my ($list, $glue) = @_;
+    $glue = ' ' unless defined $glue;           # join on a single space by default
+    return join $glue, map { defined $_ ? $_ : '' } @$list;
+}
+
+sub _list_sort_make_key {
+   my ($item, $fields) = @_;
+   my @parts;
+
+   if (ref($item) eq 'HASH') {
+       # hash: take the value stored under each field name
+       @parts = map { $item->{ $_ } } @$fields;
+   }
+   elsif (blessed $item) {
+       # object: call each field as a method where possible, else use the object
+       @parts = map { $item->can($_) ? $item->$_() : $item } @$fields;
+   }
+   else { @parts = $item }     # anything else serves as its own key
+   
+   # ugly hack to generate a single string using a delimiter that is
+   # unlikely (but not impossible) to be found in the wild.
+   return lc join('/*^UNLIKELY^*/', map { defined $_ ? $_ : '' } @parts);
+}
+
+sub list_sort {
+    my ($list, @fields) = @_;
+    return $list unless @$list > 1;         # 0 or 1 items: return the list itself, uncopied
+    return [ 
+        @fields                          # Schwartzian Transform 
+        ?   map  { $_->[0] }                # for case insensitivity
+            sort { $a->[1] cmp $b->[1] }
+            map  { [ $_, _list_sort_make_key($_, \@fields) ] }
+            @$list
+        :  map  { $_->[0] }
+           sort { $a->[1] cmp $b->[1] }
+           map  { [ $_, lc $_ ] }           # no fields given: key is the lowercased item
+           @$list,
+    ];
+}
+
+sub list_nsort {
+    my ($list, @fields) = @_;
+    return $list unless @$list > 1;     # 0 or 1 items: return the list itself, uncopied
+    return [ 
+        @fields                         # Schwartzian Transform 
+        ?  map  { $_->[0] }             # keys are compared numerically (<=>)
+           sort { $a->[1] <=> $b->[1] }
+           map  { [ $_, _list_sort_make_key($_, \@fields) ] }
+           @$list 
+        :  map  { $_->[0] }
+           sort { $a->[1] <=> $b->[1] }
+           map  { [ $_, lc $_ ] }       # no fields given: key is the lowercased item
+           @$list,
+    ];
+}
+
+sub list_unique {
+    my %seen;                  # counts how often each item has appeared so far
+    return [ grep { !$seen{$_}++ } @{ $_[0] } ];
+}
+
+sub list_import {
+    my ($list, @others) = @_;
+    push @$list, grep { defined } map { ref $_ eq 'ARRAY' ? @$_ : undef } @others;
+    return $list;              # the same list ref, now extended in place
+}
+
+sub list_merge {               # like list_import() but returns a new list ref
+    my ($list, @others) = @_;
+    return [ @$list, grep { defined } map { ref $_ eq 'ARRAY' ? @$_ : undef } @others ];
+}
+
+sub list_slice {
+    my ($list, $first, $last) = @_;
+    $first  = 0        unless $first;
+    $last   = $#$list  unless defined $last;    # default: up to the final item
+    $first += @$list   if $first < 0;           # negative indices count
+    $last  += @$list   if $last  < 0;           # back from the end of the list
+    return [ @{$list}[ $first .. $last ] ];
+}
+
+sub list_splice {
+    my ($list, $offset, $length, @replace) = @_;
+
+    # no replacement items: pass splice() only the arguments that were
+    # actually supplied, preferring the length form, then the offset
+    # form, then a bare splice of the whole list
+    if (!@replace) {
+        return [ splice @$list, $offset, $length ] if defined $length;
+        return [ splice @$list, $offset ]          if defined $offset;
+        return [ splice(@$list) ];
+    }
+
+    # @replace can contain a list of multiple replace items, or
+    # be a single reference to a list
+    @replace = @{ $replace[0] }
+        if @replace == 1 && ref $replace[0] eq 'ARRAY';
+
+    return [ splice @$list, $offset, $length, @replace ];
+}
+
+1;
+
+__END__
+
+=head1 NAME
+
+Template::VMethods - Virtual methods for variables
+
+=head1 DESCRIPTION
+
+The C<Template::VMethods> module implements the virtual methods
+that can be applied to variables.
+
+Please see L<Template::Manual::VMethods> for further information.
+
+=head1 AUTHOR
+
+Andy Wardley E<lt>abw at wardley.orgE<gt> L<http://wardley.org/>
+
+=head1 COPYRIGHT
+
+Copyright (C) 1996-2007 Andy Wardley.  All Rights Reserved.
+
+This module is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=head1 SEE ALSO
+
+L<Template::Stash>, L<Template::Manual::VMethods>
+
+=cut
+
+# Local Variables:
+# mode: perl
+# perl-indent-level: 4
+# indent-tabs-mode: nil
+# End:
+#
+# vim: expandtab shiftwidth=4:
diff --git a/bench/perl/Template/View.pm b/bench/perl/Template/View.pm
new file mode 100644
index 0000000..416065a
--- /dev/null
+++ b/bench/perl/Template/View.pm
@@ -0,0 +1,743 @@
+#============================================================= -*-Perl-*-
+#
+# Template::View
+#
+# DESCRIPTION
+#   A custom view of a template processing context.  Can be used to 
+#   implement custom "skins".
+#
+# AUTHOR
+#   Andy Wardley   <abw at kfs.org>
+#
+# COPYRIGHT
+#   Copyright (C) 2000 Andy Wardley.  All Rights Reserved.
+#
+#   This module is free software; you can redistribute it and/or
+#   modify it under the same terms as Perl itself.
+#
+# TODO
+#  * allowing print to have a hash ref as final args will cause problems
+#    if you do this: [% view.print(hash1, hash2, hash3) %].  Current
+#    work-around is to do [% view.print(hash1); view.print(hash2); 
+#    view.print(hash3) %] or [% view.print(hash1, hash2, hash3, { }) %]
+#
+#============================================================================
+
+package Template::View;
+
+use strict;
+use warnings;
+use base 'Template::Base';
+
+our $VERSION  = 2.91;
+our $DEBUG    = 0 unless defined $DEBUG;
+our @BASEARGS = qw( context );
+our $AUTOLOAD;
+our $MAP = {
+    HASH    => 'hash',
+    ARRAY   => 'list',
+    TEXT    => 'text',
+    default => '',
+};
+
+
+#------------------------------------------------------------------------
+# _init(\%config)
+#
+# Initialisation method called by the Template::Base class new() 
+# constructor.  $self->{ context } has already been set, by virtue of
+# being named in @BASEARGS.  Remaining config arguments are presented 
+# as a hash reference.
+#------------------------------------------------------------------------
+
+sub _init {
+    my ($self, $config) = @_;
+
+    # move 'context' somewhere more private
+    $self->{ _CONTEXT } = $self->{ context };
+    delete $self->{ context };
+    
+    # generate table mapping object types to templates
+    my $map = $config->{ map } || { };
+    $map->{ default } = $config->{ default } unless defined $map->{ default };
+    $self->{ map } = {
+        %$MAP,                  # package defaults first ...
+        %$map,                  # ... overridden by any configured entries
+    };
+
+    # local BLOCKs definition table
+    $self->{ _BLOCKS } = $config->{ blocks } || { };
+    
+    # name of presentation method which printed objects might provide
+    $self->{ method } = defined $config->{ method } 
+                              ? $config->{ method } : 'present';
+    
+    # view is sealed by default preventing variable update after 
+    # definition, however we don't actually seal a view until the 
+    # END of the view definition
+    my $sealed = $config->{ sealed };
+    $sealed = 1 unless defined $sealed;
+    $self->{ sealed } = $sealed ? 1 : 0;
+
+    # copy remaining config items from $config or set defaults
+    foreach my $arg (qw( base prefix suffix notfound silent )) {
+        $self->{ $arg } = $config->{ $arg } || '';
+    }
+
+    # name of data item used by view()
+    $self->{ item } = $config->{ item } || 'item';
+
+    # map methods of form ${include_prefix}_foobar() to include('foobar')?
+    $self->{ include_prefix } = $config->{ include_prefix } || 'include_';
+    # what about mapping foobar() to include('foobar')?
+    $self->{ include_naked  } = defined $config->{ include_naked } 
+                                      ? $config->{ include_naked } : 1;
+
+    # map methods of form ${view_prefix}_foobar() to view('foobar')?
+    $self->{ view_prefix } = $config->{ view_prefix } || 'view_';
+    # what about mapping foobar() to view('foobar')?
+    $self->{ view_naked  } = $config->{ view_naked  } || 0;
+
+    # the view is initially unsealed, allowing directives in the initial 
+    # view template to create data items via the AUTOLOAD; once sealed via
+    # call to seal(), the AUTOLOAD will not update any internal items.
+    delete @$config{ qw( base method map default prefix suffix notfound item 
+                         include_prefix include_naked silent sealed
+                         view_prefix view_naked blocks ) };
+    $config = { %{ $self->{ base }->{ data } }, %$config }
+        if $self->{ base };
+    $self->{ data   } = $config;        # whatever is left over is view data
+    $self->{ SEALED } = 0;
+
+    return $self;
+}
+
+
+#------------------------------------------------------------------------
+# seal()
+# unseal()
+#
+# Seal or unseal the view to allow/prevent new data items from being
+# automatically created by the AUTOLOAD method.
+#------------------------------------------------------------------------
+
+sub seal {
+    my ($self) = @_;
+    return $self->{ SEALED } = $self->{ sealed };   # adopt the configured 'sealed' flag
+}
+
+sub unseal {
+    my ($self) = @_;
+    return $self->{ SEALED } = 0;      # clear the flag regardless of the 'sealed' option
+}
+
+
+#------------------------------------------------------------------------
+# clone(\%config)
+#
+# Cloning method which takes a copy of $self and then applies to it any 
+# modifications specified in the $config hash passed as an argument.
+# Configuration items may also be specified as a list of "name => $value"
+# arguments.  Returns a reference to the cloned Template::View object.
+#
+# NOTE: may need to copy BLOCKS???
+#------------------------------------------------------------------------
+
+sub clone {
+    my $self   = shift;
+    my $clone  = bless { %$self }, ref $self;
+    # copy a config hash ref so the deletes below never strip the caller's hash
+    my $config = ref $_[0] eq 'HASH' ? { %{ shift() } } : { @_ };
+
+    # merge maps
+    $clone->{ map } = {
+        %{ $self->{ map } },
+        %{ $config->{ map } || { } },
+    };
+
+    # "map => { default=>'xxx' }" can be specified as "default => 'xxx'"
+    $clone->{ map }->{ default } = $config->{ default }
+        if defined $config->{ default };
+
+    # update any remaining config items
+    my @args = qw( base prefix suffix notfound item method include_prefix 
+                   include_naked view_prefix view_naked );
+    foreach my $arg (@args) {
+        $clone->{ $arg } = $config->{ $arg } if defined $config->{ $arg };
+    }
+    delete @$config{ @args, qw( default map ) };
+
+    # anything left is data
+    my $data = $clone->{ data } = { %{ $self->{ data } } };
+    @$data{ keys %$config } = values %$config;
+
+    return $clone;
+}
+
+
+#------------------------------------------------------------------------
+# print(@items, ..., \%config)
+#
+# Prints @items in turn by mapping each to an appropriate template using 
+# the internal 'map' hash.  If an entry isn't found and the item is an 
+# object that implements the method named in the internal 'method' item,
+# (default: 'present'), then the method will be called passing a reference
+# to $self, against which the presenter method may make callbacks (e.g. 
+# to view_item()).  If the presenter method isn't implemented, then the 
+# 'default' map entry is consulted and used if defined.  The final argument 
+# may be a reference to a hash array providing local overrides to the internal
+# defaults for various items (prefix, suffix, etc).  In the presence
+# of this parameter, a clone of the current object is first made, applying
+# any configuration updates, and control is then delegated to it.
+#------------------------------------------------------------------------
+
+sub print {
+    my $self = shift;
+
+    # if final config hash is specified then create a clone and delegate to it
+    # NOTE: potential problem when called print(\%data_hash1, \%data_hash2);
+    if ((scalar @_ > 1) && (ref $_[-1] eq 'HASH')) {
+        my $cfg = pop @_;
+        my $clone = $self->clone($cfg)
+            || return;
+        return $clone->print(@_) 
+            || $self->error($clone->error());
+    }
+    my ($item, $type, $template, $present);
+    my $method = $self->{ method };
+    my $map = $self->{ map };
+    my $output = '';
+    
+    # print each argument
+    foreach $item (@_) {
+        my $newtype;
+        
+        if (! ($type = ref $item)) {
+            # non-references are TEXT
+            $type = 'TEXT';
+            $template = $map->{ $type };
+        }
+        elsif (! defined ($template = $map->{ $type })) {
+            # no specific map entry for object, maybe it implements a 
+            # 'present' (or other) method?
+            if ( $method && UNIVERSAL::can($item, $method) ) {
+                $present = $item->$method($self);       ## call item method
+                # undef returned indicates error, note that we expect 
+                # $item to have called error() on the view
+                return unless defined $present;
+                $output .= $present;
+                next;                                   ## NEXT
+            }   
+            elsif ( ref($item) eq 'HASH' 
+                    && defined($newtype = $item->{$method})
+                    && defined($template = $map->{"$method=>$newtype"})) {
+            }   # (empty on purpose: the condition itself assigned $template)
+            elsif ( defined($newtype)
+                    && defined($template = $map->{"$method=>*"}) ) {
+                $template =~ s/\*/$newtype/;            # fill the wildcard with the item's type
+            }    
+            elsif (! ($template = $map->{ default }) ) {
+                # default not defined, so construct template name from type
+                ($template = $type) =~ s/\W+/_/g;
+            }
+        }
+#       else {
+#           $self->DEBUG("defined map type for $type: $template\n");
+#       }
+        $self->DEBUG("printing view '", $template || '', "', $item\n") if $DEBUG;
+        $output .= $self->view($template, $item)
+            if $template;                               # items with no template add nothing
+    }
+    return $output;
+}
+
+
+#------------------------------------------------------------------------
+# view($template, $item, \%vars)
+#
+# Wrapper around include() which expects a template name, $template,
+# followed by a data item, $item, and optionally, a further hash array
+# of template variables.  The $item is added as an entry to the $vars
+# hash (which is created empty if not passed as an argument) under the
+# name specified by the internal 'item' member, which is appropriately
+# 'item' by default.  Thus an external object present() method can
+# callback against this object method, simply passing a data item to
+# be displayed.  The external object doesn't have to know what the
+# view expects the item to be called in the $vars hash.
+#------------------------------------------------------------------------
+
+sub view {
+    my ($self, $template, $item, @rest) = @_;
+    my $vars = ref $rest[0] eq 'HASH' ? $rest[0] : { @rest };   # hash ref, or name => value list
+    $vars->{ $self->{ item } } = $item if defined $item;        # expose the item under the configured name
+    return $self->include($template, $vars);
+}
+
+
+#------------------------------------------------------------------------
+# include($template, \%vars)
+#
+# INCLUDE a template, $template, mapped according to the current prefix,
+# suffix, default, etc., where $vars is an optional hash reference 
+# containing template variable definitions.  If the template isn't found
+# then the method will default to any 'notfound' template, if defined 
+# as an internal item.
+#------------------------------------------------------------------------
+
+sub include {
+    my ($self, $name, $vars) = @_;
+    my $context = $self->{ _CONTEXT };
+
+    # resolve the name to a template via this view's lookup rules
+    my $resolved = $self->template($name);
+
+    # anything other than a hash ref is replaced by a fresh hash
+    $vars = { } if ref($vars) ne 'HASH';
+
+    # make the view reachable from inside the template unless the
+    # caller already supplied a true 'view' entry
+    $vars->{ view } = $self unless $vars->{ view };
+
+    return $context->include($resolved, $vars);
+}
+
+
+#------------------------------------------------------------------------
+# template($template)
+#
+# Returns a compiled template for the specified template name, according
+# to the current configuration parameters.
+#------------------------------------------------------------------------
+
+sub template {
+    my ($self, $name) = @_;
+    my $context = $self->{ _CONTEXT };
+    # a false name (undefined, empty, 0) is reported as a 'view' error
+    return $context->throw(Template::Constants::ERROR_VIEW,
+                           "no view template specified")
+        unless $name;
+
+    my $notfound = $self->{ notfound };
+    my $base = $self->{ base };
+    my ($template, $block, $error);
+
+    # an entry stored under the bare name in this view's _BLOCKS wins
+    # over any template looked up through the context
+    return $block
+        if ($block = $self->{ _BLOCKS }->{ $name });
+    
+    # try the named template
+    # (template_name() adds the configured prefix/suffix first)
+    $template = $self->template_name($name);
+    $self->DEBUG("looking for $template\n") if $DEBUG;
+    eval { $template = $context->template($template) };
+
+    # try asking the base view if not found
+    # (the base is asked for the bare $name, not the prefixed one)
+    if (($error = $@) && $base) {
+        $self->DEBUG("asking base for $name\n") if $DEBUG;
+        eval { $template = $base->template($name) };
+    }
+
+    # try the 'notfound' template (if defined) if that failed
+    # ($@ is re-read here: it holds the outcome of the most recent eval,
+    # i.e. the base lookup if one was attempted, else the first lookup)
+    if (($error = $@) && $notfound) {
+        unless ($template = $self->{ _BLOCKS }->{ $notfound }) {
+            $notfound = $self->template_name($notfound);
+            $self->DEBUG("not found, looking for $notfound\n") if $DEBUG;
+            eval { $template = $context->template($notfound) };
+
+            # if the 'notfound' lookup fails too, report the earlier
+            # error saved in $error rather than the new one in $@
+            return $context->throw(Template::Constants::ERROR_VIEW, $error)
+                if $@;  # return first error
+        }
+    }
+    elsif ($error) {
+        # lookup failed and no 'notfound' fallback is configured
+        $self->DEBUG("no 'notfound'\n") 
+            if $DEBUG;
+        return $context->throw(Template::Constants::ERROR_VIEW, $error);
+    }
+    return $template;
+}
+
+    
+#------------------------------------------------------------------------
+# template_name($template)
+#
+# Returns the name of the specified template with any appropriate prefix
+# and/or suffix added.
+#------------------------------------------------------------------------
+
+sub template_name {
+    my $self     = shift;
+    my $template = shift;
+
+    # a false name is passed through untouched; everything else is
+    # wrapped in the view's prefix and suffix
+    if ($template) {
+        my $prefix = $self->{ prefix };
+        my $suffix = $self->{ suffix };
+        $template  = $prefix . $template . $suffix;
+    }
+
+    $DEBUG and $self->DEBUG("template name: $template\n");
+    return $template;
+}
+
+
+#------------------------------------------------------------------------
+# default($val)
+#
+# Special case accessor to retrieve/update 'default' as an alias for 
+# '$map->{ default }'.
+#------------------------------------------------------------------------
+
+sub default {
+    my ($self, @args) = @_;
+
+    # with an argument: store it as the map's 'default' entry and return it
+    if (@args) {
+        return $self->{ map }->{ default } = $args[0];
+    }
+
+    # without: return the current 'default' entry
+    return $self->{ map }->{ default };
+}
+
+
+#------------------------------------------------------------------------
+# AUTOLOAD
+#
+#
+# Returns/updates public internal data items (i.e. not prefixed '_' or
+# '.') or presents a view if the method matches the view_prefix item,
+# e.g. view_foo(...) => view('foo', ...).  Similarly, the
+# include_prefix is used, if defined, to map include_foo(...) to
+# include('foo', ...).  If that fails then the entire method name will
+# be used as the name of a template to include iff the include_named
+# parameter is set (default: 1).  Last attempt is to match the entire
+# method name to a view() call, iff view_naked is set.  Otherwise, a
+# 'view' exception is raised reporting the error "no such view member:
+# $method".
+#------------------------------------------------------------------------
+
+sub AUTOLOAD {
+    my $self = shift;
+    # reduce the fully qualified method name to its last component
+    my $item = $AUTOLOAD;
+    $item =~ s/.*:://;
+    # DESTROY also arrives here; it must not be treated as a view member
+    return if $item eq 'DESTROY';
+
+    # The branches below are tried strictly in order; the first match wins.
+    if ($item =~ /^[\._]/) {
+        # names starting with '.' or '_' are private
+        return $self->{ _CONTEXT }->throw(Template::Constants::ERROR_VIEW,
+                            "attempt to view private member: $item");
+    }
+    elsif (exists $self->{ $item }) {
+        # update existing config item (e.g. 'prefix') if unsealed
+        return $self->{ _CONTEXT }->throw(Template::Constants::ERROR_VIEW,
+                            "cannot update config item in sealed view: $item")
+            if @_ && $self->{ SEALED };
+        $self->DEBUG("accessing item: $item\n") if $DEBUG;
+        return @_ ? ($self->{ $item } = shift) : $self->{ $item };
+    }
+    elsif (exists $self->{ data }->{ $item }) {
+        # get/update existing data item (must be unsealed to update)
+        if (@_ && $self->{ SEALED }) {
+            return $self->{ _CONTEXT }->throw(Template::Constants::ERROR_VIEW,
+                                  "cannot update item in sealed view: $item")
+                unless $self->{ silent };
+            # ignore args if silent
+            @_ = ();
+        }
+        $self->DEBUG(@_ ? "updating data item: $item <= $_[0]\n" 
+                        : "returning data item: $item\n") if $DEBUG;
+        return @_ ? ($self->{ data }->{ $item } = shift) 
+                  :  $self->{ data }->{ $item };
+    }
+    elsif (@_ && ! $self->{ SEALED }) {
+        # set data item if unsealed
+        # (no explicit return: the assigned value is the sub's result)
+        $self->DEBUG("setting unsealed data: $item => @_\n") if $DEBUG;
+        $self->{ data }->{ $item } = shift;
+    }
+    # NOTE(review): view_prefix and include_prefix are interpolated into
+    # the patterns below unquoted, so regex metacharacters in a prefix are
+    # interpreted as pattern syntax - confirm this is intended.
+    elsif ($item =~ s/^$self->{ view_prefix }//) {
+        # method name carried the view prefix: the remainder is the template
+        $self->DEBUG("returning view($item)\n") if $DEBUG;
+        return $self->view($item, @_);
+    }
+    elsif ($item =~ s/^$self->{ include_prefix }//) {
+        # method name carried the include prefix: the remainder is the template
+        $self->DEBUG("returning include($item)\n") if $DEBUG;
+        return $self->include($item, @_);
+    }
+    elsif ($self->{ include_naked }) {
+        # use the whole method name as the template to include
+        $self->DEBUG("returning naked include($item)\n") if $DEBUG;
+        return $self->include($item, @_);
+    }
+    elsif ($self->{ view_naked }) {
+        # use the whole method name as the template to view
+        $self->DEBUG("returning naked view($item)\n") if $DEBUG;
+        return $self->view($item, @_);
+    }
+    else {
+        return $self->{ _CONTEXT }->throw(Template::Constants::ERROR_VIEW,
+                                         "no such view member: $item");
+    }
+}
+
+
+1;
+
+
+__END__
+
+=head1 NAME
+
+Template::View - customised view of a template processing context
+
+=head1 SYNOPSIS
+
+    # define a view
+    [% VIEW view
+            # some standard args
+            prefix        => 'my_', 
+            suffix        => '.tt2',
+            notfound      => 'no_such_file'
+            ...
+
+            # any other data
+            title         => 'My View title'
+            other_item    => 'Joe Random Data'
+            ...
+    %]
+       # add new data definitions, via 'my' self reference
+       [% my.author = "$abw.name <$abw.email>" %]
+       [% my.copy   = "© Copyright 2000 $my.author" %]
+
+       # define a local block
+       [% BLOCK header %]
+       This is the header block, title: [% title or my.title %]
+       [% END %]
+
+    [% END %]
+
+    # access data items for view
+    [% view.title %]
+    [% view.other_item %]
+
+    # access blocks directly ('include_naked' option, set by default)
+    [% view.header %]
+    [% view.header(title => 'New Title') %]
+
+    # non-local templates have prefix/suffix attached
+    [% view.footer %]           # => [% INCLUDE my_footer.tt2 %]
+
+    # more verbose form of block access
+    [% view.include( 'header', title => 'The Header Title' ) %]
+    [% view.include_header( title => 'The Header Title' ) %]
+
+    # very short form of above ('include_naked' option, set by default)
+    [% view.header( title => 'The Header Title' ) %]
+
+    # non-local templates have prefix/suffix attached
+    [% view.footer %]           # => [% INCLUDE my_footer.tt2 %]
+
+    # fallback on the 'notfound' template ('my_no_such_file.tt2')
+    # if template not found 
+    [% view.include('missing') %]
+    [% view.include_missing %]
+    [% view.missing %]
+
+    # print() includes a template relevant to argument type
+    [% view.print("some text") %]     # type=TEXT, template='text'
+
+    [% BLOCK my_text.tt2 %]           # 'text' with prefix/suffix
+       Text: [% item %]
+    [% END %]
+
+    # now print() a hash ref, mapped to 'hash' template
+    [% view.print(some_hash_ref) %]   # type=HASH, template='hash'
+
+    [% BLOCK my_hash.tt2 %]           # 'hash' with prefix/suffix
+       hash keys: [% item.keys.sort.join(', ') %]
+    [% END %]
+
+    # now print() a list ref, mapped to 'list' template
+    [% view.print(my_list_ref) %]     # type=ARRAY, template='list'
+
+    [% BLOCK my_list.tt2 %]           # 'list' with prefix/suffix
+       list: [% item.join(', ') %]
+    [% END %]
+
+    # print() maps 'My::Object' to 'My_Object'
+    [% view.print(myobj) %]
+
+    [% BLOCK my_My_Object.tt2 %]
+       [% item.this %], [% item.that %]
+    [% END %]
+
+    # update mapping table
+    [% view.map.ARRAY = 'my_list_template' %]
+    [% view.map.TEXT  = 'my_text_block'    %]
+
+
+    # change prefix, suffix, item name, etc.
+    [% view.prefix = 'your_' %]
+    [% view.default = 'anyobj' %]
+    ...
+
+=head1 DESCRIPTION
+
+TODO
+
+=head1 METHODS
+
+=head2 new($context, \%config)
+
+Creates a new Template::View presenting a custom view of the specified 
+$context object.
+
+A reference to a hash array of configuration options may be passed as the 
+second argument.
+
+=over 4
+
+=item prefix
+
+Prefix added to all template names.
+
+    [% USE view(prefix => 'my_') %]
+    [% view.view('foo', a => 20) %]     # => my_foo
+
+=item suffix
+
+Suffix added to all template names.
+
+    [% USE view(suffix => '.tt2') %]
+    [% view.view('foo', a => 20) %]     # => foo.tt2
+
+=item map 
+
+Hash array mapping reference types to template names.  The print() 
+method uses this to determine which template to use to present any
+particular item.  The TEXT, HASH and ARRAY items default to 'text', 
+'hash' and 'list' appropriately.
+
+    [% USE view(map => { ARRAY   => 'my_list', 
+                         HASH    => 'your_hash',
+                         My::Foo => 'my_foo', } ) %]
+
+    [% view.print(some_text) %]         # => text
+    [% view.print(a_list) %]            # => my_list
+    [% view.print(a_hash) %]            # => your_hash
+    [% view.print(a_foo) %]             # => my_foo
+
+    [% BLOCK text %]
+       Text: [% item %]
+    [% END %]
+
+    [% BLOCK my_list %]
+       list: [% item.join(', ') %]
+    [% END %]
+
+    [% BLOCK your_hash %]
+       hash keys: [% item.keys.sort.join(', ') %]
+    [% END %]
+
+    [% BLOCK my_foo %] 
+       Foo: [% item.this %], [% item.that %]
+    [% END %]
+
+=item method
+
+Name of a method which objects passed to print() may provide for presenting
+themselves to the view.  If a specific map entry can't be found for an 
+object reference and it supports the method (default: 'present') then 
+the method will be called, passing the view as an argument.  The object 
+can then make callbacks against the view to present itself.
+
+    package Foo;
+
+    sub present {
+        my ($self, $view) = @_;
+        return "a regular view of a Foo\n";
+    }
+
+    sub debug {
+        my ($self, $view) = @_;
+        return "a debug view of a Foo\n";
+    }
+
+In a template:
+
+    [% USE view %]
+    [% view.print(my_foo_object) %]     # a regular view of a Foo
+
+    [% USE view(method => 'debug') %]
+    [% view.print(my_foo_object) %]     # a debug view of a Foo
+
+=item default
+
+Default template to use if no specific map entry is found for an item.
+
+    [% USE view(default => 'my_object') %]
+
+    [% view.print(objref) %]            # => my_object
+
+If no map entry or default is provided then the view will attempt to 
+construct a template name from the object class, substituting any 
+sequence of non-word characters to single underscores, e.g.
+
+    # 'fubar' is an object of class Foo::Bar
+    [% view.print(fubar) %]             # => Foo_Bar
+
+Any current prefix and suffix will be added to both the default template 
+name and any name constructed from the object class.
+
+=item notfound
+
+Fallback template to use if any other isn't found.
+
+=item item
+
+Name of the template variable to which the print() method assigns the current
+item.  Defaults to 'item'.
+
+    [% USE view %]
+    [% BLOCK list %] 
+       [% item.join(', ') %] 
+    [% END %]
+    [% view.print(a_list) %]
+
+    [% USE view(item => 'thing') %]
+    [% BLOCK list %] 
+       [% thing.join(', ') %] 
+    [% END %]
+    [% view.print(a_list) %]
+
+=item view_prefix
+
+Prefix of methods which should be mapped to view() by AUTOLOAD.  Defaults
+to 'view_'.
+
+    [% USE view %]
+    [% view.view_header() %]                    # => view('header')
+
+    [% USE view(view_prefix => 'show_me_the_') %]
+    [% view.show_me_the_header() %]             # => view('header')
+
+=item view_naked
+
+Flag to indicate if any attempt should be made to map method names to 
+template names where they don't match the view_prefix.  Defaults to 0.
+
+    [% USE view(view_naked => 1) %]
+
+    [% view.header() %]                 # => view('header')
+
+=back
+
+=head2 print( $obj1, $obj2, ... \%config)
+
+TODO
+
+=head2 view( $template, \%vars, \%config );
+
+TODO
+
+=head1 AUTHOR
+
+Andy Wardley E<lt>abw at wardley.orgE<gt> L<http://wardley.org/>
+
+=head1 COPYRIGHT
+
+Copyright (C) 2000-2007 Andy Wardley.  All Rights Reserved.
+
+This module is free software; you can redistribute it and/or
+modify it under the same terms as Perl itself.
+
+=head1 SEE ALSO
+
+L<Template::Plugin>
+
+=cut
+
+
+
+
+
diff --git a/bench/perl/gas.pm b/bench/perl/gas.pm
new file mode 100644
index 0000000..106bee3
--- /dev/null
+++ b/bench/perl/gas.pm
@@ -0,0 +1,211 @@
+#!/usr/bin/perl 
+
+package as;
+use Data::Dumper;
+use isax86;
+use isax86_64;
+
+# Assembler-specific text fragments; HEADER selects Intel syntax for gas.
+# NOTE(review): not referenced inside this file - presumably consumed by
+# the driver script; confirm against the caller.
+$AS = { HEADER     => '.intel_syntax noprefix',
+	    FOOTER     => ''};
+
+# LOCALn => "[<base pointer>-<offset>]" operands; filled by function_entry()
+# and cleared again by function_exit().
+$LOCAL = {};
+# 'GLOBAL' or 'LOCAL' (see mode()); decides whether loop_entry()/loop_exit()
+# use the symbol name or the numeric label "1" for the loop.
+$MODE = 'GLOBAL';
+
+# Section most recently emitted ('text' or 'data'), so that the section
+# directive is only printed when the section actually changes.
+my $CURRENT_SECTION='NONE';
+# ISA-dependent settings; all five are assigned by isa_init().
+my $WORDLENGTH;
+my $STACKPTR;
+my $BASEPTR;
+my $REG;
+my $ARG;
+
+# Print one line of code after replacing the symbolic placeholders
+# GPRn/FPRn, ARGn and LOCALn by the entries found in the $REG, $ARG and
+# $LOCAL lookup tables.
+sub emit_code
+{
+	my ($line) = @_;
+
+	# the three substitutions run one after the other on the same line
+	for ($line) {
+		s/([GF]PR[0-9]+)/$REG->{$1}/g;
+		s/(ARG[0-9]+)/$ARG->{$1}/g;
+		s/(LOCAL[0-9]+)/$LOCAL->{$1}/g;
+	}
+	print $line . "\n";
+}
+
+# Emit a ".align" directive for the given value.
+sub align
+{
+	my ($boundary) = @_;
+	print '.align ' . $boundary . "\n";
+}
+
+# Switch the label mode used by loop_entry()/loop_exit().
+#
+#   mode('START')  sets $MODE to 'LOCAL'
+#   mode('STOP')   sets $MODE to 'GLOBAL'
+#
+# Any other command leaves $MODE unchanged.
+sub mode
+{
+	# lexical on purpose: a bare "$cmd = shift" would create (or clobber)
+	# the package variable $cmd as a side effect of every call
+	my $cmd = shift;
+
+	if ($cmd eq 'START') {
+		$MODE = 'LOCAL';
+	} elsif ($cmd eq 'STOP') {
+		$MODE = 'GLOBAL';
+	}
+}
+
+# Emit the entry sequence of function $symbolname: fill the LOCALn lookup
+# table, switch to the text section if another section is current, declare
+# the symbol and print the frame setup and register pushes for the ISA
+# selected in $main::ISA.  $allocate is the number of words to reserve.
+sub function_entry
+{
+	my ($symbolname, $allocate) = @_;
+
+	# LOCAL0 .. LOCAL$allocate, addressed relative to the base pointer
+	for my $slot (0 .. $allocate) {
+		my $offset = $slot * $WORDLENGTH;
+		$LOCAL->{"LOCAL$slot"} = "[$BASEPTR-$offset]";
+	}
+
+	unless ($CURRENT_SECTION eq 'text') {
+		$CURRENT_SECTION = 'text';
+		print ".text\n";
+	}
+
+	print ".globl $symbolname\n";
+	print ".type $symbolname, \@function\n";
+	print "$symbolname :\n";
+
+	# per ISA: base pointer, stack pointer, then the registers pushed
+	# (in this order) after the frame has been set up
+	my %frame = (
+		'x86'    => [ 'ebp', 'esp', qw(ebx esi edi) ],
+		'x86-64' => [ 'rbp', 'rsp', qw(rbx r12 r13 r14 r15) ],
+	);
+
+	if (my $layout = $frame{$main::ISA}) {
+		my ($bp, $sp, @saved) = @{$layout};
+		print "push $bp\n";
+		print "mov $bp, $sp\n";
+		my $bytes = $allocate * $WORDLENGTH;
+		print "sub  $sp, $bytes\n" if ($allocate);
+		print "push $_\n" for @saved;
+	}
+}
+
+# Emit the exit sequence of function $symbolname: forget the LOCALn
+# table, pop the registers pushed by function_entry(), tear down the
+# frame, return and print the ".size" directive for the symbol.
+sub function_exit
+{
+	my ($symbolname) = @_;
+
+	$LOCAL = {};
+
+	# per ISA: base pointer, stack pointer, then the registers to pop
+	# (in this order)
+	my %restore = (
+		'x86'    => [ 'ebp', 'esp', qw(edi esi ebx) ],
+		'x86-64' => [ 'rbp', 'rsp', qw(r15 r14 r13 r12 rbx) ],
+	);
+
+	if (my $layout = $restore{$main::ISA}) {
+		my ($bp, $sp, @saved) = @{$layout};
+		print "pop $_\n" for @saved;
+		print "mov  $sp, $bp\n";
+		print "pop $bp\n";
+	}
+
+	print "ret\n";
+	print ".size $symbolname, .-$symbolname\n";
+	print "\n";
+}
+
+# Emit a 64-aligned data symbol initialised with repeated copies of
+# $value: eight for DOUBLE and SINGLE, two for INT.  For any other $type
+# only the alignment and the label are printed.
+sub define_data
+{
+	my ($symbolname, $type, $value) = @_;
+
+	unless ($CURRENT_SECTION eq 'data') {
+		$CURRENT_SECTION = 'data';
+		print ".data\n";
+	}
+	print ".align 64\n";
+	print "$symbolname:\n";
+
+	# data type => [ directive, number of repetitions of $value ]
+	my %layout = (
+		DOUBLE => [ '.double', 8 ],
+		SINGLE => [ '.single', 8 ],
+		INT    => [ '.int',    2 ],
+	);
+
+	if (my $spec = $layout{$type}) {
+		my ($directive, $count) = @{$spec};
+		print $directive . ' ' . join(', ', ($value) x $count) . "\n";
+	}
+}
+
+# Emit a 16-aligned data symbol holding $value as a single ".int" entry.
+# The second argument (the type) is accepted but not used.
+sub define_offset
+{
+	my ($symbolname, undef, $value) = @_;
+
+	unless ($CURRENT_SECTION eq 'data') {
+		$CURRENT_SECTION = 'data';
+		print ".data\n";
+	}
+
+	print ".align 16\n";
+	print "$symbolname:\n";
+	print ".int $value\n";
+}
+
+
+# Emit the head of a loop: zero the counter register (eax/rax), align
+# to 16 and print the loop label - the symbol name in GLOBAL mode, the
+# numeric label "1" otherwise.
+sub loop_entry
+{
+  my ($symbolname, $stopping_criterion) = @_;
+
+  # map a symbolic register name to the real one; the result is not
+  # used any further in this routine
+  if (exists $REG->{$stopping_criterion}) {
+    $stopping_criterion = $REG->{$stopping_criterion};
+  }
+
+  my %counter = ('x86' => 'eax', 'x86-64' => 'rax');
+  if (my $reg = $counter{$main::ISA}) {
+    print "xor   $reg, $reg\n";
+  }
+
+  print ".align 16\n";
+
+  my $label = ($MODE eq 'GLOBAL') ? "$symbolname :\n" : "1:\n";
+  print $label;
+}
+
+
+# Emit the tail of a loop: advance the counter register by $step,
+# compare it with edi/rdi and jump back to the loop label while it is
+# lower.  The jump target mirrors the label chosen by loop_entry().
+sub loop_exit
+{
+  my ($symbolname, $step) = @_;
+
+  # ISA => [ counter register, register holding the limit ]
+  my %regs = (
+    'x86'    => [ 'eax', 'edi' ],
+    'x86-64' => [ 'rax', 'rdi' ],
+  );
+
+  if (my $pair = $regs{$main::ISA}) {
+    my ($counter, $limit) = @{$pair};
+    print "add $counter, $step\n";
+    print "cmp $counter, $limit\n";
+  }
+
+  my $target = ($MODE eq 'GLOBAL') ? $symbolname : '1b';
+  print "jl $target\n";
+  print "\n";
+}
+
+# Load word length, stack/base pointer names and the register and
+# argument tables for the ISA named in $main::ISA.  Any other ISA value
+# leaves the current settings untouched.
+sub isa_init
+{
+  if ($main::ISA eq 'x86') {
+    ($WORDLENGTH, $STACKPTR, $BASEPTR, $REG, $ARG) = (
+      $isax86::WORDLENGTH_X86,
+      $isax86::STACKPTR_X86,
+      $isax86::BASEPTR_X86,
+      $isax86::REG_X86,
+      $isax86::ARG_X86,
+    );
+  }
+  elsif ($main::ISA eq 'x86-64') {
+    ($WORDLENGTH, $STACKPTR, $BASEPTR, $REG, $ARG) = (
+      $isax86_64::WORDLENGTH_X86_64,
+      $isax86_64::STACKPTR_X86_64,
+      $isax86_64::BASEPTR_X86_64,
+      $isax86_64::REG_X86_64,
+      $isax86_64::ARG_X86_64,
+    );
+  }
+}
+
+
+1;
diff --git a/bench/perl/generatePas.pl b/bench/perl/generatePas.pl
new file mode 100755
index 0000000..520cbc6
--- /dev/null
+++ b/bench/perl/generatePas.pl
@@ -0,0 +1,163 @@
+#!/usr/bin/perl
+
+use lib 'util';
+use strict;
+use warnings;
+use lib './perl';
+use File::Copy;
+use Cwd 'abs_path';
+use Data::Dumper;
+use Template;
+
+# One hash per processed kernel file; rendered into testcases.h at the end.
+my @Testcases;
+# Values parsed from the kernel file currently being processed.
+my $name;
+my $streams;
+my $type;
+my $flops;
+my $bytes;
+# Lines seen before / after the LOOP directive of the current file.
+my $prolog='';
+my $loop='';
+my $increment;
+my $isLoop=0;   # set once a LOOP directive has been seen
+my $skip=0;     # set by an INC directive
+my $multi=0;    # set when a kernel declares more than 10 streams
+
+# Command line: <bench directory> <output directory> <template directory>
+my $BenchRoot = $ARGV[0];
+my $OutputDirectory = $ARGV[1];
+my $TemplateRoot = $ARGV[2];
+my $DEBUG = 0;
+
+# Source operand for "SET STRn GPRm" lines inside a loop body.
+# STR0-STR4 map to ARG2-ARG6; from STR5 on the operands are rbp-relative
+# addresses starting at [rbp+16] and growing in steps of 8.
+my $stream_lookup = {
+    STR0 => 'ARG2',
+    STR1 => 'ARG3',
+    STR2 => 'ARG4',
+    STR3 => 'ARG5',
+    STR4 => 'ARG6',
+    STR5 =>  '[rbp+16]',
+    STR6 =>  '[rbp+24]',
+    STR7 =>  '[rbp+32]',
+    STR8 =>  '[rbp+40]',
+    STR9 => '[rbp+48]',
+    STR10 => '[rbp+56]',
+    STR11 => '[rbp+64]',
+    STR12 => '[rbp+72]',
+    STR13 => '[rbp+80]',
+    STR14 => '[rbp+88]',
+    STR15 => '[rbp+96]',
+    STR16 => '[rbp+104]',
+    STR17 => '[rbp+112]',
+    STR18 => '[rbp+120]',
+    STR19 => '[rbp+128]',
+    STR20 => '[rbp+136]',
+    STR21 => '[rbp+144]',
+    STR22 => '[rbp+152]',
+    STR23 => '[rbp+160]',
+    STR24 => '[rbp+168]',
+    STR25 => '[rbp+176]',
+    STR26 => '[rbp+184]',
+    STR27 => '[rbp+192]',
+    STR28 => '[rbp+200]',
+    STR29 => '[rbp+208]',
+    STR30 => '[rbp+216]',
+    STR31 => '[rbp+224]',
+    STR32 => '[rbp+232]',
+    STR33 => '[rbp+240]',
+    STR34 => '[rbp+248]',
+    STR35 => '[rbp+256]',
+    STR36 => '[rbp+264]',
+    STR37 => '[rbp+272]',
+    STR38 => '[rbp+280]',
+    STR39 => '[rbp+288]',
+    STR40 => '[rbp+296]'};
+
+# The bench directory is opened relative to the current working directory.
+opendir (DIR, "./$BenchRoot") or die "Cannot open bench directory: $!\n";
+# Templates (bench.tt, testcases.tt) are looked up in $TemplateRoot.
+my $tpl = Template->new({
+        INCLUDE_PATH => ["$TemplateRoot"]
+        });
+
+# Walk through the bench directory.  Every kernel description (*.ptt) is
+# parsed, rendered through bench.tt into <OutputDirectory>/<name>.pas and
+# recorded in @Testcases.  Hidden entries and entries without a .ptt name
+# are skipped.
+while (defined(my $file = readdir(DIR))) {
+    if ($file !~ /^\./) {
+        print "SCANNING $file\n" if ($DEBUG);
+
+        # Only act on a successful match: after a failed match $1 still
+        # holds the capture of an earlier match (or nothing at all), so a
+        # stray file would be processed under the wrong kernel name.
+        next unless ($file =~ /([A-Za-z_0-9]+)\.ptt/);
+        $name = $1;
+
+        $isLoop = 0;
+        $skip=0;
+        $multi=0;
+        $prolog='';
+        $loop='';
+        open(my $fh, '<', "$BenchRoot/$file")
+            or die "Cannot open kernel file $BenchRoot/$file: $!\n";
+        while (my $line = <$fh>) {
+
+            # header directives
+            if($line =~ /STREAMS[ ]+([0-9]+)/) {
+                $streams = $1;
+                if ($streams > 10) {
+                    $multi = 1;
+                }
+            } elsif ($line =~ /TYPE[ ]+(SINGLE|DOUBLE)/) {
+                $type = $1;
+            } elsif ($line =~ /FLOPS[ ]+([0-9]+)/) {
+                $flops = $1;
+            } elsif ($line =~ /BYTES[ ]+([0-9]+)/) {
+                $bytes = $1;
+            } elsif ($line =~ /INC[ ]+([0-9]+)/) {
+                # INC: the template emits no LOOP construct for this kernel
+                $increment = $1;
+                $skip = 1;
+            } elsif ($line =~ /LOOP[ ]+([0-9]+)/) {
+                # everything after LOOP belongs to the loop body
+                $increment = $1;
+                $isLoop = 1;
+            } else {
+                if ($isLoop) {
+                    # "SET STRn GPRm" binds a stream to a register and
+                    # loads it from the operand listed in $stream_lookup
+                    if($line =~ /SET[ ]+(STR[0-9]+)[ ]+(GPR[0-9]+)/) {
+                        $loop .= "#define $1  $2\n";
+                        $loop .= "mov $2, $stream_lookup->{$1}\n";
+                    } else {
+                        $loop .= $line;
+                    }
+                } else {
+                    $prolog .= $line;
+                }
+            }
+        }
+        close $fh;
+
+        # kernels with 6 to 9 streams load the additional streams from
+        # ARG7 onwards at the end of the prolog
+        if (($streams > 5) &&  ($streams < 10)) {
+            my $arg = 7;
+            foreach my $stream ( 5 .. $streams ) {
+                $prolog .= "mov STR$stream, ARG$arg\n";
+                $arg++;
+            }
+        }
+
+        $streams = 'STREAM_'.$streams;
+        my $Vars;
+        $Vars->{name} = $name;
+        $Vars->{prolog} = $prolog;
+        $Vars->{increment} = $increment;
+        $Vars->{loop} = $loop;
+        $Vars->{skip} = $skip;
+        $Vars->{multi} = $multi;
+
+#print Dumper($Vars);
+
+        # process() returns false on failure; stop instead of silently
+        # leaving a missing or incomplete .pas file behind
+        $tpl->process('bench.tt', $Vars, "$OutputDirectory/$name.pas")
+            or die "Cannot generate $OutputDirectory/$name.pas: " . $tpl->error() . "\n";
+        push(@Testcases,{name    => $name,
+                         streams => $streams,
+                         type    => $type,
+                         stride  => $increment,
+                         flops   => $flops, 
+                         bytes   => $bytes});
+    }
+}
+closedir(DIR);
+#print Dumper(@Testcases);
+# Render testcases.h from all kernels collected above, ordered by name.
+my @TestcasesSorted = sort {$a->{name} cmp $b->{name}} @Testcases;
+
+my $Vars;
+$Vars->{Testcases} = \@TestcasesSorted;
+$Vars->{numKernels} = scalar(@TestcasesSorted);
+# Single quotes on purpose: the two characters backslash and 'n' end up
+# inside the C string literal TESTS of testcases.tt.
+$Vars->{allTests} = join('\n',map {$_->{name}} @TestcasesSorted);
+# process() returns false on failure; do not continue without the header
+$tpl->process('testcases.tt', $Vars, "$OutputDirectory/testcases.h")
+    or die "Cannot generate $OutputDirectory/testcases.h: " . $tpl->error() . "\n";
+
+
diff --git a/bench/perl/isax86.pm b/bench/perl/isax86.pm
new file mode 100644
index 0000000..7575f37
--- /dev/null
+++ b/bench/perl/isax86.pm
@@ -0,0 +1,45 @@
+#!/usr/bin/perl
+
+package isax86;
+
+# Settings for 32-bit x86; gas.pm's isa_init() copies them when the ISA
+# is 'x86'.
+$WORDLENGTH_X86 = 4;
+$STACKPTR_X86 = 'esp';
+$BASEPTR_X86  = 'ebp';
+
+# Symbolic register names (GPRn general purpose, FPRn xmm) => real registers
+$REG_X86 = { GPR1 => 'eax',
+  GPR2 => 'ebx',
+  GPR3 => 'ecx',
+  GPR4 => 'edx',
+  GPR5 => 'esi',
+  GPR6 => 'edi',
+  FPR1 => 'xmm0',
+  FPR2 => 'xmm1',
+  FPR3 => 'xmm2',
+  FPR4 => 'xmm3',
+  FPR5 => 'xmm4',
+  FPR6 => 'xmm5',
+  FPR7 => 'xmm6',
+  FPR8 => 'xmm7'};
+
+# ARGn => ebp-relative operand: ARG1 is at [ebp+8], each further
+# argument 4 bytes higher
+$ARG_X86 = {
+    ARG1 =>  '[ebp+8]',
+    ARG2 =>  '[ebp+12]',
+    ARG3 =>  '[ebp+16]',
+    ARG4 => '[ebp+20]',
+    ARG5 => '[ebp+24]',
+    ARG6 => '[ebp+28]',
+    ARG7 => '[ebp+32]',
+    ARG8 => '[ebp+36]',
+    ARG9 => '[ebp+40]',
+    ARG10 => '[ebp+44]',
+    ARG11 => '[ebp+48]',
+    ARG12 => '[ebp+52]',
+    ARG13 => '[ebp+56]',
+    ARG14 => '[ebp+60]',
+    ARG15 => '[ebp+64]',
+    ARG16 => '[ebp+68]',
+    ARG17 => '[ebp+72]',
+    ARG18 => '[ebp+76]'};
+
+
+1;
diff --git a/bench/perl/isax86_64.pm b/bench/perl/isax86_64.pm
new file mode 100644
index 0000000..7c57279
--- /dev/null
+++ b/bench/perl/isax86_64.pm
@@ -0,0 +1,66 @@
+#!/usr/bin/perl 
+
+package isax86_64;
+
+# Settings for x86-64; gas.pm's isa_init() copies them when the ISA is
+# 'x86-64'.
+$WORDLENGTH_X86_64 = 8;
+$STACKPTR_X86_64 = 'rsp';
+$BASEPTR_X86_64  = 'rbp';
+
+# Symbolic register names (GPRn general purpose, FPRn xmm) => real registers
+$REG_X86_64 = { GPR1 => 'rax',
+  GPR2 => 'rbx',
+  GPR3 => 'rcx',
+  GPR4 => 'rdx',
+  GPR5 => 'rsi',
+  GPR6 => 'rdi',
+  GPR7 => 'r8',
+  GPR8 => 'r9',
+  GPR9 => 'r10',
+  GPR10 => 'r11',
+  GPR11 => 'r12',
+  GPR12 => 'r13',
+  GPR13 => 'r14',
+  GPR14 => 'r15',
+  FPR1 => 'xmm0',
+  FPR2 => 'xmm1',
+  FPR3 => 'xmm2',
+  FPR4 => 'xmm3',
+  FPR5 => 'xmm4',
+  FPR6 => 'xmm5',
+  FPR7 => 'xmm6',
+  FPR8 => 'xmm7',
+  FPR9 => 'xmm8',
+  FPR10 => 'xmm9',
+  FPR11 => 'xmm10',
+  FPR12 => 'xmm11',
+  FPR13 => 'xmm12',
+  FPR14 => 'xmm13',
+  FPR15 => 'xmm14',
+  FPR16 => 'xmm15'};
+
+# ARG1-ARG6 are registers; from ARG7 on the operands are rbp-relative,
+# starting at [rbp+16] and growing in steps of 8
+$ARG_X86_64 = {
+  ARG1 => 'rdi',
+  ARG2 => 'rsi',
+  ARG3 => 'rdx',
+  ARG4 => 'rcx',
+  ARG5 => 'r8',
+  ARG6 => 'r9',
+  ARG7 =>  '[rbp+16]',
+  ARG8 =>  '[rbp+24]',
+  ARG9 =>  '[rbp+32]',
+  ARG10 => '[rbp+40]',
+  ARG11 => '[rbp+48]',
+  ARG12 => '[rbp+56]',
+  ARG13 => '[rbp+64]',
+  ARG14 => '[rbp+72]',
+  ARG15 => '[rbp+80]',
+  ARG16 => '[rbp+88]',
+  ARG17 => '[rbp+96]',
+  ARG18 => '[rbp+104]',
+  ARG19 => '[rbp+112]',
+  ARG20 => '[rbp+120]',
+  ARG21 => '[rbp+128]',
+  ARG22 => '[rbp+136]',
+  ARG23 => '[rbp+144]',
+  ARG24 => '[rbp+152]'};
+
+1;
diff --git a/bench/perl/templates/bench.tt b/bench/perl/templates/bench.tt
new file mode 100644
index 0000000..76e9438
--- /dev/null
+++ b/bench/perl/templates/bench.tt
@@ -0,0 +1,36 @@
+#define STR0 ARG2
+#define STR1 ARG3
+#define STR2 ARG4
+#define STR3 ARG5 
+#define STR4 ARG6 
+[% IF NOT multi %]
+#define STR5 GPR9 
+#define STR6 GPR10 
+#define STR7 GPR11 
+#define STR8 GPR12 
+#define STR9 GPR13 
+#define STR10 GPR14 
+[% END %]
+
+DEFINE DOUBLE SCALAR  1.0
+DEFINE SINGLE SSCALAR  1.0
+DEFINE INT ISCALAR  1
+DEFINE INT OMM  0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
+DEFINE INT IOMM  0,16,32,48,64,80,96,128,144,160,176,192,208,224,240,256
+DEFINE INT TOMM  0,2,4,6,16,18,20,22,32,34,36,38,48,50,52,54
+
+START LOCAL
+
+FUNC [% name %]
+{
+[% prolog %]
+
+[% IF NOT skip %]
+LOOP .loop [% increment %] GPR6 {
+[% loop %]
+}
+[% END %]
+}
+
+STOP LOCAL
+
diff --git a/bench/perl/templates/group.tt b/bench/perl/templates/group.tt
new file mode 100644
index 0000000..5676318
--- /dev/null
+++ b/bench/perl/templates/group.tt
@@ -0,0 +1,157 @@
+/* GENERATED FILE: DO NOT EDIT */
+
+#define NUM_GROUPS_[% arch FILTER upper %] [% numGroups %]
+
+static PerfmonGroupMap [% arch %]_group_map[NUM_GROUPS_[% arch FILTER upper %]] = {
+[% FOREACH group IN groups %]
+    {"[% group.name %]",[% group.name %],[% group.isUncore %],"[% group.shortHelp %]","[% group.eventSet %]"},
+[% END %]
+};
+
+/*void
+perfmon_printDerivedMetrics[% arch FILTER ucfirst %](PerfmonGroup group)
+{
+    int threadId;
+    double time = rdtscTime;
+    double inverseClock = 1.0 /(double) timer_getCpuClock();
+    PerfmonResultTable tableData;
+    int numRows;
+    int numColumns = perfmon_numThreads;
+    bstring label;
+    bstrList* fc;
+    double** stat;
+    double tmpValue;
+    uint64_t cpi_instr = 0;
+    uint64_t cpi_cyc  = 0;
+    int cpi_index = 0;
+
+    switch ( group ) 
+    {
+[% FOREACH group IN groups %]
+        case [% group.name %]:
+            numRows = [% group.numRows %];
+            stat = (double**) malloc(numRows * sizeof(double*));
+            for (int i=0; i<numRows; i++)
+            {
+                stat[i] = (double*) malloc(4 * sizeof(double));
+                stat[i][0] = 0;
+                stat[i][1] = 0;
+                stat[i][2] = DBL_MAX;
+            }
+            INIT_BASIC;
+[% FOREACH metric IN group.metrics %]
+            bstrListAdd(fc,[% loop.count %],[% metric.label %]);
+[% END %]
+            initResultTable(&tableData, fc, numRows, numColumns);
+
+            for(threadId=0; threadId < perfmon_numThreads; threadId++)
+            {
+[% FOREACH metric IN group.metrics %]
+                tmpValue = [% metric.rule %];
+                if (!isnan(tmpValue))
+                {
+                    tableData.rows[[% loop.index %]].value[threadId] = tmpValue;
+                }
+                else
+                {
+                    tableData.rows[[% loop.index %]].value[threadId] = 0.0;
+                }
+[% IF metric.label == 'CPI' && arch == 'westmere' %]
+                cpi_instr += perfmon_getResult(threadId,"FIXC0");
+                cpi_cyc += perfmon_getResult(threadId,"FIXC1");
+                cpi_index = [% loop.index %];
+[% ELSE %]
+                stat[[% loop.index %]][0] += (double) tableData.rows[[% loop.index %]].value[threadId];
+[% END %]
+                stat[[% loop.index %]][1] =  MAX(stat[[% loop.index %]][1],(double) tableData.rows[[% loop.index %]].value[threadId]);
+                stat[[% loop.index %]][2] =  MIN(stat[[% loop.index %]][2],(double) tableData.rows[[% loop.index %]].value[threadId]);
+[% END %]
+            }
+
+            if (cpi_instr)
+            {
+                stat[cpi_index][0] = (double) cpi_cyc / (double) cpi_instr;
+            }
+                
+            break;
+[% END %]
+
+        default:
+            fprintf (stderr, "perfmon_printDerivedMetrics[% arch %]: Unknown group! Exiting!\n" );
+            exit (EXIT_FAILURE);
+            break;
+    }
+
+    printResultTable(&tableData);
+    freeResultTable(&tableData);
+
+    // for threaded results print sum, max, min and avg 
+    if (perfmon_numThreads > 1)
+    {
+        initStatisticTable(&tableData, fc, numRows);
+        for (int i=0; i<numRows; i++)
+        {
+            stat[i][3] =  stat[i][0]/perfmon_numThreads;
+            for (int j=0; j<4; j++)
+            {
+                tableData.rows[i].value[j] = stat[i][j];
+            }
+        }
+        printResultTable(&tableData);
+        freeResultTable(&tableData);
+    }
+
+    for (int i=0; i<numRows; i++)
+    {
+        free(stat[i]);
+    }
+    free(stat);
+    bstrListDestroy(fc);
+}
+
+void
+perfmon_logDerivedMetrics[% arch FILTER ucfirst %](PerfmonGroup group, double time,double timeStamp)
+{
+    int threadId;
+    double tmpValue;
+    double inverseClock = 1.0 /(double) timer_getCpuClock();
+
+    switch ( group ) 
+    {
+        [% FOREACH group IN groups %]
+        case [% group.name %]:
+
+                    [% FOREACH metric IN group.metrics %]
+                        printf("[% metric.label %] %e ",timeStamp);
+                        for(threadId=0; threadId < perfmon_numThreads; threadId++)
+                        {
+                            tmpValue = [% metric.rule %];
+                            if (!isnan(tmpValue))
+                            {
+                                printf(" %e  ", tmpValue);
+                            }
+                            else
+                            {
+                                printf(" 0.0  ");
+                            }
+                        }
+                        printf("\n");
+                    [% END %]
+            break;
+            [% END %]
+
+        default:
+                fprintf (stderr, "perfmon_printDerivedMetrics[% arch %]: Unknown group! Exiting!\n" );
+                exit (EXIT_FAILURE);
+                break;
+    }
+}*/
+
+
+
+static PerfmonGroupHelp [% arch %]_group_help[NUM_GROUPS_[% arch FILTER upper %]] = {
+[% FOREACH group IN groups %]
+    {"[% group.name %]","[% group.longHelp %]"},
+[% END %]
+};
+
diff --git a/bench/perl/templates/group_types.tt b/bench/perl/templates/group_types.tt
new file mode 100644
index 0000000..1820248
--- /dev/null
+++ b/bench/perl/templates/group_types.tt
@@ -0,0 +1,13 @@
+#ifndef PERFMON_GROUP_TYPES_H
+#define PERFMON_GROUP_TYPES_H
+
+/* Group IDs: _NOGROUP is 0, then one enumerator per template group key; MAXNUMGROUPS comes last. */
+typedef enum {
+    _NOGROUP = 0,
+[% FOREACH group IN groups %]
+    [% group.key %],
+[% END %]
+    MAXNUMGROUPS
+    } PerfmonGroup;
+
+#endif /* PERFMON_GROUP_TYPES_H */
diff --git a/bench/perl/templates/testcases.tt b/bench/perl/templates/testcases.tt
new file mode 100644
index 0000000..1f03a85
--- /dev/null
+++ b/bench/perl/templates/testcases.tt
@@ -0,0 +1,19 @@
+#ifndef TESTCASES_H
+#define TESTCASES_H
+
+#include <test_types.h>
+/* One extern declaration per entry of the template's Testcases list. */
+[% FOREACH test IN Testcases %]
+extern void [% test.name %]();
+[% END %]
+
+#define TESTS  "[% allTests %]"
+#define NUMKERNELS [% numKernels %]
+/* Kernel table; each initializer lists name, streams, type, stride, function address, flops, bytes. */
+static const TestCase kernels[NUMKERNELS] = {
+    [% FOREACH test IN Testcases %]
+    {"[% test.name %]" , [% test.streams %], [% test.type %], [% test.stride %], &[% test.name %], [% test.flops %], [% test.bytes %]},
+    [% END %]
+};
+
+#endif /* TESTCASES_H */
diff --git a/bench/src/allocator.c b/bench/src/allocator.c
new file mode 100644
index 0000000..45c8f4c
--- /dev/null
+++ b/bench/src/allocator.c
@@ -0,0 +1,171 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  allocator.c
+ *
+ *      Description:  Implementation of allocator module.
+ *
+ *      Version:   4.0
+ *      Released:  16.6.2015
+ *
+ *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+/* #####   HEADER FILE INCLUDES   ######################################### */
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <allocator.h>
+#include <likwid.h>
+
+/* #####   EXPORTED VARIABLES   ########################################### */
+
+
+/* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
+
+
+/* #####   VARIABLES  -  LOCAL TO THIS SOURCE FILE   ###################### */
+
+static int numberOfAllocatedVectors = 0; /* number of entries of allocations[] in use */
+static void** allocations; /* table allocated in allocator_init(); holds the addresses freed by allocator_finalize() */
+static AffinityDomains_t domains; /* return value of get_affinityDomains(), stored by allocator_init() */
+
+/* #####   FUNCTION DEFINITIONS  -  LOCAL TO THIS SOURCE FILE   ########### */
+
+
+/* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
+/* Allocate the table that records up to numVectors vectors and fetch the affinity domains; exits if malloc fails. */
+void
+allocator_init(int numVectors)
+{
+    allocations = (void**) malloc(numVectors * sizeof(void*));
+    if (numVectors > 0 && allocations == NULL) { fprintf(stderr, "ERROR: Cannot allocate vector table\n"); exit(EXIT_FAILURE); }
+    domains = get_affinityDomains();
+}
+/* Free every recorded vector and the tracking table, then reset the bookkeeping so a repeated call is harmless. */
+void
+allocator_finalize()
+{
+    while (numberOfAllocatedVectors > 0)
+    {
+        free(allocations[--numberOfAllocatedVectors]);
+    }
+    free(allocations); /* the table itself was never released before */
+    allocations = NULL;
+}
+
+/*
+ * Allocate one aligned vector, record it for allocator_finalize(), call
+ * affinity_pinProcess() with the first processor of the domain tagged
+ * domainString and set size elements to 1.0. On return *ptr points
+ * offset elements into the buffer. type selects float (SINGLE) or double
+ * (DOUBLE) elements; alignment goes unchanged to posix_memalign(). An unknown
+ * type, a failed allocation or an unknown domain prints to stderr and exits.
+ */
+void
+allocator_allocateVector(
+        void** ptr,
+        int alignment,
+        uint64_t size,
+        int offset,
+        DataType type,
+        bstring domainString)
+{
+    size_t bytesize = 0;
+    const AffinityDomain* domain = NULL;
+    int errorCode;
+
+    switch ( type )
+    {
+        case SINGLE:
+            bytesize = (size+offset) * sizeof(float);
+            break;
+        case DOUBLE:
+            bytesize = (size+offset) * sizeof(double);
+            break;
+        default:
+            fprintf(stderr, "Unknown data type %d\n", (int) type);
+            exit(EXIT_FAILURE);
+    }
+
+    errorCode = posix_memalign(ptr, alignment, bytesize);
+    if (errorCode == EINVAL)
+    {
+        fprintf(stderr, "Alignment parameter is not a power of two\n");
+        exit(EXIT_FAILURE);
+    }
+    if (errorCode == ENOMEM)
+    {
+        fprintf(stderr, "Insufficient memory to fulfill the request\n");
+        exit(EXIT_FAILURE);
+    }
+    /* *ptr is only defined after a successful call: test the code first. */
+    if (errorCode || (*ptr) == NULL)
+    {
+        fprintf(stderr, "posix_memalign failed!\n");
+        exit(EXIT_FAILURE);
+    }
+
+    /* Record the unshifted address, the one allocator_finalize() frees.
+     * NOTE(review): the table size given to allocator_init() is not stored,
+     * so this slot cannot be range-checked here. */
+    allocations[numberOfAllocatedVectors] = *ptr;
+    numberOfAllocatedVectors++;
+
+    for (int i=0; i<domains->numberOfAffinityDomains; i++)
+    {
+        if (biseq(domainString, domains->domains[i].tag))
+        {
+            domain = domains->domains + i;
+        }
+    }
+    if (domain == NULL)
+    {
+        fprintf(stderr, "Unknown affinity domain\n");
+        exit(EXIT_FAILURE);
+    }
+    affinity_pinProcess(domain->processorList[0]);
+
+    printf("Allocate: Process running on core %d - Vector length %llu Offset %d\n",
+            affinity_processGetProcessorId(),
+            LLU_CAST size,
+            offset);
+
+    if (type == SINGLE)
+    {
+        float* sptr = (float*) (*ptr) + offset;
+        for ( uint64_t i=0; i < size; i++ )
+        {
+            sptr[i] = 1.0;
+        }
+        *ptr = (void*) sptr;
+    }
+    else
+    {
+        double* dptr = (double*) (*ptr) + offset;
+        for ( uint64_t i=0; i < size; i++ )
+        {
+            dptr[i] = 1.0;
+        }
+        *ptr = (void*) dptr;
+    }
+}
+
diff --git a/bench/src/barrier.c b/bench/src/barrier.c
new file mode 100644
index 0000000..27129b7
--- /dev/null
+++ b/bench/src/barrier.c
@@ -0,0 +1,159 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  barrier.c
+ *
+ *      Description:  Implementation of threaded spin loop barrier
+ *
+ *      Version:   4.0
+ *      Released:  16.6.2015
+ *
+ *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+/* #####   HEADER FILE INCLUDES   ######################################### */
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+#include <errno.h>
+#include <barrier.h>
+
+/* #####   EXPORTED VARIABLES   ########################################### */
+
+
+/* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
+
+#define CACHELINE_SIZE 64
+
+/* #####   VARIABLES  -  LOCAL TO THIS SOURCE FILE   ###################### */
+
+static BarrierGroup* groups; /* group table, allocated by barrier_init() */
+static int currentGroupId = 0; /* ID that the next barrier_registerGroup() call returns */
+static int maxGroupId = 0; /* numberOfGroups-1 as set by barrier_init() */
+
+/* #####   FUNCTION DEFINITIONS  -  LOCAL TO THIS SOURCE FILE   ########### */
+
+
+/* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
+
+/*
+ * Reserve the next barrier group for numThreads threads and return its ID.
+ * The flag array (32 ints per thread) is zeroed because barrier_synchronize()
+ * spins until a flag equals val. Exits if no slot is left or allocation fails.
+ */
+int
+barrier_registerGroup(int numThreads)
+{
+    size_t flagBytes = numThreads * 32 * sizeof(int);
+
+    if (currentGroupId > maxGroupId)
+    {
+        fprintf(stderr, "ERROR: Group ID %d larger than maxGroupID %d\n",currentGroupId,maxGroupId);
+        exit(EXIT_FAILURE);
+    }
+    groups[currentGroupId].numberOfThreads = numThreads;
+    int ret = posix_memalign((void**) &groups[currentGroupId].groupBval, CACHELINE_SIZE, flagBytes);
+    if (ret != 0) /* the error number is the return value; errno is not set */
+    {
+        fprintf(stderr, "ERROR: Cannot register thread group - %s\n", strerror(ret));
+        exit(EXIT_FAILURE);
+    }
+    memset((void*) groups[currentGroupId].groupBval, 0, flagBytes);
+    return currentGroupId++;
+}
+
+/*
+ * Initialise barr for thread threadId of barrier group groupId. index[0] is
+ * the thread's own ID, followed by the other thread IDs of the group in
+ * ascending order. Exits on an invalid ID or a failed allocation.
+ */
+void
+barrier_registerThread(BarrierData* barr, int groupId, int threadId)
+{
+    int next = 1;
+
+    /* Valid groups are 0 .. currentGroupId-1, valid threads 0 .. numberOfThreads-1. */
+    if (groupId < 0 || groupId >= currentGroupId)
+    {
+        fprintf(stderr, "ERROR: Group not yet registered\n");
+        exit(EXIT_FAILURE);
+    }
+    if (threadId < 0 || threadId >= groups[groupId].numberOfThreads)
+    {
+        fprintf(stderr, "ERROR: Thread ID %d too large\n",threadId);
+        exit(EXIT_FAILURE);
+    }
+
+    barr->numberOfThreads = groups[groupId].numberOfThreads;
+    barr->offset = 0;
+    barr->val = 1;
+    barr->bval =  groups[groupId].groupBval;
+    int ret = posix_memalign((void**) &(barr->index), CACHELINE_SIZE, barr->numberOfThreads * sizeof(int));
+    if (ret != 0) /* the error number is the return value; errno is not set */
+    {
+        fprintf(stderr, "ERROR: Cannot register thread - %s\n", strerror(ret));
+        exit(EXIT_FAILURE);
+    }
+    barr->index[0] = threadId;
+    for (int i = 0; i < barr->numberOfThreads; i++)
+    {
+        if (i != threadId)
+        {
+            barr->index[next++] = i;
+        }
+    }
+}
+
+/* Allocate the table for numberOfGroups barrier groups and remember the highest valid group ID; exits if malloc fails. */
+void
+barrier_init(int numberOfGroups)
+{
+    groups = malloc(numberOfGroups * sizeof *groups);
+    if (groups == NULL)
+    {
+        fprintf(stderr, "ERROR: Cannot allocate barrier - %s\n", strerror(errno));
+        exit(EXIT_FAILURE);
+    }
+    maxGroupId = numberOfGroups-1;
+}
+
+void
+barrier_synchronize(BarrierData* barr)
+{
+    int i;
+    /* Spin barrier, step 1: store val in this thread's flag (index[0]; 32 ints per thread, offset picks entry 0 or 16). */
+    barr->bval[barr->index[0] * 32 +  barr->offset * 16] = barr->val;
+    /* Step 2: poll the flag of every other thread listed in index[] until it holds val as well. */
+    for (i = 1; i < barr->numberOfThreads; i++)
+    {
+        while (barr->bval[barr->index[i] * 32 + barr->offset * 16] != barr->val)
+        {
+            __asm__ ("pause");
+        }
+    }
+    /* Step 3: offset is negated on every call, val only on calls that ran with a non-zero offset. */
+    if (barr->offset)
+    {
+        barr->val = !barr->val;
+    }
+    barr->offset = !barr->offset;
+}
+
+
diff --git a/bench/src/bench.c b/bench/src/bench.c
new file mode 100644
index 0000000..8adfb52
--- /dev/null
+++ b/bench/src/bench.c
@@ -0,0 +1,772 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  bench.c
+ *
+ *      Description:  Benchmarking framework for likwid-bench
+ *
+ *      Version:   4.0
+ *      Released:  16.6.2015
+ *
+ *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *               Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+/* #####   HEADER FILE INCLUDES   ######################################### */
+
+#include <pthread.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/syscall.h>
+#include <string.h>
+#include <sched.h>
+//#include <types.h>
+#include <unistd.h>
+
+#include <allocator.h>
+#include <threads.h>
+#include <barrier.h>
+#include <likwid.h>
+
+/* #####   EXPORTED VARIABLES   ########################################### */
+
+
+/* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
+
+/* BARRIER runs the spin barrier on the local BarrierData 'barr'; former variant: pthread_barrier_wait(&threads_barrier) */
+#define BARRIER   barrier_synchronize(&barr)
+/* Marker macros: likwid marker calls (each with its own ';') when PERFMON is defined, empty otherwise. */
+#ifdef PERFMON
+#define START_PERFMON likwid_markerStartRegion("bench");
+#define STOP_PERFMON  likwid_markerStopRegion("bench");
+#define LIKWID_THREAD_INIT  likwid_markerThreadInit();
+#else
+#define START_PERFMON
+#define STOP_PERFMON
+#define LIKWID_THREAD_INIT
+#endif /* PERFMON */
+
+/* Time myData->iter executions of func between barriers and store timer_printCycles() in data->cycles.
+ * Uses the caller's locals i, time, data, myData and barr; do/while(0) makes it a single statement. */
+#define EXECUTE(func)   do { \
+    BARRIER; \
+    timer_start(&time); \
+    START_PERFMON  \
+    for (i=0; i<myData->iter; i++) \
+    {   \
+        func; \
+    } \
+    BARRIER; \
+    STOP_PERFMON  \
+    timer_stop(&time); \
+    data->cycles = timer_printCycles(&time); \
+    BARRIER; \
+    } while (0)
+
+/* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
+
+void* runTest(void* arg)
+{
+    int threadId;
+    int offset;
+    size_t size, allSize;
+    size_t i;
+    BarrierData barr;
+    ThreadData* data;
+    ThreadUserData* myData;
+    TimerData time;
+    FuncPrototype func;
+
+    data = (ThreadData*) arg;
+    myData = &(data->data);
+    func = myData->test->kernel;
+    threadId = data->threadId;
+    barrier_registerThread(&barr, 0, data->globalThreadId);
+
+    /* Prepare ptrs for thread */
+    allSize = myData->size;
+    size = myData->size / data->numberOfThreads;
+    size -= (size%myData->test->stride);
+    offset = data->threadId * size;
+    myData->size = size;
+
+
+    /* pin the thread */
+    likwid_pinThread(myData->processors[threadId]);
+
+    printf("Group: %d Thread %d Global Thread %d running on core %d - Vector length %llu Offset %d\n",
+            data->groupId,
+            threadId,
+            data->globalThreadId,
+            affinity_threadGetProcessorId(),
+            LLU_CAST allSize,
+            offset);
+    BARRIER;
+
+    /* Up to 10 streams the following registers are used for Array ptr:
+     * Size rdi
+     * in Registers: rsi  rdx  rcx  r8  r9
+     * passed on stack, then: r10  r11  r12  r13  r14  r15
+     * If more than 10 streams are used first 5 streams are in register, above 5 a macro must be used to
+     * load them from stack
+     * */
+
+    switch ( myData->test->streams ) {
+        case STREAM_1:
+            EXECUTE(func(size,myData->streams[0]));
+            break;
+        case STREAM_2:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1]));
+            break;
+        case STREAM_3:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2]));
+            break;
+        case STREAM_4:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3]));
+            break;
+        case STREAM_5:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4]));
+            break;
+        case STREAM_6:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5]));
+            break;
+        case STREAM_7:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6]));
+            break;
+        case STREAM_8:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7]));
+            break;
+        case STREAM_9:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8]));
+            break;
+        case STREAM_10:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9]));
+            break;
+        case STREAM_11:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10]));
+            break;
+        case STREAM_12:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11]));
+            break;
+        case STREAM_13:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12]));
+            break;
+        case STREAM_14:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13]));
+            break;
+        case STREAM_15:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14]));
+            break;
+        case STREAM_16:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15]));
+            break;
+        case STREAM_17:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16]));
+            break;
+        case STREAM_18:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17]));
+            break;
+        case STREAM_19:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18]));
+            break;
+        case STREAM_20:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19]));
+            break;
+        case STREAM_21:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20]));
+            break;
+        case STREAM_22:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21]));
+            break;
+        case STREAM_23:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21],myData->streams[22]));
+            break;
+        case STREAM_24:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23]));
+            break;
+        case STREAM_25:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+                        myData->streams[24]));
+            break;
+        case STREAM_26:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+                        myData->streams[24],myData->streams[25]));
+            break;
+        case STREAM_27:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+                        myData->streams[24],myData->streams[25],myData->streams[26]));
+            break;
+        case STREAM_28:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+                        myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27]));
+            break;
+        case STREAM_29:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+                        myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
+                        myData->streams[28]));
+            break;
+        case STREAM_30:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+                        myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
+                        myData->streams[28],myData->streams[29]));
+            break;
+        case STREAM_31:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+                        myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
+                        myData->streams[28],myData->streams[29],myData->streams[30]));
+            break;
+        case STREAM_32:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+                        myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
+                        myData->streams[28],myData->streams[29],myData->streams[30],myData->streams[31]));
+            break;
+        case STREAM_33:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+                        myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
+                        myData->streams[28],myData->streams[29],myData->streams[30],myData->streams[31],
+                        myData->streams[32]));
+            break;
+        case STREAM_34:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+                        myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
+                        myData->streams[28],myData->streams[29],myData->streams[30],myData->streams[31],
+                        myData->streams[32],myData->streams[33]));
+            break;
+        case STREAM_35:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+                        myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
+                        myData->streams[28],myData->streams[29],myData->streams[30],myData->streams[31],
+                        myData->streams[32],myData->streams[33],myData->streams[34]));
+            break;
+        case STREAM_36:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+                        myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
+                        myData->streams[28],myData->streams[29],myData->streams[30],myData->streams[31],
+                        myData->streams[32],myData->streams[33],myData->streams[34],myData->streams[35]));
+            break;
+        case STREAM_37:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+                        myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
+                        myData->streams[28],myData->streams[29],myData->streams[30],myData->streams[31],
+                        myData->streams[32],myData->streams[33],myData->streams[34],myData->streams[35],
+                        myData->streams[36]));
+            break;
+        case STREAM_38:
+            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+                        myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
+                        myData->streams[28],myData->streams[29],myData->streams[30],myData->streams[31],
+                        myData->streams[32],myData->streams[33],myData->streams[34],myData->streams[35],
+                        myData->streams[36],myData->streams[37]));
+            break;
+        default:
+            break;
+    }
+    pthread_exit(NULL);
+}
+
+
+#define MEASURE(func) /* uses data, time, i and iterations from the expanding scope */ \
+    if (data->globalThreadId == 0) /* only global thread 0 runs and times func */ \
+    { \
+        timer_start(&time); /* started once, before the first call */ \
+        i = 0; \
+        for (i=0; i < SIZE_MAX; i++) \
+        { \
+            func; \
+            timer_stop(&time); /* stopped after every call; timer_start is not repeated */ \
+            if (timer_print(&time) >= (double)data->data.min_runtime) /* leave once the reading reaches min_runtime */ \
+                break; \
+        } \
+        iterations = i;  /* on break this is the index of the last call, i.e. calls made minus one */ \
+    } \
+    BARRIER; /* outside the if: expanded for every thread, not only thread 0 */
+
+
+void* getIter(void* arg) /* thread entry point: arg is cast to ThreadData*, the thread ends via pthread_exit(NULL) */
+{
+    int threadId;
+    int offset;
+    size_t size;
+    size_t i;
+    BarrierData barr; /* NOTE(review): presumably consumed by the BARRIER macro (defined elsewhere) - confirm */
+    ThreadData* data;
+    ThreadUserData* myData;
+    TimerData time;
+    FuncPrototype func;
+    size_t iterations = 0;
+    /* data, time, i and iterations are referenced by name inside MEASURE; renaming them breaks the macro. */
+    data = (ThreadData*) arg;
+    myData = &(data->data);
+    func = myData->test->kernel;
+    threadId = data->threadId;
+    barrier_registerThread(&barr, 0, data->globalThreadId);
+    /* Per-thread slice: divide myData->size by the thread count, round the result down to a
+     * multiple of the test's stride, and start at threadId * size elements into each stream. */
+    /* Prepare ptrs for thread */
+    size = myData->size / data->numberOfThreads;
+    size -= (size%myData->test->stride);
+    offset = data->threadId * size;
+
+    switch ( myData->test->type ) /* element type decides the pointer arithmetic unit */
+    {
+        case SINGLE:
+            {
+                float* sptr;
+                for (i=0; i <  myData->test->streams; i++)
+                {
+                    sptr = (float*) myData->streams[i];
+                    sptr +=  offset;
+                    myData->streams[i] = (float*) sptr;
+                }
+            }
+            break;
+        case DOUBLE:
+            {
+                double* dptr;
+                for (i=0; i <  myData->test->streams; i++)
+                {
+                    dptr = (double*) myData->streams[i];
+                    dptr +=  offset;
+                    myData->streams[i] = (double*) dptr;
+                }
+            }
+            break;
+    } /* no default: any other type value leaves the stream pointers unshifted */
+    /* Note: the loops above overwrite myData->streams[i] in place with the shifted pointers. */
+    /* pin the thread */
+    likwid_pinThread(myData->processors[threadId]);
+#ifdef PERFMON
+    LIKWID_THREAD_INIT;
+    BARRIER;
+#endif
+
+    switch ( myData->test->streams ) { /* one case per stream count: func gets size plus that many stream pointers */
+        case STREAM_1:
+            MEASURE(func(size,myData->streams[0]));
+            break;
+        case STREAM_2:
+            MEASURE(func(size,myData->streams[0],myData->streams[1]));
+            break;
+        case STREAM_3:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2]));
+            break;
+        case STREAM_4:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3]));
+            break;
+        case STREAM_5:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4]));
+            break;
+        case STREAM_6:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5]));
+            break;
+        case STREAM_7:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6]));
+            break;
+        case STREAM_8:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7]));
+            break;
+        case STREAM_9:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8]));
+            break;
+        case STREAM_10:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9]));
+            break;
+        case STREAM_11:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10]));
+            break;
+        case STREAM_12:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11]));
+            break;
+        case STREAM_13:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12]));
+            break;
+        case STREAM_14:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13]));
+            break;
+        case STREAM_15:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14]));
+            break;
+        case STREAM_16:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15]));
+            break;
+        case STREAM_17:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16]));
+            break;
+        case STREAM_18:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17]));
+            break;
+        case STREAM_19:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18]));
+            break;
+        case STREAM_20:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19]));
+            break;
+        case STREAM_21:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20]));
+            break;
+        case STREAM_22:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21]));
+            break;
+        case STREAM_23:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21],myData->streams[22]));
+            break;
+        case STREAM_24:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23]));
+            break;
+        case STREAM_25:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+                        myData->streams[24]));
+            break;
+        case STREAM_26:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+                        myData->streams[24],myData->streams[25]));
+            break;
+        case STREAM_27:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+                        myData->streams[24],myData->streams[25],myData->streams[26]));
+            break;
+        case STREAM_28:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+                        myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27]));
+            break;
+        case STREAM_29:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+                        myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
+                        myData->streams[28]));
+            break;
+        case STREAM_30:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+                        myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
+                        myData->streams[28],myData->streams[29]));
+            break;
+        case STREAM_31:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+                        myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
+                        myData->streams[28],myData->streams[29],myData->streams[30]));
+            break;
+        case STREAM_32:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+                        myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
+                        myData->streams[28],myData->streams[29],myData->streams[30],myData->streams[31]));
+            break;
+        case STREAM_33:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+                        myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
+                        myData->streams[28],myData->streams[29],myData->streams[30],myData->streams[31],
+                        myData->streams[32]));
+            break;
+        case STREAM_34:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+                        myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
+                        myData->streams[28],myData->streams[29],myData->streams[30],myData->streams[31],
+                        myData->streams[32],myData->streams[33]));
+            break;
+        case STREAM_35:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+                        myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
+                        myData->streams[28],myData->streams[29],myData->streams[30],myData->streams[31],
+                        myData->streams[32],myData->streams[33],myData->streams[34]));
+            break;
+        case STREAM_36:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+                        myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
+                        myData->streams[28],myData->streams[29],myData->streams[30],myData->streams[31],
+                        myData->streams[32],myData->streams[33],myData->streams[34],myData->streams[35]));
+            break;
+        case STREAM_37:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+                        myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
+                        myData->streams[28],myData->streams[29],myData->streams[30],myData->streams[31],
+                        myData->streams[32],myData->streams[33],myData->streams[34],myData->streams[35],
+                        myData->streams[36]));
+            break;
+        case STREAM_38:
+            MEASURE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
+                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
+                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
+                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
+                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
+                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
+                        myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
+                        myData->streams[28],myData->streams[29],myData->streams[30],myData->streams[31],
+                        myData->streams[32],myData->streams[33],myData->streams[34],myData->streams[35],
+                        myData->streams[36],myData->streams[37]));
+            break;
+        default: /* any other stream count: MEASURE is not expanded, nothing is run */
+            break;
+    }
+    /* iterations is written only by global thread 0 inside MEASURE; every other thread stores 0. */
+    data->data.iter = iterations;
+    pthread_exit(NULL);
+}
diff --git a/bench/src/strUtil.c b/bench/src/strUtil.c
new file mode 100644
index 0000000..9b700f5
--- /dev/null
+++ b/bench/src/strUtil.c
@@ -0,0 +1,315 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  strUtil.c
+ *
+ *      Description:  Utility string routines building upon bstrlib
+ *
+ *      Version:   4.0
+ *      Released:  16.6.2015
+ *
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com.
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+#include <strUtil.h>
+
+/* Parse a non-negative base-10 number from str. Returns the value, or -EINVAL
+ * (after a message on stderr) if it has no digits or does not fit into an int. */
+static int str2int(const char* str)
+{
+    char* endptr;
+    unsigned long val;
+    errno = 0;
+    val = strtoul(str, &endptr, 10);
+    /* strtoul reports overflow by returning ULONG_MAX (not LONG_MAX); values
+     * above INT_MAX are rejected as well because the cast would truncate them */
+    if (errno != 0 || val > (unsigned long) INT_MAX)
+    {
+        fprintf(stderr, "Value in string out of range\n");
+        return -EINVAL;
+    }
+    if (endptr == str)
+    {
+        fprintf(stderr, "No digits were found\n");
+        return -EINVAL;
+    }
+    return (int) val;
+}
+
+/* Convert a size string like "100kB", "2MB" or "1GB" (decimal multipliers)
+ * into a number of elements of the given data type. Returns 0 if the unit,
+ * the data type or the number in front of the unit is not usable. */
+uint64_t bstr_to_doubleSize(const_bstring str, DataType type)
+{
+    bstring unit = bmidstr(str, blength(str)-2, 2);
+    bstring sizeStr = bmidstr(str, 0, blength(str)-2);
+    int parsed = str2int(bdata(sizeStr));
+    /* str2int signals errors with a negative value, do not turn it into a size */
+    uint64_t sizeU = (parsed > 0) ? (uint64_t) parsed : 0;
+    uint64_t junk = 0;
+    /* 0 marks an unsupported type and must never be used as divisor */
+    uint64_t bytesize = (type == SINGLE) ? 4 : ((type == DOUBLE) ? 8 : 0);
+
+    if (bytesize != 0)
+    {
+        if ((biseqcstr(unit, "kB") == 1)||(biseqcstr(unit, "KB") == 1))
+        {
+            junk = (sizeU *1000)/bytesize;
+        }
+        else if (biseqcstr(unit, "MB") == 1)
+        {
+            junk = (sizeU *1000000)/bytesize;
+        }
+        else if (biseqcstr(unit, "GB") == 1)
+        {
+            junk = (sizeU *1000000000)/bytesize;
+        }
+        else if (biseqcstr(unit, "B") == 1) /* NOTE(review): unit is "0B" for "100B", looks unreachable */
+        {
+            junk = (sizeU)/bytesize;
+        }
+    }
+    /* bmidstr() returns new strings which are owned by this function */
+    bdestroy(unit);
+    bdestroy(sizeStr);
+    return junk;
+}
+
+/* Return the affinity domain whose tag equals the given string. Prints an
+ * error message and terminates the program if no such domain exists. */
+static AffinityDomain* workgroup_findDomain(const_bstring tag)
+{
+    uint32_t i;
+    AffinityDomain* found = NULL;
+    AffinityDomains_t domains = get_affinityDomains();
+
+    for (i = 0; i < domains->numberOfAffinityDomains; i++)
+    {
+        if (bstrcmp(domains->domains[i].tag, tag) == BSTR_OK)
+        {
+            found = &(domains->domains[i]);
+            break;
+        }
+    }
+
+    if (found == NULL)
+    {
+        fprintf(stderr, "Error: Domain %s not available on current machine.\nTry likwid-bench -p for supported domains.",
+                bdata(tag));
+        exit(EXIT_FAILURE);
+    }
+
+    return found;
+}
+
+/* Parse a workgroup string and fill in group.
+ *
+ * Accepted format:
+ *   <domain>:<size>[:<threads>[:<chunksize>:<stride>]][-<id>:<domain>[:<offset>],...]
+ *
+ *   group            receives the size per stream, the number of threads, the
+ *                    processor ID of every thread and the stream placement
+ *   str              workgroup string to parse
+ *   type             element type, needed to convert the size into elements
+ *   numberOfStreams  number of streams the testcase works on
+ *
+ * Every error in the string is fatal (message on stderr, then exit): continuing
+ * would use uninitialized data or write outside of the allocated arrays. */
+void bstr_to_workgroup(Workgroup* group, const_bstring str, DataType type, int numberOfStreams)
+{
+    uint32_t i;
+    uint32_t maxNumThreads;
+    int chunksize = 1;      /* chunksize == stride == 1 selects consecutive IDs */
+    int stride = 1;
+    int counter;
+    int currentId = 0;
+    int startId = 0;
+    bstring threadInfo;
+    bstring streams = NULL; /* stays NULL if the string has no stream part */
+    struct bstrList* tokens;
+    struct bstrList* subtokens;
+    AffinityDomain* domain;
+
+    if (numberOfStreams < 1)
+    {
+        fprintf(stderr, "Error: Testcase must use at least one stream\n");
+        exit(EXIT_FAILURE);
+    }
+
+    /* split the workgroup into the thread and the streams part */
+    tokens = bsplit(str,'-');
+
+    if ((tokens == NULL) || (tokens->qty < 1) || (tokens->qty > 2))
+    {
+        fprintf(stderr, "Error in parsing workgroup string\n");
+        exit(EXIT_FAILURE);
+    }
+
+    threadInfo = bstrcpy(tokens->entry[0]);
+    if (tokens->qty == 2)
+    {
+        streams = bstrcpy(tokens->entry[1]);
+    }
+    bstrListDestroy (tokens);
+
+    /* the copy is only needed as input for bsplit, release it afterwards */
+    tokens = bsplit(threadInfo,':');
+    bdestroy(threadInfo);
+
+    if ((tokens == NULL) ||
+        ((tokens->qty != 2) && (tokens->qty != 3) && (tokens->qty != 5)))
+    {
+        fprintf(stderr, "Error in parsing workgroup string\n");
+        exit(EXIT_FAILURE);
+    }
+
+    /* all accepted forms start with <domain>:<size> */
+    domain = workgroup_findDomain(tokens->entry[0]);
+    group->size = bstr_to_doubleSize(tokens->entry[1], type);
+    maxNumThreads = domain->numberOfProcessors;
+
+    if (tokens->qty == 2)
+    {
+        /* no thread count given: use all processors of the domain */
+        group->numberOfThreads = domain->numberOfProcessors;
+    }
+    else
+    {
+        int requested = str2int(bdata(tokens->entry[2]));
+
+        if (requested < 0)
+        {
+            fprintf(stderr, "Error: Invalid number of threads in workgroup string\n");
+            exit(EXIT_FAILURE);
+        }
+        group->numberOfThreads = requested;
+    }
+
+    if (tokens->qty == 5)
+    {
+        chunksize = str2int(bdata(tokens->entry[3]));
+        stride = str2int(bdata(tokens->entry[4]));
+
+        /* stride is used as divisor, chunksize as loop counter */
+        if ((chunksize < 1) || (stride < 1))
+        {
+            fprintf(stderr, "Error: Chunk size and stride must be positive numbers\n");
+            exit(EXIT_FAILURE);
+        }
+        maxNumThreads = (domain->numberOfProcessors / stride) * chunksize;
+    }
+
+    if (group->numberOfThreads > maxNumThreads)
+    {
+        fprintf(stderr, "Error: Domain %s supports only up to %d threads%s.\n",
+                bdata(tokens->entry[0]), (int) maxNumThreads,
+                (tokens->qty == 5) ? " with used expression" : "");
+        exit(EXIT_FAILURE);
+    }
+
+    group->processorIds = (int*) malloc(group->numberOfThreads * sizeof(int));
+    if ((group->processorIds == NULL) && (group->numberOfThreads > 0))
+    {
+        fprintf(stderr, "Error: Cannot allocate processor list of workgroup\n");
+        exit(EXIT_FAILURE);
+    }
+
+    counter = chunksize;
+    for (i=0; i<group->numberOfThreads; i++)
+    {
+        if (counter == 0)
+        {
+            /* chunk is complete, the next one starts stride entries further */
+            startId += stride;
+            currentId = startId;
+            counter = chunksize;
+        }
+        if ((uint32_t) currentId >= (uint32_t) domain->numberOfProcessors)
+        {
+            /* possible if chunksize exceeds stride: do not read past the list */
+            fprintf(stderr, "Error: Domain %s supports only up to %d threads with used expression.\n",
+                    bdata(tokens->entry[0]), (int) i);
+            exit(EXIT_FAILURE);
+        }
+        group->processorIds[i] = domain->processorList[currentId++];
+        counter--;
+    }
+    bstrListDestroy(tokens);
+
+    group->streams = (Stream*) calloc(numberOfStreams, sizeof(Stream));
+    if (group->streams == NULL)
+    {
+        fprintf(stderr, "Error: Cannot allocate stream list of workgroup\n");
+        exit(EXIT_FAILURE);
+    }
+
+    /* parse stream list */
+    if (streams != NULL)
+    {
+        tokens = bsplit(streams,',');
+        bdestroy(streams);
+
+        if ((tokens == NULL) || (tokens->qty < numberOfStreams))
+        {
+            fprintf(stderr, "Testcase requires at least %d streams\n", numberOfStreams);
+            exit(EXIT_FAILURE);
+        }
+
+        for (i=0;i<(uint32_t) tokens->qty;i++)
+        {
+            int index;
+
+            subtokens = bsplit(tokens->entry[i],':');
+            if ((subtokens == NULL) || ((subtokens->qty != 2) && (subtokens->qty != 3)))
+            {
+                fprintf(stderr, "Error in parsing event string\n");
+                exit(EXIT_FAILURE);
+            }
+
+            index = str2int(bdata(subtokens->entry[0]));
+            if ((index < 0) || (index >= numberOfStreams))
+            {
+                /* storing this entry would write outside of group->streams */
+                fprintf(stderr, "Stream Index %d out of range\n",index);
+                exit(EXIT_FAILURE);
+            }
+
+            group->streams[index].domain = bstrcpy(subtokens->entry[1]);
+            group->streams[index].offset = 0;
+            if (subtokens->qty == 3)
+            {
+                group->streams[index].offset = str2int(bdata(subtokens->entry[2]));
+            }
+            bstrListDestroy(subtokens);
+        }
+        bstrListDestroy(tokens);
+    }
+    else
+    {
+        /* no stream list given: place all streams in the domain of the threads */
+        for (i=0; i< (uint32_t)numberOfStreams; i++)
+        {
+            group->streams[i].domain = domain->tag;
+            group->streams[i].offset = 0;
+        }
+    }
+
+    group->size /= numberOfStreams;
+    return;
+}
diff --git a/bench/src/threads.c b/bench/src/threads.c
new file mode 100644
index 0000000..f9e6a43
--- /dev/null
+++ b/bench/src/threads.c
@@ -0,0 +1,287 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  threads.c
+ *
+ *      Description:  High level interface to pthreads
+ *
+ *      Version:   4.0
+ *      Released:  16.6.2015
+ *
+ *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+/* #####   HEADER FILE INCLUDES   ######################################### */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <string.h>
+
+#include <errno.h>
+#include <threads.h>
+
+
+
+/* #####   EXPORTED VARIABLES   ########################################### */
+
+pthread_barrier_t threads_barrier;
+ThreadData* threads_data;
+ThreadGroup* threads_groups;
+
+/* #####   VARIABLES  -  LOCAL TO THIS SOURCE FILE   ###################### */
+
+static pthread_t* threads = NULL;
+static pthread_attr_t attr;
+static int numThreads = 0;
+
+/* #####   FUNCTION DEFINITIONS  -  LOCAL TO THIS SOURCE FILE  ################## */
+/* Count the occurrences of character in str (0 for a NULL string). The
+ * terminating NUL byte is compared too, so searching for '\0' yields 1. */
+static int count_characters(const char *str, char character)
+{
+    int hits = 0;
+    const char *cursor = str;
+    if (cursor == NULL)
+        return 0;
+    for (;; cursor++)
+    {
+        hits += (*cursor == character);
+        if (*cursor == '\0') return hits;
+    }
+}
+
+void* dummy_function(void* arg) /* thread start routine that does no work */
+{
+    return NULL;
+}
+/* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
+
+
+
+/* Create and join one no-op thread per ',' in LIKWID_PIN or, if there is none,
+ * per configured CPU. Returns the number of threads that could be created. */
+int threads_test()
+{
+    int cnt = 0;
+    pthread_t pid;
+    int likwid_pin = count_characters(getenv("LIKWID_PIN"), ',');
+    int max = (likwid_pin == 0) ? (int) sysconf(_SC_NPROCESSORS_CONF) : likwid_pin;
+
+    /* stop at the first failure and join each thread so that none is leaked */
+    while ((cnt < max) && (pthread_create(&pid, NULL, dummy_function, NULL) == 0))
+    {
+        pthread_join(pid, NULL);
+        cnt++;
+    }
+
+    return cnt;
+}
+
+
+void
+threads_init(int numberOfThreads) /* allocate and number the per-thread data */
+{
+    int i;
+    numThreads = numberOfThreads;
+    threads = (pthread_t*) malloc(numThreads * sizeof(pthread_t));
+    threads_data = (ThreadData*) malloc(numThreads * sizeof(ThreadData));
+    if ((numThreads > 0) && (!threads || !threads_data))
+    {   /* the loop below would write through a NULL pointer */
+        fprintf(stderr, "ERROR: Cannot allocate thread data - %s\n", strerror(errno));
+        exit(EXIT_FAILURE);
+    }
+    for(i = 0; i < numThreads; i++)
+    {
+        threads_data[i].numberOfThreads = numThreads;
+        threads_data[i].globalNumberOfThreads = numThreads;
+        threads_data[i].globalThreadId = threads_data[i].threadId = i;
+    }
+    pthread_barrier_init(&threads_barrier, NULL, numThreads);
+    pthread_attr_init(&attr);
+    pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
+}
+
+void
+threads_create(void *(*startRoutine)(void*)) /* thread i is passed &threads_data[i] */
+{
+    int i;
+    for(i = 0; i < numThreads; i++)
+    {
+        if (pthread_create(&threads[i], &attr, startRoutine, (void*) &threads_data[i]) != 0)
+        {   /* threads_join() must never see a handle that was not set */
+            fprintf(stderr, "ERROR: Cannot create thread %d\n", i);
+            exit(EXIT_FAILURE);
+        }
+    }
+}
+
+/* Split the numThreads threads into numberOfGroups groups of equal size and
+ * store group ID, group-local thread ID and group size in the data of every
+ * thread. Exits if the threads cannot be split evenly or memory runs out. */
+void
+threads_createGroups(int numberOfGroups)
+{
+    int i;
+    int j;
+    int numThreadsPerGroup;
+    int globalId = 0;
+
+    /* continuing after this error would use numThreadsPerGroup uninitialized */
+    if ((numberOfGroups <= 0) || (numThreads % numberOfGroups))
+    {
+        fprintf(stderr, "ERROR: Not enough threads %d to create %d groups\n",numThreads,numberOfGroups);
+        exit(EXIT_FAILURE);
+    }
+    numThreadsPerGroup = numThreads / numberOfGroups;
+
+    threads_groups = (ThreadGroup*) malloc(numberOfGroups * sizeof(ThreadGroup));
+    if (!threads_groups)
+    {
+        fprintf(stderr, "ERROR: Cannot allocate thread groups - %s\n", strerror(errno));
+        exit(EXIT_FAILURE);
+    }
+
+    for (i = 0; i < numberOfGroups; i++)
+    {
+        threads_groups[i].numberOfThreads = numThreadsPerGroup;
+        threads_groups[i].threadIds = (int*) malloc(numThreadsPerGroup * sizeof(int));
+        if (!threads_groups[i].threadIds)
+        {
+            fprintf(stderr, "ERROR: Cannot allocate threadID list for thread groups - %s\n", strerror(errno));
+            exit(EXIT_FAILURE);
+        }
+        for (j = 0; j < numThreadsPerGroup; j++)
+        {
+            threads_data[globalId].threadId = j;
+            threads_data[globalId].groupId = i;
+            threads_data[globalId].numberOfGroups = numberOfGroups;
+            threads_data[globalId].numberOfThreads = numThreadsPerGroup;
+            threads_groups[i].threadIds[j] = globalId++;
+        }
+    }
+}
+
+/* Hand the user data to all threads: copied by plain assignment if func is
+ * NULL, otherwise by calling func(data, <destination>). */
+void
+threads_registerDataAll(ThreadUserData* data, threads_copyDataFunc func)
+{
+    int tid;
+
+    for (tid = 0; tid < numThreads; tid++)
+    {
+        ThreadUserData* target = &threads_data[tid].data;
+        if (func != NULL)
+        {
+            func(data, target);
+        }
+        else
+        {
+            *target = *data;
+        }
+    }
+}
+
+/* Register data for the thread stored at index threadId of threads_data. */
+void
+threads_registerDataThread(int threadId, ThreadUserData* data, threads_copyDataFunc func)
+{
+    ThreadUserData* target = &threads_data[threadId].data;
+    if (func != NULL)
+    {
+        func(data, target); /* caller supplied copy routine */
+    }
+    else
+    {
+        *target = *data;    /* plain struct assignment */
+    }
+}
+
+/* Hand the user data to every thread of group groupId: copied by plain
+ * assignment if func is NULL, otherwise by calling func(data, <destination>). */
+void
+threads_registerDataGroup(int groupId,
+        ThreadUserData* data,
+        threads_copyDataFunc func)
+{
+    int member;
+    ThreadGroup* grp = &threads_groups[groupId];
+
+    for (member = 0; member < grp->numberOfThreads; member++)
+    {
+        ThreadUserData* target = &threads_data[grp->threadIds[member]].data;
+        if (func != NULL)
+        {
+            func(data, target);
+        }
+        else
+        {
+            *target = *data;
+        }
+    }
+}
+
+/* Set the iteration count for all threads of group groupId and reset timings. */
+size_t
+threads_updateIterations(int groupId, size_t demandIter)
+{
+    int member;
+    size_t iterations = (demandIter > 0) ? demandIter : (size_t) threads_data[0].data.iter;
+    if (iterations < 10) /* never use fewer than 10 iterations */
+    {
+        iterations = 10;
+    }
+    for (member = 0; member < threads_groups[groupId].numberOfThreads; member++)
+    {
+        ThreadData* td = &threads_data[threads_groups[groupId].threadIds[member]];
+        td->data.iter = iterations;
+        td->data.cycles = 0;
+        td->cycles = 0;
+        td->time = 0;
+    }
+    return iterations;
+}
+
+void
+threads_join(void) /* wait, in index order, until every thread has terminated */
+{
+    int idx = 0;
+
+    while (idx < numThreads)
+    {
+        pthread_join(threads[idx++], NULL);
+    }
+}
+
+void
+threads_destroy(int numberOfGroups) /* release the module's thread bookkeeping */
+{
+    int grp = 0;
+    pthread_attr_destroy(&attr);
+    pthread_barrier_destroy(&threads_barrier);
+    free(threads_data);
+    while (grp < numberOfGroups)
+    {
+        free(threads_groups[grp++].threadIds);
+    }
+    free(threads_groups);
+    free(threads);
+}
diff --git a/bench/x86-64/branch.ptt b/bench/x86-64/branch.ptt
deleted file mode 100644
index e15086d..0000000
--- a/bench/x86-64/branch.ptt
+++ /dev/null
@@ -1,36 +0,0 @@
-STREAMS 4
-TYPE DOUBLE_RAND
-FLOPS 2
-BYTES 32
-LOOP 8
-movaps    FPR1, [STR1 + GPR1*8]
-movaps    FPR2, [STR1 + GPR1*8+16]
-movaps    FPR3, [STR1 + GPR1*8+32]
-movaps    FPR4, [STR1 + GPR1*8+48]
-cvtsd2si  GPR2, FPR1
-cmp		  GPR2, 0
-jl sub
-mulpd     FPR1, [STR2 + GPR1*8]
-addpd     FPR1, [STR3 + GPR1*8]
-mulpd     FPR2, [STR2 + GPR1*8+16]
-addpd     FPR2, [STR3 + GPR1*8+16]
-mulpd     FPR3, [STR2 + GPR1*8+32]
-addpd     FPR3, [STR3 + GPR1*8+32]
-mulpd     FPR4, [STR2 + GPR1*8+48]
-addpd     FPR4, [STR3 + GPR1*8+48]
-jmp end
-sub:
-mulpd     FPR1, [STR2 + GPR1*8]
-subpd     FPR1, [STR3 + GPR1*8]
-mulpd     FPR2, [STR2 + GPR1*8+16]
-subpd     FPR2, [STR3 + GPR1*8+16]
-mulpd     FPR3, [STR2 + GPR1*8+32]
-subpd     FPR3, [STR3 + GPR1*8+32]
-mulpd     FPR4, [STR2 + GPR1*8+48]
-subpd     FPR4, [STR3 + GPR1*8+48]
-end:
-movaps    [STR0 + GPR1*8], FPR1
-movaps    [STR0 + GPR1*8+16], FPR2
-movaps    [STR0 + GPR1*8+32], FPR3
-movaps    [STR0 + GPR1*8+48], FPR4
-
diff --git a/bench/x86-64/peak.ptt b/bench/x86-64/peak.ptt
deleted file mode 100644
index c03e2c8..0000000
--- a/bench/x86-64/peak.ptt
+++ /dev/null
@@ -1,49 +0,0 @@
-STREAMS 2
-TYPE DOUBLE
-FLOPS 2
-BYTES 16
-INC 8
-movaps FPR1, [SCALAR]
-sub  GPR2, 4
-sub  STR0, 32
-sub  STR1, 32
-mov   GPR1, GPR2
-neg   GPR1
-.align 16
-1:
-movaps    FPR2, [STR0 + GPR1 * 8 ]
-addpd     FPR2, FPR1
-mulpd     FPR2, FPR1
-movaps    FPR6, [STR0 + GPR1 * 8 ]
-addpd     FPR2, FPR1
-mulpd     FPR2, FPR1
-pshufd    FPR2, FPR1, 0x1
-#movaps    [STR1 + GPR1 * 8], FPR2
-movaps    FPR3, [STR0 + GPR1 * 8 + 16]
-addpd     FPR3, FPR1
-mulpd     FPR3, FPR1
-movaps    FPR7, [STR0 + GPR1 * 8 + 16 ]
-addpd     FPR3, FPR1
-mulpd     FPR3, FPR1
-pshufd    FPR3, FPR1, 0x1
-#movaps    [STR1 + GPR1 * 8 + 16], FPR3
-movaps    FPR4, [STR0 + GPR1 * 8 + 32]
-addpd     FPR4, FPR1
-mulpd     FPR4, FPR1
-movaps    FPR8, [STR0 + GPR1 * 8 + 32 ]
-addpd     FPR4, FPR1
-mulpd     FPR4, FPR1
-pshufd    FPR4, FPR1, 0x1
-#movaps    [STR1 + GPR1 * 8 + 32], FPR4
-movaps    FPR5, [STR0 + GPR1 * 8 + 48]
-addpd     FPR5, FPR1
-mulpd     FPR5, FPR1
-movaps    FPR9, [STR0 + GPR1 * 8 + 48 ]
-addpd     FPR5, FPR1
-mulpd     FPR5, FPR1
-pshufd    FPR5, FPR1, 0x1
-#movaps    [STR1 + GPR1 * 8 + 48], FPR5
-add GPR1, 8
-js 1b
-
-
diff --git a/bench/x86-64/peak_avx.ptt b/bench/x86-64/peak_avx.ptt
deleted file mode 100644
index 047178e..0000000
--- a/bench/x86-64/peak_avx.ptt
+++ /dev/null
@@ -1,49 +0,0 @@
-STREAMS 2
-TYPE DOUBLE
-FLOPS 2
-BYTES 16
-INC 16
-vmovaps ymm1, [SCALAR]
-sub  GPR2, 8
-sub  STR0, 64
-sub  STR1, 64
-mov   GPR1, GPR2
-neg   GPR1
-.align 32
-1:
-vmovaps    ymm2, [STR0 + GPR1 * 8 ]
-vaddpd     ymm2, ymm2, ymm1
-vmulpd     ymm2, ymm2, ymm1
-vmovaps    ymm6, [STR0 + GPR1 * 8 ]
-vaddpd     ymm2, ymm2, ymm1
-vmulpd     ymm2, ymm2, ymm1
-#vpshufd    ymm2, ymm1, 0x1
-vmovaps    [STR1 + GPR1 * 8], ymm2
-vmovaps    ymm3, [STR0 + GPR1 * 8 + 32]
-vaddpd     ymm3, ymm3, ymm1
-vmulpd     ymm3, ymm3, ymm1
-vmovaps    ymm7, [STR0 + GPR1 * 8 + 32 ]
-vaddpd     ymm3, ymm3, ymm1
-vmulpd     ymm3, ymm3, ymm1
-#vpshufd    ymm3, ymm1, 0x1
-vmovaps    [STR1 + GPR1 * 8 + 32], ymm3
-vmovaps    ymm4, [STR0 + GPR1 * 8 + 64]
-vaddpd     ymm4, ymm4, ymm1
-vmulpd     ymm4, ymm4, ymm1
-vmovaps    ymm8, [STR0 + GPR1 * 8 + 64 ]
-vaddpd     ymm4, ymm4, ymm1
-vmulpd     ymm4, ymm4, ymm1
-#vpshufd    ymm4, ymm1, 0x1
-vmovaps    [STR1 + GPR1 * 8 + 32], ymm4
-vmovaps    ymm5, [STR0 + GPR1 * 8 + 96]
-vaddpd     ymm5, ymm5, ymm1
-vmulpd     ymm5, ymm5, ymm1
-vmovaps    ymm9, [STR0 + GPR1 * 8 + 96]
-vaddpd     ymm5, ymm5, ymm1
-vmulpd     ymm5, ymm5, ymm1
-#vpshufd    ymm5, ymm1, 0x1
-vmovaps    [STR1 + GPR1 * 8 + 96], ymm5
-add GPR1, 16
-js 1b
-
-
diff --git a/bench/x86-64/peak_sse.ptt b/bench/x86-64/peak_sse.ptt
deleted file mode 100644
index c03e2c8..0000000
--- a/bench/x86-64/peak_sse.ptt
+++ /dev/null
@@ -1,49 +0,0 @@
-STREAMS 2
-TYPE DOUBLE
-FLOPS 2
-BYTES 16
-INC 8
-movaps FPR1, [SCALAR]
-sub  GPR2, 4
-sub  STR0, 32
-sub  STR1, 32
-mov   GPR1, GPR2
-neg   GPR1
-.align 16
-1:
-movaps    FPR2, [STR0 + GPR1 * 8 ]
-addpd     FPR2, FPR1
-mulpd     FPR2, FPR1
-movaps    FPR6, [STR0 + GPR1 * 8 ]
-addpd     FPR2, FPR1
-mulpd     FPR2, FPR1
-pshufd    FPR2, FPR1, 0x1
-#movaps    [STR1 + GPR1 * 8], FPR2
-movaps    FPR3, [STR0 + GPR1 * 8 + 16]
-addpd     FPR3, FPR1
-mulpd     FPR3, FPR1
-movaps    FPR7, [STR0 + GPR1 * 8 + 16 ]
-addpd     FPR3, FPR1
-mulpd     FPR3, FPR1
-pshufd    FPR3, FPR1, 0x1
-#movaps    [STR1 + GPR1 * 8 + 16], FPR3
-movaps    FPR4, [STR0 + GPR1 * 8 + 32]
-addpd     FPR4, FPR1
-mulpd     FPR4, FPR1
-movaps    FPR8, [STR0 + GPR1 * 8 + 32 ]
-addpd     FPR4, FPR1
-mulpd     FPR4, FPR1
-pshufd    FPR4, FPR1, 0x1
-#movaps    [STR1 + GPR1 * 8 + 32], FPR4
-movaps    FPR5, [STR0 + GPR1 * 8 + 48]
-addpd     FPR5, FPR1
-mulpd     FPR5, FPR1
-movaps    FPR9, [STR0 + GPR1 * 8 + 48 ]
-addpd     FPR5, FPR1
-mulpd     FPR5, FPR1
-pshufd    FPR5, FPR1, 0x1
-#movaps    [STR1 + GPR1 * 8 + 48], FPR5
-add GPR1, 8
-js 1b
-
-
diff --git a/bench/x86-64/peakflops.ptt b/bench/x86-64/peakflops.ptt
deleted file mode 100644
index 94c769a..0000000
--- a/bench/x86-64/peakflops.ptt
+++ /dev/null
@@ -1,37 +0,0 @@
-STREAMS 2
-TYPE DOUBLE
-FLOPS 2
-BYTES 16
-INC 8
-movaps FPR1, [SCALAR]
-sub  GPR2, 4
-sub  STR0, 32
-sub  STR1, 32
-mov   GPR1, GPR2
-neg   GPR1
-.align 32
-1:
-movaps    FPR2, [STR0 + GPR1 * 8 ]
-addpd     FPR2, FPR1
-mulpd     FPR2, FPR1
-addpd     FPR2, FPR1
-mulpd     FPR2, FPR1
-movaps    FPR3, [STR0 + GPR1 * 8 + 16]
-add GPR1, 8
-addpd     FPR3, FPR1
-mulpd     FPR3, FPR1
-addpd     FPR3, FPR1
-mulpd     FPR3, FPR1
-movaps    FPR4, [STR0 + GPR1 * 8 - 32]
-addpd     FPR4, FPR1
-mulpd     FPR4, FPR1
-addpd     FPR4, FPR1
-mulpd     FPR4, FPR1
-movaps    FPR5, [STR0 + GPR1 * 8 - 16]
-addpd     FPR5, FPR1
-mulpd     FPR5, FPR1
-addpd     FPR5, FPR1
-mulpd     FPR5, FPR1
-js 1b
-
-
diff --git a/bench/x86-64/peakflops_avx.ptt b/bench/x86-64/peakflops_avx.ptt
deleted file mode 100644
index d9f9885..0000000
--- a/bench/x86-64/peakflops_avx.ptt
+++ /dev/null
@@ -1,37 +0,0 @@
-STREAMS 2
-TYPE DOUBLE
-FLOPS 2
-BYTES 16
-INC 16
-vmovaps ymm1, [SCALAR]
-sub  GPR2, 8
-sub  STR0, 64
-sub  STR1, 64
-mov   GPR1, GPR2
-neg   GPR1
-.align 32
-1:
-vmovaps    ymm2, [STR0 + GPR1 * 8 ]
-vaddpd     ymm2, ymm2, ymm1
-vmulpd     ymm2, ymm2, ymm1
-vaddpd     ymm2, ymm2, ymm1
-vmulpd     ymm2, ymm2, ymm1
-vmovaps    ymm3, [STR0 + GPR1 * 8 + 32]
-add GPR1, 16
-vaddpd     ymm3, ymm3, ymm1
-vmulpd     ymm3, ymm3, ymm1
-vaddpd     ymm3, ymm3, ymm1
-vmulpd     ymm3, ymm3, ymm1
-vmovaps    ymm4, [STR0 + GPR1 * 8 - 64]
-vaddpd     ymm4, ymm4, ymm1
-vmulpd     ymm4, ymm4, ymm1
-vaddpd     ymm4, ymm4, ymm1
-vmulpd     ymm4, ymm4, ymm1
-vmovaps    ymm5, [STR0 + GPR1 * 8 - 32]
-vaddpd     ymm5, ymm5, ymm1
-vmulpd     ymm5, ymm5, ymm1
-vaddpd     ymm5, ymm5, ymm1
-vmulpd     ymm5, ymm5, ymm1
-js 1b
-
-
diff --git a/bench/x86-64/peakflops_sse.ptt b/bench/x86-64/peakflops_sse.ptt
deleted file mode 100644
index 94c769a..0000000
--- a/bench/x86-64/peakflops_sse.ptt
+++ /dev/null
@@ -1,37 +0,0 @@
-STREAMS 2
-TYPE DOUBLE
-FLOPS 2
-BYTES 16
-INC 8
-movaps FPR1, [SCALAR]
-sub  GPR2, 4
-sub  STR0, 32
-sub  STR1, 32
-mov   GPR1, GPR2
-neg   GPR1
-.align 32
-1:
-movaps    FPR2, [STR0 + GPR1 * 8 ]
-addpd     FPR2, FPR1
-mulpd     FPR2, FPR1
-addpd     FPR2, FPR1
-mulpd     FPR2, FPR1
-movaps    FPR3, [STR0 + GPR1 * 8 + 16]
-add GPR1, 8
-addpd     FPR3, FPR1
-mulpd     FPR3, FPR1
-addpd     FPR3, FPR1
-mulpd     FPR3, FPR1
-movaps    FPR4, [STR0 + GPR1 * 8 - 32]
-addpd     FPR4, FPR1
-mulpd     FPR4, FPR1
-addpd     FPR4, FPR1
-mulpd     FPR4, FPR1
-movaps    FPR5, [STR0 + GPR1 * 8 - 16]
-addpd     FPR5, FPR1
-mulpd     FPR5, FPR1
-addpd     FPR5, FPR1
-mulpd     FPR5, FPR1
-js 1b
-
-
diff --git a/bench/x86-64/stream_avx.ptt b/bench/x86-64/stream_avx.ptt
index 8fbaf7c..504430b 100644
--- a/bench/x86-64/stream_avx.ptt
+++ b/bench/x86-64/stream_avx.ptt
@@ -1,22 +1,22 @@
 STREAMS 3
 TYPE SINGLE
-FLOPS 4
-BYTES 48
-vbroadcastss ymm1, [SCALAR]
-LOOP 8
-vmovaps   ymm2, [STR1 + GPR1*8]
-vmovaps   ymm3, [STR1 + GPR1*8+16]
-vmovaps   ymm4, [STR1 + GPR1*8+32]
-vmovaps   ymm5, [STR1 + GPR1*8+48]
+FLOPS 2
+BYTES 12
+vmovaps ymm1, [SCALAR]
+LOOP 32
+vmovaps   ymm2, [STR1 + GPR1*4]
+vmovaps   ymm3, [STR1 + GPR1*4+32]
+vmovaps   ymm4, [STR1 + GPR1*4+64]
+vmovaps   ymm5, [STR1 + GPR1*4+96]
 vmulps    ymm2, ymm2, ymm1
-vaddps    ymm2, ymm2, [STR2 + GPR1*8]
+vaddps    ymm2, ymm2, [STR2 + GPR1*4]
 vmulps    ymm3, ymm3, ymm1
-vaddps    ymm3, ymm3, [STR2 + GPR1*8]
+vaddps    ymm3, ymm3, [STR2 + GPR1*4+32]
 vmulps    ymm4, ymm4, ymm1
-vaddps    ymm4, ymm4, [STR2 + GPR1*8]
+vaddps    ymm4, ymm4, [STR2 + GPR1*4+64]
 vmulps    ymm5, ymm5, ymm1
-vaddps    ymm5, ymm5, [STR2 + GPR1*8]
-vmovaps   [STR0 + GPR1*8], ymm2
-vmovaps   [STR0 + GPR1*8+16], ymm3
-vmovaps   [STR0 + GPR1*8+32], ymm4
-vmovaps   [STR0 + GPR1*8+48], ymm5
+vaddps    ymm5, ymm5, [STR2 + GPR1*4+96]
+vmovaps   [STR0 + GPR1*4], ymm2
+vmovaps   [STR0 + GPR1*4+32], ymm3
+vmovaps   [STR0 + GPR1*4+64], ymm4
+vmovaps   [STR0 + GPR1*4+96], ymm5
diff --git a/bench/x86-64/triad_avx.ptt b/bench/x86-64/triad_avx.ptt
index 3514cfd..55a97cd 100644
--- a/bench/x86-64/triad_avx.ptt
+++ b/bench/x86-64/triad_avx.ptt
@@ -1,12 +1,12 @@
 STREAMS 4
 TYPE DOUBLE
 FLOPS 2
-BYTES 16
-LOOP 32
-vmovapd ymm1, [STR1 + GPR1]
-vmovapd ymm2, [STR2 + GPR1]
-vmovapd ymm3, [STR3 + GPR1]
+BYTES 32
+LOOP 4
+vmovapd ymm1, [STR1 + GPR1*8]
+vmovapd ymm2, [STR2 + GPR1*8]
+vmovapd ymm3, [STR3 + GPR1*8]
 vmulpd  ymm0, ymm1, ymm2
 vaddpd  ymm0, ymm0, ymm3
-vmovapd [STR0 + GPR1], ymm0
+vmovapd [STR0 + GPR1*8], ymm0
 
diff --git a/config.mk b/config.mk
index 2c3f3be..e6128d7 100644
--- a/config.mk
+++ b/config.mk
@@ -1,6 +1,6 @@
 # Please have a look in INSTALL and the WIKI for details on
 # configuration options setup steps.
-# supported: GCC, GCCX86, MIC (ICC)
+# supported: GCC, MIC (ICC)
 COMPILER = GCC#NO SPACE
 
 # Define the color of the likwid-pin output
@@ -14,16 +14,21 @@ MANPREFIX = $(PREFIX)/man#NO SPACE
 
 # For the daemon based secure msr/pci access configure
 # the absolute path to the msr daemon executable.
-# Usually you can leave this to the default.
+# $(PREFIX)/bin/likwid-accessD
 ACCESSDAEMON = $(PREFIX)/sbin/likwid-accessD#NO SPACE
 
+# Build the accessDaemon. Have a look in the WIKI for details.
+BUILDDAEMON = true#NO SPACE
+
+#Build the setFrequencies tool
+BUILDFREQ = true#NO SPACE
 # Set the default mode for MSR access.
 # This can usually be overriden on the commandline.
 # Valid values are: direct, accessdaemon
-ACCESSMODE = accessdaemon#NO SPACE
+ACCESSMODE = accessdaemon#NO SPACE
 
 # Change to true to a build shared library instead of a static one
-SHARED_LIBRARY = false#NO SPACE
+SHARED_LIBRARY = true#NO SPACE
 
 # Build Fortran90 module interface for marker API. Adopt Fortran compiler
 # in ./make/include_<COMPILER>.mk if necessary. Default: ifort .
@@ -32,17 +37,19 @@ FORTRAN_INTERFACE = false#NO SPACE
 # Instrument likwid-bench for use with likwid-perfctr
 INSTRUMENT_BENCH = false#NO SPACE
 
+# Use Portable Hardware Locality (hwloc) instead of CPUID
+USE_HWLOC = true#NO SPACE
+
 # Usually you do not need to edit below
 MAX_NUM_THREADS = 263
-MAX_NUM_NODES = 4
-HASH_TABLE_SIZE = 20
+MAX_NUM_NODES = 64
 CFG_FILE_PATH = /etc/likwid.cfg
 
 # Versioning Information
-VERSION = 3
-RELEASE = 1
-DATE    = 12.2.2014
+VERSION = 4
+RELEASE = 0
+DATE    = 16.06.2015
 
 LIBLIKWIDPIN = $(abspath $(PREFIX)/lib/liblikwidpin.so)
-LIKWIDFILTERPATH = $(abspath $(PREFIX)/share/likwid)
+LIKWIDFILTERPATH = $(abspath $(PREFIX)/share/likwid/filter)
 
diff --git a/debian/changelog b/debian/changelog
index 50c5120..5b05f9d 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,3 +1,9 @@
+likwid (4.0.0-1) unstable; urgency=medium
+
+  * new version
+
+ -- Thomas Roehl <Thomas.Roehl at fau.de>  Fri, 06 Nov 2015 14:11:05 +0200
+
 likwid (3.1.3+dfsg1-1) unstable; urgency=medium
 
   * source repack to remove non DFSG files
diff --git a/debian/control b/debian/control
index 0d4ff90..ec4359f 100644
--- a/debian/control
+++ b/debian/control
@@ -4,7 +4,7 @@ Section: misc
 Priority: optional
 Standards-Version: 3.9.6
 Build-Depends: debhelper (>= 9), dpkg-dev (>= 1.16.1~), gfortran
-Homepage: http://code.google.com/p/likwid/wiki/Introduction
+Homepage: https://github.com/RRZE-HPC/likwid/wiki
 Vcs-Git: git://anonscm.debian.org/likwid/likwid.git
 Vcs-Browser: http://anonscm.debian.org/cgit/likwid/likwid.git;a=summary
 
diff --git a/debian/copyright b/debian/copyright
index c4b2fcc..8e2c844 100644
--- a/debian/copyright
+++ b/debian/copyright
@@ -4,9 +4,9 @@ Source: http://ftp.fau.de/pub/likwid/
 Files-Excluded: test/stream.c
 
 Files: *
-Copyright: 2014 Jan Treibig (jt), jan.treibig at gmail.com
+Copyright: 2015 Thomas Roehl (tr), Thomas.Roehl at fau.de
 License: GPL-3+
- Copyright (C) 2014 Jan Treibig and Thomas Roehl
+ Copyright (C) 2015 Jan Treibig and Thomas Roehl
  .
  This program is free software: you can redistribute it and/or modify it
  under the terms of the GNU General Public License as published by the
diff --git a/debian/docs b/debian/docs
index e845566..ff37da8 100644
--- a/debian/docs
+++ b/debian/docs
@@ -1 +1,2 @@
 README
+INSTALL
diff --git a/debian/files b/debian/files
new file mode 100644
index 0000000..26b9e2a
--- /dev/null
+++ b/debian/files
@@ -0,0 +1 @@
+likwid_4.0.0-1_amd64.deb misc optional
diff --git a/debian/likwid.lintian-overrides b/debian/likwid.lintian-overrides
index 633bd56..e7d12c1 100644
--- a/debian/likwid.lintian-overrides
+++ b/debian/likwid.lintian-overrides
@@ -1,3 +1,3 @@
 # We want to include the library in the main package
-likwid: package-name-doesnt-match-sonames liblikwidpin3
-likwid: non-dev-pkg-with-shlib-symlink usr/lib/x86_64-linux-gnu/liblikwidpin.so.3.1 usr/lib/x86_64-linux-gnu/liblikwidpin.so
+#likwid: package-name-doesnt-match-sonames liblikwidpin4
+#likwid: non-dev-pkg-with-shlib-symlink usr/lib/x86_64-linux-gnu/liblikwidpin.so.3.1 usr/lib/x86_64-linux-gnu/liblikwidpin.so
diff --git a/debian/likwid.symbols b/debian/likwid.symbols
deleted file mode 100644
index 7ed7a6e..0000000
--- a/debian/likwid.symbols
+++ /dev/null
@@ -1,2 +0,0 @@
-liblikwidpin.so.3 likwid #MINVER#
- pthread_create at Base 3.1.2
diff --git a/debian/patches/01-manpages.patch b/debian/patches/01-manpages.patch
index 22ba369..39cfaf5 100644
--- a/debian/patches/01-manpages.patch
+++ b/debian/patches/01-manpages.patch
@@ -1,451 +1,124 @@
---- /dev/null
-+++ b/doc/feedGnuplot.1
-@@ -0,0 +1,190 @@
-+.TH feedGnuplot 1 <DATE> likwid\-<VERSION>
+Description: Add man page for integrated Lua interpreter
+Author: Thomas Roehl <thomas.roehl at fau.de>$
+Last-Update: 2015-11-15$
+---$
+This patch header follows DEP-3: http://dep.debian.net/deps/dep3/$
+--- /dev/null	2015-11-02 08:31:39.219640753 +0100
++++ doc/likwid-lua.1	2015-11-06 15:03:28.584036000 +0100
+@@ -0,0 +1,116 @@
++.\" $Id: lua.man,v 1.13 2011/11/16 17:16:53 lhf Exp $
++.TH LUA 1 "$Date: 2011/11/16 17:16:53 $"
 +.SH NAME
-+feedGnuplot \- General purpose pipe-oriented plotting tool
++lua \- Lua interpreter
 +.SH SYNOPSIS
-+.B likwid-setFreq 
-+.IR <coreId>
-+.IR <frequency>
-+.IR [<governor>]
-+
++.B lua
++[
++.I options
++]
++[
++.I script
++[
++.I args
++]
++]
 +.SH DESCRIPTION
-+.B feedGnuplot
-+is a pipe-oriented plotting frontend for GNUplot that can read internediate results and create a sort of live plot of the data.
-+.B feedGnuplot
-+is used by
-+.B likwid-perfscope(1)
-+to print performance counter data printed out by the timeline daemon mode of
-+.B likwid-perfctr(1).
-+The Perl script
-+.B feedGnuplot
-+is not written by the LIKWID Authors, it was written by Dima Kogan and published under GPL. The original web page is https://github.com/dkogan/feedgnuplot
++.B lua
++is the standalone Lua interpreter.
++It loads and executes Lua programs,
++either in textual source form or
++in precompiled binary form.
++(Precompiled binaries are output by
++.BR luac ,
++the Lua compiler.)
++.B lua
++can be used as a batch interpreter and also interactively.
++.LP
++The given
++.I options
++are handled in order and then
++the Lua program in file
++.I script
++is loaded and executed.
++The given
++.I args
++are available to
++.I script
++as strings in a global table named
++.BR arg .
++If no options or arguments are given,
++then
++.B "\-v \-i"
++is assumed when the standard input is a terminal;
++otherwise,
++.B "\-"
++is assumed.
++.LP
++In interactive mode,
++.B lua
++prompts the user,
++reads lines from the standard input,
++and executes them as they are read.
++If a line does not contain a complete statement,
++then a secondary prompt is displayed and
++lines are read until a complete statement is formed or
++a syntax error is found.
++If a line starts with
++.BR '=' ,
++then
++.B lua
++evaluates and displays
++the values of the expressions in the remainder of the line.
++.LP
++At the very start,
++before even handling the command line,
++.B lua
++checks the contents of the environment variables
++.B LUA_INIT_5_2
++or
++.BR LUA_INIT ,
++in that order.
++If the contents is of the form
++.RI '@ filename ',
++then
++.I filename
++is executed.
++Otherwise, the string is assumed to be a Lua statement and is executed.
 +.SH OPTIONS
 +.TP
-+.B \-h
-+prints a help message to standard output, then exits.#
++.BI \-e " stat"
++execute statement
++.IR stat .
 +.TP
-+.B \-\-[no]domain
-+If enabled, the first element of each line is the domain variable.  If not, the point index is used.
++.B \-i
++enter interactive mode after executing
++.IR script .
 +.TP
-+.B \-\-[no]dataid
-+If enabled, each data point is preceded by the ID of the data set that point corresponds to. This ID is
-+interpreted as a string, NOT as just a number. If not enabled, the order of the point is used.
-+.TP
-+.B \-\-[no]3d
-+Do [not] plot in 3D. This only makes sense with 
-+.B --domain.
-+Each domain here is an (x,y) tuple.
-+.TP
-+.B \-\-colormap
-+Show a colormapped xy plot. Requires extra data for the color. zmin/zmax can be used to set the extents of the colors.
-+Automatically increments extraValuesPerPoint.
-+.TP
-+.B \-\-[no]stream
-+Do [not] display the data a point at a time, as it comes in.
-+.TP
-+.B \-\-[no]lines
-+Do [not] draw lines to connect consecutive points.
-+.TP
-+.B \-\-[no]points
-+Do [not] draw points.
-+.TP
-+.B \-\-circles
-+Plot with circles. This requires a radius be specified for each point. Automatically increments extraValuesPerPoint.
-+.TP
-+.B \-\-xlabel " xxx
-+Set x-axis label.
-+.TP
-+.B \-\-ylabel " xxx
-+Set y-axis label.
-+.TP
-+.B \-\-y2label " xxx
-+Set y2-axis label. Does not apply to 3d plots.
-+.TP
-+.B \-\-zlabel " xxx
-+Set z-axis label. Only applies to 3d plots.
-+.TP
-+.B \-\-title " xxx
-+Set the title of the plot.
-+.TP
-+.B \-\-legend " curveID=legend
-+Set the label for a curve plot. Use this option multiple times for multiple curves. With 
-+.B --dataid
-+, curveID is the ID. Otherwise, it's the index of the curve, starting at 0.
-+.TP
-+.B \-\-autolegend
-+Use the curve IDs for the legend. Titles given with
-+.B --legend
-+override these.
-+.TP
-+.B \-\-xlen " xxx
-+When using 
-+.B --stream
-+, sets the size of the x-window to plot. Omit this or set it to 0 to plot ALL the data. Does not make sense with 3d plots. Implies
-+.B --monotonic
-+.TP
-+.B \-\-xmin " xxx
-+Set the minimal point in range for the x-axis. These are ignored in a streaming plot.
-+.TP
-+.B \-\-xmax " xxx
-+Set the maximal point in range for the x-axis. These are ignored in a streaming plot.
-+.TP
-+.B \-\-ymin " xxx
-+Set the minimal point in range for the y-axis.
-+.TP
-+.B \-\-ymax " xxx
-+Set the maximal point in range for the y-axis.
-+.TP
-+.B \-\-y2min " xxx
-+Set the minimal point in range for the y2-axis. Does not apply to 3d plots.
-+.TP
-+.B \-\-y2max " xxx
-+Set the maximal point in range for the y2-axis. Does not apply to 3d plots.
-+.TP
-+.B \-\-zmin " xxx
-+Set the minimal point in range for the z-axis. Only applies to 3d plots or colormaps.
-+.TP
-+.B \-\-zmax " xxx
-+Set the maximal point in range for the z-axis. Only applies to 3d plots or colormaps.
-+.TP
-+.B \-\-y2 " xxx
-+Plot the data specified by this curve ID on the y2 axis. Without
-+.B --dataid
-+, the ID is just an ordered 0-based index. Does not apply to 3d plots.
-+.TP
-+.B \-\-curvestyle " curveID=style
-+Additional styles per curve. With
-+.B --dataid
-+, curveID is the ID. Otherwise, it's the index of the curve, starting at 0. Use this option multiple times for multiple curves.
-+.TP
-+.B \-\-curvestyleall " xxx
-+Additional styles for ALL curves.
-+.TP
-+.B \-\-extracmds " xxx
-+Additional commands. These could contain extra global styles for instance.
-+.TP
-+.B \-\-size " xxx
-+Gnuplot size option.
-+.TP
-+.B \-\-square
-+Plot data with aspect ratio 1. For 3D plots, this controls the aspect ratio for all 3 axes.
-+.TP
-+.B \-\-square_xy
-+For 3D plots, set square aspect ratio for ONLY the x,y axes.
-+.TP
-+.B \-\-hardcopy " xxx
-+If not streaming, output to a file specified here. Format inferred from filename.
-+.TP
-+.B \-\-maxcurves " xxx
-+The maximum allowed number of curves. This is 100 by default, but can be reset with this option. This exists purely to prevent perl from allocating all of the system's memory when reading bogus data.
-+.TP
-+.B \-\-monotonic
-+If
-+.B --domain
-+is given, checks to make sure that the x-coordinate in the input data is monotonically increasing.If a given x-variable is in the past, all data currently cached for this curve is purged. Without 
-+.B --monotonic
-+, all data is kept. Does not make sense with 3d plots. No 
-+.B --monotonic
-+by default.
-+.TP
-+.B \-\-extraValuesPerPoint " xxx
-+How many extra values are given for each data point. Normally this is 0, and does not need to be specified, but sometimes we want extra data, like for colors or point sizes or error bars, etc.
-+.B feedGnuplot
-+options that require this (colormap, circles) automatically set it. This option is ONLY needed if unknown styles are used, with 
-+.B --curvestyleall
-+for instance.
-+.TP
-+.B \-\-dump
-+Instead of printing to gnuplot, print to STDOUT. For debugging.
-+
-+.SH EXAMPLE
-+.IP 1. 4
-+Simple real-time plotting example: plot how much data is received on the wlan0 network interface in bytes/second
-+.TP
-+.B while true; do sleep 1; cat /proc/net/dev; done | gawk '/wlan0/ {if(b) {print $2-b; fflush()} b=$2}' | \\
-+.B feedgnuplot --lines --stream --xlen 10 --ylabel 'Bytes/sec' --xlabel seconds
-+.PP
-+Reads the stats of the network interface 'wlan0' every second, reformats it with
-+.B gawk
-+and pipes the formated output into
-+.B feedGnuplot
-+qto create a line plot (
-+.B --lines
-+) of the streaming input (
-+.B --stream
-+). Always show the last 10 seconds (
-+.B --xlen
-+) and use the labels 'seconds' for the x-axis and 'Bytes/sec' for the y-axis.
-+.IP 2. 4
-+Simple real-time plotting example: plot the 'idle' CPU consumption against time
-+.TP
-+.B sar 1 -1 | awk '$1 ~ /..:..:../ && $8 ~/^[0-9\.]*$/ {print $1,$8; fflush()}' | \\
-+.B feedgnuplot --stream --domain --lines --timefmt '%H:%M:%S' --set 'format x "%H:%M:%S"'
-+.PP
-+Reads the CPU IDLE consumption and sets the current time as x-axis key.
-+
-+.SH AUTHOR
-+Written by Dima Kogan <dima at secretsauce.net>.
-+.SH BUGS
-+Report Bugs on <https://github.com/dkogan/feedgnuplot/issues>.
-+.SH "SEE ALSO"
-+gnuplot(1), awk(1), sar(1),  likwid-perfscope(1), likwid-perfctr(1)
---- /dev/null
-+++ b/doc/likwid-accessD.1
-@@ -0,0 +1,22 @@
-+.TH LIKWID-ACCESSD 1 <DATE> likwid\-<VERSION>
-+.SH NAME
-+likwid-accessD \- This tool forwards the access operations from LIKWID PerfMon tools
-+to the MSR device files
-+.SH DESCRIPTION
-+.B likwid-accessD
-+is a command line application that opens a UNIX file socket and waits for access
-+operations from LIKWID tools that require access to the MSR and PCI device
-+files. The MSR and PCI device files are only accessible for users with root
-+privileges, therefore
-+.B likwid-accessD
-+requires the suid-bit set.
-+Depending on the current system architecture,
-+.B likwid-accessD
-+permits only access to registers defined for the architecture.
-+
-+.SH AUTHOR
-+Written by Thomas Roehl <thomas.roehl at gmail.com>.
-+.SH BUGS
-+Report Bugs on <http://code.google.com/p/likwid/issues/list>.
-+.SH "SEE ALSO"
-+likwid-perfctr(1), likwid-powermeter(1), likwid-features(1), likwid-pin(1), likwid-topology(1),
---- /dev/null
-+++ b/doc/likwid-genCfg.1
-@@ -0,0 +1,30 @@
-+.TH LIKWID-GENCFG 1 <DATE> likwid\-<VERSION>
-+.SH NAME
-+likwid-genCfg \- Get system topology and write them to file for faster LIKWID startup
-+.SH SYNOPSIS
-+.B likwid-genCfg
-+.RB [\-hv]
-+.RB [ \-o
-+.IR <filename>]
-+.SH DESCRIPTION
-+.B likwid-genCfg
-+is a command line application that stores the system's CPU and NUMA topology to
-+file. LIKWID applications use this file to read in the topology fast instead of
-+re-gathering all values. The default output path is /etc/likwid.cfg.
-+.SH OPTIONS
-+.TP
-+.B \-h
-+prints a help message to standard output, then exits.
++.BI \-l " name"
++execute the equivalent of
++.IB name =require(' name ')
++before executing
++.IR script .
 +.TP
 +.B \-v
-+prints a version message to standard output, then exits.
-+.TP
-+.B \-\^o " <filename>
-+sets output file path (optional)
-+
-+.SH AUTHOR
-+Written by Thomas Roehl <thomas.roehl at gmail.com>.
-+.SH BUGS
-+Report Bugs on <http://code.google.com/p/likwid/issues/list>.
-+.SH "SEE ALSO"
-+likwid-topology(1), likwid-perfctr(1), likwid-features(1), likwid-pin(1), likwid-powermeter(1)
---- /dev/null
-+++ b/doc/likwid-memsweeper.1
-@@ -0,0 +1,28 @@
-+.TH LIKWID-MEMSWEEPER 1 <DATE> likwid\-<VERSION>
-+.SH NAME
-+likwid-memsweeper \- A tool to clean up NUMA memory domains and last level caches.
-+.SH SYNOPSIS
-+.B likwid-memsweeper
-+.RB [\-hv]
-+.RB [ \-c
-+.IR <NUMA_ID> ]
-+.SH DESCRIPTION
-+.B likwid-memsweeper
-+is a command line application to shrink the file buffer cache by filling the NUMA domain with random pages. Moreover the tool invalidates all cachelines in the LLC.
-+.SH OPTIONS
-+.TP
-+.B \-h
-+prints a help message to standard output, then exits.
-+.TP
-+.B \-v
-+prints a version message to standard output, then exits.
-+.TP
-+.B \-\^c " <NUMA_ID>
-+set the NUMA domain for sweeping.
-+
-+.SH AUTHOR
-+Written by Thomas Roehl <thomas.roehl at gmail.com>.
-+.SH BUGS
-+Report Bugs on <http://code.google.com/p/likwid/issues/list>.
-+.SH "SEE ALSO"
-+likwid-perfctr(1), likwid-features(1), likwid-pin(1), likwid-powermeter(1), likwid-topology(1),
---- /dev/null
-+++ b/doc/likwid-mpirun.1
-@@ -0,0 +1,81 @@
-+.TH LIKWID-MPIRUN 1 <DATE> likwid\-<VERSION>
-+.SH NAME
-+likwid-mpirun \- A tool to start and monitor MPI applications with LIKWID
-+.SH SYNOPSIS
-+.B likwid-memsweeper
-+.RB [\-hd]
-+.RB [ \-hostfile
-+.IR filename ]
-+.RB [ \-nperdomain
-+.IR number_of_processes_in_domain ]
-+.RB [ \-pin
-+.IR expression ]
-+.RB [ \-omp
-+.IR expression ]
-+.RB [ \-mpi
-+.IR expression ]
-+.RB [\-\-]
-+.SH DESCRIPTION
-+.B likwid-mpirun
-+is a command line application that wraps the vendor-specific mpirun tool and adds calls to
-+.B likwid-perfctr(1)
-+to the execution string. The user-given application is ran, measured and the results returned to the staring node.
-+.SH OPTIONS
-+.TP
-+.B \-h
-+prints a help message to standard output, then exits.
-+.TP
-+.B \-d
-+prints debug messages to standard output.
++show version information.
 +.TP
-+.B \-\^hostfile " filename
-+specifies the nodes to schedule the MPI processes on
-+.TP
-+.B \-\^nperdomain " number_of_processes_in_domain
-+specifies the processes per affinity domain (see
-+.B likwid-pin
-+for info about affinity domains)
-+.TP
-+.B \-\^pin " expression
-+specifies the pinning for hybrid execution (see
-+.B likwid-pin
-+for info about affinity domains)
-+.TP
-+.B \-\^omp " expression
-+enables hybrid setup. Can only be used in combination with
-+.B -pin.
-+The only possible value is: intel
-+.TP
-+.B \-\^mpi " expression
-+specifies the MPI implementation that should be used by the wrapper. Possible values are intelmpi, openmpi and mvapich2
++.B \-E
++ignore environment variables.
 +.TP
 +.B \-\-
-+stops parsing arguments for likwid-mpirun, in order to set options for underlying MPI implementation after \-\-.
-+
-+.SH EXAMPLE
-+.IP 1. 4
-+For standard application:
-+.TP
-+.B likwid-mpirun -np 32  ./myApp
-+.PP
-+Will run 32 MPI processes, each host is filled with as much processes as written in ppn
-+.IP 2. 4
-+With pinning:
-+.TP
-+.B likwid-mpirun -np 32 -nperdomain S:2  ./myApp
-+.PP
-+Will start 32 MPI processes with 2 processes per socket.
-+.IP 3. 4
-+For hybrid runs:
-+.TP
-+.B likwid-mpirun -np 32 -pin M0:0-3_M1:0-3  ./myApp
-+.PP
-+Will start 32 MPI processes with 2 processes per node. Threads of the first process are pinned to the cores 0-3 in NUMA domain 0 (M0). The OpenMP threads of the second process are pinned to the first four cores in NUMA domain 1 (M1)
-+
-+
-+.SH AUTHOR
-+Written by Thomas Roehl <thomas.roehl at gmail.com>.
-+.SH BUGS
-+Report Bugs on <http://code.google.com/p/likwid/issues/list>.
-+.SH "SEE ALSO"
-+likwid-pin(1), likwid-perfctr(1), likwid-features(1), likwid-powermeter(1), likwid-topology(1),
---- /dev/null
-+++ b/doc/likwid-perfscope.1
-@@ -0,0 +1,55 @@
-+.TH LIKWID-PERFSCOPE 1 <DATE> likwid\-<VERSION>
-+.SH NAME
-+likwid-perfscope \- Frontend for the timeline mode of
-+.B likwid-perfctr(1)
-+that on-the-fly generates pictures from the measurements
-+.SH SYNOPSIS
-+.B likwid-perfscope 
-+.RB [\-h]
-+.RB [ \-cores
-+.IR <cpu_list> ]
-+.RB [ \-freq
-+.IR <frequency> ]
-+.RB [ \-group
-+.IR <eventset> ]
-+.SH DESCRIPTION
-+.B likwid-perfscope
-+is a command line application written in Perl that uses the timeline daemon mode of
-+.B likwid-perfctr(1)
-+to create on-the-fly pictures with the current measurements. It uses the
-+.B feedGnuplot(1)
-+script to send the current data to gnuplot.
-+.SH OPTIONS
-+.TP
-+.B \-h
-+prints a help message to standard output, then exits.
-+.TP
-+.B \-\^cores " <cpu_list>
-+measures the given group on given CPUs in <cpu_list>
-+.TP
-+.B \-\^freq " <frequency>
-+reads the current performance values every <frequency>. Available suffixes are 's' and 'ms', e.g. 500ms. Default value is 1s
++stop handling options.
 +.TP
-+.B \-\^group " <eventset>
-+defines the events and counters that should be read. Possible values can be gathered from
-+.B likwid-perfctr(1).
-+Default is group 'FLOPS_DP'
-+
-+.SH EXAMPLE
-+.IP 1. 4
-+Monitor double precision floating-point operations:
-+.TP
-+.B likwid-perfscope -group FLOPS_DP -cores 0-3 -freq 500ms
-+.PP
-+Executes
-+.B likwid-perfctr
-+on the first four cores. The values are read every 500ms are forwarded to gnuplot using the
-+.B feedGnuplot
-+script.
-+
-+.SH AUTHOR
-+Written by Jan Treibig <jan.treibig at gmail.com>.
-+.SH BUGS
-+Report Bugs on <http://code.google.com/p/likwid/issues/list>.
-+.SH "SEE ALSO"
-+likwid-perfctr(1), feedGnuplot(1), likwid-pin(1), likwid-powermeter(1), likwid-setFrequencies(1)
---- /dev/null
-+++ b/doc/likwid-setFreq.1
-@@ -0,0 +1,24 @@
-+.TH LIKWID-SETFREQ 1 <DATE> likwid\-<VERSION>
-+.SH NAME
-+likwid-setFreq \- Mediator for
-+.B likwid-setFrequencies(1)
-+that performs the actual setting of CPU cores' frequency and governor.
-+.SH SYNOPSIS
-+.B likwid-setFreq 
-+.IR <coreId>
-+.IR <frequency>
-+.IR [<governor>]
-+
-+.SH DESCRIPTION
-+.B likwid-setFreq
-+is a command line application that mediates the request from
-+.B likwid-setFrequencies(1)
-+because setting a CPU core's frequency and/or governor requires root privileges. This executable must be suid-root.
-+
-+
-+.SH AUTHOR
-+Written by Jan Treibig <jan.treibig at gmail.com>.
-+.SH BUGS
-+Report Bugs on <http://code.google.com/p/likwid/issues/list>.
++.B \-
++stop handling options and execute the standard input as a file.
 +.SH "SEE ALSO"
-+likwid-setFrequencies(1), likwid-perfctr(1), feedGnuplot(1), likwid-pin(1), likwid-powermeter(1)
++.BR luac (1)
++.br
++The documentation at lua.org,
++especially section 7 of the reference manual.
++.SH DIAGNOSTICS
++Error messages should be self explanatory.
++.SH AUTHORS
++R. Ierusalimschy,
++L. H. de Figueiredo,
++W. Celes
++.\" EOF
diff --git a/debian/patches/03-Makefile-man.patch b/debian/patches/03-Makefile-man.patch
index 534381f..63ecd84 100644
--- a/debian/patches/03-Makefile-man.patch
+++ b/debian/patches/03-Makefile-man.patch
@@ -1,33 +1,46 @@
-Description: Add missing manual files
- Some manual pages are missing in common likwid-3.1.2 packet. This patch adds
- them to the install and uninstall target in Makefile
- Author: Thomas Roehl <thomas.roehl at fau.de>$
- Last-Update: 2014-10-28$
- ---$
- This patch header follows DEP-3: http://dep.debian.net/deps/dep3/$
---- a/Makefile
-+++ b/Makefile
-@@ -308,6 +308,13 @@ install:
- 	@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-pin.1 > $(MANPREFIX)/man1/likwid-pin.1
- 	@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-setFrequencies.1 > $(MANPREFIX)/man1/likwid-setFrequencies.1
- 	@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-bench.1 > $(MANPREFIX)/man1/likwid-bench.1
-+	@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/feedGnuplot.1 > $(MANPREFIX)/man1/feedGnuplot.1
-+	@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-accessD.1 > $(MANPREFIX)/man1/likwid-accessD.1
-+	@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-genCfg.1 > $(MANPREFIX)/man1/likwid-genCfg.1
-+	@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-memsweeper.1 > $(MANPREFIX)/man1/likwid-memsweeper.1
-+	@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-mpirun.1 > $(MANPREFIX)/man1/likwid-mpirun.1
-+	@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-perfscope.1 > $(MANPREFIX)/man1/likwid-perfscope.1
-+	@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-setFreq.1 > $(MANPREFIX)/man1/likwid-setFreq.1
+Description: Add some missing man pages to the install routine
+Author: Thomas Roehl <thomas.roehl at fau.de>$
+Last-Update: 2015-11-15$
+---$
+This patch header follows DEP-3: http://dep.debian.net/deps/dep3/$
+--- Makefile.orig	2015-11-06 15:19:47.129872000 +0100
++++ Makefile	2015-11-06 15:25:28.289781000 +0100
+@@ -1,3 +1,4 @@
++#w
+ #
+ # =======================================================================================
+ #
+@@ -268,10 +269,13 @@
+ 	@echo "===> INSTALL libraries to $(PREFIX)/lib"
+ 	@mkdir -p $(PREFIX)/lib
+ 	@install -m 755 liblikwid.so $(PREFIX)/lib
++	@ln -s liblikwid.so $(PREFIX)/lib/liblikwid.so.$(VERSION)
+ 	@install -m 644 liblikwid.a $(PREFIX)/lib
+ 	@install -m 755 liblikwidpin.so $(PREFIX)/lib
++	@ln -s liblikwidpin.so $(PREFIX)/lib/liblikwidpin.so.$(VERSION)
+ 	@install -m 644 ext/lua/liblua.a $(PREFIX)/lib
+ 	@install -m 755 ext/hwloc/libhwloc.so $(PREFIX)/lib
++	@ln -s libhwloc.so $(PREFIX)/lib/libhwloc.so.$(VERSION)
+ 	@install -m 644 ext/hwloc/libhwloc.a $(PREFIX)/lib
+ 	@echo "===> INSTALL man pages to $(MANPREFIX)/man1"
+ 	@mkdir -p $(MANPREFIX)/man1
+@@ -287,6 +291,10 @@
+ 	@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-mpirun.1 > $(MANPREFIX)/man1/likwid-mpirun.1
+ 	@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-perfscope.1 > $(MANPREFIX)/man1/likwid-perfscope.1
+ 	@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-setFreq.1 > $(MANPREFIX)/man1/likwid-setFreq.1
++	@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-agent.1 > $(MANPREFIX)/man1/likwid-agent.1
++	@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-bench.1 > $(MANPREFIX)/man1/likwid-bench.1
++	@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-setFrequencies.1 > $(MANPREFIX)/man1/likwid-setFrequencies.1
++	@sed -e "s/<VERSION>/$(VERSION)/g" -e "s/<DATE>/$(DATE)/g" < $(DOC_DIR)/likwid-lua.1 > $(MANPREFIX)/man1/likwid-lua.1
  	@chmod 644 $(MANPREFIX)/man1/likwid-*
  	@echo "===> INSTALL headers to $(PREFIX)/include"
  	@mkdir -p $(PREFIX)/include
-@@ -333,7 +340,8 @@ uninstall:
- 	@echo "===> REMOVING daemon applications from $(PREFIX)/sbin"
- 	@rm -f $(addprefix $(PREFIX)/sbin/,$(DAEMON_APPS)) 
- 	@echo "===> REMOVING man pages from $(MANPREFIX)/man1"
--	@rm -f $(addprefix $(MANPREFIX)/man1/,$(addsuffix  .1,$(APPS)))
-+	@rm -f $(MANPREFIX)/man1/likwid-*
-+	@rm -f $(MANPREFIX)/man1/feedGnuplot.1
- 	@echo "===> REMOVING headers from $(PREFIX)/include"
- 	@rm -f $(PREFIX)/include/likwid*.h
- 	@echo "===> REMOVING libs from $(PREFIX)/lib"
+@@ -334,6 +342,8 @@
+ 	@rm -f $(MANPREFIX)/man1/feedGnuplot.1
+ 	@rm -f $(MANPREFIX)/man1/likwid-setFreq.1
+ 	@rm -f $(MANPREFIX)/man1/likwid-accessD.1
++	@rm -f $(MANPREFIX)/man1/likwid-bench.1
++	@rm -f $(MANPREFIX)/man1/likwid-lua.1
+ 	@echo "===> REMOVING header from $(PREFIX)/include"
+ 	@rm -f $(PREFIX)/include/likwid.h
+ 	@rm -f $(PREFIX)/include/bstrlib.h
diff --git a/debian/patches/11-hwloc-soname.patch b/debian/patches/11-hwloc-soname.patch
new file mode 100644
index 0000000..cd2cdef
--- /dev/null
+++ b/debian/patches/11-hwloc-soname.patch
@@ -0,0 +1,16 @@
+Description: Add soname for hwloc shared library
+Author: Thomas Roehl <thomas.roehl at fau.de>$
+Last-Update: 2015-11-15$
+---$
+This patch header follows DEP-3: http://dep.debian.net/deps/dep3/$
+--- a/ext/hwloc/Makefile.orig	2015-11-06 15:10:49.424142000 +0100
++++ b/ext/hwloc/Makefile	2015-11-06 15:12:11.757397000 +0100
+@@ -41,7 +41,7 @@
+ 	$(Q)${AR} -cq $(STATIC_LIBHWLOC) $(OBJ)
+ 
+ $(SHARED_LIBHWLOC): $(OBJ)
+-	${Q}$(CC) $(LFLAGS) -Wall -shared -fPIC -o $(SHARED_LIBHWLOC) $(OBJ) $(LIBS)
++	${Q}$(CC) $(LFLAGS) -Wall -shared -fPIC -Wl,-soname,$(SHARED_LIBHWLOC).$(VERSION) -o $(SHARED_LIBHWLOC) $(OBJ) $(LIBS)
+ 
+ #PATTERN RULES
+ $(BUILD_DIR)/%.o:  %.c
diff --git a/debian/patches/12-lua-soname.patch b/debian/patches/12-lua-soname.patch
new file mode 100644
index 0000000..2538619
--- /dev/null
+++ b/debian/patches/12-lua-soname.patch
@@ -0,0 +1,16 @@
+Description: Add soname for lua shared library
+Author: Thomas Roehl <thomas.roehl at fau.de>$
+Last-Update: 2015-11-15$
+---$
+This patch header follows DEP-3: http://dep.debian.net/deps/dep3/$
+--- a/ext/lua/Makefile.orig	2015-11-06 15:13:53.827498000 +0100
++++ b/ext/lua/Makefile	2015-11-06 15:14:11.793856000 +0100
+@@ -40,7 +40,7 @@
+ 	$(Q)${AR} -cq $(STATIC_LIBLUA) $(OBJ)
+ 
+ $(SHARED_LIBLUA):
+-	$(Q)$(CC) $(LFLAGS) -shared -fPIC -o $(SHARED_LIBLUA) $(OBJ)
++	$(Q)$(CC) $(LFLAGS) -shared -fPIC -Wl,-soname,$(SHARED_LIBLUA).$(VERSION) -o $(SHARED_LIBLUA) $(OBJ)
+ 
+ $(INTERPRETER): $(STATIC_LIBLUA) $(BUILD_DIR)/lua.o
+ 	$(Q)$(CC) -o $@ $(LFLAGS) $(BUILD_DIR)/lua.o $(STATIC_LIBLUA) $(LIBS)
diff --git a/debian/patches/13-likwid-soname.patch b/debian/patches/13-likwid-soname.patch
new file mode 100644
index 0000000..2146e1d
--- /dev/null
+++ b/debian/patches/13-likwid-soname.patch
@@ -0,0 +1,16 @@
+Description: Add soname for likwid shared library
+Author: Thomas Roehl <thomas.roehl at fau.de>$
+Last-Update: 2015-11-15$
+---$
+This patch header follows DEP-3: http://dep.debian.net/deps/dep3/$
+--- Makefile.orig	2015-11-06 15:44:47.268372000 +0100
++++ Makefile	2015-11-06 15:45:07.144040000 +0100
+@@ -133,7 +133,7 @@
+ 
+ $(DYNAMIC_TARGET_LIB): $(BUILD_DIR) $(PERFMONHEADERS) $(OBJ)
+ 	@echo "===>  CREATE SHARED LIB  $(DYNAMIC_TARGET_LIB)"
+-	$(Q)${CC} $(DEBUG_FLAGS) $(SHARED_LFLAGS) $(SHARED_CFLAGS) -o $(DYNAMIC_TARGET_LIB) $(OBJ) $(LIBS) $(LIBHWLOC) $(LIBLUA)
++	$(Q)${CC} $(DEBUG_FLAGS) $(SHARED_LFLAGS) $(SHARED_CFLAGS) -Wl,-soname,$(DYNAMIC_TARGET_LIB).$(VERSION) -o $(DYNAMIC_TARGET_LIB) $(OBJ) $(LIBS) $(LIBHWLOC) $(LIBLUA)
+ 
+ $(DAEMON_TARGET): $(SRC_DIR)/access-daemon/accessDaemon.c
+ 	@echo "===>  Build access daemon likwid-accessD"
diff --git a/debian/patches/14-man-bench-fix.patch b/debian/patches/14-man-bench-fix.patch
new file mode 100644
index 0000000..bbd13de
--- /dev/null
+++ b/debian/patches/14-man-bench-fix.patch
@@ -0,0 +1,13 @@
+Description: Correct mistake in man page of likwid-bench
+Author: Thomas Roehl <thomas.roehl at fau.de>$
+Last-Update: 2015-11-15$
+---$
+This patch header follows DEP-3: http://dep.debian.net/deps/dep3/$
+--- doc/likwid-bench.1.orig	2015-11-06 15:36:42.962141000 +0100
++++ doc/likwid-bench.1	2015-11-06 15:36:52.646748000 +0100
+@@ -1,5 +1,4 @@
+ .TH LIKWID-BENCH 1 <DATE> likwid\-<VERSION>
+-.WARN
+ .SH NAME
+ likwid-bench \- low-level benchmark suite and microbenchmarking framework
+ .SH SYNOPSIS
diff --git a/debian/watch b/debian/watch
index 0ee9ee2..45ab058 100644
--- a/debian/watch
+++ b/debian/watch
@@ -1,3 +1,3 @@
-version=3
+version=4
 opts=repacksuffix=+dfsg1,dversionmangle=s/\+dfsg1\d*$// \
   http://ftp.fau.de/pub/likwid/likwid-(\d[\d.]*)\.tar\.gz
diff --git a/doc/Doxyfile b/doc/Doxyfile
new file mode 100644
index 0000000..dbfba97
--- /dev/null
+++ b/doc/Doxyfile
@@ -0,0 +1,1781 @@
+# Doxyfile 1.7.6.1
+
+# This file describes the settings to be used by the documentation system
+# doxygen (www.doxygen.org) for a project.
+#
+# All text after a hash (#) is considered a comment and will be ignored.
+# The format is:
+#       TAG = value [value, ...]
+# For lists items can also be appended using:
+#       TAG += value [value, ...]
+# Values that contain spaces should be placed between quotes (" ").
+
+#---------------------------------------------------------------------------
+# Project related configuration options
+#---------------------------------------------------------------------------
+
+# This tag specifies the encoding used for all characters in the config file
+# that follow. The default is UTF-8 which is also the encoding used for all
+# text before the first occurrence of this tag. Doxygen uses libiconv (or the
+# iconv built into libc) for the transcoding. See
+# http://www.gnu.org/software/libiconv for the list of possible encodings.
+
+DOXYFILE_ENCODING      = UTF-8
+
+# The PROJECT_NAME tag is a single word (or sequence of words) that should
+# identify the project. Note that if you do not use Doxywizard you need
+# to put quotes around the project name if it contains spaces.
+
+PROJECT_NAME           = "LIKWID"
+
+# The PROJECT_NUMBER tag can be used to enter a project or revision number.
+# This could be handy for archiving the generated documentation or
+# if some version control system is used.
+
+PROJECT_NUMBER         =
+
+# Using the PROJECT_BRIEF tag one can provide an optional one line description
+# for a project that appears at the top of each page and should give viewer
+# a quick idea about the purpose of the project. Keep the description short.
+
+PROJECT_BRIEF          =
+
+# With the PROJECT_LOGO tag one can specify a logo or icon that is
+# included in the documentation. The maximum height of the logo should not
+# exceed 55 pixels and the maximum width should not exceed 200 pixels.
+# Doxygen will copy the logo to the output directory.
+
+PROJECT_LOGO           = doc/logo.png
+
+# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
+# base path where the generated documentation will be put.
+# If a relative path is entered, it will be relative to the location
+# where doxygen was started. If left blank the current directory will be used.
+
+OUTPUT_DIRECTORY       =
+
+# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create
+# 4096 sub-directories (in 2 levels) under the output directory of each output
+# format and will distribute the generated files over these directories.
+# Enabling this option can be useful when feeding doxygen a huge amount of
+# source files, where putting all generated files in the same directory would
+# otherwise cause performance problems for the file system.
+
+CREATE_SUBDIRS         = NO
+
+# The OUTPUT_LANGUAGE tag is used to specify the language in which all
+# documentation generated by doxygen is written. Doxygen will use this
+# information to generate all constant output in the proper language.
+# The default language is English, other supported languages are:
+# Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional,
+# Croatian, Czech, Danish, Dutch, Esperanto, Farsi, Finnish, French, German,
+# Greek, Hungarian, Italian, Japanese, Japanese-en (Japanese with English
+# messages), Korean, Korean-en, Lithuanian, Norwegian, Macedonian, Persian,
+# Polish, Portuguese, Romanian, Russian, Serbian, Serbian-Cyrillic, Slovak,
+# Slovene, Spanish, Swedish, Ukrainian, and Vietnamese.
+
+OUTPUT_LANGUAGE        = English
+
+# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will
+# include brief member descriptions after the members that are listed in
+# the file and class documentation (similar to JavaDoc).
+# Set to NO to disable this.
+
+BRIEF_MEMBER_DESC      = YES
+
+# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend
+# the brief description of a member or function before the detailed description.
+# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
+# brief descriptions will be completely suppressed.
+
+REPEAT_BRIEF           = YES
+
+# This tag implements a quasi-intelligent brief description abbreviator
+# that is used to form the text in various listings. Each string
+# in this list, if found as the leading text of the brief description, will be
+# stripped from the text and the result after processing the whole list, is
+# used as the annotated text. Otherwise, the brief description is used as-is.
+# If left blank, the following values are used ("$name" is automatically
+# replaced with the name of the entity): "The $name class" "The $name widget"
+# "The $name file" "is" "provides" "specifies" "contains"
+# "represents" "a" "an" "the"
+
+ABBREVIATE_BRIEF       =
+
+# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
+# Doxygen will generate a detailed section even if there is only a brief
+# description.
+
+ALWAYS_DETAILED_SEC    = NO
+
+# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all
+# inherited members of a class in the documentation of that class as if those
+# members were ordinary class members. Constructors, destructors and assignment
+# operators of the base classes will not be shown.
+
+INLINE_INHERITED_MEMB  = NO
+
+# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full
+# path before file names in the file list and in the header files. If set
+# to NO the shortest path that makes the file name unique will be used.
+
+FULL_PATH_NAMES        = YES
+
+# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag
+# can be used to strip a user-defined part of the path. Stripping is
+# only done if one of the specified strings matches the left-hand part of
+# the path. The tag can be used to show relative paths in the file list.
+# If left blank the directory from which doxygen is run is used as the
+# path to strip.
+
+STRIP_FROM_PATH        =
+
+# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of
+# the path mentioned in the documentation of a class, which tells
+# the reader which header file to include in order to use a class.
+# If left blank only the name of the header file containing the class
+# definition is used. Otherwise one should specify the include paths that
+# are normally passed to the compiler using the -I flag.
+
+STRIP_FROM_INC_PATH    =
+
+# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter
+# (but less readable) file names. This can be useful if your file system
+# doesn't support long names like on DOS, Mac, or CD-ROM.
+
+SHORT_NAMES            = NO
+
+# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen
+# will interpret the first line (until the first dot) of a JavaDoc-style
+# comment as the brief description. If set to NO, the JavaDoc
+# comments will behave just like regular Qt-style comments
+# (thus requiring an explicit @brief command for a brief description.)
+
+JAVADOC_AUTOBRIEF      = NO
+
+# If the QT_AUTOBRIEF tag is set to YES then Doxygen will
+# interpret the first line (until the first dot) of a Qt-style
+# comment as the brief description. If set to NO, the comments
+# will behave just like regular Qt-style comments (thus requiring
+# an explicit \brief command for a brief description.)
+
+QT_AUTOBRIEF           = NO
+
+# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen
+# treat a multi-line C++ special comment block (i.e. a block of //! or ///
+# comments) as a brief description. This used to be the default behaviour.
+# The new default is to treat a multi-line C++ comment block as a detailed
+# description. Set this tag to YES if you prefer the old behaviour instead.
+
+MULTILINE_CPP_IS_BRIEF = NO
+
+# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented
+# member inherits the documentation from any documented member that it
+# re-implements.
+
+INHERIT_DOCS           = YES
+
+# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce
+# a new page for each member. If set to NO, the documentation of a member will
+# be part of the file/class/namespace that contains it.
+
+SEPARATE_MEMBER_PAGES  = NO
+
+# The TAB_SIZE tag can be used to set the number of spaces in a tab.
+# Doxygen uses this value to replace tabs by spaces in code fragments.
+
+TAB_SIZE               = 8
+
+# This tag can be used to specify a number of aliases that act
+# as commands in the documentation. An alias has the form "name=value".
+# For example adding "sideeffect=\par Side Effects:\n" will allow you to
+# put the command \sideeffect (or @sideeffect) in the documentation, which
+# will result in a user-defined paragraph with heading "Side Effects:".
+# You can put \n's in the value part of an alias to insert newlines.
+
+ALIASES                =
+
+# This tag can be used to specify a number of word-keyword mappings (TCL only).
+# A mapping has the form "name=value". For example adding
+# "class=itcl::class" will allow you to use the command class in the
+# itcl::class meaning.
+
+TCL_SUBST              =
+
+# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C
+# sources only. Doxygen will then generate output that is more tailored for C.
+# For instance, some of the names that are used will be different. The list
+# of all members will be omitted, etc.
+
+OPTIMIZE_OUTPUT_FOR_C  = NO
+
+# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java
+# sources only. Doxygen will then generate output that is more tailored for
+# Java. For instance, namespaces will be presented as packages, qualified
+# scopes will look different, etc.
+
+OPTIMIZE_OUTPUT_JAVA   = NO
+
+# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran
+# sources only. Doxygen will then generate output that is more tailored for
+# Fortran.
+
+OPTIMIZE_FOR_FORTRAN   = NO
+
+# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL
+# sources. Doxygen will then generate output that is tailored for
+# VHDL.
+
+OPTIMIZE_OUTPUT_VHDL   = NO
+
+# Doxygen selects the parser to use depending on the extension of the files it
+# parses. With this tag you can assign which parser to use for a given extension.
+# Doxygen has a built-in mapping, but you can override or extend it using this
+# tag. The format is ext=language, where ext is a file extension, and language
+# is one of the parsers supported by doxygen: IDL, Java, Javascript, CSharp, C,
+# C++, D, PHP, Objective-C, Python, Fortran, VHDL. For instance, to make
+# doxygen treat .inc files as Fortran files (default is PHP), and .f files as C
+# (default is Fortran), use: inc=Fortran f=C. Note that for custom extensions
+# you also need to set FILE_PATTERNS otherwise the files are not read by doxygen.
+
+EXTENSION_MAPPING      =
+
+# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
+# to include (a tag file for) the STL sources as input, then you should
+# set this tag to YES in order to let doxygen match functions declarations and
+# definitions whose arguments contain STL classes (e.g. func(std::string); vs.
+# func(std::string) {}). This also makes the inheritance and collaboration
+# diagrams that involve STL classes more complete and accurate.
+
+BUILTIN_STL_SUPPORT    = NO
+
+# If you use Microsoft's C++/CLI language, you should set this option to YES to
+# enable parsing support.
+
+CPP_CLI_SUPPORT        = NO
+
+# Set the SIP_SUPPORT tag to YES if your project consists of sip sources only.
+# Doxygen will parse them like normal C++ but will assume all classes use public
+# instead of private inheritance when no explicit protection keyword is present.
+
+SIP_SUPPORT            = NO
+
+# For Microsoft's IDL there are propget and propput attributes to indicate getter
+# and setter methods for a property. Setting this option to YES (the default)
+# will make doxygen replace the get and set methods by a property in the
+# documentation. This will only work if the methods are indeed getting or
+# setting a simple type. If this is not the case, or you want to show the
+# methods anyway, you should set this option to NO.
+
+IDL_PROPERTY_SUPPORT   = YES
+
+# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
+# tag is set to YES, then doxygen will reuse the documentation of the first
+# member in the group (if any) for the other members of the group. By default
+# all members of a group must be documented explicitly.
+
+DISTRIBUTE_GROUP_DOC   = NO
+
+# Set the SUBGROUPING tag to YES (the default) to allow class member groups of
+# the same type (for instance a group of public functions) to be put as a
+# subgroup of that type (e.g. under the Public Functions section). Set it to
+# NO to prevent subgrouping. Alternatively, this can be done per class using
+# the \nosubgrouping command.
+
+SUBGROUPING            = YES
+
+# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and
+# unions are shown inside the group in which they are included (e.g. using
+# @ingroup) instead of on a separate page (for HTML and Man pages) or
+# section (for LaTeX and RTF).
+
+INLINE_GROUPED_CLASSES = NO
+
+# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and
+# unions with only public data fields will be shown inline in the documentation
+# of the scope in which they are defined (i.e. file, namespace, or group
+# documentation), provided this scope is documented. If set to NO (the default),
+# structs, classes, and unions are shown on a separate page (for HTML and Man
+# pages) or section (for LaTeX and RTF).
+
+INLINE_SIMPLE_STRUCTS  = NO
+
+# When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct, union, or enum
+# is documented as struct, union, or enum with the name of the typedef. So
+# typedef struct TypeS {} TypeT, will appear in the documentation as a struct
+# with name TypeT. When disabled the typedef will appear as a member of a file,
+# namespace, or class. And the struct will be named TypeS. This can typically
+# be useful for C code in case the coding convention dictates that all compound
+# types are typedef'ed and only the typedef is referenced, never the tag name.
+
+TYPEDEF_HIDES_STRUCT   = NO
+
+# The SYMBOL_CACHE_SIZE determines the size of the internal cache used to
+# determine which symbols to keep in memory and which to flush to disk.
+# When the cache is full, less often used symbols will be written to disk.
+# For small to medium size projects (<1000 input files) the default value is
+# probably good enough. For larger projects a too small cache size can cause
+# doxygen to be busy swapping symbols to and from disk most of the time
+# causing a significant performance penalty.
+# If the system has enough physical memory increasing the cache will improve the
+# performance by keeping more symbols in memory. Note that the value works on
+# a logarithmic scale so increasing the size by one will roughly double the
+# memory usage. The cache size is given by this formula:
+# 2^(16+SYMBOL_CACHE_SIZE). The valid range is 0..9, the default is 0,
+# corresponding to a cache size of 2^16 = 65536 symbols.
+
+SYMBOL_CACHE_SIZE      = 0
+
+# Similar to the SYMBOL_CACHE_SIZE the size of the symbol lookup cache can be
+# set using LOOKUP_CACHE_SIZE. This cache is used to resolve symbols given
+# their name and scope. Since this can be an expensive process and often the
+# same symbol appears multiple times in the code, doxygen keeps a cache of
+# pre-resolved symbols. If the cache is too small doxygen will become slower.
+# If the cache is too large, memory is wasted. The cache size is given by this
+# formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range is 0..9, the default is 0,
+# corresponding to a cache size of 2^16 = 65536 symbols.
+
+LOOKUP_CACHE_SIZE      = 0
+
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+
+# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in
+# documentation are documented, even if no documentation was available.
+# Private class members and static file members will be hidden unless
+# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES
+
+EXTRACT_ALL            = NO
+
+# If the EXTRACT_PRIVATE tag is set to YES all private members of a class
+# will be included in the documentation.
+
+EXTRACT_PRIVATE        = NO
+
+# If the EXTRACT_STATIC tag is set to YES all static members of a file
+# will be included in the documentation.
+
+EXTRACT_STATIC         = NO
+
+# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs)
+# defined locally in source files will be included in the documentation.
+# If set to NO only classes defined in header files are included.
+
+EXTRACT_LOCAL_CLASSES  = NO
+
+# This flag is only useful for Objective-C code. When set to YES local
+# methods, which are defined in the implementation section but not in
+# the interface are included in the documentation.
+# If set to NO (the default) only methods in the interface are included.
+
+EXTRACT_LOCAL_METHODS  = NO
+
+# If this flag is set to YES, the members of anonymous namespaces will be
+# extracted and appear in the documentation as a namespace called
+# 'anonymous_namespace{file}', where file will be replaced with the base
+# name of the file that contains the anonymous namespace. By default
+# anonymous namespaces are hidden.
+
+EXTRACT_ANON_NSPACES   = NO
+
+# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all
+# undocumented members of documented classes, files or namespaces.
+# If set to NO (the default) these members will be included in the
+# various overviews, but no documentation section is generated.
+# This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_MEMBERS     = NO
+
+# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all
+# undocumented classes that are normally visible in the class hierarchy.
+# If set to NO (the default) these classes will be included in the various
+# overviews. This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_CLASSES     = NO
+
+# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all
+# friend (class|struct|union) declarations.
+# If set to NO (the default) these declarations will be included in the
+# documentation.
+
+HIDE_FRIEND_COMPOUNDS  = NO
+
+# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any
+# documentation blocks found inside the body of a function.
+# If set to NO (the default) these blocks will be appended to the
+# function's detailed documentation block.
+
+HIDE_IN_BODY_DOCS      = NO
+
+# The INTERNAL_DOCS tag determines if documentation
+# that is typed after a \internal command is included. If the tag is set
+# to NO (the default) then the documentation will be excluded.
+# Set it to YES to include the internal documentation.
+
+INTERNAL_DOCS          = NO
+
+# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate
+# file names in lower-case letters. If set to YES upper-case letters are also
+# allowed. This is useful if you have classes or files whose names only differ
+# in case and if your file system supports case sensitive file names. Windows
+# and Mac users are advised to set this option to NO.
+
+CASE_SENSE_NAMES       = YES
+
+# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen
+# will show members with their full class and namespace scopes in the
+# documentation. If set to YES the scope will be hidden.
+
+HIDE_SCOPE_NAMES       = NO
+
+# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen
+# will put a list of the files that are included by a file in the documentation
+# of that file.
+
+SHOW_INCLUDE_FILES     = YES
+
+# If the FORCE_LOCAL_INCLUDES tag is set to YES then Doxygen
+# will list include files with double quotes in the documentation
+# rather than with sharp brackets.
+
+FORCE_LOCAL_INCLUDES   = NO
+
+# If the INLINE_INFO tag is set to YES (the default) then a tag [inline]
+# is inserted in the documentation for inline members.
+
+INLINE_INFO            = YES
+
+# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen
+# will sort the (detailed) documentation of file and class members
+# alphabetically by member name. If set to NO the members will appear in
+# declaration order.
+
+SORT_MEMBER_DOCS       = YES
+
+# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the
+# brief documentation of file, namespace and class members alphabetically
+# by member name. If set to NO (the default) the members will appear in
+# declaration order.
+
+SORT_BRIEF_DOCS        = YES
+
+# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen
+# will sort the (brief and detailed) documentation of class members so that
+# constructors and destructors are listed first. If set to NO (the default)
+# the constructors will appear in the respective orders defined by
+# SORT_MEMBER_DOCS and SORT_BRIEF_DOCS.
+# This tag will be ignored for brief docs if SORT_BRIEF_DOCS is set to NO
+# and ignored for detailed docs if SORT_MEMBER_DOCS is set to NO.
+
+SORT_MEMBERS_CTORS_1ST = NO
+
+# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the
+# hierarchy of group names into alphabetical order. If set to NO (the default)
+# the group names will appear in their defined order.
+
+SORT_GROUP_NAMES       = NO
+
+# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be
+# sorted by fully-qualified names, including namespaces. If set to
+# NO (the default), the class list will be sorted only by class name,
+# not including the namespace part.
+# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES.
+# Note: This option applies only to the class list, not to the
+# alphabetical list.
+
+SORT_BY_SCOPE_NAME     = NO
+
+# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to
+# do proper type resolution of all parameters of a function it will reject a
+# match between the prototype and the implementation of a member function even
+# if there is only one candidate or it is obvious which candidate to choose
+# by doing a simple string match. By disabling STRICT_PROTO_MATCHING doxygen
+# will still accept a match between prototype and implementation in such cases.
+
+STRICT_PROTO_MATCHING  = NO
+
+# The GENERATE_TODOLIST tag can be used to enable (YES) or
+# disable (NO) the todo list. This list is created by putting \todo
+# commands in the documentation.
+
+GENERATE_TODOLIST      = YES
+
+# The GENERATE_TESTLIST tag can be used to enable (YES) or
+# disable (NO) the test list. This list is created by putting \test
+# commands in the documentation.
+
+GENERATE_TESTLIST      = YES
+
+# The GENERATE_BUGLIST tag can be used to enable (YES) or
+# disable (NO) the bug list. This list is created by putting \bug
+# commands in the documentation.
+
+GENERATE_BUGLIST       = YES
+
+# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or
+# disable (NO) the deprecated list. This list is created by putting
+# \deprecated commands in the documentation.
+
+GENERATE_DEPRECATEDLIST= YES
+
+# The ENABLED_SECTIONS tag can be used to enable conditional
+# documentation sections, marked by \if sectionname ... \endif.
+
+ENABLED_SECTIONS       =
+
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines
+# the initial value of a variable or macro consists of for it to appear in
+# the documentation. If the initializer consists of more lines than specified
+# here it will be hidden. Use a value of 0 to hide initializers completely.
+# The appearance of the initializer of individual variables and macros in the
+# documentation can be controlled using \showinitializer or \hideinitializer
+# command in the documentation regardless of this setting.
+
+MAX_INITIALIZER_LINES  = 30
+
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated
+# at the bottom of the documentation of classes and structs. If set to YES the
+# list will mention the files that were used to generate the documentation.
+
+SHOW_USED_FILES        = YES
+
+# If the sources in your project are distributed over multiple directories
+# then setting the SHOW_DIRECTORIES tag to YES will show the directory hierarchy
+# in the documentation. The default is NO.
+
+SHOW_DIRECTORIES       = NO
+
+# Set the SHOW_FILES tag to NO to disable the generation of the Files page.
+# This will remove the Files entry from the Quick Index and from the
+# Folder Tree View (if specified). The default is YES.
+
+SHOW_FILES             = YES
+
+# Set the SHOW_NAMESPACES tag to NO to disable the generation of the
+# Namespaces page.
+# This will remove the Namespaces entry from the Quick Index
+# and from the Folder Tree View (if specified). The default is YES.
+
+SHOW_NAMESPACES        = YES
+
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that
+# doxygen should invoke to get the current version for each file (typically from
+# the version control system). Doxygen will invoke the program by executing (via
+# popen()) the command <command> <input-file>, where <command> is the value of
+# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file
+# provided by doxygen. Whatever the program writes to standard output
+# is used as the file version. See the manual for examples.
+
+FILE_VERSION_FILTER    =
+
+# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
+# by doxygen. The layout file controls the global structure of the generated
+# output files in an output format independent way. To create the layout file
+# that represents doxygen's defaults, run doxygen with the -l option.
+# You can optionally specify a file name after the option, if omitted
+# DoxygenLayout.xml will be used as the name of the layout file.
+
+LAYOUT_FILE            =
+
+# The CITE_BIB_FILES tag can be used to specify one or more bib files
+# containing the references data. This must be a list of .bib files. The
+# .bib extension is automatically appended if omitted. Using this command
+# requires the bibtex tool to be installed. See also
+# http://en.wikipedia.org/wiki/BibTeX for more info. For LaTeX the style
+# of the bibliography can be controlled using LATEX_BIB_STYLE. To use this
+# feature you need bibtex and perl available in the search path.
+
+CITE_BIB_FILES         =
+
+#---------------------------------------------------------------------------
+# configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+
+# The QUIET tag can be used to turn on/off the messages that are generated
+# by doxygen. Possible values are YES and NO. If left blank NO is used.
+
+QUIET                  = NO
+
+# The WARNINGS tag can be used to turn on/off the warning messages that are
+# generated by doxygen. Possible values are YES and NO. If left blank
+# NO is used.
+
+WARNINGS               = NO
+
+# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings
+# for undocumented members. If EXTRACT_ALL is set to YES then this flag will
+# automatically be disabled.
+
+WARN_IF_UNDOCUMENTED   = YES
+
+# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for
+# potential errors in the documentation, such as not documenting some
+# parameters in a documented function, or documenting parameters that
+# don't exist or using markup commands wrongly.
+
+WARN_IF_DOC_ERROR      = YES
+
+# The WARN_NO_PARAMDOC option can be enabled to get warnings for
+# functions that are documented, but have no documentation for their parameters
+# or return value. If set to NO (the default) doxygen will only warn about
+# wrong or incomplete parameter documentation, but not about the absence of
+# documentation.
+
+WARN_NO_PARAMDOC       = NO
+
+# The WARN_FORMAT tag determines the format of the warning messages that
+# doxygen can produce. The string should contain the $file, $line, and $text
+# tags, which will be replaced by the file and line number from which the
+# warning originated and the warning text. Optionally the format may contain
+# $version, which will be replaced by the version of the file (if it could
+# be obtained via FILE_VERSION_FILTER)
+
+WARN_FORMAT            = "$file:$line: $text"
+
+# The WARN_LOGFILE tag can be used to specify a file to which warning
+# and error messages should be written. If left blank the output is written
+# to stderr.
+
+WARN_LOGFILE           =
+
+#---------------------------------------------------------------------------
+# configuration options related to the input files
+#---------------------------------------------------------------------------
+
+# The INPUT tag can be used to specify the files and/or directories that contain
+# documented source files. You may enter file names like "myfile.cpp" or
+# directories like "/usr/src/myproject". Separate the files or directories
+# with spaces.
+
+INPUT                  = ./src/includes/likwid.h ./doc/likwid-doxygen.md ./src/includes/perfmon_types.h ./src/includes/topology_types.h  ./src/includes/power_types.h ./src/includes/tree_types.h ./doc/archs/ ./doc/lua-doxygen.md ./doc/applications/ ./doc/likwid.cfg.md ./src/likwid.f90
+
+# This tag can be used to specify the character encoding of the source files
+# that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is
+# also the default input encoding. Doxygen uses libiconv (or the iconv built
+# into libc) for the transcoding. See http://www.gnu.org/software/libiconv for
+# the list of possible encodings.
+
+INPUT_ENCODING         = UTF-8
+
+# If the value of the INPUT tag contains directories, you can use the
+# FILE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank the following patterns are tested:
+# *.c *.cc *.cxx *.cpp *.c++ *.d *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh
+# *.hxx *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.dox *.py
+# *.f90 *.f *.for *.vhd *.vhdl
+
+FILE_PATTERNS          = *.md
+
+# The RECURSIVE tag can be used to specify whether or not subdirectories
+# should be searched for input files as well. Possible values are YES and NO.
+# If left blank NO is used.
+
+RECURSIVE              = NO
+
+# The EXCLUDE tag can be used to specify files and/or directories that should be
+# excluded from the INPUT source files. This way you can easily exclude a
+# subdirectory from a directory tree whose root is specified with the INPUT tag.
+# Note that relative paths are relative to the directory from which doxygen is
+# run.
+
+EXCLUDE                =
+
+# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
+# directories that are symbolic links (a Unix file system feature) are excluded
+# from the input.
+
+EXCLUDE_SYMLINKS       = NO
+
+# If the value of the INPUT tag contains directories, you can use the
+# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
+# certain files from those directories. Note that the wildcards are matched
+# against the file with absolute path, so to exclude all test directories
+# for example use the pattern */test/*
+
+EXCLUDE_PATTERNS       =
+
+# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
+# (namespaces, classes, functions, etc.) that should be excluded from the
+# output. The symbol name can be a fully qualified name, a word, or if the
+# wildcard * is used, a substring. Examples: ANamespace, AClass,
+# AClass::ANamespace, ANamespace::*Test
+
+EXCLUDE_SYMBOLS        = AccessDataRecord
+
+# The EXAMPLE_PATH tag can be used to specify one or more files or
+# directories that contain example code fragments that are included (see
+# the \include command).
+
+EXAMPLE_PATH           = ./examples
+
+# If the value of the EXAMPLE_PATH tag contains directories, you can use the
+# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank all files are included.
+
+EXAMPLE_PATTERNS       =
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
+# searched for input files to be used with the \include or \dontinclude
+# commands irrespective of the value of the RECURSIVE tag.
+# Possible values are YES and NO. If left blank NO is used.
+
+EXAMPLE_RECURSIVE      = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or
+# directories that contain images that are included in the documentation (see
+# the \image command).
+
+IMAGE_PATH             =
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command <filter> <input-file>, where <filter>
+# is the value of the INPUT_FILTER tag, and <input-file> is the name of an
+# input file. Doxygen will then use the output that the filter program writes
+# to standard output.
+# If FILTER_PATTERNS is specified, this tag will be
+# ignored.
+
+INPUT_FILTER           =
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
+# basis.
+# Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match.
+# The filters are a list of the form:
+# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further
+# info on how filters are used. If FILTER_PATTERNS is empty or if
+# none of the patterns match the file name, INPUT_FILTER is applied.
+
+FILTER_PATTERNS        =
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
+# INPUT_FILTER) will be used to filter the input files when producing source
+# files to browse (i.e. when SOURCE_BROWSER is set to YES).
+
+FILTER_SOURCE_FILES    = NO
+
+# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
+# pattern. A pattern will override the setting for FILTER_PATTERNS (if any)
+# and it is also possible to disable source filtering for a specific pattern
+# using *.ext= (so without naming a filter). This option only has effect when
+# FILTER_SOURCE_FILES is enabled.
+
+FILTER_SOURCE_PATTERNS =
+
+#---------------------------------------------------------------------------
+# configuration options related to source browsing
+#---------------------------------------------------------------------------
+
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will
+# be generated. Documented entities will be cross-referenced with these sources.
+# Note: To get rid of all source code in the generated output, make sure also
+# VERBATIM_HEADERS is set to NO.
+
+SOURCE_BROWSER         = NO
+
+# Setting the INLINE_SOURCES tag to YES will include the body
+# of functions and classes directly in the documentation.
+
+INLINE_SOURCES         = NO
+
+# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct
+# doxygen to hide any special comment blocks from generated source code
+# fragments. Normal C and C++ comments will always remain visible.
+
+STRIP_CODE_COMMENTS    = YES
+
+# If the REFERENCED_BY_RELATION tag is set to YES
+# then for each documented function all documented
+# functions referencing it will be listed.
+
+REFERENCED_BY_RELATION = NO
+
+# If the REFERENCES_RELATION tag is set to YES
+# then for each documented function all documented entities
+# called/used by that function will be listed.
+
+REFERENCES_RELATION    = NO
+
+# If the REFERENCES_LINK_SOURCE tag is set to YES (the default)
+# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from
+# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will
+# link to the source code.
+# Otherwise they will link to the documentation.
+
+REFERENCES_LINK_SOURCE = YES
+
+# If the USE_HTAGS tag is set to YES then the references to source code
+# will point to the HTML generated by the htags(1) tool instead of doxygen
+# built-in source browser. The htags tool is part of GNU's global source
+# tagging system (see http://www.gnu.org/software/global/global.html). You
+# will need version 4.8.6 or higher.
+
+USE_HTAGS              = NO
+
+# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen
+# will generate a verbatim copy of the header file for each class for
+# which an include is specified. Set to NO to disable this.
+
+VERBATIM_HEADERS       = YES
+
+#---------------------------------------------------------------------------
+# configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+
+# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index
+# of all compounds will be generated. Enable this if the project
+# contains a lot of classes, structs, unions or interfaces.
+
+ALPHABETICAL_INDEX     = YES
+
+# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then
+# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns
+# in which this list will be split (can be a number in the range [1..20])
+
+COLS_IN_ALPHA_INDEX    = 5
+
+# In case all classes in a project start with a common prefix, all
+# classes will be put under the same header in the alphabetical index.
+# The IGNORE_PREFIX tag can be used to specify one or more prefixes that
+# should be ignored while generating the index headers.
+
+IGNORE_PREFIX          =
+
+#---------------------------------------------------------------------------
+# configuration options related to the HTML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_HTML tag is set to YES (the default) Doxygen will
+# generate HTML output.
+
+GENERATE_HTML          = YES
+
+# The HTML_OUTPUT tag is used to specify where the HTML docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `html' will be used as the default path.
+
+HTML_OUTPUT            = doc/html
+
+# The HTML_FILE_EXTENSION tag can be used to specify the file extension for
+# each generated HTML page (for example: .htm,.php,.asp). If it is left blank
+# doxygen will generate files with .html extension.
+
+HTML_FILE_EXTENSION    = .html
+
+# The HTML_HEADER tag can be used to specify a personal HTML header for
+# each generated HTML page. If it is left blank doxygen will generate a
+# standard header. Note that when using a custom header you are responsible
+# for the proper inclusion of any scripts and style sheets that doxygen
+# needs, which is dependent on the configuration options used.
+# It is advised to generate a default header using "doxygen -w html
+# header.html footer.html stylesheet.css YourConfigFile" and then modify
+# that header. Note that the header is subject to change so you typically
+# have to redo this when upgrading to a newer version of doxygen or when
+# changing the value of configuration settings such as GENERATE_TREEVIEW!
+
+HTML_HEADER            =
+
+# The HTML_FOOTER tag can be used to specify a personal HTML footer for
+# each generated HTML page. If it is left blank doxygen will generate a
+# standard footer.
+
+HTML_FOOTER            =
+
+# The HTML_STYLESHEET tag can be used to specify a user-defined cascading
+# style sheet that is used by each HTML page. It can be used to
+# fine-tune the look of the HTML output. If the tag is left blank doxygen
+# will generate a default style sheet. Note that doxygen will try to copy
+# the style sheet file to the HTML output directory, so don't put your own
+# style sheet in the HTML output directory as well, or it will be erased!
+
+HTML_STYLESHEET        =
+
+# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or
+# other source files which should be copied to the HTML output directory. Note
+# that these files will be copied to the base HTML output directory. Use the
+# $relpath$ marker in the HTML_HEADER and/or HTML_FOOTER files to load these
+# files. In the HTML_STYLESHEET file, use the file name only. Also note that
+# the files will be copied as-is; there are no commands or markers available.
+
+HTML_EXTRA_FILES       =
+
+# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output.
+# Doxygen will adjust the colors in the style sheet and background images
+# according to this color. Hue is specified as an angle on a colorwheel,
+# see http://en.wikipedia.org/wiki/Hue for more information.
+# For instance the value 0 represents red, 60 is yellow, 120 is green,
+# 180 is cyan, 240 is blue, 300 purple, and 360 is red again.
+# The allowed range is 0 to 359.
+
+HTML_COLORSTYLE_HUE    = 220
+
+# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of
+# the colors in the HTML output. For a value of 0 the output will use
+# grayscales only. A value of 255 will produce the most vivid colors.
+
+HTML_COLORSTYLE_SAT    = 100
+
+# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to
+# the luminance component of the colors in the HTML output. Values below
+# 100 gradually make the output lighter, whereas values above 100 make
+# the output darker. The value divided by 100 is the actual gamma applied,
+# so 80 represents a gamma of 0.8, the value 220 represents a gamma of 2.2,
+# and 100 does not change the gamma.
+
+HTML_COLORSTYLE_GAMMA  = 80
+
+# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
+# page will contain the date and time when the page was generated. Setting
+# this to NO can help when comparing the output of multiple runs.
+
+HTML_TIMESTAMP         = YES
+
+# If the HTML_ALIGN_MEMBERS tag is set to YES, the members of classes,
+# files or namespaces will be aligned in HTML using tables. If set to
+# NO a bullet list will be used.
+
+HTML_ALIGN_MEMBERS     = YES
+
+# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
+# documentation will contain sections that can be hidden and shown after the
+# page has loaded. For this to work a browser that supports
+# JavaScript and DHTML is required (for instance Mozilla 1.0+, Firefox,
+# Netscape 6.0+, Internet Explorer 5.0+, Konqueror, or Safari).
+
+HTML_DYNAMIC_SECTIONS  = NO
+
+# If the GENERATE_DOCSET tag is set to YES, additional index files
+# will be generated that can be used as input for Apple's Xcode 3
+# integrated development environment, introduced with OSX 10.5 (Leopard).
+# To create a documentation set, doxygen will generate a Makefile in the
+# HTML output directory. Running make will produce the docset in that
+# directory and running "make install" will install the docset in
+# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find
+# it at startup.
+# See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html
+# for more information.
+
+GENERATE_DOCSET        = NO
+
+# When GENERATE_DOCSET tag is set to YES, this tag determines the name of the
+# feed. A documentation feed provides an umbrella under which multiple
+# documentation sets from a single provider (such as a company or product suite)
+# can be grouped.
+
+DOCSET_FEEDNAME        = "Doxygen generated docs"
+
+# When GENERATE_DOCSET tag is set to YES, this tag specifies a string that
+# should uniquely identify the documentation set bundle. This should be a
+# reverse domain-name style string, e.g. com.mycompany.MyDocSet. Doxygen
+# will append .docset to the name.
+
+DOCSET_BUNDLE_ID       = org.doxygen.Project
+
+# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify
+# the documentation publisher. This should be a reverse domain-name style
+# string, e.g. com.mycompany.MyDocSet.documentation.
+
+DOCSET_PUBLISHER_ID    = org.doxygen.Publisher
+
+# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.
+
+DOCSET_PUBLISHER_NAME  = Publisher
+
+# If the GENERATE_HTMLHELP tag is set to YES, additional index files
+# will be generated that can be used as input for tools like the
+# Microsoft HTML help workshop to generate a compiled HTML help file (.chm)
+# of the generated HTML documentation.
+
+GENERATE_HTMLHELP      = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can
+# be used to specify the file name of the resulting .chm file. You
+# can add a path in front of the file if the result should not be
+# written to the html output directory.
+
+CHM_FILE               =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can
+# be used to specify the location (absolute path including file name) of
+# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run
+# the HTML help compiler on the generated index.hhp.
+
+HHC_LOCATION           =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag
+# controls if a separate .chi index file is generated (YES) or that
+# it should be included in the master .chm file (NO).
+
+GENERATE_CHI           = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_INDEX_ENCODING
+# is used to encode HtmlHelp index (hhk), content (hhc) and project file
+# content.
+
+CHM_INDEX_ENCODING     =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag
+# controls whether a binary table of contents is generated (YES) or a
+# normal table of contents (NO) in the .chm file.
+
+BINARY_TOC             = NO
+
+# The TOC_EXPAND flag can be set to YES to add extra items for group members
+# to the contents of the HTML help documentation and to the tree view.
+
+TOC_EXPAND             = NO
+
+# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and
+# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated
+# that can be used as input for Qt's qhelpgenerator to generate a
+# Qt Compressed Help (.qch) of the generated HTML documentation.
+
+GENERATE_QHP           = NO
+
+# If the QHG_LOCATION tag is specified, the QCH_FILE tag can
+# be used to specify the file name of the resulting .qch file.
+# The path specified is relative to the HTML output folder.
+
+QCH_FILE               =
+
+# The QHP_NAMESPACE tag specifies the namespace to use when generating
+# Qt Help Project output. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#namespace
+
+QHP_NAMESPACE          = org.doxygen.Project
+
+# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating
+# Qt Help Project output. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#virtual-folders
+
+QHP_VIRTUAL_FOLDER     = doc
+
+# If QHP_CUST_FILTER_NAME is set, it specifies the name of a custom filter to
+# add. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#custom-filters
+
+QHP_CUST_FILTER_NAME   =
+
+# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
+# custom filter to add. For more information please see
+# <a href="http://doc.trolltech.com/qthelpproject.html#custom-filters">
+# Qt Help Project / Custom Filters</a>.
+
+QHP_CUST_FILTER_ATTRS  =
+
+# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
+# project's
+# filter section matches.
+# <a href="http://doc.trolltech.com/qthelpproject.html#filter-attributes">
+# Qt Help Project / Filter Attributes</a>.
+
+QHP_SECT_FILTER_ATTRS  =
+
+# If the GENERATE_QHP tag is set to YES, the QHG_LOCATION tag can
+# be used to specify the location of Qt's qhelpgenerator.
+# If non-empty doxygen will try to run qhelpgenerator on the generated
+# .qhp file.
+
+QHG_LOCATION           =
+
+# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files
+# will be generated, which together with the HTML files, form an Eclipse help
+# plugin. To install this plugin and make it available under the help contents
+# menu in Eclipse, the contents of the directory containing the HTML and XML
+# files needs to be copied into the plugins directory of eclipse. The name of
+# the directory within the plugins directory should be the same as
+# the ECLIPSE_DOC_ID value. After copying Eclipse needs to be restarted before
+# the help appears.
+
+GENERATE_ECLIPSEHELP   = NO
+
+# A unique identifier for the eclipse help plugin. When installing the plugin
+# the directory name containing the HTML and XML files should also have
+# this name.
+
+ECLIPSE_DOC_ID         = org.doxygen.Project
+
+# The DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs)
+# at top of each HTML page. The value NO (the default) enables the index and
+# the value YES disables it. Since the tabs have the same information as the
+# navigation tree you can set this option to YES if you already set
+# GENERATE_TREEVIEW to YES.
+
+DISABLE_INDEX          = NO
+
+# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
+# structure should be generated to display hierarchical information.
+# If the tag value is set to YES, a side panel will be generated
+# containing a tree-like index structure (just like the one that
+# is generated for HTML Help). For this to work a browser that supports
+# JavaScript, DHTML, CSS and frames is required (i.e. any modern browser).
+# Windows users are probably better off using the HTML help feature.
+# Since the tree basically has the same information as the tab index you
+# could consider setting DISABLE_INDEX to YES when enabling this option.
+
+GENERATE_TREEVIEW      = NO
+
+# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values
+# (range [0,1..20]) that doxygen will group on one line in the generated HTML
+# documentation. Note that a value of 0 will completely suppress the enum
+# values from appearing in the overview section.
+
+ENUM_VALUES_PER_LINE   = 4
+
+# By enabling USE_INLINE_TREES, doxygen will generate the Groups, Directories,
+# and Class Hierarchy pages using a tree view instead of an ordered list.
+
+USE_INLINE_TREES       = NO
+
+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be
+# used to set the initial width (in pixels) of the frame in which the tree
+# is shown.
+
+TREEVIEW_WIDTH         = 250
+
+# When the EXT_LINKS_IN_WINDOW option is set to YES doxygen will open
+# links to external symbols imported via tag files in a separate window.
+
+EXT_LINKS_IN_WINDOW    = NO
+
+# Use this tag to change the font size of Latex formulas included
+# as images in the HTML documentation. The default is 10. Note that
+# when you change the font size after a successful doxygen run you need
+# to manually remove any form_*.png images from the HTML output directory
+# to force them to be regenerated.
+
+FORMULA_FONTSIZE       = 10
+
+# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
+# generated for formulas are transparent PNGs. Transparent PNGs are
+# not supported properly for IE 6.0, but are supported on all modern browsers.
+# Note that when changing this option you need to delete any form_*.png files
+# in the HTML output before the changes have effect.
+
+FORMULA_TRANSPARENT    = YES
+
+# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax
+# (see http://www.mathjax.org) which uses client side Javascript for the
+# rendering instead of using prerendered bitmaps. Use this if you do not
+# have LaTeX installed or if you want formulas to look prettier in the HTML
+# output. When enabled you also need to install MathJax separately and
+# configure the path to it using the MATHJAX_RELPATH option.
+
+USE_MATHJAX            = NO
+
+# When MathJax is enabled you need to specify the location relative to the
+# HTML output directory using the MATHJAX_RELPATH option. The destination
+# directory should contain the MathJax.js script. For instance, if the mathjax
+# directory is located at the same level as the HTML output directory, then
+# MATHJAX_RELPATH should be ../mathjax. The default value points to the
+# mathjax.org site, so you can quickly see the result without installing
+# MathJax, but it is strongly recommended to install a local copy of MathJax
+# before deployment.
+
+MATHJAX_RELPATH        = http://www.mathjax.org/mathjax
+
+# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax extension
+# names that should be enabled during MathJax rendering.
+
+MATHJAX_EXTENSIONS     =
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box
+# for the HTML output. The underlying search engine uses javascript
+# and DHTML and should work on any modern browser. Note that when using
+# HTML help (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets
+# (GENERATE_DOCSET) there is already a search function so this one should
+# typically be disabled. For large projects the javascript based search engine
+# can be slow, then enabling SERVER_BASED_SEARCH may provide a better solution.
+
+SEARCHENGINE           = YES
+
+# When the SERVER_BASED_SEARCH tag is enabled the search engine will be
+# implemented using a PHP enabled web server instead of at the web client
+# using Javascript. Doxygen will generate the search PHP script and index
+# file to put on the web server. The advantage of the server
+# based approach is that it scales better to large projects and allows
+# full text search. The disadvantages are that it is more difficult to setup
+# and does not have live searching capabilities.
+
+SERVER_BASED_SEARCH    = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the LaTeX output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will
+# generate Latex output.
+
+GENERATE_LATEX         = NO
+
+# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `latex' will be used as the default path.
+
+LATEX_OUTPUT           = latex
+
+# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
+# invoked. If left blank `latex' will be used as the default command name.
+# Note that when enabling USE_PDFLATEX this option is only used for
+# generating bitmaps for formulas in the HTML output, but not in the
+# Makefile that is written to the output directory.
+
+LATEX_CMD_NAME         = latex
+
+# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to
+# generate index for LaTeX. If left blank `makeindex' will be used as the
+# default command name.
+
+MAKEINDEX_CMD_NAME     = makeindex
+
+# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact
+# LaTeX documents. This may be useful for small projects and may help to
+# save some trees in general.
+
+COMPACT_LATEX          = NO
+
+# The PAPER_TYPE tag can be used to set the paper type that is used
+# by the printer. Possible values are: a4, letter, legal and
+# executive. If left blank a4wide will be used.
+
+PAPER_TYPE             = a4
+
+# The EXTRA_PACKAGES tag can be used to specify one or more names of LaTeX
+# packages that should be included in the LaTeX output.
+
+EXTRA_PACKAGES         =
+
+# The LATEX_HEADER tag can be used to specify a personal LaTeX header for
+# the generated latex document. The header should contain everything until
+# the first chapter. If it is left blank doxygen will generate a
+# standard header. Notice: only use this tag if you know what you are doing!
+
+LATEX_HEADER           =
+
+# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for
+# the generated latex document. The footer should contain everything after
+# the last chapter. If it is left blank doxygen will generate a
+# standard footer. Notice: only use this tag if you know what you are doing!
+
+LATEX_FOOTER           =
+
+# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated
+# is prepared for conversion to pdf (using ps2pdf). The pdf file will
+# contain links (just like the HTML output) instead of page references.
+# This makes the output suitable for online browsing using a pdf viewer.
+
+PDF_HYPERLINKS         = YES
+
+# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of
+# plain latex in the generated Makefile. Set this option to YES to get a
+# higher quality PDF documentation.
+
+USE_PDFLATEX           = YES
+
+# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \batchmode
+# command to the generated LaTeX files. This will instruct LaTeX to keep
+# running if errors occur, instead of asking the user for help.
+# This option is also used when generating formulas in HTML.
+
+LATEX_BATCHMODE        = NO
+
+# If LATEX_HIDE_INDICES is set to YES then doxygen will not
+# include the index chapters (such as File Index, Compound Index, etc.)
+# in the output.
+
+LATEX_HIDE_INDICES     = NO
+
+# If LATEX_SOURCE_CODE is set to YES then doxygen will include
+# source code with syntax highlighting in the LaTeX output.
+# Note that which sources are shown also depends on other settings
+# such as SOURCE_BROWSER.
+
+LATEX_SOURCE_CODE      = NO
+
+# The LATEX_BIB_STYLE tag can be used to specify the style to use for the
+# bibliography, e.g. plainnat, or ieeetr. The default style is "plain". See
+# http://en.wikipedia.org/wiki/BibTeX for more info.
+
+LATEX_BIB_STYLE        = plain
+
+#---------------------------------------------------------------------------
+# configuration options related to the RTF output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output
+# The RTF output is optimized for Word 97 and may not look very pretty with
+# other RTF readers or editors.
+
+GENERATE_RTF           = NO
+
+# The RTF_OUTPUT tag is used to specify where the RTF docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `rtf' will be used as the default path.
+
+RTF_OUTPUT             = rtf
+
+# If the COMPACT_RTF tag is set to YES Doxygen generates more compact
+# RTF documents. This may be useful for small projects and may help to
+# save some trees in general.
+
+COMPACT_RTF            = NO
+
+# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated
+# will contain hyperlink fields. The RTF file will
+# contain links (just like the HTML output) instead of page references.
+# This makes the output suitable for online browsing using WORD or other
+# programs which support those fields.
+# Note: wordpad (write) and others do not support links.
+
+RTF_HYPERLINKS         = NO
+
+# Load style sheet definitions from file. Syntax is similar to doxygen's
+# config file, i.e. a series of assignments. You only have to provide
+# replacements, missing definitions are set to their default value.
+
+RTF_STYLESHEET_FILE    =
+
+# Set optional variables used in the generation of an rtf document.
+# Syntax is similar to doxygen's config file.
+
+RTF_EXTENSIONS_FILE    =
+
+#---------------------------------------------------------------------------
+# configuration options related to the man page output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_MAN tag is set to YES (the default) Doxygen will
+# generate man pages
+
+GENERATE_MAN           = NO
+
+# The MAN_OUTPUT tag is used to specify where the man pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `man' will be used as the default path.
+
+MAN_OUTPUT             = man
+
+# The MAN_EXTENSION tag determines the extension that is added to
+# the generated man pages (default is the subroutine's section .3)
+
+MAN_EXTENSION          = .3
+
+# If the MAN_LINKS tag is set to YES and Doxygen generates man output,
+# then it will generate one additional man file for each entity
+# documented in the real man page(s). These additional files
+# only source the real man page, but without them the man command
+# would be unable to find the correct page. The default is NO.
+
+MAN_LINKS              = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the XML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_XML tag is set to YES Doxygen will
+# generate an XML file that captures the structure of
+# the code including all documentation.
+
+GENERATE_XML           = NO
+
+# The XML_OUTPUT tag is used to specify where the XML pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `xml' will be used as the default path.
+
+XML_OUTPUT             = xml
+
+# The XML_SCHEMA tag can be used to specify an XML schema,
+# which can be used by a validating XML parser to check the
+# syntax of the XML files.
+
+XML_SCHEMA             =
+
+# The XML_DTD tag can be used to specify an XML DTD,
+# which can be used by a validating XML parser to check the
+# syntax of the XML files.
+
+XML_DTD                =
+
+# If the XML_PROGRAMLISTING tag is set to YES Doxygen will
+# dump the program listings (including syntax highlighting
+# and cross-referencing information) to the XML output. Note that
+# enabling this will significantly increase the size of the XML output.
+
+XML_PROGRAMLISTING     = YES
+
+#---------------------------------------------------------------------------
+# configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will
+# generate an AutoGen Definitions (see autogen.sf.net) file
+# that captures the structure of the code including all
+# documentation. Note that this feature is still experimental
+# and incomplete at the moment.
+
+GENERATE_AUTOGEN_DEF   = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the Perl module output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_PERLMOD tag is set to YES Doxygen will
+# generate a Perl module file that captures the structure of
+# the code including all documentation. Note that this
+# feature is still experimental and incomplete at the
+# moment.
+
+GENERATE_PERLMOD       = NO
+
+# If the PERLMOD_LATEX tag is set to YES Doxygen will generate
+# the necessary Makefile rules, Perl scripts and LaTeX code to be able
+# to generate PDF and DVI output from the Perl module output.
+
+PERLMOD_LATEX          = NO
+
+# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be
+# nicely formatted so it can be parsed by a human reader.
+# This is useful
+# if you want to understand what is going on.
+# On the other hand, if this
+# tag is set to NO the size of the Perl module output will be much smaller
+# and Perl will parse it just the same.
+
+PERLMOD_PRETTY         = YES
+
+# The names of the make variables in the generated doxyrules.make file
+# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX.
+# This is useful so different doxyrules.make files included by the same
+# Makefile don't overwrite each other's variables.
+
+PERLMOD_MAKEVAR_PREFIX =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the preprocessor
+#---------------------------------------------------------------------------
+
+# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will
+# evaluate all C-preprocessor directives found in the sources and include
+# files.
+
+ENABLE_PREPROCESSING   = YES
+
+# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro
+# names in the source code. If set to NO (the default) only conditional
+# compilation will be performed. Macro expansion can be done in a controlled
+# way by setting EXPAND_ONLY_PREDEF to YES.
+
+MACRO_EXPANSION        = NO
+
+# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES
+# then the macro expansion is limited to the macros specified with the
+# PREDEFINED and EXPAND_AS_DEFINED tags.
+
+EXPAND_ONLY_PREDEF     = NO
+
+# If the SEARCH_INCLUDES tag is set to YES (the default) the includes files
+# pointed to by INCLUDE_PATH will be searched when a #include is found.
+
+SEARCH_INCLUDES        = YES
+
+# The INCLUDE_PATH tag can be used to specify one or more directories that
+# contain include files that are not input files but should be processed by
+# the preprocessor.
+
+INCLUDE_PATH           =
+
+# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
+# patterns (like *.h and *.hpp) to filter out the header-files in the
+# directories. If left blank, the patterns specified with FILE_PATTERNS will
+# be used.
+
+INCLUDE_FILE_PATTERNS  =
+
+# The PREDEFINED tag can be used to specify one or more macro names that
+# are defined before the preprocessor is started (similar to the -D option of
+# gcc). The argument of the tag is a list of macros of the form: name
+# or name=definition (no spaces). If the definition and the = are
+# omitted =1 is assumed. To prevent a macro definition from being
+# undefined via #undef or recursively expanded use the := operator
+# instead of the = operator.
+
+PREDEFINED             =
+
+# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then
+# this tag can be used to specify a list of macro names that should be expanded.
+# The macro definition that is found in the sources will be used.
+# Use the PREDEFINED tag if you want to use a different macro definition that
+# overrules the definition found in the source code.
+
+EXPAND_AS_DEFINED      =
+
+# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then
+# doxygen's preprocessor will remove all references to function-like macros
+# that are alone on a line, have an all uppercase name, and do not end with a
+# semicolon, because these will confuse the parser if not removed.
+
+SKIP_FUNCTION_MACROS   = YES
+
+#---------------------------------------------------------------------------
+# Configuration::additions related to external references
+#---------------------------------------------------------------------------
+
+# The TAGFILES option can be used to specify one or more tagfiles.
+# Optionally an initial location of the external documentation
+# can be added for each tagfile. The format of a tag file without
+# this location is as follows:
+#
+# TAGFILES = file1 file2 ...
+# Adding location for the tag files is done as follows:
+#
+# TAGFILES = file1=loc1 "file2 = loc2" ...
+# where "loc1" and "loc2" can be relative or absolute paths or
+# URLs. If a location is present for each tag, the installdox tool
+# does not have to be run to correct the links.
+# Note that each tag file must have a unique name
+# (where the name does NOT include the path)
+# If a tag file is not located in the directory in which doxygen
+# is run, you must also specify the path to the tagfile here.
+
+TAGFILES               =
+
+# When a file name is specified after GENERATE_TAGFILE, doxygen will create
+# a tag file that is based on the input files it reads.
+
+GENERATE_TAGFILE       =
+
+# If the ALLEXTERNALS tag is set to YES all external classes will be listed
+# in the class index. If set to NO only the inherited external classes
+# will be listed.
+
+ALLEXTERNALS           = NO
+
+# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed
+# in the modules index. If set to NO, only the current project's groups will
+# be listed.
+
+EXTERNAL_GROUPS        = YES
+
+# The PERL_PATH should be the absolute path and name of the perl script
+# interpreter (i.e. the result of `which perl').
+
+PERL_PATH              = /usr/bin/perl
+
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool
+#---------------------------------------------------------------------------
+
+# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will
+# generate an inheritance diagram (in HTML, RTF and LaTeX) for classes with base
+# or super classes. Setting the tag to NO turns the diagrams off. Note that
+# this option also works with HAVE_DOT disabled, but it is recommended to
+# install and use dot, since it yields more powerful graphs.
+
+CLASS_DIAGRAMS         = YES
+
+# You can define message sequence charts within doxygen comments using the \msc
+# command. Doxygen will then run the mscgen tool (see
+# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the
+# documentation. The MSCGEN_PATH tag allows you to specify the directory where
+# the mscgen tool resides. If left empty the tool is assumed to be found in the
+# default search path.
+
+MSCGEN_PATH            =
+
+# If set to YES, the inheritance and collaboration graphs will hide
+# inheritance and usage relations if the target is undocumented
+# or is not a class.
+
+HIDE_UNDOC_RELATIONS   = YES
+
+# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is
+# available from the path. This tool is part of Graphviz, a graph visualization
+# toolkit from AT&T and Lucent Bell Labs. The other options in this section
+# have no effect if this option is set to NO (the default)
+
+HAVE_DOT               = YES
+
+# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is
+# allowed to run in parallel. When set to 0 (the default) doxygen will
+# base this on the number of processors available in the system. You can set it
+# explicitly to a value larger than 0 to get control over the balance
+# between CPU load and processing speed.
+
+DOT_NUM_THREADS        = 0
+
+# By default doxygen will use the Helvetica font for all dot files that
+# doxygen generates. When you want a differently looking font you can specify
+# the font name using DOT_FONTNAME. You need to make sure dot is able to find
+# the font, which can be done by putting it in a standard location or by setting
+# the DOTFONTPATH environment variable or by setting DOT_FONTPATH to the
+# directory containing the font.
+
+DOT_FONTNAME           = Helvetica
+
+# The DOT_FONTSIZE tag can be used to set the size of the font of dot graphs.
+# The default size is 10pt.
+
+DOT_FONTSIZE           = 10
+
+# By default doxygen will tell dot to use the Helvetica font.
+# If you specify a different font using DOT_FONTNAME you can use DOT_FONTPATH to
+# set the path where dot can find it.
+
+DOT_FONTPATH           =
+
+# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for each documented class showing the direct and
+# indirect inheritance relations. Setting this tag to YES will force the
+# CLASS_DIAGRAMS tag to NO.
+
+CLASS_GRAPH            = YES
+
+# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for each documented class showing the direct and
+# indirect implementation dependencies (inheritance, containment, and
+# class references variables) of the class with other documented classes.
+
+COLLABORATION_GRAPH    = YES
+
+# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen
+# will generate a graph for groups, showing the direct groups dependencies
+
+GROUP_GRAPHS           = YES
+
+# If the UML_LOOK tag is set to YES doxygen will generate inheritance and
+# collaboration diagrams in a style similar to the OMG's Unified Modeling
+# Language.
+
+UML_LOOK               = NO
+
+# If set to YES, the inheritance and collaboration graphs will show the
+# relations between templates and their instances.
+
+TEMPLATE_RELATIONS     = NO
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT
+# tags are set to YES then doxygen will generate a graph for each documented
+# file showing the direct and indirect include dependencies of the file with
+# other documented files.
+
+INCLUDE_GRAPH          = YES
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and
+# HAVE_DOT tags are set to YES then doxygen will generate a graph for each
+# documented header file showing the documented files that directly or
+# indirectly include this file.
+
+INCLUDED_BY_GRAPH      = YES
+
+# If the CALL_GRAPH and HAVE_DOT options are set to YES then
+# doxygen will generate a call dependency graph for every global function
+# or class method. Note that enabling this option will significantly increase
+# the time of a run. So in most cases it will be better to enable call graphs
+# for selected functions only using the \callgraph command.
+
+CALL_GRAPH             = NO
+
+# If the CALLER_GRAPH and HAVE_DOT tags are set to YES then
+# doxygen will generate a caller dependency graph for every global function
+# or class method. Note that enabling this option will significantly increase
+# the time of a run. So in most cases it will be better to enable caller
+# graphs for selected functions only using the \callergraph command.
+
+CALLER_GRAPH           = NO
+
+# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen
+# will generate a graphical hierarchy of all classes instead of a textual one.
+
+GRAPHICAL_HIERARCHY    = YES
+
+# If the DIRECTORY_GRAPH, SHOW_DIRECTORIES and HAVE_DOT tags are set to YES
+# then doxygen will show the dependencies a directory has on other directories
+# in a graphical way. The dependency relations are determined by the #include
+# relations between the files in the directories.
+
+DIRECTORY_GRAPH        = YES
+
+# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
+# generated by dot. Possible values are svg, png, jpg, or gif.
+# If left blank png will be used. If you choose svg you need to set
+# HTML_FILE_EXTENSION to xhtml in order to make the SVG files
+# visible in IE 9+ (other browsers do not have this requirement).
+
+DOT_IMAGE_FORMAT       = png
+
+# If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to
+# enable generation of interactive SVG images that allow zooming and panning.
+# Note that this requires a modern browser other than Internet Explorer.
+# Tested and working are Firefox, Chrome, Safari, and Opera. For IE 9+ you
+# need to set HTML_FILE_EXTENSION to xhtml in order to make the SVG files
+# visible. Older versions of IE do not have SVG support.
+
+INTERACTIVE_SVG        = NO
+
+# The tag DOT_PATH can be used to specify the path where the dot tool can be
+# found. If left blank, it is assumed the dot tool can be found in the path.
+
+DOT_PATH               =
+
+# The DOTFILE_DIRS tag can be used to specify one or more directories that
+# contain dot files that are included in the documentation (see the
+# \dotfile command).
+
+DOTFILE_DIRS           =
+
+# The MSCFILE_DIRS tag can be used to specify one or more directories that
+# contain msc files that are included in the documentation (see the
+# \mscfile command).
+
+MSCFILE_DIRS           =
+
+# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of
+# nodes that will be shown in the graph. If the number of nodes in a graph
+# becomes larger than this value, doxygen will truncate the graph, which is
+# visualized by representing a node as a red box. Note that if the
+# number of direct children of the root node in a graph is already larger than
+# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note
+# that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
+
+DOT_GRAPH_MAX_NODES    = 50
+
+# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the
+# graphs generated by dot. A depth value of 3 means that only nodes reachable
+# from the root by following a path via at most 3 edges will be shown. Nodes
+# that lay further from the root node will be omitted. Note that setting this
+# option to 1 or 2 may greatly reduce the computation time needed for large
+# code bases. Also note that the size of a graph can be further restricted by
+# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
+
+MAX_DOT_GRAPH_DEPTH    = 0
+
+# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
+# background. This is disabled by default, because dot on Windows does not
+# seem to support this out of the box. Warning: Depending on the platform used,
+# enabling this option may lead to badly anti-aliased labels on the edges of
+# a graph (i.e. they become hard to read).
+
+DOT_TRANSPARENT        = NO
+
+# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
+# files in one run (i.e. multiple -o and -T options on the command line). This
+# makes dot run faster, but since only newer versions of dot (>1.8.10)
+# support this, this feature is disabled by default.
+
+DOT_MULTI_TARGETS      = YES
+
+# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will
+# generate a legend page explaining the meaning of the various boxes and
+# arrows in the dot generated graphs.
+
+GENERATE_LEGEND        = YES
+
+# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will
+# remove the intermediate dot files that are used to generate
+# the various graphs.
+
+DOT_CLEANUP            = YES
diff --git a/doc/applications/likwid-accessD.md b/doc/applications/likwid-accessD.md
new file mode 100644
index 0000000..c80481e
--- /dev/null
+++ b/doc/applications/likwid-accessD.md
@@ -0,0 +1,55 @@
+/*! \page likwid-accessD <CODE>likwid-accessD</CODE>
+
+<H1>Information</H1>
+
+<CODE>likwid-accessD</CODE> is a command line application that opens a UNIX file socket and waits for access
+operations from LIKWID tools that require access to the MSR and PCI device
+files. The MSR and PCI device files are commonly only accessible for users with root
+privileges, therefore <CODE>likwid-accessD</CODE> requires the suid-bit set or a suitable libcap setting.
+Depending on the current system architecture, <CODE>likwid-accessD</CODE> permits only access to registers defined for the architecture.
+
+<!--<H1>Security concerns</H1>
+The <CODE>likwid-accessD</CODE> is a critical part of LIKWID. The accesses to the MSR and often also PCI devices are restricted to users with root privileges. In order to allow users the access to the MSR/PCI devices, the users have to get temporarily elevated privileges. There are currently two ways of achieving this in the Linux operating system. The convenient method are the suid/guid bits that allow an application to execute with the privileges of the owner (suid) or group (guid). Th [...]
+Both methods should be safe but there are exploits for the MSR devices, general suid applications and the <CODE>cap_sys_rawio</CODE>. We checked all exploits we found and built the access daemon so that it is not vulnerable for the exploits. By restricting the accessible registers and closing all file handles -->
+
+<H1>Build</H1>
+The building of <CODE>likwid-accessD</CODE> can be controlled through the <CODE>config.mk</CODE> file. Depending on the variable <CODE>BUILDDAEMON</CODE> the daemon code is built or not. The path to <CODE>likwid-accessD</CODE> is compiled into the LIKWID library, so if you want to use the access daemon from an uncommon path, you have to set the <CODE>ACCESSDAEMON</CODE> variable.
+
+<H1>Setup</H1>
+In order to allow <CODE>likwid-accessD</CODE> to run with elevated privileges, there are three ways:
+<UL>
+<LI>SUID Method:<BR>
+<CODE>
+root: # chown root:root likwid-accessD<BR>
+root: # chmod u+s likwid-accessD<BR>
+</CODE>
+</LI>
+<LI>GUID Method: (PCI devices cannot be accessed with this method but we are working on it)<BR>
+<CODE>
+root: # groupadd likwid<BR>
+root: # chown root:likwid likwid-accessD<BR>
+root: # chmod g+s likwid-accessD<BR>
+</CODE>
+</LI>
+<LI>Libcap Method:<BR>
+<CODE>
+root: # setcap cap_sys_rawio+ep likwid-accessD
+</CODE>
+</LI>
+</UL>
+There are Linux distributions where setting the suid permission on <CODE>likwid-accessD</CODE> is not enough. In that case, also try setting the capabilities for <CODE>likwid-accessD</CODE>.
+
+<H1>Protocol</H1>
+Every likwid instance will start its own daemon. This client-server pair will communicate through a socket file in <CODE>/tmp</CODE> named <CODE>likwid-$PID</CODE>. The daemon only accepts one connection. As soon as the connection is established, the socket file will be deleted.
+
+From there the communication consists of write/read pairs issued from the client. The daemon only permits accesses to the register ranges relevant for the likwid applications. Any other register access will be silently dropped and logged to <CODE>syslog</CODE>.
+
+On shutdown the client will terminate the daemon with an exit message.
+
+The daemon has the following error handling:
+<UL>
+<LI>To avoid lingering daemons that were not stopped correctly, the daemon has a timeout on startup.</LI>
+<LI>If the client prematurely disconnects the daemon terminates.</LI>
+<LI>If the client disconnects between a read and write the daemon catches <CODE>SIGPIPE</CODE>  and disconnects.</LI>
+</UL>
+*/
diff --git a/doc/applications/likwid-agent.md b/doc/applications/likwid-agent.md
new file mode 100644
index 0000000..44cbb65
--- /dev/null
+++ b/doc/applications/likwid-agent.md
@@ -0,0 +1,94 @@
+/*! \page likwid-agent <CODE>likwid-agent</CODE>
+
+<H1>Information</H1>
+<CODE>likwid-agent</CODE> is a daemon application that uses \ref likwid-perfctr to measure hardware performance counters and write them to various output back-ends. The basic configuration is in a global configuration file that must be given on command line. The configuration of the hardware event sets is done with extra files suitable for each architecture. Besides the hardware event configuration, the raw data can be transformed using formulas to interested metrics. In order to output  [...]
+
+<H1>Config file</H1>
+The global configuration file has the following options:
+<TABLE>
+<TR>
+  <TH>Option
+                                             
+</TH>
+  <TH>Description</TH>
+</TR>
+<TR>
+  <TD>GROUPPATH <path></TD>
+  <TD>Path to the group files containing event set and output definitions. See section <B>Group files</B> for information.</TD>
+</TR>
+<TR>
+  <TD>EVENTSET <group1> <group2> ...</TD>
+  <TD>Space separated list of groups (without .txt) that should be monitored.</TD>
+</TR>
+<TR>
+  <TD>DURATION <time></TD>
+  <TD>Measurement duration in seconds for each group.</TD>
+</TR>
+<TR>
+  <TD>LOGPATH <path></TD>
+  <TD>Sets the output logfile path for the measured data. Each monitoring group logs to its own file likwid.<group>.log</TD>
+</TR>
+<TR>
+  <TD>LOGSTYLE <update/log></TD>
+  <TD>Specifies whether new data should be appended to the files (log) or the file should be emptied first (update).<BR> Update is a common option if you read in the data afterwards by some monitoring tool like cacti, nagios, ... Default is log</TD>
+</TR>
+<TR>
+  <TD>GMETRIC <True/False></TD>
+  <TD>Activates the output to gmetric.</TD>
+</TR>
+<TR>
+  <TD>GMETRICPATH <path></TD>
+  <TD>Set path to the gmetric executable.</TD>
+</TR>
+<TR>
+  <TD>GMETRICCONFIG <path></TD>
+  <TD>Set path to a custom gmetric config file.</TD>
+</TR>
+<TR>
+  <TD>RRD <True/False></TD>
+  <TD>Activates the output to RRD files (Round Robin Database).</TD>
+</TR>
+<TR>
+  <TD>RRDPATH <path></TD>
+  <TD>Output path for the RRD files. The files are named according to the group and each output metric is saved as DS with function GAUGE. The RRD is configured with RRA entries to store average, minimum and maximum of 10 minutes for one hour, of 60 min for one day and daily data for one month.</TD>
+</TR>
+<TR>
+  <TD>SYSLOG <True/False></TD>
+  <TD>Activates the output to system log using logger.</TD>
+</TR>
+<TR>
+  <TD>SYSLOGPRIO <prio></TD>
+  <TD>Set the priority for the system log. The default priority is 'local0.notice'.</TD>
+</TR>
+</TABLE>
+
+<H1>Group files</H1>
+The group files are adapted performance group files as used by <CODE>likwid-perfctr</CODE>.
+This makes it easy to use the predefined and often used performance groups as basis for the monitoring. The folder structure for the groups is <CODE><GROUPPATH>/<SHORT_ARCH_NAME>/</CODE> with <SHORT_ARCH_NAME> similar to the ones for the performance groups, like 'sandybridge' or 'haswellEP'.
+
+
+<TABLE>
+<TR>
+  <TH>Option
+                                            
+</TH>
+  <TH>Description</TH>
+</TR>
+<TR>
+  <TD>SHORT <string></TD>
+  <TD>A short descriptive information about the group.</TD>
+</TR>
+<TR>
+  <TD>EVENTSET<BR><counter1> <event1><BR><counter2>:<option1>:<option2> <event2></TD>
+  <TD>Definition of the eventset similar to the performance groups. See performance_groups for details.</TD>
+</TR>
+<TR>
+  <TD>METRICS<BR><metricname> <formula><BR><filter> <metricname> <formula></TD>
+  <TD>Definition of the output metrics. The syntax follows the METRICS definition of the performance groups as used by \ref likwid-perfctr . If no function is set at the beginning of the line, <formula> is evaluated for every CPU and send to the output back-ends. The <metricname> gets the prefix "T<cpuid> ". To avoid writing to much data to the back-ends, the data can be reduced by <filter>. The possible filter options are MIN, MAX, AVG, SUM, ONCE. The ONCE filter [...]
+</TR>
+
+</TABLE>
+
+<H1>Notice</H1>
+There is currently no predefined init script for <CODE>likwid-agent</CODE>, you have to create it yourself for your distribution.
+*/
diff --git a/doc/applications/likwid-bench.md b/doc/applications/likwid-bench.md
new file mode 100644
index 0000000..fc642e1
--- /dev/null
+++ b/doc/applications/likwid-bench.md
@@ -0,0 +1,93 @@
+/*! \page likwid-bench <CODE>likwid-bench</CODE>
+
+<H1>Information</H1>
+<CODE>likwid-bench</CODE> is a benchmark suite for low-level (assembly) benchmarks to measure bandwidths and instruction throughput for specific instruction code on x86 systems. The currently included benchmark codes include common data access patterns like load and store but also calculations like vector triad and sum.
+<CODE>likwid-bench</CODE> includes architecture specific benchmarks for x86, x86_64 and x86 for Intel Xeon Phi coprocessors. The performance values can either be calculated by <CODE>likwid-bench</CODE> or measured using hardware performance counters by using \ref likwid-perfctr as a wrapper to <CODE>likwid-bench</CODE>. This requires building <CODE>likwid-bench</CODE> with instrumentation enabled in config.mk (<CODE>INSTRUMENT_BENCH</CODE>).
+
+
+<H1>Options</H1>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Description</TH>
+</TR>
+<TR>
+  <TD>-h</TD>
+  <TD>Print help message</TD>
+</TR>
+<TR>
+  <TD>-a</TD>
+  <TD>List all available benchmarks</TD>
+</TR>
+<TR>
+  <TD>-p</TD>
+  <TD>List all available thread affinity domains</TD>
+</TR>
+<TR>
+  <TD>-d <delim></TD>
+  <TD>Use <delim> instead of ',' for the output of -p</TD>
+</TR>
+<TR>
+  <TD>-l <test></TD>
+  <TD>List characteristics of <test> like number of streams, data used per loop iteration, ...</TD>
+</TR>
+<TR>
+  <TD>-t <test></TD>
+  <TD>Perform assembly benchmark <test></TD>
+</TR>
+<TR>
+  <TD>-s <min_time></TD>
+  <TD>Minimal time in seconds to run the benchmark.<BR>Using this time, the iteration count is determined automatically to provide reliable results. Default is 1. If the determined iteration count is below 10, it is normalized to 10.</TD>
+</TR>
+<TR>
+  <TD>-w <workgroup></TD>
+  <TD>Set a workgroup for the benchmark. A workgroup can have different formats:<BR>
+  <TABLE>
+    <TR>
+      <TH>Format</TH>
+      <TH>Description</TH>
+    </TR>
+    <TR>
+      <TD><affinity_domain>:<size></TD>
+      <TD>Allocate in total <size> in affinity domain <affinity_domain>.<BR><CODE>likwid-bench</CODE> starts as many threads as available in affinity domain <affinity_domain></TD>
+    </TR>
+    <TR>
+      <TD><affinity_domain>:<size>:<num_threads></TD>
+      <TD>Allocate in total <size> in affinity domain <affinity_domain>.<BR><CODE>likwid-bench</CODE> starts <num_threads> in affinity domain <affinity_domain></TD>
+    </TR>
+    <TR>
+      <TD><affinity_domain>:<size>:<num_threads>:<chunk_size>:<stride></TD>
+      <TD>Allocate in total <size> in affinity domain <affinity_domain>.<BR><CODE>likwid-bench</CODE> starts <num_threads> in affinity domain <affinity_domain> with <chunk_size> selected in row and a distance of <stride>.<BR>See \ref CPU_expressions on the \ref likwid-pin page for further information.</TD>
+    </TR>
+    <TR>
+      <TD><above_formats>-<streamID>:<stream_domain></TD>
+      <TD>In combination with every above mentioned format, the test streams (arrays, vectors) can be place in different affinity domains than the threads.<BR>This can be achieved by adding a stream placement option -<streamID>:<stream_domain> for all streams of the test to the workgroup definition.<BR>The stream with <streamID> is placed in affinity domain <stream_domain>.<BR>The amount of streams of a test can be determined with the -l <test> commandline o [...]
+    </TR>
+  </TABLE>
+  </TD>
+</TR>
+</TABLE>
+
+
+<H1>Examples</H1>
+<UL>
+<LI><CODE>likwid-bench -t copy -w S0:100kB</CODE><BR>
+Run test <CODE>copy</CODE> using all threads in affinity domain <CODE>S0</CODE>. The input and output stream of the <CODE>copy</CODE> benchmark sum up to <CODE>100kB</CODE> placed in affinity domain <CODE>S0</CODE>. The iteration count is calculated automatically.
+</LI>
+<LI><CODE>likwid-bench -t triad -i 100 -w S0:1GB:2:1:2</CODE><BR>
+Run test <CODE>triad</CODE> using <CODE>2</CODE> threads in affinity domain <CODE>S0</CODE>. Assuming <CODE>S0 = 0,4,1,5</CODE> the threads are pinned to CPUs 0 and 1, i.e. one thread is skipped during selection. The streams of the <CODE>triad</CODE> benchmark sum up to <CODE>1GB</CODE> placed in affinity domain <CODE>S0</CODE>. The number of iterations is explicitly set to <CODE>100</CODE>.
+</LI>
+<LI><CODE>likwid-bench -t update -w S0:100kB -w S1:100kB</CODE><BR>
+Run test <CODE>update</CODE> using all threads in affinity domain <CODE>S0</CODE> and <CODE>S1</CODE>. The threads scheduled on <CODE>S0</CODE> use streams that sum up to <CODE>100kB</CODE>. Similarly, the threads scheduled on <CODE>S1</CODE> work only on their socket-local streams. The results of both workgroups are combined.
+</LI>
+<LI><CODE>likwid-perfctr -c E:S0:4 -g MEM -m likwid-bench -t update -w S0:100kB:4</CODE><BR>
+Run test <CODE>update</CODE> using <CODE>4</CODE> threads in affinity domain <CODE>S0</CODE>. The input and output stream of the <CODE>copy</CODE> benchmark sum up to <CODE>100kB</CODE> placed in affinity domain <CODE>S0</CODE>. The benchmark execution is measured using the \ref Marker_API. It measures the <CODE>MEM</CODE> performance group on the first four CPUs of the <CODE>S0</CODE> affinity domain. For further information about hardware performance counters see \ref likwid-perfctr<BR [...]
+</LI>
+<LI><CODE>likwid-bench -t copy -w S0:1GB:2:1:2-0:S1,1:S1</CODE><BR>
+Run test <CODE>copy</CODE> using <CODE>2</CODE> threads in affinity domain <CODE>S0</CODE> skipping one thread during selection. The two streams used in the <CODE>copy</CODE> benchmark have the IDs 0 and 1 and a summed up size of <CODE>1GB</CODE>. Both streams are placed in affinity domain <CODE>S1</CODE>.
+</LI>
+</UL>
+
+
+
+*/
diff --git a/doc/applications/likwid-genTopoCfg.md b/doc/applications/likwid-genTopoCfg.md
new file mode 100644
index 0000000..ae758c8
--- /dev/null
+++ b/doc/applications/likwid-genTopoCfg.md
@@ -0,0 +1,29 @@
+/*! \page likwid-genTopoCfg <CODE>likwid-genTopoCfg</CODE>
+
+<H1>Information</H1>
+<CODE>likwid-genTopoCfg</CODE> is a command line application that stores the system's CPU and NUMA topology to
+file. LIKWID applications use this file to read in the topology fast instead of re-gathering all values. The path to the topology configuration can be set in the global LIKWID configuration file, see \ref likwid.cfg.
+
+<H1>Options</H1>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Description</TH>
+</TR>
+<TR>
+  <TD>-h, --help</TD>
+  <TD>Print help message.</TD>
+</TR>
+<TR>
+  <TD>-v, --version</TD>
+  <TD>Print version information.</TD>
+</TR>
+<TR>
+  <TD>-o <file></TD>
+  <TD>Use <file> instead of the default output /etc/likwid-topo.cfg.</TD>
+</TR>
+</TABLE>
+
+
+*/
+
diff --git a/doc/applications/likwid-memsweeper.md b/doc/applications/likwid-memsweeper.md
new file mode 100644
index 0000000..570c7cb
--- /dev/null
+++ b/doc/applications/likwid-memsweeper.md
@@ -0,0 +1,34 @@
+/*! \page likwid-memsweeper <CODE>likwid-memsweeper</CODE>
+
+<H1>Information</H1>
+<CODE>likwid-memsweeper</CODE> is a command line application to shrink the file buffer cache by filling the NUMA domain with random pages. Moreover, the tool invalidates all cachelines in the LLC.
+
+
+<H1>Options</H1>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Description</TH>
+</TR>
+<TR>
+  <TD>-h, --help</TD>
+  <TD>Print help message.</TD>
+</TR>
+<TR>
+  <TD>-v, --version</TD>
+  <TD>Print version information.</TD>
+</TR>
+<TR>
+  <TD>-c <list></TD>
+  <TD>Sweeps the memory and LLC cache for NUMA domains listed in <list>.</TD>
+</TR>
+</TABLE>
+
+<H1>Examples</H1>
+<UL>
+<LI><CODE>likwid-memsweeper -c 0,1</CODE><BR>
+Cleans the memory and LLC on NUMA nodes identified by the node IDs 0 and 1.
+</LI>
+</UL>
+
+*/
diff --git a/doc/applications/likwid-mpirun.md b/doc/applications/likwid-mpirun.md
new file mode 100644
index 0000000..aee12d6
--- /dev/null
+++ b/doc/applications/likwid-mpirun.md
@@ -0,0 +1,83 @@
+/*! \page likwid-mpirun <CODE>likwid-mpirun</CODE>
+
+<H1>Information</H1>
+<CODE>likwid-mpirun</CODE>
+A tool to start and monitor MPI applications with LIKWID. It can be used as supplement of the MPI implementations' startup programm like <CODE>mpirun</CODE> or <CODE>mpiexec</CODE> with some enhancements for pinning of OpenMP thread in hybrid jobs. Moreover, <CODE>likwid-mpirun</CODE> can insert calls to \ref likwid-perfctr to measure hardware performance counters for each MPI process and its threads, including Marker API. Since the <A HREF="http://modules.sourceforge.net/">modules</A> s [...]
+
+<H1>Options</H1>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Description</TH>
+</TR>
+<TR>
+  <TD>-h, --help</TD>
+  <TD>Print help message</TD>
+</TR>
+<TR>
+  <TD>-v, --version</TD>
+  <TD>Print version information</TD>
+</TR>
+<TR>
+  <TD>-d, --debug</TD>
+  <TD>Print debug information</TD>
+</TR>
+<TR>
+  <TD>-n, -np, --n, --np <arg></TD>
+  <TD>Specify the number of processes for MPI</TD>
+</TR>
+<TR>
+  <TD>--nperdomain <domain>:<arg></TD>
+  <TD>Schedule <arg> MPI processes for each affinity domain starting with <domain>, e.g. S:2 translates to two MPI processes per socket.<BR><CODE>likwid-mpirun</CODE> assumes that all participating hosts have the same topology.</TD>
+</TR>
+<TR>
+  <TD>--hostfile <file></TD>
+  <TD>Specify the file that should be used as hostfile.<BR>If not set, <CODE>likwid-mpirun</CODE> checks the <CODE>PBS_NODEFILE</CODE>, <CODE>LOADL_HOSTFILE</CODE> and <CODE>SLURM_HOSTFILE</CODE> environment variables</TD>
+</TR>
+<TR>
+  <TD>--pin <expr></TD>
+  <TD>For hybrid pinning specify the thread pinning expression for each MPI process.<BR>The format is similar to \ref CPU_expressions separated by '_' for multiple processes.<BR>If -np is not set, the number of MPI processes is calculated using the pinning expressions.</TD>
+</TR>
+<TR>
+  <TD>-s, --skip <arg></TD>
+  <TD>'arg' must be a bitmask in hex. Threads with the ID equal to a set bit in bitmask will be skipped during pinning<BR>Example: 0x1 = Thread 0 is skipped.</TD>
+</TR>
+<TR>
+  <TD>--mpi <mpitype></TD>
+  <TD>Specify the type of the MPI implementation.<BR><CODE>likwid-mpirun</CODE> tries to read the MPI implementation from the <A HREF="http://modules.sourceforge.net/">modules</A> system.<BR>If not recognized automatically, possible values are <B>intelmpi</B>, <B>openmpi</B> and <B>mvapich2</B>.</TD>
+</TR>
+<TR>
+  <TD>--omp <omptype></TD>
+  <TD>Specify the type of OpenMP implementation.<BR><CODE>likwid-mpirun</CODE> tries to read the OpenMP implementation using <I>ldd</I> and the <A HREF="http://modules.sourceforge.net/">modules</A> system.<BR>If not recognized automatically, possible values are <B>intel</B> and <B>gnu</B></TD>
+</TR>
+<TR>
+  <TD>-g, --group <eventset></TD>
+  <TD>Use \ref likwid-perfctr to measure performance data for the MPI processes and OpenMP threads.<BR><eventset> can be either a performance group or a custom event string.<BR>For details see \ref performance_groups.</TD>
+</TR>
+<TR>
+  <TD>-m, --marker</TD>
+  <TD>Activate the \ref Marker_API for the measurements with \ref likwid-perfctr.</TD>
+</TR>
+<TR>
+  <TD>-O</TD>
+  <TD>Print results in CSV format (conform to <A HREF="https://tools.ietf.org/html/rfc4180">RFC 4180</A>)</TD>
+</TR>
+</TABLE>
+
+<H1>Examples</H1>
+<UL>
+<LI><CODE>likwid-mpirun -np 32 ./a.out</CODE><BR>
+Runs <CODE>./a.out</CODE> with 32 MPI processes distributed over the hosts in <CODE>PBS_NODEFILE</CODE>
+</LI>
+<LI><CODE>likwid-mpirun -nperdomain S:1 ./a.out</CODE><BR>
+Runs <CODE>./a.out</CODE> using one MPI process per socket over the hosts in <CODE>PBS_NODEFILE</CODE>, <CODE>LOADL_HOSTFILE</CODE> or <CODE>SLURM_HOSTFILE</CODE>.<BR>The total amount of processes is calculated by <numberOfSocketDomains> * <processCountPerDomain> * <hostsInHostfile>
+</LI>
+<LI><CODE>likwid-mpirun --hostfile host.list -pin S0:2_S1:2 ./a.out</CODE><BR>
+Runs <CODE>./a.out</CODE> using two MPI processes per host in <CODE>host.list</CODE>.<BR>The first MPI process on each host and its 2 threads are pinned to the first two CPUs on socket <CODE>S0</CODE>,<BR>the second MPI process on each host and its 2 threads are pinned to the first two CPUs on socket <CODE>S1</CODE>
+</LI>
+<LI><CODE>likwid-mpirun -nperdomain S:2 -g MEM ./a.out</CODE><BR>
+Runs <CODE>./a.out</CODE> with 2 MPI processes per socket on each host in <CODE>PBS_NODEFILE</CODE>, <CODE>LOADL_HOSTFILE</CODE> or <CODE>SLURM_HOSTFILE</CODE> and measure the <CODE>MEM</CODE> performance group<BR>
+Only one process per socket measures the Uncore/RAPL counters, the other one(s) only core-local counters.
+</LI>
+</UL>
+*/
diff --git a/doc/applications/likwid-perfctr.md b/doc/applications/likwid-perfctr.md
new file mode 100644
index 0000000..9efc789
--- /dev/null
+++ b/doc/applications/likwid-perfctr.md
@@ -0,0 +1,260 @@
+/*! \page likwid-perfctr <CODE>likwid-perfctr</CODE>
+
+<H1>Information</H1>
+<CODE>likwid-perfctr</CODE> is a lightweight command line application to configure and read out hardware performance monitoring data
+on supported x86 processors. It can measure either as wrapper without changing the measured application
+or with \ref Marker_API functions inside the code, which will turn the counters on and off. Moreover, there are the timeline and stethoscope modes.
+There are preconfigured performance groups with useful event sets and derived metrics. Additionally, arbitrary events can be measured with
+custom event sets. The \ref Marker_API can measure multiple named regions and the results are accumulated over multiple region calls.
+<P>
+<B>Note</B> that <CODE>likwid-perfctr</CODE> measures all events on the specified CPUs and not only the context of the executable. On a highly loaded system it will be hard to determine which part of the given application caused the counter increment. Moreover, it is necessary to ensure that processes and threads are pinned to dedicated resources. You can either pin the application yourself or use the builtin pin functionality.
+
+<H1>Options</H1>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Description</TH>
+</TR>
+<TR>
+  <TD>-h, --help</TD>
+  <TD>Print help message.</TD>
+</TR>
+<TR>
+  <TD>-v, --version</TD>
+  <TD>Print version information.</TD>
+</TR>
+<TR>
+  <TD>-V, --verbose <level></TD>
+  <TD>Verbose output during execution for debugging. Possible values for <level>:
+  <TABLE>
+    <TR>
+      <TD>0</TD>
+      <TD>Output only errors</TD>
+    </TR>
+    <TR>
+      <TD>1</TD>
+      <TD>Output some information</TD>
+    </TR>
+    <TR>
+      <TD>2</TD>
+      <TD>Output detailed information</TD>
+    </TR>
+    <TR>
+      <TD>3</TD>
+      <TD>Output developer information</TD>
+    </TR>
+  </TABLE>
+  </TD>
+</TR>
+<TR>
+  <TD>-i, --info</TD>
+  <TD>Print \a CPUID information about processor and about Intel Performance Monitoring features.</TD>
+</TR>
+<TR>
+  <TD>-g, --group <arg></TD>
+  <TD>Specify which event string or performance group should be measured.</TD>
+</TR>
+<TR>
+  <TD>-c <arg></TD>
+  <TD>Defines the CPUs that should be measured<BR>See \ref CPU_expressions on the \ref likwid-pin page for information about the syntax.</TD>
+</TR>
+<TR>
+  <TD>-C <arg></TD>
+  <TD>Defines the CPUs that should be measured and pin the executable to the CPUs<BR>See \ref CPU_expressions on the \ref likwid-pin page for information about the syntax.</TD>
+</TR>
+<TR>
+  <TD>-H</TD>
+  <TD>Print information about a performance group given with -g, --group option.</TD>
+</TR>
+<TR>
+  <TD>-m</TD>
+  <TD>Run in marker API mode</TD>
+</TR>
+<TR>
+  <TD>-a</TD>
+  <TD>Print available performance groups for current processor.</TD>
+</TR>
+<TR>
+  <TD>-e</TD>
+  <TD>Print available counters and performance events and suitable options of current processor.</TD>
+</TR>
+<TR>
+  <TD>-E <pattern></TD>
+  <TD>Print available performance events matching <pattern> and print the usable counters for the found events.<BR>The matching is done with *<pattern>*, so all events matching the substring are returned.</TD>
+</TR>
+<TR>
+  <TD>-o, --output <file></TD>
+  <TD>Store all ouput to file instead of stdout. LIKWID enables the reformatting of output files according to their suffix.<BR>You can place additional output formatters in folder <CODE><PREFIX>/share/likwid/filter</CODE>. LIKWID ships with one filter script <CODE>xml</CODE> written in Perl and a Perl template for developing own output scripts. If the suffix is <CODE>.csv</CODE>, the internal CSV printer is used for file output.<BR>Moreover, there are substitutions possible in the  [...]
+</TR>
+<TR>
+  <TD>-S <time></TD>
+  <TD>Specify the time between starting and stopping of counters. Can be used to monitor applications. Option does not require an executable<BR>Examples for <time> are 1s, 250ms, 500us.</TD>
+</TR>
+<TR>
+  <TD>-t <time></TD>
+  <TD>Activates the timeline mode that reads the counters in the given frequency <time> during the whole run of the executable<BR>Examples for <time> are 1s, 250ms, 500us.</TD>
+</TR>
+<TR>
+  <TD>-T <time></TD>
+  <TD>If multiple event sets are given on commandline, switch every <time> to next group. Default is 2s.<BR>Examples for <time> are 1s, 250ms, 500us.<BR>If only a single event set is given, the default read frequency is 30s to catch overflows.</TD>
+</TR>
+<TR>
+  <TD>-O</TD>
+  <TD>Print output in CSV format (conforming to <A HREF="https://tools.ietf.org/html/rfc4180">RFC 4180</A>). The output contains some markers that help to parse the output.</TD>
+</TR>
+<TR>
+  <TD>-s, --skip <arg></TD>
+  <TD>'arg' must be a bitmask in hex. Threads with the ID equal to a set bit in bitmask will be skipped during pinning<BR>Example: 0x1 = Thread 0 is skipped.</TD>
+</TR>
+</TABLE>
+
+<H1>Examples</H1>
+<UL>
+<LI><CODE>likwid-perfctr -C 0-2 -g TLB ./a.out</CODE><BR>
+Pin the executable <CODE>./a.out</CODE> to CPUs 0,1,2 and measure on the specified CPUs the performance group <CODE>TLB</CODE>. If not set, the environment variable <CODE>OMP_NUM_THREADS</CODE> is set to 3.
+</LI>
+<LI><CODE>likwid-perfctr  -C 0-4  -g INSTRUCTIONS_RETIRED_SSE:PMC0,CPU_CLOCKS_UNHALTED:PMC3 ./a.out</CODE><BR>
+Pin the executable <CODE>./a.out</CODE> to CPUs 0,1,2,3,4 and measure on the specified CPUs the event set <CODE>INSTRUCTIONS_RETIRED_SSE:PMC0,CPU_CLOCKS_UNHALTED:PMC3</CODE>.<BR>The event set consists of two event definitions:
+    <UL>
+    <LI><CODE>INSTRUCTIONS_RETIRED_SSE:PMC0</CODE> measures event <CODE>INSTRUCTIONS_RETIRED_SSE</CODE> using counter register named <CODE>PMC0</CODE></LI>
+    <LI><CODE>CPU_CLOCKS_UNHALTED:PMC3</CODE> measures event <CODE>CPU_CLOCKS_UNHALTED</CODE> using counter register named <CODE>PMC3</CODE>. This event can be used to calculate the run time of the application.</LI>
+    </UL>
+</LI>
+
+<LI><CODE>likwid-perfctr -C 0 -g INSTR_RETIRED_ANY:FIXC0,CPU_CLK_UNHALTED_CORE:FIXC1,UNC_L3_LINES_IN_ANY:UPMC0 ./a.out</CODE><BR>
+Run and pin executable <CODE>./a.out</CODE> on CPU 0 with a custom event set containing three events.<BR>The event set consists of three event definitions:
+    <UL>
+    <LI><CODE>INSTR_RETIRED_ANY:FIXC0</CODE> measures event <CODE>INSTR_RETIRED_ANY</CODE> using Intel's fixed-purpose counter register named <CODE>FIXC0</CODE>.</LI>
+    <LI><CODE>CPU_CLK_UNHALTED_CORE:FIXC1</CODE> measures event <CODE>CPU_CLK_UNHALTED_CORE</CODE> using Intel's fixed-purpose counter register named <CODE>FIXC1</CODE>. This event can be used to calculate the run time of the application.</LI>
+    <LI><CODE>UNC_L3_LINES_IN_ANY:UPMC0</CODE> measures event <CODE>UNC_L3_LINES_IN_ANY</CODE> using Uncore counter register named <CODE>UPMC0</CODE>. Uncore counters are socket-specific, hence LIKWID reads the counter registers only on one CPU per socket.</LI>
+    </UL>
+</LI>
+
+<LI><CODE>likwid-perfctr -m -C 0-4  -g INSTRUCTIONS_RETIRED_SSE:PMC0,CPU_CLOCKS_UNHALTED:PMC3 ./a.out</CODE><BR>
+Run and pin the executable to CPUs 0,1,2,3,4 and activate the Marker API. The code in <CODE>a.out</CODE> is assumed to be instrumented with LIKWID's Marker API. Only the marked code regions are measured.
+    <UL>
+    <LI><CODE>INSTRUCTIONS_RETIRED_SSE:PMC0</CODE> measures event <CODE>INSTRUCTIONS_RETIRED_SSE</CODE> using counter register named <CODE>PMC0</CODE>.</LI>
+    <LI><CODE>CPU_CLOCKS_UNHALTED:PMC3</CODE> measures event <CODE>CPU_CLOCKS_UNHALTED</CODE> using counter register named <CODE>PMC3</CODE>. This event can be used to calculate the run time of the application.</LI>
+    </UL>
+The Marker API for C/C++ offers 6 functions to measure named regions. You can use instrumented code with and without LIKWID. In order to activate the Marker API, <CODE>-DLIKWID_PERFMON</CODE> needs to be added to the compiler call. The following listing describes each function shortly (complete list see \ref Marker_API):
+    <UL>
+    <LI><CODE>LIKWID_MARKER_INIT</CODE>: Initialize LIKWID globally. Must be called in serial region and only once.</LI>
+    <LI><CODE>LIKWID_MARKER_THREADINIT</CODE>: Initialize LIKWID for each thread. Must be called in parallel region and executed by every thread.</LI>
+    <LI><CODE>LIKWID_MARKER_START('compute')</CODE>: Start a code region and associate it with the name 'compute'. The names are freely selectable and are used for grouping and outputting regions.</LI>
+    <LI><CODE>LIKWID_MARKER_STOP('compute')</CODE>: Stop the code region associated with the name 'compute'.</LI>
+    <LI><CODE>LIKWID_MARKER_SWITCH</CODE>: Switches to the next performance group or event set in a round-robin fashion. Can be used to measure the same region with multiple events. If called inside a code region, the results for all groups will be faulty. Be aware that each programming of the config registers causes overhead.</LI>
+    <LI><CODE>LIKWID_MARKER_CLOSE</CODE>: Finalize LIKWID globally. Should be called in the end of your application. This writes out all region results to a file that is picked up by <CODE>likwid-perfctr</CODE> for evaluation.</LI>
+    </UL>
+</LI>
+
+<LI><CODE>likwid-perfctr -c 0-3  -g FLOPS_DP -t 300ms ./a.out 2> out.txt</CODE><BR>
+Runs the executable <CODE>a.out</CODE> and measures the performance group <CODE>FLOPS_DP</CODE> on CPUs 0,1,2,3 every 300 ms. Since <CODE>-c</CODE> is used, the application is not pinned to the CPUs and <CODE>OMP_NUM_THREADS</CODE> is not set. The performance group <CODE>FLOPS_DP</CODE> is not available on every architecture, use <CODE>likwid-perfctr -a</CODE> for a complete list. Please note, that <CODE>likwid-perfctr</CODE> writes the measurements to stderr while the application's outp [...]
+The syntax of the timeline mode output lines is:<BR>
+<CODE><groupID> <numberOfEvents> <numberOfThreads> <Timestamp> <Event1_Thread1> <Event1_Thread2> ... <EventN_ThreadN></CODE><BR>
+You can also use the tool \ref likwid-perfscope to print the measured values live with <CODE>gnuplot</CODE>.
+</LI>
+
+<LI><CODE>likwid-perfctr -c 0-3  -g FLOPS_DP -S 2s</CODE><BR>
+Measures the performance group <CODE>FLOPS_DP</CODE> on CPUs 0,1,2,3 for 2 seconds. This option can be used to measure an application from outside or to perform low-level system monitoring.
+</LI>
+
+<LI><CODE>likwid-perfctr -c S0:0\@S1:0  -g LLC_LOOKUPS_DATA_READ:CBOX0C0:STATE=0x9 -S 2s</CODE><BR>
+Measures the event <CODE> LLC_LOOKUPS_DATA_READ</CODE> on the first CPU of socket 0 and the first CPU on socket 1 for 2 seconds using the counter 0 in CBOX 0 (LLC cache coherency engine). The counting is filtered to only lookups in the 'invalid' and 'modified' state. Look at the microarchitecture Uncore documentation for possible bitmasks. Which option is available for which counter class can be found in section \ref Architectures.
+</LI>
+</UL>
+
+\anchor performance_groups
+<H1>Performance groups</H1>
+One of the outstanding features of LIKWID are the performance groups. Each microarchitecture has its own set of events and related counters and finding the suitable events in the documentation is tedious. Moreover, the raw results of the events are often not meaningful, they need to be combined with other events like run time or clock speed. LIKWID addresses those problems by providing performance groups that specify a set of events and counter combinations as well as a set of derived me [...]
+<B>Please note that performance groups are a feature of the Lua API and not available for the C/C++ API.</B>
+<H3>Directory structure</H3>
+While installation of LIKWID, the performance groups are copied to the path <CODE>${INSTALL_PREFIX}/share/likwid</CODE>. In this folder there is one subfolder per microarchitecture that contains all performance groups for that microarchitecture. The folder names are not freely selectable, they are defined in <CODE>src/topology.c</CODE>. For every microarchitecture at the time of release, there is already a folder that can be extended with your own performance groups. You can change the p [...]
+<H3>Syntax of performance group files</H3>
+<CODE>SHORT <string></CODE> // Short description of the performance group<BR>
+<BR>
+<CODE>EVENTSET</CODE> // Starts the event set definition<BR>
+<CODE><counter>(:<options>) <event></CODE> // Each line defines one event/counter combination with optional options.<BR>
+<CODE>FIXC0 INSTR_RETIRED_ANY</CODE> // Example<BR>
+<BR>
+<CODE>METRICS</CODE> // Starts the derived metric definitions<BR>
+<CODE><metricname> <formula></CODE> // Each line defines one derived metric. <CODE><metricname></CODE> can contain spaces, <CODE><formula></CODE> must be free of spaces. The counter names (with options) and the variables <CODE>time</CODE> and <CODE>inverseClock</CODE> can be used as variables in <CODE><formula></CODE>.<BR>
+<CODE>CPI  FIXC1/FIXC0</CODE> // Example<BR>
+<BR>
+<CODE>LONG</CODE> // Starts the detailed description of the performance group<BR>
+<CODE><TEXT></CODE> // <CODE><TEXT></CODE> is displayed with <CODE>-H</CODE> commandline option
+
+\anchor Marker_API
+<H1>Marker API</H1>
+The Marker API enables measurement of user-defined code regions in order to get deeper insight what is happening at a specific point in the application. The Marker API itself has 8 commands. In order to activate the Marker API, the code must be compiled with <CODE>-DLIKWID_PERFMON</CODE>. If the code is compiled without this define, the Marker API functions perform no operation and cause no overhead. You can also run code compiled with LIKWID_PERFMON defined without measurements but a me [...]
+Even pure serial applications have to call LIKWID_MARKER_THREADINIT to initialize the accessDaemon or the direct accesses.<BR>
+The names for the regions can be freely chosen but <I>whitespaces are not allowed</I>.
+<H2>C/C++ Code</H2>
+<H3>Original code</H3>
+<CODE>
+\#include <stdlib.h><BR>
+\#include <stdio.h><BR>
+\#include <omp.h><BR>
+<BR>
+int main(int argc, char* argv[])<BR>
+{<BR>
+  int i=0;<BR>
+  double sum = 0;<BR>
+\#pragma omp parallel for reduction(+:sum)<BR>
+  for(i=0;i<100000;i++)<BR>
+  {<BR>
+    sum += 1.0/(omp_get_thread_num()+1);<BR>
+  }<BR>
+  printf("Sum is %f\n", sum);<BR>
+  return 0;<BR>
+}<BR>
+</CODE>
+<H3>Instrumented code</H3>
+<CODE>
+\#include <stdlib.h><BR>
+\#include <stdio.h><BR>
+\#include <omp.h><BR>
+\#include <likwid.h><BR>
+<BR>
+int main(int argc, char* argv[])<BR>
+{<BR>
+  int i=0;<BR>
+  double sum = 0;<BR>
+  LIKWID_MARKER_INIT;<BR>
+\#pragma omp parallel<BR>
+{<BR>
+  LIKWID_MARKER_THREADINIT;<BR>
+}<BR>
+\#pragma omp parallel<BR>
+{<BR>
+  LIKWID_MARKER_START("sum");<BR>
+\#pragma omp for reduction(+:sum)<BR>
+  for(i=0;i<100000;i++)<BR>
+  {<BR>
+    sum += 1.0/(omp_get_thread_num()+1);<BR>
+  }<BR>
+  LIKWID_MARKER_STOP("sum");<BR>
+}<BR>
+  printf("Sum is %f\n", sum);<BR>
+  LIKWID_MARKER_CLOSE;<BR>
+  return 0;<BR>
+}<BR>
+</CODE>
+The LIKWID package contains an example code: see \ref C-markerAPI-code or \ref F-markerAPI-code.
+<H3>Running code</H3>
+With the help of <CODE>likwid-perfctr</CODE> the counters are configured to the selected events. The counters are also started and stopped by <CODE>likwid-perfctr</CODE>, the Marker API only reads the counters to minimize the overhead of the instrumented application. Only if you use <CODE>LIKWID_MARKER_SWITCH</CODE> the Marker API itself configures a new event set to the registers. Basically, <CODE>likwid-perfctr</CODE> exports the whole configuration needed by the Marker API through env [...]
+In order to build your instrumented application:<BR>
+<CODE>$CC -openmp -L<PATH_TO_LIKWID_LIBRARY> -I<PATH_TO_LIKWID_INCLUDES> <SRC_CODE> -o <EXECUTABLE> -llikwid</CODE><BR>
+With standard installation, the paths are <CODE><PATH_TO_LIKWID_LIBRARY>=/usr/local/lib</CODE> and <CODE><PATH_TO_LIKWID_INCLUDES>=/usr/local/include</CODE><BR>
+Example Marker API call:<BR>
+<CODE>likwid-perfctr -C 0-4 -g L3 <B>-m</B> ./a.out</CODE>
+<BR>
+<BR>
+
+<H2>Fortran Code</H2>
+Besides the Marker API for C/C++ programs, LIKWID offers to build a Fortran module to access the Marker API functions from Fortran. Only the Marker API calls are exported, not the whole API. In <CODE>config.mk</CODE> the variable <CODE>FORTRAN_INTERFACE</CODE> must be set to true. LIKWID's default is to use the Intel Fortran compiler to build the interface but it can be modified to use GCC's Fortran compiler in <CODE>make/include_<COMPILER></CODE>.<BR>
+The LIKWID package contains an example code: see \ref F-markerAPI-code.
+
+<H2>Hints for the usage of the Marker API</H2>
+Since the calls to the LIKWID library are executed by your application, the runtime will raise and in specific circumstances, there are some other problems like the time measurement. You can execute <CODE>LIKWID_MARKER_THREADINIT</CODE> and <CODE>LIKWID_MARKER_START</CODE> inside the same parallel region but put a barrier between the calls to ensure that there is no big timing difference between the threads. The common way is to init LIKWID and the participating threads inside of an init [...]
+
+*/
diff --git a/doc/applications/likwid-perfscope.md b/doc/applications/likwid-perfscope.md
new file mode 100644
index 0000000..71c8984
--- /dev/null
+++ b/doc/applications/likwid-perfscope.md
@@ -0,0 +1,107 @@
+/*! \page likwid-perfscope <CODE>likwid-perfscope</CODE>
+
+<H1>Information</H1>
+<CODE>likwid-perfscope</CODE> is a command line application written in Lua that uses the timeline daemon mode of \ref likwid-perfctr
+to create on-the-fly pictures with the current measurements. It uses the <A HREF="https://github.com/dkogan/feedgnuplot">feedGnuplot</A> Perl script to send the current data to  <A HREF="http://www.gnuplot.info/">gnuplot</A>. In order to make it more convenient for users, preconfigured plots of interesting metrics are embedded into <CODE>likwid-perfscope</CODE>. Since the plot windows are normally closed directly after the execution of the monitored applications, <CODE>likwid-perfscope</ [...]
+
+<H1>Options</H1>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Description</TH>
+</TR>
+<TR>
+  <TD>-h, --help</TD>
+  <TD>Print help message.</TD>
+</TR>
+<TR>
+  <TD>-v, --version</TD>
+  <TD>Print version information.</TD>
+</TR>
+<TR>
+  <TD>-a, --all</TD>
+  <TD>Print available predefined plot configurations for current processor.</TD>
+</TR>
+<TR>
+  <TD>-d, --dump</TD>
+  <TD>Print measurements to stdout.</TD>
+</TR>
+<TR>
+  <TD>-p, --plotdump</TD>
+  <TD>Use feedGnuplot's feature to dump plot configuration and its data to stdout.</TD>
+</TR>
+<TR>
+  <TD>-c <arg></TD>
+  <TD>Defines the CPUs that should be measured<BR>See \ref CPU_expressions on the \ref likwid-pin page for information about the syntax.</TD>
+</TR>
+<TR>
+  <TD>-C <arg></TD>
+  <TD>Defines the CPUs that should be measured and pin the executable to the CPUs<BR>See \ref CPU_expressions on the \ref likwid-pin page for information about the syntax.</TD>
+</TR>
+<TR>
+  <TD>-t, --time <time></TD>
+  <TD>Specify the measurement time for each plot. <time> is handed over to \ref likwid-perfctr with the -t option. <BR>Examples for <time> are 1s, 250ms, 500us.</TD>
+</TR>
+<TR>
+  <TD>-g, --group <arg></TD>
+  <TD>Specify a predefined plot with optional changes or an eventset with plot configuration. See \ref plot_configuration for details.</TD>
+</TR>
+<TR>
+  <TD>-r, --range <arg></TD>
+  <TD>Specify the amount of data points that should be visible in the plots. Often referred to as sliding window.</TD>
+</TR>
+<TR>
+  <TD>--host <arg></TD>
+  <TD>Connect to <arg> via ssh and execute likwid-perfctr and the application there. The plots are created on the local machine. Often used if measured on hosts without X11 or GnuPlot.</TD>
+</TR>
+</TABLE>
+
+\anchor plot_configuration
+<H1>Plot configurations</H1>
+<CODE>likwid-perfscope</CODE> extends the format of the eventset option of \ref likwid-perfctr to make it more conveniet for the users. It accepts either a plot configuration of interesting metrics which are embedded into <CODE>likwid-perfscope</CODE> or a custom eventset suitable for \ref likwid-perfctr extended by the plot configuration. A plot configuration can be set with key=value pairs separated by ':' and has to contain at least a definition of a formula for plotting. If specifyed [...]
+<TABLE>
+<TR>
+  <TH>Option
+                          
+  </TH>
+  <TH>Description</TH>
+</TR>
+<TR>
+  <TD>title=<string><BR>TITLE=<string></TD>
+  <TD>Use <string> as title for the plot. The string has to be quoted if it contains spaces. No ':' are allowed in the string</TD>
+</TR>
+<TR>
+  <TD>xtitle=<string><BR>XTITLE=<string></TD>
+  <TD>Use <string> as label for the x-axis. The default label is 'Time'. The string has to be quoted if it contains spaces. No ':' are allowed in the string</TD>
+</TR>
+<TR>
+  <TD>ytitle=<string><BR>YTITLE=<string></TD>
+  <TD>Use <string> as label for the left y-axis. The string has to be quoted if it contains spaces. No ':' are allowed in the string</TD>
+</TR>
+<TR>
+  <TD><string>=<string></TD>
+  <TD>Use the first <string> as legend entry and the second <string> as input formula for the plot. The result is printed over the run time. The names of the specified counters can be used as variables in the formula. Additional variables are 'time' for the measurement time and 'inverseClock' for the inverted clock frequency. No spaces are allowed in the formula.</TD>
+</TR>
+<TR>
+  <TD>y2title=<string><BR>Y2TITLE=<string><BR>y2title=<id-string><BR>Y2TITLE=<id-string></TD>
+  <TD>Use <string> as label for the right y-axis. If <id-string> is given, the formula with id is associated with the y2-axis. If used with predefined plot configurations, be aware that the formula 1 is part of the plot configuration. If no id is given, the y2-axis is associated with the last given formula. The string has to be quoted if it contains spaces. No ':' are allowed in the string</TD>
+</TR>
+</TABLE>
+
+<H1>Examples</H1>
+<UL>
+<LI><CODE>likwid-perfscope -g L3_BAND -C 0-2 -t 1s ./a.out</CODE><BR>
+Pin the executable <CODE>./a.out</CODE> to CPUs 0,1,2 and use the predefined plot configuration <CODE>L3_BAND</CODE>. The plot is updated every second.
+</LI>
+<LI><CODE>likwid-perfscope -g L3_BAND:TITLE="My Title" -C S0:1 -t 500ms ./a.out</CODE><BR>
+Pin the executable <CODE>./a.out</CODE> to CPU 1 on Socket 0 and use the predefined plot configuration <CODE>L3_BAND</CODE> but change the title for the plot to "My Title".
+</LI>
+<LI><CODE>likwid-perfscope -g INSTR_RETIRED_ANY:FIXC0,CPU_CLK_UNHALTED_CORE:FIXC1,CPI=FIXC0/FIXC1:YTITLE="CPI" -C 0 --time 2s ./a.out</CODE><BR>
+Pin the executable <CODE>./a.out</CODE> to CPU 0 and use the custom event set <CODE>INSTR_RETIRED_ANY:FIXC0,CPU_CLK_UNHALTED_CORE:FIXC1</CODE>. The last event set entry specifies custom plot options. The plotted formula is <CODE>FIXC0/FIXC1</CODE> and the plot title and legend entry is set to 'CPI'.
+</LI>
+<LI><CODE>likwid-perfscope -g L3_BAND,CPI=FIXC0/FIXC1:Y2TITLE="2-Cycles per Instruction" -C 0 --time 2s ./a.out</CODE><BR>
+Pin the executable <CODE>./a.out</CODE> to CPU 0 and use the predefined plot configuration  <CODE>L3_BAND</CODE> to measure every 2 seconds. Additionally, a formula <CODE>FIXC0/FIXC1</CODE> with the name <CODE>CPI</CODE> is given. The right y-axis is associated to the given function and labeled with <CODE>Cycles per Instruction</CODE>. The formula ID 2 is not needed in this case as the default behavior is to associate the right y-axis to the last formula given.
+</LI>
+</UL>
+
+*/
diff --git a/doc/applications/likwid-pin.md b/doc/applications/likwid-pin.md
new file mode 100644
index 0000000..b8c8a1e
--- /dev/null
+++ b/doc/applications/likwid-pin.md
@@ -0,0 +1,170 @@
+/*! \page likwid-pin <CODE>likwid-pin</CODE>
+
+<H1>Information</H1>
+<CODE>likwid-pin</CODE> is a command line application to pin a sequential or multithreaded application to dedicated processors. It can be used as replacement for taskset.
+Opposite to taskset no affinity mask but single processors are specified. For multithreaded applications based on the <A HREF="https://computing.llnl.gov/tutorials/pthreads/"><CODE>pthreads</CODE></A> library the <CODE>pthread_create</CODE> library call is overloaded through <CODE>LD_PRELOAD</CODE> and each created thread is pinned to a dedicated processor as specified in the pinning list. Per default every generated thread is pinned to the core in the order of calls to <CODE>pthread_cre [...]
+<BR>
+For OpenMP implementations, GCC and ICC compilers are explicitly supported. Clang's OpenMP backend should also work as it is built on top of Intel's OpenMP runtime library. Others may also work.<BR>
+<BR>
+<CODE>likwid-pin</CODE> sets the environment variable <CODE>OMP_NUM_THREADS</CODE> for you if not already present. It will set as many threads as present in the pin expression.  Be aware that with <A HREF="https://computing.llnl.gov/tutorials/pthreads/"><CODE>pthreads</CODE></A> the parent thread is always pinned. If you create for example 4 threads with <CODE>pthread_create</CODE> and do not use the parent process as worker you still have to provide <CODE>num_threads + 1</CODE> processo [...]
+<BR>
+<CODE>likwid-pin</CODE> supports different numberings for pinning. Per default physical numbering of the cores is used. This is the numbering also \ref likwid-topology reports. But also logical numbering inside the node or the sockets can be used. For details look at \ref CPU_expressions. <!--If using with a N (e.g. -c N:0-6) the cores are logical numbered over the whole node. Physical cores come first. If a system e.g. has 8 cores with 16 SMT threads with -c N:0-7 you get all physical c [...]
+
+For applications where first touch policy on NUMA systems cannot be employed <CODE>likwid-pin</CODE> can be used to turn on interleave memory placement. This can significantly speed up the performance of memory bound multi threaded codes. All NUMA nodes the user pinned threads to are used for interleaving.
+
+<H1>Options</H1>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Description</TH>
+</TR>
+<TR>
+  <TD>-h, --help</TD>
+  <TD>Print help message</TD>
+</TR>
+<TR>
+  <TD>-v, --version</TD>
+  <TD>Print version information</TD>
+</TR>
+<TR>
+  <TD>-V, --verbose <level></TD>
+  <TD>Verbose output during execution for debugging. Possible values for <level>:
+  <TABLE>
+    <TR>
+      <TD>0</TD>
+      <TD>Output only errors</TD>
+    </TR>
+    <TR>
+      <TD>1</TD>
+      <TD>Output some information</TD>
+    </TR>
+    <TR>
+      <TD>2</TD>
+      <TD>Output detailed information</TD>
+    </TR>
+    <TR>
+      <TD>3</TD>
+      <TD>Output developer information</TD>
+    </TR>
+  </TABLE>
+  </TD>
+</TR>
+<TR>
+  <TD>-c <arg></TD>
+  <TD>Define the CPUs that the application should be pinned on. LIKWID provides an intuitive and feature-rich syntax for CPU expressions.<BR>See section \ref CPU_expressions for details.</TD>
+</TR>
+<TR>
+  <TD>-S, --sweep</TD>
+  <TD>Sweep memory and clean LLC of NUMA domains used by the given CPU expression</TD>
+</TR>
+<TR>
+  <TD>-i</TD>
+  <TD>Activate interleaved memory policy for NUMA domains used by the given CPU expression</TD>
+</TR>
+<TR>
+  <TD>-p</TD>
+  <TD>Print the thread affinity domains. If -c is set on the commandline, the affinity domains filled only with the given CPUs are printed.</TD>
+</TR>
+<TR>
+  <TD>-q, --quiet</TD>
+  <TD>Don't print infos of the pinning process</TD>
+</TR>
+<TR>
+  <TD>-s, --skip <arg></TD>
+  <TD>'arg' must be a bitmask in hex. Threads with the ID equal to a set bit in bitmask will be skipped during pinning<BR>Example: 0x1 = Thread 0 is skipped.</TD>
+</TR>
+<TR>
+  <TD>-d</TD>
+  <TD>Set the delimiter for the output of -p. Default is ','</TD>
+</TR>
+</TABLE>
+
+\anchor thread_affinity_domains
+<H1>Affinity Domains</H1>
+While gathering the system topology, LIKWID groups the CPUs into so-called thread affinity domains. A thread affinity domain is a group of CPU IDs that are related to some kind of central entity of the system. The most common domain is the node domain (<CODE>N</CODE>) that contains all CPUs available in the system. Other domains group the CPUs according to socket, LLC or NUMA node relation. <CODE>likwid-pin</CODE> prints out all available affinity domains with the commandline option <COD [...]
+<TABLE>
+<TR>
+  <TH>Domain name</TH>
+  <TH>Description</TH>
+</TR>
+<TR>
+  <TD><CODE>N</CODE></TD>
+  <TD>Includes all CPUs in the system</TD>
+</TR>
+<TR>
+  <TD><CODE>S<number></CODE></TD>
+  <TD>Includes all CPUs that reside on CPU socket x</TD>
+</TR>
+<TR>
+  <TD><CODE>C<number></CODE></TD>
+  <TD>Includes all CPUs that share the same LLC with ID <CODE><number></CODE>.<BR>This domain often contains the same CPUs as the <CODE>S<number></CODE> domain because many CPU sockets have an LLC shared by all CPUs of the socket</TD>
+</TR>
+<TR>
+  <TD><CODE>M<number></CODE></TD>
+  <TD>Includes all CPUs that are attached to the same NUMA memory domain</TD>
+</TR>
+</TABLE>
+
+\anchor CPU_expressions
+<H1>CPU expressions</H1>
+One outstanding feature of LIKWID are the CPU expressions which are resolved to the CPUs in the actual system. There are multiple formats that can be chosen where each offers a convenient way to select the desired CPUs for execution or measurement. The CPU expressions are used for <CODE>likwid-pin</CODE> as well as \ref likwid-perfctr. This section introduces the 4 formats and gives examples.
+
+<H3>Physical numbering:</H3>
+The first and probably most natural way of defining a list of CPUs is the usage of the physical numbering, similar to the numbering of the operating system and the IDs printed by \ref likwid-topology. The desired CPU IDs can be set as comma-separated list, as range or a combination of both.
+<UL>
+<LI><CODE>-c 1</CODE><BR>
+Run only on CPU with ID 1
+</LI>
+<LI><CODE>-c 1,4</CODE><BR>
+Run on CPUs with ID 1 and 4
+</LI>
+<LI><CODE>-c 1-3</CODE><BR>
+Run on CPUs ranging from ID 1 to ID 3, hence CPUs 1,2,3
+</LI>
+<LI><CODE>-c 0,1-3</CODE><BR>
+Run on CPU with ID 0 and the CPU range starting from ID 1 to ID3, hence 0,1,2,3
+</LI>
+</UL>
+<H3>Logical numbering:</H3>
+Besides the enumeration of physical CPU IDs, LIKWID supports the logical numbering inside of an affinity domain. For logical selection, the indicies inside of the desired affinity domain has to be given on the commandline. The logical numbering can be selected by prefixing the cpu expression with <CODE>L:</CODE>. The format is <CODE>L:<indices></CODE> assuming affinity domain <CODE>N</CODE> or <CODE>L:<affinity domain>:<indices></CODE>. Moreover, it is automatically act [...]
+<UL>
+<LI><CODE>-c L:0</CODE><BR>
+Run only on CPU 0, the first entry in the <B>sorted</B> affinity domain <CODE>N</CODE>
+</LI>
+<LI><CODE>-c L:0,4</CODE><BR>
+Run on the first and fifth entry in the <B>sorted</B> affinity domain <CODE>N</CODE>
+</LI>
+<LI><CODE>-c L:1-3</CODE><BR>
+Run on CPUs ranging from index 1 to index 3 in the <B>sorted</B> affinity domain <CODE>N</CODE>, hence CPUs 1,2,3.
+</LI>
+<LI><CODE>-c L:N:1,4-6</CODE><BR>
+Run on CPUs with index 1 and the range of indices from 4 to 6 in given <B>sorted</B> affinity domain <CODE>N</CODE>, hence CPUs 1,4,5,6.
+</LI>
+</UL>
+<H3>Numbering by expression:</H3>
+The most powerful format is probably the expression format. The format combines the input values for a selection function in a convenient way. In order to activate the expression format, the CPU string must be prefixed with <CODE>E:</CODE>. The basic format is <CODE>E:<affinity domain>:<numberOfThreads></CODE> which selects simply the given <CODE><numberOfThreads></CODE> in the supplied <CODE><affinity domain></CODE>. The extended format is <CODE>E:<affinity do [...]
+<UL>
+<LI><CODE>-c E:N:1</CODE><BR>
+Selects the first entry in the node affinity domain, thus CPU 0
+</LI>
+<LI><CODE>-c E:N:2</CODE><BR>
+Selects the first two entries in the node affinity domain, thus CPUs 0 and 4
+</LI>
+<LI><CODE>-c E:N:2:1:2</CODE><BR>
+Selects 1 CPU in a row and skips 1 entry, thus we get CPUs 0 and 1
+</LI>
+<LI><CODE>-c E:N:4:2:4</CODE><BR>
+Selects in total 4 CPUs, 2 in a row with a stride of 4, thus CPUs 0,4,2,6
+</LI>
+</UL>
+<H3>Scatter expression:</H3>
+The scatter expression distributes the threads evenly over the desired affinity domains. In contrast to the previous selection methods, the scatter expression schedules threads over multiple affinity domains. Although you can also select <CODE>N</CODE> as scatter domain, the intended domains are <CODE>S</CODE>, <CODE>C</CODE> and <CODE>M</CODE>. The scattering selects physical cores first. For the examples we assume that the socket affinity domain looks like this: <CODE>S0 = 0,4,1,5</COD [...]
+<UL>
+<LI><CODE>-c S:scatter</CODE><BR>
+The resulting CPU list is 0,2,1,3,4,6,5,7
+</LI>
+<LI><CODE>-c M:scatter</CODE><BR>
+Scatter the threads evenly over all NUMA memory domains. A kind of interleaved thread policy.
+</LI>
+</UL>
+*/
diff --git a/doc/applications/likwid-powermeter.md b/doc/applications/likwid-powermeter.md
new file mode 100644
index 0000000..489689d
--- /dev/null
+++ b/doc/applications/likwid-powermeter.md
@@ -0,0 +1,75 @@
+/*! \page likwid-powermeter <CODE>likwid-powermeter</CODE>
+
+<H1>Information</H1>
+likwid-powermeter is a command line application to get the energy consumption on Intel RAPL capable processors. Currently
+all Intel CPUs starting with Intel SandyBridge are supported. It also prints information about TDP and Turbo Mode steps supported.
+The Turbo Mode information works on all Turbo mode enabled Intel processors. The tool can be either used in stethoscope mode for a specified duration or as a wrapper to your application measuring your complete run. RAPL works on a per package (socket) base.
+Please note that the RAPL counters are also accessible as normal events within \ref likwid-perfctr.
+
+<H1>Options</H1>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Description</TH>
+</TR>
+<TR>
+  <TD>-h, --help</TD>
+  <TD>Print help message</TD>
+</TR>
+<TR>
+  <TD>-v, --version</TD>
+  <TD>Print version information</TD>
+</TR>
+<TR>
+  <TD>-V, --verbose <level></TD>
+  <TD>Verbose output during execution for debugging. Possible values for <level>:
+  <TABLE>
+    <TR>
+      <TD>0</TD>
+      <TD>Output only errors</TD>
+    </TR>
+    <TR>
+      <TD>1</TD>
+      <TD>Output some information</TD>
+    </TR>
+    <TR>
+      <TD>2</TD>
+      <TD>Output detailed information</TD>
+    </TR>
+    <TR>
+      <TD>3</TD>
+      <TD>Output developer information</TD>
+    </TR>
+  </TABLE>
+  </TD>
+</TR>
+<TR>
+  <TD>-c <arg></TD>
+  <TD>Specify sockets to measure</TD>
+</TR>
+<TR>
+  <TD>-M <0|1></TD>
+  <TD>Set access mode to access MSRs. 0=direct, 1=accessDaemon</TD>
+</TR>
+<TR>
+  <TD>-s <time></TD>
+  <TD>Set measure duration in us, ms or s. (default 2s)</TD>
+</TR>
+<TR>
+  <TD>-i, --info</TD>
+  <TD>Print information from <CODE>MSR_*_POWER_INFO</CODE> register and Turbo mode</TD>
+</TR>
+<TR>
+  <TD>-t</TD>
+  <TD>Print current temperatures of all CPU cores</TD>
+</TR>
+<TR>
+  <TD>-f</TD>
+  <TD>Print current temperatures of all CPU cores in Fahrenheit</TD>
+</TR>
+<TR>
+  <TD>-p</TD>
+  <TD>Print dynamic clocking and CPI values, uses \ref likwid-perfctr</TD>
+</TR>
+</TABLE>
+*/
diff --git a/doc/applications/likwid-setFreq.md b/doc/applications/likwid-setFreq.md
new file mode 100644
index 0000000..0db59e6
--- /dev/null
+++ b/doc/applications/likwid-setFreq.md
@@ -0,0 +1,13 @@
+/*! \page likwid-setFreq <CODE>likwid-setFreq</CODE>
+
+<H1>Information</H1>
+<CODE>likwid-setFreq</CODE> is a command line application that mediates the actual setting of CPU cores' frequency and governor for \ref likwid-setFrequencies. Since only users with root privileges are allowed to change the frequency of CPU cores, <CODE>likwid-setFreq</CODE> needs to be suid-root.
+
+<H1>Setup</H1>
+Setting the suid-root bit:<BR>
+<CODE>
+root: # chown root:root likwid-setFreq<BR>
+root: # chmod u+s likwid-setFreq
+</CODE>
+
+*/
diff --git a/doc/applications/likwid-setFrequencies.md b/doc/applications/likwid-setFrequencies.md
new file mode 100644
index 0000000..e753a9e
--- /dev/null
+++ b/doc/applications/likwid-setFrequencies.md
@@ -0,0 +1,50 @@
+/*! \page likwid-setFrequencies <CODE>likwid-setFrequencies</CODE>
+
+<H1>Information</H1>
+<CODE>likwid-setFrequencies</CODE> is a command line application to set the clock frequency of CPU cores. Since only privileged users are allowed to change the frequency of CPU cores, the application works in combination with a daemon
+\ref likwid-setFreq . The daemon needs the suid permission bit to be set in order to manipulate the sysfs entries. With <CODE>likwid-setFrequencies</CODE> the clock of all cores inside the cpu_list or affinity domain can be set to a specific frequency or governor at once.
+
+<H1>Options</H1>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Description</TH>
+</TR>
+<TR>
+  <TD>-h, --help</TD>
+  <TD>Print help message.</TD>
+</TR>
+<TR>
+  <TD>-v, --version</TD>
+  <TD>Print version information.</TD>
+</TR>
+<TR>
+  <TD>-l</TD>
+  <TD>Print all configurable frequencies.</TD>
+</TR>
+<TR>
+  <TD>-p</TD>
+  <TD>Print the current frequencies for all CPU cores.</TD>
+</TR>
+<TR>
+  <TD>-m</TD>
+  <TD>Print all configurable governors.</TD>
+</TR>
+<TR>
+  <TD>-c <arg></TD>
+  <TD>Define the CPUs that should be modified. For information about the syntax see \ref CPU_expressions on the \ref likwid-pin page.</TD>
+</TR>
+<TR>
+  <TD>-f, --freq <arg></TD>
+  <TD>Specify the frequency for the selected CPUs.</TD>
+</TR>
+<TR>
+  <TD>-g <arg></TD>
+  <TD>Specify the governor for the selected CPUs.</TD>
+</TR>
+</TABLE>
+
+<H1>Notice</H1>
+Shortly before releasing the first version of LIKWID 4, the CPU frequency module and its behavior have changed compared to the previous <B>cpufreq</B> module. It is not possible anymore to set the CPU clock to a fixed frequency, you can only define a performance level called P-State. Inside that level, the CPU can vary its clock frequency. <CODE>likwid-setFrequencies</CODE> and its daemon \ref likwid-setFreq do not have support for the new kernel module <B>intel_pstate</B>. Therefore, th [...]
+
+*/
diff --git a/doc/applications/likwid-topology.md b/doc/applications/likwid-topology.md
new file mode 100644
index 0000000..f57a045
--- /dev/null
+++ b/doc/applications/likwid-topology.md
@@ -0,0 +1,68 @@
+/*! \page likwid-topology <CODE>likwid-topology</CODE>
+
+<H1>Information</H1>
+<CODE>likwid-topology</CODE> is a command line application to print the thread and cache topology on multicore x86 processors. Used with mono spaced fonts it can
+draw the processor topology of a machine in ASCII art. Beyond topology <CODE>likwid-topology</CODE> determines the nominal clock of a processor and prints detailed information about the cache hierarchy.<BR>
+
+<H1>Options</H1>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Description</TH>
+</TR>
+<TR>
+  <TD>-h, --help</TD>
+  <TD>Print help message</TD>
+</TR>
+<TR>
+  <TD>-v, --version</TD>
+  <TD>Print version information</TD>
+</TR>
+<TR>
+  <TD>-V, --verbose <level></TD>
+  <TD>Verbose output during execution for debugging. Possible values for <level>:
+  <TABLE>
+    <TR>
+      <TD>0</TD>
+      <TD>Output only errors</TD>
+    </TR>
+    <TR>
+      <TD>1</TD>
+      <TD>Output some information</TD>
+    </TR>
+    <TR>
+      <TD>2</TD>
+      <TD>Output detailed information</TD>
+    </TR>
+    <TR>
+      <TD>3</TD>
+      <TD>Output developer information</TD>
+    </TR>
+  </TABLE>
+  </TD>
+</TR>
+<TR>
+  <TD>-c, --caches</TD>
+  <TD>Print detailed information about all cache levels</TD>
+</TR>
+<TR>
+  <TD>-C, --clock</TD>
+  <TD>Measure the nominal clock frequency and print it</TD>
+</TR>
+<TR>
+  <TD>-g</TD>
+  <TD>ASCII art output of the system's topology</TD>
+</TR>
+<TR>
+  <TD>-O</TD>
+  <TD>Print output in CSV format (conforms to <A HREF="https://tools.ietf.org/html/rfc4180">RFC 4180</A>).</TD>
+</TR>
+<TR>
+  <TD>-o, --output <file></TD>
+  <TD>Write the output to file <file> instead of stdout. According to the used filename suffix, LIKWID tries to reformat the output to the specified format.<BR>By now, LIKWID ships with one filter script <CODE>xml</CODE> written in Perl and a Perl template for developing own output scripts. If the suffix is <CODE>.csv</CODE>, the internal CSV printer is used for file output.<BR>If <CODE>\%h</CODE> is in the filename, it is replaced by the host name.</TD>
+</TR>
+</TABLE>
+
+
+
+*/
diff --git a/doc/archs/atom.md b/doc/archs/atom.md
new file mode 100644
index 0000000..58a506c
--- /dev/null
+++ b/doc/archs/atom.md
@@ -0,0 +1,104 @@
+/*! \page atom Intel® Atom
+
+<P>The Intel® Atom performance monitoring counters are equal to the ones of the Intel® Core 2 microarchitecture but the event set is different.</P>
+<H1>Available performance monitors for the Intel® Atom microarchitecture</H1>
+<UL>
+<LI>\ref ATOM_FIXED "Fixed-purpose counters"</LI>
+<LI>\ref ATOM_PMC "General-purpose counters"</LI>
+</UL>
+
+<H1>Counters available for each hardware thread</H1>
+\anchor ATOM_FIXED
+<H2>Fixed-purpose counters</H2>
+<P>The Core2/Atom microarchitecture is the first architecture offering a set of fixed-purpose counters. Each counter can measure only one specific event.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>FIXC0</TD>
+  <TD>INSTR_RETIRED_ANY</TD>
+</TR>
+<TR>
+  <TD>FIXC1</TD>
+  <TD>CPU_CLK_UNHALTED_CORE</TD>
+</TR>
+<TR>
+  <TD>FIXC2</TD>
+  <TD>CPU_CLK_UNHALTED_REF</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit (index*4) in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor ATOM_PMC
+<H2>General-purpose counters</H2>
+<P>The Intel® Atom microarchitecture provides 2 general-purpose counters consisting of a config and a counter register.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>PMC0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC1</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit 17 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+
+
+
+*/
diff --git a/doc/archs/broadwell.md b/doc/archs/broadwell.md
new file mode 100644
index 0000000..ff207af
--- /dev/null
+++ b/doc/archs/broadwell.md
@@ -0,0 +1,203 @@
+/*! \page broadwell Intel® Broadwell
+
+<P>This page is valid for Broadwell, Broadwell single socket server (Xeon D) and Broadwell EP/EN/EX. There is no Uncore support so far, as no documentation is available for the Uncore counters of Broadwell.</P>
+
+<H1>Available performance monitors for the Intel® Broadwell microarchitecture</H1>
+<UL>
+<LI>\ref BRD_FIXED "Fixed-purpose counters"</LI>
+<LI>\ref BRD_PMC "General-purpose counters"</LI>
+<LI>\ref BRD_THERMAL "Thermal counters"</LI>
+<LI>\ref BRD_POWER "Power measurement counters"</LI>
+</UL>
+
+<H1>Counters available for each hardware thread</H1>
+\anchor BRD_FIXED
+<H2>Fixed-purpose counters</H2>
+<P>Since the Core2 microarchitecture, Intel® provides a set of fixed-purpose counters. Each can measure only one specific event.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>FIXC0</TD>
+  <TD>INSTR_RETIRED_ANY</TD>
+</TR>
+<TR>
+  <TD>FIXC1</TD>
+  <TD>CPU_CLK_UNHALTED_CORE</TD>
+</TR>
+<TR>
+  <TD>FIXC2</TD>
+  <TD>CPU_CLK_UNHALTED_REF</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>anythread</TD>
+  <TD>N</TD>
+  <TD>Set bit 2+(index*4) in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit (index*4) in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor BRD_PMC
+<H2>General-purpose counters</H2>
+<P>Commonly the Intel® Broadwell microarchitecture provides 4 general-purpose counters consisting of a config and a counter register.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>PMC0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC3</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit 17 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>anythread</TD>
+  <TD>N</TD>
+  <TD>Set bit 21 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>in_transaction</TD>
+  <TD>N</TD>
+  <TD>Set bit 32 in config register</TD>
+  <TD>Only available if Intel® Transactional Synchronization Extensions are available</TD>
+</TR>
+<TR>
+  <TD>in_transaction_aborted</TD>
+  <TD>N</TD>
+  <TD>Set bit 33 in config register</TD>
+  <TD>Only counter PMC2 and only if Intel® Transactional Synchronization Extensions are available</TD>
+</TR>
+</TABLE>
+
+<H3>Special handling for events</H3>
+<P>The Intel® Broadwell microarchitecture provides measuring of offcore events in PMC counters. Therefore the stream of offcore events must be filtered using the OFFCORE_RESPONSE registers. The Intel® Broadwell microarchitecture has two of those registers. LIKWID defines some events that perform the filtering according to the event name. Although there are many bitmasks possible, LIKWID natively provides only the ones with response type ANY. Own filtering can be applied with the [...]
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>match0</TD>
+  <TD>16 bit hex value</TD>
+  <TD>Input value masked with 0x8FFF and written to bits 0-15 in the OFFCORE_RESPONSE register</TD>
+  <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/BDW">https://download.01.org/perfmon/BDW</A>.</TD>
+</TR>
+<TR>
+  <TD>match1</TD>
+  <TD>22 bit hex value</TD>
+  <TD>Input value is written to bits 16-37 in the OFFCORE_RESPONSE register</TD>
+  <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/BDW">https://download.01.org/perfmon/BDW</A>.</TD>
+</TR>
+</TABLE>
+<P>The event MEM_TRANS_RETIRED_LOAD_LATENCY is not available because it needs programming of PEBS registers. PEBS is a kernel-level measurement facility for performance monitoring. Although we can program it from user-space, the results are always 0.</P>
+
+\anchor BRD_THERMAL
+<H2>Thermal counter</H2>
+<P>The Intel® Broadwell microarchitecture provides one register for the current core temperature.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>TMP0</TD>
+  <TD>TEMP_CORE</TD>
+</TR>
+</TABLE>
+
+<H1>Counters available for one hardware thread per socket</H1>
+\anchor BRD_POWER
+<H2>Power counter</H2>
+<P>The Intel® Broadwell microarchitecture provides measurements of the current power consumption through the RAPL interface.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>PWR0</TD>
+  <TD>PWR_PKG_ENERGY</TD>
+</TR>
+<TR>
+  <TD>PWR1</TD>
+  <TD>PWR_PP0_ENERGY</TD>
+</TR>
+<TR>
+  <TD>PWR2</TD>
+  <TD>PWR_PP1_ENERGY</TD>
+</TR>
+<TR>
+  <TD>PWR3</TD>
+  <TD>PWR_DRAM_ENERGY</TD>
+</TR>
+</TABLE>
+
+*/
diff --git a/doc/archs/core2.md b/doc/archs/core2.md
new file mode 100644
index 0000000..679da04
--- /dev/null
+++ b/doc/archs/core2.md
@@ -0,0 +1,103 @@
+/*! \page core2 Intel® Core2
+
+<H1>Available performance monitors for the Intel® Core2 microarchitecture</H1>
+<UL>
+<LI>\ref FIXED "Fixed-purpose counters"</LI>
+<LI>\ref PMC "General-purpose counters"</LI>
+</UL>
+
+<H1>Counters available for each hardware thread</H1>
+\anchor FIXED
+<H2>Fixed-purpose counters</H2>
+<P>The Intel Core2 microarchitecture is the first architecture offering a set of fixed-purpose counters. Each counter can measure only one specific event.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>FIXC0</TD>
+  <TD>INSTR_RETIRED_ANY</TD>
+</TR>
+<TR>
+  <TD>FIXC1</TD>
+  <TD>CPU_CLK_UNHALTED_CORE</TD>
+</TR>
+<TR>
+  <TD>FIXC2</TD>
+  <TD>CPU_CLK_UNHALTED_REF</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit (index*4) in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor PMC
+<H2>General-purpose counters</H2>
+<P>The Intel® Core2 microarchitecture provides 2 general-purpose counters consisting of a config and a counter register.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>PMC0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC1</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit 17 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+
+
+
+*/
diff --git a/doc/archs/haswell.md b/doc/archs/haswell.md
new file mode 100644
index 0000000..65836bd
--- /dev/null
+++ b/doc/archs/haswell.md
@@ -0,0 +1,203 @@
+/*! \page haswell Intel® Haswell
+
+<H1>Available performance monitors for the Intel® Haswell microarchitecture</H1>
+<UL>
+<LI>\ref HAS_FIXED "Fixed-purpose counters"</LI>
+<LI>\ref HAS_PMC "General-purpose counters"</LI>
+<LI>\ref HAS_THERMAL "Thermal counters"</LI>
+<LI>\ref HAS_POWER "Power measurement counters"</LI>
+</UL>
+
+<H1>Counters available for each hardware thread</H1>
+\anchor HAS_FIXED
+<H2>Fixed-purpose counters</H2>
+<P>Since the Core2 microarchitecture, Intel® provides a set of fixed-purpose counters. Each can measure only one specific event.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>FIXC0</TD>
+  <TD>INSTR_RETIRED_ANY</TD>
+</TR>
+<TR>
+  <TD>FIXC1</TD>
+  <TD>CPU_CLK_UNHALTED_CORE</TD>
+</TR>
+<TR>
+  <TD>FIXC2</TD>
+  <TD>CPU_CLK_UNHALTED_REF</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>anythread</TD>
+  <TD>N</TD>
+  <TD>Set bit 2+(index*4) in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit (index*4) in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor HAS_PMC
+<H2>General-purpose counters</H2>
+<P>The Intel® Haswell microarchitecture provides 4 general-purpose counters consisting of a config and a counter register.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>PMC0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC3</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit 17 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>anythread</TD>
+  <TD>N</TD>
+  <TD>Set bit 21 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>in_transaction</TD>
+  <TD>N</TD>
+  <TD>Set bit 32 in config register</TD>
+  <TD>Only available if Intel® Transactional Synchronization Extensions are available</TD>
+</TR>
+<TR>
+  <TD>in_transaction_aborted</TD>
+  <TD>N</TD>
+  <TD>Set bit 33 in config register</TD>
+  <TD>Only counter PMC2 and only if Intel® Transactional Synchronization Extensions are available</TD>
+</TR>
+</TABLE>
+
+<H3>Special handling for events</H3>
+<P>The Intel® Haswell microarchitecture provides measuring of offcore events in PMC counters. Therefore the stream of offcore events must be filtered using the OFFCORE_RESPONSE registers. The Intel® Haswell microarchitecture has two of those registers. LIKWID defines some events that perform the filtering according to the event name. Although there are many bitmasks possible, LIKWID natively provides only the ones with response type ANY. Own filtering can be applied with the OFF [...]
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>match0</TD>
+  <TD>16 bit hex value</TD>
+  <TD>Input value masked with 0x8077 and written to bits 0-15 in the OFFCORE_RESPONSE register</TD>
+  <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/HSW">https://download.01.org/perfmon/HSW</A>.</TD>
+</TR>
+<TR>
+  <TD>match1</TD>
+  <TD>22 bit hex value</TD>
+  <TD>Input value is written to bits 16-37 in the OFFCORE_RESPONSE register</TD>
+  <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/HSW">https://download.01.org/perfmon/HSW</A>.</TD>
+</TR>
+</TABLE>
+<P>The event MEM_TRANS_RETIRED_LOAD_LATENCY is not available because it needs programming of PEBS registers. PEBS is a kernel-level measurement facility. Although we can program it from user-space, the results are always 0.</P>
+
+\anchor HAS_THERMAL
+<H2>Thermal counter</H2>
+<P>The Intel® Haswell microarchitecture provides one register for the current core temperature.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>TMP0</TD>
+  <TD>TEMP_CORE</TD>
+</TR>
+</TABLE>
+
+<H1>Counters available for one hardware thread per socket</H1>
+\anchor HAS_POWER
+<H2>Power counter</H2>
+<P>The Intel® Haswell microarchitecture provides measurements of the current power consumption through the RAPL interface.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>PWR0</TD>
+  <TD>PWR_PKG_ENERGY</TD>
+</TR>
+<TR>
+  <TD>PWR1</TD>
+  <TD>PWR_PP0_ENERGY</TD>
+</TR>
+<TR>
+  <TD>PWR2</TD>
+  <TD>PWR_PP1_ENERGY</TD>
+</TR>
+<TR>
+  <TD>PWR3</TD>
+  <TD>PWR_DRAM_ENERGY</TD>
+</TR>
+</TABLE>
+
+*/
+
+
diff --git a/doc/archs/haswellep.md b/doc/archs/haswellep.md
new file mode 100644
index 0000000..9368c54
--- /dev/null
+++ b/doc/archs/haswellep.md
@@ -0,0 +1,896 @@
+/*! \page haswellep Intel® Haswell EP/EN/EX
+
+
+<H1>Available performance monitors for the Intel® Haswell EP/EN/EX microarchitecture</H1>
+<UL>
+<LI>\ref HASEP_FIXED "Fixed-purpose counters"</LI>
+<LI>\ref HASEP_PMC "General-purpose counters"</LI>
+<LI>\ref HASEP_THERMAL "Thermal counters"</LI>
+<LI>\ref HASEP_POWER "Power measurement counters"</LI>
+<LI>\ref HASEP_BBOX "Home Agent counters"</LI>
+<LI>\ref HASEP_SBOX "Ring transfer counters"</LI>
+<LI>\ref HASEP_QBOX "Intel® QPI Link Layer counters"</LI>
+<LI>\ref HASEP_CBOX "Last Level cache counters"</LI>
+<LI>\ref HASEP_UBOX "Uncore management counters"</LI>
+<LI>\ref HASEP_WBOX "Power control unit counters"</LI>
+<LI>\ref HASEP_IBOX "Coherency for IIO traffic counters"</LI>
+<LI>\ref HASEP_MBOX "Integrated memory controller counters"</LI>
+<LI>\ref HASEP_RBOX "Ring-to-QPI interface counters"</LI>
+<LI>\ref HASEP_PBOX "Ring-to-PCIe interface counters"</LI>
+</UL>
+
+<H1>Counters available for each hardware thread</H1>
+\anchor HASEP_FIXED
+<H2>Fixed-purpose counters</H2>
+<P>Since the Core2 microarchitecture, Intel® provides a set of fixed-purpose counters. Each can measure only one specific event.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>FIXC0</TD>
+  <TD>INSTR_RETIRED_ANY</TD>
+</TR>
+<TR>
+  <TD>FIXC1</TD>
+  <TD>CPU_CLK_UNHALTED_CORE</TD>
+</TR>
+<TR>
+  <TD>FIXC2</TD>
+  <TD>CPU_CLK_UNHALTED_REF</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>anythread</TD>
+  <TD>N</TD>
+  <TD>Set bit 2+(index*4) in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit (index*4) in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor HASEP_PMC
+<H2>General-purpose counters</H2>
+<P>The Intel® Haswell EP/EN/EX microarchitecture provides 4 general-purpose counters consisting of a config and a counter register.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>PMC0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC3</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit 17 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>anythread</TD>
+  <TD>N</TD>
+  <TD>Set bit 21 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>in_transaction</TD>
+  <TD>N</TD>
+  <TD>Set bit 32 in config register</TD>
+  <TD>Only available if Intel® Transactional Synchronization Extensions are available</TD>
+</TR>
+<TR>
+  <TD>in_transaction_aborted</TD>
+  <TD>N</TD>
+  <TD>Set bit 33 in config register</TD>
+  <TD>Only counter PMC2 and only if Intel® Transactional Synchronization Extensions are available</TD>
+</TR>
+</TABLE>
+
+<H3>Special handling for events</H3>
+<P>The Intel® Haswell EP/EN/EX microarchitecture provides measuring of offcore events in PMC counters. Therefore the stream of offcore events must be filtered using the OFFCORE_RESPONSE registers. The Intel® Haswell microarchitecture has two of those registers. LIKWID defines some events that perform the filtering according to the event name. Although there are many bitmasks possible, LIKWID natively provides only the ones with response type ANY. Own filtering can be applied wit [...]
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>match0</TD>
+  <TD>16 bit hex value</TD>
+  <TD>Input value masked with 0x8FFF and written to bits 0-15 in the OFFCORE_RESPONSE register</TD>
+  <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/HSX">https://download.01.org/perfmon/HSX</A>.</TD>
+</TR>
+<TR>
+  <TD>match1</TD>
+  <TD>22 bit hex value</TD>
+  <TD>Input value is written to bits 16-37 in the OFFCORE_RESPONSE register</TD>
+  <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/HSX">https://download.01.org/perfmon/HSX</A>.</TD>
+</TR>
+</TABLE>
+<P>The event MEM_TRANS_RETIRED_LOAD_LATENCY is not available because it needs programming of PEBS registers. PEBS is a kernel-level measurement facility. Although we can program it from user-space, the results are always 0.</P>
+
+\anchor HASEP_THERMAL
+<H2>Thermal counter</H2>
+<P>The Intel® Haswell EP/EN/EX microarchitecture provides one register for the current core temperature.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>TMP0</TD>
+  <TD>TEMP_CORE</TD>
+</TR>
+</TABLE>
+
+<H1>Counters available for one hardware thread per socket</H1>
+\anchor HASEP_POWER
+<H2>Power counter</H2>
+<P>The Intel® Haswell EP/EN/EX microarchitecture provides measurements of the current power consumption through the RAPL interface.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>PWR0</TD>
+  <TD>PWR_PKG_ENERGY</TD>
+</TR>
+<TR>
+  <TD>PWR1</TD>
+  <TD>PWR_PP0_ENERGY</TD>
+</TR>
+<TR>
+  <TD>PWR2</TD>
+  <TD>PWR_PP1_ENERGY</TD>
+</TR>
+<TR>
+  <TD>PWR3</TD>
+  <TD>PWR_DRAM_ENERGY</TD>
+</TR>
+</TABLE>
+
+
+\anchor HASEP_BBOX
+<H2>Home Agent counters</H2>
+<P>The Intel® Haswell EP/EN/EX microarchitecture provides measurements of the Home Agent (HA) in the Uncore. The description from Intel®:<BR>
+<I>Each HA is responsible for the protocol side of memory interactions, including coherent and non-coherent home agent protocols (as defined in the Intel® QuickPath Interconnect Specification). Additionally, the HA is responsible for ordering memory reads/writes, coming in from the modular Ring, to a given address such that the IMC (memory controller).
+</I><BR>
+The Home Agent performance counters are exposed to the operating system through PCI interfaces. There are two of those interfaces for the HA. For systems where each socket has 12 or more cores, there are both HAs available. The name BBOX originates from the Nehalem EX Uncore monitoring where this functional unit is called BBOX.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>BBOX<0,1>C0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>BBOX<0,1>C1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>BBOX<0,1>C2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>BBOX<0,1>C3</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>opcode</TD>
+  <TD>6 bit hex value</TD>
+  <TD>Set bits 0-5 in PCI_UNC_HA_PMON_OPCODEMATCH register of PCI device</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>match0</TD>
+  <TD>46 bit hex address</TD>
+  <TD>Extract bits 6-31 and set bits 6-31 in PCI_UNC_HA_PMON_ADDRMATCH0 register of PCI device<BR>Extract bits 32-45 and set bits 0-13 in PCI_UNC_HA_PMON_ADDRMATCH1 register of PCI device</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor HASEP_SBOX
+<H2>Ring-to-Ring interface counters</H2>
+<P>The Intel® Haswell EP/EN/EX microarchitecture manages the socket internal traffic through ring-based networks. Depending on the system's configuration there are multiple rings in one socket. The SBOXes organize the traffic between the rings. The description from Intel®:<BR>
+<I>The SBox manages the interface between the two Rings.<BR>
+The processor is composed of two independent rings connected via two sets of bi-directional buffered switches. Each set of bi-directional buffered switches is partitioned into two ingress/egress pairs. Further, each ingress/egress pair is associated with a ring stop on adjacent rings. This ring stop is termed an Sbo. The processor has up to 4 SBos depending on SKU. The Sbo can be simply thought of as a conduit for the ring, but must also help maintain ordering of traffic to ensure functi [...]
+</I><BR>
+The SBOX hardware performance counters are exposed to the operating system through the MSR interface. There are at most four of those interfaces, but not all of them have to be present. The name SBOX originates from the Nehalem EX Uncore monitoring where the functional unit to the QPI network is called SBOX, but it had a different duty.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>SBOX<0-3>C0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>SBOX<0-3>C1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>SBOX<0-3>C2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>SBOX<0-3>C3</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>tid</TD>
+  <TD>N</TD>
+  <TD>Set bit 19 in config register</TD>
+  <TD>This option has no real effect: TID filtering can be activated, but there is no way to specify the TID to filter for.</TD>
+</TR>
+</TABLE>
+
+\anchor HASEP_QBOX
+<H2>QPI interface counters</H2>
+<P>The Intel® Haswell EP/EN/EX microarchitecture provides measurements of the QPI Link layer (QPI) in the Uncore. The description from Intel®:<BR>
+<I>The Intel® QPI Link Layer is responsible for packetizing requests from the caching agent on the way out to the system interface. As such, it shares responsibility with the CBo(s) as the Intel® QPI caching agent(s). It is responsible for converting CBo requests to Intel® QPI messages (i.e. snoop generation and data response messages from the snoop response) as well as converting/forwarding ring messages to Intel® QPI packets and vice versa. On Intel® Xeon processor  [...]
+</I><BR>
+The QPI hardware performance counters are exposed to the operating system through PCI interfaces. There are two of those interfaces for the QPI. The actual number of QBOX counters depends on the CPU core count of one socket. If your system does not have all interfaces and interface 0 does not work, try the other ones. The QBOX was introduced for the Haswell EP microarchitecture; for older Uncore-aware architectures the QBOX and the SBOX are the same.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>QBOX<0,1>C0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>QBOX<0,1>C1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>QBOX<0,1>C2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>QBOX<0,1>C3</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>QBOX<0,1>FIX0</TD>
+  <TD>QPI_RATE</TD>
+</TR>
+<TR>
+  <TD>QBOX<0,1>FIX1</TD>
+  <TD>QPI_RX_IDLE</TD>
+</TR>
+<TR>
+  <TD>QBOX<0,1>FIX2</TD>
+  <TD>QPI_RX_LLR</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options (Only for QBOX<0,1>C<0,1,2,3> counters)</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>match0</TD>
+  <TD>32 bit hex address</TD>
+  <TD>Input value masked with 0x8003FFF8 and written to bits 0-31 in the PCI_UNC_V3_QPI_PMON_RX_MATCH_0 register of PCI device</TD>
+  <TD>This option matches the receive side. Check <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v3 Uncore Manual</A> for bit fields.</TD>
+</TR>
+<TR>
+  <TD>match1</TD>
+  <TD>20 bit hex address</TD>
+  <TD>Input value masked with 0x000F000F and written to bits 0-19 in the PCI_UNC_V3_QPI_PMON_RX_MATCH_1 register of PCI device</TD>
+  <TD>This option matches the receive side. Check <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v3 Uncore Manual</A> for bit fields.</TD>
+</TR>
+<TR>
+  <TD>match2</TD>
+  <TD>32 bit hex address</TD>
+  <TD>Input value masked with 0x8003FFF8 and written to bits 0-31 in the PCI_UNC_V3_QPI_PMON_TX_MATCH_0 register of PCI device</TD>
+  <TD>This option matches the transmit side. Check <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v3 Uncore Manual</A> for bit fields.</TD>
+</TR>
+<TR>
+  <TD>match3</TD>
+  <TD>20 bit hex address</TD>
+  <TD>Input value masked with 0x000F000F and written to bits 0-19 in the PCI_UNC_V3_QPI_PMON_TX_MATCH_1 register of PCI device</TD>
+  <TD>This option matches the transmit side. Check <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v3 Uncore Manual</A> for bit fields.</TD>
+</TR>
+<TR>
+  <TD>mask0</TD>
+  <TD>32 bit hex address</TD>
+  <TD>Input value masked with 0x8003FFF8 and written to bits 0-31 in the PCI_UNC_V3_QPI_PMON_RX_MASK_0 register of PCI device</TD>
+  <TD>This option masks the receive side. Check <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v3 Uncore Manual</A> for bit fields.</TD>
+</TR>
+<TR>
+  <TD>mask1</TD>
+  <TD>20 bit hex address</TD>
+  <TD>Input value masked with 0x000F000F and written to bits 0-19 in the PCI_UNC_V3_QPI_PMON_RX_MASK_1 register of PCI device</TD>
+  <TD>This option masks the receive side. Check <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v3 Uncore Manual</A> for bit fields.</TD>
+</TR>
+<TR>
+  <TD>mask2</TD>
+  <TD>32 bit hex address</TD>
+  <TD>Input value masked with 0x8003FFF8 and written to bits 0-31 in the PCI_UNC_V3_QPI_PMON_TX_MASK_0 register of PCI device</TD>
+  <TD>This option masks the transmit side. Check <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v3 Uncore Manual</A> for bit fields.</TD>
+</TR>
+<TR>
+  <TD>mask3</TD>
+  <TD>20 bit hex address</TD>
+  <TD>Input value masked with 0x000F000F and written to bits 0-19 in the PCI_UNC_V3_QPI_PMON_TX_MASK_1 register of PCI device</TD>
+  <TD>This option masks the transmit side. Check <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v3 Uncore Manual</A> for bit fields.</TD>
+</TR>
+</TABLE>
+
+\anchor HASEP_CBOX
+<H2>Last Level cache counters</H2>
+<P>The Intel® Haswell EP/EN/EX microarchitecture provides measurements of the LLC coherency engine in the Uncore. The description from Intel®:<BR>
+<I>The LLC coherence engine (CBo) manages the interface between the core and the last level cache (LLC). All core transactions that access the LLC are directed from the core to a CBo via the ring interconnect. The CBo is responsible for managing data delivery
+from the LLC to the requesting core. It is also responsible for maintaining coherence between the cores within the socket that share the LLC; generating snoops and collecting snoop responses from the local cores when the MESIF protocol requires it.
+</I><BR>
+The LLC hardware performance counters are exposed to the operating system through the MSR interface. The maximal amount of supported coherency engines for the Intel® Haswell EP/EN/EX microarchitecture is 17. E7-8800 v2 systems have all 17 engines, the E5-2600 v2 only 10 of them and the E5-1600 v2 only 6. It may be possible that your systems does not have all CBOXes, LIKWID will skip the unavailable ones in the setup phase. The name CBOX originates from the Nehalem EX Uncore monitorin [...]
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>CBOX<0-17>C0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>CBOX<0-17>C1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>CBOX<0-17>C2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>CBOX<0-17>C3</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Operation</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>5 bit hex value</TD>
+  <TD>Set bits 24-28 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>tid</TD>
+  <TD>5 bit hex value</TD>
+  <TD>Set bits 0-4 in MSR_UNC_C<0-17>_PMON_BOX_FILTER register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>state</TD>
+  <TD>6 bit hex value</TD>
+  <TD>Set bits 17-22 in MSR_UNC_C<0-17>_PMON_BOX_FILTER register</TD>
+  <TD>M: 0x28, F: 0x10, M: 0x08, E: 0x04, S: 0x02, I: 0x01</TD>
+</TR>
+<TR>
+  <TD>nid</TD>
+  <TD>16 bit hex value</TD>
+  <TD>Set bits 0-15 in MSR_UNC_C<0-17>_PMON_BOX_FILTER1 register</TD>
+  <TD>Note: Node 0 has value 0x0001</TD>
+</TR>
+<TR>
+  <TD>opcode</TD>
+  <TD>9 bit hex value</TD>
+  <TD>Set bits 20-28 in MSR_UNC_C<0-17>_PMON_BOX_FILTER1 register</TD>
+  <TD>A list of valid opcodes can be found in the <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v3 Uncore Manual</A>.</TD>
+</TR>
+<TR>
+  <TD>match0</TD>
+  <TD>2 bit hex address</TD>
+  <TD>Set bits 30-31 in MSR_UNC_C<0-17>_PMON_BOX_FILTER1 register</TD>
+  <TD>See the <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v3 Uncore Manual</A> for more information.</TD>
+</TR>
+</TABLE>
+
+<H3>Special handling for events</H3>
+<P>The Intel® Haswell EP/EN/EX microarchitecture provides an event LLC_LOOKUP which can be filtered with the 'state' option. If no 'state' is set, LIKWID sets the state to 0x1F, the default value to measure all lookups.</P>
+
+\anchor HASEP_UBOX
+<H2>Uncore management counters</H2>
+<P>The Intel® Haswell EP/EN/EX microarchitecture provides measurements of the management box in the Uncore. The description from Intel®:<BR>
+<I>The UBox serves as the system configuration controller within the physical processor. In this capacity, the UBox acts as the central unit for a variety of functions:
+<UL>
+<LI>The master for reading and writing physically distributed registers across Intel® Xeon processor E5 v3 family using the Message Channel.</LI>
+<LI>The UBox is the intermediary for interrupt traffic, receiving interrupts from the system and dispatching interrupts to the appropriate core.</LI>
+<LI>The UBox serves as the system lock master used when quiescing the platform (e.g., Intel® QPI bus lock).</LI>
+</UL>
+</I><BR>
+The Uncore management performance counters are exposed to the operating system through the MSR interface. The name UBOX originates from the Nehalem EX Uncore monitoring where those functional units are called UBOX.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>UBOX0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UBOX1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UBOXFIX</TD>
+  <TD>UBOX_CLOCKTICKS</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options (Only for UBOX<0,1> counters)</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Operation</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>5 bit hex value</TD>
+  <TD>Set bits 24-28 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor HASEP_WBOX
+<H2>Power control unit counters</H2>
+<P>The Intel® Haswell EP/EN/EX microarchitecture provides measurements of the power control unit (PCU) in the Uncore. The description from Intel®:<BR>
+<I>The PCU is the primary Power Controller for the Intel® Xeon processor E5 v3 family. Intel® Xeon processor E5 v3 family uncore implements a power control unit acting as a core/uncore power and thermal manager. It runs its firmware on an internal micro-controller and coordinates the socket’s power states.
+</I><BR>
+The PCU performance counters are exposed to the operating system through the MSR interface. The name WBOX originates from the Nehalem EX Uncore monitoring where those functional units are called WBOX.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>WBOX0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>WBOX1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>WBOX2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>WBOX3</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>WBOX0FIX</TD>
+  <TD>CORES_IN_C3</TD>
+</TR>
+<TR>
+  <TD>WBOX1FIX</TD>
+  <TD>CORES_IN_C6</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options (Only for WBOX<0-3> counters)</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Operation</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>5 bit hex value</TD>
+  <TD>Set bits 24-28 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>match0</TD>
+  <TD>32 bit hex value</TD>
+  <TD>Set bits 0-31 in MSR_UNC_PCU_PMON_BOX_FILTER register</TD>
+  <TD>Band0: bits 0-7, Band1: bits 8-15, Band2: bits 16-23, Band3: bits 24-31</TD>
+</TR>
+<TR>
+  <TD>occupancy</TD>
+  <TD>2 bit hex value</TD>
+  <TD>Set bits 14-15 in config register</TD>
+  <TD>Cores in C0: 0x1, in C3: 0x2, in C6: 0x3</TD>
+</TR>
+<TR>
+  <TD>occupancy_edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>occupancy_invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 30 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor HASEP_IBOX
+<H2>IRP box counters</H2>
+<P>The Intel® Haswell EP/EN/EX microarchitecture provides measurements of the IRP box in the Uncore. The description from Intel®:<BR>
+<I>IRP is responsible for maintaining coherency for IIO traffic that needs to be coherent (e.g. cross-socket P2P).
+</I>
+
+The IRP box counters are exposed to the operating system through the PCI interface. The IBOX was introduced with the Intel® IvyBridge EP/EN/EX microarchitecture.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>IBOX<0,1>C0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>IBOX<0,1>C1</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Operation</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor HASEP_MBOX
+<H2>Memory controller counters</H2>
+<P>The Intel® Haswell EP/EN/EX microarchitecture provides measurements of the integrated Memory Controllers (iMC) in the Uncore. The description from Intel®:<BR>
+<I>The Intel® Xeon processor E5 v3 family integrated Memory Controller provides the interface to DRAM and communicates to the rest of the uncore through the Home Agent (i.e. the IMC does not connect to the Ring).<BR>
+In conjunction with the HA, the memory controller also provides a variety of RAS features, such as ECC, lockstep, memory access retry, memory scrubbing, thermal throttling, mirroring, and rank sparing.
+</I><BR>
+The integrated Memory Controllers performance counters are exposed to the operating system through PCI interfaces. There may be two memory controllers in the system (E7-8800 v2). There are 4 different PCI devices per memory controller, each covering one memory channel. Each channel has 4 different general-purpose counters and one fixed counter for the DRAM clock. The four channels of the first memory controller are MBOX0-3, the four channels of the second memory controller (if available) [...]
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>MBOX<0-7>C0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>MBOX<0-7>C1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>MBOX<0-7>C2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>MBOX<0-7>C3</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>MBOX<0-7>FIX</TD>
+  <TD>DRAM_CLOCKTICKS</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options (Only for counter MBOX<0-7>C<0-3>)</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Operation</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor HASEP_RBOX
+<H2>Ring-to-QPI counters</H2>
+<P>The Intel® Haswell EP/EN/EX microarchitecture provides measurements of the Ring-to-QPI (R3QPI) interface in the Uncore. The description from Intel®:<BR>
+<I>R3QPI is the interface between the Intel® QPI Link Layer, which packetizes requests, and the Ring.<BR>
+R3QPI is the interface between the ring and the Intel® QPI Link Layer. It is responsible for translating between ring protocol packets and flits that are used for transmitting data across the Intel® QPI interface. It performs credit checking between the local Intel® QPI LL, the remote Intel® QPI LL and other agents on the local ring.
+</I><BR>
+The Ring-to-QPI performance counters are exposed to the operating system through PCI interfaces. Since the RBOXes manage the traffic from the LLC-connecting ring interface on the socket with the QPI interfaces (SBOXes), the amount is similar to the amount of SBOXes. See at SBOXes how many are available for which system configuration. The name RBOX originates from the Nehalem EX Uncore monitoring where those functional units are called RBOX.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>RBOX<0,1,2>C0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>RBOX<0,1,2>C1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>RBOX<0,1,2>C2</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Operation</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor HASEP_PBOX
+<H2>Ring-to-PCIe counters</H2>
+<P>The Intel® Haswell EP/EN/EX microarchitecture provides measurements of the Ring-to-PCIe (R2PCIe) interface in the Uncore. The description from Intel®:<BR>
+<I>R2PCIe represents the interface between the Ring and IIO traffic to/from PCIe.
+</I><BR>
+The Ring-to-PCIe performance counters are exposed to the operating system through a PCI interface. Independent of the system's configuration, there is only one Ring-to-PCIe interface per CPU socket.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>PBOX0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PBOX1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PBOX2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PBOX3</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Operation</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+
+*/
diff --git a/doc/archs/interlagos.md b/doc/archs/interlagos.md
new file mode 100644
index 0000000..cec7734
--- /dev/null
+++ b/doc/archs/interlagos.md
@@ -0,0 +1,107 @@
+/*! \page interlagos AMD® Interlagos
+
+<H1>Available performance monitors for the AMD® Interlagos microarchitecture</H1>
+<UL>
+<LI>\ref ILG_PMC "General-purpose counters"</LI>
+<LI>\ref ILG_UPMC "Northbridge general-purpose counters"</LI>
+</UL>
+
+
+\anchor ILG_PMC
+<H2>General-purpose counters</H2>
+<P>The AMD® Interlagos microarchitecture provides 6 general-purpose counters consisting of a config and a counter register.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>PMC0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC3</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC4</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC5</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit 17 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD>The value for threshold can range between 0x0 and 0x1F</TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+<H1>Counters available for one hardware thread per socket</H1>
+\anchor ILG_UPMC
+<H2>Northbridge general-purpose counters</H2>
+<P>The AMD® Interlagos microarchitecture provides 4 general-purpose counters for the Northbridge consisting of a config and a counter register.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>UPMC0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UPMC1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UPMC2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UPMC3</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+
+
+*/
diff --git a/doc/archs/ivybridge.md b/doc/archs/ivybridge.md
new file mode 100644
index 0000000..3008475
--- /dev/null
+++ b/doc/archs/ivybridge.md
@@ -0,0 +1,190 @@
+/*! \page ivybridge Intel® IvyBridge
+
+<H1>Available performance monitors for the Intel® IvyBridge microarchitecture</H1>
+<UL>
+<LI>\ref IVB_FIXED "Fixed-purpose counters"</LI>
+<LI>\ref IVB_PMC "General-purpose counters"</LI>
+<LI>\ref IVB_THERMAL "Thermal counters"</LI>
+<LI>\ref IVB_POWER "Power measurement counters"</LI>
+</UL>
+
+<H1>Counters available for each hardware thread</H1>
+\anchor IVB_FIXED
+<H2>Fixed-purpose counters</H2>
+<P>Since the Core2 microarchitecture, Intel® provides a set of fixed-purpose counters. Each can measure only one specific event.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>FIXC0</TD>
+  <TD>INSTR_RETIRED_ANY</TD>
+</TR>
+<TR>
+  <TD>FIXC1</TD>
+  <TD>CPU_CLK_UNHALTED_CORE</TD>
+</TR>
+<TR>
+  <TD>FIXC2</TD>
+  <TD>CPU_CLK_UNHALTED_REF</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>anythread</TD>
+  <TD>N</TD>
+  <TD>Set bit 2+(index*4) in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit (index*4) in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor IVB_PMC
+<H2>General-purpose counters</H2>
+<P>The Intel® IvyBridge microarchitecture provides 4 general-purpose counters consisting of a config and a counter register.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>PMC0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC3</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit 17 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>anythread</TD>
+  <TD>N</TD>
+  <TD>Set bit 21 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+<H3>Special handling for events</H3>
+<P>The Intel® IvyBridge microarchitecture provides measuring of offcore events in PMC counters. Therefore the stream of offcore events must be filtered using the OFFCORE_RESPONSE registers. The Intel® IvyBridge microarchitecture has two of those registers. LIKWID defines some events that perform the filtering according to the event name. Although there are many bitmasks possible, LIKWID natively provides only the ones with response type ANY. Own filtering can be applied with the  [...]
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>match0</TD>
+  <TD>16 bit hex value</TD>
+  <TD>Input value masked with 0x8FFF and written to bits 0-15 in the OFFCORE_RESPONSE register</TD>
+  <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/IVB">https://download.01.org/perfmon/IVB</A>.</TD>
+</TR>
+<TR>
+  <TD>match1</TD>
+  <TD>22 bit hex value</TD>
+  <TD>Input value is written to bits 16-37 in the OFFCORE_RESPONSE register</TD>
+  <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/IVB">https://download.01.org/perfmon/IVB</A>.</TD>
+</TR>
+</TABLE>
+
+\anchor IVB_THERMAL
+<H2>Thermal counter</H2>
+<P>The Intel® IvyBridge microarchitecture provides one register for the current core temperature.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>TMP0</TD>
+  <TD>TEMP_CORE</TD>
+</TR>
+</TABLE>
+
+<H1>Counters available for one hardware thread per socket</H1>
+\anchor IVB_POWER
+<H2>Power counter</H2>
+<P>The Intel® IvyBridge microarchitecture provides measurements of the current power consumption through the RAPL interface.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>PWR0</TD>
+  <TD>PWR_PKG_ENERGY</TD>
+</TR>
+<TR>
+  <TD>PWR1</TD>
+  <TD>PWR_PP0_ENERGY</TD>
+</TR>
+<TR>
+  <TD>PWR2*</TD>
+  <TD>PWR_PP1_ENERGY</TD>
+</TR>
+<TR>
+  <TD>PWR3</TD>
+  <TD>PWR_DRAM_ENERGY</TD>
+</TR>
+</TABLE>
+<P>*) The PWR2 counter is often not implemented by Intel® IvyBridge systems</P>
+*/
+
+
diff --git a/doc/archs/ivybridgeep.md b/doc/archs/ivybridgeep.md
new file mode 100644
index 0000000..09f0bcd
--- /dev/null
+++ b/doc/archs/ivybridgeep.md
@@ -0,0 +1,790 @@
+/*! \page ivybridgeep Intel® IvyBridge EP/EN/EX
+
+<H1>Available performance monitors for the Intel® IvyBridge EP/EN/EX microarchitecture</H1>
+<UL>
+<LI>\ref IVBEP_FIXED "Fixed-purpose counters"</LI>
+<LI>\ref IVBEP_PMC "General-purpose counters"</LI>
+<LI>\ref IVBEP_THERMAL "Thermal counters"</LI>
+<LI>\ref IVBEP_POWER "Power measurement counters"</LI>
+<LI>\ref IVBEP_BBOX "Home Agent counters"</LI>
+<LI>\ref IVBEP_SBOX "Intel® QPI Link Layer counters"</LI>
+<LI>\ref IVBEP_CBOX "Last Level cache counters"</LI>
+<LI>\ref IVBEP_UBOX "Uncore management counters"</LI>
+<LI>\ref IVBEP_WBOX "Power control unit counters"</LI>
+<LI>\ref IVBEP_IBOX "Coherency for IIO traffic counters"</LI>
+<LI>\ref IVBEP_MBOX "Integrated memory controller counters"</LI>
+<LI>\ref IVBEP_RBOX "Ring-to-QPI interface counters"</LI>
+<LI>\ref IVBEP_PBOX "Ring-to-PCIe interface counters"</LI>
+</UL>
+
+<H1>Counters available for each hardware thread</H1>
+\anchor IVBEP_FIXED
+<H2>Fixed-purpose counters</H2>
+<P>Since the Core2 microarchitecture, Intel® provides a set of fixed-purpose counters. Each can measure only one specific event.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>FIXC0</TD>
+  <TD>INSTR_RETIRED_ANY</TD>
+</TR>
+<TR>
+  <TD>FIXC1</TD>
+  <TD>CPU_CLK_UNHALTED_CORE</TD>
+</TR>
+<TR>
+  <TD>FIXC2</TD>
+  <TD>CPU_CLK_UNHALTED_REF</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>anythread</TD>
+  <TD>N</TD>
+  <TD>Set bit 2+(index*4) in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit (index*4) in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor IVBEP_PMC
+<H2>General-purpose counters</H2>
+<P>The Intel® IvyBridge EP/EN/EX microarchitecture provides 4 general-purpose counters consisting of a config and a counter register.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>PMC0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC3</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit 17 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>anythread</TD>
+  <TD>N</TD>
+  <TD>Set bit 21 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+<H3>Special handling for events</H3>
+<P>The Intel® IvyBridge EP/EN/EX microarchitecture provides measuring of offcore events in PMC counters. Therefore the stream of offcore events must be filtered using the OFFCORE_RESPONSE registers. The Intel® IvyBridge EP/EN/EX microarchitecture has two of those registers. LIKWID defines some events that perform the filtering according to the event name. Although there are many bitmasks possible, LIKWID natively provides only the ones with response type ANY. Own filtering can b [...]
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>match0</TD>
+  <TD>16 bit hex value</TD>
+  <TD>Input value masked with 0x8FFF and written to bits 0-15 in the OFFCORE_RESPONSE register</TD>
+  <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/IVT">https://download.01.org/perfmon/IVT</A>.</TD>
+</TR>
+<TR>
+  <TD>match1</TD>
+  <TD>22 bit hex value</TD>
+  <TD>Input value is written to bits 16-37 in the OFFCORE_RESPONSE register</TD>
+  <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/IVT">https://download.01.org/perfmon/IVT</A>.</TD>
+</TR>
+</TABLE>
+
+\anchor IVBEP_THERMAL
+<H2>Thermal counter</H2>
+<P>The Intel® IvyBridge EP/EN/EX microarchitecture provides one register for the current core temperature.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>TMP0</TD>
+  <TD>TEMP_CORE</TD>
+</TR>
+</TABLE>
+
+<H1>Counters available for one hardware thread per socket</H1>
+\anchor IVBEP_POWER
+<H2>Power counter</H2>
+<P>The Intel® IvyBridge EP/EN/EX microarchitecture provides measurements of the current power consumption through the RAPL interface.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>PWR0</TD>
+  <TD>PWR_PKG_ENERGY</TD>
+</TR>
+<TR>
+  <TD>PWR1</TD>
+  <TD>PWR_PP0_ENERGY</TD>
+</TR>
+<TR>
+  <TD>PWR2*</TD>
+  <TD>PWR_PP1_ENERGY</TD>
+</TR>
+<TR>
+  <TD>PWR3</TD>
+  <TD>PWR_DRAM_ENERGY</TD>
+</TR>
+</TABLE>
+<P>*) The PWR2 counter is often not implemented by Intel® IvyBridge systems</P>
+
+\anchor IVBEP_BBOX
+<H2>Home Agent counters</H2>
+<P>The Intel® IvyBridge EP/EN/EX microarchitecture provides measurements of the Home Agent (HA) in the Uncore. The description from Intel®:<BR>
+<I>The HA is responsible for the protocol side of memory interactions, including coherent and non-coherent home agent protocols (as defined in the Intel® QuickPath Interconnect Specification). Additionally, the HA is responsible for ordering memory reads/writes, coming in from the modular Ring, to a given address such that the iMC (memory controller).</I><BR>
+The HA hardware performance counters are exposed to the operating system through PCI interfaces. There are two of those interfaces for the HA, but both are available only on the E7-8800 v2. The name BBOX originates from the Nehalem EX Uncore monitoring.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>BBOX<0,1>C0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>BBOX<0,1>C1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>BBOX<0,1>C2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>BBOX<0,1>C3</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>opcode</TD>
+  <TD>6 bit hex value</TD>
+  <TD>Set bits 0-5 in PCI_UNC_HA_PMON_OPCODEMATCH register of PCI device</TD>
+  <TD>A table of all valid opcodes can be found in the <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v2 Uncore Manual</A>.</TD>
+</TR>
+<TR>
+  <TD>match0</TD>
+  <TD>46 bit hex address</TD>
+  <TD>Extract bits 6-31 and set bits 6-31 in PCI_UNC_HA_PMON_ADDRMATCH0 register of PCI device<BR>Extract bits 32-45 and set bits 0-13 in PCI_UNC_HA_PMON_ADDRMATCH1 register of PCI device</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor IVBEP_SBOX
+<H2>LLC-to-QPI interface counters</H2>
+<P>The Intel® IvyBridge EP/EN/EX microarchitecture provides measurements of the QPI Link layer (QPI) in the Uncore. The description from Intel®:<BR>
+<I>The Intel® QPI Link Layer is responsible for packetizing requests from the caching agent on the way out to the system interface. As such, it shares responsibility with the CBo(s) as the Intel® QPI caching agent(s). It is responsible for converting CBo requests to Intel® QPI messages (i.e. snoop generation and data response messages from the snoop response) as well as converting/forwarding ring messages to Intel® QPI packets and vice versa. On Ivy Bridge, Intel® QPI [...]
+</I><BR>
+The QPI hardware performance counters are exposed to the operating system through PCI interfaces. There are two of those interfaces for the QPI. If your system does not provide all interfaces and interface 0 does not work, try the other one. The name SBOX originates from the Nehalem EX Uncore monitoring.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>SBOX<0,1,2>C0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>SBOX<0,1,2>C1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>SBOX<0,1,2>C2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>SBOX<0,1,2>C3</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>SBOX<0,1,2>FIX</TD>
+  <TD>QPI_RATE</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options (Only for SBOX<0-2>C<0-3> counters)</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>opcode</TD>
+  <TD>6 bit hex value</TD>
+  <TD>Set bits 0-5 in PCI_UNC_HA_PMON_OPCODEMATCH register of PCI device</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>match0</TD>
+  <TD>32 bit hex address</TD>
+  <TD>Input value masked with 0x8003FFF8 and written to bits 0-31 in the PCI_UNC_QPI_PMON_MATCH_0 register of PCI device</TD>
+  <TD>A description of matching capabilities can be found in the <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v2 Uncore Manual</A>.</TD>
+</TR>
+<TR>
+  <TD>match1</TD>
+  <TD>20 bit hex address</TD>
+  <TD>Input value masked with 0x000F000F and written to bits 0-19 in the PCI_UNC_QPI_PMON_MATCH_1 register of PCI device</TD>
+  <TD>A description of matching capabilities can be found in the <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v2 Uncore Manual</A>.</TD>
+</TR>
+<TR>
+  <TD>mask0</TD>
+  <TD>32 bit hex address</TD>
+  <TD>Input value masked with 0x8003FFF8 and written to bits 0-31 in the PCI_UNC_QPI_PMON_MASK_0 register of PCI device</TD>
+  <TD>A description of masking capabilities can be found in the <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v2 Uncore Manual</A>.</TD>
+</TR>
+<TR>
+  <TD>mask1</TD>
+  <TD>20 bit hex address</TD>
+  <TD>Input value masked with 0x000F000F and written to bits 0-19 in the PCI_UNC_QPI_PMON_MASK_1 register of PCI device</TD>
+  <TD>A description of masking capabilities can be found in the <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v2 Uncore Manual</A>.</TD>
+</TR>
+</TABLE>
+
+\anchor IVBEP_CBOX
+<H2>CBOX counter</H2>
+<P>The Intel® IvyBridge EP/EN/EX microarchitecture provides measurements of the LLC coherency engine in the Uncore. The description from Intel®:<BR>
+<I>The LLC coherence engine (CBo) manages the interface between the core and the last level cache (LLC). All core transactions that access the LLC are directed from the core to a CBo via the ring interconnect. The CBo is responsible for managing data delivery from the LLC to the requesting core. It is also responsible for maintaining coherence between the cores within the socket that share the LLC;
+generating snoops and collecting snoop responses from the local cores when the MESIF protocol requires it.
+</I><BR>
+The LLC hardware performance counters are exposed to the operating system through the MSR interface. The maximum number of supported coherency engines for the Intel® IvyBridge EP/EN/EX microarchitecture is 15. E7-8800 v2 systems have all 15 engines, the E5-2600 v2 only 10 of them and the E5-1600 v2 only 6. Your system may not have all CBOXes; LIKWID will skip the unavailable ones in the setup phase. The name CBOX originates from the Nehalem EX Uncore monitoring.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>CBOX<0-15>C0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>CBOX<0-15>C1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>CBOX<0-15>C2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>CBOX<0-15>C3</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Operation</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>5 bit hex value</TD>
+  <TD>Set bits 24-28 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>tid</TD>
+  <TD>5 bit hex value</TD>
+  <TD>Set bits 0-4 in MSR_UNC_C<0-15>_PMON_BOX_FILTER register</TD>
+  <TD>A description of filter capabilities can be found in the <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v2 Uncore Manual</A>.</TD>
+</TR>
+<TR>
+  <TD>state</TD>
+  <TD>6 bit hex value</TD>
+  <TD>Set bits 17-22 in MSR_UNC_C<0-15>_PMON_BOX_FILTER register</TD>
+  <TD>M: 0x28, F: 0x10, M: 0x08, E: 0x04, S: 0x02, I: 0x01</TD>
+</TR>
+<TR>
+  <TD>nid</TD>
+  <TD>16 bit hex value</TD>
+  <TD>Set bits 0-15 in MSR_UNC_C<0-15>_PMON_BOX_FILTER1 register</TD>
+  <TD>Note: Node 0 has value 0x0001</TD>
+</TR>
+<TR>
+  <TD>opcode</TD>
+  <TD>9 bit hex value</TD>
+  <TD>Set bits 20-28 in MSR_UNC_C<0-15>_PMON_BOX_FILTER1 register</TD>
+  <TD>A table of all valid opcodes can be found in the <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v2 Uncore Manual</A>.</TD>
+</TR>
+<TR>
+  <TD>match0</TD>
+  <TD>2 bit hex address</TD>
+  <TD>Set bits 30-31 in MSR_UNC_C<0-15>_PMON_BOX_FILTER1 register</TD>
+  <TD>A description of matching capabilities can be found in the <A HREF="http://www.Intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-v2-uncore-manual.html">Intel® Xeon E5-2600 v2 Uncore Manual</A>.</TD>
+</TR>
+</TABLE>
+
+<H3>Special handling for events</H3>
+<P>The Intel® IvyBridge EP/EN/EX microarchitecture provides an event LLC_LOOKUP which can be filtered with the 'state' option. If no 'state' is set, LIKWID sets the state to 0x1F, the default value to measure all lookups.</P>
+
+
+\anchor IVBEP_UBOX
+<H2>Uncore management counters</H2>
+<P>The Intel® IvyBridge EP/EN/EX microarchitecture provides measurements of the management box in the Uncore. The description from Intel®:<BR>
+<I>
+The UBox serves as the system configuration controller within the physical processor. In this capacity, the UBox acts as the central unit for a variety of functions:
+<UL>
+<LI>The master for reading and writing physically distributed registers across physical processor using the Message Channel.</LI>
+<LI>The UBox is the intermediary for interrupt traffic, receiving interrupts from the system and dispatching interrupts to the appropriate core.</LI>
+<LI>The UBox serves as the system lock master used when quiescing the platform (e.g., Intel® QPI bus lock).</LI>
+</UL>
+</I><BR>
+The Uncore management performance counters are exposed to the operating system through the MSR interface. The name UBOX originates from the Nehalem EX Uncore monitoring.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>UBOX0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UBOX1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UBOXFIX</TD>
+  <TD>UBOX_CLOCKTICKS</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options (Only for UBOX<0,1> counters)</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Operation</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>5 bit hex value</TD>
+  <TD>Set bits 24-28 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor IVBEP_WBOX
+<H2>Power control unit counters</H2>
+<P>The Intel® IvyBridge EP/EN/EX microarchitecture provides measurements of the power control unit (PCU) in the Uncore. The description from Intel®:<BR>
+<I>The PCU is the primary Power Controller for the physical processor package. The uncore implements a power control unit acting as a core/uncore power and thermal manager. It runs its firmware on an internal micro-controller and coordinates the socket’s power states.
+</I><BR>
+The PCU performance counters are exposed to the operating system through the MSR interface. The name WBOX originates from the Nehalem EX Uncore monitoring.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>WBOX0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>WBOX1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>WBOX2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>WBOX3</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>WBOX0FIX</TD>
+  <TD>CORES_IN_C3</TD>
+</TR>
+<TR>
+  <TD>WBOX1FIX</TD>
+  <TD>CORES_IN_C6</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options (Only for WBOX<0-3> counters)</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Operation</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>5 bit hex value</TD>
+  <TD>Set bits 24-28 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>match0</TD>
+  <TD>32 bit hex value</TD>
+  <TD>Set bits 0-31 in MSR_UNC_PCU_PMON_BOX_FILTER register</TD>
+  <TD>Band0: bits 0-7, Band1: bits 8-15, Band2: bits 16-23, Band3: bits 24-31</TD>
+</TR>
+<TR>
+  <TD>occupancy</TD>
+  <TD>2 bit hex value</TD>
+  <TD>Set bits 14-15 in config register</TD>
+  <TD>Cores in C0: 0x1, in C3: 0x2, in C6: 0x3</TD>
+</TR>
+<TR>
+  <TD>occupancy_edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>occupancy_invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 30 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor IVBEP_IBOX
+<H2>IBOX counter</H2>
+<P>The Intel® IvyBridge EP/EN/EX microarchitecture provides measurements of the IRP box in the Uncore. The description from Intel®:<BR>
+<I>IRP is responsible for maintaining coherency for IIO traffic that needs to be coherent (e.g. cross-socket P2P).
+</I><BR>
+The IRP box counters are exposed to the operating system through the PCI interface. The IBOX was introduced with the Intel® IvyBridge EP/EN/EX microarchitecture.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>IBOX<0,1>C0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>IBOX<0,1>C1</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Operation</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor IVBEP_MBOX
+<H2>MBOX counter</H2>
+<P>The Intel® IvyBridge EP/EN/EX microarchitecture provides measurements of the integrated Memory Controllers (iMC) in the Uncore. The description from Intel®:<BR>
+<I>The integrated Memory Controller provides the interface to DRAM and communicates to the rest of the uncore through the Home Agent (i.e. the iMC does not connect to the Ring).<BR>
+In conjunction with the HA, the memory controller also provides a variety of RAS features, such as ECC, lockstep, memory access retry, memory scrubbing, thermal throttling, mirroring, and rank sparing.
+</I><BR>
+The uncore management performance counters are exposed to the operating system through PCI interfaces. There may be two memory controllers in the system (E7-8800 v2). There are 4 different PCI devices per memory controller, each covering one memory channel. Each channel has 4 different general-purpose counters and one fixed counter for the DRAM clock. The four channels of the first memory controller are MBOX0-3, the four channels of the second memory controller (if available) are named M [...]
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>MBOX<0-7>C0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>MBOX<0-7>C1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>MBOX<0-7>C2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>MBOX<0-7>C3</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>MBOX<0-7>FIX</TD>
+  <TD>DRAM_CLOCKTICKS</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options (Only for counter MBOX<0-7>C<0-3>)</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Operation</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor IVBEP_RBOX
+<H2>RBOX counter</H2>
+<P>The Intel® IvyBridge EP/EN/EX microarchitecture provides measurements of the Ring-to-QPI (R3QPI) interface in the Uncore. The description from Intel®:<BR>
+<I>R3QPI is the interface between the Intel® QPI Link Layer, which packetizes requests, and the Ring.<BR>
+R3QPI is the interface between the ring and the Intel® QPI Link Layer. It is responsible for translating between ring protocol packets and flits that are used for transmitting data across the Intel® QPI interface. It performs credit checking between the local Intel® QPI LL, the remote Intel® QPI LL and other agents on the local ring.
+</I><BR>
+The R3QPI performance counters are exposed to the operating system through PCI interfaces. Since the RBOXes manage the traffic between the LLC-connecting ring interface on the socket and the QPI interfaces (SBOXes), their number is similar to the number of SBOXes. See the SBOX section for how many are available in which system configuration. The name RBOX originates from the Nehalem EX Uncore monitoring.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>RBOX<0,1,2>C0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>RBOX<0,1,2>C1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>RBOX<0,1,2>C2</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Operation</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor IVBEP_PBOX
+<H2>PBOX counter</H2>
+<P>The Intel® IvyBridge EP/EN/EX microarchitecture provides measurements of the Ring-to-PCIe (R2PCIe) interface in the Uncore. The description from Intel®:<BR>
+<I>R2PCIe represents the interface between the Ring and IIO traffic to/from PCIe.
+</I><BR>
+The R2PCIe performance counters are exposed to the operating system through a PCI interface. Independent of the system's configuration, there is only one Ring-to-PCIe interface.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>PBOX0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PBOX1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PBOX2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PBOX3</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Operation</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+
+*/
diff --git a/doc/archs/k10.md b/doc/archs/k10.md
new file mode 100644
index 0000000..a5ab582
--- /dev/null
+++ b/doc/archs/k10.md
@@ -0,0 +1,68 @@
+/*! \page k10 AMD® K10
+
+<H1>Available performance monitors for the AMD® K10 microarchitecture</H1>
+<UL>
+<LI>\ref K10_PMC "General-purpose counters"</LI>
+</UL>
+
+\anchor K10_PMC
+<H2>General-purpose counters</H2>
+<P>The AMD® K10 microarchitecture provides 4 general-purpose counters consisting of a config and a counter register.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>PMC0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC3</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit 17 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD>The value for threshold can range between 0x0 and 0x3</TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+*/
diff --git a/doc/archs/k8.md b/doc/archs/k8.md
new file mode 100644
index 0000000..5bcdcce
--- /dev/null
+++ b/doc/archs/k8.md
@@ -0,0 +1,68 @@
+/*! \page k8 AMD® K8
+
+<H1>Available performance monitors for the AMD® K8 microarchitecture</H1>
+<UL>
+<LI>\ref K8_PMC "General-purpose counters"</LI>
+</UL>
+
+\anchor K8_PMC
+<H2>General-purpose counters</H2>
+<P>The AMD® K8 microarchitecture provides 4 general-purpose counters consisting of a config and a counter register.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>PMC0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC3</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit 17 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD>The value for threshold can range between 0x0 and 0x3</TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+*/
diff --git a/doc/archs/kabini.md b/doc/archs/kabini.md
new file mode 100644
index 0000000..41824cc
--- /dev/null
+++ b/doc/archs/kabini.md
@@ -0,0 +1,162 @@
+/*! \page kabini AMD® Kabini
+
+<H1>Available performance monitors for the AMD® Kabini microarchitecture</H1>
+<UL>
+<LI>\ref KAB_PMC "General-purpose counters"</LI>
+<LI>\ref KAB_CPMC "L2 cache general-purpose counters"</LI>
+<LI>\ref KAB_UPMC "Northbridge general-purpose counters"</LI>
+</UL>
+
+
+\anchor KAB_PMC
+<H2>General-purpose counters</H2>
+<P>The AMD® Kabini microarchitecture provides 4 general-purpose counters consisting of a config and a counter register.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>PMC0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC3</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit 17 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD>The value for threshold can range between 0x0 and 0x3</TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+
+<H1>Counters available for one hardware thread per shared L2 cache</H1>
+\anchor KAB_CPMC
+<H2>L2 general-purpose counters</H2>
+<P>The AMD® Kabini microarchitecture provides 4 general-purpose counters for measuring L2 cache events. They consist of a config and a counter register.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>CPMC0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>CPMC1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>CPMC2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>CPMC3</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD>The value for threshold can range between 0x0 and 0x3</TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>tid</TD>
+  <TD>4 bit hex value</TD>
+  <TD>Set bits 56-59 in config register</TD>
+  <TD>If bit equals 0, the events of the thread are counted. See <A HREF="http://amd-dev.wpengine.netdna-cdn.com/wordpress/media/2012/10/48751_16h_bkdg.pdf">BIOS and Kernel Developer’s Guide (BKDG) for AMD Family 16h Processors</A> for details.</TD>
+</TR>
+<TR>
+  <TD>nid</TD>
+  <TD>4 bit hex value</TD>
+  <TD>Set bits 48-51 in config register</TD>
+  <TD>If bit equals 0, the events of the thread are counted. See <A HREF="http://amd-dev.wpengine.netdna-cdn.com/wordpress/media/2012/10/48751_16h_bkdg.pdf">BIOS and Kernel Developer’s Guide (BKDG) for AMD Family 16h Processors</A> for details.</TD>
+</TR>
+</TABLE>
+
+<H1>Counters available for one hardware thread per socket</H1>
+\anchor KAB_UPMC
+<H2>Northbridge general-purpose counters</H2>
+<P>The AMD® Kabini microarchitecture provides 4 general-purpose counters for the Northbridge consisting of a config and a counter register.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>UPMC0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UPMC1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UPMC2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UPMC3</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+
+
+*/
diff --git a/doc/archs/nehalem.md b/doc/archs/nehalem.md
new file mode 100644
index 0000000..b2d45b8
--- /dev/null
+++ b/doc/archs/nehalem.md
@@ -0,0 +1,237 @@
+/*! \page nehalem Intel® Nehalem
+
+<H1>Available performance monitors for the Intel® Nehalem microarchitecture</H1>
+<UL>
+<LI>\ref NEH_FIXED "Fixed-purpose counters"</LI>
+<LI>\ref NEH_PMC "General-purpose counters"</LI>
+<LI>\ref NEH_UNCORE "General-purpose counters for the Uncore"</LI>
+</UL>
+
+<H1>Counters available for each hardware thread</H1>
+\anchor NEH_FIXED
+<H2>Fixed-purpose counters</H2>
+<P>Since the Core2 microarchitecture, Intel® provides a set of fixed-purpose counters. Each can measure only one specific event.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>FIXC0</TD>
+  <TD>INSTR_RETIRED_ANY</TD>
+</TR>
+<TR>
+  <TD>FIXC1</TD>
+  <TD>CPU_CLK_UNHALTED_CORE</TD>
+</TR>
+<TR>
+  <TD>FIXC2</TD>
+  <TD>CPU_CLK_UNHALTED_REF</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>anythread</TD>
+  <TD>N</TD>
+  <TD>Set bit 2+(index*4) in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit (index*4) in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor NEH_PMC
+<H2>General-purpose counters</H2>
+<P>The Intel® Nehalem microarchitecture provides 4 general-purpose counters consisting of a config and a counter register.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>PMC0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC3</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit 17 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>anythread</TD>
+  <TD>N</TD>
+  <TD>Set bit 21 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+<H3>Special handling for events</H3>
+<P>The Intel® Nehalem microarchitecture supports measuring offcore events with the PMC counters. To do so, the stream of offcore events must be filtered using the OFFCORE_RESPONSE registers. The Intel® Nehalem microarchitecture has one of those registers. Custom filtering can be applied with the OFFCORE_RESPONSE_0_OPTIONS event. Two more counter options are available only for those events:</P>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>match0</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Input value masked with 0xFF and written to bits 0-7 in the OFFCORE_RESPONSE register</TD>
+  <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/NHM-EP">https://download.01.org/perfmon/NHM-EP</A>.</TD>
+</TR>
+<TR>
+  <TD>match1</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Input value masked with 0xF7 and written to bits 8-15 in the OFFCORE_RESPONSE register</TD>
+  <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/NHM-EP">https://download.01.org/perfmon/NHM-EP</A>.</TD>
+</TR>
+</TABLE>
+
+<H1>Counters available for one hardware thread per socket</H1>
+\anchor NEH_UNCORE
+<H2>Uncore counters</H2>
+<P>The Intel® Nehalem microarchitecture provides 8 general-purpose counters consisting of a config and a counter register. Moreover, there is a fixed-purpose counter to measure the clock of the Uncore.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>UPMC0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UPMC1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UPMC2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UPMC3</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UPMC4</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UPMC5</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UPMC6</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UPMC7</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UPMCFIX</TD>
+  <TD>UNCORE_CLOCKTICKS</TD>
+</TR>
+</TABLE>
+<H3>Available Options (Only for UPMC<0-7> counters)</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>anythread</TD>
+  <TD>N</TD>
+  <TD>Set bit 21 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>opcode</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 40-47 in MSR_UNCORE_ADDR_OPCODE_MATCH register</TD>
+  <TD>Documented but register only available in Westmere architecture. A list of valid opcodes can be found in the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A>.</TD>
+</TR>
+<TR>
+  <TD>match0</TD>
+  <TD>40 bit physical memory address</TD>
+  <TD>Extract bits 3-39 from address and write them to bits 3-39 in MSR_UNCORE_ADDR_OPCODE_MATCH register</TD>
+  <TD>Documented but register only available in Westmere architecture. </TD>
+</TR>
+</TABLE>
+
+*/
diff --git a/doc/archs/nehalemex.md b/doc/archs/nehalemex.md
new file mode 100644
index 0000000..8bbb735
--- /dev/null
+++ b/doc/archs/nehalemex.md
@@ -0,0 +1,554 @@
+/*! \page nehalemex Intel® Nehalem EX
+
+<H1>Available performance monitors for the Intel® Nehalem EX microarchitecture</H1>
+<UL>
+<LI>\ref NEHEX_FIXED "Fixed-purpose counters"</LI>
+<LI>\ref NEHEX_PMC "General-purpose counters"</LI>
+<LI>\ref NEHEX_MBOX "Memory controller counters"</LI>
+<LI>\ref NEHEX_BBOX "Home Agent counters"</LI>
+<LI>\ref NEHEX_RBOX "Crossbar router counters"</LI>
+<LI>\ref NEHEX_CBOX "Last Level cache counters"</LI>
+<LI>\ref NEHEX_SBOX "LLC-to-QPI interface counters"</LI>
+<LI>\ref NEHEX_WBOX "Power control unit counters"</LI>
+<LI>\ref NEHEX_UBOX "Uncore management counters"</LI>
+</UL>
+
+<H1>Counters available for each hardware thread</H1>
+\anchor NEHEX_FIXED
+<H2>Fixed-purpose counters</H2>
+<P>Since the Core2 microarchitecture, Intel® provides a set of fixed-purpose counters. Each can measure only one specific event.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>FIXC0</TD>
+  <TD>INSTR_RETIRED_ANY</TD>
+</TR>
+<TR>
+  <TD>FIXC1</TD>
+  <TD>CPU_CLK_UNHALTED_CORE</TD>
+</TR>
+<TR>
+  <TD>FIXC2</TD>
+  <TD>CPU_CLK_UNHALTED_REF</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>anythread</TD>
+  <TD>N</TD>
+  <TD>Set bit 2+(index*4) in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit (index*4) in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor NEHEX_PMC
+<H2>General-purpose counters</H2>
+<P>The Intel® Nehalem EX microarchitecture provides 4 general-purpose counters consisting of a config and a counter register.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>PMC0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC3</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit 17 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+<H3>Special handling for events</H3>
+<P>The Intel® Nehalem EX microarchitecture supports measuring offcore events with the PMC counters. To do so, the stream of offcore events must be filtered using the OFFCORE_RESPONSE registers. The Intel® Nehalem EX microarchitecture has two of those registers. Custom filtering can be applied with the OFFCORE_RESPONSE_0_OPTIONS event. Two more counter options are available only for those events:</P>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>match0</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Input value masked with 0xFF and written to bits 0-7 in the OFFCORE_RESPONSE register</TD>
+  <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/NHM-EX">https://download.01.org/perfmon/NHM-EX</A>.</TD>
+</TR>
+<TR>
+  <TD>match1</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Input value masked with 0xF7 and written to bits 8-15 in the OFFCORE_RESPONSE register</TD>
+  <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/NHM-EX">https://download.01.org/perfmon/NHM-EX</A>.</TD>
+</TR>
+</TABLE>
+
+<H1>Counters available for one hardware thread per socket</H1>
+\anchor NEHEX_MBOX
+<H2>MBOX counters</H2>
+<P>The Intel® Nehalem EX microarchitecture provides measurements of the memory controllers in the Uncore. The description from Intel®:<BR>
+<I>The memory controller interfaces to the Intel® 7500 Scalable Memory Buffers and translates read and write commands into specific Intel® Scalable Memory Interconnect (Intel® SMI) operations. Intel SMI is based on the FB-DIMM architecture, but the Intel 7500 Scalable Memory Buffer is not an AMB2 device and has significant exceptions to the FB-DIMM2 architecture. The memory controller also provides a variety of RAS features, such as ECC, memory scrubbing, thermal throttling,  [...]
+</I><BR>
+The Intel® Nehalem EX microarchitecture has 2 memory controllers, each with 6 general-purpose counters. They are exposed through the MSR interface to the operating system kernel. The MBOX and RBOX setup routines are taken from Likwid 3; they are not as flexible as the newer setup routines, but programming of the MBOXes and RBOXes is tedious for Nehalem EX. It is not possible to specify a FVID (Fill Victim Index) for the MBOX or IPERF option for RBOXes.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>MBOX<0,1>C0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>MBOX<0,1>C1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>MBOX<0,1>C2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>MBOX<0,1>C3</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>MBOX<0,1>C4</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>MBOX<0,1>C5</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Special handling for events</H3>
+<P>For the events DRAM_CMD_ALL and DRAM_CMD_ILLEGAL two counter options are available:</P>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>match0</TD>
+  <TD>34 bit address</TD>
+  <TD>Set bits 0-33 in MSR_M<0,1>_PMON_ADDR_MATCH register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>mask0</TD>
+  <TD>60 bit hex value</TD>
+  <TD>Extract bits 6-33 from address and set bits 0-27 in MSR_M<0,1>_PMON_ADDR_MASK register</TD>
+  <TD></TD>
+</TR>
+</TABLE><BR>
+<P>For the events THERM_TRP_DN and THERM_TRP_UP you cannot measure events for all and one specific DIMM simultaneously because they program the same filter register MSR_M<0,1>_PMON_MSC_THR and have contrary configurations.</P>
+<P>Although the events FVC_EV<0-3> are available to measure multiple memory events, some overlap and do not allow simultaneous measuring. That's because they program the same filter register MSR_M<0,1>_PMON_ZDP and have conflicting configurations. One example is the FVC_EV<0-3>_BBOX_CMDS_READS and FVC_EV<0-3>_BBOX_CMDS_WRITES events, which measure memory reads or writes but cannot be measured at the same time.</P>
+
+
+
+\anchor NEHEX_BBOX
+<H2>BBOX counters</H2>
+<P>The Intel® Nehalem EX microarchitecture provides measurements of the Home Agent in the Uncore. The description from Intel®:<BR>
+<I>The B-Box is responsible for the protocol side of memory interactions, including coherent and non-coherent home agent protocols (as defined in the Intel® QuickPath Interconnect Specification). Additionally, the B-Box is responsible for ordering memory reads/writes to a given address such that the M-Box does not have to perform this conflict checking. All requests for memory attached to the coupled M-Box must first be ordered through the B-Box.
+</I><BR>
+The memory traffic in an Intel® Nehalem EX system is controlled by the Home Agents. Each MBOX has a corresponding BBOX. Each BBOX offers 4 general-purpose counters. They are exposed through the MSR interface to the operating system kernel.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>BBOX<0,1>C0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>BBOX<0,1>C1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>BBOX<0,1>C2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>BBOX<0,1>C3</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Special handling for events</H3>
+<P>For the matching events MSG_IN_MATCH, MSG_ADDR_IN_MATCH, MSG_OPCODE_ADDR_IN_MATCH, MSG_OPCODE_IN_MATCH, MSG_OPCODE_OUT_MATCH, MSG_OUT_MATCH, OPCODE_ADDR_IN_MATCH, OPCODE_IN_MATCH, OPCODE_OUT_MATCH and ADDR_IN_MATCH two counter options are available:</P>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>match0</TD>
+  <TD>60 bit hex value</TD>
+  <TD>Set bits 0-59 in MSR_B<0,1>_PMON_MATCH register</TD>
+  <TD>For register layout and valid settings see <A HREF="http://www.intel.com/Assets/en_US/PDF/designguide/323535.pdf">Intel® Xeon® Processor 7500 Series Uncore Programming Guide</A></TD>
+</TR>
+<TR>
+  <TD>mask0</TD>
+  <TD>60 bit hex value</TD>
+  <TD>Set bits 0-59 in MSR_B<0,1>_PMON_MASK register</TD>
+  <TD>For register layout and valid settings see <A HREF="http://www.intel.com/Assets/en_US/PDF/designguide/323535.pdf">Intel® Xeon® Processor 7500 Series Uncore Programming Guide</A></TD>
+</TR>
+</TABLE>
+
+\anchor NEHEX_RBOX
+<H2>RBOX counters</H2>
+<P>The Intel® Nehalem EX microarchitecture provides measurements of the crossbar router in the Uncore. The description from Intel®:<BR>
+<I>The Crossbar Router (R-Box) is a 8 port switch/router implementing the Intel® QuickPath Interconnect Link and Routing layers. The R-Box is responsible for routing and transmitting all intra- and inter-processor communication.
+</I><BR>
+The Intel® Nehalem EX microarchitecture has two interfaces to the RBOX although each socket contains only one crossbar router. Each RBOX offers 8 general-purpose counters. They are exposed through the MSR interface to the operating system kernel. The RBOX setup routine is taken from Likwid 3.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>RBOX<0,1>C0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>RBOX<0,1>C1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>RBOX<0,1>C2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>RBOX<0,1>C3</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>RBOX<0,1>C4</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>RBOX<0,1>C5</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>RBOX<0,1>C6</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>RBOX<0,1>C7</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+
+\anchor NEHEX_CBOX
+<H2>CBOX counters</H2>
+<P>The Intel® Nehalem EX microarchitecture provides measurements of the LLC coherency engine in the Uncore. The description from Intel®:<BR>
+<I>For the Intel Xeon Processor 7500 Series, the LLC coherence engine (C-Box) manages the interface between the core and the last level cache (LLC). All core transactions that access the LLC are directed from the core to a C-Box via the ring interconnect. The C-Box is responsible for managing data delivery from the LLC to the requesting core. It is also responsible for maintaining coherence between the cores within the socket that share the LLC; generating snoops and collecting snoop res [...]
+The C-Box is also the gate keeper for all Intel® QuickPath Interconnect (Intel® QPI) messages that originate in the core and is responsible for ensuring that all Intel QuickPath Interconnect messages that pass through the socket’s LLC remain coherent.
+</I><BR>
+The Intel® Nehalem EX microarchitecture has 8 CBOX instances. Each CBOX offers 6 general-purpose counters. They are exposed through the MSR interface to the operating system kernel.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>CBOX<0-7>C0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>CBOX<0-7>C1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>CBOX<0-7>C2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>CBOX<0-7>C3</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>CBOX<0-7>C4</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>CBOX<0-7>C5</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>5 bit hex value</TD>
+  <TD>Set bits 24-28 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor NEHEX_SBOX
+<H2>SBOX counters</H2>
+<P>The Intel® Nehalem EX microarchitecture provides measurements of the LLC-to-QPI interface in the Uncore. The description from Intel®:<BR>
+<I>The S-Box represents the interface between the last level cache and the system interface. It manages flow control between the C and R & B-Boxes. The S-Box is broken into system bound (ring to Intel® QPI) and ring bound (Intel® QPI to ring) connections.<BR>
+As such, it shares responsibility with the C-Box(es) as the Intel® QPI caching agent(s). It is responsible for converting C-box requests to Intel® QPI messages (i.e. snoop generation and data response messages from the snoop response) as well as converting/forwarding ring messages to Intel® QPI packets and vice versa.
+</I><BR>
+The Intel® Nehalem EX microarchitecture has 2 SBOX instances. Each SBOX offers 4 general-purpose counters. They are exposed through the MSR interface to the operating system kernel.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>SBOX<0,1>C0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>SBOX<0,1>C1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>SBOX<0,1>C2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>SBOX<0,1>C3</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+<H3>Special handling for events</H3>
+<P>Only for the TO_R_PROG_EV events two counter options are available:</P>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>match0</TD>
+  <TD>64 bit hex value</TD>
+  <TD>Set bit 0-63 in MSR_S<0,1>_PMON_MATCH register</TD>
+  <TD>For register layout and valid settings see <A HREF="http://www.intel.com/Assets/en_US/PDF/designguide/323535.pdf">Intel® Xeon® Processor 7500 Series Uncore Programming Guide</A></TD>
+</TR>
+<TR>
+  <TD>mask0</TD>
+  <TD>39 bit hex value</TD>
+  <TD>Set bit 0-38 in MSR_S<0,1>_PMON_MASK register</TD>
+  <TD>For register layout and valid settings see <A HREF="http://www.intel.com/Assets/en_US/PDF/designguide/323535.pdf">Intel® Xeon® Processor 7500 Series Uncore Programming Guide</A></TD>
+</TR>
+</TABLE>
+
+\anchor NEHEX_WBOX
+<H2>WBOX counters</H2>
+<P>The Intel® Nehalem EX microarchitecture provides measurements of the power controller in the Uncore. The description from Intel®:<BR>
+<I>The W-Box is the primary Power Controller for the Intel® Xeon® Processor 7500 Series.
+</I><BR>
+The Intel® Nehalem EX microarchitecture has one WBOX and it offers 4 general-purpose counters and one fixed counter. They are exposed through the MSR interface to the operating system kernel.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>WBOX0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>WBOX1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>WBOX2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>WBOX3</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>WBOXFIX</TD>
+  <TD>UNCORE_CLOCKTICKS</TD>
+</TR>
+</TABLE>
+<H3>Available Options (Only for WBOX<0-3> counters)</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor NEHEX_UBOX
+<H2>UBOX counters</H2>
+<P>The Intel® Nehalem EX microarchitecture provides measurements of the system configuration controller in the Uncore. The description from Intel®:<BR>
+<I>The U-Box serves as the system configuration controller for the Intel® Xeon® Processor 7500 Series.
+</I><BR>
+The Intel® Nehalem EX microarchitecture has one UBOX and it offers a single general-purpose counter. It is exposed through the MSR interface to the operating system kernel.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>UBOX0</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+*/
diff --git a/doc/archs/pentiumm.md b/doc/archs/pentiumm.md
new file mode 100644
index 0000000..8ebc46d
--- /dev/null
+++ b/doc/archs/pentiumm.md
@@ -0,0 +1,63 @@
+/*! \page pentiumm Intel® Pentium M
+
+<H1>Available performance monitors for the Intel® Pentium M microarchitecture</H1>
+<UL>
+<LI>\ref PM_PMC "General-purpose counters"</LI>
+</UL>
+
+<H1>Counters available for each hardware thread</H1>
+\anchor PM_PMC
+<H2>PMC counters</H2>
+<P>The Intel® Pentium M microarchitecture provides 2 general-purpose counters consisting of a config and a counter register.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>PMC0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC1</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit 17 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+
+*/
diff --git a/doc/archs/phi.md b/doc/archs/phi.md
new file mode 100644
index 0000000..ac256c8
--- /dev/null
+++ b/doc/archs/phi.md
@@ -0,0 +1,78 @@
+/*! \page phi Intel® Xeon Phi
+
+<P>To use LIKWID you have to turn off power management on the MIC. LIKWID relies on
+RDTSC being used for wallclock time. On the MIC this is only given if power
+management is turned off. This can be configured in
+<CODE>/etc/sysconfig/mic/default.conf</CODE>.<BR>
+
+At the end of this file the power management is configured. The following
+configuration worked:<BR>
+<CODE>PowerManagement "cpufreq_off;corec6_off;pc3_off;pc6_off"</CODE>
+</P>
+
+<H1>Available performance monitors for the Intel® Xeon Phi microarchitecture</H1>
+<UL>
+<LI>\ref PHI_PMC "General-purpose counters"</LI>
+</UL>
+
+<H1>Counters available for each hardware thread</H1>
+\anchor PHI_PMC
+<H2>General-purpose counters</H2>
+<P>The Intel® Xeon Phi microarchitecture provides 2 general-purpose counters consisting of a config and a counter register.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>PMC0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC1</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit 17 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>anythread</TD>
+  <TD>N</TD>
+  <TD>Set bit 21 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+
+*/
diff --git a/doc/archs/sandybridge.md b/doc/archs/sandybridge.md
new file mode 100644
index 0000000..385a724
--- /dev/null
+++ b/doc/archs/sandybridge.md
@@ -0,0 +1,189 @@
+/*! \page sandybridge Intel® SandyBridge
+
+<H1>Available performance monitors for the Intel® SandyBridge microarchitecture</H1>
+<UL>
+<LI>\ref SNB_FIXED "Fixed-purpose counters"</LI>
+<LI>\ref SNB_PMC "General-purpose counters"</LI>
+<LI>\ref SNB_THERMAL "Thermal counters"</LI>
+<LI>\ref SNB_POWER "Power measurement counters"</LI>
+</UL>
+
+<H1>Counters available for each hardware thread</H1>
+\anchor SNB_FIXED
+<H2>Fixed-purpose counters</H2>
+<P>Since the Core2 microarchitecture, Intel® provides a set of fixed-purpose counters. Each can measure only one specific event.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>FIXC0</TD>
+  <TD>INSTR_RETIRED_ANY</TD>
+</TR>
+<TR>
+  <TD>FIXC1</TD>
+  <TD>CPU_CLK_UNHALTED_CORE</TD>
+</TR>
+<TR>
+  <TD>FIXC2</TD>
+  <TD>CPU_CLK_UNHALTED_REF</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>anythread</TD>
+  <TD>N</TD>
+  <TD>Set bit 2+(index*4) in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit (index*4) in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor SNB_PMC
+<H2>General-purpose counters</H2>
+<P>The Intel® SandyBridge microarchitecture provides 4 general-purpose counters consisting of a config and a counter register.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>PMC0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC3</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit 17 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>anythread</TD>
+  <TD>N</TD>
+  <TD>Set bit 21 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+<H3>Special handling for events</H3>
+<P>The Intel® SandyBridge microarchitecture provides measuring of offcore events in PMC counters. Therefore the stream of offcore events must be filtered using the OFFCORE_RESPONSE registers. The Intel® SandyBridge microarchitecture has two of those registers. LIKWID defines some events that perform the filtering according to the event name. Although there are many bitmasks possible, LIKWID natively provides only the ones with response type ANY. Own filtering can be applied with [...]
+</P>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>match0</TD>
+  <TD>16 bit hex value</TD>
+  <TD>Input value masked with 0x8FFF and written to bits 0-15 in the OFFCORE_RESPONSE register</TD>
+  <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/SNB">https://download.01.org/perfmon/SNB</A>.</TD>
+</TR>
+<TR>
+  <TD>match1</TD>
+  <TD>22 bit hex value</TD>
+  <TD>Input value is written to bits 16-37 in the OFFCORE_RESPONSE register</TD>
+  <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/SNB">https://download.01.org/perfmon/SNB</A>.</TD>
+</TR>
+</TABLE>
+
+\anchor SNB_THERMAL
+<H2>Thermal counter</H2>
+<P>The Intel® SandyBridge microarchitecture provides one register for the current core temperature.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>TMP0</TD>
+  <TD>TEMP_CORE</TD>
+</TR>
+</TABLE>
+
+<H1>Counters available for one hardware thread per socket</H1>
+\anchor SNB_POWER
+<H2>Power counter</H2>
+<P>The Intel® SandyBridge microarchitecture provides measurements of the current power consumption through the RAPL interface.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>PWR0</TD>
+  <TD>PWR_PKG_ENERGY</TD>
+</TR>
+<TR>
+  <TD>PWR1</TD>
+  <TD>PWR_PP0_ENERGY</TD>
+</TR>
+<TR>
+  <TD>PWR2*</TD>
+  <TD>PWR_PP1_ENERGY</TD>
+</TR>
+<TR>
+  <TD>PWR3</TD>
+  <TD>PWR_DRAM_ENERGY</TD>
+</TR>
+</TABLE>
+<P>*) The PWR2 counter is often not implemented by Intel® SandyBridge systems</P>
+*/
diff --git a/doc/archs/sandybridgeep.md b/doc/archs/sandybridgeep.md
new file mode 100644
index 0000000..ce98c8a
--- /dev/null
+++ b/doc/archs/sandybridgeep.md
@@ -0,0 +1,775 @@
+/*! \page sandybridgeep Intel® SandyBridge EP/EN
+
+<H1>Available performance monitors for the Intel® SandyBridge EP/EN microarchitecture</H1>
+<UL>
+<LI>\ref SNBEP_FIXED "Fixed-purpose counters"</LI>
+<LI>\ref SNBEP_PMC "General-purpose counters"</LI>
+<LI>\ref SNBEP_THERMAL "Thermal counters"</LI>
+<LI>\ref SNBEP_POWER "Power measurement counters"</LI>
+<LI>\ref SNBEP_MBOX "Integrated memory controller counters"</LI>
+<LI>\ref SNBEP_CBOX "Last Level cache counters"</LI>
+<LI>\ref SNBEP_UBOX "Uncore management counters"</LI>
+<LI>\ref SNBEP_SBOX "Intel® QPI Link Layer counters"</LI>
+<LI>\ref SNBEP_BBOX "Home Agent counters"</LI>
+<LI>\ref SNBEP_WBOX "Power control unit counters"</LI>
+<LI>\ref SNBEP_RBOX "Ring-to-QPI interface counters"</LI>
+<LI>\ref SNBEP_PBOX "Ring-to-PCIe interface counters"</LI>
+</UL>
+
+<H1>Counters available for each hardware thread</H1>
+\anchor SNBEP_FIXED
+<H2>Fixed counters</H2>
+<P>Since the Core2 microarchitecture, Intel® provides a set of fixed-purpose counters. Each can measure only one specific event.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>FIXC0</TD>
+  <TD>INSTR_RETIRED_ANY</TD>
+</TR>
+<TR>
+  <TD>FIXC1</TD>
+  <TD>CPU_CLK_UNHALTED_CORE</TD>
+</TR>
+<TR>
+  <TD>FIXC2</TD>
+  <TD>CPU_CLK_UNHALTED_REF</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>anythread</TD>
+  <TD>N</TD>
+  <TD>Set bit 2+(index*4) in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit (index*4) in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor SNBEP_PMC
+<H2>PMC counters</H2>
+<P>The Intel® SandyBridge microarchitecture provides 4 general-purpose counters consisting of a config and a counter register.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>PMC0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC3</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit 17 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>anythread</TD>
+  <TD>N</TD>
+  <TD>Set bit 21 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+<H3>Special handling for events</H3>
+<P>The Intel® SandyBridge microarchitecture provides measuring of offcore events in PMC counters. Therefore the stream of offcore events must be filtered using the OFFCORE_RESPONSE registers. The Intel® SandyBridge microarchitecture has two of those registers. LIKWID defines some events that perform the filtering according to the event name. Although there are many bitmasks possible, LIKWID natively provides only the ones with response type ANY. Own filtering can be applied with [...]
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>match0</TD>
+  <TD>16 bit hex value</TD>
+  <TD>Input value masked with 0x8FFF and written to bits 0-15 in the OFFCORE_RESPONSE register</TD>
+  <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/JKT">https://download.01.org/perfmon/JKT</A>.</TD>
+</TR>
+<TR>
+  <TD>match1</TD>
+  <TD>22 bit hex value</TD>
+  <TD>Input value is written to bits 16-37 in the OFFCORE_RESPONSE register</TD>
+  <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/JKT">https://download.01.org/perfmon/JKT</A>.</TD>
+</TR>
+</TABLE>
+
+\anchor SNBEP_THERMAL
+<H2>Thermal counter</H2>
+<P>The Intel® SandyBridge microarchitecture provides one register for the current core temperature.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>TMP0</TD>
+  <TD>TEMP_CORE</TD>
+</TR>
+</TABLE>
+
+<H1>Counters available for one hardware thread per socket</H1>
+\anchor SNBEP_POWER
+<H2>Power counter</H2>
+<P>The Intel® SandyBridge microarchitecture provides measurements of the current power consumption through the RAPL interface.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>PWR0</TD>
+  <TD>PWR_PKG_ENERGY</TD>
+</TR>
+<TR>
+  <TD>PWR1</TD>
+  <TD>PWR_PP0_ENERGY</TD>
+</TR>
+<TR>
+  <TD>PWR2</TD>
+  <TD>PWR_PP1_ENERGY</TD>
+</TR>
+<TR>
+  <TD>PWR3</TD>
+  <TD>PWR_DRAM_ENERGY</TD>
+</TR>
+</TABLE>
+
+\anchor SNBEP_MBOX
+<H2>Memory controller counters</H2>
+<P>The Intel® SandyBridge EP/EN microarchitecture provides measurements of the integrated Memory Controllers (iMC) in the uncore. The description from Intel®:<BR>
+<I>The integrated Memory Controller provides the interface to DRAM and communicates to the rest of the uncore through the Home Agent (i.e. the iMC does not connect to the Ring).<BR>
+In conjunction with the HA, the memory controller also provides a variety of RAS features, such as ECC, lockstep, memory access retry, memory scrubbing, thermal throttling, mirroring, and rank sparing.
+</I><BR>
+The uncore management performance counters are exposed to the operating system through PCI interfaces. All SandyBridge based systems have one memory controller. There are 4 different PCI devices per memory controller, each covering one memory channel. Each channel has 4 different general-purpose counters and one fixed counter for the DRAM clock. The name MBOX originates from the Nehalem EX uncore monitoring.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>MBOX<0-3>C0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>MBOX<0-3>C1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>MBOX<0-3>C2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>MBOX<0-3>C3</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>MBOX<0-3>FIX</TD>
+  <TD>DRAM_CLOCKTICKS</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options (Only for counter MBOX<0-3>C<0-3>)</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Operation</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor SNBEP_CBOX
+<H2>Last Level cache counters</H2>
+<P>The Intel® SandyBridge EP/EN microarchitecture provides measurements of the LLC coherency engine in the uncore. The description from Intel®:<BR>
+<I>The LLC coherence engine (CBo) manages the interface between the core and the last level cache (LLC). All core transactions that access the LLC are directed from the core to a CBo via the ring interconnect. The CBo is responsible for managing data delivery from the LLC to the requesting core. It is also responsible for maintaining coherence between the cores within the socket that share the
+LLC; generating snoops and collecting snoop responses from the local cores when the MESIF protocol requires it.
+</I><BR>
+The Last Level cache performance counters are exposed to the operating system through the MSR interface. SandyBridge EN/EP systems have at most 8 CBOXes, each with 4 general-purpose counters. The name CBOX originates from the Nehalem EX uncore monitoring.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>CBOX<0-7>C0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>CBOX<0-7>C1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>CBOX<0-7>C2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>CBOX<0-7>C3</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Operation</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>opcode</TD>
+  <TD>9 bit opcode identifier, see uncore performance monitoring guide for SandyBridge</TD>
+  <TD>Set bits 23-31 in CBOX filter register MSR_UNC_C<0-7>_PMON_BOX_FILTER</TD>
+  <TD>LIKWID checks whether the given value is a valid opcode. A list of all valid opcodes can be found in the <A HREF="http://www.intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-uncore-guide.html">Intel® E5-2600 uncore monitoring guide</A></TD>
+</TR>
+<TR>
+  <TD>state</TD>
+  <TD>5 bit state representation</TD>
+  <TD>Set bits 18-22 in CBOX filter register MSR_UNC_C<0-7>_PMON_BOX_FILTER</TD>
+  <TD>F: 0x10,<BR>M: 0x08,<BR>E: 0x04,<BR>S: 0x02,<BR>I: 0x01</TD>
+</TR>
+<TR>
+  <TD>nid</TD>
+  <TD>8 bit node ID</TD>
+  <TD>Set bits 10-17 in CBOX filter register MSR_UNC_C<0-7>_PMON_BOX_FILTER</TD>
+  <TD>Note that for Node ID 0 the hex value should be 0x01.</TD>
+</TR>
+<TR>
+  <TD>tid</TD>
+  <TD>5 bit thread ID value</TD>
+  <TD>Set bits 0-4 in CBOX filter register MSR_UNC_C<0-7>_PMON_BOX_FILTER</TD>
+  <TD>Bit 0 means physical or logical thread, bits 1-3 the core ID</TD>
+</TR>
+</TABLE>
+
+<H3>Special handling for events</H3>
+<P>The Intel® SandyBridge EP/EN microarchitecture provides an event LLC_LOOKUP which can be filtered with the 'state' option. If no 'state' is set, LIKWID sets the state to 0x1F, the default value to measure all lookups.</P>
+
+\anchor SNBEP_UBOX
+<H2>Uncore management counters</H2>
+<P>The Intel® SandyBridge EP/EN microarchitecture provides measurements of the management box in the uncore. The description from Intel®:<BR>
+<I>The UBox serves as the system configuration controller for the Intel® Xeon Processor E5-2600 family uncore.<BR>
+In this capacity, the UBox acts as the central unit for a variety of functions:<BR>
+<UL>
+<LI>The master for reading and writing physically distributed registers across the uncore using the Message Channel.</LI>
+<LI>The UBox is the intermediary for interrupt traffic, receiving interrupts from the system and dispatching interrupts to the appropriate core.</LI>
+<LI>The UBox serves as the system lock master used when quiescing the platform (e.g., Intel® QPI bus lock).</LI>
+</UL>
+</I><BR>
+The uncore management performance counters are exposed to the operating system through the MSR interface. The name UBOX originates from the Nehalem EX uncore monitoring.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>UBOX0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UBOX1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UBOXFIX</TD>
+  <TD>UBOX_CLOCKTICKS</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options (Only for counter UBOX<0,1>)</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Operation</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>5 bit hex value</TD>
+  <TD>Set bits 24-28 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor SNBEP_SBOX
+<H2>Intel® QPI Link Layer counters</H2>
+<P>The Intel® SandyBridge EP/EN microarchitecture provides measurements of the QPI Link layer (QPI) in the uncore. The description from Intel®:<BR>
+<I>The Intel® QPI Link Layer is responsible for packetizing requests from the caching agent on the way out to the system interface. As such, it shares responsibility with the CBo(s) as the Intel® QPI caching agent(s). It is responsible for converting CBo requests to Intel® QPI messages (i.e. snoop generation and data response messages from the snoop response) as well as converting/forwarding ring
+messages to Intel® QPI packets and vice versa.<BR>
+The Intel® QPI is split into two separate layers. The Intel® QPI LL (link layer) is responsible for generating, transmitting, and receiving packets with the Intel® QPI link.
+</I><BR>
+The QPI hardware performance counters are exposed to the operating system through PCI interfaces. There are two of those interfaces for the QPI. If your system does not provide all interfaces and interface 0 does not work, try the other one. The name SBOX originates from the Nehalem EX uncore monitoring.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>SBOX<0,1>C0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>SBOX<0,1>C1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>SBOX<0,1>C2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>SBOX<0,1>C3</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>SBOX<0,1>FIX</TD>
+  <TD>QPI_RATE, QPI_SLOW_MODE</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options (Only for counter SBOX<0,1>C<0-3>)</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Operation</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>match0</TD>
+  <TD>32 bit hex address</TD>
+  <TD>Input value masked with 0x8003FFF8 and written to bits 0-31 in the PCI_UNC_QPI_PMON_MATCH_0 register of PCI device</TD>
+  <TD>Only if corresponding device available. See <A HREF="http://www.intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-uncore-guide.html">Intel® E5-2600 uncore monitoring guide</A> for fields in PCI_UNC_QPI_PMON_MATCH_0</TD>
+</TR>
+<TR>
+  <TD>match1</TD>
+  <TD>20 bit hex address</TD>
+  <TD>Input value masked with 0x000F000F and written to bits 0-19 in the PCI_UNC_QPI_PMON_MATCH_1 register of PCI device</TD>
+  <TD>Only if corresponding device available. See <A HREF="http://www.intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-uncore-guide.html">Intel® E5-2600 uncore monitoring guide</A> for fields in PCI_UNC_QPI_PMON_MATCH_1</TD>
+</TR>
+<TR>
+  <TD>mask0</TD>
+  <TD>32 bit hex address</TD>
+  <TD>Input value masked with 0x8003FFF8 and written to bits 0-31 in the PCI_UNC_QPI_PMON_MASK_0 register of PCI device</TD>
+  <TD>Only if corresponding device available. See <A HREF="http://www.intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-uncore-guide.html">Intel® E5-2600 uncore monitoring guide</A> for fields in PCI_UNC_QPI_PMON_MASK_0</TD>
+</TR>
+<TR>
+  <TD>mask1</TD>
+  <TD>20 bit hex address</TD>
+  <TD>Input value masked with 0x000F000F and written to bits 0-19 in the PCI_UNC_QPI_PMON_MASK_1 register of PCI device</TD>
+  <TD>Only if corresponding device available. See <A HREF="http://www.intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-uncore-guide.html">Intel® E5-2600 uncore monitoring guide</A> for fields in PCI_UNC_QPI_PMON_MASK_1</TD>
+</TR>
+</TABLE>
+
+\anchor SNBEP_BBOX
+<H2>BBOX counter</H2>
+<P>The Intel® SandyBridge EP/EN microarchitecture provides measurements of the Home Agent (HA) in the uncore. The description from Intel®:<BR>
+<I>The HA is responsible for the protocol side of memory interactions, including coherent and non-coherent home agent protocols (as defined in the Intel® QuickPath Interconnect Specification). Additionally, the HA is responsible for ordering memory reads/writes, coming in from the modular Ring, to a given address such that the iMC (memory controller).<BR>
+In other words, it is the coherency agent responsible for guarding the memory controller. All requests for memory attached to the coupled iMC must first be ordered through the HA.
+</I><BR>
+The HA hardware performance counters are exposed to the operating system through PCI interfaces. The name BBOX originates from the Nehalem EX uncore monitoring.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>BBOX0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>BBOX1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>BBOX2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>BBOX3</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>opcode</TD>
+  <TD>6 bit hex value</TD>
+  <TD>Set bits 0-5 in PCI_UNC_HA_PMON_OPCODEMATCH register of PCI device</TD>
+  <TD>A table of all valid opcodes can be found in the <A HREF="http://www.intel.de/content/www/de/de/processors/xeon/xeon-e5-2600-uncore-guide.html">Intel® E5-2600 uncore monitoring guide</A>.</TD>
+</TR>
+<TR>
+  <TD>match0</TD>
+  <TD>46 bit hex address</TD>
+  <TD>Extract bits 6-31 and set bits 6-31 in PCI_UNC_HA_PMON_ADDRMATCH0 register of PCI device<BR>Extract bits 32-45 and set bits 0-13 in PCI_UNC_HA_PMON_ADDRMATCH1 register of PCI device</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor SNBEP_WBOX
+<H2>WBOX counter</H2>
+<P>The Intel® SandyBridge EP/EN microarchitecture provides measurements of the power control unit (PCU) in the uncore. The description from Intel®:<BR>
+<I>The PCU is the primary Power Controller for the physical processor package.<BR>
+The uncore implements a power control unit acting as a core/uncore power and thermal manager. It runs its firmware on an internal micro-controller and coordinates the socket’s power states.
+</I><BR>
+The PCU performance counters are exposed to the operating system through the MSR interface. The name WBOX originates from the Nehalem EX uncore monitoring.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>WBOX0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>WBOX1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>WBOX2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>WBOX3</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>WBOX0FIX</TD>
+  <TD>CORES_IN_C3</TD>
+</TR>
+<TR>
+  <TD>WBOX1FIX</TD>
+  <TD>CORES_IN_C6</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options (Only for WBOX<0-3> counters)</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Operation</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>5 bit hex value</TD>
+  <TD>Set bits 24-28 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD><TD></TD>
+</TR>
+<TR>
+  <TD>match0</TD>
+  <TD>32 bit hex value</TD>
+  <TD>Set bits 0-31 in MSR_UNC_PCU_PMON_BOX_FILTER register</TD>
+  <TD>Band0: bits 0-7,<BR>Band1: bits 8-15,<BR>Band2: bits 16-23,<BR>Band3: bits 24-31</TD>
+</TR>
+<TR>
+  <TD>occupancy</TD>
+  <TD>2 bit hex value</TD>
+  <TD>Set bits 14-15 in config register</TD>
+  <TD>Cores<BR>in C0: 0x1,<BR>in C3: 0x2,<BR>in C6: 0x3</TD>
+</TR>
+<TR>
+  <TD>occupancy_edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>occupancy_invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 30 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor SNBEP_RBOX
+<H2>RBOX counter</H2>
+<P>The Intel® SandyBridge EP/EN microarchitecture provides measurements of the Ring-to-QPI (R3QPI) interface in the uncore. The description from Intel®:<BR>
+<I>R3QPI is the interface between the Intel® QPI Link Layer, which packetizes requests, and the Ring.<BR>
+R3QPI is the interface between the ring and the Intel® QPI Link Layer. It is responsible for translating between ring protocol packets and flits that are used for transmitting data across the Intel® QPI interface. It performs credit checking between the local Intel® QPI LL, the remote Intel® QPI LL and other agents on the local ring.
+</I><BR>
+The R3QPI performance counters are exposed to the operating system through PCI interfaces. Since the RBOXes manage the traffic from the LLC-connecting ring interface on the socket with the QPI interfaces (SBOXes), their number corresponds to the number of SBOXes. See the SBOX section for how many are available for which system configuration. The name RBOX originates from the Nehalem EX uncore monitoring.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>RBOX<0,1>C0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>RBOX<0,1>C1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>RBOX<0,1>C2</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Operation</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor SNBEP_PBOX
+<H2>PBOX counter</H2>
+<P>The Intel® SandyBridge EP/EN microarchitecture provides measurements of the Ring-to-PCIe (R2PCIe) interface in the uncore. The description from Intel®:<BR>
+<I>R2PCIe represents the interface between the Ring and IIO traffic to/from PCIe.
+</I><BR>
+The R2PCIe performance counters are exposed to the operating system through a PCI interface. Independent of the system's configuration, there is only one Ring-to-PCIe interface. The name PBOX originates from the Nehalem EX uncore monitoring.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>PBOX0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PBOX1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PBOX2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PBOX3</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Operation</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+*/
diff --git a/doc/archs/silvermont.md b/doc/archs/silvermont.md
new file mode 100644
index 0000000..af22e32
--- /dev/null
+++ b/doc/archs/silvermont.md
@@ -0,0 +1,175 @@
+/*! \page silvermont Intel® Silvermont/Airmont
+
+<H1>Available performance monitors for the Intel® Silvermont microarchitecture</H1>
+<UL>
+<LI>\ref SVM_FIXED "Fixed-purpose counters"</LI>
+<LI>\ref SVM_PMC "General-purpose counters"</LI>
+<LI>\ref SVM_THERMAL "Thermal counters"</LI>
+<LI>\ref SVM_POWER "Power measurement counters"</LI>
+</UL>
+
+<H1>Counters available for each hardware thread</H1>
+\anchor SVM_FIXED
+<H2>Fixed-purpose counters</H2>
+<P>Since the Core2 microarchitecture, Intel® provides a set of fixed-purpose counters. Each can measure only one specific event.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>FIXC0</TD>
+  <TD>INSTR_RETIRED_ANY</TD>
+</TR>
+<TR>
+  <TD>FIXC1</TD>
+  <TD>CPU_CLK_UNHALTED_CORE</TD>
+</TR>
+<TR>
+  <TD>FIXC2</TD>
+  <TD>CPU_CLK_UNHALTED_REF</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>anythread</TD>
+  <TD>N</TD>
+  <TD>Set bit 2+(index*4) in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit (index*4) in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor SVM_PMC
+<H2>General-purpose counters</H2>
+<P>The Intel® Silvermont microarchitecture provides 2 general-purpose counters consisting of a config and a counter register.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>PMC0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC1</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit 17 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+<H3>Special handling for events</H3>
+<P>The Intel® Silvermont microarchitecture provides measuring of offcore events in PMC counters. Therefore the stream of offcore events must be filtered using the OFFCORE_RESPONSE registers. The Intel® Silvermont microarchitecture has two of those registers. LIKWID defines some events that perform the filtering according to the event name. Although there are many bitmasks possible, LIKWID natively provides only the ones with response type ANY. Own filtering can be applied with th [...]
+</P>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>match0</TD>
+  <TD>16 bit hex value</TD>
+  <TD>Input value masked with 0xFFFF and written to bits 0-15 in the OFFCORE_RESPONSE register</TD>
+  <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/SLM">https://download.01.org/perfmon/SLM</A>.</TD>
+</TR>
+<TR>
+  <TD>match1</TD>
+  <TD>22 bit hex value</TD>
+  <TD>Input value is written to bits 16-38 in the OFFCORE_RESPONSE register</TD>
+  <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/SLM">https://download.01.org/perfmon/SLM</A>.</TD>
+</TR>
+</TABLE>
+
+\anchor SVM_THERMAL
+<H2>Thermal counter</H2>
+<P>The Intel® Silvermont microarchitecture provides one register for the current core temperature.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>TMP0</TD>
+  <TD>TEMP_CORE</TD>
+</TR>
+</TABLE>
+
+<H1>Counters available for one hardware thread per socket</H1>
+\anchor SVM_POWER
+<H2>Power counters</H2>
+<P>The Intel® Silvermont microarchitecture provides measurements of the current power consumption through the RAPL interface.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>PWR0</TD>
+  <TD>PWR_PKG_ENERGY</TD>
+</TR>
+<TR>
+  <TD>PWR1</TD>
+  <TD>PWR_PP0_ENERGY</TD>
+</TR>
+<TR>
+  <TD>PWR2*</TD>
+  <TD>PWR_PP1_ENERGY</TD>
+</TR>
+<TR>
+  <TD>PWR3*</TD>
+  <TD>PWR_DRAM_ENERGY</TD>
+</TR>
+</TABLE>
+<P>*) The PWR2 and PWR3 counters are commonly not implemented by Intel® Silvermont systems.</P>
+*/
diff --git a/doc/archs/westmere.md b/doc/archs/westmere.md
new file mode 100644
index 0000000..3371c20
--- /dev/null
+++ b/doc/archs/westmere.md
@@ -0,0 +1,239 @@
+/*! \page westmere Intel® Westmere
+
+<P>The Intel® Westmere microarchitecture has the same features as the Intel® Nehalem architecture. There are some additional features like a second OFFCORE_RESPONSE register and an addr/opcode matching unit for general-purpose counters in the Uncore.</P>
+
+<H1>Available performance monitors for the Intel® Westmere microarchitecture</H1>
+<UL>
+<LI>\ref WES_FIXED "Fixed-purpose counters"</LI>
+<LI>\ref WES_PMC "General-purpose counters"</LI>
+<LI>\ref WES_UNCORE "Uncore counters"</LI>
+</UL>
+
+<H1>Counters available for each hardware thread</H1>
+\anchor WES_FIXED
+<H2>Fixed-purpose counters</H2>
+<P>Since the Core2 microarchitecture, Intel® provides a set of fixed-purpose counters. Each can measure only one specific event.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>FIXC0</TD>
+  <TD>INSTR_RETIRED_ANY</TD>
+</TR>
+<TR>
+  <TD>FIXC1</TD>
+  <TD>CPU_CLK_UNHALTED_CORE</TD>
+</TR>
+<TR>
+  <TD>FIXC2</TD>
+  <TD>CPU_CLK_UNHALTED_REF</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>anythread</TD>
+  <TD>N</TD>
+  <TD>Set bit 2+(index*4) in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit (index*4) in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor WES_PMC
+<H2>General-purpose counters</H2>
+<P>The Intel® Westmere microarchitecture provides 4 general-purpose counters consisting of a config and a counter register.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>PMC0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC3</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit 17 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>anythread</TD>
+  <TD>N</TD>
+  <TD>Set bit 21 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+<H3>Special handling for events</H3>
+<P>The Intel® Westmere microarchitecture provides measuring of offcore events in PMC counters. Therefore the stream of offcore events must be filtered using the OFFCORE_RESPONSE registers. The Intel® Westmere microarchitecture has two of those registers. Own filtering can be applied with the OFFCORE_RESPONSE_0_OPTIONS and OFFCORE_RESPONSE_1_OPTIONS events. Only for those events two more counter options are available:</P>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>match0</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Input value masked with 0xFF and written to bits 0-7 in the OFFCORE_RESPONSE register</TD>
+  <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/WSM-EP-SP">https://download.01.org/perfmon/WSM-EP-SP</A>.</TD>
+</TR>
+<TR>
+  <TD>match1</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Input value masked with 0xF7 and written to bits 8-15 in the OFFCORE_RESPONSE register</TD>
+  <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and <A HREF="https://download.01.org/perfmon/WSM-EP-SP">https://download.01.org/perfmon/WSM-EP-SP</A>.</TD>
+</TR>
+</TABLE>
+
+<H1>Counters available for one hardware thread per socket</H1>
+\anchor WES_UNCORE
+<H2>Uncore counters</H2>
+<P>The Intel® Westmere microarchitecture provides 8 general-purpose counters for the uncore consisting of a config and a counter register. Moreover, there is a fixed-purpose counter to measure the clock of the uncore.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>UPMC0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UPMC1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UPMC2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UPMC3</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UPMC4</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UPMC5</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UPMC6</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UPMC7</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>UPMCFIX</TD>
+  <TD>UNCORE_CLOCKTICKS</TD>
+</TR>
+</TABLE>
+<H3>Available Options (Only for UPMC<0-7> counters)</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>anythread</TD>
+  <TD>N</TD>
+  <TD>Set bit 21 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>opcode</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 40-47 in MSR_UNCORE_ADDR_OPCODE_MATCH register</TD>
+  <TD>A list of valid opcodes can be found in the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A>.</TD>
+</TR>
+<TR>
+  <TD>match0</TD>
+  <TD>40 bit physical memory address</TD>
+  <TD>Extract bits 3-39 from address and write them to bits 3-39 in MSR_UNCORE_ADDR_OPCODE_MATCH register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+*/
diff --git a/doc/archs/westmereex.md b/doc/archs/westmereex.md
new file mode 100644
index 0000000..ce37674
--- /dev/null
+++ b/doc/archs/westmereex.md
@@ -0,0 +1,555 @@
+/*! \page westmereex Intel® Westmere EX
+
+<P>The Intel® Westmere EX microarchitecture has the same features as the Intel® Westmere architecture. There are some additional features like a second OFFCORE_RESPONSE register and an addr/opcode matching unit for general-purpose counters in the uncore.</P>
+
+<H1>Available performance monitors for the Intel® Westmere EX microarchitecture</H1>
+<UL>
+<LI>\ref WESEX_FIXED "Fixed-purpose counters"</LI>
+<LI>\ref WESEX_PMC "General-purpose counters"</LI>
+<LI>\ref WESEX_MBOX "Memory controller counters"</LI>
+<LI>\ref WESEX_BBOX "Home Agent counters"</LI>
+<LI>\ref WESEX_RBOX "Crossbar router counters"</LI>
+<LI>\ref WESEX_CBOX "Last Level cache counters"</LI>
+<LI>\ref WESEX_SBOX "LLC-to-QPI interface counters"</LI>
+<LI>\ref WESEX_WBOX "Power control unit counters"</LI>
+<LI>\ref WESEX_UBOX "Uncore management counters"</LI>
+</UL>
+
+<H1>Counters available for each hardware thread</H1>
+\anchor WESEX_FIXED
+<H2>Fixed-purpose counters</H2>
+<P>Since the Core2 microarchitecture, Intel® provides a set of fixed-purpose counters. Each can measure only one specific event.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>FIXC0</TD>
+  <TD>INSTR_RETIRED_ANY</TD>
+</TR>
+<TR>
+  <TD>FIXC1</TD>
+  <TD>CPU_CLK_UNHALTED_CORE</TD>
+</TR>
+<TR>
+  <TD>FIXC2</TD>
+  <TD>CPU_CLK_UNHALTED_REF</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>anythread</TD>
+  <TD>N</TD>
+  <TD>Set bit 2+(index*4) in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit (index*4) in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor WESEX_PMC
+<H2>General-purpose counters</H2>
+<P>The Intel® Westmere EX microarchitecture provides 4 general-purpose counters consisting of a config and a counter register. They are core-local, hence each hardware thread has its own set of general-purpose counters.</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>PMC0</TD>
+  <TD>*</TD>
+</TR>
+<TR><TD>PMC1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>PMC3</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>kernel</TD>
+  <TD>N</TD>
+  <TD>Set bit 17 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+<H3>Special handling for events</H3>
+<P>The Intel® Westmere EX microarchitecture provides measuring of offcore events in PMC counters. Therefore the stream of offcore events must be filtered using the OFFCORE_RESPONSE registers. The Intel® Westmere EX microarchitecture has two of those registers. LIKWID defines some events that perform the filtering according to the event name. Although there are many bitmasks possible, LIKWID natively provides only the ones with response type ANY. Own filtering can be applied with  [...]
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>match0</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Input value masked with 0xFF and written to bits 0-7 in the OFFCORE_RESPONSE register</TD>
+  <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and the event files at <A HREF="https://download.01.org/perfmon/WSM-EX">https://download.01.org/perfmon/WSM-EX</A>.</TD>
+</TR>
+<TR>
+  <TD>match1</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Input value masked with 0xF7 and written to bits 8-15 in the OFFCORE_RESPONSE register</TD>
+  <TD>Check the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual, Vol. 3, Chapter Performance Monitoring</A> and the event files at <A HREF="https://download.01.org/perfmon/WSM-EX">https://download.01.org/perfmon/WSM-EX</A>.</TD>
+</TR>
+</TABLE>
+
+<H1>Counters available for one hardware thread per socket</H1>
+\anchor WESEX_MBOX
+<H2>MBOX counters</H2>
+<P>The Intel® Westmere EX microarchitecture provides measurements of the memory controllers in the uncore. The description from Intel®:<BR>
+<I>The memory controller interfaces to the Intel® 7500 Scalable Memory Buffers and translates read and write commands into specific Intel® Scalable Memory Interconnect (Intel® SMI) operations. Intel SMI is based on the FB-DIMM architecture, but the Intel 7500 Scalable Memory Buffer is not an AMB2 device and has significant exceptions to the FB-DIMM2 architecture. The memory controller also provides a variety of RAS features, such as ECC, memory scrubbing, thermal throttling,  [...]
+</I><BR>
+The Intel® Westmere EX microarchitecture has 2 memory controllers, each with 6 general-purpose counters. They are exposed through the MSR interface to the operating system kernel. The MBOX and RBOX setup routines are taken from Likwid 3, they are not as flexible as the newer setup routines but programming of the MBOXes and RBOXes is tedious for Westmere EX. It is not possible to specify a FVID (Fill Victim Index) for the MBOX or IPERF option for RBOXes.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>MBOX<0,1>C0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>MBOX<0,1>C1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>MBOX<0,1>C2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>MBOX<0,1>C3</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>MBOX<0,1>C4</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>MBOX<0,1>C5</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Special handling for events</H3>
+<P>For the events DRAM_CMD_ALL and DRAM_CMD_ILLEGAL two counter options are available:</P>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>match0</TD>
+  <TD>34 bit address</TD>
+  <TD>Set bits 0-33 in MSR_M<0,1>_PMON_ADDR_MATCH register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>mask0</TD>
+  <TD>60 bit hex value</TD>
+  <TD>Extract bits 6-33 from address and set bits 0-27 in MSR_M<0,1>_PMON_ADDR_MASK register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+<P>For the events THERM_TRP_DN and THERM_TRP_UP you cannot measure events for all and one specific DIMM simultaneously because they program the same filter register MSR_M<0,1>_PMON_MSC_THR and have contrary configurations.</P>
+<P>Although the events FVC_EV<0-3> are available to measure multiple memory events, some overlap and do not allow simultaneous measuring. That's because they program the same filter register MSR_M<0,1>_PMON_ZDP and have contrary configurations. One example is the FVC_EV<0-3>_BBOX_CMDS_READS and FVC_EV<0-3>_BBOX_CMDS_WRITES events that measure memory reads or writes but cannot be measured at the same time.</P>
+
+
+
+\anchor WESEX_BBOX
+<H2>BBOX counters</H2>
+<P>The Intel® Westmere EX microarchitecture provides measurements of the Home Agent in the uncore. The description from Intel®:<BR>
+<I>The B-Box is responsible for the protocol side of memory interactions, including coherent and non-coherent home agent protocols (as defined in the Intel® QuickPath Interconnect Specification). Additionally, the B-Box is responsible for ordering memory reads/writes to a given address such that the M-Box does not have to perform this conflict checking. All requests for memory attached to the coupled M-Box must first be ordered through the B-Box.
+</I><BR>
+The memory traffic in an Intel® Westmere EX system is controlled by the Home Agents. Each MBOX has a corresponding BBOX. Each BBOX offers 4 general-purpose counters. They are exposed through the MSR interface to the operating system kernel.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>BBOX<0,1>C0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>BBOX<0,1>C1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>BBOX<0,1>C2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>BBOX<0,1>C3</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Special handling for events</H3>
+<P>For the matching events MSG_IN_MATCH, MSG_ADDR_IN_MATCH, MSG_OPCODE_ADDR_IN_MATCH, MSG_OPCODE_IN_MATCH, MSG_OPCODE_OUT_MATCH, MSG_OUT_MATCH, OPCODE_ADDR_IN_MATCH, OPCODE_IN_MATCH, OPCODE_OUT_MATCH and ADDR_IN_MATCH two counter options are available:</P>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR><TD>match0</TD>
+  <TD>60 bit hex value</TD>
+  <TD>Set bits 0-59 in MSR_B<0,1>_PMON_MATCH register</TD>
+  <TD>For register layout and valid settings see <A HREF="http://www.intel.com/content/www/us/en/processors/xeon/xeon-e7-family-uncore-performance-programming-guide.html">Intel® Xeon® Processor E7 Family uncore Performance Monitoring Guide</A></TD>
+</TR>
+<TR>
+  <TD>mask0</TD>
+  <TD>60 bit hex value</TD>
+  <TD>Set bits 0-59 in MSR_B<0,1>_PMON_MASK register</TD>
+  <TD>For register layout and valid settings see <A HREF="http://www.intel.com/content/www/us/en/processors/xeon/xeon-e7-family-uncore-performance-programming-guide.html">Intel® Xeon® Processor E7 Family uncore Performance Monitoring Guide</A></TD>
+</TR>
+</TABLE>
+
+\anchor WESEX_RBOX
+<H2>RBOX counters</H2>
+<P>The Intel® Westmere EX microarchitecture provides measurements of the crossbar router in the uncore. The description from Intel®:<BR>
+<I>The Crossbar Router (R-Box) is a 8 port switch/router implementing the Intel® QuickPath Interconnect Link and Routing layers. The R-Box is responsible for routing and transmitting all intra- and inter-processor communication.
+</I><BR>
+The Intel® Westmere EX microarchitecture has two interfaces to the RBOX although each socket contains only one crossbar router, RBOX0 is the left part and RBOX1 is the right part of the single RBOX. Each RBOX side offers 8 general-purpose counters. They are exposed through the MSR interface to the operating system kernel. The MBOX and RBOX setup routines are taken from Likwid 3, they are not as flexible as the newer setup routines but programming of the MBOXes and RBOXes is tedious f [...]
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>RBOX<0,1>C0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>RBOX<0,1>C1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>RBOX<0,1>C2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>RBOX<0,1>C3</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>RBOX<0,1>C4</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>RBOX<0,1>C5</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>RBOX<0,1>C6</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>RBOX<0,1>C7</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+
+\anchor WESEX_CBOX
+<H2>CBOX counters</H2>
+<P>The Intel® Westmere EX microarchitecture provides measurements of the LLC coherency engine in the uncore. The description from Intel®:<BR>
+<I>For the Intel Xeon Processor 7500 Series, the LLC coherence engine (C-Box) manages the interface between the core and the last level cache (LLC). All core transactions that access the LLC are directed from the core to a C-Box via the ring interconnect. The C-Box is responsible for managing data delivery from the LLC to the requesting core. It is also responsible for maintaining coherence between the cores within the socket that share the LLC; generating snoops and collecting snoop res [...]
+The C-Box is also the gate keeper for all Intel® QuickPath Interconnect (Intel® QPI) messages that originate in the core and is responsible for ensuring that all Intel QuickPath Interconnect messages that pass through the socket’s LLC remain coherent.
+</I><BR>
+The Intel® Westmere EX microarchitecture has 10 CBOX instances. Each CBOX offers 6 general-purpose counters. They are exposed through the MSR interface to the operating system kernel.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>CBOX<0-9>C0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>CBOX<0-9>C1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>CBOX<0-9>C2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>CBOX<0-9>C3</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>CBOX<0-9>C4</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>CBOX<0-9>C5</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>5 bit hex value</TD>
+  <TD>Set bits 24-28 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor WESEX_SBOX
+<H2>SBOX counters</H2>
+<P>The Intel® Westmere EX microarchitecture provides measurements of the LLC-to-QPI interface in the uncore. The description from Intel®:<BR>
+<I>The S-Box represents the interface between the last level cache and the system interface. It manages flow control between the C and R & B-Boxes. The S-Box is broken into system bound (ring to Intel® QPI) and ring bound (Intel® QPI to ring) connections.<BR>
+As such, it shares responsibility with the C-Box(es) as the Intel® QPI caching agent(s). It is responsible for converting C-box requests to Intel® QPI messages (i.e. snoop generation and data response messages from the snoop response) as well as converting/forwarding ring messages to Intel® QPI packets and vice versa.
+</I><BR>
+The Intel® Westmere EX microarchitecture has 2 SBOX instances. Each SBOX offers 4 general-purpose counters. They are exposed through the MSR interface to the operating system kernel.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>SBOX<0,1>C0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>SBOX<0,1>C1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>SBOX<0,1>C2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>SBOX<0,1>C3</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+<H3>Special handling for events</H3>
+<P>Only for the TO_R_PROG_EV events two counter options are available:</P>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>match0</TD>
+  <TD>64 bit hex value</TD>
+  <TD>Set bit 0-63 in MSR_S<0,1>_PMON_MATCH register</TD>
+  <TD>For register layout and valid settings see <A HREF="http://www.intel.com/content/www/us/en/processors/xeon/xeon-e7-family-uncore-performance-programming-guide.html">Intel® Xeon® Processor E7 Family uncore Performance Monitoring Guide</A></TD>
+</TR>
+<TR>
+  <TD>mask0</TD>
+  <TD>39 bit hex value</TD>
+  <TD>Set bit 0-38 in MSR_S<0,1>_PMON_MASK register</TD>
+  <TD>For register layout and valid settings see <A HREF="http://www.intel.com/content/www/us/en/processors/xeon/xeon-e7-family-uncore-performance-programming-guide.html">Intel® Xeon® Processor E7 Family uncore Performance Monitoring Guide</A></TD>
+</TR>
+</TABLE>
+
+\anchor WESEX_WBOX
+<H2>WBOX counters</H2>
+<P>The Intel® Westmere EX microarchitecture provides measurements of the power controller in the uncore. The description from Intel®:<BR>
+<I>The W-Box is the primary Power Controller for the Intel® Xeon® Processor 7500 Series.
+</I><BR>
+The Intel® Westmere EX microarchitecture has one WBOX and it offers 4 general-purpose counters and one fixed counter. They are exposed through the MSR interface to the operating system kernel.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>WBOX0</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>WBOX1</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>WBOX2</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>WBOX3</TD>
+  <TD>*</TD>
+</TR>
+<TR>
+  <TD>WBOXFIX</TD>
+  <TD>UNCORE_CLOCKTICKS</TD>
+</TR>
+</TABLE>
+<H3>Available Options (Only for WBOX<0-3> counters)</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>threshold</TD>
+  <TD>8 bit hex value</TD>
+  <TD>Set bits 24-31 in config register</TD>
+  <TD></TD>
+</TR>
+<TR>
+  <TD>invert</TD>
+  <TD>N</TD>
+  <TD>Set bit 23 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+\anchor WESEX_UBOX
+<H2>UBOX counters</H2>
+<P>The Intel® Westmere EX microarchitecture provides measurements of the system configuration controller in the uncore. The description from Intel®:<BR>
+<I>The U-Box serves as the system configuration controller for the Intel® Xeon® Processor E7 Family.
+</I><BR>
+The Intel® Westmere EX microarchitecture has one UBOX and it offers a single general-purpose counter. It is exposed through the MSR interface to the operating system kernel.
+</P>
+<H3>Counter and events</H3>
+<TABLE>
+<TR>
+  <TH>Counter name</TH>
+  <TH>Event name</TH>
+</TR>
+<TR>
+  <TD>UBOX0</TD>
+  <TD>*</TD>
+</TR>
+</TABLE>
+<H3>Available Options</H3>
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Argument</TH>
+  <TH>Description</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>edgedetect</TD>
+  <TD>N</TD>
+  <TD>Set bit 18 in config register</TD>
+  <TD></TD>
+</TR>
+</TABLE>
+
+*/
diff --git a/doc/bstrlib.txt b/doc/bstrlib.txt
new file mode 100644
index 0000000..d0f02f7
--- /dev/null
+++ b/doc/bstrlib.txt
@@ -0,0 +1,3201 @@
+Better String library
+---------------------
+
+by Paul Hsieh
+
+The bstring library is an attempt to provide improved string processing
+functionality to the C and C++ language.  At the heart of the bstring library
+(Bstrlib for short) is the management of "bstring"s which are a significant
+improvement over '\0' terminated char buffers.
+
+===============================================================================
+
+Motivation
+----------
+
+The standard C string library has serious problems:
+
+    1) Its use of '\0' to denote the end of the string means knowing a
+       string's length is O(n) when it could be O(1).
+    2) It imposes an interpretation for the character value '\0'.
+    3) gets() always exposes the application to a buffer overflow.
+    4) strtok() modifies the string it's parsing and thus may not be usable in
+       programs which are re-entrant or multithreaded.
+    5) fgets has the unusual semantic of ignoring '\0's that occur before
+       '\n's are consumed.
+    6) There is no memory management, and actions performed such as strcpy,
+       strcat and sprintf are common places for buffer overflows.
+    7) strncpy() doesn't '\0' terminate the destination in some cases.
+    8) Passing NULL to C library string functions causes an undefined NULL
+       pointer access.
+    9) Parameter aliasing (overlapping, or self-referencing parameters)
+       within most C library functions has undefined behavior.
+   10) Many C library string function calls take integer parameters with
+       restricted legal ranges.  Parameters passed outside these ranges are
+       not typically detected and cause undefined behavior.
+
+So the desire is to create an alternative string library that does not suffer
+from the above problems and adds in the following functionality:
+
+    1) Incorporate string functionality seen from other languages.
+        a) MID$() - from BASIC
+        b) split()/join() - from Python
+        c) string/char x n - from Perl
+    2) Implement analogs to functions that combine stream IO and char buffers
+       without creating a dependency on stream IO functionality.
+    3) Implement the basic text editor-style functions insert, delete, find,
+       and replace.
+    4) Implement reference based sub-string access (as a generalization of
+       pointer arithmetic.)
+    5) Implement runtime write protection for strings.
+
+There is also a desire to avoid "API-bloat".  So functionality that can be
+implemented trivially in other functionality is omitted.  So there is no
+left$() or right$() or reverse() or anything like that as part of the core
+functionality.
+
+Explaining Bstrings
+-------------------
+
+A bstring is basically a header which wraps a pointer to a char buffer.  Let's
+start with the declaration of a struct tagbstring:
+
+    struct tagbstring {
+        int mlen;
+        int slen;
+        unsigned char * data;
+    };
+
+This definition is considered exposed, not opaque (though it is neither
+necessary nor recommended that low level maintenance of bstrings be performed
+whenever the abstract interfaces are sufficient).  The mlen field (usually)
+describes a lower bound for the memory allocated for the data field.  The
+slen field describes the exact length for the bstring.  The data field is a
+single contiguous buffer of unsigned chars.  Note that the existence of a '\0'
+character in the unsigned char buffer pointed to by the data field does not
+necessarily denote the end of the bstring.
+
+To be a well formed modifiable bstring the mlen field must be at least the
+length of the slen field, and slen must be non-negative.  Furthermore, the
+data field must point to a valid buffer in which access to the first mlen
+characters has been acquired.  So the minimal check for correctness is:
+
+    (slen >= 0 && mlen >= slen && data != NULL)
+
+bstrings returned by bstring functions can be assumed to be either NULL or
+satisfy the above property.  (When bstrings are only readable, the mlen >=
+slen restriction is not required; this is discussed later in this section.)
+A bstring itself is just a pointer to a struct tagbstring:
+
+    typedef struct tagbstring * bstring;
+
+Note that use of the prefix "tag" in struct tagbstring is required to work
+around the inconsistency between C and C++'s struct namespace usage.  This
+definition is also considered exposed.
+
+Bstrlib basically manages bstrings allocated as a header and an associated
+data-buffer.  Since the implementation is exposed, they can also be
+constructed manually.  Functions which mutate bstrings assume that the header
+and data buffer have been malloced; the bstring library may perform free() or
+realloc() on both the header and data buffer of any bstring parameter.
+Functions which return bstring's create new bstrings.  The string memory is
+freed by a bdestroy() call (or using the bstrFree macro).
+
+The following related typedef is also provided:
+
+    typedef const struct tagbstring * const_bstring;
+
+which is also considered exposed.  These are directly bstring compatible (no
+casting required) but are just used for parameters which are meant to be
+non-mutable.  So in general, bstring parameters which are read as input but
+not meant to be modified will be declared as const_bstring, and bstring
+parameters which may be modified will be declared as bstring.  This convention
+is recommended for user written functions as well.
+
+Since bstrings maintain interoperability with C library char-buffer style
+strings, all functions which modify, update or create bstrings also append a
+'\0' character into the position slen + 1.  This trailing '\0' character is
+not required for bstrings input to the bstring functions; this is provided
+solely as a convenience for interoperability with standard C char-buffer
+functionality.
+
+Analogs for the ANSI C string library functions have been created when they
+are necessary, but have also been left out when they are not.  In particular
+there are no functions analogous to fwrite, or puts just for the purposes of
+bstring.  The ->data member of any string is exposed, and therefore can be
+used just as easily as char buffers for C functions which read strings.
+
+For those that wish to hand construct bstrings, the following should be kept
+in mind:
+
+    1) While bstrlib can accept constructed bstrings without terminating
+       '\0' characters, the rest of the C language string library will not
+       function properly on such non-terminated strings.  This is obvious
+       but must be kept in mind.
+    2) If it is intended that a constructed bstring be written to by the
+       bstring library functions then the data portion should be allocated
+       by the malloc function and the slen and mlen fields should be entered
+       properly.  The struct tagbstring header is not reallocated, and only
+       freed by bdestroy.
+    3) Writing arbitrary '\0' characters at various places in the string
+       will not modify its length as perceived by the bstring library
+       functions.  In fact, '\0' is a legitimate non-terminating character
+       for a bstring to contain.
+    4) For read only parameters, bstring functions do not check the mlen.
+       I.e., the minimal correctness requirements are reduced to:
+
+            (slen >= 0 && data != NULL)
+
+Better pointer arithmetic
+-------------------------
+
+One built-in feature of '\0' terminated char * strings, is that it's very easy
+and fast to obtain a reference to the tail of any string using pointer
+arithmetic.  Bstrlib does one better by providing a way to get a reference to
+any substring of a bstring (or any other length delimited block of memory.)
+So rather than just having pointer arithmetic, with bstrlib one essentially
+has segment arithmetic.  This is achieved using the macro blk2tbstr() which
+builds a reference to a block of memory and the macro bmid2tbstr() which
+builds a reference to a segment of a bstring.  Bstrlib also includes
+functions for direct consumption of memory blocks into bstrings, namely
+bcatblk () and blk2bstr ().
+
+One scenario where this can be extremely useful is when a string contains many
+substrings which one would like to pass as read-only reference parameters to
+some string consuming function without the need to allocate entire new
+containers for the string data.  More concretely, imagine parsing a command
+line string whose parameters are space delimited.  This can only be done for
+tails of the string with '\0' terminated char * strings.
+
+Improved NULL semantics and error handling
+------------------------------------------
+
+Unless otherwise noted, if a NULL pointer is passed as a bstring or any other
+detectably illegal parameter, the called function will return with an error
+indicator (either NULL or BSTR_ERR) rather than simply performing a NULL
+pointer access, or having undefined behavior.
+
+To illustrate the value of this, consider the following example:
+
+        strcpy (p = malloc (13 * sizeof (char)), "Hello,");
+        strcat (p, " World");
+
+This is not correct because malloc may return NULL (due to an out of memory
+condition), and the behaviour of strcpy is undefined if either of its
+parameters are NULL.  However:
+
+        bstrcat (p = bfromcstr ("Hello,"), q = bfromcstr (" World"));
+        bdestroy (q);
+
+is well defined, because if either p or q are assigned NULL (indicating a
+failure to allocate memory) both bstrcat and bdestroy will recognize it and
+perform no detrimental action.
+
+Note that it is not necessary to check any of the members of a returned
+bstring for internal correctness (in particular the data member does not need
+to be checked against NULL when the header is non-NULL), since this is
+assured by the bstring library itself.
+
+bStreams
+--------
+
+In addition to the bgets and bread functions, bstrlib can abstract streams
+with a high performance read only stream called a bStream.  In general, the
+idea is to open a core stream (with something like fopen) then pass its
+handle as well as a bNread function pointer (like fread) to the bsopen
+function which will return a handle to an open bStream.  Then the functions
+bsread, bsreadln or bsreadlns can be called to read portions of the stream.
+Finally, the bsclose function is called to close the bStream -- it will
+return a handle to the original (core) stream.  So bStreams, essentially,
+wrap other streams.
+
+The bStreams have two main advantages over the bgets and bread (as well as
+fgets/ungetc) paradigms:
+
+1) Improved functionality via the bunread function which allows a stream to
+   unread characters, giving the bStream stack-like functionality if so
+   desired.
+2) A very high performance bsreadln function.  The C library function fgets()
+   (and the bgets function) can typically be written as a loop on top of
+   fgetc(), thus paying all of the overhead costs of calling fgetc on a per
+   character basis.  bsreadln will read blocks at a time, thus amortizing the
+   overhead of fread calls over many characters at once.
+
+However, clearly bStreams are suboptimal or unusable for certain kinds of
+streams (stdin) or certain usage patterns (a few spotty, or non-sequential
+reads from a slow stream.)  For those situations, using bgets will be more
+appropriate.
+
+The semantics of bStreams allows practical construction of layerable data
+streams.  What this means is that by writing a bNread compatible function on
+top of a bStream, one can construct a new bStream on top of it.  This can be
+useful for writing multi-pass parsers that don't actually read the entire
+input more than once and don't require the use of intermediate storage.
+
+Aliasing
+--------
+
+Aliasing occurs when a function is given two parameters which point to data
+structures which overlap in the memory they occupy.  While this does not
+disturb read only functions, for many libraries this can make functions that
+write to these memory locations malfunction.  This is a common problem of the
+C standard library and especially the string functions in the C standard
+library.
+
+The C standard string library is entirely char by char oriented (as is
+bstring) which makes conforming implementations alias safe for some
+scenarios.  However no actual detection of aliasing is typically performed,
+so it is easy to find cases where the aliasing will cause anomalous or
+undesirable behaviour (consider: strcat (p, p).)  The C99 standard includes
+the "restrict" pointer modifier which allows the compiler to document and
+assume a no-alias condition on usage.  However, only the most trivial cases
+can be caught (if at all) by the compiler at compile time, and thus there is
+no actual enforcement of non-aliasing.
+
+Bstrlib, by contrast, permits aliasing and is completely aliasing safe, in
+the C99 sense of aliasing.  That is to say, under the assumption that
+pointers of incompatible types from distinct objects can never alias, bstrlib
+is completely aliasing safe.  (In practice this means that the data buffer
+portion of any bstring and header of any bstring are assumed to never alias.)
+With the exception of the reference building macros, the library behaves as
+if all read-only parameters are first copied and replaced by temporary
+non-aliased parameters before any writing to any output bstring is performed
+(though actual copying is extremely rarely ever done.)
+
+Besides being a useful safety feature, bstring searching/comparison
+functions can improve to O(1) execution when aliasing is detected.
+
+Note that aliasing detection and handling code in Bstrlib is generally
+extremely cheap.  There is almost never any appreciable performance penalty
+for using aliased parameters.
+
+Reentrancy
+----------
+
+Nearly every function in Bstrlib is a leaf function, and is completely
+reentrant with the exception of writing to common bstrings.  The split
+functions which use a callback mechanism require only that the source string
+not be destroyed by the callback function unless the callback function returns
+with an error status (note that Bstrlib functions which return an error do
+not modify the string in any way.)  The string can in fact be modified by the
+callback and the behaviour is deterministic.  See the documentation of the
+various split functions for more details.
+
+Undefined scenarios
+-------------------
+
+One of the basic important premises for Bstrlib is not to increase the
+propagation of undefined situations from parameters that are otherwise legal
+in and of themselves.  In particular, except for extremely marginal cases, usages
+of bstrings that use the bstring library functions alone cannot lead to any
+undefined action.  But due to C/C++ language and library limitations, there
+is no way to define a non-trivial library that is completely without
+undefined operations.  All such possible undefined operations are described
+below:
+
+1) bstrings or struct tagbstrings that are not explicitly initialized cannot
+   be passed as a parameter to any bstring function.
+2) The members of the NULL bstring cannot be accessed directly.  (Though all
+   APIs and macros detect the NULL bstring.)
+3) A bstring whose data member has not been obtained from a malloc or
+   compatible call and which is write accessible passed as a writable
+   parameter will lead to undefined results.  (i.e., do not writeAllow any
+   constructed bstrings unless the data portion has been obtained from the
+   heap.)
+4) If the headers of two strings alias but are not identical (which can only
+   happen via a defective manual construction), then passing them to a
+   bstring function in which one is writable is not defined.
+5) If the mlen member is larger than the actual accessible length of the data
+   member for a writable bstring, or if the slen member is larger than the
+   readable length of the data member for a readable bstring, then the
+   corresponding bstring operations are undefined.
+6) Any bstring definition whose header or accessible data portion has been
+   assigned to inaccessible or otherwise illegal memory clearly cannot be
+   acted upon by the bstring library in any way.
+7) Destroying the source of an incremental split from within the callback
+   and not returning with a negative value (indicating that it should abort)
+   will lead to undefined behaviour.  (Though *modifying* or adjusting the
+   state of the source data, even if those modifications fail within the
+   bstrlib API, has well defined behavior.)
+8) Modifying a bstring which is write protected by direct access has
+   undefined behavior.
+
+While this may seem like a long list, with the exception of invalid uses of
+the writeAllow macro, and source destruction during an iterative split
+without an accompanying abort, no usage of the bstring API alone can cause
+any undefined scenario to occur.  I.e., the policy of restricting usage of
+bstrings to the bstring API can significantly reduce the risk of runtime
+errors (in practice it should eliminate them) related to string manipulation
+due to undefined action.
+
+C++ wrapper
+-----------
+
+A C++ wrapper has been created to enable bstring functionality for C++ in the
+most natural (for C++ programmers) way possible.  The mandate for the C++
+wrapper is different from the base C bstring library.  Since the C++ language
+has far more abstracting capabilities, the CBString structure is considered
+fully abstracted -- i.e., hand generated CBStrings are not supported (though
+conversion from a struct tagbstring is allowed) and all detectable errors are
+manifest as thrown exceptions.
+
+- The C++ class definitions are all under the namespace Bstrlib.  bstrwrap.h
+  enables this namespace (with a using namespace Bstrlib; directive at the
+  end) unless the macro BSTRLIB_DONT_ASSUME_NAMESPACE has been defined before
+  it is included.
+
+- Erroneous accesses result in an exception being thrown.  The exception
+  parameter is of type "struct CBStringException" which is derived from
+  std::exception if STL is used.  A verbose description of the error message
+  can be obtained from the what() method.
+
+- CBString is a C++ structure derived from a struct tagbstring.  An address
+  of a CBString cast to a bstring must not be passed to bdestroy.  The bstring
+  C API has been made C++ safe and can be used directly in a C++ project.
+
+- It includes constructors which can take a char, '\0' terminated char
+  buffer, tagbstring, (char, repeat-value), a length delimited buffer or a
+  CBStringList to initialize it.
+
+- Concatenation is performed with the + and += operators.  Comparisons are
+  done with the ==, !=, <, >, <= and >= operators.  Note that == and != use
+  the biseq call, while <, >, <= and >= use bstrcmp.
+
+- CBString's can be directly cast to const character buffers.
+
+- CBString's can be directly cast to double, float, int or unsigned int so
+  long as the CBStrings are decimal representations of those types (otherwise
+  an exception will be thrown).  Converting the other way should be done with
+  the format(a) method(s).
+
+- CBString contains the length, character and [] accessor methods.  The
+  character and [] accessors are aliases of each other.  If the bounds for
+  the string are exceeded, an exception is thrown.  To avoid the overhead for
+  this check, first cast the CBString to a (const char *) and use [] to
+  dereference the array as normal.  Note that the character and [] accessor
+  methods allow both reading and writing of individual characters.
+
+- The methods: format, formata, find, reversefind, findcaseless,
+  reversefindcaseless, midstr, insert, insertchrs, replace, findreplace,
+  findreplacecaseless, remove, findchr, nfindchr, alloc, toupper, tolower,
+  gets, read are analogous to the functions that can be found in the C API.
+
+- The caselessEqual and caselessCmp methods are analogous to biseqcaseless
+  and bstricmp functions respectively.
+
+- Note that just like the bformat function, the format and formata methods do
+  not automatically cast CBStrings into char * strings for "%s"-type
+  substitutions:
+
+    CBString w("world");
+    CBString h("Hello");
+    CBString hw;
+
+    /* The casts are necessary */
+    hw.format ("%s, %s", (const char *)h, (const char *)w);
+
+- The methods trunc and repeat have been added instead of using pattern.
+
+- ltrim, rtrim and trim methods have been added.  These remove characters
+  from a given character string set (defaulting to the whitespace characters)
+  from either the left, right or both ends of the CBString, respectively.
+
+- The method setsubstr is also analogous in functionality to bsetstr, except
+  that it cannot be passed NULL.  Instead the method fill and the fill-style
+  constructor have been supplied to enable this functionality.
+
+- The writeprotect(), writeallow() and iswriteprotected() methods are
+  analogous to the bwriteprotect(), bwriteallow() and biswriteprotected()
+  macros in the C API.  Write protection semantics in CBString are stronger
+  than with the C API in that indexed character assignment is checked for
+  write protection.  However, unlike with the C API, a write protected
+  CBString can be destroyed by the destructor.
+
+- CBStream is a C++ structure which wraps a struct bStream (it's not derived
+  from it, since destruction is slightly different).  It is constructed by
+  passing in a bNread function pointer and a stream parameter cast to void *.
+  This structure includes methods for detecting eof, setting the buffer
+  length, reading the whole stream or reading entries line by line or block
+  by block, an unread function, and a peek function.
+
+- If STL is available, the CBStringList structure is derived from a vector of
+  CBString with various split methods.  The split method has been overloaded
+  to accept either a character or CBString as the second parameter (when the
+  split parameter is a CBString any character in that CBString is used as a
+  separator).  The splitstr method takes a CBString as a substring separator.
+  Joins can be performed via a CBString constructor which takes a
+  CBStringList as a parameter, or just using the CBString::join() method.
+
+- If there is proper support for std::iostreams, then the >> and << operators
+  and the getline() function have been added (with semantics the same as
+  those for std::string).
+
+Multithreading
+--------------
+
+A mutable bstring is kind of analogous to a small (two entry) linked list
+allocated by malloc, with all aliasing completely under programmer control.
+I.e., manipulation of one bstring will never affect any other distinct
+bstring unless explicitly constructed to do so by the programmer via hand
+construction or via building a reference.  Bstrlib also does not use any
+static or global storage, so there are no hidden unremovable race conditions.
+Bstrings are also clearly not inherently thread local.  So just like
+char *'s, bstrings can be passed around from thread to thread and shared and
+so on, so long as modifications to a bstring correspond to some kind of
+exclusive access lock as should be expected (or if the bstring is read-only,
+which can be enforced by bstring write protection) for any sort of shared
+object in a multithreaded environment.
+
+Bsafe module
+------------
+
+For convenience, a bsafe module has been included.  The idea is that if this
+module is included, inadvertent usage of the most dangerous C functions will
+be overridden and lead to an immediate run time abort.  Of course, it should
+be emphasized that usage of this module is completely optional.  The
+intention is essentially to provide an option for creating project safety
+rules which can be enforced mechanically rather than socially.  This is
+useful for larger, or open development projects where it's more difficult to
+enforce social rules or "coding conventions".
+
+Problems not solved
+-------------------
+
+Bstrlib is written for the C and C++ languages, which have inherent weaknesses
+that cannot be easily solved:
+
+1. Memory leaks:  Forgetting to call bdestroy on a bstring that is about to be
+   unreferenced leaks memory, just as forgetting to call free on a heap
+   buffer that is about to be unreferenced does.  Bstrlib itself, however,
+   is leak free.
+2. Read before write usage:  In C, declaring an auto bstring does not
+   automatically fill it with legal/valid contents.  This problem has been
+   somewhat mitigated in C++.  (The bstrDeclare and bstrFree macros from
+   bstraux can be used to help mitigate this problem.)
+
+Other problems not addressed:
+
+3. Built-in mutex usage to automatically avoid all bstring internal race
+   conditions in multitasking environments: The problem with trying to
+   implement such things at this low a level is that it is typically more
+   efficient to use locks in higher level primitives. There is also no
+   platform independent way to implement locks or mutexes.
+4. Unicode/widecharacter support.
+
+Note that except for spotty support of wide characters, the default C
+standard library does not address any of these problems either.
+
+Configurable compilation options
+--------------------------------
+
+All configuration options are meant solely for the purpose of compiler
+compatibility.  Configuration options are not meant to change the semantics
+or capabilities of the library, except where it is unavoidable.
+
+Since some C++ compilers don't include the Standard Template Library and some
+have the options of disabling exception handling, a number of macros can be
+used to conditionally compile support for each of these:
+
+BSTRLIB_CAN_USE_STL
+
+  - defining this will enable the use of the Standard Template Library.
+    Defining BSTRLIB_CAN_USE_STL overrides the BSTRLIB_CANNOT_USE_STL macro.
+
+BSTRLIB_CANNOT_USE_STL
+
+  - defining this will disable the use of the Standard Template Library.
+    Defining BSTRLIB_CAN_USE_STL overrides the BSTRLIB_CANNOT_USE_STL macro.
+
+BSTRLIB_CAN_USE_IOSTREAM
+
+  - defining this will enable the use of streams from class std.  Defining
+    BSTRLIB_CAN_USE_IOSTREAM overrides the BSTRLIB_CANNOT_USE_IOSTREAM macro.
+
+BSTRLIB_CANNOT_USE_IOSTREAM
+
+  - defining this will disable the use of streams from class std.  Defining
+    BSTRLIB_CAN_USE_IOSTREAM overrides the BSTRLIB_CANNOT_USE_IOSTREAM macro.
+
+BSTRLIB_THROWS_EXCEPTIONS
+
+  - defining this will enable the exception handling within bstring.
+    Defining BSTRLIB_THROWS_EXCEPTIONS overrides the
+    BSTRLIB_DOESNT_THROW_EXCEPTIONS macro.
+
+BSTRLIB_DOESNT_THROW_EXCEPTIONS
+
+  - defining this will disable the exception handling within bstring.
+    Defining BSTRLIB_THROWS_EXCEPTIONS overrides the
+    BSTRLIB_DOESNT_THROW_EXCEPTIONS macro.
+
+Note that these macros must be defined consistently throughout all modules
+that use CBStrings including bstrwrap.cpp.
+
+Some older C compilers do not support functions such as vsnprintf.  This is
+handled by the following macro variables:
+
+BSTRLIB_NOVSNP
+
+  - defining this indicates that the compiler does not support vsnprintf.
+    This will cause bformat and bformata to not be declared.  Note that
+    for some compilers, such as Turbo C, this is set automatically.
+    Defining BSTRLIB_NOVSNP overrides the BSTRLIB_VSNP_OK macro.
+
+BSTRLIB_VSNP_OK
+
+  - defining this will disable the autodetection of compilers that do not
+    support vsnprintf.
+    Defining BSTRLIB_NOVSNP overrides the BSTRLIB_VSNP_OK macro.
+
+Semantic compilation options
+----------------------------
+
+Bstrlib comes with very few compilation options for changing the semantics of
+the library.  These are described below.
+
+BSTRLIB_DONT_ASSUME_NAMESPACE
+
+  - Defining this before including bstrwrap.h will disable the automatic
+    enabling of the Bstrlib namespace for the C++ declarations.
+
+BSTRLIB_DONT_USE_VIRTUAL_DESTRUCTOR
+
+  - Defining this will make the CBString destructor non-virtual.
+
+BSTRLIB_MEMORY_DEBUG
+
+  - Defining this will cause the bstrlib modules bstrlib.c and bstrwrap.cpp
+    to invoke a #include "memdbg.h".  memdbg.h has to be supplied by the user.
+
+Note that these macros must be defined consistently throughout all modules
+that use bstrings or CBStrings including bstrlib.c, bstraux.c and
+bstrwrap.cpp.
+
+===============================================================================
+
+Files
+-----
+
+bstrlib.c       - C implementation of bstring functions.
+bstrlib.h       - C header file for bstring functions.
+bstraux.c       - C example that implements trivial additional functions.
+bstraux.h       - C header for bstraux.c
+bstest.c        - C unit/regression test for bstrlib.c
+
+bstrwrap.cpp    - C++ implementation of CBString.
+bstrwrap.h      - C++ header file for CBString.
+test.cpp        - C++ unit/regression test for bstrwrap.cpp
+
+bsafe.c         - C runtime stubs to abort usage of unsafe C functions.
+bsafe.h         - C header file for bsafe.c functions.
+
+C projects need only include bstrlib.h and compile/link bstrlib.c to use the
+bstring library.  C++ projects need to additionally include bstrwrap.h and
+compile/link bstrwrap.cpp.  For both, there may be a need to make choices
+about feature configuration as described in the "Configurable compilation
+options" section above.
+
+Other files that are included in this archive are:
+
+license.txt     - The BSD license for Bstrlib
+gpl.txt         - The GPL version 2
+security.txt    - A security statement useful for auditing Bstrlib
+porting.txt     - A guide to porting Bstrlib
+bstrlib.txt     - This file
+
+===============================================================================
+
+The functions
+-------------
+
+    extern bstring bfromcstr (const char * str);
+
+    Take a standard C library style '\0' terminated char buffer and generate
+    a bstring with the same contents as the char buffer.  If an error occurs
+    NULL is returned.
+
+    So for example:
+
+    bstring b = bfromcstr ("Hello");
+    if (!b) {
+        fprintf (stderr, "Out of memory");
+    } else {
+        puts ((char *) b->data);
+    }
+
+    ..........................................................................
+
+    extern bstring bfromcstralloc (int mlen, const char * str);
+
+    Create a bstring which contains the contents of the '\0' terminated
+    char * buffer str.  The memory buffer backing the bstring is at least
+    mlen characters in length.  If an error occurs NULL is returned.
+
+    So for example:
+
+    bstring b = bfromcstralloc (64, someCstr);
+    if (b) b->data[63] = 'x';
+
+    The idea is that this will set the 64th character of b to 'x' if it is at
+    least 64 characters long otherwise do nothing.  And we know this is well
+    defined so long as b was successfully created, since it will have been
+    allocated with at least 64 characters.
+
+    ..........................................................................
+
+    extern bstring blk2bstr (const void * blk, int len);
+
+    Create a bstring whose contents are described by the contiguous buffer
+    pointed to by blk with a length of len bytes.  Note that this function
+    creates a copy of the data in blk, rather than simply referencing it.
+    Compare with the blk2tbstr macro.  If an error occurs NULL is returned.
+
+    ..........................................................................
+
+    extern char * bstr2cstr (const_bstring s, char z);
+
+    Create a '\0' terminated char buffer which contains the contents of the
+    bstring s, except that any contained '\0' characters are converted to the
+    character in z.  This returned value should be freed with bcstrfree(), by
+    the caller.  If an error occurs NULL is returned.
+
+    ..........................................................................
+
+    extern int bcstrfree (char * s);
+
+    Frees a C-string generated by bstr2cstr ().  This is normally unnecessary
+    since it just wraps a call to free (), however, if malloc () and free ()
+    have been redefined as macros within the bstrlib module (via macros in
+    the memdbg.h backdoor) with some difference in behaviour from the std
+    library functions, then this allows a correct way of freeing the memory
+    that allows higher level code to be independent from these macro
+    redefinitions.
+
+    ..........................................................................
+
+    extern bstring bstrcpy (const_bstring b1);
+
+    Make a copy of the passed in bstring.  The copied bstring is returned if
+    there is no error, otherwise NULL is returned.
+
+    ..........................................................................
+
+    extern int bassign (bstring a, const_bstring b);
+
+    Overwrite the bstring a with the contents of bstring b.  Note that the
+    bstring a must be a well defined and writable bstring.  If an error
+    occurs BSTR_ERR is returned and a is not overwritten.
+
+    ..........................................................................
+
+    int bassigncstr (bstring a, const char * str);
+
+    Overwrite the string a with the contents of char * string str.  Note that
+    the bstring a must be a well defined and writable bstring.  If an error
+    occurs BSTR_ERR is returned and a may be partially overwritten.
+
+    ..........................................................................
+
+    int bassignblk (bstring a, const void * s, int len);
+
+    Overwrite the string a with the contents of the block (s, len).  Note that
+    the bstring a must be a well defined and writable bstring.  If an error
+    occurs BSTR_ERR is returned and a is not overwritten.
+
+    ..........................................................................
+
+    extern int bassignmidstr (bstring a, const_bstring b, int left, int len);
+
+    Overwrite the bstring a with the middle of contents of bstring b
+    starting from position left and running for a length len.  left and
+    len are clamped to the ends of b as with the function bmidstr.  Note that
+    the bstring a must be a well defined and writable bstring.  If an error
+    occurs BSTR_ERR is returned and a is not overwritten.
+
+    ..........................................................................
+
+    extern bstring bmidstr (const_bstring b, int left, int len);
+
+    Create a bstring which is the substring of b starting from position left
+    and running for a length len (clamped by the end of the bstring b.)  If
+    there was no error, the value of this constructed bstring is returned
+    otherwise NULL is returned.
+
+    ..........................................................................
+
+    extern int bdelete (bstring s1, int pos, int len);
+
+    Removes characters from pos to pos+len-1 and shifts the tail of the
+    bstring starting from pos+len to pos.  len must be positive for this call
+    to have any effect.  The section of the bstring described by (pos, len)
+    is clamped to boundaries of the bstring s1.  The value BSTR_OK is returned
+    if the operation is successful, otherwise BSTR_ERR is returned.
+
+    ..........................................................................
+
+    extern int bconcat (bstring b0, const_bstring b1);
+
+    Concatenate the bstring b1 to the end of bstring b0.  The value BSTR_OK
+    is returned if the operation is successful, otherwise BSTR_ERR is
+    returned.
+
+    ..........................................................................
+
+    extern int bconchar (bstring b, char c);
+
+    Concatenate the character c to the end of bstring b.  The value BSTR_OK
+    is returned if the operation is successful, otherwise BSTR_ERR is
+    returned.
+
+    ..........................................................................
+
+    extern int bcatcstr (bstring b, const char * s);
+
+    Concatenate the char * string s to the end of bstring b.  The value
+    BSTR_OK is returned if the operation is successful, otherwise BSTR_ERR is
+    returned.
+
+    ..........................................................................
+
+    extern int bcatblk (bstring b, const void * s, int len);
+
+    Concatenate a fixed length buffer (s, len) to the end of bstring b.  The
+    value BSTR_OK is returned if the operation is successful, otherwise
+    BSTR_ERR is returned.
+
+    ..........................................................................
+
+    extern int biseq (const_bstring b0, const_bstring b1);
+
+    Compare the bstring b0 and b1 for equality.  If the bstrings differ, 0
+    is returned, if the bstrings are the same, 1 is returned, if there is an
+    error, -1 is returned.  If the length of the bstrings are different, this
+    function has O(1) complexity.  Contained '\0' characters are not treated
+    as a termination character.
+
+    Note that the semantics of biseq are not completely compatible with
+    bstrcmp because of its different treatment of the '\0' character.
+
+    ..........................................................................
+
+    extern int bisstemeqblk (const_bstring b, const void * blk, int len);
+
+    Compare beginning of bstring b with a block of memory of length len for
+    equality.  If the beginning of b differs from the memory block (or if b
+    is too short), 0 is returned, if they are the same, 1 is returned,
+    if there is an error, -1 is returned.
+
+    ..........................................................................
+
+    extern int biseqcaseless (const_bstring b0, const_bstring b1);
+
+    Compare two bstrings for equality without differentiating between case.
+    If the bstrings differ other than in case, 0 is returned, if the bstrings
+    are the same, 1 is returned, if there is an error, -1 is returned.  If
+    the length of the bstrings are different, this function is O(1).  '\0'
+    termination characters are not treated in any special way.
+
+    ..........................................................................
+
+    extern int bisstemeqcaselessblk (const_bstring b0, const void * blk, int len);
+
+    Compare beginning of bstring b0 with a block of memory of length len
+    without differentiating between case for equality.  If the beginning of b0
+    differs from the memory block other than in case (or if b0 is too short),
+    0 is returned, if the bstrings are the same, 1 is returned, if there is an
+    error, -1 is returned.
+
+    ..........................................................................
+
+    extern int biseqcstr (const_bstring b, const char *s);
+
+    Compare the bstring b and char * string s.  The C string s must be '\0'
+    terminated at exactly the length of the bstring b, and the contents
+    between the two must be identical with the bstring b with no '\0'
+    characters for the two contents to be considered equal.  This is
+    equivalent to the condition that their current contents will always be
+    equal when comparing them in the same format after converting one or the
+    other.  If they are equal 1 is returned, if they are unequal 0 is
+    returned and if there is a detectable error BSTR_ERR is returned.
+
+    ..........................................................................
+
+    extern int biseqcstrcaseless (const_bstring b, const char *s);
+
+    Compare the bstring b and char * string s.  The C string s must be '\0'
+    terminated at exactly the length of the bstring b, and the contents
+    between the two must be identical except for case with the bstring b with
+    no '\0' characters for the two contents to be considered equal.  This is
+    equivalent to the condition that their current contents will always be
+    equal ignoring case when comparing them in the same format after
+    converting one or the other.  If they are equal, except for case, 1 is
+    returned, if they are unequal regardless of case 0 is returned and if
+    there is a detectable error BSTR_ERR is returned.
+
+    ..........................................................................
+
+    extern int bstrcmp (const_bstring b0, const_bstring b1);
+
+    Compare the bstrings b0 and b1 for ordering.  If there is an error,
+    SHRT_MIN is returned, otherwise a value less than or greater than zero,
+    indicating that the bstring pointed to by b0 is lexicographically less
+    than or greater than the bstring pointed to by b1 is returned.  If the
+    bstring lengths are unequal but the characters up until the length of the
+    shorter are equal then a value less than, or greater than zero,
+    indicating that the bstring pointed to by b0 is shorter or longer than the
+    bstring pointed to by b1 is returned.  0 is returned if and only if the
+    two bstrings are the same.  If the length of the bstrings are different,
+    this function is O(n).  Like its standard C library counterpart, the
+    comparison does not proceed past any '\0' termination characters
+    encountered.
+
+    The seemingly odd error return value, merely provides slightly more
+    granularity than the undefined situation given in the C library function
+    strcmp.  The function otherwise behaves very much like strcmp().
+
+    Note that the semantics of bstrcmp are not completely compatible with
+    biseq because of its different treatment of the '\0' termination
+    character.
+
+    ..........................................................................
+
+    extern int bstrncmp (const_bstring b0, const_bstring b1, int n);
+
+    Compare the bstrings b0 and b1 for ordering for at most n characters.  If
+    there is an error, SHRT_MIN is returned, otherwise a value is returned as
+    if b0 and b1 were first truncated to at most n characters then bstrcmp
+    was called with these new bstrings as parameters.  If the length of the
+    bstrings are different, this function is O(n).  Like its standard C
+    library counterpart, the comparison does not proceed past any '\0'
+    termination characters encountered.
+
+    The seemingly odd error return value, merely provides slightly more
+    granularity than the undefined situation given in the C library function
+    strncmp.  The function otherwise behaves very much like strncmp().
+
+    ..........................................................................
+
+    extern int bstricmp (const_bstring b0, const_bstring b1);
+
+    Compare two bstrings without differentiating between case.  The return
+    value is the difference of the values of the characters where the two
+    bstrings first differ, otherwise 0 is returned indicating that the
+    bstrings are equal.  If the lengths are different, then a difference from
+    0 is given, but if the first extra character is '\0', then it is taken to
+    be the value UCHAR_MAX+1.
+
+    ..........................................................................
+
+    extern int bstrnicmp (const_bstring b0, const_bstring b1, int n);
+
+    Compare two bstrings without differentiating between case for at most n
+    characters.  If the position where the two bstrings first differ is
+    before the nth position, the return value is the difference of the values
+    of the characters, otherwise 0 is returned.  If the lengths are different
+    and less than n characters, then a difference from 0 is given, but if the
+    first extra character is '\0', then it is taken to be the value
+    UCHAR_MAX+1.
+
+    ..........................................................................
+
+    extern int bdestroy (bstring b);
+
+    Deallocate the bstring passed.  Passing NULL in as a parameter will have
+    no effect.  Note that both the header and the data portion of the bstring
+    will be freed.  No other bstring function which modifies one of its
+    parameters will free or reallocate the header.  Because of this, in
+    general, bdestroy cannot be called on any declared struct tagbstring even
+    if it is not write protected.  A bstring which is write protected cannot
+    be destroyed via the bdestroy call.  Any attempt to do so will result in
+    no action taken, and BSTR_ERR will be returned.
+
+    Note to C++ users: Passing in a CBString cast to a bstring will lead to
+    undefined behavior (free will be called on the header, rather than the
+    CBString destructor.)  Instead just use the ordinary C++ language
+    facilities to dealloc a CBString.
+
+    ..........................................................................
+
+    extern int binstr (const_bstring s1, int pos, const_bstring s2);
+
+    Search for the bstring s2 in s1 starting at position pos and looking in a
+    forward (increasing) direction.  If it is found then it returns with the
+    first position after pos where it is found, otherwise it returns BSTR_ERR.
+    The algorithm used is brute force; O(m*n).
+
+    ..........................................................................
+
+    extern int binstrr (const_bstring s1, int pos, const_bstring s2);
+
+    Search for the bstring s2 in s1 starting at position pos and looking in a
+    backward (decreasing) direction.  If it is found then it returns with the
+    first position after pos where it is found, otherwise return BSTR_ERR.
+    Note that the current position at pos is tested as well -- so to be
+    disjoint from a previous forward search it is recommended that the
+    position be backed up (decremented) by one position.  The algorithm used
+    is brute force; O(m*n).
+
+    ..........................................................................
+
+    extern int binstrcaseless (const_bstring s1, int pos, const_bstring s2);
+
+    Search for the bstring s2 in s1 starting at position pos and looking in a
+    forward (increasing) direction but without regard to case.  If it is
+    found then it returns with the first position after pos where it is
+    found, otherwise it returns BSTR_ERR. The algorithm used is brute force;
+    O(m*n).
+
+    ..........................................................................
+
+    extern int binstrrcaseless (const_bstring s1, int pos, const_bstring s2);
+
+    Search for the bstring s2 in s1 starting at position pos and looking in a
+    backward (decreasing) direction but without regard to case.  If it is
+    found then it returns with the first position at or before pos where it is
+    found, otherwise return BSTR_ERR. Note that the current position at pos
+    is tested as well -- so to be disjoint from a previous forward search it
+    is recommended that the position be backed up (decremented) by one
+    position.  The algorithm used is brute force; O(m*n).
+
+    ..........................................................................
+
+    extern int binchr (const_bstring b0, int pos, const_bstring b1);
+
+    Search for the first position in b0 starting from pos or after, in which
+    one of the characters in b1 is found.  This function has an execution
+    time of O(b0->slen + b1->slen).  If such a position does not exist in b0,
+    then BSTR_ERR is returned.
+
+    ..........................................................................
+
+    extern int binchrr (const_bstring b0, int pos, const_bstring b1);
+
+    Search for the last position in b0 no greater than pos, in which one of
+    the characters in b1 is found.  This function has an execution time
+    of O(b0->slen + b1->slen).  If such a position does not exist in b0,
+    then BSTR_ERR is returned.
+
+    ..........................................................................
+
+    extern int bninchr (const_bstring b0, int pos, const_bstring b1);
+
+    Search for the first position in b0 starting from pos or after, in which
+    none of the characters in b1 is found and return it.  This function has
+    an execution time of O(b0->slen + b1->slen).  If such a position does
+    not exist in b0, then BSTR_ERR is returned.
+
+    ..........................................................................
+
+    extern int bninchrr (const_bstring b0, int pos, const_bstring b1);
+
+    Search for the last position in b0 no greater than pos, in which none of
+    the characters in b1 is found and return it.  This function has an
+    execution time of O(b0->slen + b1->slen).  If such a position does not
+    exist in b0, then BSTR_ERR is returned.
+
+    ..........................................................................
+
+    extern int bstrchr (const_bstring b, int c);
+
+    Search for the character c in the bstring b forwards from the start of
+    the bstring.  Returns the position of the found character or BSTR_ERR if
+    it is not found.
+
+    NOTE: This has been implemented as a macro on top of bstrchrp ().
+
+    ..........................................................................
+
+    extern int bstrrchr (const_bstring b, int c);
+
+    Search for the character c in the bstring b backwards from the end of the
+    bstring.  Returns the position of the found character or BSTR_ERR if it is
+    not found.
+
+    NOTE: This has been implemented as a macro on top of bstrrchrp ().
+
+    ..........................................................................
+
+    extern int bstrchrp (const_bstring b, int c, int pos);
+
+    Search for the character c in b forwards from the position pos
+    (inclusive).  Returns the position of the found character or BSTR_ERR if
+    it is not found.
+
+    ..........................................................................
+
+    extern int bstrrchrp (const_bstring b, int c, int pos);
+
+    Search for the character c in b backwards from the position pos in bstring
+    (inclusive).  Returns the position of the found character or BSTR_ERR if
+    it is not found.
+
+    ..........................................................................
+
+    extern int bsetstr (bstring b0, int pos, const_bstring b1, unsigned char fill);
+
+    Overwrite the bstring b0 starting at position pos with the bstring b1. If
+    the position pos is past the end of b0, then the character "fill" is
+    appended as necessary to make up the gap between the end of b0 and pos.
+    If b1 is NULL, it behaves as if it were a 0-length bstring. The value
+    BSTR_OK is returned if the operation is successful, otherwise BSTR_ERR is
+    returned.
+
+    ..........................................................................
+
+    extern int binsert (bstring s1, int pos, const_bstring s2, unsigned char fill);
+
+    Inserts the bstring s2 into s1 at position pos.  If the position pos is
+    past the end of s1, then the character "fill" is appended as necessary to
+    make up the gap between the end of s1 and pos.  The value BSTR_OK is
+    returned if the operation is successful, otherwise BSTR_ERR is returned.
+
+    ..........................................................................
+
+    extern int binsertch (bstring s1, int pos, int len, unsigned char fill);
+
+    Inserts the character fill repeatedly into s1 at position pos for a
+    length len.  If the position pos is past the end of s1, then the
+    character "fill" is appended as necessary to make up the gap between the
+    end of s1 and the position pos + len (exclusive).  The value BSTR_OK is
+    returned if the operation is successful, otherwise BSTR_ERR is returned.
+
+    ..........................................................................
+
+    extern int breplace (bstring b1, int pos, int len, const_bstring b2,
+                         unsigned char fill);
+
+    Replace a section of a bstring from pos for a length len with the bstring
+    b2. If the position pos is past the end of b1 then the character "fill"
+    is appended as necessary to make up the gap between the end of b1 and
+    pos.
+
+    ..........................................................................
+
+    extern int bfindreplace (bstring b, const_bstring find,
+                             const_bstring replace, int position);
+
+    Replace all occurrences of the find substring with a replace bstring
+    after a given position in the bstring b.  The find bstring must have a
+    length > 0 otherwise BSTR_ERR is returned.  This function does not
+    perform recursive per character replacement; that is to say successive
+    searches resume at the position after the last replace.
+
+    So for example:
+
+        bfindreplace (a0 = bfromcstr("aabaAb"), a1 = bfromcstr("a"),
+                      a2 = bfromcstr("aa"), 0);
+
+    Should result in changing a0 to "aaaabaaAb".
+
+    This function performs exactly (b->slen - position) bstring comparisons,
+    and data movement is bounded above by character volume equivalent to size
+    of the output bstring.
+
+    ..........................................................................
+
+    extern int bfindreplacecaseless (bstring b, const_bstring find,
+                             const_bstring replace, int position);
+
+    Replace all occurrences of the find substring, ignoring case, with a
+    replace bstring after a given position in the bstring b.  The find bstring
+    must have a length > 0 otherwise BSTR_ERR is returned.  This function
+    does not perform recursive per character replacement; that is to say
+    successive searches resume at the position after the last replace.
+
+    So for example:
+
+        bfindreplacecaseless (a0 = bfromcstr("AAbaAb"), a1 = bfromcstr("a"),
+                              a2 = bfromcstr("aa"), 0);
+
+    Should result in changing a0 to "aaaabaaaab".
+
+    This function performs exactly (b->slen - position) bstring comparisons,
+    and data movement is bounded above by character volume equivalent to size
+    of the output bstring.
+
+    ..........................................................................
+
+    extern int balloc (bstring b, int length);
+
+    Increase the allocated memory backing the data buffer for the bstring b
+    to a length of at least length.  If the memory backing the bstring b is
+    already large enough, no action is performed.  This has no effect on the
+    bstring b that is visible to the bstring API.  Usually this function will
+    only be used when a minimum buffer size is required coupled with a direct
+    access to the ->data member of the bstring structure.
+
+    Be warned that like any other bstring function, the bstring must be well
+    defined upon entry to this function.  I.e., doing something like:
+
+        b->slen *= 2; /* ?? Most likely incorrect */
+        balloc (b, b->slen);
+
+    is invalid, and should be implemented as:
+
+        int t;
+        if (BSTR_OK == balloc (b, t = (b->slen * 2))) b->slen = t;
+
+    This function will return with BSTR_ERR if b is not detected as a valid
+    bstring or length is not greater than 0, otherwise BSTR_OK is returned.
+
+    ..........................................................................
+
+    extern int ballocmin (bstring b, int length);
+
+    Change the amount of memory backing the bstring b to at least length.
+    This operation will never truncate the bstring data including the
+    extra terminating '\0' and thus will not decrease the length to less than
+    b->slen + 1.  Note that repeated use of this function may cause
+    performance problems (realloc may be called on the bstring more than
+    the O(log(INT_MAX)) times).  This function will return with BSTR_ERR if b
+    is not detected as a valid bstring or length is not greater than 0,
+    otherwise BSTR_OK is returned.
+
+    So for example:
+
+    if (BSTR_OK == ballocmin (b, 64)) b->data[63] = 'x';
+
+    The idea is that this will set the 64th character of b to 'x' if it is at
+    least 64 characters long otherwise do nothing.  And we know this is well
+    defined so long as the ballocmin call was successful, since it will
+    ensure that b has been allocated with at least 64 characters.
+
+    ..........................................................................
+
+    int btrunc (bstring b, int n);
+
+    Truncate the bstring to at most n characters.  This function will return
+    with BSTR_ERR if b is not detected as a valid bstring or n is less than
+    0, otherwise BSTR_OK is returned.
+
+    ..........................................................................
+
+    extern int bpattern (bstring b, int len);
+
+    Replicate the starting bstring, b, end to end repeatedly until it
+    surpasses len characters, then chop the result to exactly len characters.
+    This function operates in-place.  This function will return with BSTR_ERR
+    if b is NULL or of length 0, otherwise BSTR_OK is returned.
+
+    ..........................................................................
+
+    extern int btoupper (bstring b);
+
+    Convert contents of bstring to upper case.  This function will return with
+    BSTR_ERR if b is NULL or of length 0, otherwise BSTR_OK is returned.
+
+    ..........................................................................
+
+    extern int btolower (bstring b);
+
+    Convert contents of bstring to lower case.  This function will return with
+    BSTR_ERR if b is NULL or of length 0, otherwise BSTR_OK is returned.
+
+    ..........................................................................
+
+    extern int bltrimws (bstring b);
+
+    Delete whitespace contiguous from the left end of the bstring.  This
+    function will return with BSTR_ERR if b is NULL or of length 0, otherwise
+    BSTR_OK is returned.
+
+    ..........................................................................
+
+    extern int brtrimws (bstring b);
+
+    Delete whitespace contiguous from the right end of the bstring.  This
+    function will return with BSTR_ERR if b is NULL or of length 0, otherwise
+    BSTR_OK is returned.
+
+    ..........................................................................
+
+    extern int btrimws (bstring b);
+
+    Delete whitespace contiguous from both ends of the bstring.  This function
+    will return with BSTR_ERR if b is NULL or of length 0, otherwise BSTR_OK
+    is returned.
+
+    ..........................................................................
+
+    extern struct bstrList* bstrListCreate (void);
+
+    Create an empty struct bstrList. The struct bstrList output structure is
+    declared as follows:
+
+    struct bstrList {
+        int qty, mlen;
+        bstring * entry;
+    };
+
+    The entry field actually is an array with qty number entries.  The mlen
+    record counts the maximum number of bstring's for which there is memory
+    in the entry record.
+
+    The Bstrlib API does *NOT* include a comprehensive set of functions for
+    full management of struct bstrList in an abstracted way.  The reason for
+    this is because aliasing semantics of the list are best left to the user
+    of this function, and performance varies wildly depending on the
+    assumptions made.  For a complete list of bstring data type it is
+    recommended that the C++ public std::vector<CBString> be used, since its
+    semantics and usage are more standard.
+
+    ..........................................................................
+
+    extern int bstrListDestroy (struct bstrList * sl);
+
+    Destroy a struct bstrList structure that was returned by the bsplit
+    function.  Note that this will destroy each bstring in the ->entry array
+    as well.  See bstrListCreate() above for structure of struct bstrList.
+
+    ..........................................................................
+
+    extern int bstrListAlloc (struct bstrList * sl, int msz);
+
+    Ensure that there is memory for at least msz number of entries for the
+    list.
+
+    ..........................................................................
+
+    extern int bstrListAllocMin (struct bstrList * sl, int msz);
+
+    Try to allocate the minimum amount of memory for the list to include at
+    least msz entries or sl->qty whichever is greater.
+
+    ..........................................................................
+
+    extern struct bstrList * bsplit (bstring str, unsigned char splitChar);
+
+    Create an array of sequential substrings from str divided by the
+    character splitChar.  Successive occurrences of the splitChar will be
+    divided by empty bstring entries, following the semantics from the Python
+    programming language.  To reclaim the memory from this output structure,
+    bstrListDestroy () should be called.  See bstrListCreate() above for
+    structure of struct bstrList.
+
+    ..........................................................................
+
+    extern struct bstrList * bsplits (bstring str, const_bstring splitStr);
+
+    Create an array of sequential substrings from str divided by any
+    character contained in splitStr.  An empty splitStr causes a single entry
+    bstrList containing a copy of str to be returned.  See bstrListCreate()
+    above for structure of struct bstrList.
+
+    ..........................................................................
+
+    extern struct bstrList * bsplitstr (bstring str, const_bstring splitStr);
+
+    Create an array of sequential substrings from str divided by the entire
+    substring splitStr.  An empty splitStr causes a single entry bstrList
+    containing a copy of str to be returned.  See bstrListCreate() above for
+    structure of struct bstrList.
+
+    ..........................................................................
+
+    extern bstring bjoin (const struct bstrList * bl, const_bstring sep);
+
+    Join the entries of a bstrList into one bstring by sequentially
+    concatenating them with the sep bstring in between.  If sep is NULL, it
+    is treated as if it were the empty bstring.  Note that:
+
+        bjoin (l = bsplit (b, s->data[0]), s);
+
+    should result in a copy of b, if s->slen is 1.  If there is an error NULL
+    is returned, otherwise a bstring with the correct result is returned.
+    See bstrListCreate() above for structure of struct bstrList.
+
+    ..........................................................................
+
+    extern int bsplitcb (const_bstring str, unsigned char splitChar, int pos,
+    int (* cb) (void * parm, int ofs, int len), void * parm);
+
+    Iterate the set of disjoint sequential substrings over str starting at
+    position pos divided by the character splitChar.  The parm passed to
+    bsplitcb is passed on to cb.  If the function cb returns a value < 0,
+    then further iterating is halted and this value is returned by bsplitcb.
+
+    Note: Non-destructive modification of str from within the cb function
+    while performing this split is not undefined.  bsplitcb behaves in
+    sequential lock step with calls to cb.  I.e., after returning from a cb
+    that returns a non-negative integer, bsplitcb continues from the position
+    1 character after the last detected split character and it will halt
+    immediately if the length of str falls below this point.  However, if the
+    cb function destroys str, then it *must* return with a negative value,
+    otherwise bsplitcb will continue in an undefined manner.
+
+    This function is provided as an incremental alternative to bsplit that is
+    abortable and which does not impose additional memory allocation.
+
+    ..........................................................................
+
+    extern int bsplitscb (const_bstring str, const_bstring splitStr, int pos,
+    int (* cb) (void * parm, int ofs, int len), void * parm);
+
+    Iterate the set of disjoint sequential substrings over str starting at
+    position pos divided by any of the characters in splitStr.  An empty
+    splitStr causes the whole str to be iterated once.  The parm passed to
+    bsplitscb is passed on to cb.  If the function cb returns a value < 0,
+    then further iterating is halted and this value is returned by bsplitscb.
+
+    Note: Non-destructive modification of str from within the cb function
+    while performing this split is not undefined.  bsplitscb behaves in
+    sequential lock step with calls to cb.  I.e., after returning from a cb
+    that returns a non-negative integer, bsplitscb continues from the position
+    1 character after the last detected split character and it will halt
+    immediately if the length of str falls below this point.  However, if the
+    cb function destroys str, then it *must* return with a negative value,
+    otherwise bsplitscb will continue in an undefined manner.
+
+    This function is provided as an incremental alternative to bsplits that
+    is abortable and which does not impose additional memory allocation.
+
+    ..........................................................................
+
+    extern int bsplitstrcb (const_bstring str, const_bstring splitStr, int pos,
+    int (* cb) (void * parm, int ofs, int len), void * parm);
+
+    Iterate the set of disjoint sequential substrings over str starting at
+    position pos divided by the entire substring splitStr.  An empty splitStr
+    causes each character of str to be iterated.  The parm passed to
+    bsplitstrcb is passed on to cb.  If the function cb returns a value < 0,
+    then further iterating is halted and this value is returned by bsplitstrcb.
+
+    Note: Non-destructive modification of str from within the cb function
+    while performing this split is not undefined.  bsplitstrcb behaves in
+    sequential lock step with calls to cb.  I.e., after returning from a cb
+    that returns a non-negative integer, bsplitstrcb continues from the
+    position 1 character after the last detected split character and it will
+    halt immediately if the length of str falls below this point.  However,
+    if the cb function destroys str, then it *must* return with a negative
+    value, otherwise bsplitstrcb will continue in an undefined manner.
+
+    This function is provided as an incremental alternative to bsplitstr that
+    is abortable and which does not impose additional memory allocation.
+
+    ..........................................................................
+
+    extern bstring bformat (const char * fmt, ...);
+
+    Takes the same parameters as printf (), but rather than outputting
+    results to stdio, it forms a bstring which contains what would have been
+    output. Note that if there is an early generation of a '\0' character,
+    the bstring will be truncated to this end point.
+
+    Note that %s format tokens correspond to '\0' terminated char * buffers,
+    not bstrings.  To print a bstring, first dereference the data element of
+    the bstring:
+
+        /* b1->data needs to be '\0' terminated, so tagbstrings generated
+           by blk2tbstr () might not be suitable. */
+        b0 = bformat ("Hello, %s", b1->data);
+
+    Note that if the BSTRLIB_NOVSNP macro has been set when bstrlib has been
+    compiled the bformat function is not present.
+
+    ..........................................................................
+
+    extern int bformata (bstring b, const char * fmt, ...);
+
+    In addition to the initial output buffer b, bformata takes the same
+    parameters as printf (), but rather than outputting results to stdio, it
+    appends the results to the initial bstring parameter. Note that if
+    there is an early generation of a '\0' character, the bstring will be
+    truncated to this end point.
+
+    Note that %s format tokens correspond to '\0' terminated char * buffers,
+    not bstrings.  To print a bstring, first dereference the data element of
+    the bstring:
+
+        /* b1->data needs to be '\0' terminated, so tagbstrings generated
+           by blk2tbstr () might not be suitable. */
+        bformata (b0 = bfromcstr ("Hello"), ", %s", b1->data);
+
+    Note that if the BSTRLIB_NOVSNP macro has been set when bstrlib has been
+    compiled the bformata function is not present.
+
+    ..........................................................................
+
+    extern int bassignformat (bstring b, const char * fmt, ...);
+
+    After the first parameter, it takes the same parameters as printf (), but
+    rather than outputting results to stdio, it outputs the results to
+    the bstring parameter b. Note that if there is an early generation of a
+    '\0' character, the bstring will be truncated to this end point.
+
+    Note that %s format tokens correspond to '\0' terminated char * buffers,
+    not bstrings.  To print a bstring, first dereference the data element of
+    the bstring:
+
+        /* b1->data needs to be '\0' terminated, so tagbstrings generated
+           by blk2tbstr () might not be suitable. */
+        bassignformat (b0 = bfromcstr ("Hello"), ", %s", b1->data);
+
+    Note that if the BSTRLIB_NOVSNP macro has been set when bstrlib has been
+    compiled the bassignformat function is not present.
+
+    ..........................................................................
+
+    extern int bvcformata (bstring b, int count, const char * fmt, va_list arglist);
+
+    The bvcformata function formats data under control of the format control
+    string fmt and attempts to append the result to b.  The fmt parameter is
+    the same as that of the printf function.  The variable argument list is
+    replaced with arglist, which has been initialized by the va_start macro.
+    The size of the output is upper bounded by count.  If the required output
+    exceeds count, the string b is not augmented with any contents and a value
+    below BSTR_ERR is returned.  If a value below -count is returned then it
+    is recommended that the negative of this value be used as an update to the
+    count in a subsequent pass.  On other errors, such as running out of
+    memory, parameter errors or numeric wrap around BSTR_ERR is returned.
+    BSTR_OK is returned when the output is successfully generated and
+    appended to b.
+
+    Note: There is no sanity checking of arglist, and this function is
+    destructive of the contents of b from the b->slen point onward.  If there
+    is an early generation of a '\0' character, the bstring will be truncated
+    to this end point.
+
+    Although this function is part of the external API for Bstrlib, the
+    interface and semantics (length limitations, and unusual return codes)
+    are fairly atypical.  The real purpose for this function is to provide an
+    engine for the bvformata macro.
+
+    Note that if the BSTRLIB_NOVSNP macro has been set when bstrlib has been
+    compiled the bvcformata function is not present.
+
+    ..........................................................................
+
+    extern bstring bread (bNread readPtr, void * parm);
+    typedef size_t (* bNread) (void *buff, size_t elsize, size_t nelem,
+                               void *parm);
+
+    Read an entire stream into a bstring, verbatim.  The readPtr function
+    pointer is compatible with fread semantics, except that it need not obtain
+    the stream data from a file.  The intention is that parm would contain
+    the stream data context/state required (similar to the role of the FILE*
+    I/O stream parameter of fread.)
+
+    Abstracting the block read function allows for block devices other than
+    file streams to be read if desired.  Note that there is an ANSI
+    compatibility issue if "fread" is used directly; see the ANSI issues
+    section below.
+
+    ..........................................................................
+
+    extern int breada (bstring b, bNread readPtr, void * parm);
+
+    Read an entire stream and append it to a bstring, verbatim.  Behaves
+    like bread, except that it appends its results to the bstring b.
+    BSTR_ERR is returned on error, otherwise 0 is returned.
+
+    ..........................................................................
+
+    extern bstring bgets (bNgetc getcPtr, void * parm, char terminator);
+    typedef int (* bNgetc) (void * parm);
+
+    Read a bstring from a stream.  As many bytes as is necessary are read
+    until the terminator is consumed or no more characters are available from
+    the stream.  If read from the stream, the terminator character will be
+    appended to the end of the returned bstring.  The getcPtr function must
+    have the same semantics as the fgetc C library function (i.e., returning
+    an integer whose value is negative when there are no more characters
+    available, otherwise the value of the next available unsigned character
+    from the stream.)  The intention is that parm would contain the stream
+    data context/state required (similar to the role of the FILE* I/O stream
+    parameter of fgets.)  If no characters are read, or there is some other
+    detectable error, NULL is returned.
+
+    bgets will never call the getcPtr function more often than necessary to
+    construct its output (including a single call, if required, to determine
+    that the stream contains no more characters.)
+
+    Abstracting the character stream function and terminator character allows
+    for different stream devices and string formats other than '\n'
+    terminated lines in a file if desired (consider \032 terminated email
+    messages, in a UNIX mailbox for example.)
+
+    For files, this function can be used analogously as fgets as follows:
+
+        fp = fopen ( ... );
+        if (fp) b = bgets ((bNgetc) fgetc, fp, '\n');
+
+    (Note that only one terminator character can be used, and that '\0' is
+    not assumed to terminate the stream in addition to the terminator
+    character. This is consistent with the semantics of fgets.)
+
+    ..........................................................................
+
+    extern int bgetsa (bstring b, bNgetc getcPtr, void * parm, char terminator);
+
+    Read from a stream and concatenate to a bstring.  Behaves like bgets,
+    except that it appends its results to the bstring b.  The value 1 is
+    returned if no characters are read before a negative result is returned
+    from getcPtr.  Otherwise BSTR_ERR is returned on error, and 0 is returned
+    in other normal cases.
+
+    ..........................................................................
+
+    extern int bassigngets (bstring b, bNgetc getcPtr, void * parm, char terminator);
+
+    Read from a stream and assign to a bstring.  Behaves like bgets,
+    except that it assigns the results to the bstring b.  The value 1 is
+    returned if no characters are read before a negative result is returned
+    from getcPtr.  Otherwise BSTR_ERR is returned on error, and 0 is returned
+    in other normal cases.
+
+    ..........................................................................
+
+    extern struct bStream * bsopen (bNread readPtr, void * parm);
+
+    Wrap a given open stream (described by a fread compatible function
+    pointer and stream handle) into an open bStream suitable for the bstring
+    library streaming functions.
+
+    ..........................................................................
+
+    extern void * bsclose (struct bStream * s);
+
+    Close the bStream, and return the handle to the stream that was
+    originally used to open the given stream.  If s is NULL or detectably
+    invalid, NULL will be returned.
+
+    ..........................................................................
+
+    extern int bsbufflength (struct bStream * s, int sz);
+
+    Set the length of the buffer used by the bStream.  If sz is the macro
+    BSTR_BS_BUFF_LENGTH_GET (which is 0), the length is not set.  If s is
+    NULL or sz is negative, the function will return with BSTR_ERR, otherwise
+    this function returns with the previous length.
+
+    ..........................................................................
+
+    extern int bsreadln (bstring r, struct bStream * s, char terminator);
+
+    Read a bstring terminated by the terminator character or the end of the
+    stream from the bStream (s) and return it into the parameter r.  The
+    matched terminator, if found, appears at the end of the line read.  If
+    the stream has been exhausted of all available data, before any can be
+    read, BSTR_ERR is returned.  This function may read additional characters
+    into the stream buffer from the core stream that are not returned, but
+    will be retained for subsequent read operations.  When reading from high
+    speed streams, this function can perform significantly faster than bgets.
+
+    ..........................................................................
+
+    extern int bsreadlna (bstring r, struct bStream * s, char terminator);
+
+    Read a bstring terminated by the terminator character or the end of the
+    stream from the bStream (s) and concatenate it to the parameter r.  The
+    matched terminator, if found, appears at the end of the line read.  If
+    the stream has been exhausted of all available data, before any can be
+    read, BSTR_ERR is returned.  This function may read additional characters
+    into the stream buffer from the core stream that are not returned, but
+    will be retained for subsequent read operations.  When reading from high
+    speed streams, this function can perform significantly faster than bgets.
+
+    ..........................................................................
+
+    extern int bsreadlns (bstring r, struct bStream * s, bstring terminators);
+
+    Read a bstring terminated by any character in the terminators bstring or
+    the end of the stream from the bStream (s) and return it into the
+    parameter r. This function may read additional characters from the core
+    stream that are not returned, but will be retained for subsequent read
+    operations.
+
+    ..........................................................................
+
+    extern int bsreadlnsa (bstring r, struct bStream * s, bstring terminators);
+
+    Read a bstring terminated by any character in the terminators bstring or
+    the end of the stream from the bStream (s) and concatenate it to the
+    parameter r.  If the stream has been exhausted of all available data,
+    before any can be read, BSTR_ERR is returned.  This function may read
+    additional characters from the core stream that are not returned, but
+    will be retained for subsequent read operations.
+
+    ..........................................................................
+
+    extern int bsread (bstring r, struct bStream * s, int n);
+
+    Read a bstring of length n (or, if it is fewer, as many bytes as is
+    remaining) from the bStream.  This function will read the minimum
+    required number of additional characters from the core stream.  When the
+    stream is at the end of the file BSTR_ERR is returned, otherwise BSTR_OK
+    is returned.
+
+    ..........................................................................
+
+    extern int bsreada (bstring r, struct bStream * s, int n);
+
+    Read a bstring of length n (or, if it is fewer, as many bytes as is
+    remaining) from the bStream and concatenate it to the parameter r.  This
+    function will read the minimum required number of additional characters
+    from the core stream.  When the stream is at the end of the file BSTR_ERR
+    is returned, otherwise BSTR_OK is returned.
+
+    ..........................................................................
+
+    extern int bsunread (struct bStream * s, const_bstring b);
+
+    Insert a bstring into the bStream at the current position.  These
+    characters will be read prior to those that actually come from the core
+    stream.
+
+    ..........................................................................
+
+    extern int bspeek (bstring r, const struct bStream * s);
+
+    Return the number of currently buffered characters from the bStream that
+    will be read prior to reads from the core stream, and append it to the
+    parameter r.
+
+    ..........................................................................
+
+    extern int bssplitscb (struct bStream * s, const_bstring splitStr,
+    int (* cb) (void * parm, int ofs, const_bstring entry), void * parm);
+
+    Iterate the set of disjoint sequential substrings over the stream s
+    divided by any character from the bstring splitStr.  The parm passed to
+    bssplitscb is passed on to cb.  If the function cb returns a value < 0,
+    then further iterating is halted and this return value is returned by
+    bssplitscb.
+
+    Note: At the point of calling the cb function, the bStream pointer is
+    pointed exactly at the position right after having read the split
+    character.  The cb function can act on the stream by causing the bStream
+    pointer to move, and bssplitscb will continue by starting the next split
+    at the position of the pointer after the return from cb.
+
+    However, if the cb causes the bStream s to be destroyed then the cb must
+    return with a negative value, otherwise bssplitscb will continue in an
+    undefined manner.
+
+    This function is provided as a way to incrementally parse through a file
+    or other generic stream that in total size may otherwise exceed the
+    practical or desired memory available.  As with the other split callback
+    based functions this is abortable and does not impose additional memory
+    allocation.
+
+    ..........................................................................
+
+    extern int bssplitstrcb (struct bStream * s, const_bstring splitStr,
+    int (* cb) (void * parm, int ofs, const_bstring entry), void * parm);
+
+    Iterate the set of disjoint sequential substrings over the stream s
+    divided by the entire substring splitStr.  The parm passed to
+    bssplitstrcb is passed on to cb.  If the function cb returns a
+    value < 0, then further iterating is halted and this return value is
+    returned by bssplitstrcb.
+
+    Note: At the point of calling the cb function, the bStream pointer is
+    pointed exactly at the position right after having read the split
+    string.  The cb function can act on the stream by causing the bStream
+    pointer to move, and bssplitstrcb will continue by starting the next
+    split at the position of the pointer after the return from cb.
+
+    However, if the cb causes the bStream s to be destroyed then the cb must
+    return with a negative value, otherwise bssplitstrcb will continue in an
+    undefined manner.
+
+    This function is provided as a way to incrementally parse through a file
+    or other generic stream that in total size may otherwise exceed the
+    practical or desired memory available.  As with the other split callback
+    based functions this is abortable and does not impose additional memory
+    allocation.
+
+    ..........................................................................
+
+    extern int bseof (const struct bStream * s);
+
+    Return the de facto "EOF" (end of file) state of a stream (1 if the
+    bStream is in an EOF state, 0 if not, and BSTR_ERR if stream is closed or
+    detectably erroneous.)  When the readPtr callback returns a value <= 0
+    the stream reaches its "EOF" state. Note that bsunread with non-empty
+    content will essentially turn off this state, and the stream will not be
+    in its "EOF" state so long as it is possible to read more data out of it.
+
+    Also note that the semantics of bseof() are slightly different from
+    something like feof().  I.e., reaching the end of the stream does not
+    necessarily guarantee that bseof() will return with a value indicating
+    that this has happened.  bseof() will only return indicating that it has
+    reached the "EOF" and an attempt has been made to read past the end of
+    the bStream.
+
+The macros
+----------
+
+    The macros described below are shown in a prototype form indicating their
+    intended usage.  Note that the parameters passed to these macros will be
+    referenced multiple times.  As with all macros, programmer care is
+    required to guard against unintended side effects.
+
+    int blengthe (const_bstring b, int err);
+
+    Returns the length of the bstring.  If the bstring is NULL err is
+    returned.
+
+    ..........................................................................
+
+    int blength (const_bstring b);
+
+    Returns the length of the bstring.  If the bstring is NULL, the length
+    returned is 0.
+
+    ..........................................................................
+
+    int bchare (const_bstring b, int p, int c);
+
+    Returns the p'th character of the bstring b.  If the position p refers to
+    a position that does not exist in the bstring or the bstring is NULL,
+    then c is returned.
+
+    ..........................................................................
+
+    char bchar (const_bstring b, int p);
+
+    Returns the p'th character of the bstring b.  If the position p refers to
+    a position that does not exist in the bstring or the bstring is NULL,
+    then '\0' is returned.
+
+    ..........................................................................
+
+    char * bdatae (bstring b, char * err);
+
+    Returns the char * data portion of the bstring b.  If b is NULL, err is
+    returned.
+
+    ..........................................................................
+
+    char * bdata (bstring b);
+
+    Returns the char * data portion of the bstring b.  If b is NULL, NULL is
+    returned.
+
+    ..........................................................................
+
+    char * bdataofse (bstring b, int ofs, char * err);
+
+    Returns the char * data portion of the bstring b offset by ofs.  If b is
+    NULL, err is returned.
+
+    ..........................................................................
+
+    char * bdataofs (bstring b, int ofs);
+
+    Returns the char * data portion of the bstring b offset by ofs.  If b is
+    NULL, NULL is returned.
+
+    ..........................................................................
+
+    struct tagbstring var = bsStatic ("...");
+
+    The bsStatic macro allows for static declarations of literal string
+    constants as struct tagbstring structures.  The resulting tagbstring does
+    not need to be freed or destroyed.  Note that this macro is only well
+    defined for string literal arguments.  For more general string pointers,
+    use the btfromcstr macro.
+
+    The resulting struct tagbstring is permanently write protected.  Attempts
+    to write to this struct tagbstring from any bstrlib function will lead to
+    BSTR_ERR being returned.  Invoking the bwriteallow macro onto this struct
+    tagbstring has no effect.
+
+    ..........................................................................
+
+    <void * blk, int len> <- bsStaticBlkParms ("...")
+
+    The bsStaticBlkParms macro emits a pair of comma separated parameters
+    corresponding to the block parameters for the block functions in Bstrlib
+    (i.e., blk2bstr, bcatblk, blk2tbstr, bisstemeqblk, bisstemeqcaselessblk.)
+    Note that this macro is only well defined for string literal arguments.
+
+    Examples:
+
+    bstring b = blk2bstr (bsStaticBlkParms ("Fast init. "));
+    bcatblk (b, bsStaticBlkParms ("No frills fast concatenation."));
+
+    These are faster than using bfromcstr() and bcatcstr() respectively
+    because the length of the inline string is known as a compile time
+    constant.  Also note that separate struct tagbstring declarations for
+    holding the output of a bsStatic() macro are not required.
+
+    ..........................................................................
+
+    void btfromcstr (struct tagbstring& t, const char * s);
+
+    Fill in the tagbstring t with the '\0' terminated char buffer s.  This
+    action is purely reference oriented; no memory management is done.  The
+    data member is just assigned s, and slen is assigned the strlen of s.
+    The s parameter is accessed exactly once in this macro.
+
+    The resulting struct tagbstring is initially write protected.  Attempts
+    to write to this struct tagbstring in a write protected state from any
+    bstrlib function will lead to BSTR_ERR being returned.  Invoke the
+    bwriteallow on this struct tagbstring to make it writeable (though this
+    requires that s be obtained from a function compatible with malloc.)
+
+    ..........................................................................
+
+    void btfromblk (struct tagbstring& t, void * s, int len);
+
+    Fill in the tagbstring t with the data buffer s with length len.  This
+    action is purely reference oriented; no memory management is done.  The
+    data member of t is just assigned s, and slen is assigned len.  Note that
+    the buffer is not appended with a '\0' character.  The s and len
+    parameters are accessed exactly once each in this macro.
+
+    The resulting struct tagbstring is initially write protected.  Attempts
+    to write to this struct tagbstring in a write protected state from any
+    bstrlib function will lead to BSTR_ERR being returned.  Invoke the
+    bwriteallow on this struct tagbstring to make it writeable (though this
+    requires that s be obtained from a function compatible with malloc.)
+
+    ..........................................................................
+
+    void btfromblkltrimws (struct tagbstring& t, void * s, int len);
+
+    Fill in the tagbstring t with the data buffer s with length len after it
+    has been left trimmed.  This action is purely reference oriented; no
+    memory management is done.  The data member of t is just assigned to a
+    pointer inside the buffer s.  Note that the buffer is not appended with a
+    '\0' character.  The s and len parameters are accessed exactly once each
+    in this macro.
+
+    The resulting struct tagbstring is permanently write protected.  Attempts
+    to write to this struct tagbstring from any bstrlib function will lead to
+    BSTR_ERR being returned.  Invoking the bwriteallow macro onto this struct
+    tagbstring has no effect.
+
+    ..........................................................................
+
+    void btfromblkrtrimws (struct tagbstring& t, void * s, int len);
+
+    Fill in the tagbstring t with the data buffer s with length len after it
+    has been right trimmed.  This action is purely reference oriented; no
+    memory management is done.  The data member of t is just assigned to a
+    pointer inside the buffer s.  Note that the buffer is not appended with a
+    '\0' character.  The s and len parameters are accessed exactly once each
+    in this macro.
+
+    The resulting struct tagbstring is permanently write protected.  Attempts
+    to write to this struct tagbstring from any bstrlib function will lead to
+    BSTR_ERR being returned.  Invoking the bwriteallow macro onto this struct
+    tagbstring has no effect.
+
+    ..........................................................................
+
+    void btfromblktrimws (struct tagbstring& t, void * s, int len);
+
+    Fill in the tagbstring t with the data buffer s with length len after it
+    has been left and right trimmed.  This action is purely reference
+    oriented; no memory management is done.  The data member of t is just
+    assigned to a pointer inside the buffer s.  Note that the buffer is not
+    appended with a '\0' character.  The s and len parameters are accessed
+    exactly once each in this macro.
+
+    The resulting struct tagbstring is permanently write protected.  Attempts
+    to write to this struct tagbstring from any bstrlib function will lead to
+    BSTR_ERR being returned.  Invoking the bwriteallow macro onto this struct
+    tagbstring has no effect.
+
+    ..........................................................................
+
+    void bmid2tbstr (struct tagbstring& t, bstring b, int pos, int len);
+
+    Fill the tagbstring t with the substring from b, starting from position
+    pos with a length len.  The segment is clamped by the boundaries of
+    the bstring b.  This action is purely reference oriented; no memory
+    management is done.  Note that the buffer is not appended with a '\0'
+    character.  Note that the t parameter to this macro may be accessed
+    multiple times.  Note that the contents of t will become undefined
+    if the contents of b change or are destroyed.
+
+    The resulting struct tagbstring is permanently write protected.  Attempts
+    to write to this struct tagbstring in a write protected state from any
+    bstrlib function will lead to BSTR_ERR being returned.  Invoking the
+    bwriteallow macro on this struct tagbstring will have no effect.
+
+    ..........................................................................
+
+    void bvformata (int& ret, bstring b, const char * format, lastarg);
+
+    Append the bstring b with printf like formatting with the format control
+    string, and the arguments taken from the ... list of arguments after
+    lastarg passed to the containing function.  If the containing function
+    does not have ... parameters or lastarg is not the last named parameter
+    before the ... then the results are undefined.  If successful, the
+    results are appended to b and BSTR_OK is assigned to ret.  Otherwise
+    BSTR_ERR is assigned to ret.
+
+    Example:
+
+    void dbgerror (FILE * fp, const char * fmt, ...) {
+        int ret;
+        bstring b;
+        bvformata (ret, b = bfromcstr ("DBG: "), fmt, fmt);
+        if (BSTR_OK == ret) fputs ((char *) bdata (b), fp);
+        bdestroy (b);
+    }
+
+    Note that if the BSTRLIB_NOVSNP macro was set when bstrlib had been
+    compiled the bvformata macro will not link properly.  If the
+    BSTRLIB_NOVSNP macro has been set, the bvformata macro will not be
+    available.
+
+    ..........................................................................
+
+    void bwriteprotect (struct tagbstring& t);
+
+    Disallow bstring from being written to via the bstrlib API.  Attempts to
+    write to the resulting tagbstring from any bstrlib function will lead to
+    BSTR_ERR being returned.
+
+    Note: bstrings which are write protected cannot be destroyed via bdestroy.
+
+    Note to C++ users: Setting a CBString as write protected will not prevent
+    it from being destroyed by the destructor.
+
+    ..........................................................................
+
+    void bwriteallow (struct tagbstring& t);
+
+    Allow bstring to be written to via the bstrlib API.  Note that such an
+    action makes the bstring both writable and destroyable.  If the bstring is
+    not legitimately writable (as is the case for struct tagbstrings
+    initialized with a bsStatic value), the results of this are undefined.
+
+    Note that invoking the bwriteallow macro may increase the number of
+    reallocs by one more than necessary for every call to bwriteallow
+    interleaved with any bstring API which writes to this bstring.
+
+    ..........................................................................
+
+    int biswriteprotected (struct tagbstring& t);
+
+    Returns 1 if the bstring is write protected, otherwise 0 is returned.
+
+===============================================================================
+
+The bstest module
+-----------------
+
+The bstest module is just a unit test for the bstrlib module.  For correct
+implementations of bstrlib, it should execute with 0 failures being reported.
+This test should be utilized if modifications/customizations to bstrlib have
+been performed.  It tests each core bstrlib function with bstrings of every
+mode (read-only, NULL, static and mutable) and ensures that the expected
+semantics are observed (including results that should indicate an error). It
+also tests for aliasing support.  Passing bstest is a necessary but not a
+sufficient condition for ensuring the correctness of the bstrlib module.
+
+
+The test module
+---------------
+
+The test module is just a unit test for the bstrwrap module.  For correct
+implementations of bstrwrap, it should execute with 0 failures being
+reported.  This test should be utilized if modifications/customizations to
+bstrwrap have been performed.  It tests each core bstrwrap function with
+CBStrings write protected or not and ensures that the expected semantics are
+observed (including expected exceptions.)  Note that exceptions cannot be
+disabled to run this test.  Passing test is a necessary but not a sufficient
+condition for ensuring the correctness of the bstrwrap module.
+
+===============================================================================
+
+Using Bstring and CBString as an alternative to the C library
+-------------------------------------------------------------
+
+First let us give a table of C library functions and the alternative bstring
+functions and CBString methods that should be used instead of them.
+
+C-library         Bstring alternative             CBString alternative
+---------         -------------------             --------------------
+gets              bgets                           ::gets
+strcpy            bassign                         = operator
+strncpy           bassignmidstr                   ::midstr
+strcat            bconcat                         += operator
+strncat           bconcat + btrunc                += operator + ::trunc
+strtok            bsplit, bsplits                 ::split
+sprintf           b(assign)format                 ::format
+snprintf          b(assign)format + btrunc        ::format + ::trunc
+vsprintf          bvformata                       bvformata
+
+vsnprintf         bvformata + btrunc              bvformata + btrunc
+vfprintf          bvformata + fputs               use bvformata + fputs
+strcmp            biseq, bstrcmp                  comparison operators.
+strncmp           bstrncmp, memcmp                bstrncmp, memcmp
+strlen            ->slen, blength                 ::length
+strdup            bstrcpy                         constructor
+strset            bpattern                        ::fill
+strstr            binstr                          ::find
+strpbrk           binchr                          ::findchr
+stricmp           bstricmp                        cast & use bstricmp
+strlwr            btolower                        cast & use btolower
+strupr            btoupper                        cast & use btoupper
+strrev            bReverse (aux module)           cast & use bReverse
+strchr            bstrchr                         cast & use bstrchr
+strspnp           use strspn                      use strspn
+ungetc            bsunread                        bsunread
+
+The top 9 C functions listed here are troublesome in that they impose memory
+management in the calling function.  The Bstring and CBstring interfaces have
+built-in memory management, so there is far less code with far less potential
+for buffer overrun problems.  strtok can only be reliably called as a "leaf"
+calculation, since it (quite bizarrely) maintains hidden internal state.  And
+gets is well known to be broken no matter what.  The Bstrlib alternatives do
+not suffer from those sorts of problems.
+
+The substitute for strncat can be performed with higher performance by using
+the blk2tbstr macro to create a presized second operand for bconcat.
+
+C-library         Bstring alternative             CBString alternative
+---------         -------------------             --------------------
+strspn            strspn acceptable               strspn acceptable
+strcspn           strcspn acceptable              strcspn acceptable
+strnset           strnset acceptable              strnset acceptable
+printf            printf acceptable               printf acceptable
+puts              puts acceptable                 puts acceptable
+fprintf           fprintf acceptable              fprintf acceptable
+fputs             fputs acceptable                fputs acceptable
+memcmp            memcmp acceptable               memcmp acceptable
+
+Remember that Bstring (and CBstring) functions will automatically append the
+'\0' character to the character data buffer.  So by simply accessing the data
+buffer directly, ordinary C string library functions can be called directly
+on them.  Note that bstrcmp is not the same as memcmp in exactly the same way
+that strcmp is not the same as memcmp.
+
+C-library         Bstring alternative             CBString alternative
+---------         -------------------             --------------------
+fread             balloc + fread                  ::alloc + fread
+fgets             balloc + fgets                  ::alloc + fgets
+
+These are odd ones because of the exact sizing of the buffer required.  The
+Bstring and CBString alternatives require that the buffers are forced to
+hold at least the prescribed length, then just use fread or fgets directly.
+However, typically the automatic memory management of Bstring and CBstring
+will make the typical use of fgets and fread to read specifically sized
+strings unnecessary.
+
+Implementation Choices
+----------------------
+
+Overhead:
+.........
+
+The bstring library has more overhead versus straight char buffers for most
+functions.  This overhead is essentially just the memory management and
+string header allocation.  This overhead usually only shows up for small
+string manipulations.  The performance loss has to be considered in
+light of the following:
+
+1) What would be the performance loss of trying to write this management
+   code in one's own application?
+2) Since the bstring library source code is given, a sufficiently powerful
+   modern inlining globally optimizing compiler can remove function call
+   overhead.
+
+Since the data type is exposed, a developer can replace any unsatisfactory
+function with their own inline implementation.  And that is besides the main
+point of what the better string library is mainly meant to provide.  Any
+overhead lost has to be compared against the value of the safe abstraction
+for coupling memory management and string functionality.
+
+Performance of the C interface:
+...............................
+
+The algorithms used have performance advantages versus the analogous C
+library functions.  For example:
+
+1. bfromcstr/blk2bstr/bstrcpy versus strcpy/strdup.  By using memmove instead
+   of strcpy, the break condition of the copy loop is based on an independent
+   counter (that should be allocated in a register) rather than having to
+   check the results of the load.  Modern out-of-order executing CPUs can
+   parallelize the final branch mis-predict penalty with the loading of the
+   source string.  Some CPUs will also tend to have better built-in hardware
+   support for counted memory moves than load-compare-store.  (This is a
+   minor, but non-zero gain.)
+2. biseq versus strcmp.  If the strings are unequal in length, biseq will
+   return in O(1) time.  If the strings are aliased, or have aliased data
+   buffers, biseq will return in O(1) time.  strcmp will always be O(k),
+   where k is the length of the common prefix or the whole string if they are
+   identical.
+3. ->slen versus strlen.  ->slen is obviously always O(1), while strlen is
+   always O(n) where n is the length of the string.
+4. bconcat versus strcat.  Both rely on precomputing the length of the
+   destination string argument, which will favor the bstring library.  On
+   iterated concatenations the performance difference can be enormous.
+5. bsreadln versus fgets.  The bsreadln function reads large blocks at a time
+   from the given stream, then parses out lines from the buffers directly.
+   Some C libraries will implement fgets as a loop over single fgetc calls.
+   Testing indicates that the bsreadln approach can be several times faster
+   for fast stream devices (such as a file that has been entirely cached.)
+6. bsplits/bsplitscb versus strspn.  Accelerators for the set of match
+   characters are generated only once.
+7. binstr versus strstr.  The binstr implementation unrolls the loops to
+   help reduce loop overhead.  This will matter if the target string is
+   long and source string is not found very early in the target string.
+   With strstr, while it is possible to unroll the source contents, it is
+   not possible to do so with the destination contents in a way that is
+   effective because every destination character must be tested against
+   '\0' before proceeding to the next character.
+8. bReverse versus strrev.  The C function must find the end of the string
+   first before swapping character pairs.
+9. bstrrchr versus no comparable C function.  It's not hard to write some C
+   code to search for a character from the end going backwards.  But there
+   is no way to do this without computing the length of the string with
+   strlen.
+
+Practical testing indicates that in general Bstrlib is never significantly
+slower than the C library for common operations, while very often having a
+performance advantage that ranges from significant to massive.  Even for
+functions like b(n)inchr versus str(c)spn() (where, in theory, there is no
+advantage for the Bstrlib architecture) the performance of Bstrlib is vastly
+superior to most tested C library implementations.
+
+Some of Bstrlib's extra functionality also leads to inevitable performance
+advantages over typical C solutions.  For example, using the blk2tbstr macro,
+one can (in O(1) time) generate an internal substring by reference while not
+disturbing the original string.  If disturbing the original string is not an
+option, typically, a comparable char * solution would have to make a copy of
+the substring to provide similar functionality.  Another example is reverse
+character set scanning -- the str(c)spn functions only scan in a forward
+direction which can complicate some parsing algorithms.
+
+Where high performance char * based algorithms are available, Bstrlib can
+still leverage them by accessing the ->data field on bstrings.  So
+realistically Bstrlib can never be significantly slower than any standard
+'\0' terminated char * based solutions.
+
+Performance of the C++ interface:
+.................................
+
+The C++ interface has been designed with an emphasis on abstraction and safety
+first.  However, since it is substantially a wrapper for the C bstring
+functions, for longer strings the performance comments described in the
+"Performance of the C interface" section above still apply. Note that the
+(CBString *) type can be directly cast to a (bstring) type, and passed as
+parameters to the C functions (though a CBString must never be passed to
+bdestroy.)
+
+Probably the most controversial choice is performing full bounds checking on
+the [] operator.  This decision was made because 1) the fast alternative of
+not bounds checking is still available by first casting the CBString to a
+(const char *) buffer or to a (struct tagbstring) then dereferencing .data and
+2) because the lack of bounds checking is seen as one of the main weaknesses
+of C/C++ versus other languages.  This check being done on every access leads
+to individual character extraction being actually slower than other languages
+in this one respect (other language's compilers will normally dedicate more
+resources on hoisting or removing bounds checking as necessary) but otherwise
+bring C++ up to the level of other languages in terms of functionality.
+
+It is common for other C++ libraries to leverage the abstractions provided by
+C++ to use reference counting and "copy on write" policies.  While these
+techniques can speed up some scenarios, they impose a problem with respect to
+thread safety.  bstrings and CBStrings can be properly protected with
+"per-object" mutexes, meaning that two bstrlib calls can be made and execute
+simultaneously, so long as the bstrings and CBstrings are distinct.  With a
+reference count and alias before copy on write policy, global mutexes are
+required that prevent multiple calls to the strings library to execute
+simultaneously regardless of whether or not the strings represent the same
+string.
+
+One interesting trade off in CBString is that the default constructor is not
+trivial.  I.e., it always prepares a ready to use memory buffer.  The purpose
+is to ensure that there is a uniform internal composition for any functioning
+CBString that is compatible with bstrings.  It also means that the other
+methods in the class are not forced to perform "late initialization" checks.
+In the end it means that construction of CBStrings is slower than other
+comparable C++ string classes.  Initial testing, however, indicates that
+CBString outperforms std::string and MFC's CString, for example, in all other
+operations.  So to work around this weakness it is recommended that CBString
+declarations be pushed outside of inner loops.
+
+Practical testing indicates that with the exception of the caveats given
+above (constructors and safe index character manipulations) the C++ API for
+Bstrlib generally outperforms popular standard C++ string classes.  Amongst
+the standard libraries and compilers, the quality of concatenation operations
+varies wildly and very little care has gone into search functions.  Bstrlib
+dominates those performance benchmarks.
+
+Memory management:
+..................
+
+The bstring functions which write and modify bstrings will automatically
+reallocate the backing memory for the char buffer whenever it is required to
+grow.  The algorithm for resizing chosen is to snap up to sizes that are a
+power of two which are sufficient to hold the intended new size.  Memory
+reallocation is not performed when the required size of the buffer is
+decreased.  This behavior can be relied on, and is necessary to make the
+behaviour of balloc deterministic.  This trades off additional memory usage
+for decreasing the frequency for required reallocations:
+
+1. For any bstring whose size never exceeds n, its buffer is not ever
+   reallocated more than log_2(n) times for its lifetime.
+2. For any bstring whose size never exceeds n, its buffer is never more than
+   2*(n+1) in length.  (The extra characters beyond 2*n are to allow for the
+   implicit '\0' which is always added by the bstring modifying functions.)
+
+Decreasing the buffer size when the string decreases in size would violate 1)
+above and in real world cases lead to pathological heap thrashing.  Similarly,
+allocating more tightly than "least power of 2 greater than necessary" would
+lead to a violation of 1) and have the same potential for heap thrashing.
+
+Property 2) needs emphasizing.  Although the memory allocated is always a
+power of 2, for a bstring that grows linearly in size, its buffer memory also
+grows linearly, not exponentially.  The reason is that the amount of extra
+space increases with each reallocation, which decreases the frequency of
+future reallocations.
+
+Obviously, given that bstring writing functions may reallocate the data
+buffer backing the target bstring, one should not attempt to cache the data
+buffer address and use it after such bstring functions have been called.
+This includes making reference struct tagbstrings which alias to a writable
+bstring.
+
+balloc or bfromcstralloc can be used to preallocate the minimum amount of
+space used for a given bstring.  This will reduce even further the number of
+times the data portion is reallocated.  If the length of the string is never
+more than one less than the memory length then there will be no further
+reallocations.
+
+Note that invoking the bwriteallow macro may increase the number of reallocs
+by one more than necessary for every call to bwriteallow interleaved with any
+bstring API which writes to this bstring.
+
+The library does not use any mechanism for automatic clean up for the C API.
+Thus explicit clean up via calls to bdestroy() is required to avoid memory
+leaks.
+
+Constant and static tagbstrings:
+................................
+
+A struct tagbstring can be write protected from any bstrlib function using
+the bwriteprotect macro.  A write protected struct tagbstring can then be
+reset to being writable via the bwriteallow macro.  There is, of course, no
+protection from attempts to directly access the bstring members.  Modifying a
+bstring which is write protected by direct access has undefined behavior.
+
+static struct tagbstrings can be declared via the bsStatic macro.  They are
+considered permanently unwritable.  Such struct tagbstrings are declared
+such that attempts to write to them are not well defined.  Invoking either
+bwriteallow or bwriteprotect on static struct tagbstrings has no effect.
+
+struct tagbstring's initialized via btfromcstr or blk2tbstr are protected by
+default but can be made writeable via the bwriteallow macro.  If bwriteallow
+is called on such struct tagbstring's, it is the programmer's responsibility
+to ensure that:
+
+1) the buffer supplied was allocated from the heap.
+2) bdestroy is not called on this tagbstring (unless the header itself has
+   also been allocated from the heap.)
+3) free is called on the buffer to reclaim its memory.
+
+bwriteallow and bwriteprotect can be invoked on ordinary bstrings (they have
+to be dereferenced with the (*) operator to get the levels of indirection
+correct) to give them write protection.
+
+Buffer declaration:
+...................
+
+The memory buffer is actually declared "unsigned char *" instead of "char *".
+The reason for this is to trigger compiler warnings whenever uncasted char
+buffers are assigned to the data portion of a bstring.  This will draw more
+diligent programmers into taking a second look at the code where they
+have carelessly left off the typically required cast.  (Research from
+AT&T/Lucent indicates that additional programmer eyeballs is one of the most
+effective mechanisms at ferreting out bugs.)
+
+Function pointers:
+..................
+
+The bgets, bread and bStream functions use function pointers to obtain
+strings from data streams.  The function pointer declarations have been
+specifically chosen to be compatible with the fgetc and fread functions.
+While this may seem to be a convoluted way of implementing fgets and fread
+style functionality, it has been specifically designed this way to ensure
+that there is no dependency on a single narrowly defined set of device
+interfaces, such as just stream I/O.  In the embedded world, it's quite
+possible to have environments where such interfaces may not exist in the
+standard C library form.  Furthermore, the generalization that this opens up
+allows for more sophisticated uses for these functions (performing an fgets
+like function on a socket, for example.) By using function pointers, it also
+allows such abstract stream interfaces to be created using the bstring library
+itself while not creating a circular dependency.
+
+Use of int's for sizes:
+.......................
+
+This is just a recognition that 16bit platforms with requirements for strings
+that are larger than 64K and 32bit+ platforms with requirements for strings
+that are larger than 4GB are pretty marginal.  The main focus is for 32bit
+platforms, and emerging 64bit platforms with reasonable < 4GB string
+requirements.  Using ints allows for negative values which has meaning
+internally to bstrlib.
+
+Semantic consideration:
+.......................
+
+Certain care needs to be taken when copying and aliasing bstrings.  A bstring
+is essentially a pointer type which points to a multipart abstract data
+structure.  Thus usage, and lifetime of bstrings have semantics that follow
+these considerations.  For example:
+
+    bstring a, b;
+    struct tagbstring t;
+
+    a = bfromcstr("Hello"); /* Create new bstring and copy "Hello" into it. */
+    b = a;                  /* Alias b to the contents of a.                */
+    t = *a;                 /* Create a current instance pseudo-alias of a. */
+    bconcat (a, b);         /* Double a and b, t is now undefined.          */
+    bdestroy (a);           /* Destroy the contents of both a and b.        */
+
+Variables of type bstring are really just references that point to real
+bstring objects.  The equal operator (=) creates aliases, and the asterisk
+dereference operator (*) creates a kind of alias to the current instance (which
+is generally not useful for any purpose.)  Using bstrcpy() is the correct way
+of creating duplicate instances.  The ampersand operator (&) is useful for
+creating aliases to struct tagbstrings (remembering that constructed struct
+tagbstrings are not writable by default.)
+
+CBStrings use complete copy semantics for the equal operator (=), and thus do
+not have these sorts of issues.
+
+Debugging:
+..........
+
+Bstrings have a simple, exposed definition and construction, and the library
+itself is open source.  So most debugging is going to be fairly straight-
+forward.  But the memory for bstrings comes from the heap, which can often be
+corrupted indirectly, and it might not be obvious what has happened even from
+direct examination of the contents in a debugger or a core dump.  There are
+some tools such as Purify, Insure++ and Electric Fence which can help solve
+such problems, however another common approach is to directly instrument the
+calls to malloc, realloc, calloc, free, memcpy, memmove and/or other calls
+by overriding them with macro definitions.
+
+Although the user could hack on the Bstrlib sources directly as necessary to
+perform such an instrumentation, Bstrlib comes with a built-in mechanism for
+doing this.  By defining the macro BSTRLIB_MEMORY_DEBUG and providing an
+include file named memdbg.h this will force the core Bstrlib modules to
+attempt to include this file.  In such a file, macros could be defined which
+override Bstrlib's usage of the C standard library.
+
+Rather than calling malloc, realloc, free, memcpy or memmove directly, Bstrlib
+emits the macros bstr__alloc, bstr__realloc, bstr__free, bstr__memcpy and
+bstr__memmove in their place respectively.  By default these macros are simply
+assigned to be equivalent to their corresponding C standard library function
+call.  However, if they are given earlier macro definitions (via the back
+door include file) they will not be given their default definition.  In this
+way Bstrlib's interface to the standard library can be changed but without
+having to directly redefine or link standard library symbols (both of which
+are not strictly ANSI C compliant.)
+
+An example definition might include:
+
+    #define bstr__alloc(sz) X_malloc ((sz), __LINE__, __FILE__)
+
+which might help contextualize heap entries in a debugging environment.
+
+The NULL parameter and sanity checking of bstrings is part of the Bstrlib
+API, and thus Bstrlib itself does not present any different modes which would
+correspond to "Debug" or "Release" modes.  Bstrlib always contains mechanisms
+which one might think of as debugging features, but retains the performance
+and small memory footprint one would normally associate with release mode
+code.
+
+Integration with Microsoft's Visual Studio debugger:
+....................................................
+
+Microsoft's Visual Studio debugger has a capability of customizable mouse
+float over data type descriptions.  This is accomplished by editing the
+AUTOEXP.DAT file to include the following:
+
+    ; new for CBString
+    tagbstring =slen=<slen> mlen=<mlen> <data,st>
+    Bstrlib::CBStringList =count=<size()>
+
+In Visual C++ 6.0 this file is located in the directory:
+
+    C:\Program Files\Microsoft Visual Studio\Common\MSDev98\Bin
+
+and in Visual Studio .NET 2003 it's located here:
+
+    C:\Program Files\Microsoft Visual Studio .NET 2003\Common7\Packages\Debugger
+
+This will improve the ability of debugging with Bstrlib under Visual Studio.
+
+Security
+--------
+
+Bstrlib does not come with explicit security features outside of its fairly
+comprehensive error detection, coupled with its strict semantic support.
+That is to say that certain common security problems, such as buffer overrun,
+constant overwrite, arbitrary truncation etc, are far less likely to happen
+inadvertently.  Where it does help, Bstrlib maximizes its advantage by
+providing developers a simple adoption path that lets them leave less secure
+string mechanisms behind.  The library will not leave developers wanting, so
+they will be less likely to add new code using a less secure string library
+to add functionality that might be missing from Bstrlib.
+
+That said there are a number of security ideas not addressed by Bstrlib:
+
+1. Race condition exploitation (i.e., verifying a string's contents, then
+raising the privilege level and executing it as a shell command as two
+non-atomic steps) is well beyond the scope of what Bstrlib can provide.  It
+should be noted that MFC's built-in string mutex actually does not solve this
+problem either -- it just removes immediate data corruption as a possible
+outcome of such exploit attempts (it can be argued that this is worse, since
+it will leave no trace of the exploitation).  In general race conditions have
+to be dealt with by careful design and implementation; it cannot be assisted
+by a string library.
+
+2. Any kind of access control or security attributes to prevent usage in
+dangerous interfaces such as system().  Perl includes a "trust" attribute
+which can be endowed upon strings that are intended to be passed to such
+dangerous interfaces.  However, Perl's solution reflects its own limitations
+-- notably that it is not a strongly typed language.  In the example code for
+Bstrlib, there is a module called taint.cpp.  It demonstrates how to write a
+simple wrapper class for managing "untainted" or trusted strings using the
+type system to prevent questionable mixing of ordinary untrusted strings with
+untainted ones then passing them to dangerous interfaces.  In this way the
+security correctness of the code reduces to auditing the direct usages of
+dangerous interfaces or promotions of tainted strings to untainted ones.
+
+3. Encryption of string contents is way beyond the scope of Bstrlib.
+Maintaining encrypted string contents in the futile hopes of thwarting things
+like using system-level debuggers to examine sensitive string data is likely
+to be a wasted effort (imagine a debugger that runs at a higher level than a
+virtual processor where the application runs).  For more standard encryption
+usages, since the bstring contents are simply binary blocks of data, this
+should pose no problem for usage with other standard encryption libraries.
+
+Compatibility
+-------------
+
+The Better String Library is known to compile and function correctly with the
+following compilers:
+
+  - Microsoft Visual C++
+  - Watcom C/C++
+  - Intel's C/C++ compiler (Windows)
+  - The GNU C/C++ compiler (cygwin and Linux on PPC64)
+  - Borland C
+  - Turbo C
+
+Setting of configuration options should be unnecessary for these compilers
+(unless exceptions are being disabled or STLport has been added to WATCOM
+C/C++).  Bstrlib has been developed with an emphasis on portability.  As such
+porting it to other compilers should be straight forward.  This package
+includes a porting guide (called porting.txt) which explains what issues may
+exist for porting Bstrlib to different compilers and environments.
+
+ANSI issues
+-----------
+
+1. The function pointer types bNgetc and bNread have prototypes which are very
+similar to, but not exactly the same as fgetc and fread respectively.
+Basically the FILE * parameter is replaced by void *.  The purpose of this
+was to allow one to create other functions with fgetc and fread like
+semantics without being tied to ANSI C's file streaming mechanism.  I.e., one
+could very easily adapt it to sockets, or simply reading a block of memory,
+or procedurally generated strings (for fractal generation, for example.)
+
+The problem is that invoking the functions (bNgetc)fgetc and (bNread)fread is
+not technically legal in ANSI C.  The reason being that the compiler is only
+able to coerce the function pointers themselves into the target type, however
+it is unable to perform any cast (implicit or otherwise) on the parameters
+passed once invoked.  I.e., if internally void * and FILE * need some kind of
+mechanical coercion, the compiler will not properly perform this conversion
+and thus lead to undefined behavior.
+
+Apparently a platform from Data General called "Eclipse" and another from
+Tandem called "NonStop" have a different representation for pointers to bytes
+and pointers to words, for example, where coercion via casting is necessary.
+(Actual confirmation of the existence of such machines is hard to come by, so
+it is prudent to be skeptical about this information.)  However, this is not
+an issue for any known contemporary platforms.  One may conclude that such
+platforms are effectively apocryphal even if they do exist.
+
+To correctly work around this problem to the satisfaction of the ANSI
+limitations, one needs to create wrapper functions for fgetc and/or
+fread with the prototypes of bNgetc and/or bNread respectively which performs
+no other action other than to explicitly cast the void * parameter to a
+FILE *, and simply pass the remaining parameters straight to the function
+pointer call.
+
+The wrappers themselves are trivial:
+
+    size_t freadWrap (void * buff, size_t esz, size_t eqty, void * parm) {
+        return fread (buff, esz, eqty, (FILE *) parm);
+    }
+
+    int fgetcWrap (void * parm) {
+        return fgetc ((FILE *) parm);
+    }
+
+These have not been supplied in bstrlib or bstraux to prevent unnecessary
+linking with file I/O functions.
+
+2. vsnprintf is not available on all compilers.  Because of this, the bformat
+and bformata functions (and format and formata methods) are not guaranteed to
+work properly.  For those compilers that don't have vsnprintf, the
+BSTRLIB_NOVSNP macro should be set before compiling bstrlib, and the format
+functions/method will be disabled.
+
+The more recent ANSI C standards have specified the required inclusion of a
+vsnprintf function.
+
+3. The bstrlib function names are not unique in the first 6 characters.  This
+is only an issue for older C compiler environments which do not store more
+than 6 characters for function names.
+
+4. The bsafe module defines macros and function names which are part of the
+C library.  This simply overrides the definition as expected on all platforms
+tested, however it is not sanctioned by the ANSI standard.  This module is
+clearly optional and should be omitted on platforms which disallow its
+undefined semantics.
+
+In practice the real issue is that some compilers in some modes of operation
+can/will inline these standard library functions on a module by module basis
+as they appear in each.  The linker will thus have no opportunity to override
+the implementation of these functions for those cases.  This can lead to
+inconsistent behaviour of the bsafe module on different platforms and
+compilers.
+
+===============================================================================
+
+Comparison with Microsoft's CString class
+-----------------------------------------
+
+Although developed independently, CBStrings have very similar functionality to
+Microsoft's CString class.  However, the bstring library has significant
+advantages over CString:
+
+1. Bstrlib is a C-library as well as a C++ library (using the C++ wrapper).
+
+    - Thus it is compatible with more programming environments and
+      available to a wider population of programmers.
+
+2. The internal structure of a bstring is considered exposed.
+
+    - A single contiguous block of data can be cut into read-only pieces by
+      simply creating headers, without allocating additional memory to create
+      reference copies of each of these sub-strings.
+    - In this way, using bstrings in a totally abstracted way becomes a choice
+      rather than an imposition.  Further this choice can be made differently
+      at different layers of applications that use it.
+
+3. Static declaration support precludes the need for constructor
+   invocation.
+
+    - Allows for static declarations of constant strings that have no
+      additional constructor overhead.
+
+4. Bstrlib is not attached to another library.
+
+    - Bstrlib is designed to be easily plugged into any other library
+      collection, without dependencies on other libraries or paradigms (such
+      as "MFC".)
+
+The bstring library also comes with a few additional functions that are not
+available in the CString class:
+
+    - bsetstr
+    - bsplit
+    - bread
+    - breplace (this is different from CString::Replace())
+    - Writable indexed characters (for example a[i]='x')
+
+Interestingly, although Microsoft did implement mid$(), left$() and right$()
+functional analogues (these are functions from GWBASIC) they seem to have
+forgotten that mid$() could be also used to write into the middle of a string.
+This functionality exists in Bstrlib with the bsetstr() and breplace()
+functions.
+
+Among the disadvantages of Bstrlib is that there is no special support for
+localization or wide characters.  Such things are considered beyond the scope
+of what bstrings are trying to deliver.  CString essentially supports the
+older UCS-2 version of Unicode via wchar_t as an application-wide compile
+time switch.
+
+CStrings also use built-in mechanisms for ensuring thread safety under all
+situations.  While this makes writing thread safe code that much easier, this
+built-in safety feature has a price -- the inner loops of each CString method
+runs in its own critical section (grabbing and releasing a light weight mutex
+on every operation.)  The usual way to decrease the impact of a critical
+section performance penalty is to amortize more operations per critical
+section.  But since the implementation of CStrings is fixed as a one critical
+section per-operation cost, there is no way to leverage this common
+performance enhancing idea.
+
+The search facilities in Bstrlib are comparable to those in MFC's CString
+class, though it is missing locale specific collation.  But because Bstrlib
+is interoperable with C's char buffers, it will allow programmers to write
+their own string searching mechanism (such as Boyer-Moore), or be able to
+choose from a variety of available existing string searching libraries (such
+as those for regular expressions) without difficulty.
+
+Microsoft used a very non-ANSI conforming trick in its implementation to
+allow printf() to use the "%s" specifier to output a CString correctly.  This
+can be convenient, but it is inherently not portable.  CBString requires an
+explicit cast, while bstring requires the data member to be dereferenced.
+Microsoft's own documentation recommends casting, instead of relying on this
+feature.
+
+Comparison with C++'s std::string
+---------------------------------
+
+This is the C++ language's standard STL based string class.
+
+1. There is no C implementation.
+2. The [] operator is not bounds checked.
+3. Missing a lot of useful functions like printf-like formatting.
+4. Some sub-standard std::string implementations (SGI) are necessarily unsafe
+   to use with multithreading.
+5. Limited by STL's std::iostream which in turn is limited by ifstream which
+   can only take input from files.  (Compare to CBStream's API which can take
+   abstracted input.)
+6. Extremely uneven performance across implementations.
+
+Comparison with ISO C TR 24731 proposal
+---------------------------------------
+
+Following the ISO C99 standard, Microsoft has proposed a group of C library
+extensions which are supposedly "safer and more secure".  This proposal is
+expected to be adopted by the ISO C standard which follows C99.
+
+The proposal reveals itself to be very similar to Microsoft's "StrSafe"
+library. The functions are basically the same as other standard C library
+string functions except that destination parameters are paired with an
+additional length parameter of type rsize_t.  rsize_t is the same as size_t,
+however, the range is checked to make sure it's between 1 and RSIZE_MAX.  Like
+Bstrlib, the functions perform a "parameter check".  Unlike Bstrlib, when a
+parameter check fails, rather than simply outputting accumulatable error
+statuses, they call a user settable global error function handler, and upon
+return of control performs no (additional) detrimental action.  The proposal
+covers basic string functions as well as a few non-reentrant functions
+(asctime, ctime, and strtok).
+
+1. Still based solely on char * buffers (and therefore strlen() and strcat()
+   is still O(n), and there are no faster streq() comparison functions.)
+2. No growable string semantics.
+3. Requires manual buffer length synchronization in the source code.
+4. No attempt to enhance functionality of the C library.
+5. Introduces a new error scenario (strings exceeding RSIZE_MAX length).
+
+The hope is that by exposing the buffer length requirements there will be
+fewer buffer overrun errors.  However, the error modes are really just
+transformed, rather than removed.  The real problem of buffer overflows is
+that they all happen as a result of erroneous programming.  So forcing
+programmers to manually deal with buffer limits, will make them more aware of
+the problem but doesn't remove the possibility of erroneous programming.  So
+a programmer that erroneously mixes up the rsize_t parameters is no better off
+than a programmer that introduces potential buffer overflows through other
+more typical lapses.  So at best this may reduce the rate of erroneous
+programming, rather than making any attempt at removing failure modes.
+
+The error handler can discriminate between types of failures, but does not
+take into account any callsite context.  So the problem is that the error is
+going to be manifest in a piece of code, but there is no pointer to that
+code.  It would seem that passing in the call site __FILE__, __LINE__ as
+parameters would be very useful, but the API clearly doesn't support such a
+thing (it would increase code bloat even more than the extra length
+parameter does, and would require macro tricks to implement).
+
+The Bstrlib C API takes the position that error handling needs to be done at
+the callsite, and just tries to make it as painless as possible.  Furthermore,
+error modes are removed by supporting auto-growing strings and aliasing.  For
+capturing errors in more central code fragments, Bstrlib's C++ API uses
+exception handling extensively, which is superior to the leaf-only error
+handler approach.
+
+Comparison with Managed String Library CERT proposal
+----------------------------------------------------
+
+The main webpage for the managed string library:
+http://www.cert.org/secure-coding/managedstring.html
+
+Robert Seacord at CERT has proposed a C string library that he calls the
+"Managed String Library" for C. Like Bstrlib, it introduces a new type
+which is called a managed string. The structure of a managed string
+(string_m) is like a struct tagbstring but missing the length field.  This
+internal structure is considered opaque. The length is, like the C standard
+library, always computed on the fly by searching for a terminating NUL on
+every operation that requires it. So it suffers from every performance
+problem that the C standard library suffers from. Interoperating with C
+string APIs (like printf, fopen, or anything else that takes a string
+parameter) requires copying to additionally allocated buffers that have to
+be manually freed -- this makes this library probably slower and more
+cumbersome than any other string library in existence.
+
+The library gives a fully populated error status as the return value of every
+string function.  The hope is to be able to diagnose all problems
+specifically from the return code alone.  Comparing this to Bstrlib, which
+always returns one consistent error message, might make it seem that Bstrlib
+would be harder to debug; but this is not true.  With Bstrlib, if an error
+occurs there is always enough information from just knowing there was an error
+and examining the parameters to deduce exactly what kind of error has
+happened.  The managed string library thus gives up nested function calls
+while achieving little benefit, while Bstrlib does not.
+
+One interesting feature that "managed strings" has is the idea of data
+sanitization via character set whitelisting.  That is to say, a globally
+definable filter that makes any attempt to put invalid characters into strings
+lead to an error and not modify the string.  The author gives the following
+example:
+
+    // create valid char set
+    if (retValue = strcreate_m(&str1, "abc") ) {
+      fprintf(
+        stderr,
+        "Error %d from strcreate_m.\n",
+        retValue
+      );
+    }
+    if (retValue = setcharset(str1)) {
+      fprintf(
+        stderr,
+        "Error %d from  setcharset().\n",
+        retValue
+      );
+    }
+    if (retValue = strcreate_m(&str1, "aabbccabc")) {
+      fprintf(
+        stderr,
+        "Error %d from strcreate_m.\n",
+        retValue
+      );
+    }
+    // create string with invalid char set
+    if (retValue = strcreate_m(&str1, "abbccdabc")) {
+      fprintf(
+        stderr,
+        "Error %d from strcreate_m.\n",
+        retValue
+      );
+    }
+
+Which we can compare with a more Bstrlib way of doing things:
+
+    bstring bCreateWithFilter (const char * cstr, const_bstring filter) {
+      bstring b = bfromcstr (cstr);
+      if (BSTR_ERR != bninchr (b, filter) && NULL != b) {
+        fprintf (stderr, "Filter violation.\n");
+        bdestroy (b);
+        b = NULL;
+      }
+      return b;
+    }
+
+    struct tagbstring charFilter = bsStatic ("abc");
+    bstring str1 = bCreateWithFilter ("aabbccabc", &charFilter);
+    bstring str2 = bCreateWithFilter ("aabbccdabc", &charFilter);
+
+The first thing we should notice is that with the Bstrlib approach you can
+have different filters for different strings if necessary.  Furthermore,
+selecting a charset filter in the Managed String Library is uni-contextual.
+That is to say, there can only be one such filter active for the entire
+program, which means its usage is not well defined for intermediate library
+usage (a library that uses it will interfere with user code that uses it, and
+vice versa.)  It is also likely to be poorly defined in multi-threading
+environments.
+
+There is also a question as to whether the data sanitization filter is checked
+on every operation, or just on creation operations.  Since the charset can be
+set arbitrarily at run time, it might be set *after* some managed strings have
+been created.  This would seem to imply that all functions should run this
+additional check every time if there is an attempt to enforce this.  This
+would make things tremendously slow.  On the other hand, if it is assumed that
+only creates and other operations that take char *'s as input need be checked
+because the charset was only supposed to be called once at and before any
+other managed string was created, then one can see that it's easy to cover
+Bstrlib with equivalent functionality via a few wrapper calls such as the
+example given above.
+
+And finally we have to question the value of sanitation in the first place.
+For example, for httpd servers, there is generally a requirement that the
+URLs parsed have some form that avoids undesirable translation to local file
+system filenames or resources.  The problem is that the way URLs can be
+encoded, it must be completely parsed and translated to know if it is using
+certain invalid character combinations.  That is to say, merely filtering
+each character one at a time is not necessarily the right way to ensure that
+a string has safe contents.
+
+In the article that describes this proposal, it is claimed that it fairly
+closely approximates the existing C API semantics.  On this point we should
+compare this "closeness" with Bstrlib:
+
+                      Bstrlib                     Managed String Library
+                      -------                     ----------------------
+
+Pointer arithmetic    Segment arithmetic          N/A
+
+Use in C Std lib      ->data, or bdata{e}         getstr_m(x,*) ... free(x)
+
+String literals       bsStatic, bsStaticBlk       strcreate_m()
+
+Transparency          Complete                    None
+
+It's pretty clear that the semantic mapping from C strings to Bstrlib is fairly
+straightforward, and that in general semantic capabilities are the same or
+superior in Bstrlib.  On the other hand the Managed String Library is either
+missing semantics or changes things fairly significantly.
+
+Comparison with Annexia's c2lib library
+---------------------------------------
+
+This library is available at:
+http://www.annexia.org/freeware/c2lib
+
+1. Still based solely on char * buffers (and therefore strlen() and strcat()
+   is still O(n), and there are no faster streq() comparison functions.)
+   Their suggestion that alternatives which wrap the string data type (such as
+   bstring does) imposes a difficulty in interoperating with the C language's
+   ordinary C string library is not founded.
+2. Introduction of memory (and vector?) abstractions imposes a learning
+   curve, and some kind of memory usage policy that is outside of the strings
+   themselves (and therefore must be maintained by the developer.)
+3. The API is massive, and filled with all sorts of trivial (pjoin) and
+   controversial (pmatch -- regular expressions are not sufficiently
+   standardized, and there is a very large difference in performance between
+   compiled and non-compiled REs) functions.  Bstrlib takes a decidedly
+   minimal approach -- none of the functionality in c2lib is difficult or
+   challenging to implement on top of Bstrlib (except the regex stuff, which
+   is going to be difficult, and controversial no matter what.)
+4. Understanding why c2lib is the way it is pretty much requires a working
+   knowledge of Perl.  bstrlib requires only knowledge of the C string library
+   while providing just a very select few worthwhile extras.
+5. It is attached to a lot of cruft like a matrix math library (that doesn't
+   include any functions for getting the determinant, eigenvectors,
+   eigenvalues, the matrix inverse, test for singularity, test for
+   orthogonality, a Gram-Schmidt orthogonalization, LU decomposition ... I
+   mean why bother?)
+
+Convincing a development house to use c2lib is likely quite difficult.  It
+introduces too much, while not being part of any kind of standards body.  The
+code must therefore be trusted, or maintained by those that use it.  While
+bstring offers nothing more on this front, since it's so much smaller, covers
+far less in terms of scope, and will typically improve string performance,
+the barrier to usage should be much smaller.
+
+Comparison with stralloc/qmail
+------------------------------
+
+More information about this library can be found here:
+http://www.canonical.org/~kragen/stralloc.html or here:
+http://cr.yp.to/lib/stralloc.html
+
+1. Library is very very minimal.  A little too minimal.
+2. Untargeted source parameters are not declared const.
+3. Slightly different expected emphasis (like _cats function which takes an
+   ordinary C string char buffer as a parameter.)  It's clear that the
+   remainder of the C string library is still required to perform more
+   useful string operations.
+
+The struct declaration for their string header is essentially the same as that
+for bstring.  But it's clear that this was a quickly written hack whose goals
+are clearly a subset of what Bstrlib supplies.  For anyone who is served by
+stralloc, Bstrlib is a complete substitute that just adds more functionality.
+
+stralloc actually uses the interesting policy that a NULL data pointer
+indicates an empty string.  In this way, non-static empty strings can be
+declared without construction.  This advantage is minimal, since static empty
+bstrings can be declared inline without construction, and if the string needs
+to be written to it should be constructed from an empty string (or its first
+initializer) in any event.
+
+wxString class
+--------------
+
+This is the string class used in the wxWindows project.  A description of
+wxString can be found here:
+http://www.wxwindows.org/manuals/2.4.2/wx368.htm#wxstring
+
+This C++ library is similar to CBString.  However, it is littered with
+trivial functions (IsAscii, UpperCase, RemoveLast etc.)
+
+1. There is no C implementation.
+2. The memory management strategy is to allocate a bounded fixed amount of
+   additional space on each resize, meaning that it does not have the
+   log_2(n) property that Bstrlib has (it will thrash very easily, cause
+   massive fragmentation in common heap implementations, and can easily be a
+   common source of performance problems).
+3. The library uses a "copy on write" strategy, meaning that it has to deal
+   with multithreading problems.
+
+Vstr
+----
+
+This is a highly orthogonal C string library with an emphasis on
+networking/realtime programming.  It can be found here:
+http://www.and.org/vstr/
+
+1. The convoluted internal structure does not contain a '\0' char * compatible
+   buffer, so interoperability with the C library is a non-starter.
+2. The API and implementation is very large (owing to its orthogonality) and
+   can lead to difficulty in understanding its exact functionality.
+3. An obvious dependency on gnu tools (confusing make configure step)
+4. Uses a reference counting system, meaning that it is not likely to be
+   thread safe.
+
+The implementation has an extreme emphasis on performance for nontrivial
+actions (adds, inserts and deletes are all constant or roughly O(#operations)
+time) following the "zero copy" principle.  This trades off performance of
+trivial functions (character access, char buffer access/coercion, alias
+detection) which becomes significantly slower, as well as incremental
+accumulative costs for its searching/parsing functions.  Whether or not Vstr
+wins any particular performance benchmark will depend a lot on the benchmark,
+but it should handily win on some, while losing dreadfully on others.
+
+The learning curve for Vstr is very steep, and it doesn't come with any
+obvious way to build for Windows or other platforms without gnu tools.  At
+least one mechanism (the iterator) introduces a new undefined scenario
+(writing to a Vstr while iterating through it.)  Vstr has a very large
+footprint, and is very ambitious in its total functionality.  Vstr has no C++
+API.
+
+Vstr usage requires context initialization via vstr_init() which must be run
+in a thread-local context.  Given the totally reference based architecture
+this means that sharing Vstrings across threads is not well defined, or at
+least not safe from race conditions.  This API is clearly geared to the older
+standard of fork() style multitasking in UNIX, and is not safely transportable
+to modern shared memory multithreading available in Linux and Windows.  There
+is no portable external solution making the library thread safe (since it
+requires a mutex around each Vstr context -- not each string.)
+
+In the documentation for this library, a big deal is made of its self hosted
+s(n)printf-like function.  This is an issue for older compilers that don't
+include vsnprintf(), but also an issue because Vstr has a slow conversion to
+'\0' terminated char * mechanism.  That is to say, using "%s" to format data
+that originates from Vstr would be slow without some sort of native function
+to do so.  Bstrlib sidesteps the issue by relying on what snprintf-like
+functionality does exist and having a high performance conversion to a char *
+compatible string so that "%s" can be used directly.
+
+Str Library
+-----------
+
+This is a fairly extensive string library, that includes full unicode support
+and is targeted at the goal of outperforming MFC and STL.  The architecture,
+similarly to MFC's CStrings, is a copy on write reference counting mechanism.
+
+http://www.utilitycode.com/str/default.aspx
+
+1. Commercial.
+2. C++ only.
+
+This library, like Vstr, uses a ref counting system.  There is only so deeply
+I can analyze it, since I don't have a license for it.  However, performance
+improvements over MFC and STL don't seem like a sufficient reason to
+move your source base to it.  For example, in the future, Microsoft may
+improve the performance of CString.
+
+It should be pointed out that performance testing of Bstrlib has indicated
+that its relative performance advantage versus MFC's CString and STL's
+std::string is at least as high as that for the Str library.
+
+libmib astrings
+---------------
+
+A handful of functional extensions to the C library that add dynamic string
+functionality.
+http://www.mibsoftware.com/libmib/astring/
+
+This package basically references strings through char ** pointers and assumes
+they are pointing to the top of an allocated heap entry (or NULL, in which
+case memory will be newly allocated from the heap.)  So it's still up to the
+user to mix and match the older C string functions with these functions whenever
+pointer arithmetic is used (i.e., there is no leveraging of the type system
+to assert semantic differences between references and base strings as Bstrlib
+does since no new types are introduced.)  Unlike Bstrlib, exact string length
+meta data is not stored, thus requiring a strlen() call on *every* string
+writing operation.  The library is very small, covering only a handful of C's
+functions.
+
+While this is better than nothing, it is clearly slower than even the
+standard C library, less safe and less functional than Bstrlib.
+
+To explain the advantage of using libmib, their website shows an example of
+how dangerous C code:
+
+    char buf[256];
+    char *pszExtraPath = ";/usr/local/bin";
+
+    strcpy(buf,getenv("PATH")); /* oops! could overrun! */
+    strcat(buf,pszExtraPath); /* Could overrun as well! */
+
+    printf("Checking...%s\n",buf); /* Some printfs overrun too! */
+
+is avoided using libmib:
+
+    char *pasz = 0;      /* Must initialize to 0 */
+    char *paszOut = 0;
+    char *pszExtraPath = ";/usr/local/bin";
+
+    if (!astrcpy(&pasz,getenv("PATH"))) /* malloc error */ exit(-1);
+    if (!astrcat(&pasz,pszExtraPath)) /* malloc error */ exit(-1);
+
+    /* Finally, a "limitless" printf! we can use */
+    asprintf(&paszOut,"Checking...%s\n",pasz);fputs(paszOut,stdout);
+
+    astrfree(&pasz); /* Can use free(pasz) also. */
+    astrfree(&paszOut);
+
+However, compare this to Bstrlib:
+
+    bstring b, out;
+
+    bcatcstr (b = bfromcstr (getenv ("PATH")), ";/usr/local/bin");
+    out = bformat ("Checking...%s\n", bdatae (b, "<Out of memory>"));
+    /* if (out && b) */ fputs (bdatae (out, "<Out of memory>"), stdout);
+    bdestroy (b);
+    bdestroy (out);
+
+Besides being shorter, we can see that error handling can be deferred right
+to the very end.  Also, unlike the above two versions, if getenv() returns
+with NULL, the Bstrlib version will not exhibit undefined behavior.
+Initialization starts with the relevant content rather than an extra
+autoinitialization step.
+
+libclc
+------
+
+An attempt to add to the standard C library with a number of common useful
+functions, including additional string functions.
+http://libclc.sourceforge.net/
+
+1. Uses standard char * buffer, and adopts C 99's usage of "restrict" to pass
+   the responsibility to guard against aliasing to the programmer.
+2. Adds no safety or memory management whatsoever.
+3. Most of the supplied string functions are completely trivial.
+
+The goals of libclc and Bstrlib are clearly quite different.
+
+fireString
+----------
+
+http://firestuff.org/
+
+1. Uses standard char * buffer, and adopts C 99's usage of "restrict" to pass
+   the responsibility to guard against aliasing to the programmer.
+2. Mixes char * and length wrapped buffers (estr) functions, doubling the API
+   size, with safety limited to only half of the functions.
+
+Firestring was originally just a wrapper of char * functionality with extra
+length parameters.  However, it has been augmented with the inclusion of the
+estr type which has similar functionality to stralloc.  But firestring does
+not nearly cover the functional scope of Bstrlib.
+
+Safe C String Library
+---------------------
+
+A library written for the purpose of increasing safety and power to C's string
+handling capabilities.
+http://www.zork.org/safestr/safestr.html
+
+1. While the safestr_* functions are safe in and of themselves, interoperating
+   with char * strings has dangerous unsafe modes of operation.
+2. The architecture of safestr's causes the base pointer to change.  Thus,
+   it's not practical/safe to store a safestr in multiple locations if any
+   single instance can be manipulated.
+3. Dependent on an additional error handling library.
+4. Uses reference counting, meaning that it is either not thread safe or
+   slow and not portable.
+
+I think the idea of reallocating (and hence potentially changing) the base
+pointer is a serious design flaw that is fatal to this architecture.  True
+safety is obtained by having automatic handling of all common scenarios
+without creating implicit constraints on the user.
+
+Because of its automatic temporary clean up system, it cannot use "const"
+semantics on input arguments.  Interesting anomalies such as:
+
+    safestr_t s, t;
+    s = safestr_replace (t = SAFESTR_TEMP ("This is a test"),
+                         SAFESTR_TEMP (" "), SAFESTR_TEMP ("."));
+    /* t is now undefined. */
+
+are possible.  If one defines a function which takes a safestr_t as a
+parameter, then the function would not know whether or not the safestr_t is
+defined after it passes it to a safestr library function.  The author
+recommended method for working around this problem is to examine the
+attributes of the safestr_t within the function which is to modify any of
+its parameters and play games with its reference count.  I think, therefore,
+that the whole SAFESTR_TEMP idea is also fatally broken.
+
+The library implements immutability, optional non-resizability, and a "trust"
+flag.  This trust flag is interesting, and suggests that applying any
+arbitrary sequence of safestr_* function calls on any set of trusted strings
+will result in a trusted string.  It seems to me, however, that if one wanted
+to implement a trusted string semantic, one might do so by actually creating
+a different *type* and only implement the subset of string functions that are
+deemed safe (i.e., user input would be excluded, for example.)  This, in
+essence, would allow the compiler to enforce trust propagation at compile
+time rather than run time.  Non-resizability is also interesting, however,
+it seems marginal (i.e., to want a string that cannot be resized, yet can be
+modified and yet where a fixed sized buffer is undesirable.)
+
+===============================================================================
+
+Examples
+--------
+
+    Dumping a line numbered file:
+
+    FILE * fp;
+    int i, ret;
+    struct bstrList * lines;
+    struct tagbstring prefix = bsStatic ("-> ");
+
+    if (NULL != (fp = fopen ("bstrlib.txt", "rb"))) {
+        bstring b = bread ((bNread) fread, fp);
+        fclose (fp);
+        if (NULL != (lines = bsplit (b, '\n'))) {
+            for (i=0; i < lines->qty; i++) {
+                binsert (lines->entry[i], 0, &prefix, '?');
+                printf ("%04d: %s\n", i, bdatae (lines->entry[i], "NULL"));
+            }
+            bstrListDestroy (lines);
+        }
+        bdestroy (b);
+    }
+
+For numerous other examples, see bstraux.c, bstraux.h and the example archive.
+
+===============================================================================
+
+License
+-------
+
+The Better String Library is available under either the BSD license (see the
+accompanying license.txt) or the Gnu Public License version 2 (see the
+accompanying gpl.txt) at the option of the user.
+
+===============================================================================
+
+Acknowledgements
+----------------
+
+The following individuals have made significant contributions to the design
+and testing of the Better String Library:
+
+Bjorn Augestad
+Clint Olsen
+Darryl Bleau
+Fabian Cenedese
+Graham Wideman
+Ignacio Burgueno
+International Business Machines Corporation
+Ira Mica
+John Kortink
+Manuel Woelker
+Marcel van Kervinck
+Michael Hsieh
+Richard A. Smith
+Simon Ekstrom
+Wayne Scott
+
+===============================================================================
diff --git a/doc/likwid-accessD.1 b/doc/likwid-accessD.1
index 7d444af..df66b51 100644
--- a/doc/likwid-accessD.1
+++ b/doc/likwid-accessD.1
@@ -1,7 +1,7 @@
 .TH LIKWID-ACCESSD 1 <DATE> likwid\-<VERSION>
 .SH NAME
 likwid-accessD \- This tool forwards the access operations from LIKWID PerfMon tools
-to the MSR device files
+to the MSR and PCI device files
 .SH DESCRIPTION
 .B likwid-accessD
 is a command line application that opens a UNIX file socket and waits for access
@@ -9,14 +9,14 @@ operations from LIKWID tools that require access to the MSR and PCI device
 files. The MSR and PCI device files are only accessible for users with root
 privileges, therefore
 .B likwid-accessD
-requires the suid-bit set.
+requires the suid-bit set or a suitable libcap setting.
 Depending on the current system architecture,
 .B likwid-accessD
 permits only access to registers defined for the architecture.
 
 .SH AUTHOR
-Written by Thomas Roehl <thomas.roehl at gmail.com>.
+Written by Thomas Roehl <thomas.roehl at googlemail.com>.
 .SH BUGS
-Report Bugs on <http://code.google.com/p/likwid/issues/list>.
+Report Bugs on <https://github.com/rrze-likwid/likwid/issues>.
 .SH "SEE ALSO"
-likwid-perfctr(1), likwid-powermeter(1), likwid-features(1), likwid-pin(1), likwid-topology(1),
+likwid-perfctr(1), likwid-powermeter(1), likwid-features(1)
diff --git a/doc/likwid-agent.1 b/doc/likwid-agent.1
new file mode 100644
index 0000000..08ec833
--- /dev/null
+++ b/doc/likwid-agent.1
@@ -0,0 +1,94 @@
+.TH LIKWID-AGENT 1 <DATE> likwid\-<VERSION>
+.SH NAME
+likwid-agent \- monitoring daemon for hardware performance counters
+.SH SYNOPSIS
+.B likwid-agent <config_file>
+.SH DESCRIPTION
+.B likwid-agent
+is a daemon application that uses
+.B likwid-perfctr(1)
+to measure hardware performance counters. The basic configuration is in a global configuration file. The configuration of the hardware event sets is done with extra files suitable for each architecture. Besides the hardware event configuration, the raw data can be transformed using formulas into metrics of interest. To avoid outputting too much data, the data can be further filtered or aggregated.
+.B likwid-agent
+provides multiple store backends like logfiles, RRD (Round Robin Database) or gmetric (Ganglia Monitoring System).
+
+.SH CONFIG FILE
+The global configuration file has the following options:
+.TP
+.B GROUPPATH <path>
+Path to the group files containing event set and output definitions. See section
+.B GROUP FILES
+for information.
+.TP
+.B EVENTSET <group1> <group2> ...
+Space separated list of groups (without .txt) that should be monitored.
+.TP
+.B DURATION <time>
+Measurement duration in seconds.
+.TP
+.B LOGPATH <path>
+Specify a logfile.
+.TP
+.B GMETRIC <True/False>
+Activates the output to gmetric.
+.TP
+.B GMETRICPATH <path>
+Set path to the gmetric executable.
+.TP
+.B GMETRICCONFIG <path>
+Set a custom configuration file for gmetric, if needed.
+.TP
+.B RRD <True/False>
+Activates the output to RRD files (Round Robin Database).
+.TP
+.B RRDPATH <path>
+Output path for the RRD files. The files are named according to the group and each output metric is saved as DS with function GAUGE. The RRD is configured with RRA entries to store average, minimum and maximum of 10 minutes for one hour, of 60 min for one day and daily data for one month.
+.TP
+.B SYSLOG <True/False>
+Activates the output to system log using logger.
+.TP
+.B SYSLOGPRIO <prio>
+Set the priority string for logger, default is 'local0.notice'.
+
+.SH GROUP FILES
+The group files are adapted performance group files as used by
+.B likwid-perfctr(1).
+This makes it easy to use the predefined and often used performance groups as a basis for the monitoring. The folder structure for the groups is
+.B <GROUPPATH>/<SHORT_ARCH_NAME>/
+with
+.B <SHORT_ARCH_NAME>
+similar to the ones for the performance groups, like 'sandybridge' or 'haswellEP'.
+.TP
+.B SHORT <string>
+A short descriptive information about the group.
+.TP
+.B EVENTSET
+.TP
+.B <counter1> <event1>
+.TP
+.B <counter2>:<option> <event2>
+Definition of the eventset similar to the performance groups.
+.TP
+.B METRICS
+.TP
+.B <metricname> <formula>
+.TP
+.B <filter> <metricname> <formula>
+Definition of the output metrics. The syntax follows the
+.B METRICS
+definition of the performance groups as used by
+.B likwid-perfctr(1).
+If no function is set at the beginning of the line,
+.B <formula>
+is evaluated for every CPU and sent to the output backends. The
+.B <metricname>
+gets the prefix "T<cpuid> ". To avoid writing too much data to the backends, the data can be reduced by
+.B <filter>.
+The possible filter options are MIN, MAX, AVG, SUM, ONCE. The ONCE filter sends only the data from the first CPU to the output backends commonly used for the measurement duration.
+
+
+.SH AUTHOR
+Written by Thomas Roehl <thomas.roehl at googlemail.com>.
+.SH BUGS
+Report Bugs on <https://github.com/rrze-likwid/likwid/issues>.
+.SH "SEE ALSO"
+likwid-perfctr(1), rrdtool(1), gmetric(1)
diff --git a/doc/likwid-bench.1 b/doc/likwid-bench.1
index 45d0f6c..659bcfd 100644
--- a/doc/likwid-bench.1
+++ b/doc/likwid-bench.1
@@ -1,33 +1,32 @@
 .TH LIKWID-BENCH 1 <DATE> likwid\-<VERSION>
-.WARN
 .SH NAME
 likwid-bench \- low-level benchmark suite and microbenchmarking framework
 .SH SYNOPSIS
 .B likwid-bench
 .RB [\-hap]
-.RB [ \-l
-.IR <testname> ] 
-.RB [ \-i
-.IR <iterations> ]
-.RB [ \-g
-.IR <number_of_workgroups> ]
 .RB [ \-t
 .IR <testname> ]
+.RB [ \-s
+.IR <min_time> ]
 .RB [ \-w
 .IR <workgroup_expression> ]
+.RB [ \-l
+.IR <testname> ]
+.RB [ \-d
+.IR <delimiter> ]
 .SH DESCRIPTION
 .B likwid-bench
 is a benchmark suite for low-level (assembly) benchmarks to measure bandwidths and instruction throughput for specific instruction code on x86 systems. The currently included benchmark codes include common data access patterns like load and store but also calculations like vector triad and sum.
 .B likwid-bench
-includes architecture specific benchmarks for x86, x86_64 and x86 for Intel Xeon Phi coprocessors. The performance values can either be calculated by 
-.B likwid-bench 
-or measured using performance counters by using.
+includes architecture specific benchmarks for x86, x86_64 and x86 for Intel Xeon Phi coprocessors. The performance values can either be calculated by
+.B likwid-bench
+or measured using performance counters by using
 .B likwid-perfctr
 as a wrapper to
 .B likwid-bench.
 This requires to build
-.B likwid-bench.
-with Instrumentation which can be enabled in config.mk.
+.B likwid-bench
+with instrumentation enabled in config.mk.
 .SH OPTIONS
 .TP
 .B \-\^h
@@ -39,77 +38,130 @@ list available benchmark codes for the current system.
 .B \-\^p
 list available thread domains.
 .TP
-.B \-\^l " <testname>"
-list properties of a benchmark code.
+.B \-\^s <min_time>
+Run the benchmark for at least
+.B <min_time> seconds.
+The amount of iterations is determined using this value. Default: 1 second.
 .TP
-.B \-\^i " <iterations>"
-number of iterations to perform inside the benchmark code.
-.TP
-.B \-\^t " <testname>"
+.B \-\^t <testname>
 Name of the benchmark code to run (mandatory).
 .TP
-.B \-\^g " <number_of_workgroups>"
-specify the number of workgroups to perform the benchmark code on (mandatory).
-.TP
-.B \-\^w " <workgroup_expression>"
+.B \-\^w <workgroup_expression>
 Specify the affinity domain, thread count and data set size for the current benchmarking run (mandatory).
+.TP
+.B \-\^l <testname>
+list properties of a benchmark code.
+.TP
+.B \-\^i <iterations>
+Set the number of iterations (optional)
 
 .SH WORKGROUP SYNTAX
 
 .B <thread_domain>:<size> [:<num_threads>[:<chunk_size>:<stride>]] [-<streamId>:<domain_id>]
-with size in kB, MB or GB. Where thread domain is where threads are placed. Size is the total data set size for the benchmark. num_threads specifies how many threads are used. Threads are always placed using a compact policy in
+with size in kB, MB or GB. The
+.B <thread_domain>
+defines where the threads are placed.
+.B <size>
+is the total data set size for the benchmark, the allocated vectors in memory sum up to this size.
+.B <num_threads>
+specifies how many threads are used in the
+.B <thread_domain>.
+Threads are always placed using a compact policy in
 .B likwid-bench.
 This means that per default all SMT threads are used. Optionally similar a the expression based syntax in
 .B likwid-pin
-a chunk size and stride can be provided. Optionally for every stream means array the placement can be controlled. Per default all arrays are placed in the same thread domain the threads are running in. To place the data in a different domain for every stream of a benchmark case (the total number of streams  can be aquired by the \-l option) the domain to place the data in can be specified. Multiple streams are comma separated. Either the placement is provided or all streams have to be ex [...]
+a
+.B <chunk_size>
+and
+.B <stride>
+can be provided. Optionally for every stream (array, vector) the placement can be controlled. Per default all arrays are placed in the same
+.B <thread_domain>
+the threads are running in. To place the data in a different domain for every stream of a benchmark case (the total number of streams can be acquired by the
+.B \-l
+option) the domain to place the data in can be specified. Multiple streams are comma separated. Either the placement is provided or all streams have to be explicitly placed. Please refer to the Wiki pages on
 .B http://code.google.com/p/likwid/wiki/LikwidBench
 for further details and examples on usage.
 
 
 .SH EXAMPLE
 .IP 1. 4
-Run the copy benchmark with 1000 iterations on socket 0 with a total data set size of 100kB.
+Run the
+.B copy
+benchmark on socket 0 (
+.B S0
+) with a total data set size of
+.B 100kB.
 .TP
-.B likwid-bench -t copy -i 1000 -g 1 -w S0:100kB
+.B likwid-bench -t copy -w S0:100kB
 .PP
 Since no 
-.B num_thread
-is given in the workload expression, each core of socket 0 gets one thread. The workload is split up between all threads.
+.B <num_threads>
+is given in the workload expression, each core of socket 0 gets one thread. The workload is split up between all threads and the number of iterations is determined automatically.
 .IP 2. 4
-Run the triad benchmark code with 100 iterations with 2 threads on the socket 0 and a data size of 1 GB.
+Run the
+.B triad
+benchmark code with explicitly
+.B 100
+iterations with
+.B 2
+threads on the socket 0 (
+.B S0
+) and a data size of
+.B 1GB.
 .TP
-.B likwid-bench -t triad -i 100 -g 1 -w S0:1GB:2:1:2
+.B likwid-bench -t triad -i 100 -w S0:1GB:2:1:2
 .PP
-Assuming socket 0 has 4 SMT threads, one thread is assigned to each physical core of socket 0.
+Assuming socket 0 (
+.B S0
+) has 2 physical cores with SMT enabled, hence in total 4 hardware threads, one thread is assigned to each physical core of socket 0.
 .IP 3. 4
-Run the update benchmark with 1000 iterations on socket 0 with a workload of 100kB and on socket 1 with the same workload.
+Run the
+.B update
+benchmark on socket 0 (
+.B S0
+) with a workload of
+.B 100kB
+and on socket 1 (
+.B S1
+) with the same workload.
 .TP
-.B likwid-bench -t update -i 1000 -g 2 -w S0:100kB -w S1:100kB
+.B likwid-bench -t update -w S0:100kB -w S1:100kB
 .PP
 The results of both workgroups are combinded for the output. Hence the workload in each workgroup expression should have the same size.
 .IP 4. 4
-Run the copy benchmark but measure the memory traffic with
+Run the
+.B copy
+benchmark but measure the memory traffic with
 .B likwid-perfctr.
-The option INSTRUMENT_BENCH in config.mk needs to be true at compile time to use that feature.
+The option
+.B INSTRUMENT_BENCH
+in
+.B config.mk
+needs to be true at compile time to use that feature.
 .TP
-.B likwid-perfctr -C E:S0:4 -g MEM -m likwid-bench -t update -i 1000 -g 1 -w S0:100kB
+.B likwid-perfctr -c E:S0:4 -g MEM -m likwid-bench -t update -w S0:100kB
 .PP
-.B likwid-perfctr 
-will configure and start the performance counters on socket 0 with 4 threads prior to the execution of
+.B likwid-perfctr
+will configure and start the performance counters on socket 0 (
+.B S0
+) with 4 threads prior to the execution of
 .B likwid-bench.
-The performance counters are read right before and after running the benchmarking code to 
-minimize the interferences of the measurement.
+The performance counters are read right before and after running the benchmarking code to minimize the interferences of the measurement.
 .IP 5. 4
-Run the copy benchmark and place the data on other socket
+Run the
+.B copy
+benchmark and place the data on another socket
 .TP
-.B likwid-bench -t copy -i 50 -g 1 -w S0:1GB:10:1:2-0:S1,1:S1
+.B likwid-bench -t copy -w S0:1GB:10:1:2-0:S1,1:S1
 .PP
-Stream id 0 and 1 are placed in thread domains S1, which is socket 1. This can be verified as the initialization threads output where they are running.
+Stream id 0 and 1 are placed in thread domains
+.B S1,
+which is socket 1. This can be verified as the initialization threads output where they are running.
 
 
 .SH AUTHOR
-Written by Jan Treibig <jan.treibig at gmail.com>.
+Written by Thomas Roehl <thomas.roehl at googlemail.com>.
 .SH BUGS
-Report Bugs on <http://code.google.com/p/likwid/issues/list>.
+Report Bugs on <https://github.com/rrze-likwid/likwid/issues>.
 .SH SEE ALSO
-likwid-perfctr(1), likwid-pin(1), likwid-topology(1), likwid-features(1), likwid-setFrequencies(1)
+likwid-perfctr(1), likwid-pin(1), likwid-topology(1), likwid-setFrequencies(1)
diff --git a/doc/likwid-doxygen.md b/doc/likwid-doxygen.md
new file mode 100644
index 0000000..e5cbdac
--- /dev/null
+++ b/doc/likwid-doxygen.md
@@ -0,0 +1,262 @@
+/*! \mainpage LIKWID - Like I Knew What I Am Doing
+
+\section Introduction
+This is an effort to develop easy to use yet powerful performance tools for the GNU/Linux operating system. While the focus of LIKWID is on x86 processors, some of the tools are portable and not limited to any specific architecture. LIKWID follows the philosophy:
+- Simple
+- Efficient
+- Portable
+- Extensible
+
+\ref build
+
+\ref faq
+
+\section Tools LIKWID Tools
+- \ref likwid-topology : A tool to display the thread and cache topology on multicore/multisocket computers.
+- \ref likwid-pin : A tool to pin your threaded application without changing your code. Works for pthreads and OpenMP.
+- \ref likwid-perfctr : A tool to measure hardware performance counters on recent Intel and AMD processors. It can be used as wrapper application without modifying the profiled code or with a marker API to measure only parts of the code.
+- \ref likwid-powermeter : A tool for accessing RAPL counters and querying Turbo mode steps on Intel processors. RAPL counters are also available in \ref likwid-perfctr.
+- \ref likwid-setFrequencies : A tool to print and manage the clock frequency of CPU cores.
+- \ref likwid-agent : A monitoring agent for LIKWID with multiple output backends.
+- \ref likwid-memsweeper : A tool to cleanup ccNUMA domains and LLC caches to get a clean environment for benchmarks.
+- \ref likwid-bench : A benchmarking framework for streaming benchmark kernels written in assembly.
+- \ref likwid-genTopoCfg : A config file writer that gets system topology and writes them to file for faster LIKWID startup.
+<!-- - \ref likwid-features : A tool to toggle the prefetchers on Core 2 processors.-->
+
+Wrapper scripts using the basic likwid tools:
+- \ref likwid-mpirun : A wrapper script enabling simple and flexible pinning of MPI and MPI/threaded hybrid applications. With integrated \ref likwid-perfctr support.
+- \ref likwid-perfscope : A frontend application for the timeline mode of \ref likwid-perfctr that performs live plotting using gnuplot.
+
+LIKWID requires in most environments some daemon application to perform its operations with higher privileges:
+- \ref likwid-accessD : Daemon to perform MSR and PCI read/write operations with higher privileges.
+- \ref likwid-setFreq : Daemon to set the CPU frequencies with higher privileges.
+
+Optionally, a global configuration file \ref likwid.cfg can be given to modify some basic run time parameters of LIKWID.
+
+\section Library LIKWID Library
+\subsection C_Interface C/C++ Interface
+- \ref MarkerAPI
+- \ref AccessClient
+- \ref Config
+- \ref CPUTopology
+- \ref NumaTopology
+- \ref AffinityDomains
+- \ref PerfMon
+- \ref PowerMon
+- \ref ThermalMon
+- \ref TimerMon
+- \ref Daemon
+- \ref MemSweep
+
+\subsection Lua_Interface Lua Interface
+- \ref lua_Info
+- \ref lua_InputOutput
+- \ref lua_Config
+- \ref lua_Access
+- \ref lua_CPUTopology
+- \ref lua_NumaInfo
+- \ref lua_AffinityInfo
+- \ref lua_Perfmon
+- \ref lua_PowerInfo
+- \ref lua_ThermalInfo
+- \ref lua_Timer
+- \ref lua_MemSweep
+- \ref lua_Misc (Some functionality not provided by Lua natively)
+
+\subsection Fortran90_Interface Fortran90 Interface
+- \ref Fortran_Interface
+
+\section Architectures Supported Architectures
+\subsection Architectures_Intel Intel®
+- \subpage pentiumm
+- \subpage core2
+- \subpage atom
+- \subpage nehalem
+- \subpage nehalemex
+- \subpage westmere
+- \subpage westmereex
+- \subpage phi
+- \subpage silvermont
+- \subpage sandybridge
+- \subpage sandybridgeep
+- \subpage ivybridge
+- \subpage ivybridgeep
+- \subpage haswell
+- \subpage haswellep
+- \subpage broadwell
+
+\subsection Architectures_AMD AMD®
+- \subpage k8
+- \subpage k10
+- \subpage interlagos
+- \subpage kabini
+
+\section Examples Example Codes
+Using the Likwid API:
+- \ref C-likwidAPI-code
+- \ref Lua-likwidAPI-code
+
+Using the Marker API:
+- \ref C-markerAPI-code
+- \ref F-markerAPI-code
+
+If you have problems with LIKWID:<BR>
+GitHub: <A HREF="https://github.com/rrze-likwid/likwid">https://github.com/rrze-likwid/likwid</A><BR>
+Bugs: <A HREF="https://github.com/rrze-likwid/likwid/issues">https://github.com/rrze-likwid/likwid/issues</A><BR>
+Mailinglist: <A HREF="http://groups.google.com/group/likwid-users">http://groups.google.com/group/likwid-users</A><BR>
+*/
+
+
+/*! \page build Build and install instructions
+\section allg Introduction
+Likwid is built using GNU make and Perl. Besides the Linux kernel and the standard C library, all required dependencies are shipped with the archive (<A HREF="http://www.lua.org/">Lua</A> and <A HREF="http://www.open-mpi.org/projects/hwloc/">hwloc</A>).
+It should build on any Linux distribution with a recent GCC compiler or CLANG compiler and 2.6 or newer kernel without any changes.
+
+There is one generic top level Makefile and one .mk configuration file for each
+compiler (at the moment GCC, CLANG and ICC). Please note that we test LIKWID only with GCC. CLANG and ICC are only tested for basic functionality.
+
+There is one exception: If you want to use LIKWID on an Intel Xeon Phi card you have to choose the MIC as compiler in config.mk, which is based on the Intel ICC compiler.
+
+\subsection directory Directory structure
+All source files are in the src/ directory. All header files are located in
+src/includes/ . Lua application source files are in src/applications/. All external tools, namely HWLOC and Lua, are located in ext/. The bench/ folder contains all files of the benchmarking suite of LIKWID.
+
+All build products are generated in the directory ./TAG, where TAG is the compiler configuration, default ./GCC.
+
+\subsection config Configuration
+Usually the only thing you have to configure is the PREFIX install path in the build config file config.mk in the top directory.
+
+\subsubsection color Changing color of <CODE>likwid-pin</CODE> output
+Depending on the background of your terminal window you can choose a color for <CODE>likwid-pin</CODE> output.
+
+\subsubsection accessD Usage of the access daemon likwid-accessD
+Usually on your own system, you can use LIKWID with direct access to the MSR files. If you install LIKWID on a shared system such as an HPC compute cluster, you may consider using the access daemon. This is a proxy application which was implemented with security in mind and performs address checks for allowed access. Using the access daemon, the measurements involve more overhead, especially if you use \ref likwid-perfctr in timeline mode or with the marker API.
+
+To enable using the access daemon, configure in config.mk:
+    - Set BUILDDAEMON to true
+    - Configure the path to the accessDaemon binary at ACCESSDAEMON
+    - Set the ACCESSMODE to accessdaemon
+
+ACCESSMODE can be direct, accessdaemon and sysdaemon (not yet officially supported). You can overwrite the default setting on the command line using the -M switch.
+
+If you want to access Uncore performance counters that are located in the PCI memory range, like they are implemented in Intel SandyBridge EP and IvyBridge EP, you have to use the access daemon or have root privileges because access to the PCI space is only permitted for highly privileged users.
+
+\subsubsection setfreqinstall Usage of frequency daemon likwid-setFreq
+The application \ref likwid-setFrequencies uses another daemon to modify the frequency of CPUs. The daemon is built and later installed if BUILDFREQ is set to true in config.mk.
+
+\subsubsection sharedlib Build Likwid as shared library
+Per default the LIKWID library is built as a shared library. You need the library if you want to use the Marker API. You can also use the LIKWID modules like <I>perfmon</I> directly. This is still not officially supported at the moment. In some settings it is necessary to build LIKWID as a shared library. To do so set SHARED_LIBRARY to true.
+
+\subsubsection instr_bench Instrument likwid-bench for usage with likwid-perfctr
+\ref likwid-bench is instrumented for use with \ref likwid-perfctr. This allows you to measure various metrics of your \ref likwid-bench kernels. Enable instrumentation by setting INSTRUMENT_BENCH to true in config.mk.
+
+\subsubsection fortran Enabling Fortran interface for marker API
+If you want to use the Marker API in Fortran programs LIKWID offers a native Fortran90 interface. To enable it set FORTRAN_INTERFACE to true in config.mk.
+
+\subsection targets Build targets
+You have to edit config.mk to configure your build and install path.
+
+The following make targets are available:
+
+- <B>make</B> - Build everything
+- <B>make likwid-bench</B> - Build likwid-bench
+- <B>make likwid-accessD</B> - Build likwid-accessD
+- <B>make likwid-setFreq</B> - Build likwid-setFreq
+- <B>make docs</B> - Create HTML documentation using doxygen
+- <B>make clean</B> - Remove the object file directory *./GCC*, keep the executables
+- <B>make distclean</B> - Remove all generated files
+- <B>make local</B> - Adjust paths in Lua scripts to work from the build directory. Requires the daemons and the pinning library to be already installed. Mainly used for testing.
+
+The build system has a working dependency tracking, therefore <B>make clean</B> is only needed if you change the Makefile configuration.
+
+\subsection installtargets Installing
+
+NOTE: The pinning functionality and the daemons only work if configured in config.mk and
+installed with <B>make install</B>. If you do not use the pinning functionality the tools
+can be used without installation.
+
+ - <B>make install</B> - Installs the executables, libraries, man pages and headers to the path you configured in config.mk.
+ - <B>make uninstall</B> - Delete all installed files.
+
+\subsection accessD Setting up access for hardware performance monitoring
+Hardware performance monitoring on x86 is enabled using model-specific registers (MSR). MSR registers are special registers not part of the instruction set architecture. To read and write to these registers the x86 ISA provides special instructions. These instructions can only be executed in protected mode or in other words only kernel code can execute these instructions. Fortunately, any Linux kernel 2.6 or newer provides access to these registers via a set of device files. This allows  [...]
+
+Per default only root has read/write access to these msr device files. In order to use the LIKWID tools, which need access to these files (likwid-perfctr, likwid-powermeter and likwid-agent) as standard user, you need to setup access rights to these files.
+
+likwid-perfctr, likwid-powermeter and likwid-features require the Linux <CODE>msr</CODE> kernel module. This module is part of most standard distro kernels. You have to be root to do the initial setup.
+
+    - Check if the <CODE>msr</CODE> module is loaded with <CODE>lsmod | grep msr</CODE>. There should be an output.
+    - If the module is not loaded, load it with <CODE>modprobe msr</CODE>. For automatic loading at startup consult your distro's documentation how to do so.
+    - Adapt access rights on the MSR device files for normal users. To grant access to anyone, you can use <CODE>chmod o+rw /dev/cpu/*/msr</CODE>. This is only recommended on single user desktop systems.
+
+As in general access to MSRs is not desired on security sensitive systems, you can either implement a more sophisticated access rights settings with e.g. setgid. A common solution used on many other device files, e.g. for audio, is to introduce a group and make a <CODE>chown</CODE> on the msr device files to that group. Now if you execute likwid-perfctr with setgid on that group, the executing user can use the tool but cannot directly write or read the MSR device files.
+
+Some distributions backported the capabilities check for the msr device to older kernels. If there are problems with accessing the msr device for older kernels with file system permissions set to read&write, please check your kernel code (<CODE>arch/x86/kernel/msr.c</CODE>) for the backport and set the MSR capabilities in case.
+
+A secure solution is to use the access daemon \ref likwid-accessD, which encapsulates the access to the MSR device files and performs an address check for allowed registers.
+
+Some newer kernels implement the so-called capabilities, a fine-grained permission system that can allow access to the MSR files for common users. On the downside it may not be enough anymore to set the suid-root flag for the access daemon; the executable must be registered at the <CODE>libcap</CODE>.
+
+<CODE>sudo setcap cap_sys_rawio+ep EXECUTABLE</CODE>
+
+This is only possible on local file systems. A feasible way is to use the \ref likwid-accessD for all accesses and just enable the capabilities for this one binary. This will enable the usage for all LIKWID tools and also for all instrumented binaries. If \ref likwid-perfctr utility should only be used in wrapper mode, it is suitable to set the capabilities for \ref likwid-perfctr only. Please remember to set the file permission of the MSR device files to read/write for all users, even i [...]
+
+\subsubsection depends Dependencies
+Although we tried to minimize the external dependencies of LIKWID, some advanced tools or only specific tool options require external packages.<BR>
+\ref likwid-perfscope uses the Perl script <A HREF="https://github.com/dkogan/feedgnuplot">feedGnuplot</A> to forward the real-time data to gnuplot. <A HREF="https://github.com/dkogan/feedgnuplot">feedGnuplot</A> is included into LIKWID, but <A HREF="http://www.gnuplot.info/">gnuplot</A> itself is not.<BR>
+\ref likwid-agent provides multiple backends to output the periodically measured data. The syslog backend requires the shell tool \a logger to be installed. The <A HREF="https://oss.oetiker.ch/rrdtool/">RRD</A> backend requires \a rrdtool and the GMetric backend the \a gmetric tool, part of the <A HREF="http://ganglia.sourceforge.net/">Ganglia Monitoring System</A>.<BR>
+In order to create the HTML documentation of LIKWID, the tool <A HREF="http://www.doxygen.org">Doxygen</A> is required.
+*/
+
+/*! \page C-markerAPI-code Marker API in a C/C++ application
+\include C-markerAPI.c
+*/
+
+/*! \page F-markerAPI-code Marker API in a Fortran90 application
+\include F-markerAPI.F90
+*/
+
+/*! \page C-likwidAPI-code LIKWID API in a C/C++ application
+\include C-likwidAPI.c
+*/
+/*! \page Lua-likwidAPI-code LIKWID API in a Lua application
+\include Lua-likwidAPI.lua
+*/
+
+/*! \page faq FAQ
+\section faq1 Which architectures are supported?
+LIKWID supports a range of x86 CPU architectures but likely not all. We concentrated the development effort on Intel and AMD machines. Almost all architecture code is tested. For a list of architectures see section \ref Architectures or call <CODE>likwid-perfctr -i</CODE>.
+
+\section faq2 Are all hardware events supported?
+LIKWID offers almost all events that are defined in the <A HREF="http://www.Intel.com/content/www/us/en/processors/architectures-software-developer-manuals.html">Intel® Software Developer System Programming Manual</A> and the <A HREF="http://developer.amd.com/resources/documentation-articles/developer-guides-manuals/">AMD® BIOS and Kernel Developer’s Guides</A>. Some may be missing caused by special handling likely with additional registers. But, LIKWID also provides some events  [...]
+
+\section faq3 Does LIKWID support Intel's PEBS?
+No, PEBS is an interface that must be initialized at kernel level. Since LIKWID is a user-space tool, there is no possibility to maintain PEBS.
+
+\section faq4 Which unit does LIKWID use internally for B, kB, MB, GB?
+As the units imply, you get from one unit to the other by multiplying or dividing it by 1000. E.g. 1kB = 1000B. There is no kiB or MiB possible by now.
+
+\section faq5 Does LIKWID support power capping (Intel only)?
+No, by now LIKWID does not support limiting the power consumption of your machine using the RAPL interface. We added some functions but they are not exported because they need to be rechecked.
+
+\section faq6 Is LIKWID case-sensitive?
+Yes, all strings are case-sensitive. The only exception is the event options; they are case-insensitive. For upcoming versions we will change to case-insensitive for all string parsing where possible.
+
+\section faq7 I have given multiple eventsets on the command line but the values are too low? Are they multiplexed?
+LIKWID does not support multiplexing of eventsets. It rotates through its eventset list and measures each for a specific amount of time. The output contains the results of all measurements of that eventset, no interpolation to the complete runtime is done. Since most other tools that support multiplexing use linear interpolation, you can scale the results yourself with <CODE>(1.0 - (measurement_time/all_time)) * result</CODE>. As you can see, the calculation is pretty simple, but it intr [...]
+
+\section faq8 Are there plans to port LIKWID to other operating systems?
+We do not really plan to port LIKWID to other operating systems. We come from the HPC world and there the main operating systems are based on the Linux kernel. The latest Top500 list contains 13 systems using Unix and 1 system with Microsoft® Windows.
+
+\section faq9 Are there plans to port LIKWID to other CPU architectures?
+We would like to port LIKWID to other CPU architectures that support hardware performance measurements but currently there is no time for that and we do not have other architectures than x86 in-house. We follow the developments and if an architecture gets HPC relevant, we will likely port LIKWID to make it work. ARM has the highest probability and with lower probability we will include SPARC.
+
+\section faq10 Do you plan to introduce a graphical frontend for LIKWID?
+No, we do not!
+
+\section faq12 Why does the startup of likwid-perfctr take so long?
+In order to get reliable time measurements, LIKWID must determine the base clock frequency of your CPU. This is done by a measurement loop that takes about 1 second. You can avoid the measurement loop by creating a topology configuration file with \ref likwid-genTopoCfg.
+
+\section faq13 I want to help, where do I start?
+The best way is to talk to us at the <A HREF="http://groups.google.com/group/likwid-users">mailing list</A>. There are a bunch of small work packages on our ToDo list that can be used as a good starting point for learning how LIKWID works. If you are not a programmer but you have a good idea, let us know and we will discuss it.
+*/
diff --git a/doc/likwid-features.1 b/doc/likwid-features.1
index e67cf44..e19df03 100644
--- a/doc/likwid-features.1
+++ b/doc/likwid-features.1
@@ -4,17 +4,15 @@ likwid-features \- print and toggle the flags of the MSR_IA32_MISC_ENABLE model
 .SH SYNOPSIS
 .B likwid-features 
 .RB [ \-vh ]
-.RB [ \-c
-.IR <coreId> ]
-.RB [ \-s
-.IR <prefetcher_tag> ]
-.RB [ \-u
-.IR <prefetcher_tag> ]
+.RB [ \-t
+.IR coreId ]
+.RB [ \-su
+.IR prefetcher_tag ]
 .SH DESCRIPTION
 .B likwid-features
 is a command line application to print the flags in the model
 specific register (MSR) MSR_IA32_MISC_ENABLE on Intel x86 processors. On Core2 processors
-it can be used to toggle the hardware prefetch flags. It does not work on AMD processors.
+it can be used to toggle the hardware prefetch flags. It does not work on AMD processors and recent Intel processors.
 For a documentation what flags are supported on which processor refer to the Intel
 Software Developer's Manual Volume 3B, Table B.2. The MSR are set individually for every core.
 The following hardware prefetchers can be toggled:
@@ -43,18 +41,18 @@ prints version information to standard output, then exits.
 .B \-\^h
 prints a help message to standard output, then exits.
 .TP
-.B \-\^c " <coreId>"
+.B \-\^t " coreId"
 set on which processor core the MSR should be read
 .TP
-.B \-\^u " <HW_PREFETCHER | CL_PREFETCHER | DCU_PREFETCHER | IP_PREFETCHER>"
+.B \-\^u " HW_PREFETCHER | CL_PREFETCHER | DCU_PREFETCHER | IP_PREFETCHER"
 specify which prefetcher to unset
 .TP
-.B \-\^s " <HW_PREFETCHER | CL_PREFETCHER | DCU_PREFETCHER | IP_PREFETCHER>"
+.B \-\^s " HW_PREFETCHER | CL_PREFETCHER | DCU_PREFETCHER | IP_PREFETCHER"
 specify which prefetcher to set
 
 .SH AUTHOR
 Written by Jan Treibig <jan.treibig at gmail.com>.
 .SH BUGS
-Report Bugs on <http://code.google.com/p/likwid/issues/list>.
+Report Bugs on <https://github.com/rrze-likwid/likwid/issues>.
 .SH "SEE ALSO"
-likwid-perfctr(1), likwid-pin(1), likwid-powermeter(1), likwid-topology(1), likwid-setFrequencies(1)
+likwid-topology(1), likwid-perfctr(1), likwid-pin(1)
diff --git a/doc/likwid-genCfg.1 b/doc/likwid-genCfg.1
deleted file mode 100644
index 8b7632f..0000000
--- a/doc/likwid-genCfg.1
+++ /dev/null
@@ -1,30 +0,0 @@
-.TH LIKWID-GENCFG 1 <DATE> likwid\-<VERSION>
-.SH NAME
-likwid-genCfg \- Get system topology and write them to file for faster LIKWID startup
-.SH SYNOPSIS
-.B likwid-genCfg
-.RB [\-hv]
-.RB [ \-o
-.IR <filename>]
-.SH DESCRIPTION
-.B likwid-genCfg
-is a command line application that stores the system's CPU and NUMA topology to
-file. LIKWID applications use this file to read in the topology fast instead of
-re-gathering all values. The default output path is /etc/likwid.cfg.
-.SH OPTIONS
-.TP
-.B \-h
-prints a help message to standard output, then exits.
-.TP
-.B \-v
-prints a version message to standard output, then exits.
-.TP
-.B \-\^o " <filename>
-sets output file path (optional)
-
-.SH AUTHOR
-Written by Thomas Roehl <thomas.roehl at gmail.com>.
-.SH BUGS
-Report Bugs on <http://code.google.com/p/likwid/issues/list>.
-.SH "SEE ALSO"
-likwid-topology(1), likwid-perfctr(1), likwid-features(1), likwid-pin(1), likwid-powermeter(1)
diff --git a/doc/likwid-genTopoCfg.1 b/doc/likwid-genTopoCfg.1
new file mode 100644
index 0000000..57db2aa
--- /dev/null
+++ b/doc/likwid-genTopoCfg.1
@@ -0,0 +1,30 @@
+.TH LIKWID-GENTOPOCFG 1 <DATE> likwid\-<VERSION>
+.SH NAME
+likwid-genTopoCfg \- Get system topology and write them to file for faster LIKWID startup
+.SH SYNOPSIS
+.B likwid-genTopoCfg
+.RB [\-hv]
+.RB [ \-o
+.IR <filename>]
+.SH DESCRIPTION
+.B likwid-genTopoCfg
+is a command line application that stores the system's CPU and NUMA topology to
+file. LIKWID applications use this file to read in the topology fast instead of
+re-gathering all values.
+.SH OPTIONS
+.TP
+.B \-h, \-\-\^help
+prints a help message to standard output, then exits.
+.TP
+.B \-v, \-\-\^version
+prints a version message to standard output, then exits.
+.TP
+.B \-\^o, \-\-\^output <filename>
+sets output file path (Default: /etc/likwid-topo.cfg)
+
+.SH AUTHOR
+Written by Thomas Roehl <thomas.roehl at googlemail.com>.
+.SH BUGS
+Report Bugs on <https://github.com/rrze-likwid/likwid/issues>.
+.SH "SEE ALSO"
+likwid-topology(1), likwid-perfctr(1), likwid-pin(1)
diff --git a/doc/likwid-memsweeper.1 b/doc/likwid-memsweeper.1
index f474360..34225b1 100644
--- a/doc/likwid-memsweeper.1
+++ b/doc/likwid-memsweeper.1
@@ -5,24 +5,24 @@ likwid-memsweeper \- A tool to clean up NUMA memory domains and last level cache
 .B likwid-memsweeper
 .RB [\-hv]
 .RB [ \-c
-.IR <NUMA_ID> ]
+.IR <node_list> ]
 .SH DESCRIPTION
 .B likwid-memsweeper
-is a command line application to shrink the file buffer cache by filling the NUMA domain with random pages. Moreover the tool invalidates all cachelines in the LLC for 64 bit x86 systems. If no NUMA domain is specified, all are sweept.
+is a command line application to shrink the file buffer cache by filling the NUMA domain with random pages. Moreover, the tool invalidates all cachelines in the LLC.
 .SH OPTIONS
 .TP
-.B \-h
+.B \-h, \-\-\^help
 prints a help message to standard output, then exits.
 .TP
-.B \-v
+.B \-v, \-\-\^version
 prints a version message to standard output, then exits.
 .TP
-.B \-\^c " <NUMA_ID>
+.B \-\^c <node_list>
 set the NUMA domain for sweeping.
 
 .SH AUTHOR
-Written by Thomas Roehl <thomas.roehl at gmail.com>.
+Written by Thomas Roehl <thomas.roehl at googlemail.com>.
 .SH BUGS
-Report Bugs on <http://code.google.com/p/likwid/issues/list>.
+Report Bugs on <https://github.com/rrze-likwid/likwid/issues>.
 .SH "SEE ALSO"
-likwid-perfctr(1), likwid-features(1), likwid-pin(1), likwid-powermeter(1), likwid-topology(1),
+likwid-perfctr(1), likwid-features(1), likwid-pin(1), likwid-powermeter(1)
diff --git a/doc/likwid-mpirun.1 b/doc/likwid-mpirun.1
index 765b0c8..67c1998 100644
--- a/doc/likwid-mpirun.1
+++ b/doc/likwid-mpirun.1
@@ -3,7 +3,9 @@
 likwid-mpirun \- A tool to start and monitor MPI applications with LIKWID
 .SH SYNOPSIS
 .B likwid-memsweeper
-.RB [\-hd]
+.RB [\-hvdOm]
+.RB [ \-n
+.IR number_of_processes ]
 .RB [ \-hostfile
 .IR filename ]
 .RB [ \-nperdomain
@@ -11,9 +13,11 @@ likwid-mpirun \- A tool to start and monitor MPI applications with LIKWID
 .RB [ \-pin
 .IR expression ]
 .RB [ \-omp
-.IR expression ]
+.IR omptype ]
 .RB [ \-mpi
-.IR expression ]
+.IR mpitype ]
+.RB [ \-g
+.IR eventset ]
 .RB [\-\-]
 .SH DESCRIPTION
 .B likwid-mpirun
@@ -22,32 +26,51 @@ is a command line application that wraps the vendor-specific mpirun tool and add
 to the execution string. The user-given application is ran, measured and the results returned to the staring node.
 .SH OPTIONS
 .TP
-.B \-h
-prints a help message to standard output, then exits.
+.B \-\^h,\-\-\^help
+prints a help message to standard output, then exits
+.TP
+.B \-\^v,\-\-\^version
+prints version information to standard output, then exits
+.TP
+.B \-\^d,\-\-\^debug
+prints debug messages to standard output
 .TP
-.B \-d
-prints debug messages to standard output.
+.B \-\^n,\-\^np,\-\-\^n,\-\-\^np <number_of_processes>
+specifies how many MPI processes should be started
 .TP
-.B \-\^hostfile " filename
-specifies the nodes to schedule the MPI processes on
+.B \-\^hostfile <filename>
+specifies the nodes to schedule the MPI processes on. If not given, the environment variables PBS_NODEFILE, LOADL_HOSTFILE and SLURM_HOSTFILE are checked.
 .TP
-.B \-\^nperdomain " number_of_processes_in_domain
+.B \-\^nperdomain <number_of_processes_in_domain>
 specifies the processes per affinity domain (see
 .B likwid-pin
 for info about affinity domains)
 .TP
-.B \-\^pin " expression
+.B \-\^pin <expression>
 specifies the pinning for hybrid execution (see
 .B likwid-pin
 for info about affinity domains)
 .TP
-.B \-\^omp " expression
-enables hybrid setup. Can only be used in combination with
-.B -pin.
-The only possible value is: intel
+.B \-\^s, \-\-\^skip <mask>
+Specify skip mask as HEX number. For each set bit the corresponding thread is skipped.
+.TP
+.B \-\^omp <omptype>
+enables hybrid setup. Likwid tries to determine OpenMP type automatically. The only possible values are
+.B intel
+and
+.B gnu
+.TP
+.B \-\^mpi <mpitype>
+specifies the MPI implementation that should be used by the wrapper. Possible values are
+.B intelmpi, openmpi
+and
+.B mvapich2
+.TP
+.B \-\^m,\-\-\^marker
+activates the Marker API for the executed MPI processes
 .TP
-.B \-\^mpi " expression
-specifies the MPI implementation that should be used by the wrapper. Possible values are intelmpi, openmpi and mvapich2
+.B \-\^O
+prints output in CSV not ASCII tables
 .TP
 .B \-\-
 stops parsing arguments for likwid-mpirun, in order to set options for underlying MPI implementation after \-\-.
@@ -56,26 +79,26 @@ stops parsing arguments for likwid-mpirun, in order to set options for underlyin
 .IP 1. 4
 For standard application:
 .TP
-.B likwid-mpirun -np 32  ./myApp
+.B likwid-mpirun -np 32 ./myApp
 .PP
 Will run 32 MPI processes, each host is filled with as much processes as written in ppn
 .IP 2. 4
 With pinning:
 .TP
-.B likwid-mpirun -np 32 -nperdomain S:2  ./myApp
+.B likwid-mpirun -np 32 -nperdomain S:2 ./myApp
 .PP
 Will start 32 MPI processes with 2 processes per socket.
 .IP 3. 4
 For hybrid runs:
 .TP
-.B likwid-mpirun -np 32 -pin M0:0-3_M1:0-3  ./myApp
+.B likwid-mpirun -np 32 -pin M0:0-3_M1:0-3 ./myApp
 .PP
 Will start 32 MPI processes with 2 processes per node. Threads of the first process are pinned to the cores 0-3 in NUMA domain 0 (M0). The OpenMP threads of the second process are pinned to the first four cores in NUMA domain 1 (M1)
 
 
 .SH AUTHOR
-Written by Thomas Roehl <thomas.roehl at gmail.com>.
+Written by Thomas Roehl <thomas.roehl at googlemail.com>.
 .SH BUGS
-Report Bugs on <http://code.google.com/p/likwid/issues/list>.
+Report Bugs on <https://github.com/rrze-likwid/likwid/issues>.
 .SH "SEE ALSO"
-likwid-pin(1), likwid-perfctr(1), likwid-features(1), likwid-powermeter(1), likwid-topology(1),
+likwid-pin(1), likwid-perfctr(1), likwid-powermeter(1)
diff --git a/doc/likwid-perfctr.1 b/doc/likwid-perfctr.1
index ea3e4f3..b417eb4 100644
--- a/doc/likwid-perfctr.1
+++ b/doc/likwid-perfctr.1
@@ -1,152 +1,102 @@
 .TH LIKWID-PERFCTR 1 <DATE> likwid\-<VERSION>
 .SH NAME
-likwid-perfctr \- configure and read out hardware performance counters on x86 cpus
+likwid-perfctr \- configure and read out hardware performance counters on x86 CPUs
 .SH SYNOPSIS
 .B likwid-perfctr 
-.RB [\-vhHVmaeiMoO]
-.RB [ \-c/\-C
-.IR <core_list> ]
+.RB [\-vhHmaie]
+.RB [ \-c
+.IR core_list ]
+.RB [ \-C
+.IR core_list_for_pinning ]
 .RB [ \-g
-.IR <performance_group>
+.IR performance_group
 or
-.IR <performance_event_string> ]
+.IR performance_event_string ]
 .RB [ \-t
-.IR <frequency> ]
+.IR timeline_frequency ]
 .RB [ \-S
-.IR <time> ]
-.RB [ \-s
-.IR <skip_mask> ]
+.IR monitoring_time ]
+.RB [ \-T
+.IR group_switch_frequency ]
+.RB [ \-V
+.IR verbosity ]
+.RB [ \-M
+.IR access_mode ]
 .RB [ \-o
-.IR <output_file> ]
+.IR output_file ]
+.RB [ \-s
+.IR skip_mask ]
 .SH DESCRIPTION
 .B likwid-perfctr
 is a lightweight command line application to configure and read out hardware performance monitoring data
 on supported x86 processors. It can measure either as wrapper without changing the measured application
 or with marker API functions inside the code, which will turn on and off the counters. There are preconfigured
-groups with useful event sets and derived metrics. Additonally arbitrary events can be measured with
-custom event sets. The marker API can measure mulitple named regions. Results are accumulated on multiple calls.
-The following x86 processors are supported:
-.IP \[bu] 
-.B Intel Core 2:
-all variants. Counters:
-.I PMC[0-1], FIXC[0-2]
-.IP \[bu] 
-.B Intel Nehalem:
-Counters:
-.I PMC[0-3], FIXC[0-2], UPMC[0-7]
-.IP \[bu] 
-.B Intel Nehalem EX:
-Counters:
-.I PMC[0-3], FIXC[0-2], MBOX[0-1]C[0-5], BBOX[0-1]C[0-3], RBOX[0-1]C[0-7], WBOX[0-5], UBOX0, SBOX[0-1]C[0-3], CBOX[0-9]C[0-4]
-.IP \[bu] 
-.B Intel Westmere:
- Counters:
-.I PMC[0-3], FIXC[0-2], UPMC[0-7]
-.IP \[bu] 
-.B Intel Westmere EX:
-Counters:
-.I PMC[0-3], FIXC[0-2], MBOX[0-1]C[0-5], BBOX[0-1]C[0-3], RBOX[0-1]C[0-7], WBOX[0-5], UBOX0, SBOX[0-1]C[0-3], CBOX[0-9]C[0-4]
-.IP \[bu] 
-.B Intel Sandy Bridge:
-full RAPL support. Counters:
-.I PMC[0-3], FIXC[0-2], PWR[0-3]
-.IP \[bu] 
-.B Intel Sandy Bridge EP:
-partial support for uncore, full RAPL support. Counters:
-.I PMC[0-3], FIXC[0-2], PWR[0-3]. MBOX[0-3]C[0-3]
-.IP \[bu] 
-.B Intel Ivy Bridge:
-full RAPL support. Counters:
-.I PMC[0-3], FIXC[0-2], PWR[0-3]
-.IP \[bu] 
-.B Intel Ivy Bridge EP:
-partial support for uncore, full RAPL support. Counters:
-.I PMC[0-3], FIXC[0-2], PWR[0-3], CBOX[0-9]C[0-3], MBOX[0-3]C[0-3], MBOX[0-3]FIX
-.IP \[bu] 
-.B Intel Haswell:
-full RAPL support. Counters:
-.I PMC[0-3], FIXC[0-2], PWR[0-3]
-.IP \[bu] 
-.B Intel Haswell EP:
-no uncore support, full RAPL support. Counters:
-.I PMC[0-3], FIXC[0-2], PWR[0-3]
-.IP \[bu] 
-.B Intel Atom Silvermont:
-full RAPL support. Counters:
-.I PMC[0-1], FIXC[0-2], PWR[0-1]
-.IP \[bu] 
-.B Intel Pentium M:
-Banias and Dothan variants. Counters:
-.I PMC[0-1]
-.IP \[bu] 
-.B Intel P6:
-Tested on P3.
-.IP \[bu] 
-.B AMD K8:
-all variants. Counters:
-.I PMC[0-3]
-.IP \[bu] 
-.B AMD K10:
-Barcelona, Shanghai, Istanbul, MagnyCours based processors. Counters:
-.I PMC[0-3]
+performance groups with useful event sets and derived metrics. Additionally, arbitrary events can be measured with
+custom event sets. The marker API can measure multiple named regions and the results are accumulated over multiple region calls.
 
 .SH OPTIONS
 .TP
-.B \-\^v
+.B \-\^v, \-\-\^version
 prints version information to standard output, then exits.
 .TP
-.B \-\^h
+.B \-\^h, \-\-\^help
 prints a help message to standard output, then exits.
 .TP
 .B \-\^H
 prints group help message (use together with -g switch).
 .TP
-.B \-\^V
-verbose output during execution for debugging.
+.B \-\^V <level>, \-\-\^verbose <level>
+verbose output during execution for debugging. 0 for only errors, 1 for informational output, 2 for detailed output and 3 for developer output
 .TP
 .B \-\^m
 run in marker API mode
 .TP
 .B \-\^a
-print available performance groups for current processor.
+print available performance groups for current processor, then exit.
 .TP
 .B \-\^e
 print available counters and performance events of current processor.
 .TP
-.B \-\^o " <filename>
+.B \-\^o, \-\-\^output <filename>
 store all ouput to a file instead of stdout. For the filename the following placeholders are supported: 
-%j for PBS_JOBID, %r for MPI RANK (only Intel MPI at the moment), %h hostname and %p for process pid.
+%j for PBS_JOBID, %r for MPI RANK (only Intel MPI at the moment), %h host name and %p for process pid.
 The placeholders must be separated by underscore as, e.g., -o test_%h_%p. You must specify a suffix to
 the filename. For txt the output is printed as is to the file. Other suffixes trigger a filter on the output.
 Available filters are csv (comma separated values) and xml at the moment.
 .TP
 .B \-\^O
-Do not print tables for results, use easily parseable CSV instead.
+print output in CSV format (conforming to RFC 4180, see
+.I https://tools.ietf.org/html/rfc4180
+for details).
 .TP
-.B \-\^i
-print cpuid information about processor and on Intel Performance Monitoring features, then exit.
+.B \-\^i, \-\-\^info
+print cpuid information about processor and about Intel Performance Monitoring features, then exit.
 .TP
-.B \-\^c " <processor_list>"
+.B \-\^c <cpu expression>
 specify a numerical list of processors. The list may contain multiple 
 items, separated by comma, and ranges. For example 0,3,9-11.
 .TP
-.B \-\^C " <processor_list>"
+.B \-\^C <cpu expression>
 specify a numerical list of processors. The list may contain multiple 
 items, separated by comma, and ranges. For example 0,3,9-11. This variant will
 also pin the threads to the cores. Also logical numberings can be used.
 .TP
-.B \-\^g " <performance group> or <performance event set string>"
+.B \-\^g, \-\-\^group <performance group> or <performance event set string>
 specify which performance group to measure. This can be one of the tags output with the -a flag.
 Also a custom event set can be specified by a comma separated list of events. Each event has the format
 eventId:register with the the register being one of a architecture supported performance counter registers.
 .TP
-.B \-\^t " <frequency of measurements>"
-timeline mode for time resolved measurements, possible suffixes 's' and 'ms' like 100ms. The output has the format:
+.B \-\^t <frequency of measurements>
+timeline mode for time resolved measurements.
 .TP
-.B <Event> <Timestamp> <Result thread0> <Result thread1> ...
+.B \-\^S <waittime between measurements>
+End-to-end measurement using likwid-perfctr but sleep instead of executing an application
 .TP
-.B \-\^S " <time_in_seconds>"
-stethoscope mode with duration in senconds. Can be used to measure an application from the outside.
+.B \-\^T <time between group switches>
+Frequency to switch groups if multiple are given on commandline, default is 2s. Value is ignored for a single event set and default frequency of 30s is used to catch overflows.
+.TP
+.B \-\^s, \-\-\^skip <mask>
+Specify skip mask as HEX number. For each set bit the corresponding thread is skipped.
 
 .SH EXAMPLE
 Because 
@@ -163,7 +113,7 @@ The parent process is pinned to processor 0, Thread 0 to processor 1 and Thread
 .IP 2. 4
 As wrapper with custom event set on AMD:
 .TP
-.B likwid-perfctr -C 0-4 -g INSTRUCTIONS_RETIRED_SSE:PMC0,CPU_CLOCKS_UNHALTED:PMC3 ./myApp
+.B likwid-perfctr -C 0-4 -g INSTRUCTIONS_RETIRED_SSE:PMC0,CPU_CLOCKS_UNHALTED:PMC3 ./cacheBench
 .PP
 It is specified that the event
 .B INSTRUCTIONS_RETIRED_SSE
@@ -173,84 +123,109 @@ and the event
 .B CPU_CLOCKS_UNHALTED
 on counter
 .B PMC3.
-It is possible calculate the runtime of all threads based on the
+It is possible to calculate the run time of all threads based on the
 .B CPU_CLOCKS_UNHALTED
 event. If you want this you have to include this event in your custom event string as shown above.
 
 .IP 3. 4
 As wrapper with custom event set on Intel:
 .TP
-.B likwid-perfctr -C 0 -g INSTR_RETIRED_ANY:FIXC0,CPU_CLK_UNHALTED_CORE:FIXC1 ./myApp
+.B likwid-perfctr -C 0 -g INSTR_RETIRED_ANY:FIXC0,CPU_CLK_UNHALTED_CORE:FIXC1,UNC_L3_LINES_IN_ANY:UPMC0 ./stream-icc
 .PP
 On Intel processors fixed events are measured on dedicated counters. These are
 .B INSTR_RETIRED_ANY
-,
-.B CPU_CLK_UNHALTED_CORE.
 and
-.B CPU_CLK_UNHALTED_REF
+.B CPU_CLK_UNHALTED_CORE.
 If you configure these fixed counters, 
 .B likwid-perfctr
-will calculate the runtime and CPI metrics for your run.
+will calculate the run time and CPI metrics for your run.
 
 .IP 4. 4
 Using the marker API to measure only parts of your code (this can be used both with groups or custom event sets):
 .TP
 .B likwid-perfctr -m -C 0-4 -g INSTRUCTIONS_RETIRED_SSE:PMC0,CPU_CLOCKS_UNHALTED:PMC3 ./cacheBench
 .PP
-You have to link you code against liblikwid.a/.so and use the marker API calls.
+You have to link your code against liblikwid.so and use the marker API calls.
+Examples can be found in examples folder <PREFIX>/share/likwid/examples.
 The following code snippet shows the necessary calls:
 
 .nf
 #include <likwid.h>
 
 /* only one thread calls init */
-if (threadId == 0)
-{
-    likwid_markerInit();
-}
-/* if you want to measure an threaded application
- * you have to call likwid_markerThreadInit() for
- * preparation, example with OpenMP */
-#pragma omp parallel
-{
-	likwid_markerThreadInit();
-}
-BARRIER;
-likwid_markerStartRegion("Benchmark");
-/* your code to be measured is here.*/
+LIKWID_MARKER_INIT;
+
+/* Must be called by each thread that should
+ * perform measurements.
+ * If you place it in the same parallel
+ * region as LIKWID_MARKER_START, perform a
+ * barrier between the statements to avoid
+ * timing problems.
+ */
+LIKWID_MARKER_THREADINIT;
+
+/* If you run the code region only once, register
+ * the region tag previously to reduce the overhead
+ * of START and STOP calls. Call it once for each
+ * thread in parallel environment.
+ * This call is optional, START will do the same operations.
+ */
+LIKWID_MARKER_REGISTER("name");
 
-likwid_markerStopRegion("Benchmark");
-BARRIER;
-/* again only one thread can close the markers */
-if (threadId == 0)
-{
-    likwid_markerClose();
-}
+/* Start measurement */
+LIKWID_MARKER_START("name");
+/*
+ * Your code to be measured is here
+ * You can also nest named regions
+ * No whitespaces are allowed in the region names!
+ */
+LIKWID_MARKER_STOP("name");
+
+/* If you want to measure multiple groups/event sets
+ * Switches through groups in round-robin fashion
+ */
+LIKWID_MARKER_SWITCH;
+
+/* Finally */
+LIKWID_MARKER_CLOSE;
 .fi
 
 .IP 5. 4
 Using likwid in timeline mode:
 .TP
-.B likwid-perfctr -c 0-3 -g FLOPS_DP -t 300ms  ./myApp > out.txt
+.B likwid-perfctr -c 0-3 -g FLOPS_DP -t 300ms ./cacheBench > out.txt
 .PP
 This will read out the counters every 300ms on physical cores 0-3 and write the results to out.txt.
-For timeline mode there is a frontend application likwid-scope, which enables live plotting of selected events.
-For more code examples have a look at the likwid WIKI pages. The processes are
-.B not
-pinned to the CPUs 0-3.
+The application is not pinned to the CPUs. The output syntax of the timeline mode is:
+
+.B <groupID> <numberOfEvents> <numberOfThreads> <Timestamp> <Event1_Thread1> <Event1_Thread2> ... <EventN_ThreadN>
+
+For timeline mode there is a frontend application likwid-perfscope(1), which enables live plotting of selected events.
 
 .IP 6. 4
 Using likwid in stethoscope mode:
 .TP
 .B likwid-perfctr -c 0-3 -g FLOPS_DP -S 2s
 .PP
-This will start the counters and read them out after 2s on physical cores 0-3 and write the results to stdout. The processes are
-.B not
-pinned to the CPUs 0-3.
+This will start the counters and read them out after 2s on physical cores 0-3 and write the results to stdout.
+
+.IP 7. 4
+Using likwid with counter options:
+.TP
+.B likwid-perfctr -c S0:1@S1:1 -g LLC_LOOKUPS_DATA_READ:CBOX0C0:STATE=0x9 ./cacheBench
+.PP
+This will program the counter
+.B CBOX0C0
+(the counter 0 of the LLC cache box 0) to measure the event
+.B LLC_LOOKUPS_DATA_READ
+and filter the increments by the state of a cacheline.
+.B STATE=0x9
+for this event means all <invalid> and <modified> cachelines. Which options are allowed for which box is listed in LIKWID's html documentation. The values for the options can be found in the vendor's performance monitoring documentation. Likwid measures the first CPU of socket 0 and the first CPU of socket 1. See likwid-pin(1) for details regarding the cpu expressions.
+For more code examples have a look at the likwid WIKI pages and LIKWID's html documentation.
 
 .SH AUTHOR
-Written by Jan Treibig <jan.treibig at gmail.com>.
+Written by Thomas Roehl <thomas.roehl at googlemail.com>.
 .SH BUGS
-Report Bugs on <http://code.google.com/p/likwid/issues/list>.
+Report Bugs on <https://github.com/rrze-likwid/likwid/issues>.
 .SH SEE ALSO
-likwid-topology(1), likwid-features(1), likwid-pin(1), likwid-bench(1)
+likwid-topology(1), likwid-perfscope(1), likwid-pin(1), likwid-bench(1)
diff --git a/doc/likwid-perfscope.1 b/doc/likwid-perfscope.1
index 2d48e21..e7d2316 100644
--- a/doc/likwid-perfscope.1
+++ b/doc/likwid-perfscope.1
@@ -1,55 +1,177 @@
 .TH LIKWID-PERFSCOPE 1 <DATE> likwid\-<VERSION>
 .SH NAME
 likwid-perfscope \- Frontend for the timeline mode of
-.N likwid-perfctr(1)
-that on-the-fly generates pictures from the measurements
+.B likwid-perfctr(1)
+that generates pictures on-the-fly from the measurements
 .SH SYNOPSIS
-.B likwid-perfscope 
-.RB [\-h]
-.RB [ \-cores
+.B likwid-perfscope
+.RB [\-hvadp]
+.RB [ \-c
+.IR <cpu_list> ]
+.RB [ \-C
 .IR <cpu_list> ]
-.RB [ \-freq
+.RB [ \-t
 .IR <frequency> ]
-.RB [ \-group
-.IR <eventset> ]
+.RB [ \-r
+.IR <value> ]
+.RB [ \-g
+.IR <eventset_and_plotconfig> ]
+.RB [ \-\-\^host
+.IR <hostname> ]
+.B <executable>
+
 .SH DESCRIPTION
 .B likwid-perfscope
-is a command line application written in Perl that uses the timeline daemon mode of
+is a command line application written in Lua that uses the timeline daemon mode of
 .B likwid-perfctr(1)
 to create on-the-fly pictures with the current measurements. It uses the
 .B feedGnuplot(1)
-script to send the current data to gnuplot.
+script to send the current data to gnuplot. Since the plot windows are normally closed directly after the execution of the monitored applications,
+.B likwid-perfscope
+waits until Ctrl+c is pressed.
 .SH OPTIONS
 .TP
-.B \-h
-prints a help message to standard output, then exits.
+.B \-\^h,\-\-\^help
+Prints a help message to standard output, then exits.
 .TP
-.B \-\^cores " <cpu_list>
-measures the given group on given CPUs in <cpu_list>
+.B \-\^v,\-\-\^version
+Prints version information to standard output, then exits.
+.TP
+.B \-\^c " <cpu_list>
+Measures on given CPUs in <cpu_list>. See
+.B likwid-pin(1)
+for further information about the syntax.
+.TP
+.B \-\^C " <cpu_list>
+Measures the given group on given CPUs in <cpu_list>. See
+.B likwid-pin(1)
+for further information about the syntax. The application is pinned to those cores.
+.TP
+.B \-\^a,\-\-\^all
+List preconfigured event and plot configurations
+.TP
+.B \-\^d,\-\-\^dump
+Print the measurements of
+.B likwid-perfctr(1)
+to stdout.
 .TP
-.B \-\^freq " <frequency>
-reads the current performance values every <frequency>. Available suffixes are 's' and 'ms', e.g. 500ms. Default value is 1s
+.B \-\^t,\-\-\^time " <frequency>
+Reads the current performance values every <frequency>. Available suffixes are 's', 'ms' or 'us', e.g. 500ms. Default value is 1s.
 .TP
-.B \-\^group " <eventset>
-defines the events and counters that should be read. Possible values can be gathered from
+.B \-\^g,\-\-\^group " <eventset_and_plotconfig>
+Defines the events and counters that should be read. Possible values can be gathered from
 .B likwid-perfctr(1).
-Default is group 'FLOPS_DP'
+You can give multiple
+.B \-\^g
+options on the commandline. They will be measured in a round-robin fashion and one plot generated per option. Moreover, the
+.B \-\^g
+option accepts config options for
+.B feedGnuplot(1),
+see section
+.B EVENTSETS
+.TP
+.B \-\^r,\-\-\^range " <value>
+Plot only the last <value> values. Often referred to as sliding window.
+.TP
+.B \-\^p,\-\-\^plotdump
+Use the dumping feature of feedGnuplot to print out the plot configuration and its data at each timestep.
+Can be used to create file-based plots afterwards.
+.TP
+.B \-\-\^host " <hostname>
+Instead of performing likwid-perfctr on the local machine, execute it on a remote machine and plot data locally. Uses ssh and you probably need to enter the password before starting. You can also give something like user@host.
+
+
+.SH EVENTSETS
+In contrast to the \-\^g option for
+.B likwid-perfctr
+the \-\^g option for
+.B likwid-perfscope
+is extended to accept configuration options for
+.B feedGnuplot.
+There are some predefined plot configurations embedded into
+.B likwid-perfscope
+which can be listed with
+.B \-\^a
+command line option. They are filtered to show only configs that are available for your current system.
+If you need to measure and plot custom events you can set the plotting options as last entry in your eventset. The plotting config options can be set as a ':' separated list. If you select a preconfigured group, you can overwrite single fields in the config like changing the title or the matching. The following options are available:
 
-.SH EXAMPLE
 .IP 1. 4
-Monitor double precision floating-point operations:
+.B title=<string>, TITLE=<string>
 .TP
-.B likwid-perfscope -group FLOPS_DP -cores 0-3 -freq 500ms
+Use the given title for the plot, use "" to enclose text with spaces and escape characters which could be interpreted by the shell. ':' are not allowed!
+.PP
+.IP 2. 4
+.B xtitle=<string>, XTITLE=<string>
+.TP
+Use the given title for the x-axis of the plot, use "" to enclose text with spaces and escape characters which could be interpreted by the shell. ':' are not allowed!
+.PP
+.IP 3. 4
+.B ytitle=<string>, YTITLE=<string>
+.TP
+Use the given title for the left y-axis of the plot, use "" to enclose text with spaces and escape characters which could be interpreted by the shell. ':' are not allowed!
+.PP
+.IP 4. 4
+.B <string>=<string>
+.TP
+All option string items that are not recognized as keywords like TITLE are used as formulas for the output. You can set multiple of those items in one option string. Each is calculated and integrated in the output plot. The first <string> is used as legend entry. The second <string> is the formula for the function.
+.PP
+.IP 5. 4
+.B y2title=<string>, Y2TITLE=<string>, y2title=<id-string>, Y2TITLE=<id-string>
+.TP
+Use the given title for the right y-axis of the plot. If no id is set, the last y2-axis is related to the last formula. If id is set, the formula with the id is used for the y2-axis. The id starts with index 1 for the first formula. Use "" to enclose text with spaces and escape characters which could be interpreted by the shell with '\'. ':' are not allowed!
+.PP
+
+.SH EXAMPLE
+.IP 1. 5
+Measure and print a preconfigured plotting configuration:
+.TP
+.B likwid-perfscope -g L3_BAND -C 0-2 -t 1s ./a.out
+.PP
+This measures the L3 bandwidth with likwid-perfctr every second on CPU cores 0,1,2 and uses the plotting configuration L3_BAND. The plot will have a title and the axes are labeled properly.
+.IP 2. 5
+Measure and print a preconfigured plotting configuration:
+.TP
+.B likwid-perfscope -g L2_BAND:TITLE="My Title" -C 0 -t 1s ./a.out
+.PP
+This measures the L2 bandwidth with likwid-perfctr every second on CPU core 0 and uses the plotting configuration L2_BAND. The title of the output plot is changed to the custom title "My Title".
+.IP 3. 5
+Custom event set with plotting configuration:
+.TP
+.B likwid-perfscope -g INSTR_RETIRED_ANY:FIXC0,CPU_CLK_UNHALTED_CORE:FIXC1,CPI=FIXC0/FIXC1:YTITLE="Cycles per Instruction" -C 0 --time 500ms ./a.out
 .PP
 Executes
 .B likwid-perfctr
-on the first four cores. The values are read every 500ms are forwarded to gnuplot using the
-.B feedGnuplot
-script.
+on the first core. The values for the events
+.B INSTR_RETIRED_ANY
+and
+.B CPU_CLK_UNHALTED_CORE
+are read every 500ms. The raw values are transformed using the formula
+.B FIXC0/FIXC1
+and forwarded to gnuplot using the
+.B feedGnuplot(1)
+script with the curve name 'CPI' in the legend. The y-axis is labeled with the string "Cycles per Instruction".
+.IP 4. 5
+Custom event set with plotting configuration:
+.TP
+.B likwid-perfscope -g L3_BAND,CPI=FIXC0/FIXC1:Y2TITLE="2-Cycles per Instruction" -C 0 --time 500ms ./a.out
+.PP
+This measures the L3 bandwidth for CPU 0 every 500 ms. Additionally, a second curve is plotted with the function
+.B FIXC0/FIXC1
+with the legend entry
+.B CPI.
+The right y-axis is labeled with
+.B 'Cycles per Instruction'
+and is associated to the second formula. The first formula is hidden in the
+.B L3_BAND
+plot group. Since the
+.B CPI
+formula is the last in the list, the curve id is not needed in the
+.B Y2TITLE
+as this is the default behavior.
 
 .SH AUTHOR
-Written by Jan Treibig <jan.treibig at gmail.com>.
+Written by Thomas Roehl <thomas.roehl at googlemail.com>.
 .SH BUGS
-Report Bugs on <http://code.google.com/p/likwid/issues/list>.
+Report Bugs on <https://github.com/rrze-likwid/likwid/issues>.
 .SH "SEE ALSO"
-likwid-perfctr(1), feedGnuplot(1), likwid-pin(1), likwid-powermeter(1), likwid-setFrequencies(1)
+likwid-perfctr(1), feedGnuplot(1)
diff --git a/doc/likwid-pin.1 b/doc/likwid-pin.1
index efea873..1929822 100644
--- a/doc/likwid-pin.1
+++ b/doc/likwid-pin.1
@@ -2,138 +2,182 @@
 .SH NAME
 likwid-pin \- pin a sequential or threaded application to dedicated processors
 .SH SYNOPSIS
-.B likwid-pin 
-.RB [\-vhqipS]
+.B likwid-pin
+.RB [\-vhSpqi]
+.RB [ \-V
+.IR verbosity ]
 .RB [ \-c
-.IR <core_list> ]
+.IR corelist ]
 .RB [ \-s
-.IR <skip_mask> ]
-.RB [ \-d
-.IR <delimiter> ]
+.IR skip_mask ]
 .SH DESCRIPTION
 .B likwid-pin
-is a command line application to pin a sequential or multithreaded 
-applications to dedicated processors. It can be used as replacement for 
-.B taskset(1).
+is a command line application to pin a sequential or multithreaded
+application to dedicated processors. It can be used as replacement for taskset.
 Opposite to taskset no affinity mask but single processors are specified.
-For multithreaded applications based on the pthread library the 
-.I pthread_create
+For multithreaded applications based on the pthread library the
+.B pthread_create
 library call is overloaded through LD_PRELOAD and each created thread is pinned
-to a dedicated processor as specified in 
-.I core_list
-.
+to a dedicated processor as specified in
+.I core_list .
 .PP
-Per default every generated thread is pinned to the core in the order of calls 
-to 
-.I pthread_create.
-It is possible to skip single threads using -s commandline option.
+Per default every generated thread is pinned to the core in the order of calls to
+.B pthread_create.
+It is possible to skip single threads.
 .PP
-For OpenMP implementations gcc and icc compilers are explicitly supported. Others may also work.
+The OpenMP implementations of GCC and ICC compilers are explicitly supported.
+Clang's OpenMP backend should also work as it is built on top of Intel's OpenMP runtime library.
+Others may also work.
 .B likwid-pin
-sets the environment variable OMP_NUM_THREADS for you if not already present.
-It will set as many threads as present in the pin expression.  Be aware that
+sets the environment variable
+.B OMP_NUM_THREADS
+for you if not already present.
+It will set as many threads as present in the pin expression. Be aware that
 with pthreads the parent thread is always pinned. If you create for example 4
 threads with
-.I pthread_create 
-and do not use the parent process as worker you
-still have to provide num_threads+1 processor ids.
+.B pthread_create
+and do not use the parent process as worker you still have to provide
+.B num_threads+1
+processor ids.
 .PP
 .B likwid-pin
-supports different numberings for pinning. Per default physical numbering of
-the cores is used.  This is the numbering also 
-.B likwid-topology(1)
-reports. But also logical numbering inside the node or the sockets can be used.  If using
-with a N (e.g. -c N:0-6) the cores are logical numbered over the whole node.
-Physical cores come first. If a system e.g. has 8 cores with 16 SMT threads
-with -c N:0-7 you get all physical cores.  If you specify -c N:0-15 you get all
-physical cores and all SMT threads. With S you can specify logical numberings
-inside sockets, again physical cores come first. You can mix different domains
-separated with @. E.g. -c S0:0-3 at S2:2-3 you pin thread 0-3 to logical cores 0-3 on socket 0
-and threads 4-5 on logical cores 2-3 on socket 2.
+supports different numberings for pinning. See section
+.B CPU EXPRESSION
+for details.
 .PP
-For applications where first touch policy on numa systems cannot be employed
+For applications where first touch policy on NUMA systems cannot be employed
 .B likwid-pin
 can be used to turn on interleave memory placement. This can significantly
-speed up the performance of memory bound multithreaded codes. All numa nodes
+speed up the performance of memory bound multithreaded codes. All NUMA nodes
 the user pinned threads to are used for interleaving.
 
 .SH OPTIONS
 .TP
-.B \-\^v
+.B \-\^h,\-\-\^help
+prints a help message to standard output, then exits.
+.TP
+.B \-\^v,\-\-\^version
 prints version information to standard output, then exits.
 .TP
-.B \-\^h
-prints a help message to standard output, then exits.
+.B \-\^V, \-\-\^verbose <level>
+verbose output during execution for debugging. 0 for only errors, 1 for informational output, 2 for detailed output and 3 for developer output
 .TP
-.B \-\^c " <processor_list> OR <thread_expression> OR <scatter policy> "
-specify a numerical list of processors. The list may contain multiple 
-items, separated by comma, and ranges. For example 0,3,9-11. You can also use
-logical numberings, either within a node (N), a socket (S<id>) or a numa domain (M<id>).
-likwid-pin also supports logical pinning within a cpuset with a L prefix. If you ommit this option
-likwid-pin will pin the threads to the processors on the node with physical cores first.
-See below for details on using a thread expression or scatter policy
+.B \-\^c <cpu expression>
+specify a numerical list of processors. The list may contain multiple items, separated by comma, and ranges. For example 0,3,9-11. Other formats are available, see the
+.B CPU EXPRESSION
+section.
 .TP
-.B \-\^s " <skip_mask>
+.B \-\^s, \-\-\^skip <mask>
 Specify skip mask as HEX number. For each set bit the corresponding thread is skipped.
 .TP
-.B \-\^S
-All ccNUMA memory domains belonging to the specified threadlist will be cleaned before the run. Can solve file buffer cache problems on Linux.
+.B \-\^S,\-\-\^sweep
+All ccNUMA memory domains belonging to the specified thread list will be cleaned before the run. Can solve file buffer cache problems on Linux.
 .TP
 .B \-\^p
-prints the available thread domains for logical pinning. If used in combination with -c, the physical processor IDs are printed to stdout.
+prints the available thread domains for logical pinning
 .TP
 .B \-\^i
-set numa memory policy to interleave spanning all numa nodes involved in pinning
+set NUMA memory policy to interleave involving all NUMA nodes involved in pinning
 .TP
-.B \-\^q
+.B \-\^q,\-\-\^quiet
 silent execution without output
-.TP
-.B \-\^d " <delimiter>
-set delimiter used to output the physical processor list (-p & -c)
 
+.SH CPU EXPRESSION
+.IP 1. 4
+The most intuitive CPU selection method is a comma-separated list of physical CPU IDs. An example for this is
+.B 0,2
+which schedules the threads on CPU cores 
+.B 0
+and
+.B 2.
+The physical numbering also allows the usage of ranges like
+.B 0-2
+which results in the list
+.B 0,1,2.
+.IP 2. 4
+The CPUs can be selected by their indices inside of an affinity domain. The affinity domain is optional and if not given, Likwid assumes the domain
+.B 'N'
+for the whole node. The format is
+.B L:<indexlist>
+for selecting the CPUs inside of domain
+.B 'N'
+or
+.B L:<domain>:<indexlist>
+for selecting the CPUs inside the given domain. Assuming a virtual affinity domain
+.B 'P'
+that contains the CPUs
+.B 0,4,1,5,2,6,3,7.
+After sorting it to have physical cores first we get:
+.B 0,1,2,3,4,5,6,7.
+The logical numbering
+.B L:P:0-2
+results in the selection
+.B 0,1,2
+from the physical cores first list.
+.IP 3. 4
+The expression syntax enables the selection according to a selection function with variable input parameters. The format is either
+.B E:<affinity domain>:<numberOfThreads>
+to use the first <numberOfThreads> threads in affinity domain <affinity domain> or
+.B E:<affinity domain>:<numberOfThreads>:<chunksize>:<stride>
+to use <numberOfThreads> threads with <chunksize> threads selected in row while skipping <stride> threads in affinity domain <affinity domain>. Examples are
+.B E:N:4:1:2
+for selecting the first four physical CPUs on a system with 2 SMT threads per core or
+.B E:P:4:2:4
+for choosing the first two threads in affinity domain
+.B P,
+skipping 2 threads and selecting again two threads. The resulting CPU list for virtual affinity domain
+.B P
+is
+.B 0,4,2,6
+.IP 4. 4
+The last format schedules the threads not only in a single affinity domain but distributes them evenly over all available affinity domains of the same kind. In contrast to the other formats, the selection is done using the physical cores first and then the SMT threads. The format is
+.B <affinity domain without number>:scatter
+like
+.B M:scatter
+to schedule the threads evenly in all available memory affinity domains. Assuming the two socket domains
+.B S0 = 0,4,1,5
+and
+.B S1 = 2,6,3,7
+the expression
+.B S:scatter
+results in the CPU list
+.B 0,2,1,3,4,6,5,7
 
 .SH EXAMPLE
-.IP 1. 4
+.IP 1. 5
 For standard pthread application:
 .TP
-.B likwid-pin -c 0,2,4-6  ./myApp
+.B likwid-pin -c 0,2,4-6 ./myApp
 .PP
-The parent process is pinned to processor 0. Thread 0 to processor 2, thread
-1 to processor 4, thread 2 to processor 5 and thread 3 to processor 6. If more threads
-are created than specified in the processor list, these threads are pinned to processor 0
-as fallback.
-.IP 2. 4
-For gcc OpenMP as many ids must be specified in processor list as there are threads: 
-.TP
-.B OMP_NUM_THREADS=4; likwid-pin -c 0,2,1,3  ./myApp
-.IP 3. 4
-Full control over the pinning can be achieved by specifying a skip mask.
-For example the following command skips the pinning of thread 1:
+The parent process is pinned to processor 0 which is likely to be thread 0 in
+.B ./myApp.
+Thread 1 is pinned to processor 2, thread 2 to processor 4, thread 3 to processor 5 and thread 4 to processor 6. If more threads
+are created than specified in the processor list, these threads are pinned to processor 0 as fallback.
+.IP 2. 5
+For selection of CPUs inside of a CPUset only the logical numbering is allowed. Assuming CPUset
+.B 0,4,1,5:
 .TP
-.B OMP_NUM_THREADS=4; likwid-pin -s 0x1 -c 0,2,1,3  ./myApp
-.IP 4. 4
-The -c switch supports the definition of threads in a specific affinity domain like
-NUMA node or cache group. The available affinity domains can be retrieved with the -p switch 
-and no further option on the commandline. The common affinity domains are N (whole Node), 
-SX (socket X), CX (cache group X) and MX (memory group X). Multiple affinity domains 
-can be set separated by @. In order to pin 2 threads on each socket of a 2-socket system:
-.TP
-.B OMP_NUM_THREADS=4; likwid-pin -c S0:0-1 at S1:0-1  ./myApp
-.IP 5. 4
-Another argument definition of the -c switch allows the threads to be pinned according
-to an expression like E:N:4:1:2. The syntax is E:<thread domain>:<number of threads>(:<chunk size>:<stride>).
-The example pins 8 threads with 2 SMT threads per core on a SMT 4 machine:
-.TP
-.B OMP_NUM_THREADS=4; likwid-pin -c E:N:8:2:4  ./myApp
-.IP 6. 4
-The last alternative for the -c switch is the automatic scattering of threads on affinity domains.
-For example to scatter the threads over all memory domains in a system:
+.B likwid-pin -c L:1,3 ./myApp
+.PP
+This command pins
+.B ./myApp
+on CPU
+.B 4
+and the thread started by
+.B ./myApp
+on CPU
+.B 5
+.IP 3. 5
+A common use-case for the numbering by expression is pinning of an application on the Intel Xeon Phi coprocessor with its 60 cores each having 4 SMT threads.
 .TP
-.B OMP_NUM_THREADS=4; likwid-pin -c M:scatter  ./myApp
+.B likwid-pin -c E:N:60:1:4 ./myApp
+.PP
+This command schedules one thread per physical CPU core for
+.B ./myApp.
 
 .SH AUTHOR
-Written by Jan Treibig <jan.treibig at gmail.com>.
+Written by Thomas Roehl <thomas.roehl at googlemail.com>.
 .SH BUGS
-Report Bugs on <http://code.google.com/p/likwid/issues/list>.
+Report Bugs on <https://github.com/rrze-likwid/likwid/issues>.
 .SH "SEE ALSO"
-taskset(1), likwid-perfctr(1), likwid-features(1), likwid-powermeter(1), likwid-setFrequencies(1)
+taskset(1), likwid-perfctr(1), likwid-features(1), likwid-topology(1)
diff --git a/doc/likwid-powermeter.1 b/doc/likwid-powermeter.1
index f4a3ba2..c114142 100644
--- a/doc/likwid-powermeter.1
+++ b/doc/likwid-powermeter.1
@@ -3,49 +3,72 @@
 likwid-powermeter \- A tool to print power and clocking information on Intel CPUs
 .SH SYNOPSIS
 .B likwid-powermeter 
-.RB [ \-vhip ]
+.RB [ \-vhpitf ]
+.RB [ \-V
+.IR verbosity_level ]
 .RB [ \-c
-.IR <socket_list> ]
+.IR socket_list ]
 .RB [ \-s
-.IR <duration_in_seconds> ]
+.IR duration ]
 .RB [ \-M
-.IR <access_mode>]
+.IR <0|1> ]
 .SH DESCRIPTION
 .B likwid-powermeter
-is a command line application to get the energy comsumption of Intel RAPL capable processors. 
-It also prints information about TDP and Turbo Mode steps supported.
+is a command line application to get the energy consumption on Intel RAPL capable processors. Currently
+only Intel SandyBridge is supported. It also prints information about TDP and Turbo Mode steps supported.
 The Turbo Mode information works on all Turbo mode enabled Intel processors. The tool can be either used
 in stethoscope mode for a specified duration or as a wrapper to your application measuring your complete 
 run. RAPL works on a per package (socket) base.
-Please note that the RAPL counters are also accessible as normal events within
-.B likwid-perfctr.
+Please note that the RAPL counters are also accessible as normal events within likwid-perfctr.
 .SH OPTIONS
 .TP
-.B \-\^v
+.B \-\^h,\-\-\^help
+prints a help message to standard output, then exits.
+.TP
+.B \-\^v,\-\-\^version
 prints version information to standard output, then exits.
 .TP
-.B \-\^h
-prints a help message to standard output, then exits.
+.B \-\^V, \-\-\^verbose <level>
+verbose output during execution for debugging. 0 for only errors, 1 for informational output, 2 for detailed output and 3 for developer output
+.TP
+.B \-\^c <socket_list>
+set on which socket(s) the RAPL interface is accessed. List of sockets like 0,1,2 or 0-2 are allowed.
 .TP
-.B \-\^c " <socket_list>"
-set on which sockets the RAPL interface is accessed. comma-separated list of socket IDs
+.B \-\^M <0|1>
+set how MSR registers are accessed, 0=direct, 1=accessDaemon.
+.TP
+.B \-\^s <duration>
+set measure duration in us, ms or s. (default 2s)
 .TP
 .B \-\^p
-prints out information about dynamic clocks and CPI information on the socket measured. Uses likwid-perfctr internally.
+prints out information about dynamic clocks and CPI information on the socket(s) measured.
 .TP
-.B \-\^i
-prints out information TDP and Turbo mode steps
+.B \-\^i,\-\-\^info
+prints out information about TDP and Turbo mode steps of all RAPL domains supporting it.
 .TP
-.B \-\^M " <access_mode>"
-set the access method. 0 for direct access to MSR/RAPL registers, 1 for using the accessDaemon.
+.B \-\^t
+prints out the temperature of all CPUs in the system.
 .TP
-.B \-\^s " <duration_in_seconds>
-measure the power for a specific time (default 2s)
+.B \-\^f
+prints out the temperature like
+.B \-\^t
+but uses Fahrenheit as temperature unit.
 
+.SH EXAMPLE
+.IP 1. 3
+Measure the power consumption for 4 seconds on socket 1
+.TP
+.B likwid-powermeter -s 4 -c 1
+.PP
+.IP 2. 3
+Use it as wrapper for an application to measure the energy for the whole execution
+.TP
+.B likwid-powermeter -c 1 ./a.out
+.PP
 
 .SH AUTHOR
-Written by Jan Treibig <jan.treibig at gmail.com>.
+Written by Thomas Roehl <thomas.roehl at googlemail.com>.
 .SH BUGS
-Report Bugs on <http://code.google.com/p/likwid/issues/list>.
+Report Bugs on <https://github.com/rrze-likwid/likwid/issues>.
 .SH "SEE ALSO"
-likwid-topology(1), likwid-perfctr(1), likwid-pin(1), likwid-features(1), likwid-setFrequencies(1)
+likwid-topology(1), likwid-perfctr(1), likwid-pin(1)
diff --git a/doc/likwid-setFreq.1 b/doc/likwid-setFreq.1
index 87054c7..5ab9653 100644
--- a/doc/likwid-setFreq.1
+++ b/doc/likwid-setFreq.1
@@ -4,7 +4,7 @@ likwid-setFreq \- Mediator for
 .B likwid-setFrequencies(1)
 that performs the actual setting of CPU cores' frequency and governor.
 .SH SYNOPSIS
-.B likwid-setFreq 
+.B likwid-setFreq
 .IR <coreId>
 .IR <frequency>
 .IR [<governor>]
@@ -14,11 +14,13 @@ that performs the actual setting of CPU cores' frequency and governor.
 is a command line application that mediates the request from
 .B likwid-setFrequencies(1)
 because setting a CPU core's frequency and/or governor requires root privileges. This executable must be suid-root.
+.B likwid-setFreq
+works only with the kernel module acpi-cpufreq. The recent intel_pstate module does not allow to set fixed frequencies.
 
 
 .SH AUTHOR
-Written by Jan Treibig <jan.treibig at gmail.com>.
+Written by Thomas Roehl <thomas.roehl at googlemail.com>.
 .SH BUGS
-Report Bugs on <http://code.google.com/p/likwid/issues/list>.
+Report Bugs on <https://github.com/rrze-likwid/likwid/issues>.
 .SH "SEE ALSO"
-likwid-setFrequencies(1), likwid-perfctr(1), feedGnuplot(1), likwid-pin(1), likwid-powermeter(1)
+likwid-setFrequencies(1)
diff --git a/doc/likwid-setFrequencies.1 b/doc/likwid-setFrequencies.1
index b268280..99dd777 100644
--- a/doc/likwid-setFrequencies.1
+++ b/doc/likwid-setFrequencies.1
@@ -3,20 +3,30 @@
 likwid-setFrequencies \- print and manage the clock frequency of CPU cores
 .SH SYNOPSIS
 .B likwid-setFrequencies 
-.RB [\-hpl]
+.RB [\-hvplm]
 .RB [ \-c
-.IR <cpu_list,_socket_list_or_expression> ]
+.IR <cpu_list> ]
 .RB [ \-g
 .IR <governor> ]
-.RB [ \-f
+.RB [ \-f,\-\-\^freq
 .IR <frequency> ]
 .SH DESCRIPTION
 .B likwid-setFrequencies
-is a command line application to set the clock frequency of CPU cores. Since only priviledged users are allowed to change the frequency of CPU cores, the application works in combination with a daemon 
-.B likwid-setFreq.
-The daemon needs the suid permission bit to be set in order to manipulate the sysfs entries. With 
+is a command line application to set the clock frequency of CPU cores. Since only privileged users are allowed to change the frequency of CPU cores, the application works in combination with a daemon
+.B likwid-setFreq(1).
+The daemon needs the suid permission bit to be set in order to manipulate the sysfs entries. With
 .B likwid-setFrequencies
 the clock of all cores inside a the cpu_list or affinity domain can be set to a specific frequency or governor at once.
+.B likwid-setFrequencies
+works only with the kernel module
+.B acpi-cpufreq.
+The recent
+.B intel_pstate
+module does not allow to set fixed frequencies. In order to deactivate
+.B intel_pstate
+add 'intel_pstate=disable' to your kernel boot commandline (commonly in grub) and load the
+.B acpi-cpufreq
+module.
 .SH OPTIONS
 .TP
 .B \-h
@@ -28,19 +38,23 @@ prints the current frequencies for all CPU cores
 .B \-l
 prints all configurable frequencies
 .TP
-.B \-\^c " <cpu_list,_socket_list_or_expression>
-set the affinity domain where to set the frequencies. Common are N (Node), SX (Socket X), CX (Cache Group X) and MX (Memory Group X). For detailed information about affinity domains see
+.B \-m
+prints all configurable governors
+.TP
+.B \-\^c <cpu_list>
+set the affinity domain where to set the frequencies. Common are N (Node), SX (Socket X), CX (Cache Group X) and MX (Memory Group X).
+For detailed information about affinity domains see
 .B likwid-pin(1)
 .TP
-.B \-\^g " <governor>
+.B \-\^g <governor>
 set the governor of all CPU cores inside the affinity domain. Current governors are ondemand, performance, turbo. Default is ondemand
 .TP
-.B \-\^f " <frequency>
+.B \-\^f, \-\-\^freq <frequency>
 set a fixed frequency at all CPU cores inside the affinity domain. Implicitly sets userspace governor for the cores.
 
 .SH AUTHOR
-Written by Thomas Roehl <thomas.roehl at gmail.com>.
+Written by Thomas Roehl <thomas.roehl at googlemail.com>.
 .SH BUGS
-Report Bugs on <http://code.google.com/p/likwid/issues/list>.
+Report Bugs on <https://github.com/rrze-likwid/likwid/issues>.
 .SH "SEE ALSO"
-likwid-perfctr(1), likwid-features(1), likwid-pin(1), likwid-powermeter(1), likwid-topology(1),
+likwid-pin(1), likwid-perfctr(1), likwid-powermeter(1)
diff --git a/doc/likwid-topology.1 b/doc/likwid-topology.1
index 64bc8b4..4a7a9da 100644
--- a/doc/likwid-topology.1
+++ b/doc/likwid-topology.1
@@ -2,41 +2,47 @@
 .SH NAME
 likwid-topology \- print thread and cache topology
 .SH SYNOPSIS
-.B likwid-topology 
+.B likwid-topology
 .RB [\-hvgcC]
+.RB [ \-V
+.IR level ]
 .RB [ \-o
-.IR <filename> ]
+.IR output_file ]
 .SH DESCRIPTION
 .B likwid-topology
-is a command line application to print the thread and cache topology on multicore x86 processors. Used with mono spaced fonts it can
-draw the processor topology of a machine in ASCII art. Beyond topology
-.B likwid-topology
-determines the clock of a processor and prints detailed informations about the caches hierarchy and NUMA structure.
+is a command line application to print the thread and cache
+topology on multicore x86 processors. Used with mono spaced fonts it can
+draw the processor topology of a machine in ASCII art. Beyond topology
+likwid-topology determines the clock of a processor and prints detailed
+information about the cache hierarchy.
 .SH OPTIONS
 .TP
-.B \-v
+.B \-h, \-\-\^help
+prints a help message to standard output, then exits.
+.TP
+.B \-v, \-\-\^version
 prints version information to standard output, then exits.
 .TP
-.B \-h
-prints a help message to standard output, then exits.
+.B \-V, \-\-\^verbose <level>
+sets the verbosity level of LIKWID's topology backend. Possible levels range from 0 to 3.
 .TP
 .B \-g
 prints topology information in ASCII art. Best viewed with monospaced font.
 .TP
-.B \-c
-prints detailed informations about cache hierarchy
+.B \-c, \-\-\^caches
+prints detailed information about cache hierarchy
 .TP
-.B \-C
-measures and output the processor clock. This involves a longer runtime of
-.B likwid-topology.
+.B \-C, \-\-\^clock
+measures and outputs the processor clock. This involves a longer run time of likwid-topology.
 .TP
-.B \-\^f " <filename>
-Specify output file for topology information. According to the file suffix, the information
-is converted using converter scripts installed at <PREFIX>/share/likwid
+.B \-o, \-\-\^output <file>
+write the output to file instead of stdout.
+Likwid applies filter scripts according to filename suffix.
+Currently available scripts are xml and csv.
 
 .SH AUTHOR
-Written by Jan Treibig <jan.treibig at gmail.com>.
+Written by Thomas Roehl <thomas.roehl at googlemail.com>.
 .SH BUGS
-Report Bugs on <http://code.google.com/p/likwid/issues/list>.
+Report Bugs on <https://github.com/rrze-likwid/likwid/issues>.
 .SH "SEE ALSO"
-likwid-perfctr(1), likwid-features(1), likwid-pin(1), likwid-powermeter(1), likwid-setFrequencies(1)
+likwid-perfctr(1), likwid-features(1), likwid-pin(1)
diff --git a/doc/likwid.cfg.md b/doc/likwid.cfg.md
new file mode 100644
index 0000000..2122dee
--- /dev/null
+++ b/doc/likwid.cfg.md
@@ -0,0 +1,38 @@
+/*! \page likwid.cfg <CODE>likwid.cfg</CODE>
+<H1>Information</H1>
+<CODE>likwid.cfg</CODE> is the global configuration file for LIKWID but it is optional. The configuration is normally defined at compile time. It allows setting the access mode, the path to the MSR/PCI access daemon and some other basic options.<BR>
+LIKWID searches for the configuration file at different paths like <CODE>/usr/local/etc/likwid.cfg</CODE>.<BR>
+<B>Note: It was introduced with version 4 and is not fully integrated in the LIKWID code.</B>
+
+<H1>Config file options</H1>
+<H1>Config file</H1>
+The global configuration file has the following options:
+<TABLE>
+<TR>
+  <TH>Option</TH>
+  <TH>Description</TH>
+</TR>
+<TR>
+  <TD>topology_file = &lt;path&gt;</TD>
+  <TD>Path to the topology file created with \ref likwid-genTopoCfg</TD>
+</TR>
+<TR>
+  <TD>access_mode = &lt;daemon|direct&gt;</TD>
+  <TD>Set access mode. The direct mode can only be used by users with root privileges. The daemon mode uses \ref likwid-accessD.</TD>
+</TR>
+<TR>
+  <TD>daemon_path = &lt;path&gt;</TD>
+  <TD>Path to the access daemon.</TD>
+</TR>
+<TR>
+  <TD>max_threads = &lt;arg&gt;</TD>
+  <TD>Adjust maximally supported threads/CPUs. <B>Note:</B> not used yet, fixed at compile time.</TD>
+</TR>
+<TR>
+  <TD>max_nodes = &lt;arg&gt;</TD>
+  <TD>Adjust maximally supported NUMA nodes. <B>Note:</B> not used yet, fixed at compile time.</TD>
+</TR>
+</TABLE>
+
+
+*/
diff --git a/doc/logo.png b/doc/logo.png
new file mode 100644
index 0000000..048ed9a
Binary files /dev/null and b/doc/logo.png differ
diff --git a/doc/lua-doxygen.md b/doc/lua-doxygen.md
new file mode 100644
index 0000000..b8bcd79
--- /dev/null
+++ b/doc/lua-doxygen.md
@@ -0,0 +1,2615 @@
+/*! \page lua_Info Information about LIKWID's Lua API
+<H1>How to include Lua API into own Lua applications</H1>
+<CODE>
+package.path = package.path .. ';&lt;PREFIX&gt;/share/lua/?.lua'<BR>
+local likwid = require("likwid")<BR>
+</CODE>
+<P></P>
+Now all functions and variables can be accessed with<BR>
+<CODE>likwid.<I>functionname()</I></CODE><BR>
+or<BR>
+<CODE>likwid.<I>variable</I></CODE>
+
+<H1>Global variables defined by LIKWID's Lua API</H1>
+<TABLE>
+<TR>
+  <TH>Variablename</TH>
+  <TH>Description</TH>
+</TR>
+<TR>
+  <TD>\a groupfolder</TD>
+  <TD>Path to the folder containing the definitions of the performance groups</TD>
+</TR>
+<TR>
+  <TD>\a version</TD>
+  <TD>Version of LIKWID</TD>
+</TR>
+<TR>
+  <TD>\a release</TD>
+  <TD>Release number of LIKWID</TD>
+</TR>
+<TR>
+  <TD>\a pinlibpath</TD>
+  <TD>Path to the pinning library. Is added automatically to $LD_PRELOAD by \ref likwid-pin and \ref likwid-perfctr</TD>
+</TR>
+<TR>
+  <TD>\a hline</TD>
+  <TD>Horizontal line with 80 '-' characters</TD>
+</TR>
+<TR>
+  <TD>\a sline</TD>
+  <TD>Horizontal line with 80 '*' characters</TD>
+</TR>
+<TR>
+  <TD>\a dline</TD>
+  <TD>Horizontal line with 80 '=' characters</TD>
+</TR>
+</TABLE>
+*/
+
+/*! \page lua_Config Config file module
+<H1>Data type definition for Lua config file module in the Lua API</H1>
+\anchor lua_config
+<H2>Config file read</H2>
+<P>This structure is returned by \ref getConfiguration function<BR>The config file can be created with \ref likwid-genTopoCfg executable. It searches the files /etc/likwid.cfg and &lt;PREFIX&gt;/etc/likwid.cfg. Other configuration file paths can be set in config.mk before building LIKWID.</P>
+<TABLE>
+<TR>
+  <TH>Membername</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>\a configFile</TD>
+  <TD>Path to the config file</TD>
+</TR>
+<TR>
+  <TD>\a topologyFile</TD>
+  <TD>Path to the config file containing topology information</TD>
+</TR>
+<TR>
+  <TD>\a daemonPath</TD>
+  <TD>Path to the access daemon</TD>
+</TR>
+<TR>
+  <TD>\a daemonMode</TD>
+  <TD>Access mode for LIKWID (0 = direct access, 1 = access daemon)</TD>
+</TR>
+<TR>
+  <TD>\a maxNumThreads</TD>
+  <TD>Maximal amount of hardware threads in the system</TD>
+</TR>
+<TR>
+  <TD>\a maxNumNodes</TD>
+  <TD>Maximal amount of NUMA nodes in the system</TD>
+</TR>
+<TR>
+  <TD>\a maxHashTableSize</TD>
+  <TD>Maximal size for the internally used hash table</TD>
+</TR>
+</TABLE>
+
+<H1>Function definitions for Lua config file module in the Lua API</H1>
+\anchor getConfiguration
+<H2>getConfiguration()</H2>
+<P>Read the configuration file and return a list of config options</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>List of configuration options, see \ref lua_config</TD>
+</TR>
+</TABLE>
+
+\anchor setVerbosity
+<H2>setVerbosity()</H2>
+<P>Define and/or change the verbosity level of LIKWID</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a verbosity</TD>
+      <TD>0 = only errors<BR>1 = infos<BR>2 = detail<BR>3 = developer<BR>Other flags are rejected.</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>None</TD>
+</TR>
+</TABLE>
+
+\anchor putConfiguration
+<H2>putConfiguration()</H2>
+<P>Frees the C-structures that were created by \ref getConfiguration function.</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>None</TD>
+</TR>
+</TABLE>
+
+*/
+
+/*! \page lua_Access Access client module
+<H1>Data type definition for Lua access client module in the Lua API</H1>
+<H1>Function definitions for Lua access client module in the Lua API</H1>
+\anchor setAccessMode
+<H2>setAccessMode()</H2>
+<P>Define and/or change the access mode to the MSR and PCI registers</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a accessFlag</TD>
+      <TD>0 = direct access<BR>1 = access daemon<BR>Other flags are rejected.</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Always 0</TD>
+</TR>
+</TABLE>
+
+*/
+
+/*! \page lua_CPUTopology CPU information module
+<H1>Data type definition for CPU information module in the Lua API</H1>
+\anchor lua_cpuinfo
+<H2>Cpu Info</H2>
+<P>This structure is returned by \ref getCpuInfo function<BR>It is similar to the C struct CpuInfo</P>
+<TABLE>
+<TR>
+  <TH>Membername</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>\a family</TD>
+  <TD>Family ID of CPU</TD>
+</TR>
+<TR>
+  <TD>\a model</TD>
+  <TD>Model ID of CPU</TD>
+</TR>
+<TR>
+  <TD>\a stepping</TD>
+  <TD>Revision of CPU</TD>
+</TR>
+<TR>
+  <TD>\a clock</TD>
+  <TD>Base clock frequency</TD>
+</TR>
+<TR>
+  <TD>\a turbo</TD>
+  <TD>Flag if the system supports the Turbo mode</TD>
+</TR>
+<TR>
+  <TD>\a name</TD>
+  <TD>Name of the microarchitecture</TD>
+</TR>
+<TR>
+  <TD>\a osname</TD>
+  <TD>Name of the CPU as given by manufacturer</TD>
+</TR>
+<TR>
+  <TD>\a short_name</TD>
+  <TD>Short name of microarchitecture</TD>
+</TR>
+<TR>
+  <TD>\a features</TD>
+  <TD>String with all interesting CPU feature flags as a space separated list</TD>
+</TR>
+<TR>
+  <TD>\a featureFlags</TD>
+  <TD>Bitmask with all interesting CPU feature flags<BR>Bit positions can be retrieved from the FeatureBit enum</TD>
+</TR>
+<TR>
+  <TD>\a isIntel</TD>
+  <TD>Flag to check if the system is using Intel CPUs</TD>
+</TR>
+<TR>
+  <TD>\a perf_version</TD>
+  <TD>Version of architectural performance monitoring capabilities</TD>
+</TR>
+<TR>
+  <TD>\a perf_num_ctr</TD>
+  <TD>Amount of core-local general-purpose counters</TD>
+</TR>
+<TR>
+  <TD>\a perf_num_fixed_ctr</TD>
+  <TD>Amount of core-local fixed-purpose counters</TD>
+</TR>
+<TR>
+  <TD>\a perf_width_ctr</TD>
+  <TD>Register width of core-local counters</TD>
+</TR>
+</TABLE>
+
+
+\anchor lua_cputopo
+<H2>Cpu Topology</H2>
+<P>This structure is returned by \ref getCpuTopology function<BR>The nested list structure is similar to the C struct CpuTopology.</P>
+<TABLE>
+<TR>
+  <TH>Membername</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>\a numHWThreads</TD>
+  <TD>Total amount of hardware threads in the system</TD>
+</TR>
+<TR>
+  <TD>\a activeHWThreads</TD>
+  <TD>Amount of active hardware threads in the system</TD>
+</TR>
+<TR>
+  <TD>\a numSockets</TD>
+  <TD>Number of CPU sockets in the system</TD>
+</TR>
+<TR>
+  <TD>\a numCoresPerSocket</TD>
+  <TD>Number of physical cores of each socket in the system</TD>
+</TR>
+<TR>
+  <TD>\a numThreadsPerCore</TD>
+  <TD>Number of hardware threads of each core in the system</TD>
+</TR>
+<TR>
+  <TD>\a numCacheLevels</TD>
+  <TD>Amount of cache levels in the system</TD>
+</TR>
+<TR>
+  <TD>\a threadPool<BR>(List with<BR>\a numHWThreads entries)</TD>
+    <TD>
+    <TABLE>
+    <TR>
+      <TH>Membername</TH>
+      <TH>Comment</TH>
+    </TR>
+    <TR>
+      <TD>\a threadId</TD>
+      <TD>Thread ID</TD>
+    </TR>
+    <TR>
+      <TD>\a coreId</TD>
+      <TD>ID of physical CPU core</TD>
+    </TR>
+    <TR>
+      <TD>\a apicId</TD>
+      <TD>ID of the interrupt line for the hardware thread as defined by APIC</TD>
+    </TR>
+    <TR>
+      <TD>\a packageId</TD>
+      <TD>ID of CPU socket for the current thread</TD>
+    </TR>
+    <TR>
+      <TD>\a inCpuSet</TD>
+      <TD>Defines whether the thread is available in current cpuset</TD>
+    </TR>
+    </TABLE>
+    </TD>
+</TR>
+<TR>
+  <TD>\a cacheLevels<BR>(List with<BR>\a numCacheLevels entries)</TD>
+    <TD>
+    <TABLE>
+    <TR>
+      <TH>Membername</TH>
+      <TH>Comment</TH>
+    </TR>
+    <TR>
+      <TD>\a level</TD>
+      <TD>Level of cache</TD>
+    </TR>
+    <TR>
+      <TD>\a associativity</TD>
+      <TD>Associativity in cache level</TD>
+    </TR>
+    <TR>
+      <TD>\a sets</TD>
+      <TD>Sets in cache level</TD>
+    </TR>
+    <TR>
+      <TD>\a lineSize</TD>
+      <TD>Size of a cache line in cache level</TD>
+    </TR>
+    <TR>
+      <TD>\a size</TD>
+      <TD>Size in bytes of cache level</TD>
+    </TR>
+    <TR>
+      <TD>\a threads</TD>
+      <TD>Amount of threads sharing the cache</TD>
+    </TR>
+    <TR>
+      <TD>\a inclusive</TD>
+      <TD>Inclusiveness of cache</TD>
+    </TR>
+    <TR>
+      <TD>\a type</TD>
+      <TD>
+        <TABLE>
+        <TR>
+          <TH>Typename</TH>
+          <TH>comment</TH>
+        </TR>
+        <TR>
+          <TD>DATACACHE</TD>
+          <TD>Cache manages only data</TD>
+        </TR>
+        <TR>
+          <TD>INSTRUCTIONCACHE</TD>
+          <TD>Cache manages only instructions</TD>
+        </TR>
+        <TR>
+          <TD>UNIFIEDCACHE</TD>
+          <TD>Cache manages data and instructions</TD>
+        </TR>
+        <TR>
+          <TD>ITLB</TD>
+          <TD>Translation Lookaside Buffer for instruction page addresses</TD>
+        </TR>
+        <TR>
+          <TD>DTLB</TD>
+          <TD>Translation Lookaside Buffer for data page addresses</TD>
+        </TR>
+        <TR>
+          <TD>NOCACHE</TD>
+          <TD>Type cannot be determined</TD>
+        </TR>
+        </TABLE>
+      </TD>
+    </TR>
+    </TABLE>
+    </TD>
+</TR>
+<TR>
+  <TD>\a topologyTree</TD>
+  <TD><TABLE>
+    <TR>
+      <TH>Membername</TH>
+      <TH>Comment</TH>
+    </TR>
+    <TR>
+      <TD>\a ID</TD>
+      <TD>ID of socket</TD>
+    </TR>
+    <TR>
+      <TD>\a Childs</TD>
+      <TD><TABLE>
+        <TR>
+            <TH>Membername</TH>
+            <TH>Comment</TH>
+        </TR>
+        <TR>
+            <TD>\a ID</TD>
+            <TD>ID of CPU core</TD>
+        </TR>
+        <TR>
+            <TD>\a Childs</TD>
+            <TD>List of thread IDs for the current CPU core</TD>
+        </TR>
+      </TABLE></TD></TR>
+  </TABLE></TD>
+</TR>
+</TABLE>
+
+
+<H1>Function definitions for Lua CPU information module in the Lua API</H1>
+\anchor getCpuInfo
+<H2>getCpuInfo()</H2>
+<P>Get basic information about the CPUs in the system</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Cpu Info \ref lua_cpuinfo</TD>
+</TR>
+</TABLE>
+
+\anchor getCpuTopology
+<H2>getCpuTopology()</H2>
+<P>Get the topology information about the CPUs in the system</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Return</TD>
+  <TD>Cpu Topology \ref lua_cputopo</TD>
+</TR>
+</TABLE>
+
+<H2>putTopology()</H2>
+<P>Frees C struct CpuInfo and CpuTopology. You can still use the lua_cpuinfo and lua_cputopo data structures<BR>If you call \ref getCpuInfo or \ref getCpuTopology functions again after calling this function, the topology information will be read again.</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Return</TD>
+  <TD>None</TD>
+</TR>
+</TABLE>
+
+\anchor cpustr_to_cpulist
+<H2>cpustr_to_cpulist()</H2>
+<P>Resolve the given CPU expression string to a list of CPUs as available in the system</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a cpuexpression</TD>
+      <TD>CPU expression string. Look at \ref likwid-pin for possible formats</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Return</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a nrCPUs</TD>
+      <TD>Number of CPUs in the \a cpulist</TD>
+    </TR>
+    <TR>
+      <TD>\a cpulist</TD>
+      <TD>List containing the CPU IDs after resolution of the cpu expression</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+</TABLE>
+
+<H2>printSupportedCPUs()</H2>
+<P>Print all Intel and AMD CPU types that are supported by Likwid</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Return</TD>
+  <TD>None</TD>
+</TR>
+</TABLE>
+
+*/
+
+
+/*! \page lua_NumaInfo NUMA memory topology module
+
+<H1>Data type definition for Lua NUMA topology module in the Lua API</H1>
+\anchor lua_numainfo
+<H2>NUMA Info</H2>
+<P>This structure is returned by \ref getNumaInfo function<BR>It is similar to the C struct NumaTopology</P>
+<TABLE>
+<TR>
+  <TH>Membername</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>\a numberOfNodes</TD>
+  <TD>Amount of NUMA nodes in the system</TD>
+</TR>
+<TR>
+  <TD>\a nodes</TD>
+    <TD><TABLE>
+    <TR>
+      <TH>Membername</TH>
+      <TH>Comment</TH>
+    </TR>
+    <TR>
+      <TD>id</TD>
+      <TD>ID of NUMA node</TD>
+    </TR>
+    <TR>
+      <TD>totalMemory</TD>
+      <TD>Total amount of memory in the NUMA domain</TD>
+    </TR>
+    <TR>
+      <TD>freeMemory</TD>
+      <TD>Free amount of memory in the NUMA domain</TD>
+    </TR>
+    <TR>
+      <TD>numberOfProcessors</TD>
+      <TD>Amount of CPUs in the NUMA domain</TD>
+    </TR>
+    <TR>
+      <TD>numberOfDistances</TD>
+      <TD>Amount of distances to local and remote NUMA nodes</TD>
+    </TR>
+    <TR>
+      <TD>processors</TD>
+      <TD>List of CPU IDs in the NUMA domain</TD>
+    </TR>
+    <TR>
+      <TD>distances</TD>
+      <TD>Two dimensional list of distances to NUMA nodes in the system</TD>
+    </TR>
+    </TABLE></TD>
+</TR>
+</TABLE>
+
+<H1>Function definitions for Lua NUMA topology module in the Lua API</H1>
+\anchor getNumaInfo
+<H2>getNumaInfo()</H2>
+<P>Get information about the NUMA domains in the system</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>NUMA Info \ref lua_numainfo</TD>
+</TR>
+</TABLE>
+
+
+<H2>putNumaInfo()</H2>
+<P>Frees C struct NumaTopology. You can still use the lua_numainfo data structure<BR>If you call \ref getNumaInfo function again after calling this function, the NUMA topology information will be read again.</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Return</TD>
+  <TD>None</TD>
+</TR>
+</TABLE>
+
+<H2>setMemInterleaved()</H2>
+<P>Set the 'Interleaved' memory policy to allocate data only on given CPUs</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a nrThreads</TD>
+      <TD>Amount of threads in the \a threads2Cpus list</TD>
+    </TR>
+    <TR>
+      <TD>\a threads2Cpus</TD>
+      <TD>List of thread to CPU relations</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Return</TD>
+  <TD>None</TD>
+</TR>
+</TABLE>
+
+<H2>nodestr_to_nodelist()</H2>
+<P>Resolve the given node expression in NUMA affinity domain</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a node expression</TD>
+      <TD>List of CPUs in NUMA node</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Return</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a nrThreads</TD>
+      <TD>Amount of threads in the \a threads2Cpus list</TD>
+    </TR>
+    <TR>
+      <TD>\a threads2Cpus</TD>
+      <TD>List of thread to CPU relations</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+</TABLE>
+
+<H2>sockstr_to_socklist()</H2>
+<P>Resolve the given socket expression in socket affinity domain</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a socket expression</TD>
+      <TD>List of CPUs in socket affinity domain</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Return</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a nrThreads</TD>
+      <TD>Amount of threads in the \a threads2Cpus list</TD>
+    </TR>
+    <TR>
+      <TD>\a threads2Cpus</TD>
+      <TD>List of thread to CPU relations</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+</TABLE>
+
+*/
+
+/*! \page lua_AffinityInfo Thread affinity module
+
+<H1>Data type definition for Lua thread affinity module in the Lua API</H1>
+\anchor lua_affinityinfo
+<H2>Affinity Info</H2>
+<P>This structure is returned by \ref getAffinityInfo function<BR>It is similar to the C struct AffinityDomains</P>
+<TABLE>
+<TR>
+  <TH>Membername</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>\a numberOfAffinityDomains</TD>
+  <TD>Total amount of affinity domains in the system</TD>
+</TR>
+<TR>
+  <TD>\a numberOfSocketDomains</TD>
+  <TD>Amount of affinity domains for CPU sockets in the system</TD>
+</TR>
+<TR>
+  <TD>\a numberOfNumaDomains</TD>
+  <TD>Amount of affinity domains for NUMA domains in the system</TD>
+</TR>
+<TR>
+  <TD>\a numberOfCacheDomains</TD>
+  <TD>Amount of affinity domains for LLC domains in the system</TD>
+</TR>
+<TR>
+  <TD>\a numberOfProcessorsPerSocket</TD>
+  <TD>Amount of hardware threads for each CPU socket in the system</TD>
+</TR>
+<TR>
+  <TD>\a numberOfCoresPerCache</TD>
+  <TD>Amount of physical CPU cores for each LLC in the system</TD>
+</TR>
+<TR>
+  <TD>\a numberOfProcessorsPerCache</TD>
+  <TD>Amount of hardware threads for each LLC in the system</TD>
+</TR>
+<TR>
+  <TD>\a domains</TD>
+    <TD><TABLE>
+    <TR>
+      <TH>Membername</TH>
+      <TH>Comment</TH>
+    </TR>
+    <TR>
+      <TD>tag</TD>
+      <TD>Tag identifying the affinity domain</TD>
+    </TR>
+    <TR>
+      <TD>numberOfCores</TD>
+      <TD>Amount of physical CPU cores in the affinity domain</TD>
+    </TR>
+    <TR>
+      <TD>numberOfProcessors</TD>
+      <TD>Amount of hardware threads in the affinity domain</TD>
+    </TR>
+    <TR>
+      <TD>processorList</TD>
+      <TD>List with hardware thread IDs that are in the affinity domain</TD>
+    </TR>
+    </TABLE></TD>
+</TR>
+</TABLE>
+<H1>Function definitions for Lua thread affinity module in the Lua API</H1>
+\anchor getAffinityInfo
+<H2>getAffinityInfo()</H2>
+<P>Get information about the affinity domains in the system</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Affinity Info \ref lua_affinityinfo</TD>
+</TR>
+</TABLE>
+<H2>putAffinityInfo()</H2>
+<P>Frees C struct AffinityDomains. You can still use the lua_affinityinfo data structure<BR>If you call \ref getAffinityInfo function again after calling this function, the thread affinity information will be read again.</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>None</TD>
+</TR>
+</TABLE>
+\anchor pinProcess
+<H2>pinProcess()</H2>
+<P>Pins the current process to the given CPU ID</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a cpuID</TD>
+      <TD>CPU to pin the process on</TD>
+    </TR>
+    <TR>
+      <TD>\a silent</TD>
+      <TD>Verbosity of pinning method</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>None</TD>
+</TR>
+</TABLE>
+*/
+
+
+/*! \page lua_Perfmon Performance monitoring module
+<H1>Data type definition for Lua performance monitoring module in the Lua API</H1>
+\anchor lua_counterinfo
+<H2>Event and Counter Info</H2>
+<P>This structure is returned by \ref getEventsAndCounters function</P>
+<TABLE>
+<TR>
+  <TH>Membername</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>\a Counters</TD>
+  <TD><TABLE>
+    <TR>
+      <TH>Membername</TH>
+      <TH>Comment</TH>
+    </TR>
+    <TR>
+      <TD>Name</TD>
+      <TD>Counter name as used by LIKWID</TD>
+    </TR>
+    <TR>
+      <TD>Index</TD>
+      <TD>Index of counter definition in internal list of counters</TD>
+    </TR>
+    <TR>
+      <TD>Type</TD>
+      <TD>ID number of counter type, use TypeName to get a human-readable name</TD>
+    </TR>
+    <TR>
+      <TD>TypeName</TD>
+      <TD>Name of counter type</TD>
+    </TR>
+    <TR>
+      <TD>Options</TD>
+      <TD>String with the options available for the counter</TD>
+    </TR>
+    </TABLE></TD>
+</TR>
+<TR>
+  <TD>\a Events</TD>
+  <TD><TABLE>
+    <TR>
+      <TH>Membername</TH>
+      <TH>Comment</TH>
+    </TR>
+    <TR>
+      <TD>Name</TD>
+      <TD>Event name as used by LIKWID</TD>
+    </TR>
+    <TR>
+      <TD>ID</TD>
+      <TD>Event ID as defined by CPU vendor</TD>
+    </TR>
+    <TR>
+      <TD>Umask</TD>
+      <TD>Umask further restricting the event defined by ID</TD>
+    </TR>
+    <TR>
+      <TD>Limit</TD>
+      <TD>String containing the name(s) of registers the event can be programmed on</TD>
+    </TR>
+    <TR>
+      <TD>Options</TD>
+      <TD>String with the options available for the event</TD>
+    </TR>
+    </TABLE></TD>
+</TR>
+</TABLE>
+
+\anchor lua_groupdata
+<H2>Info about a performance group</H2>
+<P>This structure is returned by \ref get_groupdata function</P>
+<TABLE>
+<TR>
+  <TH>Membername</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>EventString</TD>
+  <TD>Event set used for the performance group. Well formatted for \ref addEventSet function</TD>
+</TR>
+<TR>
+  <TD>GroupString</TD>
+  <TD>Name of the performance group</TD>
+</TR>
+<TR>
+  <TD>LongDescription</TD>
+  <TD>Description of the group. The 'LONG' section in the performance group file</TD>
+</TR>
+<TR>
+  <TD>\a Events</TD>
+  <TD><TABLE>
+    <TR>
+      <TH>Membername</TH>
+      <TH>Comment</TH>
+    </TR>
+    <TR>
+      <TD>Event ID</TD>
+      <TD><TABLE>
+      <TR>
+        <TD>\a Event</TD>
+        <TD>Name of event</TD>
+      </TR>
+      <TR>
+        <TD>\a Counter</TD>
+        <TD>LIKWID's name of the counter register</TD>
+      </TR>
+      </TABLE></TD>
+    </TR>
+    </TABLE></TD>
+</TR>
+<TR>
+  <TD>\a Metrics</TD>
+  <TD><TABLE>
+    <TR>
+      <TH>Membername</TH>
+      <TH>Comment</TH>
+    </TR>
+    <TR>
+      <TD>Metric ID</TD>
+      <TD><TABLE>
+      <TR>
+        <TD>\a description</TD>
+        <TD>Descriptive information of the metric</TD>
+      </TR>
+      <TR>
+        <TD>\a formula</TD>
+        <TD>Formula for calculating the metrics value</TD>
+      </TR>
+      </TABLE></TD>
+    </TR>
+    </TABLE></TD>
+</TR>
+</TABLE>
+
+
+\anchor lua_pcidevinfo
+<H2>Info about online PCI devices used for performance monitoring</H2>
+<P>This structure is returned by \ref getOnlineDevices function</P>
+<TABLE>
+<TR>
+  <TH>Membername</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>\a Name (used by LIKWID)</TD>
+  <TD><TABLE>
+    <TR>
+      <TH>Membername</TH>
+      <TH>Comment</TH>
+    </TR>
+    <TR>
+      <TD>Name</TD>
+      <TD>Name of PCI device</TD>
+    </TR>
+    <TR>
+      <TD>Path</TD>
+      <TD>Path to PCI device</TD>
+    </TR>
+    <TR>
+      <TD>Type</TD>
+      <TD>Human-readable name of the PCI device type</TD>
+    </TR>
+    <TR>
+      <TD>TypeDescription</TD>
+      <TD>Description about the PCI device</TD>
+    </TR>
+    </TABLE></TD>
+</TR>
+</TABLE>
+
+<H1>Function definitions for Lua performance monitoring module in the Lua API</H1>
+\anchor init
+<H2>init()</H2>
+<P>Initializes the Perfmon module of LIKWID, like opening the MSR files and checking the PCI devices<BR>If in access daemon mode, a single daemon instance is started to forward measurements on all given CPUs</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a nrThreads</TD>
+      <TD>Number of CPUs that should be measured</TD>
+    </TR>
+    <TR>
+      <TD>\a thread2Cpus</TD>
+      <TD>List with length \a nrThreads containing the relation between thread number and measured CPU</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Error code, 0 for success</TD>
+</TR>
+</TABLE>
+
+\anchor addEventSet
+<H2>addEventSet()</H2>
+<P>Creates the internal management structures for the given event set. Checks the registers and if needed PCI device access<BR>The \ref init function has to be called previously</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a eventSet</TD>
+      <TD>String composed of all events in the event set. Format is Event1:Counter1(:Option11:Option12:...),Event2:Counter2...</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>The group ID of the added event set</TD>
+</TR>
+</TABLE>
+
+
+\anchor setupCounters
+<H2>setupCounters()</H2>
+<P>Setup the config registers to measure the events defined by group</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a groupID</TD>
+      <TD>ID of group returned by \ref addEventSet function.</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Error code, 0 for success</TD>
+</TR>
+</TABLE>
+
+\anchor startCounters
+<H2>startCounters()</H2>
+<P>Starts the perfmon group previously set up with \ref setupCounters function.</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Error code, 0 for success</TD>
+</TR>
+</TABLE>
+
+\anchor stopCounters
+<H2>stopCounters()</H2>
+<P>Stops the perfmon group and reads the counters into the internal result section. Use the \ref getResult or \ref getResults functions to get the results.</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Error code, 0 for success</TD>
+</TR>
+</TABLE>
+
+\anchor readCounters
+<H2>readCounters()</H2>
+<P>Reads the perfmon group into the internal result section. Use the \ref getResult or \ref getResults functions to get the results.<BR>The counters will be stopped briefly and restarted after reading to exclude the LIKWID code from measurements.</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Error code, 0 for success</TD>
+</TR>
+</TABLE>
+
+\anchor switchGroup
+<H2>switchGroup()</H2>
+<P>Switches the currently active group in the perfmon module. If the given group ID does not exist, it falls back to group ID 1.</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a newgroup</TD>
+      <TD>Switch active group to \a newgroup</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Error code, 0 for success</TD>
+</TR>
+</TABLE>
+
+\anchor finalize
+<H2>finalize()</H2>
+<P>Destroy internal structures and clean all used registers</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Always 0</TD>
+</TR>
+</TABLE>
+
+\anchor getResult
+<H2>getResult()</H2>
+<P>Get result for a group, event, thread combination. All options must be given</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a groupID</TD>
+      <TD>Return result from group defined by \a groupID</TD>
+    </TR>
+    <TR>
+      <TD>\a eventID</TD>
+      <TD>Return result for event with \a eventID. Position in string given to \ref addEventSet function</TD>
+    </TR>
+    <TR>
+      <TD>\a threadID</TD>
+      <TD>Return result for thread with \a threadID as defined by the \a thread2Cpus input parameter for \ref init function</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Result</TD>
+</TR>
+</TABLE>
+
+\anchor getResults
+<H2>getResults()</H2>
+<P>Get all results for all group, event, thread combinations</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Three-dimensional list with results. First dim. is groups, second dim. is events and third dim. are the threads</TD>
+</TR>
+</TABLE>
+
+\anchor getMarkerResults
+<H2>getMarkerResults()</H2>
+<P>Get the results for an output file written by \ref MarkerAPI</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a filename</TD>
+      <TD>Filename written by \ref MarkerAPI</TD>
+    </TR>
+    <TR>
+      <TD>\a group_list</TD>
+      <TD>List of defined groups</TD>
+    </TR>
+    <TR>
+      <TD>\a num_cpus</TD>
+      <TD>Amount of defined CPUs. It is only used for checking if the \ref MarkerAPI run is valid. If LIKWID_MARKER_THREADINIT is not called properly the tests will fail</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Four-dimensional list with results. First dim. is groups, second dim. is measurement regions, third dim. are the events and fourth dim. are the threads</TD>
+</TR>
+</TABLE>
+
+\anchor getEventsAndCounters
+<H2>getEventsAndCounters()</H2>
+<P>Get a list containing all event and counter definitions</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Event and counter info like \ref lua_counterinfo</TD>
+</TR>
+</TABLE>
+
+\anchor getOnlineDevices
+<H2>getOnlineDevices()</H2>
+<P>Get a list containing all online PCI devices</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>PCI device info like \ref lua_pcidevinfo</TD>
+</TR>
+</TABLE>
+
+\anchor getNumberOfGroups
+<H2>getNumberOfGroups()</H2>
+<P>Returns the number of event sets (groups) added to the perfmon module</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Amount of configured groups</TD>
+</TR>
+</TABLE>
+
+\anchor getIdOfActiveGroup
+<H2>getIdOfActiveGroup()</H2>
+<P>Returns the ID of the currently active group</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>ID of active group</TD>
+</TR>
+</TABLE>
+
+\anchor getRuntimeOfGroup
+<H2>getRuntimeOfGroup()</H2>
+<P>Returns the measurement time of the given groupID</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a groupID</TD>
+      <TD>Return the measurement time for group defined by \a groupID</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Measurement time of group</TD>
+</TR>
+</TABLE>
+
+\anchor getNumberOfEvents
+<H2>getNumberOfEvents()</H2>
+<P>Returns the amount of events for the given groupID</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a groupID</TD>
+      <TD>Return the amount of events for group defined by \a groupID</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Amount of events in group</TD>
+</TR>
+</TABLE>
+
+\anchor getNumberOfThreads
+<H2>getNumberOfThreads()</H2>
+<P>Returns the number of threads as given to \ref init function</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Amount of measurement threads</TD>
+</TR>
+</TABLE>
+
+\anchor get_groups
+<H2>get_groups()</H2>
+<P>Returns a list of all performance groups in \a groupfolder</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a architecture</TD>
+      <TD>Short name of architecture. Can be found in CPU info \ref lua_cpuinfo as \a short_name</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a numberOfGroups</TD>
+      <TD>Amount of groups in \a groupfolder for given \a architecture</TD>
+    </TR>
+    <TR>
+      <TD>\a groups</TD>
+      <TD>List with the names of all performance groups in \a groupfolder for given \a architecture</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+</TABLE>
+
+\anchor get_groupdata
+<H2>get_groupdata()</H2>
+<P>Read in the performance group \a group</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a group</TD>
+      <TD>Get group data for \a group </TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a groupdata</TD>
+      <TD>Structure with all group information found for the performance group \a group, see \ref lua_groupdata</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+</TABLE>
+
+*/
+
+/*! \page lua_PowerInfo Power and Energy monitoring module
+<H1>Data type definition for Lua power and energy monitoring module in the Lua API</H1>
+\anchor lua_powerinfo
+<H2>Power Information</H2>
+<P>This structure is returned by \ref getPowerInfo function<BR>The nested list structure is almost similar to the C struct PowerInfo.</P>
+<TABLE>
+<TR>
+  <TH>Membername</TH>
+  <TH>Comment</TH>
+</TR>
+<TR>
+  <TD>\a hasRAPL</TD>
+  <TD>If set, the system supports power readings through the RAPL interface</TD>
+</TR>
+<TR>
+  <TD>\a baseFrequency</TD>
+  <TD>Nominal clock frequency of the system</TD>
+</TR>
+<TR>
+  <TD>\a minFrequency</TD>
+  <TD>Minimal supported clock frequency of the system</TD>
+</TR>
+<TR>
+  <TD>\a powerUnit</TD>
+  <TD>Multiplier for power readings</TD>
+</TR>
+<TR>
+  <TD>\a timeUnit</TD>
+  <TD>Multiplier for time readings from RAPL</TD>
+</TR>
+<TR>
+  <TD>\a turbo</TD>
+    <TD>
+    <TABLE>
+    <TR>
+      <TH>Membername</TH>
+      <TH>Comment</TH>
+    </TR>
+    <TR>
+      <TD>\a numSteps</TD>
+      <TD>Amount of turbo mode steps</TD>
+    </TR>
+    <TR>
+      <TD>\a steps</TD>
+      <TD>List containing the turbo mode steps</TD>
+    </TR>
+    </TABLE></TD>
+</TR>
+<TR>
+  <TD>\a domains</TD>
+    <TD>
+    <TABLE>
+    <TR>
+      <TH>Membername</TH>
+      <TH>Comment</TH>
+    </TR>
+    <TR>
+      <TD>\a RAPL domain</TD>
+      <TD>
+        <TABLE>
+        <TR>
+          <TH>Typename</TH>
+          <TH>comment</TH>
+        </TR>
+        <TR>
+          <TD>ID</TD>
+          <TD>Type of domain (PKG, PP0, PP1, DRAM)</TD>
+        </TR>
+        <TR>
+          <TD>energyUnit</TD>
+          <TD>Multiplier for energy readings for RAPL domain</TD>
+        </TR>
+        <TR>
+          <TD>supportStatus</TD>
+          <TD>RAPL domain has a status register to read energy values</TD>
+        </TR>
+        <TR>
+          <TD>supportPerf</TD>
+          <TD>RAPL domain has a perf register</TD>
+        </TR>
+        <TR>
+          <TD>supportPolicy</TD>
+          <TD>RAPL domain has a policy register to define a global energy policy</TD>
+        </TR>
+        <TR>
+          <TD>supportLimit</TD>
+          <TD>RAPL domain has a limit register to define a limit for the energy consumption</TD>
+        </TR>
+        <TR>
+          <TD>supportInfo</TD>
+          <TD>RAPL domain has an info register providing the TDP, the minimal and maximal power consumption and the maximal time window</TD>
+        </TR>
+        <TR>
+          <TD>tdp</TD>
+          <TD>Thermal Design Power<BR>Only if supportInfo is set</TD>
+        </TR>
+        <TR>
+          <TD>minPower</TD>
+          <TD>Minimal power consumption for the RAPL domain<BR>Only if supportInfo is set</TD>
+        </TR>
+        <TR>
+          <TD>maxPower</TD>
+          <TD>Maximal power consumption for the RAPL domain<BR>Only if supportInfo is set</TD>
+        </TR>
+        <TR>
+          <TD>maxTimeWindow</TD>
+          <TD>Maximal duration between updates of the RAPL status registers<BR>Only if supportInfo is set</TD>
+        </TR>
+        </TABLE>
+        </TD>
+    </TR>
+    </TABLE>
+    </TD>
+</TR>
+</TABLE>
+<H1>Function definitions for Lua power and energy monitoring module in the Lua API</H1>
+\anchor getPowerInfo
+<H2>getPowerInfo()</H2>
+<P>Get information about the RAPL interface in the system</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Power Info \ref lua_powerinfo</TD>
+</TR>
+</TABLE>
+\anchor putPowerInfo
+<H2>putPowerInfo()</H2>
+<P>Frees C struct PowerInfo. You can still use the lua_powerinfo data structure<BR>If you call \ref getPowerInfo function again after calling this function, the power information struct will be filled again.</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>None</TD>
+</TR>
+</TABLE>
+
+\anchor startPower
+<H2>startPower()</H2>
+<P>Start measuring given RAPL domain on given CPU</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a cpuID</TD>
+      <TD>Start the power measurement on CPU \a cpuID</TD>
+    </TR>
+    <TR>
+      <TD>\a domainID</TD>
+      <TD>Start the power measurement for domain domainID<BR>Possible values: 0=PKG, 1=PP0, 2=PP1, 3=DRAM</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Power value at start</TD>
+</TR>
+</TABLE>
+
+\anchor stopPower
+<H2>stopPower()</H2>
+<P>Stop measuring given RAPL domain on given CPU</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a cpuID</TD>
+      <TD>Stop the power measurement on CPU \a cpuID</TD>
+    </TR>
+    <TR>
+      <TD>\a domainID</TD>
+      <TD>Stop the power measurement for domain domainID<BR>Possible values: 0=PKG, 1=PP0, 2=PP1, 3=DRAM</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Power value at stop</TD>
+</TR>
+</TABLE>
+
+
+\anchor printEnergy
+<H2>printEnergy()</H2>
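+<P>Calculates the energy consumption of a RAPL domain from the values returned by \ref startPower and \ref stopPower</P>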
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a before</TD>
+      <TD>Result from \ref startPower function</TD>
+    </TR>
+    <TR>
+      <TD>\a after</TD>
+      <TD>Result from \ref stopPower function</TD>
+    </TR>
+    <TR>
+      <TD>\a domainID</TD>
+      <TD>Print the power result for domain domainID<BR>Possible values: 0=PKG, 1=PP0, 2=PP1, 3=DRAM</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Energy consumed between \a before and \a after</TD>
+</TR>
+</TABLE>
+
+\anchor limitGet
+<H2>limitGet() (EXPERIMENTAL)</H2>
+<P>Get the current limit in the limit register of domain. The limit is defined as maximal power consumption in a time window</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a cpuID</TD>
+      <TD>Get limit for CPU \a cpuID</TD>
+    </TR>
+    <TR>
+      <TD>\a domainID</TD>
+      <TD>Get limit for domain domainID<BR>Possible values: 0=PKG, 1=PP0, 2=PP1, 3=DRAM</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a power</TD>
+      <TD>Power limit value</TD>
+    </TR>
+    <TR>
+      <TD>\a time</TD>
+      <TD>Duration of time window</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+</TABLE>
+
+
+\anchor limitSet
+<H2>limitSet() (EXPERIMENTAL)</H2>
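+<P>Set the limit in the limit register of domain. The limit is defined as maximal power consumption in a time window</P>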
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a cpuID</TD>
+      <TD>Set limit for CPU \a cpuID</TD>
+    </TR>
+    <TR>
+      <TD>\a domainID</TD>
+      <TD>Set limit for domain domainID<BR>Possible values: 0=PKG, 1=PP0, 2=PP1, 3=DRAM</TD>
+    </TR>
+    <TR>
+      <TD>\a power</TD>
+      <TD>Set power value to \a power</TD>
+    </TR>
+    <TR>
+      <TD>\a time</TD>
+      <TD>Set time window value to \a time</TD>
+    </TR>
+    <TR>
+      <TD>\a clamp</TD>
+      <TD>Should the limit be clamped or can it sometimes exceed the power limit if in total the limit is satisfied</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Error code, 0 for success</TD>
+</TR>
+</TABLE>
+
+\anchor limitState
+<H2>limitState() (EXPERIMENTAL)</H2>
+<P>Get the state of the limit</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a cpuID</TD>
+      <TD>Get the state on CPU \a cpuID</TD>
+    </TR>
+    <TR>
+      <TD>\a domainID</TD>
+      <TD>Get the state for domain domainID<BR>Possible values: 0=PKG, 1=PP0, 2=PP1, 3=DRAM</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>State, 0 for off, 1 for on</TD>
+</TR>
+</TABLE>
+*/
+
+/*! \page lua_ThermalInfo Thermal monitoring module
+<H1>Data type definition for Lua thermal monitoring module in the Lua API</H1>
+<H1>Function definitions for Lua thermal monitoring module in the Lua API</H1>
+\anchor initTemp
+<H2>initTemp()</H2>
+<P>Initialize the thermal measurements on given CPU</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a cpuID</TD>
+      <TD>Initialize thermal readings on CPU \a cpuID</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>None</TD>
+</TR>
+</TABLE>
+
+\anchor readTemp
+<H2>readTemp()</H2>
+<P>Measure the temperature on given CPU</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a cpuID</TD>
+      <TD>Read the temperature on CPU \a cpuID</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Temperature</TD>
+</TR>
+</TABLE>
+*/
+
+/*! \page lua_Timer Time measurement module
+<H1>Data type definition for Lua time measurement module in the Lua API</H1>
+<H1>Function definitions for Lua time measurement module in the Lua API</H1>
+\anchor getCpuClock
+<H2>getCpuClock()</H2>
+<P>Returns the nominal clock speed</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Clock speed in Hz</TD>
+</TR>
+</TABLE>
+
+\anchor startClock
+<H2>startClock()</H2>
+<P>Start the TSC clock</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Current timestamp</TD>
+</TR>
+</TABLE>
+
+\anchor stopClock
+<H2>stopClock()</H2>
+<P>Stop the TSC clock</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Current timestamp</TD>
+</TR>
+</TABLE>
+
+\anchor getClockCycles
+<H2>getClockCycles()</H2>
+<P>Return the amount of cycles between start and stop timestamps</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a start</TD>
+      <TD>Start timestamp</TD>
+    </TR>
+    <TR>
+      <TD>\a stop</TD>
+      <TD>Stop timestamp</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Amount of cycles between start and stop</TD>
+</TR>
+</TABLE>
+
+\anchor getClock
+<H2>getClock()</H2>
+<P>Return the time in seconds between start and stop timestamps</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a start</TD>
+      <TD>Start timestamp</TD>
+    </TR>
+    <TR>
+      <TD>\a stop</TD>
+      <TD>Stop timestamp</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Time in seconds between start and stop</TD>
+</TR>
+</TABLE>
+
+\anchor sleep
+<H2>sleep()</H2>
+<P>Sleep for specified amount of seconds</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a seconds</TD>
+      <TD>Sleep for seconds</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Remaining time to sleep. >0 if sleep is interrupted</TD>
+</TR>
+</TABLE>
+
+\anchor usleep
+<H2>usleep()</H2>
+<P>Sleep for at least the specified amount of microseconds</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a usecs</TD>
+      <TD>Sleep for microseconds. \a usecs must be in range 1 to 999999</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Status of usleep. If interrupted the status is != 0</TD>
+</TR>
+</TABLE>
+
+*/
+
+/*! \page lua_MemSweep Memory sweeping module
+<H1>Data type definition for Lua memory sweeping module in the Lua API</H1>
+<H1>Function definitions for Lua memory sweeping module in the Lua API</H1>
+\anchor memSweep
+<H2>memSweep()</H2>
+<P>Sweep the memory and LLC for given threads</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a nrThreads</TD>
+      <TD>Amount of threads in the \a Cpus list</TD>
+    </TR>
+    <TR>
+      <TD>\a Cpus</TD>
+      <TD>List with thread to CPU relations</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>None</TD>
+</TR>
+</TABLE>
+
+\anchor memSweepDomain
+<H2>memSweepDomain()</H2>
+<P>Sweep the memory and LLC for a given NUMA domain</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a domainID</TD>
+      <TD>Sweep the memory and LLC at the NUMA domain specified by \a domainID</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>None</TD>
+</TR>
+</TABLE>
+*/
+
+/*! \page lua_Misc Miscellaneous functions module
+<H1>Data type definition for Lua miscellaneous functions module in the Lua API</H1>
+<H1>Function definitions for Lua miscellaneous functions module in the Lua API</H1>
+\anchor startProgram
+<H2>startProgram()</H2>
+<P>Start an executable in a new thread</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a Exec</TD>
+      <TD>String containing the executable and its options</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>PID of newly created thread</TD>
+</TR>
+</TABLE>
+
+\anchor checkProgram
+<H2>checkProgram()</H2>
+<P>Check if the executable is running</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>True/False</TD>
+</TR>
+</TABLE>
+
+\anchor killProgram
+<H2>killProgram()</H2>
+<P>Kill the executable with SIGTERM</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a PID</TD>
+      <TD>PID to send the SIGTERM signal</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>None</TD>
+</TR>
+</TABLE>
+
+
+\anchor setenv
+<H2>setenv()</H2>
+<P>Set environment variable. Lua only provides getenv()</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a Name</TD>
+      <TD>Name of environment variable</TD>
+    </TR>
+    <TR>
+      <TD>\a Value</TD>
+      <TD>Value for the environment variable</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>None</TD>
+</TR>
+</TABLE>
+
+\anchor getpid
+<H2>getpid()</H2>
+<P>Get the PID of the current process</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>PID number</TD>
+</TR>
+</TABLE>
+
+\anchor access
+<H2>access()</H2>
+<P>Check the file existence for a given filepath</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a Filepath</TD>
+      <TD>Name of Filepath to check for existence</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>Result of the existence check for \a Filepath</TD>
+</TR>
+</TABLE>
+
+\anchor msr_available
+<H2>msr_available()</H2>
+<P>Check whether the msr files are available. Basically checks whether the msr kernel module is loaded properly</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>True/False</TD>
+</TR>
+</TABLE>
+
+\anchor gethostname
+<H2>gethostname()</H2>
+<P>Returns the hostname of the system in short format</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a Hostname</TD>
+      <TD>Hostname in short format</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+</TABLE>
+
+\anchor getjid
+<H2>getjid()</H2>
+<P>Returns the job ID if running in a batch environment. Basically reads the <CODE>PBS_JOBID</CODE> environment variable</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a Job ID</TD>
+      <TD>Job ID or 'X' if not in batch environment</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+</TABLE>
+
+\anchor getMPIrank
+<H2>getMPIrank()</H2>
+<P>Returns the MPI rank of the current process. Basically reads the <CODE>PMI_RANK</CODE> and <CODE>OMPI_COMM_WORLD_RANK</CODE> environment variables</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD>None</TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a MPI Rank</TD>
+      <TD>MPI rank or 'X' if not in MPI environment</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+</TABLE>
+*/
+
+
+/*! \page lua_InputOutput Input and output functions module
+<H1>Data type definition for Lua output functions module in the Lua API</H1>
+<H1>Function definitions for Lua output functions module in the Lua API</H1>
+\anchor getopt
+<H2>getopt()</H2>
+<P>Read commandline parameters and split them to the given options. The version LIKWID uses was originally taken from the web but extended to take short '-o' and long options "--option". It returns an iterator for the commandline options.<BR>Basic usage:<BR></P>
+<CODE>
+for opt,arg in likwid.getopt(arg, {"n:","h"}) do<BR>
+    if (type(arg) == "string") then<BR>
+        local s,e = arg:find("-")<BR>
+        if s == 1 then<BR>
+            print(string.format("ERROR: Argmument %s to option -%s starts with invalid character -.", arg, opt))<BR>
+            print("ERROR: Did you forget an argument to an option?")<BR>
+            os.exit(1)<BR>
+        end<BR>
+    end<BR>
+    --parse options<BR>
+end<BR>
+</CODE><BR>
+The option 'n' takes an argument, specified by the ':'. If the option 'h' is found, its argument is true. The type check for the argument is recommended to catch the case where an option that expects an argument is given without one.
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a commandline</TD>
+      <TD>Normally, Lua saves the commandline parameters in variable 'arg'</TD>
+    </TR>
+    <TR>
+      <TD>\a optionlist</TD>
+      <TD>List of options that should be recognized. Options with ':' as last character need an argument</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a option</TD>
+      <TD>Option string found on the commandline without leading '-'</TD>
+    </TR>
+    <TR>
+      <TD>\a argument</TD>
+      <TD>Argument to the \a option. If \a option does not require an argument, true or false is returned in \a argument</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+</TABLE>
+
+\anchor parse_time
+<H2>parse_time()</H2>
+<P>Parses time interval describing strings like 2s, 100ms or 250us</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a timestr</TD>
+      <TD>String describing a time interval</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a duration</TD>
+      <TD>Time string \a timestr resolved to usecs</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+</TABLE>
+
+\anchor printtable
+<H2>printtable()</H2>
+<P>Prints the given two dimensional table as fancy ASCII table. For CSV output use \ref printcsv</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a table</TD>
+      <TD>Two dimensional list with table entries. First dim. are columns and second dim. the lines</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>None</TD>
+</TR>
+</TABLE>
+
+\anchor printcsv
+<H2>printcsv()</H2>
+<P>Prints the given two dimensional table in CSV format. For ASCII table output see \ref printtable</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a table</TD>
+      <TD>Two dimensional list with table entries. First dim. are columns and second dim. the lines</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>None</TD>
+</TR>
+</TABLE>
+
+\anchor stringsplit
+<H2>stringsplit()</H2>
+<P>Splits the given string at separating character</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a str</TD>
+      <TD>String to split</TD>
+    </TR>
+    <TR>
+      <TD>\a sSeparator</TD>
+      <TD>String with separating character</TD>
+    </TR>
+    <TR>
+      <TD>\a nMax</TD>
+      <TD>Split string maximally \a nMax times (optional)</TD>
+    </TR>
+    <TR>
+      <TD>\a bRegexp</TD>
+      <TD>Lua RegEx string for separation (optional)</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>List of \a str split at \a sSeparator or \a bRegexp</TD>
+</TR>
+</TABLE>
+
+\anchor printOutput
+<H2>printOutput()</H2>
+<P>Prints results</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a groups</TD>
+      <TD>List of groups for printing</TD>
+    </TR>
+    <TR>
+      <TD>\a results</TD>
+      <TD>List of results as returned by \ref getResults function</TD>
+    </TR>
+    <TR>
+      <TD>\a groupData</TD>
+      <TD>List of group data structures</TD>
+    </TR>
+    <TR>
+      <TD>\a cpulist</TD>
+      <TD>List of thread ID to CPU ID relations</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>None</TD>
+</TR>
+</TABLE>
+
+\anchor print_markerOutput
+<H2>print_markerOutput()</H2>
+<P>Prints results of a Marker API run. This is different to \ref printOutput because we have to resolve the measurement regions</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a groups</TD>
+      <TD>List of groups for printing</TD>
+    </TR>
+    <TR>
+      <TD>\a results</TD>
+      <TD>List of results as returned by \ref getMarkerResults function</TD>
+    </TR>
+    <TR>
+      <TD>\a groupData</TD>
+      <TD>List of group data structures</TD>
+    </TR>
+    <TR>
+      <TD>\a cpulist</TD>
+      <TD>List of thread ID to CPU ID relations</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>None</TD>
+</TR>
+</TABLE>
+
+
+\anchor addSimpleAsciiBox
+<H2>addSimpleAsciiBox()</H2>
+<P>Add a simple ASCII box with given label to box container. This function is only used by \ref likwid-topology</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a container</TD>
+      <TD>Box container containing all boxes</TD>
+    </TR>
+    <TR>
+      <TD>\a lineIdx</TD>
+      <TD>Add box at line index \a lineIdx</TD>
+    </TR>
+    <TR>
+      <TD>\a colIdx</TD>
+      <TD>Add box at column index \a colIdx</TD>
+    </TR>
+    <TR>
+      <TD>\a label</TD>
+      <TD>Content of the box</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>None</TD>
+</TR>
+</TABLE>
+
+\anchor addJoinedAsciiBox
+<H2>addJoinedAsciiBox()</H2>
+<P>Add a joined ASCII box with given label to box container. Joined boxes can span the space of multiple simple boxes. This function is only used by \ref likwid-topology</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a container</TD>
+      <TD>Box container containing all boxes</TD>
+    </TR>
+    <TR>
+      <TD>\a lineIdx</TD>
+      <TD>Add box at line index \a lineIdx</TD>
+    </TR>
+    <TR>
+      <TD>\a startColIdx</TD>
+      <TD>Start joined box at column index \a startColIdx</TD>
+    </TR>
+    <TR>
+      <TD>\a endColIdx</TD>
+      <TD>End joined box at column index \a endColIdx</TD>
+    </TR>
+    <TR>
+      <TD>\a label</TD>
+      <TD>Content of the box</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>None</TD>
+</TR>
+</TABLE>
+
+\anchor printAsciiBox
+<H2>printAsciiBox()</H2>
+<P>Print the box container previously filled with \ref addSimpleAsciiBox and \ref addJoinedAsciiBox. This function is only used by \ref likwid-topology</P>
+<TABLE>
+<TR>
+  <TH>Direction</TH>
+  <TH>Data type(s)</TH>
+</TR>
+<TR>
+  <TD>Input Parameter</TD>
+  <TD><TABLE>
+    <TR>
+      <TD>\a container</TD>
+      <TD>Box container containing all boxes</TD>
+    </TR>
+  </TABLE></TD>
+</TR>
+<TR>
+  <TD>Returns</TD>
+  <TD>None</TD>
+</TR>
+</TABLE>
+*/
diff --git a/examples/C-likwidAPI.c b/examples/C-likwidAPI.c
new file mode 100644
index 0000000..85955ac
--- /dev/null
+++ b/examples/C-likwidAPI.c
@@ -0,0 +1,136 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  C-likwidAPI.c
+ *
+ *      Description:  Example how to use the LIKWID API in C/C++ applications
+ *
+ *      Version:   4.0
+ *      Released:  16.6.2015
+ *
+ *      Author:  Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <likwid.h>
+
+#define EVENTSET "INSTR_RETIRED_ANY:FIXC0"
+
+
+/*
+ * Count INSTR_RETIRED_ANY on every hardware thread for two seconds with the
+ * LIKWID topology and perfmon modules and print one result per CPU. Returns 0
+ * on success and 1 on failure; all exits past topology_init() free the CPU list.
+ */
+int main(int argc, char* argv[])
+{
+    int i;
+    int err;
+    int rc = 1;
+    int* cpus = NULL;
+    int gid;
+    double result = 0.0;
+
+    // Load the topology module and print some values.
+    err = topology_init();
+    if (err < 0)
+    {
+        printf("Failed to initialize LIKWID's topology module\n");
+        return 1;
+    }
+    // CpuInfo_t contains global information like name, CPU family, ...
+    CpuInfo_t info = get_cpuInfo();
+    // CpuTopology_t contains information about the topology of the CPUs.
+    CpuTopology_t topo = get_cpuTopology();
+    printf("Likwid example on a %s with %d CPUs\n", info->name, topo->numHWThreads);
+
+    cpus = malloc(topo->numHWThreads * sizeof(*cpus));
+    if (!cpus)
+        goto out_topology;
+
+    for (i=0;i<topo->numHWThreads;i++)
+    {
+        cpus[i] = topo->threadPool[i].apicId;
+    }
+
+    // Must be called before perfmon_init() but only if you want to use another
+    // access mode as the pre-configured one. For direct access (0) you have to
+    // be root.
+    //accessClient_setaccessmode(0);
+
+    // Initialize the perfmon module.
+    err = perfmon_init(topo->numHWThreads, cpus);
+    if (err < 0)
+    {
+        printf("Failed to initialize LIKWID's performance monitoring module\n");
+        goto out_topology;
+    }
+
+    // Add eventset string to the perfmon module.
+    gid = perfmon_addEventSet(EVENTSET);
+    if (gid < 0)
+    {
+        printf("Failed to add event string %s to LIKWID's performance monitoring module\n", EVENTSET);
+        goto out_perfmon;
+    }
+
+    // Setup the eventset identified by group ID (gid).
+    err = perfmon_setupCounters(gid);
+    if (err < 0)
+    {
+        printf("Failed to setup group %d in LIKWID's performance monitoring module\n", gid);
+        goto out_perfmon;
+    }
+    // Start all counters in the previously set up event set.
+    err = perfmon_startCounters();
+    if (err < 0)
+    {
+        printf("Failed to start counters for group %d for thread %d\n",gid, (-1*err)-1);
+        goto out_perfmon;
+    }
+    // Perform something
+    sleep(2);
+    // Stop all counters in the previously started event set.
+    err = perfmon_stopCounters();
+    if (err < 0)
+    {
+        printf("Failed to stop counters for group %d for thread %d\n",gid, (-1*err)-1);
+        goto out_perfmon;
+    }
+
+    // Print the result of every thread/CPU.
+    for (i = 0;i < topo->numHWThreads; i++)
+    {
+        result = perfmon_getResult(gid, 0, i);
+        printf("Measurement result for event set %s at CPU %d: %f\n", EVENTSET, cpus[i], result);
+    }
+    rc = 0;
+
+out_perfmon:
+    // Uninitialize the perfmon module.
+    perfmon_finalize();
+out_topology:
+    free(cpus);
+    // Uninitialize the topology module.
+    topology_finalize();
+    return rc;
+}
diff --git a/examples/C-markerAPI.c b/examples/C-markerAPI.c
new file mode 100644
index 0000000..82a4a25
--- /dev/null
+++ b/examples/C-markerAPI.c
@@ -0,0 +1,87 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  C-markerAPI.c
+ *
+ *      Description:  Example how to use the C/C++ Marker API
+ *
+ *      Version:   4.0
+ *      Released:  16.6.2015
+ *
+ *      Author:  Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <omp.h>
+#include <likwid.h>
+
+#define SLEEPTIME 2
+
+/* Marker API example: each OpenMP thread measures region "example" around a sleep and prints its results. */
+int main(int argc, char* argv[])
+{
+    // Init Marker API in serial region once in the beginning
+    LIKWID_MARKER_INIT;
+    #pragma omp parallel
+    {
+        // Each thread must add itself to the Marker API, therefore must be
+        // in parallel region
+        LIKWID_MARKER_THREADINIT;
+        // Optional. Register region name
+        LIKWID_MARKER_REGISTER("example");
+    }
+
+    #pragma omp parallel
+    {
+        // Declared inside the region so every thread owns its result buffers.
+        int i, count = 0;
+        int nevents = 10;
+        double time = 0.0;
+        double events[10] = {0.0};
+        printf("Thread %d sleeps now for %d seconds\n", omp_get_thread_num(), SLEEPTIME);
+        // Start measurements inside a parallel region
+        LIKWID_MARKER_START("example");
+        // Insert your code here.
+        // Often contains an OpenMP for pragma. Regions can be nested.
+        sleep(SLEEPTIME);
+        // Stop measurements inside a parallel region
+        LIKWID_MARKER_STOP("example");
+        printf("Thread %d wakes up again\n", omp_get_thread_num());
+        // If multiple groups given, you can switch to the next group
+        LIKWID_MARKER_SWITCH;
+        // If you need the performance data inside your application, use
+        LIKWID_MARKER_GET("example", &nevents, events, &time, &count);
+        // where events is an array of doubles with nevents entries,
+        // time is a double* and count an int*.
+        printf("Region example measures %d events, total measurement time is %f\n", nevents, time);
+        printf("The region was called %d times\n", count);
+        for (i = 0; i < nevents; i++)
+        {
+            printf("Event %d: %f\n", i, events[i]);
+        }
+    }
+
+    // Close Marker API and write results to file for further evaluation done
+    // by likwid-perfctr
+    LIKWID_MARKER_CLOSE;
+    return 0;
+}
diff --git a/examples/F-markerAPI.F90 b/examples/F-markerAPI.F90
new file mode 100644
index 0000000..7cd668e
--- /dev/null
+++ b/examples/F-markerAPI.F90
@@ -0,0 +1,79 @@
+! =======================================================================================
+!
+!      Filename:  F-markerAPI.F90
+!
+!      Description:  Example how to use the Fortran90 Marker API
+!
+!      Version:   4.0
+!      Released:  16.6.2015
+!
+!      Author:  Thomas Roehl (tr), thomas.roehl at googlemail.com
+!      Project:  likwid
+!
+!      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
+!
+!      This program is free software: you can redistribute it and/or modify it under
+!      the terms of the GNU General Public License as published by the Free Software
+!      Foundation, either version 3 of the License, or (at your option) any later
+!      version.
+!
+!      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+!      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+!      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+!
+!      You should have received a copy of the GNU General Public License along with
+!      this program.  If not, see <http://www.gnu.org/licenses/>.
+!
+! =======================================================================================
+
+#define SLEEPTIME 2
+
+program FmarkerAPI
+    use likwid
+    include "omp_lib.h"
+    INTEGER :: nr_events
+    DOUBLE PRECISION, DIMENSION(10) :: events
+    DOUBLE PRECISION :: time
+    INTEGER :: c, i
+    nr_events = 10
+    ! Init Marker API in serial region once in the beginning.
+    call likwid_markerInit()
+
+!$OMP PARALLEL
+    ! Each thread must add itself to the Marker API, therefore must be
+    ! in parallel region.
+    call likwid_markerthreadInit()
+    ! Optional. Register region name and initialize hash table entries.
+    call likwid_markerRegisterRegion("example")
+!$OMP END PARALLEL
+
+! Every thread gets its own result variables; nr_events keeps its initial value.
+!$OMP PARALLEL FIRSTPRIVATE(nr_events) PRIVATE(events, time, c, i)
+    print '(a,i0,a,i0,a)', "Thread ", omp_get_thread_num()," sleeps now for ", SLEEPTIME," seconds"
+    ! Start measurements inside a parallel region.
+    call likwid_markerStartRegion("example")
+    ! Insert your code here
+    ! Often contains an OpenMP DO directive. Regions can be nested.
+    call Sleep(SLEEPTIME)
+    ! Stop measurements inside a parallel region.
+    call likwid_markerStopRegion("example")
+    print '(a,i0,a)', "Thread ", omp_get_thread_num()," wakes up again"
+    ! If multiple groups given, you can switch to the next group.
+    call likwid_markerNextGroup();
+    ! If you need the performance data inside your application, use
+    call likwid_markerGetRegion("example", nr_events, events, time, c)
+    ! Events is an array of DOUBLE PRECISION with nr_events (INTEGER) entries,
+    ! time is a DOUBLE PRECISION and c (the call count) an INTEGER.
+    ! After returning the events array contains maximally nr_events results.
+    print '(a,i0,a,f9.3)', "Region example measures ", nr_events, " events, total measurement time is ", time
+    print '(a,i0,a)', "The region was called ", c, " times"
+    do i=1,nr_events
+        print '(a,i0,a,e13.7)', "Event ",i,": ",events(i)
+    end do
+!$OMP END PARALLEL
+
+! Close Marker API and write results to file for further evaluation done
+! by likwid-perfctr.
+call likwid_markerClose()
+
+end program FmarkerAPI
diff --git a/examples/Lua-likwidAPI.lua b/examples/Lua-likwidAPI.lua
new file mode 100644
index 0000000..d41652d
--- /dev/null
+++ b/examples/Lua-likwidAPI.lua
@@ -0,0 +1,93 @@
+#!<PREFIX>/bin/likwid-lua
+--[[
+ * =======================================================================================
+
+ *
+ *      Filename:  Lua-likwidAPI.lua
+ *
+ *      Description:  Example how to use the LIKWID API in Lua scripts
+ *
+ *      Version:   4.0
+ *      Released:  16.6.2015
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+]]
+
+package.path = package.path .. ';<PREFIX>/share/lua/?.lua'
+
+local likwid = require("likwid")
+
+EVENTSET = "INSTR_RETIRED_ANY:FIXC0"
+
+cpuinfo = likwid.getCpuInfo()
+cputopo = likwid.getCpuTopology()
+
+print(string.format("Likwid example on a %s with %d CPUs", cpuinfo.name, cputopo.numHWThreads))
+
+local cpus = {}
+for i, cpu in pairs(cputopo.threadPool) do
+    table.insert(cpus, cpu.apicId)
+end
+
+if likwid.init(#cpus, cpus) ~= 0 then
+    print("Failed to initialize LIKWID's performance monitoring module")
+    likwid.putTopology()
+    os.exit(1)
+end
+
+local gid = likwid.addEventSet(EVENTSET)
+if gid <= 0 then
+    print(string.format("Failed to add events %s to LIKWID's performance monitoring module", EVENTSET))
+    likwid.finalize()
+    likwid.putTopology()
+    os.exit(1)
+end
+
+-- Failures are reported with print(); Lua has no global printf().
+if likwid.setupCounters(gid) < 0 then
+    print(string.format("Failed to setup group %d in LIKWID's performance monitoring module", gid))
+    likwid.finalize()
+    likwid.putTopology()
+    os.exit(1)
+end
+if likwid.startCounters() < 0 then
+    print(string.format("Failed to start group %d in LIKWID's performance monitoring module", gid))
+    likwid.finalize()
+    likwid.putTopology()
+    os.exit(1)
+end
+-- Application code
+likwid.sleep(2)
+if likwid.stopCounters() < 0 then
+    print(string.format("Failed to stop group %d in LIKWID's performance monitoring module", gid))
+    likwid.finalize()
+    likwid.putTopology()
+    os.exit(1)
+end
+
+-- Print the result of every thread/CPU.
+for i,cpu in pairs(cpus) do
+    result = likwid.getResult(gid, 1, i)
+    print(string.format("Measurement result for event set %s at CPU %d: %f", EVENTSET, cpu, result))
+end
+
+-- Shut down in the same order as the error paths: perfmon first, then topology.
+likwid.finalize()
+likwid.putTopology()
diff --git a/examples/Makefile b/examples/Makefile
new file mode 100644
index 0000000..f82337c
--- /dev/null
+++ b/examples/Makefile
@@ -0,0 +1,36 @@
+
+include ../config.mk
+include ../make/include_$(COMPILER).mk
+
+all: C-markerAPI C-likwidAPI F-markerAPI Lua-likwidAPI C-markerAPI-run C-likwidAPI-run F-markerAPI-run Lua-likwidAPI-run
+
+# Every example has a build target and a matching -run target.
+C-markerAPI:
+	$(CC) -fopenmp -DLIKWID_PERFMON -I$(PREFIX)/include -L$(PREFIX)/lib C-markerAPI.c -o C-markerAPI -llikwid -lm
+
+C-markerAPI-run:
+	$(PREFIX)/bin/likwid-perfctr -C 0 -g INSTR_RETIRED_ANY:FIXC0 -m ./C-markerAPI
+
+C-likwidAPI:
+	$(CC) -fopenmp -DLIKWID_PERFMON -I$(PREFIX)/include -L$(PREFIX)/lib C-likwidAPI.c -o C-likwidAPI -llikwid -lm
+
+C-likwidAPI-run:
+	./C-likwidAPI
+
+F-markerAPI:
+	$(FC) -fopenmp -DLIKWID_PERFMON -I$(PREFIX)/include -L$(PREFIX)/lib F-markerAPI.F90 -o F-markerAPI -llikwid -lm
+
+F-markerAPI-run:
+	$(PREFIX)/bin/likwid-perfctr -C 0 -g INSTR_RETIRED_ANY:FIXC0 -m ./F-markerAPI
+
+Lua-likwidAPI:
+	sed -e "s+<PREFIX>+$(PREFIX)+g" Lua-likwidAPI.lua > Lua-likwidAPI
+	chmod +x Lua-likwidAPI
+
+Lua-likwidAPI-run:
+	./Lua-likwidAPI
+
+clean:
+	rm -f C-markerAPI C-likwidAPI F-markerAPI Lua-likwidAPI
+
+.PHONY: all clean C-markerAPI C-likwidAPI F-markerAPI Lua-likwidAPI C-markerAPI-run C-likwidAPI-run F-markerAPI-run Lua-likwidAPI-run
diff --git a/ext/hwloc/AUTHORS b/ext/hwloc/AUTHORS
new file mode 100644
index 0000000..837b27f
--- /dev/null
+++ b/ext/hwloc/AUTHORS
@@ -0,0 +1,8 @@
+Cédric Augonnet <Cedric.Augonnet at labri.fr>
+Jérôme Clet-Ortega <Jerome.Clet-Ortega at labri.fr>
+Ludovic Courtès <Ludovic.Courtes at inria.fr>
+Brice Goglin <Brice.Goglin at inria.fr>
+Nathalie Furmento <Nathalie.Furmento at labri.fr>
+Samuel Thibault <Samuel.Thibault at labri.fr>
+Jeff Squyres <jsquyres at cisco.com>
+Alexey Kardashevskiy <aik at au1.ibm.com>
diff --git a/ext/hwloc/COPYING b/ext/hwloc/COPYING
new file mode 100644
index 0000000..32128c7
--- /dev/null
+++ b/ext/hwloc/COPYING
@@ -0,0 +1,28 @@
+Copyright © 2009 CNRS
+Copyright © 2009 inria.  All rights reserved.
+Copyright © 2009 Université Bordeaux 1
+Copyright © 2009 Cisco Systems, Inc.  All rights reserved.
+Copyright © 2012 Blue Brain Project, EPFL. All rights reserved.
+See COPYING in top-level directory.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions
+are met:
+1. Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+3. The name of the author may not be used to endorse or promote products
+   derived from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/ext/hwloc/Makefile b/ext/hwloc/Makefile
new file mode 100644
index 0000000..54b3a30
--- /dev/null
+++ b/ext/hwloc/Makefile
@@ -0,0 +1,64 @@
+SRC_DIRS    = ./hwloc
+MAKE_DIR   = ../../make
+
+#DO NOT EDIT BELOW
+
+include ../../config.mk
+include $(MAKE_DIR)/include_$(COMPILER).mk
+
+CFLAGS    = -O2 -Wall -fPIC
+INCLUDES  += -I./include
+#DEFINES   =
+LIBS      = -L. -lm -Wl,-E
+LFLAGS    = -Wno-unused-result -fPIC
+Q         ?= @
+ifeq ($(COMPILER),MIC)
+CFLAGS += -mmic
+LFLAGS += -mmic
+endif
+
+#CONFIGURE BUILD SYSTEM
+BUILD_DIR  = ./$(COMPILER)
+
+VPATH     = $(SRC_DIRS)
+FILES     = $(notdir $(foreach dir,$(SRC_DIRS),$(wildcard $(dir)/*.c)))
+OBJ       = $(patsubst %.c, $(BUILD_DIR)/%.o, $(FILES))
+
+
+
+STATIC_LIBHWLOC = libhwloc.a
+SHARED_LIBHWLOC = libhwloc.so
+
+CPPFLAGS := $(CPPFLAGS) $(DEFINES) $(INCLUDES)
+
+all: $(BUILD_DIR) $(OBJ) $(STATIC_LIBHWLOC) $(SHARED_LIBHWLOC)
+
+$(BUILD_DIR):
+	@mkdir -p $(BUILD_DIR)
+
+# "r" replaces existing archive members; "q" would append stale duplicates on rebuild.
+$(STATIC_LIBHWLOC): $(OBJ)
+	$(Q)${AR} -crs $(STATIC_LIBHWLOC) $(OBJ)
+
+$(SHARED_LIBHWLOC): $(OBJ)
+	${Q}$(CC) $(LFLAGS) -Wall -shared -fPIC -o $(SHARED_LIBHWLOC) $(OBJ) $(LIBS)
+
+#PATTERN RULES (the build directory is an order-only prerequisite of every object)
+$(BUILD_DIR)/%.o:  %.c | $(BUILD_DIR)
+	${Q}$(CC) -c  $(CFLAGS) $(CPPFLAGS) $< -o $@
+	${Q}$(CC) $(CPPFLAGS) -MT $(@:.d=.o) -MM  $< > $(BUILD_DIR)/$*.d
+
+ifeq ($(findstring $(MAKECMDGOALS),clean),)
+-include $(OBJ:.o=.d)
+endif
+
+.PHONY: all clean distclean
+
+clean:
+	@rm -rf $(BUILD_DIR)
+
+distclean: clean
+	@rm -f $(TARGET) $(STATIC_LIBHWLOC) $(SHARED_LIBHWLOC) $(PCILIB)
+
+
+
diff --git a/ext/hwloc/hwloc/base64.c b/ext/hwloc/hwloc/base64.c
new file mode 100644
index 0000000..7a3392f
--- /dev/null
+++ b/ext/hwloc/hwloc/base64.c
@@ -0,0 +1,306 @@
+/*
+ * Copyright © 2012 Inria.  All rights reserved.
+ * See COPYING in top-level directory.
+ *
+ * Modifications after import:
+ * - removed all #if
+ * - updated prototypes
+ * - updated #include
+ */
+
+/*	$OpenBSD: base64.c,v 1.5 2006/10/21 09:55:03 otto Exp $	*/
+
+/*
+ * Copyright (c) 1996 by Internet Software Consortium.
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND INTERNET SOFTWARE CONSORTIUM DISCLAIMS
+ * ALL WARRANTIES WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL INTERNET SOFTWARE
+ * CONSORTIUM BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL
+ * DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR
+ * PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
+ * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ */
+
+/*
+ * Portions Copyright (c) 1995 by International Business Machines, Inc.
+ *
+ * International Business Machines, Inc. (hereinafter called IBM) grants
+ * permission under its copyrights to use, copy, modify, and distribute this
+ * Software with or without fee, provided that the above copyright notice and
+ * all paragraphs of this notice appear in all copies, and that the name of IBM
+ * not be used in connection with the marketing of any product incorporating
+ * the Software or modifications thereof, without specific, written prior
+ * permission.
+ *
+ * To the extent it has a right to do so, IBM grants an immunity from suit
+ * under its patents, if any, for the use, sale or manufacture of products to
+ * the extent that such products are used for performing Domain Name System
+ * dynamic updates in TCP/IP networks by means of the Software.  No immunity is
+ * granted for any product per se or for any other function of any product.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", AND IBM DISCLAIMS ALL WARRANTIES,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+ * PARTICULAR PURPOSE.  IN NO EVENT SHALL IBM BE LIABLE FOR ANY SPECIAL,
+ * DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE, EVEN
+ * IF IBM IS APPRISED OF THE POSSIBILITY OF SUCH DAMAGES.
+ */
+
+/* OPENBSD ORIGINAL: lib/libc/net/base64.c */
+
+static const char Base64[] =
+	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+static const char Pad64 = '=';
+
+/* (From RFC1521 and draft-ietf-dnssec-secext-03.txt)
+   The following encoding technique is taken from RFC 1521 by Borenstein
+   and Freed.  It is reproduced here in a slightly edited form for
+   convenience.
+
+   A 65-character subset of US-ASCII is used, enabling 6 bits to be
+   represented per printable character. (The extra 65th character, "=",
+   is used to signify a special processing function.)
+
+   The encoding process represents 24-bit groups of input bits as output
+   strings of 4 encoded characters. Proceeding from left to right, a
+   24-bit input group is formed by concatenating 3 8-bit input groups.
+   These 24 bits are then treated as 4 concatenated 6-bit groups, each
+   of which is translated into a single digit in the base64 alphabet.
+
+   Each 6-bit group is used as an index into an array of 64 printable
+   characters. The character referenced by the index is placed in the
+   output string.
+
+                         Table 1: The Base64 Alphabet
+
+      Value Encoding  Value Encoding  Value Encoding  Value Encoding
+          0 A            17 R            34 i            51 z
+          1 B            18 S            35 j            52 0
+          2 C            19 T            36 k            53 1
+          3 D            20 U            37 l            54 2
+          4 E            21 V            38 m            55 3
+          5 F            22 W            39 n            56 4
+          6 G            23 X            40 o            57 5
+          7 H            24 Y            41 p            58 6
+          8 I            25 Z            42 q            59 7
+          9 J            26 a            43 r            60 8
+         10 K            27 b            44 s            61 9
+         11 L            28 c            45 t            62 +
+         12 M            29 d            46 u            63 /
+         13 N            30 e            47 v
+         14 O            31 f            48 w         (pad) =
+         15 P            32 g            49 x
+         16 Q            33 h            50 y
+
+   Special processing is performed if fewer than 24 bits are available
+   at the end of the data being encoded.  A full encoding quantum is
+   always completed at the end of a quantity.  When fewer than 24 input
+   bits are available in an input group, zero bits are added (on the
+   right) to form an integral number of 6-bit groups.  Padding at the
+   end of the data is performed using the '=' character.
+
+   Since all base64 input is an integral number of octets, only the
+         -------------------------------------------------
+   following cases can arise:
+
+       (1) the final quantum of encoding input is an integral
+           multiple of 24 bits; here, the final unit of encoded
+	   output will be an integral multiple of 4 characters
+	   with no "=" padding,
+       (2) the final quantum of encoding input is exactly 8 bits;
+           here, the final unit of encoded output will be two
+	   characters followed by two "=" padding characters, or
+       (3) the final quantum of encoding input is exactly 16 bits;
+           here, the final unit of encoded output will be three
+	   characters followed by one "=" padding character.
+   */
+
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+
+#include <private/private.h>
+
+/* Encode the srclength bytes at src as NUL-terminated base64 text in target,
+   which has room for targsize chars.  Every three input bytes yield four
+   output characters; a final group of one or two bytes is zero-filled and
+   padded with '='.  Returns the number of characters produced (the
+   terminating NUL is not counted) or -1 as soon as target proves too small;
+   characters already stored in target at that point are left in place.  */
+int
+hwloc_encode_to_base64(const char *src, size_t srclength, char *target, size_t targsize)
+{
+	size_t written = 0;
+	size_t remaining = srclength;
+	unsigned char group[3];
+	unsigned int k;
+
+	/* Whole 24-bit groups: three input bytes give four output characters. */
+	for (; remaining >= 3; remaining -= 3, src += 3) {
+		group[0] = src[0];
+		group[1] = src[1];
+		group[2] = src[2];
+
+		/* Never start an output quantum that does not fit completely. */
+		if (written + 4 > targsize)
+			return (-1);
+		target[written++] = Base64[group[0] >> 2];
+		target[written++] = Base64[((group[0] & 0x03) << 4) | (group[1] >> 4)];
+		target[written++] = Base64[((group[1] & 0x0f) << 2) | (group[2] >> 6)];
+		target[written++] = Base64[group[2] & 0x3f];
+	}
+
+	/* One or two leftover bytes: zero-fill the group and pad with '='. */
+	if (remaining != 0) {
+		group[0] = group[1] = group[2] = '\0';
+		for (k = 0; k < remaining; k++)
+			group[k] = src[k];
+
+		if (written + 4 > targsize)
+			return (-1);
+		target[written++] = Base64[group[0] >> 2];
+		target[written++] = Base64[((group[0] & 0x03) << 4) | (group[1] >> 4)];
+		/* A single leftover byte has no third sextet, so pad there as well. */
+		target[written++] = (remaining == 1)
+			? Pad64
+			: Base64[((group[1] & 0x0f) << 2) | (group[2] >> 6)];
+		target[written++] = Pad64;
+	}
+
+	/* The terminating NUL needs one more char of room. */
+	if (written >= targsize)
+		return (-1);
+	target[written] = '\0';
+	/* Returned value doesn't count \0. */
+	return (written);
+}
+
+/* Decode the base64 text at src into target (targsize bytes), skipping
+   whitespace anywhere.  Returns the number of decoded bytes, or -1 on invalid
+   input or when target is too small.  With target == NULL nothing is stored
+   and only the count is returned.  Characters are read as unsigned char
+   because isspace() is undefined for negative values.  */
+
+int
+hwloc_decode_from_base64(char const *src, char *target, size_t targsize)
+{
+	unsigned int tarindex, state;
+	int ch;
+	char *pos;
+
+	state = 0;
+	tarindex = 0;
+
+	while ((ch = (unsigned char)*src++) != '\0') {
+		if (isspace(ch))	/* Skip whitespace anywhere. */
+			continue;
+
+		if (ch == Pad64)
+			break;
+
+		pos = strchr(Base64, ch);
+		if (pos == 0) 		/* A non-base64 character. */
+			return (-1);
+
+		switch (state) {
+		case 0:
+			if (target) {
+				if (tarindex >= targsize)
+					return (-1);
+				target[tarindex] = (pos - Base64) << 2;
+			}
+			state = 1;
+			break;
+		case 1:
+			if (target) {
+				if (tarindex + 1 >= targsize)
+					return (-1);
+				target[tarindex]   |=  (pos - Base64) >> 4;
+				target[tarindex+1]  = ((pos - Base64) & 0x0f)
+							<< 4 ;
+			}
+			tarindex++;
+			state = 2;
+			break;
+		case 2:
+			if (target) {
+				if (tarindex + 1 >= targsize)
+					return (-1);
+				target[tarindex]   |=  (pos - Base64) >> 2;
+				target[tarindex+1]  = ((pos - Base64) & 0x03)
+							<< 6;
+			}
+			tarindex++;
+			state = 3;
+			break;
+		case 3:
+			if (target) {
+				if (tarindex >= targsize)
+					return (-1);
+				target[tarindex] |= (pos - Base64);
+			}
+			tarindex++;
+			state = 0;
+			break;
+		}
+	}
+
+	/*
+	 * We are done decoding Base-64 chars.  Let's see if we ended
+	 * on a byte boundary, and/or with erroneous trailing characters.
+	 */
+
+	if (ch == Pad64) {		/* We got a pad char. */
+		ch = (unsigned char)*src++;	/* Skip it, get next. */
+		switch (state) {
+		case 0:		/* Invalid = in first position */
+		case 1:		/* Invalid = in second position */
+			return (-1);
+
+		case 2:		/* Valid, means one byte of info */
+			/* Skip any number of spaces. */
+			for (; ch != '\0'; ch = (unsigned char)*src++)
+				if (!isspace(ch))
+					break;
+			/* Make sure there is another trailing = sign. */
+			if (ch != Pad64)
+				return (-1);
+			ch = (unsigned char)*src++;	/* Skip the = */
+			/* Fall through to "single trailing =" case. */
+			/* FALLTHROUGH */
+
+		case 3:		/* Valid, means two bytes of info */
+			/*
+			 * We know this char is an =.  Is there anything but
+			 * whitespace after it?
+			 */
+			for (; ch != '\0'; ch = (unsigned char)*src++)
+				if (!isspace(ch))
+					return (-1);
+
+			/*
+			 * Now make sure for cases 2 and 3 that the "extra"
+			 * bits that slopped past the last full byte were
+			 * zeros.  If we don't check them, they become a
+			 * subliminal channel.
+			 */
+			if (target && target[tarindex] != 0)
+				return (-1);
+		}
+	} else {
+		/*
+		 * We ended by seeing the end of the string.  Make sure we
+		 * have no partial bytes lying around.
+		 */
+		if (state != 0)
+			return (-1);
+	}
+
+	return (tarindex);
+}
diff --git a/ext/hwloc/hwloc/bind.c b/ext/hwloc/hwloc/bind.c
new file mode 100644
index 0000000..e2b5a06
--- /dev/null
+++ b/ext/hwloc/hwloc/bind.c
@@ -0,0 +1,781 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2011 inria.  All rights reserved.
+ * Copyright © 2009-2010, 2012 Université Bordeaux
+ * Copyright © 2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#include <private/autogen/config.h>
+#include <hwloc.h>
+#include <private/private.h>
+#include <hwloc/helper.h>
+#ifdef HAVE_SYS_MMAN_H
+#  include <sys/mman.h>
+#endif
+/* <malloc.h> is only needed if we don't have posix_memalign() */
+#if defined(hwloc_getpagesize) && !defined(HAVE_POSIX_MEMALIGN) && defined(HAVE_MEMALIGN) && defined(HAVE_MALLOC_H)
+#include <malloc.h>
+#endif
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#include <stdlib.h>
+#include <errno.h>
+
+/* TODO: HWLOC_GNU_SYS, HWLOC_IRIX_SYS,
+ *
+ * IRIX: see MP_MUSTRUN / _DSM_MUSTRUN, pthread_setrunon_np, /hw, process_cpulink, numa_create
+ *
+ * We could use glibc's sched_setaffinity generically when it is available
+ *
+ * Darwin and OpenBSD don't seem to have binding facilities.
+ */
+
+/*
+ * Check a cpuset before it is handed to a CPU binding hook.
+ *
+ * Returns NULL with errno set to EXDEV when the topology has no topology
+ * cpuset, or with errno set to EINVAL when set is empty or is not included
+ * in the complete cpuset.  Otherwise returns the cpuset to use: the complete
+ * cpuset when set contains the whole topology cpuset, or set itself.
+ */
+static hwloc_const_bitmap_t
+hwloc_fix_cpubind(hwloc_topology_t topology, hwloc_const_bitmap_t set)
+{
+  hwloc_const_bitmap_t topo_cpus = hwloc_topology_get_topology_cpuset(topology);
+  hwloc_const_bitmap_t all_cpus = hwloc_topology_get_complete_cpuset(topology);
+
+  /* The topology is composed of several systems, the cpuset is ambiguous. */
+  if (!topo_cpus) {
+    errno = EXDEV;
+    return NULL;
+  }
+
+  /* Reject empty sets and sets that reach outside the complete cpuset. */
+  if (hwloc_bitmap_iszero(set) || !hwloc_bitmap_isincluded(set, all_cpus)) {
+    errno = EINVAL;
+    return NULL;
+  }
+
+  /* A set covering the whole topology cpuset is widened to the complete one. */
+  return hwloc_bitmap_isincluded(topo_cpus, set) ? all_cpus : set;
+}
+
+/*
+ * Dispatch a CPU binding request for the caller to the binding hooks.
+ *
+ * The set is first checked by hwloc_fix_cpubind(); -1 is returned if it is
+ * rejected.  With HWLOC_CPUBIND_PROCESS only the set_thisproc_cpubind hook
+ * is considered; otherwise with HWLOC_CPUBIND_THREAD only the
+ * set_thisthread_cpubind hook; with neither flag the process hook is
+ * preferred over the thread hook.  When no suitable hook is installed,
+ * errno is set to ENOSYS and -1 is returned.
+ */
+int
+hwloc_set_cpubind(hwloc_topology_t topology, hwloc_const_bitmap_t set, int flags)
+{
+  struct hwloc_binding_hooks *hooks = &topology->binding_hooks;
+  const int want_proc = flags & HWLOC_CPUBIND_PROCESS;
+  const int want_thread = flags & HWLOC_CPUBIND_THREAD;
+
+  set = hwloc_fix_cpubind(topology, set);
+  if (!set)
+    return -1;
+
+  /* The process hook is used unless only the thread flag was given. */
+  if ((want_proc || !want_thread) && hooks->set_thisproc_cpubind)
+    return hooks->set_thisproc_cpubind(topology, set, flags);
+
+  /* The thread hook is never used when the process flag was given. */
+  if (!want_proc && hooks->set_thisthread_cpubind)
+    return hooks->set_thisthread_cpubind(topology, set, flags);
+
+  errno = ENOSYS;
+  return -1;
+}
+
+/*
+ * Dispatch a CPU binding query for the caller to the binding hooks.
+ *
+ * With HWLOC_CPUBIND_PROCESS only the get_thisproc_cpubind hook is
+ * considered; otherwise with HWLOC_CPUBIND_THREAD only the
+ * get_thisthread_cpubind hook; with neither flag the process hook is
+ * preferred over the thread hook.  When no suitable hook is installed,
+ * errno is set to ENOSYS and -1 is returned.
+ */
+int
+hwloc_get_cpubind(hwloc_topology_t topology, hwloc_bitmap_t set, int flags)
+{
+  struct hwloc_binding_hooks *hooks = &topology->binding_hooks;
+  const int want_proc = flags & HWLOC_CPUBIND_PROCESS;
+  const int want_thread = flags & HWLOC_CPUBIND_THREAD;
+
+  /* The process hook is used unless only the thread flag was given. */
+  if ((want_proc || !want_thread) && hooks->get_thisproc_cpubind)
+    return hooks->get_thisproc_cpubind(topology, set, flags);
+
+  /* The thread hook is never used when the process flag was given. */
+  if (!want_proc && hooks->get_thisthread_cpubind)
+    return hooks->get_thisthread_cpubind(topology, set, flags);
+
+  errno = ENOSYS;
+  return -1;
+}
+
+/*
+ * Forward a CPU binding request for process pid to the set_proc_cpubind hook.
+ *
+ * The set is first checked by hwloc_fix_cpubind(); -1 is returned if it is
+ * rejected.  When the hook is not installed, errno is set to ENOSYS and -1
+ * is returned.
+ */
+int
+hwloc_set_proc_cpubind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_const_bitmap_t set, int flags)
+{
+  set = hwloc_fix_cpubind(topology, set);
+  if (!set)
+    return -1;
+
+  if (!topology->binding_hooks.set_proc_cpubind) {
+    errno = ENOSYS;
+    return -1;
+  }
+
+  return topology->binding_hooks.set_proc_cpubind(topology, pid, set, flags);
+}
+
+/*
+ * Forward a CPU binding query for process pid to the get_proc_cpubind hook
+ * and return its result.  When the hook is not installed, errno is set to
+ * ENOSYS and -1 is returned.
+ */
+int
+hwloc_get_proc_cpubind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_bitmap_t set, int flags)
+{
+  if (!topology->binding_hooks.get_proc_cpubind) {
+    errno = ENOSYS;
+    return -1;
+  }
+
+  return topology->binding_hooks.get_proc_cpubind(topology, pid, set, flags);
+}
+
+#ifdef hwloc_thread_t
+/*
+ * Forward a CPU binding request for thread tid to the set_thread_cpubind hook.
+ *
+ * The set is first checked by hwloc_fix_cpubind(); -1 is returned if it is
+ * rejected.  When the hook is not installed, errno is set to ENOSYS and -1
+ * is returned.
+ */
+int
+hwloc_set_thread_cpubind(hwloc_topology_t topology, hwloc_thread_t tid, hwloc_const_bitmap_t set, int flags)
+{
+  set = hwloc_fix_cpubind(topology, set);
+  if (!set)
+    return -1;
+
+  if (!topology->binding_hooks.set_thread_cpubind) {
+    errno = ENOSYS;
+    return -1;
+  }
+
+  return topology->binding_hooks.set_thread_cpubind(topology, tid, set, flags);
+}
+
+/*
+ * Forward a CPU binding query for thread tid to the get_thread_cpubind hook
+ * and return its result.  When the hook is not installed, errno is set to
+ * ENOSYS and -1 is returned.
+ */
+int
+hwloc_get_thread_cpubind(hwloc_topology_t topology, hwloc_thread_t tid, hwloc_bitmap_t set, int flags)
+{
+  if (!topology->binding_hooks.get_thread_cpubind) {
+    errno = ENOSYS;
+    return -1;
+  }
+
+  return topology->binding_hooks.get_thread_cpubind(topology, tid, set, flags);
+}
+#endif
+
+/*
+ * Dispatch a "last CPU location" query for the caller to the binding hooks.
+ *
+ * With HWLOC_CPUBIND_PROCESS only the get_thisproc_last_cpu_location hook
+ * is considered; otherwise with HWLOC_CPUBIND_THREAD only the
+ * get_thisthread_last_cpu_location hook; with neither flag the process hook
+ * is preferred over the thread hook.  When no suitable hook is installed,
+ * errno is set to ENOSYS and -1 is returned.
+ */
+int
+hwloc_get_last_cpu_location(hwloc_topology_t topology, hwloc_bitmap_t set, int flags)
+{
+  struct hwloc_binding_hooks *hooks = &topology->binding_hooks;
+  const int want_proc = flags & HWLOC_CPUBIND_PROCESS;
+  const int want_thread = flags & HWLOC_CPUBIND_THREAD;
+
+  /* The process hook is used unless only the thread flag was given. */
+  if ((want_proc || !want_thread) && hooks->get_thisproc_last_cpu_location)
+    return hooks->get_thisproc_last_cpu_location(topology, set, flags);
+
+  /* The thread hook is never used when the process flag was given. */
+  if (!want_proc && hooks->get_thisthread_last_cpu_location)
+    return hooks->get_thisthread_last_cpu_location(topology, set, flags);
+
+  errno = ENOSYS;
+  return -1;
+}
+
+/*
+ * Forward a "last CPU location" query for process pid to the
+ * get_proc_last_cpu_location hook and return its result.  When the hook is
+ * not installed, errno is set to ENOSYS and -1 is returned.
+ */
+int
+hwloc_get_proc_last_cpu_location(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_bitmap_t set, int flags)
+{
+  if (!topology->binding_hooks.get_proc_last_cpu_location) {
+    errno = ENOSYS;
+    return -1;
+  }
+
+  return topology->binding_hooks.get_proc_last_cpu_location(topology, pid, set, flags);
+}
+
+/*
+ * Check a nodeset before it is handed to a memory binding hook.
+ *
+ * Returns NULL with errno set to EXDEV when the topology has no topology
+ * cpuset, to ENODEV when it has no complete nodeset, or to EINVAL when
+ * nodeset is empty or is not included in the complete nodeset.  Otherwise
+ * returns the nodeset to use: the complete nodeset when nodeset contains the
+ * whole topology nodeset, or nodeset itself.
+ */
+static hwloc_const_nodeset_t
+hwloc_fix_membind(hwloc_topology_t topology, hwloc_const_nodeset_t nodeset)
+{
+  hwloc_const_bitmap_t topo_nodes = hwloc_topology_get_topology_nodeset(topology);
+  hwloc_const_bitmap_t all_nodes = hwloc_topology_get_complete_nodeset(topology);
+
+  /* The topology is composed of several systems, the nodeset is thus
+   * ambiguous. */
+  if (!hwloc_topology_get_topology_cpuset(topology)) {
+    errno = EXDEV;
+    return NULL;
+  }
+
+  /* There is no NUMA node */
+  if (!all_nodes) {
+    errno = ENODEV;
+    return NULL;
+  }
+
+  /* Reject empty sets and sets that reach outside the complete nodeset. */
+  if (hwloc_bitmap_iszero(nodeset) || !hwloc_bitmap_isincluded(nodeset, all_nodes)) {
+    errno = EINVAL;
+    return NULL;
+  }
+
+  /* A set covering the whole topology nodeset is widened to the complete one. */
+  return hwloc_bitmap_isincluded(topo_nodes, nodeset) ? all_nodes : nodeset;
+}
+
+/*
+ * Check a cpuset given to a memory binding call and convert it to a nodeset.
+ *
+ * On success, 0 is returned and nodeset is filled: with a copy of the
+ * complete nodeset when cpuset contains the whole topology cpuset, or with
+ * the result of hwloc_cpuset_to_nodeset() otherwise.  On failure, -1 is
+ * returned with errno set to EXDEV (no topology cpuset), ENODEV (no complete
+ * nodeset) or EINVAL (cpuset empty or not included in the complete cpuset).
+ */
+static int
+hwloc_fix_membind_cpuset(hwloc_topology_t topology, hwloc_nodeset_t nodeset, hwloc_const_cpuset_t cpuset)
+{
+  hwloc_const_bitmap_t topo_cpus = hwloc_topology_get_topology_cpuset(topology);
+  hwloc_const_bitmap_t all_cpus = hwloc_topology_get_complete_cpuset(topology);
+  hwloc_const_bitmap_t all_nodes = hwloc_topology_get_complete_nodeset(topology);
+
+  /* The topology is composed of several systems, the cpuset is thus
+   * ambiguous. */
+  if (!topo_cpus) {
+    errno = EXDEV;
+    return -1;
+  }
+
+  /* There is no NUMA node */
+  if (!all_nodes) {
+    errno = ENODEV;
+    return -1;
+  }
+
+  /* Reject empty sets and sets that reach outside the complete cpuset. */
+  if (hwloc_bitmap_iszero(cpuset) || !hwloc_bitmap_isincluded(cpuset, all_cpus)) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  if (hwloc_bitmap_isincluded(topo_cpus, cpuset))
+    hwloc_bitmap_copy(nodeset, all_nodes);
+  else
+    hwloc_cpuset_to_nodeset(topology, cpuset, nodeset);
+
+  return 0;
+}
+
+/*
+ * Dispatch a memory binding request for the caller to the binding hooks.
+ *
+ * The nodeset is first checked by hwloc_fix_membind(); -1 is returned if it
+ * is rejected.  With HWLOC_MEMBIND_PROCESS only the set_thisproc_membind
+ * hook is considered; otherwise with HWLOC_MEMBIND_THREAD only the
+ * set_thisthread_membind hook; with neither flag the process hook is
+ * preferred over the thread hook.  When no suitable hook is installed,
+ * errno is set to ENOSYS and -1 is returned.
+ */
+int
+hwloc_set_membind_nodeset(hwloc_topology_t topology, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
+{
+  struct hwloc_binding_hooks *hooks = &topology->binding_hooks;
+  const int want_proc = flags & HWLOC_MEMBIND_PROCESS;
+  const int want_thread = flags & HWLOC_MEMBIND_THREAD;
+
+  nodeset = hwloc_fix_membind(topology, nodeset);
+  if (!nodeset)
+    return -1;
+
+  /* The process hook is used unless only the thread flag was given. */
+  if ((want_proc || !want_thread) && hooks->set_thisproc_membind)
+    return hooks->set_thisproc_membind(topology, nodeset, policy, flags);
+
+  /* The thread hook is never used when the process flag was given. */
+  if (!want_proc && hooks->set_thisthread_membind)
+    return hooks->set_thisthread_membind(topology, nodeset, policy, flags);
+
+  errno = ENOSYS;
+  return -1;
+}
+
+/*
+ * Cpuset flavour of hwloc_set_membind_nodeset().
+ *
+ * The cpuset is checked and converted into a temporary nodeset by
+ * hwloc_fix_membind_cpuset(), then hwloc_set_membind_nodeset() is called
+ * with it.  Returns -1 if the temporary nodeset cannot be allocated (errno
+ * set to ENOMEM) or if the conversion fails, otherwise the result of
+ * hwloc_set_membind_nodeset().
+ */
+int
+hwloc_set_membind(hwloc_topology_t topology, hwloc_const_cpuset_t set, hwloc_membind_policy_t policy, int flags)
+{
+  hwloc_nodeset_t nodeset = hwloc_bitmap_alloc();
+  int ret;
+
+  /* Do not hand a NULL bitmap to the conversion helpers. */
+  if (!nodeset) {
+    errno = ENOMEM;
+    return -1;
+  }
+
+  if (hwloc_fix_membind_cpuset(topology, nodeset, set))
+    ret = -1;
+  else
+    ret = hwloc_set_membind_nodeset(topology, nodeset, policy, flags);
+
+  hwloc_bitmap_free(nodeset);
+  return ret;
+}
+
+/*
+ * Dispatch a memory binding query for the caller to the binding hooks.
+ *
+ * With HWLOC_MEMBIND_PROCESS only the get_thisproc_membind hook is
+ * considered; otherwise with HWLOC_MEMBIND_THREAD only the
+ * get_thisthread_membind hook; with neither flag the process hook is
+ * preferred over the thread hook.  When no suitable hook is installed,
+ * errno is set to ENOSYS and -1 is returned.
+ */
+int
+hwloc_get_membind_nodeset(hwloc_topology_t topology, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags)
+{
+  struct hwloc_binding_hooks *hooks = &topology->binding_hooks;
+  const int want_proc = flags & HWLOC_MEMBIND_PROCESS;
+  const int want_thread = flags & HWLOC_MEMBIND_THREAD;
+
+  /* The process hook is used unless only the thread flag was given. */
+  if ((want_proc || !want_thread) && hooks->get_thisproc_membind)
+    return hooks->get_thisproc_membind(topology, nodeset, policy, flags);
+
+  /* The thread hook is never used when the process flag was given. */
+  if (!want_proc && hooks->get_thisthread_membind)
+    return hooks->get_thisthread_membind(topology, nodeset, policy, flags);
+
+  errno = ENOSYS;
+  return -1;
+}
+
+/*
+ * Cpuset flavour of hwloc_get_membind_nodeset().
+ *
+ * The binding is queried into a temporary nodeset; when the query returns 0
+ * the nodeset is converted into set with hwloc_cpuset_from_nodeset().
+ * Returns -1 with errno set to ENOMEM if the temporary nodeset cannot be
+ * allocated, otherwise the result of hwloc_get_membind_nodeset().
+ */
+int
+hwloc_get_membind(hwloc_topology_t topology, hwloc_cpuset_t set, hwloc_membind_policy_t * policy, int flags)
+{
+  hwloc_nodeset_t nodeset;
+  int ret;
+
+  nodeset = hwloc_bitmap_alloc();
+  /* Do not hand a NULL bitmap to the hooks. */
+  if (!nodeset) {
+    errno = ENOMEM;
+    return -1;
+  }
+
+  ret = hwloc_get_membind_nodeset(topology, nodeset, policy, flags);
+
+  if (!ret)
+    hwloc_cpuset_from_nodeset(topology, set, nodeset);
+
+  hwloc_bitmap_free(nodeset);
+  return ret;
+}
+
+/*
+ * Forward a memory binding request for process pid to the set_proc_membind
+ * hook.
+ *
+ * The nodeset is first checked by hwloc_fix_membind(); -1 is returned if it
+ * is rejected.  When the hook is not installed, errno is set to ENOSYS and
+ * -1 is returned.
+ */
+int
+hwloc_set_proc_membind_nodeset(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
+{
+  nodeset = hwloc_fix_membind(topology, nodeset);
+  if (!nodeset)
+    return -1;
+
+  if (!topology->binding_hooks.set_proc_membind) {
+    errno = ENOSYS;
+    return -1;
+  }
+
+  return topology->binding_hooks.set_proc_membind(topology, pid, nodeset, policy, flags);
+}
+
+
+/*
+ * Cpuset flavour of hwloc_set_proc_membind_nodeset().
+ *
+ * The cpuset is checked and converted into a temporary nodeset by
+ * hwloc_fix_membind_cpuset(), then hwloc_set_proc_membind_nodeset() is
+ * called with it.  Returns -1 if the temporary nodeset cannot be allocated
+ * (errno set to ENOMEM) or if the conversion fails, otherwise the result of
+ * hwloc_set_proc_membind_nodeset().
+ */
+int
+hwloc_set_proc_membind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_const_cpuset_t set, hwloc_membind_policy_t policy, int flags)
+{
+  hwloc_nodeset_t nodeset = hwloc_bitmap_alloc();
+  int ret;
+
+  /* Do not hand a NULL bitmap to the conversion helpers. */
+  if (!nodeset) {
+    errno = ENOMEM;
+    return -1;
+  }
+
+  if (hwloc_fix_membind_cpuset(topology, nodeset, set))
+    ret = -1;
+  else
+    ret = hwloc_set_proc_membind_nodeset(topology, pid, nodeset, policy, flags);
+
+  hwloc_bitmap_free(nodeset);
+  return ret;
+}
+
+/*
+ * Forward a memory binding query for process pid to the get_proc_membind
+ * hook and return its result.  When the hook is not installed, errno is set
+ * to ENOSYS and -1 is returned.
+ */
+int
+hwloc_get_proc_membind_nodeset(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags)
+{
+  if (!topology->binding_hooks.get_proc_membind) {
+    errno = ENOSYS;
+    return -1;
+  }
+
+  return topology->binding_hooks.get_proc_membind(topology, pid, nodeset, policy, flags);
+}
+
+/*
+ * Cpuset flavour of hwloc_get_proc_membind_nodeset().
+ *
+ * The binding is queried into a temporary nodeset; when the query returns 0
+ * the nodeset is converted into set with hwloc_cpuset_from_nodeset().
+ * Returns -1 with errno set to ENOMEM if the temporary nodeset cannot be
+ * allocated, otherwise the result of hwloc_get_proc_membind_nodeset().
+ */
+int
+hwloc_get_proc_membind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_cpuset_t set, hwloc_membind_policy_t * policy, int flags)
+{
+  hwloc_nodeset_t nodeset;
+  int ret;
+
+  nodeset = hwloc_bitmap_alloc();
+  /* Do not hand a NULL bitmap to the hooks. */
+  if (!nodeset) {
+    errno = ENOMEM;
+    return -1;
+  }
+
+  ret = hwloc_get_proc_membind_nodeset(topology, pid, nodeset, policy, flags);
+
+  if (!ret)
+    hwloc_cpuset_from_nodeset(topology, set, nodeset);
+
+  hwloc_bitmap_free(nodeset);
+  return ret;
+}
+
+/*
+ * Forward a memory binding request for the area [addr, addr+len) to the
+ * set_area_membind hook.
+ *
+ * The nodeset is first checked by hwloc_fix_membind(); -1 is returned if it
+ * is rejected.  When the hook is not installed, errno is set to ENOSYS and
+ * -1 is returned.
+ */
+int
+hwloc_set_area_membind_nodeset(hwloc_topology_t topology, const void *addr, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
+{
+  nodeset = hwloc_fix_membind(topology, nodeset);
+  if (!nodeset)
+    return -1;
+
+  if (!topology->binding_hooks.set_area_membind) {
+    errno = ENOSYS;
+    return -1;
+  }
+
+  return topology->binding_hooks.set_area_membind(topology, addr, len, nodeset, policy, flags);
+}
+
+/*
+ * Cpuset flavour of hwloc_set_area_membind_nodeset().
+ *
+ * The cpuset is checked and converted into a temporary nodeset by
+ * hwloc_fix_membind_cpuset(), then hwloc_set_area_membind_nodeset() is
+ * called with it.  Returns -1 if the temporary nodeset cannot be allocated
+ * (errno set to ENOMEM) or if the conversion fails, otherwise the result of
+ * hwloc_set_area_membind_nodeset().
+ */
+int
+hwloc_set_area_membind(hwloc_topology_t topology, const void *addr, size_t len, hwloc_const_cpuset_t set, hwloc_membind_policy_t policy, int flags)
+{
+  hwloc_nodeset_t nodeset = hwloc_bitmap_alloc();
+  int ret;
+
+  /* Do not hand a NULL bitmap to the conversion helpers. */
+  if (!nodeset) {
+    errno = ENOMEM;
+    return -1;
+  }
+
+  if (hwloc_fix_membind_cpuset(topology, nodeset, set))
+    ret = -1;
+  else
+    ret = hwloc_set_area_membind_nodeset(topology, addr, len, nodeset, policy, flags);
+
+  hwloc_bitmap_free(nodeset);
+  return ret;
+}
+
+/*
+ * Forward a memory binding query for the area [addr, addr+len) to the
+ * get_area_membind hook and return its result.  When the hook is not
+ * installed, errno is set to ENOSYS and -1 is returned.
+ */
+int
+hwloc_get_area_membind_nodeset(hwloc_topology_t topology, const void *addr, size_t len, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags)
+{
+  if (!topology->binding_hooks.get_area_membind) {
+    errno = ENOSYS;
+    return -1;
+  }
+
+  return topology->binding_hooks.get_area_membind(topology, addr, len, nodeset, policy, flags);
+}
+
+/*
+ * Cpuset flavour of hwloc_get_area_membind_nodeset().
+ *
+ * The binding is queried into a temporary nodeset; when the query returns 0
+ * the nodeset is converted into set with hwloc_cpuset_from_nodeset().
+ * Returns -1 with errno set to ENOMEM if the temporary nodeset cannot be
+ * allocated, otherwise the result of hwloc_get_area_membind_nodeset().
+ */
+int
+hwloc_get_area_membind(hwloc_topology_t topology, const void *addr, size_t len, hwloc_cpuset_t set, hwloc_membind_policy_t * policy, int flags)
+{
+  hwloc_nodeset_t nodeset;
+  int ret;
+
+  nodeset = hwloc_bitmap_alloc();
+  /* Do not hand a NULL bitmap to the hooks. */
+  if (!nodeset) {
+    errno = ENOMEM;
+    return -1;
+  }
+
+  ret = hwloc_get_area_membind_nodeset(topology, addr, len, nodeset, policy, flags);
+
+  if (!ret)
+    hwloc_cpuset_from_nodeset(topology, set, nodeset);
+
+  hwloc_bitmap_free(nodeset);
+  return ret;
+}
+
+/*
+ * Allocate len bytes from the C heap.
+ *
+ * When hwloc_getpagesize is defined and posix_memalign() or memalign() is
+ * available, the buffer is aligned on hwloc_getpagesize(); otherwise plain
+ * malloc() is used.  In the posix_memalign() case errno is overwritten with
+ * the function's return code.  Returns NULL on failure.
+ */
+void *
+hwloc_alloc_heap(hwloc_topology_t topology __hwloc_attribute_unused, size_t len)
+{
+  void *buffer = NULL;
+#if defined(hwloc_getpagesize) && defined(HAVE_POSIX_MEMALIGN)
+  /* posix_memalign() reports failure through its return value */
+  errno = posix_memalign(&buffer, hwloc_getpagesize(), len);
+  if (errno)
+    buffer = NULL;
+#elif defined(hwloc_getpagesize) && defined(HAVE_MEMALIGN)
+  buffer = memalign(hwloc_getpagesize(), len);
+#else
+  buffer = malloc(len);
+#endif
+  return buffer;
+}
+
+#ifdef MAP_ANONYMOUS
+/*
+ * Allocate len bytes as a private anonymous read/write mapping.
+ *
+ * Returns the mapping address, or NULL on failure.  mmap() signals failure
+ * with MAP_FAILED rather than NULL, so it is translated here for callers
+ * that test the result against NULL.
+ */
+void *
+hwloc_alloc_mmap(hwloc_topology_t topology __hwloc_attribute_unused, size_t len)
+{
+  void *buffer = mmap(NULL, len, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
+  return buffer == MAP_FAILED ? NULL : buffer;
+}
+#endif
+
+/*
+ * Release a buffer with free().  The topology and length are ignored.
+ * Always returns 0.
+ */
+int
+hwloc_free_heap(hwloc_topology_t topology __hwloc_attribute_unused, void *addr, size_t len __hwloc_attribute_unused)
+{
+  free(addr);
+
+  return 0;
+}
+
+#ifdef MAP_ANONYMOUS
+/*
+ * Release a mapping of len bytes at addr with munmap().
+ * A NULL addr is accepted and reported as success (0); otherwise the
+ * result of munmap() is returned.
+ */
+int
+hwloc_free_mmap(hwloc_topology_t topology __hwloc_attribute_unused, void *addr, size_t len)
+{
+  return addr ? munmap(addr, len) : 0;
+}
+#endif
+
+/*
+ * Allocate len bytes through the alloc binding hook when one is installed,
+ * and through hwloc_alloc_heap() otherwise.
+ */
+void *
+hwloc_alloc(hwloc_topology_t topology, size_t len)
+{
+  if (!topology->binding_hooks.alloc)
+    return hwloc_alloc_heap(topology, len);
+
+  return topology->binding_hooks.alloc(topology, len);
+}
+
+/*
+ * Allocate len bytes of memory bound to nodeset.
+ *
+ * The nodeset is first checked by hwloc_fix_membind(), and
+ * HWLOC_MEMBIND_MIGRATE is rejected with EINVAL.  Then:
+ *  - the alloc_membind hook is used when installed;
+ *  - otherwise, when set_area_membind is installed, memory is obtained from
+ *    hwloc_alloc() and bound afterwards; a binding failure is only fatal
+ *    with HWLOC_MEMBIND_STRICT;
+ *  - otherwise errno is set to ENOSYS.
+ * Whenever binding is impossible, NULL is returned with
+ * HWLOC_MEMBIND_STRICT, and unbound memory from hwloc_alloc() is returned
+ * without it.
+ */
+void *
+hwloc_alloc_membind_nodeset(hwloc_topology_t topology, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
+{
+  void *p;
+  nodeset = hwloc_fix_membind(topology, nodeset);
+  if (!nodeset)
+    goto fallback;
+  if (flags & HWLOC_MEMBIND_MIGRATE) {
+    errno = EINVAL;
+    goto fallback;
+  }
+
+  if (topology->binding_hooks.alloc_membind)
+    return topology->binding_hooks.alloc_membind(topology, len, nodeset, policy, flags);
+  else if (topology->binding_hooks.set_area_membind) {
+    p = hwloc_alloc(topology, len);
+    if (!p)
+      return NULL;
+    if (topology->binding_hooks.set_area_membind(topology, p, len, nodeset, policy, flags) && (flags & HWLOC_MEMBIND_STRICT)) {
+      /* keep the binding error visible to the caller across the release */
+      int error = errno;
+      /* p came from hwloc_alloc(), which may have used the alloc hook, so
+       * it must be released through hwloc_free() rather than plain free() */
+      hwloc_free(topology, p, len);
+      errno = error;
+      return NULL;
+    }
+    return p;
+  } else {
+    errno = ENOSYS;
+  }
+
+fallback:
+  if (flags & HWLOC_MEMBIND_STRICT)
+    /* Report error */
+    return NULL;
+  /* Never mind, allocate anyway */
+  return hwloc_alloc(topology, len);
+}
+
+/*
+ * Cpuset flavour of hwloc_alloc_membind_nodeset().
+ *
+ * The cpuset is checked and converted into a temporary nodeset by
+ * hwloc_fix_membind_cpuset().  When the temporary nodeset cannot be
+ * allocated or the conversion fails, NULL is returned with
+ * HWLOC_MEMBIND_STRICT, and unbound memory from hwloc_alloc() is returned
+ * without it.  Otherwise the result of hwloc_alloc_membind_nodeset() is
+ * returned.
+ */
+void *
+hwloc_alloc_membind(hwloc_topology_t topology, size_t len, hwloc_const_cpuset_t set, hwloc_membind_policy_t policy, int flags)
+{
+  hwloc_nodeset_t nodeset = hwloc_bitmap_alloc();
+  void *ret;
+
+  /* Do not hand a NULL bitmap to the conversion helpers. */
+  if (!nodeset) {
+    if (flags & HWLOC_MEMBIND_STRICT) {
+      errno = ENOMEM;
+      return NULL;
+    }
+    return hwloc_alloc(topology, len);
+  }
+
+  if (hwloc_fix_membind_cpuset(topology, nodeset, set)) {
+    if (flags & HWLOC_MEMBIND_STRICT)
+      ret = NULL;
+    else
+      ret = hwloc_alloc(topology, len);
+  } else
+    ret = hwloc_alloc_membind_nodeset(topology, len, nodeset, policy, flags);
+
+  hwloc_bitmap_free(nodeset);
+  return ret;
+}
+
+/*
+ * Release len bytes at addr through the free_membind binding hook when one
+ * is installed, and through hwloc_free_heap() otherwise.
+ */
+int
+hwloc_free(hwloc_topology_t topology, void *addr, size_t len)
+{
+  if (!topology->binding_hooks.free_membind)
+    return hwloc_free_heap(topology, addr, len);
+
+  return topology->binding_hooks.free_membind(topology, addr, len);
+}
+
+/*
+ * Empty binding hooks always returning success
+ */
+
+/*
+ * Copy the topology's complete cpuset into set.
+ * Returns 0 on success, or -1 when the topology has no complete cpuset.
+ */
+static int dontset_return_complete_cpuset(hwloc_topology_t topology, hwloc_cpuset_t set)
+{
+  hwloc_const_cpuset_t cpuset = hwloc_topology_get_complete_cpuset(topology);
+  if (!cpuset)
+    return -1;
+
+  /* reuse the pointer fetched above instead of looking it up a second time */
+  hwloc_bitmap_copy(set, cpuset);
+  return 0;
+}
+
+/* Dummy hook: accept the binding request without doing anything. */
+static int dontset_thisthread_cpubind(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_const_bitmap_t set __hwloc_attribute_unused, int flags __hwloc_attribute_unused)
+{
+  return 0;
+}
+/* Dummy hook: report the complete cpuset (topology is used, so it is not marked unused). */
+static int dontget_thisthread_cpubind(hwloc_topology_t topology, hwloc_bitmap_t set, int flags __hwloc_attribute_unused)
+{
+  return dontset_return_complete_cpuset(topology, set);
+}
+/* Dummy hook: accept the binding request without doing anything. */
+static int dontset_thisproc_cpubind(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_const_bitmap_t set __hwloc_attribute_unused, int flags __hwloc_attribute_unused)
+{
+  return 0;
+}
+/* Dummy hook: report the complete cpuset (topology is used, so it is not marked unused). */
+static int dontget_thisproc_cpubind(hwloc_topology_t topology, hwloc_bitmap_t set, int flags __hwloc_attribute_unused)
+{
+  return dontset_return_complete_cpuset(topology, set);
+}
+/* Dummy hook: accept the binding request without doing anything. */
+static int dontset_proc_cpubind(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_pid_t pid __hwloc_attribute_unused, hwloc_const_bitmap_t set __hwloc_attribute_unused, int flags __hwloc_attribute_unused)
+{
+  return 0;
+}
+/* Dummy hook: report the complete cpuset (topology is used, so it is not marked unused). */
+static int dontget_proc_cpubind(hwloc_topology_t topology, hwloc_pid_t pid __hwloc_attribute_unused, hwloc_bitmap_t cpuset, int flags __hwloc_attribute_unused)
+{
+  return dontset_return_complete_cpuset(topology, cpuset);
+}
+#ifdef hwloc_thread_t
+/* Dummy hook: accept the binding request without doing anything. */
+static int dontset_thread_cpubind(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_thread_t tid __hwloc_attribute_unused, hwloc_const_bitmap_t set __hwloc_attribute_unused, int flags __hwloc_attribute_unused)
+{
+  return 0;
+}
+/* Dummy hook: report the complete cpuset (topology is used, so it is not marked unused). */
+static int dontget_thread_cpubind(hwloc_topology_t topology, hwloc_thread_t tid __hwloc_attribute_unused, hwloc_bitmap_t cpuset, int flags __hwloc_attribute_unused)
+{
+  return dontset_return_complete_cpuset(topology, cpuset);
+}
+#endif
+
+/*
+ * Copy the topology's complete nodeset into set and report
+ * HWLOC_MEMBIND_DEFAULT through policy.
+ * Returns 0 on success, or -1 (leaving set and policy untouched) when the
+ * topology has no complete nodeset.
+ */
+static int dontset_return_complete_nodeset(hwloc_topology_t topology, hwloc_nodeset_t set, hwloc_membind_policy_t *policy)
+{
+  hwloc_const_nodeset_t nodeset = hwloc_topology_get_complete_nodeset(topology);
+  if (!nodeset)
+    return -1;
+
+  /* reuse the pointer fetched above instead of looking it up a second time */
+  hwloc_bitmap_copy(set, nodeset);
+  *policy = HWLOC_MEMBIND_DEFAULT;
+  return 0;
+}
+
+/* Dummy hook: accept the binding request without doing anything. */
+static int dontset_thisproc_membind(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_const_bitmap_t set __hwloc_attribute_unused, hwloc_membind_policy_t policy __hwloc_attribute_unused, int flags __hwloc_attribute_unused)
+{
+  return 0;
+}
+/* Dummy hook: report the complete nodeset (topology is used, so it is not marked unused). */
+static int dontget_thisproc_membind(hwloc_topology_t topology, hwloc_bitmap_t set, hwloc_membind_policy_t * policy, int flags __hwloc_attribute_unused)
+{
+  return dontset_return_complete_nodeset(topology, set, policy);
+}
+
+/* Dummy hook: accept the binding request without doing anything. */
+static int dontset_thisthread_membind(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_const_bitmap_t set __hwloc_attribute_unused, hwloc_membind_policy_t policy __hwloc_attribute_unused, int flags __hwloc_attribute_unused)
+{
+  return 0;
+}
+/* Dummy hook: report the complete nodeset (topology is used, so it is not marked unused). */
+static int dontget_thisthread_membind(hwloc_topology_t topology, hwloc_bitmap_t set, hwloc_membind_policy_t * policy, int flags __hwloc_attribute_unused)
+{
+  return dontset_return_complete_nodeset(topology, set, policy);
+}
+
+/* Dummy hook: accept the binding request without doing anything. */
+static int dontset_proc_membind(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_pid_t pid __hwloc_attribute_unused, hwloc_const_bitmap_t set __hwloc_attribute_unused, hwloc_membind_policy_t policy __hwloc_attribute_unused, int flags __hwloc_attribute_unused)
+{
+  return 0;
+}
+/* Dummy hook: report the complete nodeset (topology is used, so it is not marked unused). */
+static int dontget_proc_membind(hwloc_topology_t topology, hwloc_pid_t pid __hwloc_attribute_unused, hwloc_bitmap_t set, hwloc_membind_policy_t * policy, int flags __hwloc_attribute_unused)
+{
+  return dontset_return_complete_nodeset(topology, set, policy);
+}
+
+/* Dummy hook: accept the binding request without doing anything. */
+static int dontset_area_membind(hwloc_topology_t topology __hwloc_attribute_unused, const void *addr __hwloc_attribute_unused, size_t size __hwloc_attribute_unused, hwloc_const_bitmap_t set __hwloc_attribute_unused, hwloc_membind_policy_t policy __hwloc_attribute_unused, int flags __hwloc_attribute_unused)
+{
+  return 0;
+}
+/* Dummy hook: report the complete nodeset (topology is used, so it is not marked unused). */
+static int dontget_area_membind(hwloc_topology_t topology, const void *addr __hwloc_attribute_unused, size_t size __hwloc_attribute_unused, hwloc_bitmap_t set, hwloc_membind_policy_t * policy, int flags __hwloc_attribute_unused)
+{
+  return dontset_return_complete_nodeset(topology, set, policy);
+}
+
+/* Dummy hook: ignore the binding arguments and allocate size bytes with malloc(). */
+static void * dontalloc_membind(hwloc_topology_t topology __hwloc_attribute_unused, size_t size, hwloc_const_bitmap_t set __hwloc_attribute_unused, hwloc_membind_policy_t policy __hwloc_attribute_unused, int flags __hwloc_attribute_unused)
+{
+  return malloc(size);
+}
+/* Dummy hook: release addr with free(); always returns 0. */
+static int dontfree_membind(hwloc_topology_t topology __hwloc_attribute_unused, void *addr, size_t size __hwloc_attribute_unused)
+{
+  free(addr);
+  return 0;
+}
+
+/*
+ * Install the dont*() dummy hooks defined above into hooks.
+ * The support structure is not touched.
+ */
+static void hwloc_set_dummy_hooks(struct hwloc_binding_hooks *hooks,
+				  struct hwloc_topology_support *support __hwloc_attribute_unused)
+{
+  /* CPU binding: setters */
+  hooks->set_thisproc_cpubind = dontset_thisproc_cpubind;
+  hooks->set_thisthread_cpubind = dontset_thisthread_cpubind;
+  hooks->set_proc_cpubind = dontset_proc_cpubind;
+#ifdef hwloc_thread_t
+  hooks->set_thread_cpubind = dontset_thread_cpubind;
+#endif
+
+  /* CPU binding: getters */
+  hooks->get_thisproc_cpubind = dontget_thisproc_cpubind;
+  hooks->get_thisthread_cpubind = dontget_thisthread_cpubind;
+  hooks->get_proc_cpubind = dontget_proc_cpubind;
+#ifdef hwloc_thread_t
+  hooks->get_thread_cpubind = dontget_thread_cpubind;
+#endif
+
+  /* last CPU location: cpubind instead of last_cpu_location is ok */
+  hooks->get_thisproc_last_cpu_location = dontget_thisproc_cpubind;
+  hooks->get_thisthread_last_cpu_location = dontget_thisthread_cpubind;
+  hooks->get_proc_last_cpu_location = dontget_proc_cpubind;
+  /* TODO: get_thread_last_cpu_location */
+
+  /* memory binding: setters */
+  hooks->set_thisproc_membind = dontset_thisproc_membind;
+  hooks->set_thisthread_membind = dontset_thisthread_membind;
+  hooks->set_proc_membind = dontset_proc_membind;
+  hooks->set_area_membind = dontset_area_membind;
+
+  /* memory binding: getters */
+  hooks->get_thisproc_membind = dontget_thisproc_membind;
+  hooks->get_thisthread_membind = dontget_thisthread_membind;
+  hooks->get_proc_membind = dontget_proc_membind;
+  hooks->get_area_membind = dontget_area_membind;
+
+  /* bound allocation and release */
+  hooks->alloc_membind = dontalloc_membind;
+  hooks->free_membind = dontfree_membind;
+}
+
+/*
+ * Call the hook-setup function of every OS backend whose HWLOC_*_SYS macro
+ * is defined at build time, passing it hooks and support.
+ */
+void
+hwloc_set_native_binding_hooks(struct hwloc_binding_hooks *hooks, struct hwloc_topology_support *support)
+{
+#ifdef HWLOC_LINUX_SYS
+  hwloc_set_linuxfs_hooks(hooks, support);
+#endif /* HWLOC_LINUX_SYS */
+
+#ifdef HWLOC_BGQ_SYS
+  hwloc_set_bgq_hooks(hooks, support);
+#endif /* HWLOC_BGQ_SYS */
+
+#ifdef HWLOC_AIX_SYS
+  hwloc_set_aix_hooks(hooks, support);
+#endif /* HWLOC_AIX_SYS */
+
+#ifdef HWLOC_OSF_SYS
+  hwloc_set_osf_hooks(hooks, support);
+#endif /* HWLOC_OSF_SYS */
+
+#ifdef HWLOC_SOLARIS_SYS
+  hwloc_set_solaris_hooks(hooks, support);
+#endif /* HWLOC_SOLARIS_SYS */
+
+#ifdef HWLOC_WIN_SYS
+  hwloc_set_windows_hooks(hooks, support);
+#endif /* HWLOC_WIN_SYS */
+
+#ifdef HWLOC_DARWIN_SYS
+  hwloc_set_darwin_hooks(hooks, support);
+#endif /* HWLOC_DARWIN_SYS */
+
+#ifdef HWLOC_FREEBSD_SYS
+  hwloc_set_freebsd_hooks(hooks, support);
+#endif /* HWLOC_FREEBSD_SYS */
+
+#ifdef HWLOC_NETBSD_SYS
+  hwloc_set_netbsd_hooks(hooks, support);
+#endif /* HWLOC_NETBSD_SYS */
+
+#ifdef HWLOC_HPUX_SYS
+  hwloc_set_hpux_hooks(hooks, support);
+#endif /* HWLOC_HPUX_SYS */
+}
+
+/*
+ * Install the binding hooks of a topology and publish what is supported.
+ *
+ * If the represented system is actually not this system, dummy binding
+ * hooks are installed and nothing is reported as supported.  Otherwise the
+ * native hooks are installed and every hook that ended up set is flagged
+ * in the corresponding support structure.
+ */
+void
+hwloc_set_binding_hooks(struct hwloc_topology *topology)
+{
+  if (!topology->is_thissystem) {
+    /* not this system, use dummy binding hooks that do nothing (but don't return ENOSYS).
+     * set_cpubind is fake and get_cpubind returns the whole system cpuset,
+     * so don't report that set/get_cpubind as supported
+     */
+    hwloc_set_dummy_hooks(&topology->binding_hooks, &topology->support);
+    return;
+  }
+
+  hwloc_set_native_binding_hooks(&topology->binding_hooks, &topology->support);
+  /* every hook not set above will return ENOSYS */
+
+  /* flag support.<which>bind-><kind> when the <kind> hook is installed */
+#define DO(which,kind) do { \
+    if (topology->binding_hooks.kind) \
+      topology->support.which##bind->kind = 1; \
+  } while (0)
+  DO(cpu,set_thisproc_cpubind);
+  DO(cpu,get_thisproc_cpubind);
+  DO(cpu,set_proc_cpubind);
+  DO(cpu,get_proc_cpubind);
+  DO(cpu,set_thisthread_cpubind);
+  DO(cpu,get_thisthread_cpubind);
+#ifdef hwloc_thread_t
+  DO(cpu,set_thread_cpubind);
+  DO(cpu,get_thread_cpubind);
+#endif
+  DO(cpu,get_thisproc_last_cpu_location);
+  DO(cpu,get_proc_last_cpu_location);
+  DO(cpu,get_thisthread_last_cpu_location);
+  DO(mem,set_thisproc_membind);
+  DO(mem,get_thisproc_membind);
+  DO(mem,set_thisthread_membind);
+  DO(mem,get_thisthread_membind);
+  DO(mem,set_proc_membind);
+  DO(mem,get_proc_membind);
+  DO(mem,set_area_membind);
+  DO(mem,get_area_membind);
+  DO(mem,alloc_membind);
+  /* keep the helper macro local to this function */
+#undef DO
+}
diff --git a/ext/hwloc/hwloc/bitmap.c b/ext/hwloc/hwloc/bitmap.c
new file mode 100644
index 0000000..e2b807a
--- /dev/null
+++ b/ext/hwloc/hwloc/bitmap.c
@@ -0,0 +1,1492 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2015 Inria.  All rights reserved.
+ * Copyright © 2009-2011 Université Bordeaux
+ * Copyright © 2009-2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#include <private/autogen/config.h>
+#include <hwloc/autogen/config.h>
+#include <hwloc.h>
+#include <private/misc.h>
+#include <private/private.h>
+#include <hwloc/bitmap.h>
+
+#include <stdarg.h>
+#include <stdio.h>
+#include <assert.h>
+#include <errno.h>
+#include <ctype.h>
+
+/*
+ * possible improvements:
+ * - have a way to change the initial allocation size:
+ *   add hwloc_bitmap_set_foo() to change a global here,
+ *   and make the hwloc core call based on the early number of PUs
+ * - preallocate inside the bitmap structure (so that the whole structure is a cacheline for instance)
+ *   and allocate a dedicated array only later when reallocating larger
+ * - add a bitmap->ulongs_empty_first which guarantees that some first ulongs are empty,
+ *   making tests much faster for big bitmaps since there's no need to look at first ulongs.
+ *   no need for ulongs_empty_first to be exactly the max number of empty ulongs,
+ *   clearing bits that were set earlier isn't very common.
+ */
+
+/* magic number stored in every bitmap when HWLOC_DEBUG is defined,
+ * checked by HWLOC__BITMAP_CHECK() and reset to 0 by hwloc_bitmap_free() */
+#define HWLOC_BITMAP_MAGIC 0x20091007
+
+/* actual opaque type internals:
+ * the explicitly stored bits live in the first ulongs_count entries of ulongs,
+ * and the infinite flag describes every bit beyond them.
+ */
+struct hwloc_bitmap_s {
+  unsigned ulongs_count; /* how many ulong bitmasks are valid, >= 1 */
+  unsigned ulongs_allocated; /* how many ulong bitmasks are allocated, >= ulongs_count */
+  unsigned long *ulongs;
+  int infinite; /* set to 1 if all bits beyond ulongs are set */
+#ifdef HWLOC_DEBUG
+  int magic;
+#endif
+};
+
+/* overzealous check in debug-mode, not as powerful as valgrind but still useful:
+ * asserts the magic number and the ulongs_count/ulongs_allocated invariants.
+ * Expands to nothing when HWLOC_DEBUG is not defined.
+ */
+#ifdef HWLOC_DEBUG
+#define HWLOC__BITMAP_CHECK(set) do {				\
+  assert((set)->magic == HWLOC_BITMAP_MAGIC);			\
+  assert((set)->ulongs_count >= 1);				\
+  assert((set)->ulongs_allocated >= (set)->ulongs_count);	\
+} while (0)
+#else
+#define HWLOC__BITMAP_CHECK(set)
+#endif
+
+/* extract a subset from a set using an index or a cpu:
+ * INDEX is the position of the ulong holding bit cpu,
+ * CPU_ULBIT is the position of bit cpu inside that ulong.
+ */
+#define HWLOC_SUBBITMAP_INDEX(cpu)		((cpu)/(HWLOC_BITS_PER_LONG))
+#define HWLOC_SUBBITMAP_CPU_ULBIT(cpu)		((cpu)%(HWLOC_BITS_PER_LONG))
+/* Read from a bitmap ulong without knowing whether x is valid.
+ * Indexes beyond ulongs_count read as FULL or ZERO depending on the infinite flag.
+ * Writers should make sure that x is valid and modify set->ulongs[x] directly.
+ */
+#define HWLOC_SUBBITMAP_READULONG(set,x)	((x) < (set)->ulongs_count ? (set)->ulongs[x] : (set)->infinite ? HWLOC_SUBBITMAP_FULL : HWLOC_SUBBITMAP_ZERO)
+
+/* predefined subset values:
+ * ULBIT(bit) has only that bit set, CPU(cpu) is the in-ulong mask of bit cpu,
+ * ULBIT_TO(bit) has bits 0..bit set, ULBIT_FROM(bit) has bit and all higher bits set,
+ * ULBIT_FROMTO(begin,end) has bits begin..end set (both included).
+ */
+#define HWLOC_SUBBITMAP_ZERO			0UL
+#define HWLOC_SUBBITMAP_FULL			(~0UL)
+#define HWLOC_SUBBITMAP_ULBIT(bit)		(1UL<<(bit))
+#define HWLOC_SUBBITMAP_CPU(cpu)		HWLOC_SUBBITMAP_ULBIT(HWLOC_SUBBITMAP_CPU_ULBIT(cpu))
+#define HWLOC_SUBBITMAP_ULBIT_TO(bit)		(HWLOC_SUBBITMAP_FULL>>(HWLOC_BITS_PER_LONG-1-(bit)))
+#define HWLOC_SUBBITMAP_ULBIT_FROM(bit)		(HWLOC_SUBBITMAP_FULL<<(bit))
+#define HWLOC_SUBBITMAP_ULBIT_FROMTO(begin,end)	(HWLOC_SUBBITMAP_ULBIT_TO(end) & HWLOC_SUBBITMAP_ULBIT_FROM(begin))
+
+/* Allocate an empty bitmap: one valid zeroed ulong inside a 64-byte array.
+ * Returns NULL if either allocation fails.
+ */
+struct hwloc_bitmap_s * hwloc_bitmap_alloc(void)
+{
+  struct hwloc_bitmap_s * bitmap = malloc(sizeof(struct hwloc_bitmap_s));
+  unsigned long * words;
+
+  if (bitmap == NULL)
+    return NULL;
+
+  words = malloc(64);
+  if (words == NULL) {
+    /* do not leak the structure when the array cannot be allocated */
+    free(bitmap);
+    return NULL;
+  }
+  words[0] = HWLOC_SUBBITMAP_ZERO;
+
+  bitmap->ulongs = words;
+  bitmap->ulongs_allocated = 64/sizeof(unsigned long);
+  bitmap->ulongs_count = 1;
+  bitmap->infinite = 0;
+#ifdef HWLOC_DEBUG
+  bitmap->magic = HWLOC_BITMAP_MAGIC;
+#endif
+  return bitmap;
+}
+
+/* Allocate a bitmap with every bit set: the first ulong is full
+ * and the infinite flag covers everything beyond it.
+ * Returns NULL if the allocation fails.
+ */
+struct hwloc_bitmap_s * hwloc_bitmap_alloc_full(void)
+{
+  struct hwloc_bitmap_s * full = hwloc_bitmap_alloc();
+
+  if (!full)
+    return NULL;
+
+  full->ulongs[0] = HWLOC_SUBBITMAP_FULL;
+  full->infinite = 1;
+  return full;
+}
+
+/* Release a bitmap and its ulong array. A NULL bitmap is silently ignored. */
+void hwloc_bitmap_free(struct hwloc_bitmap_s * set)
+{
+  if (set != NULL) {
+    HWLOC__BITMAP_CHECK(set);
+#ifdef HWLOC_DEBUG
+    /* invalidate the magic so that later debug checks on this bitmap fail */
+    set->magic = 0;
+#endif
+
+    free(set->ulongs);
+    free(set);
+  }
+}
+
+/* Grow the ulong array so that it can hold at least needed_count ulongs.
+ * The requested capacity is 1 << hwloc_flsl(needed_count - 1); the array is
+ * never shrunk, and ulongs_count and the contents of new entries are left untouched.
+ * NOTE(review): allocation failure is only caught by assert().
+ */
+static void
+hwloc_bitmap_enlarge_by_ulongs(struct hwloc_bitmap_s * set, unsigned needed_count)
+{
+  unsigned wanted = 1 << hwloc_flsl((unsigned long) needed_count - 1);
+
+  /* already large enough, nothing to do */
+  if (wanted <= set->ulongs_allocated)
+    return;
+
+  set->ulongs = realloc(set->ulongs, wanted * sizeof(unsigned long));
+  assert(set->ulongs);
+  set->ulongs_allocated = wanted;
+}
+
+/* Make at least needed_count ulongs valid.
+ * Nothing happens if that many are valid already. Otherwise the array is
+ * enlarged if needed and every newly valid ulong is initialized from the
+ * infinite flag (full when infinite, zero otherwise).
+ */
+static void
+hwloc_bitmap_realloc_by_ulongs(struct hwloc_bitmap_s * set, unsigned needed_count)
+{
+  unsigned idx;
+  unsigned long filler;
+
+  HWLOC__BITMAP_CHECK(set);
+
+  if (set->ulongs_count >= needed_count)
+    return;
+
+  /* realloc larger if needed */
+  hwloc_bitmap_enlarge_by_ulongs(set, needed_count);
+
+  /* new ulongs must represent what the infinite flag represented so far */
+  filler = set->infinite ? HWLOC_SUBBITMAP_FULL : HWLOC_SUBBITMAP_ZERO;
+  for(idx=set->ulongs_count; idx<needed_count; idx++)
+    set->ulongs[idx] = filler;
+
+  set->ulongs_count = needed_count;
+}
+
+/* realloc until it contains at least cpu+1 bits,
+ * i.e. until the ulong holding bit index cpu is valid */
+#define hwloc_bitmap_realloc_by_cpu_index(set, cpu) hwloc_bitmap_realloc_by_ulongs(set, ((cpu)/HWLOC_BITS_PER_LONG)+1)
+
+/* reset a bitmap to exactly the needed size:
+ * the array is enlarged if it is too small and ulongs_count becomes needed_count.
+ * the ulongs are not initialized here, so
+ * the caller must reinitialize all ulongs and the infinite flag later.
+ */
+static void
+hwloc_bitmap_reset_by_ulongs(struct hwloc_bitmap_s * set, unsigned needed_count)
+{
+  hwloc_bitmap_enlarge_by_ulongs(set, needed_count);
+  set->ulongs_count = needed_count;
+}
+
+/* reset until it contains exactly cpu+1 bits (roundup to a ulong),
+ * i.e. the ulong holding bit index cpu becomes the last valid one.
+ * the caller must reinitialize all ulongs and the infinite flag later.
+ */
+#define hwloc_bitmap_reset_by_cpu_index(set, cpu) hwloc_bitmap_reset_by_ulongs(set, ((cpu)/HWLOC_BITS_PER_LONG)+1)
+
+/* Return a newly allocated copy of old.
+ * The copy gets the same allocated capacity, valid ulongs and infinite flag.
+ * Returns NULL if old is NULL or if an allocation fails.
+ */
+struct hwloc_bitmap_s * hwloc_bitmap_dup(const struct hwloc_bitmap_s * old)
+{
+  struct hwloc_bitmap_s * copy;
+
+  if (old == NULL)
+    return NULL;
+
+  HWLOC__BITMAP_CHECK(old);
+
+  copy = malloc(sizeof(struct hwloc_bitmap_s));
+  if (copy == NULL)
+    return NULL;
+
+  copy->ulongs = malloc(old->ulongs_allocated * sizeof(unsigned long));
+  if (copy->ulongs == NULL) {
+    free(copy);
+    return NULL;
+  }
+
+  /* only the valid ulongs carry information, the rest of the array stays uninitialized */
+  memcpy(copy->ulongs, old->ulongs, old->ulongs_count * sizeof(unsigned long));
+  copy->ulongs_count = old->ulongs_count;
+  copy->ulongs_allocated = old->ulongs_allocated;
+  copy->infinite = old->infinite;
+#ifdef HWLOC_DEBUG
+  copy->magic = HWLOC_BITMAP_MAGIC;
+#endif
+  return copy;
+}
+
+/* Overwrite dst with the contents of src (valid ulongs and infinite flag). */
+void hwloc_bitmap_copy(struct hwloc_bitmap_s * dst, const struct hwloc_bitmap_s * src)
+{
+  unsigned words;
+
+  HWLOC__BITMAP_CHECK(dst);
+  HWLOC__BITMAP_CHECK(src);
+
+  /* give dst exactly as many valid ulongs as src before copying them */
+  words = src->ulongs_count;
+  hwloc_bitmap_reset_by_ulongs(dst, words);
+
+  memcpy(dst->ulongs, src->ulongs, words * sizeof(unsigned long));
+  dst->infinite = src->infinite;
+}
+
+/* Strings always use 32bit groups:
+ * each group is printed with HWLOC_PRIxSUBBITMAP (8 hexadecimal digits),
+ * and an unsigned long holds HWLOC_BITMAP_STRING_PER_LONG such groups.
+ */
+#define HWLOC_PRIxSUBBITMAP		"%08lx"
+#define HWLOC_BITMAP_SUBSTRING_SIZE	32
+#define HWLOC_BITMAP_SUBSTRING_LENGTH	(HWLOC_BITMAP_SUBSTRING_SIZE/4)
+#define HWLOC_BITMAP_STRING_PER_LONG	(HWLOC_BITS_PER_LONG/HWLOC_BITMAP_SUBSTRING_SIZE)
+/* Print a bitmap as comma-separated 32bit hexadecimal groups, most significant first.
+ * An infinite bitmap starts with "0xf...f". Leading empty groups are skipped,
+ * an empty group in the middle is printed as a bare comma, and an empty
+ * least-significant group is printed as "0x0".
+ * At most buflen bytes are written to buf.
+ * Returns the sum of the lengths reported by hwloc_snprintf() (which may exceed
+ * buflen when the output was truncated), or -1 if hwloc_snprintf() fails.
+ */
+int hwloc_bitmap_snprintf(char * __hwloc_restrict buf, size_t buflen, const struct hwloc_bitmap_s * __hwloc_restrict set)
+{
+  ssize_t size = buflen;
+  char *tmp = buf;
+  int res, ret = 0;
+  int needcomma = 0;
+  int i;
+  unsigned long accum = 0;
+  int accumed = 0;
+#if HWLOC_BITS_PER_LONG == HWLOC_BITMAP_SUBSTRING_SIZE
+  const unsigned long accum_mask = ~0UL;
+#else /* HWLOC_BITS_PER_LONG != HWLOC_BITMAP_SUBSTRING_SIZE */
+  const unsigned long accum_mask = ((1UL << HWLOC_BITMAP_SUBSTRING_SIZE) - 1) << (HWLOC_BITS_PER_LONG - HWLOC_BITMAP_SUBSTRING_SIZE);
+#endif /* HWLOC_BITS_PER_LONG != HWLOC_BITMAP_SUBSTRING_SIZE */
+
+  HWLOC__BITMAP_CHECK(set);
+
+  /* mark the end in case we do nothing later */
+  if (buflen > 0)
+    tmp[0] = '\0';
+
+  if (set->infinite) {
+    res = hwloc_snprintf(tmp, size, "0xf...f");
+    needcomma = 1;
+    if (res < 0)
+      return -1;
+    ret += res;
+    if (res >= size)
+      res = size>0 ? size - 1 : 0;
+    tmp += res;
+    size -= res;
+  }
+
+  i=set->ulongs_count-1;
+
+  if (set->infinite) {
+    /* ignore starting FULL since we have 0xf...f already */
+    while (i>=0 && set->ulongs[i] == HWLOC_SUBBITMAP_FULL)
+      i--;
+  } else {
+    /* ignore starting ZERO, an entirely empty bitmap is printed as 0x0 at the end */
+    while (i>=0 && set->ulongs[i] == HWLOC_SUBBITMAP_ZERO)
+      i--;
+  }
+
+  while (i>=0 || accumed) {
+    /* Refill accumulator */
+    if (!accumed) {
+      accum = set->ulongs[i--];
+      accumed = HWLOC_BITS_PER_LONG;
+    }
+
+    if (accum & accum_mask) {
+      /* print the whole subset if not empty */
+        res = hwloc_snprintf(tmp, size, needcomma ? ",0x" HWLOC_PRIxSUBBITMAP : "0x" HWLOC_PRIxSUBBITMAP,
+		     (accum & accum_mask) >> (HWLOC_BITS_PER_LONG - HWLOC_BITMAP_SUBSTRING_SIZE));
+      needcomma = 1;
+    } else if (i == -1 && accumed == HWLOC_BITMAP_SUBSTRING_SIZE) {
+      /* print a single 0 to mark the last subset */
+      res = hwloc_snprintf(tmp, size, needcomma ? ",0x0" : "0x0");
+    } else if (needcomma) {
+      res = hwloc_snprintf(tmp, size, ",");
+    } else {
+      res = 0;
+    }
+    if (res < 0)
+      return -1;
+    ret += res;
+
+#if HWLOC_BITS_PER_LONG == HWLOC_BITMAP_SUBSTRING_SIZE
+    accum = 0;
+    accumed = 0;
+#else
+    accum <<= HWLOC_BITMAP_SUBSTRING_SIZE;
+    accumed -= HWLOC_BITMAP_SUBSTRING_SIZE;
+#endif
+
+    if (res >= size)
+      res = size>0 ? size - 1 : 0;
+
+    tmp += res;
+    size -= res;
+  }
+
+  /* if didn't display anything, display 0x0 */
+  if (!ret) {
+    res = hwloc_snprintf(tmp, size, "0x0");
+    if (res < 0)
+      return -1;
+    ret += res;
+  }
+
+  return ret;
+}
+
+/* Print a bitmap into a newly allocated string, in hwloc_bitmap_snprintf() format.
+ * On success *strp points to the string, which the caller must release with free(),
+ * and the length reported by hwloc_bitmap_snprintf() is returned.
+ * On failure (formatting error or out of memory) *strp is set to NULL and -1 is returned.
+ */
+int hwloc_bitmap_asprintf(char ** strp, const struct hwloc_bitmap_s * __hwloc_restrict set)
+{
+  int len;
+  char *buf;
+
+  HWLOC__BITMAP_CHECK(set);
+
+  /* first pass only computes the required length */
+  len = hwloc_bitmap_snprintf(NULL, 0, set);
+  if (len < 0) {
+    *strp = NULL;
+    return -1;
+  }
+
+  buf = malloc(len+1);
+  if (!buf) {
+    /* never hand a NULL buffer with a non-zero size to the formatter */
+    *strp = NULL;
+    return -1;
+  }
+
+  *strp = buf;
+  return hwloc_bitmap_snprintf(buf, len+1, set);
+}
+
+/* Parse a string in hwloc_bitmap_snprintf() format into set:
+ * comma-separated hexadecimal groups of 32 bits, most significant first,
+ * optionally preceded by "0xf...f" for an infinite bitmap.
+ * An empty string yields an empty bitmap, like hwloc_bitmap_taskset_sscanf().
+ * Returns 0 on success. On a parse error (unexpected character, or a comma
+ * that is not followed by a group) set is zeroed and -1 is returned.
+ */
+int hwloc_bitmap_sscanf(struct hwloc_bitmap_s *set, const char * __hwloc_restrict string)
+{
+  const char * current = string;
+  unsigned long accum = 0;
+  int count=0;
+  int infinite = 0;
+
+  if (*string == '\0') {
+    /* nothing to parse; also keeps strchr(current+1) below inside the string */
+    hwloc_bitmap_zero(set);
+    return 0;
+  }
+
+  /* count how many substrings there are */
+  count++;
+  while ((current = strchr(current+1, ',')) != NULL)
+    count++;
+
+  current = string;
+  if (!strncmp("0xf...f", current, 7)) {
+    current += 7;
+    if (*current != ',') {
+      /* special case for infinite/full bitmap */
+      hwloc_bitmap_fill(set);
+      return 0;
+    }
+    current++;
+    infinite = 1;
+    count--;
+  }
+
+  hwloc_bitmap_reset_by_ulongs(set, (count + HWLOC_BITMAP_STRING_PER_LONG - 1) / HWLOC_BITMAP_STRING_PER_LONG);
+  set->infinite = 0;
+
+  while (*current != '\0') {
+    unsigned long val;
+    char *next;
+    val = strtoul(current, &next, 16);
+
+    assert(count > 0);
+    count--;
+
+    /* groups are accumulated until a whole ulong is complete, then stored */
+    accum |= (val << ((count * HWLOC_BITMAP_SUBSTRING_SIZE) % HWLOC_BITS_PER_LONG));
+    if (!(count % HWLOC_BITMAP_STRING_PER_LONG)) {
+      set->ulongs[count / HWLOC_BITMAP_STRING_PER_LONG] = accum;
+      accum = 0;
+    }
+
+    if (*next != ',') {
+      if (*next || count > 0)
+	goto failed;
+      else
+	break;
+    }
+    current = (const char*) next+1;
+  }
+
+  /* the string ended right after a comma: the announced group is missing,
+   * and some ulongs were never written, so do not report success */
+  if (count > 0)
+    goto failed;
+
+  set->infinite = infinite; /* set at the end, to avoid spurious realloc with filled new ulongs */
+
+  return 0;
+
+ failed:
+  /* failure to parse */
+  hwloc_bitmap_zero(set);
+  return -1;
+}
+
+/* Print a bitmap as a comma-separated list of indexes and ranges,
+ * e.g. "2", "4-7" or "9-" for a range that never ends.
+ * At most buflen bytes are written to buf.
+ * Returns the sum of the lengths reported by hwloc_snprintf() (which may exceed
+ * buflen when the output was truncated), or -1 if formatting fails or if the
+ * temporary bitmap cannot be allocated.
+ */
+int hwloc_bitmap_list_snprintf(char * __hwloc_restrict buf, size_t buflen, const struct hwloc_bitmap_s * __hwloc_restrict set)
+{
+  int prev = -1;
+  hwloc_bitmap_t reverse;
+  ssize_t size = buflen;
+  char *tmp = buf;
+  int res, ret = 0;
+  int needcomma = 0;
+
+  HWLOC__BITMAP_CHECK(set);
+
+  /* mark the end in case we do nothing later */
+  if (buflen > 0)
+    tmp[0] = '\0';
+
+  /* the complement is used to find where each range of set bits ends */
+  reverse = hwloc_bitmap_alloc(); /* FIXME: add hwloc_bitmap_alloc_size() + hwloc_bitmap_init_allocated() to avoid malloc? */
+  if (!reverse)
+    return -1;
+  hwloc_bitmap_not(reverse, set);
+
+  while (1) {
+    int begin, end;
+
+    begin = hwloc_bitmap_next(set, prev);
+    if (begin == -1)
+      break;
+    end = hwloc_bitmap_next(reverse, begin);
+
+    if (end == begin+1) {
+      res = hwloc_snprintf(tmp, size, needcomma ? ",%d" : "%d", begin);
+    } else if (end == -1) {
+      res = hwloc_snprintf(tmp, size, needcomma ? ",%d-" : "%d-", begin);
+    } else {
+      res = hwloc_snprintf(tmp, size, needcomma ? ",%d-%d" : "%d-%d", begin, end-1);
+    }
+    if (res < 0) {
+      hwloc_bitmap_free(reverse);
+      return -1;
+    }
+    ret += res;
+
+    if (res >= size)
+      res = size>0 ? size - 1 : 0;
+
+    tmp += res;
+    size -= res;
+    needcomma = 1;
+
+    if (end == -1)
+      break;
+    else
+      prev = end - 1;
+  }
+
+  hwloc_bitmap_free(reverse);
+
+  return ret;
+}
+
+/* Print a bitmap into a newly allocated string, in hwloc_bitmap_list_snprintf() format.
+ * On success *strp points to the string, which the caller must release with free(),
+ * and the length reported by hwloc_bitmap_list_snprintf() is returned.
+ * On failure (formatting error or out of memory) *strp is set to NULL and -1 is returned.
+ */
+int hwloc_bitmap_list_asprintf(char ** strp, const struct hwloc_bitmap_s * __hwloc_restrict set)
+{
+  int len;
+  char *buf;
+
+  HWLOC__BITMAP_CHECK(set);
+
+  /* first pass only computes the required length */
+  len = hwloc_bitmap_list_snprintf(NULL, 0, set);
+  if (len < 0) {
+    *strp = NULL;
+    return -1;
+  }
+
+  buf = malloc(len+1);
+  if (!buf) {
+    /* never hand a NULL buffer with a non-zero size to the formatter */
+    *strp = NULL;
+    return -1;
+  }
+
+  *strp = buf;
+  return hwloc_bitmap_list_snprintf(buf, len+1, set);
+}
+/* Parse a list of indexes and ranges such as "1,3-5,8-" into set.
+ * Numbers are read with strtoul() in base 0; "N-" at the very end of the
+ * string sets every bit from N on. The bitmap is emptied before parsing.
+ * Returns 0 on success; when a number is expected but no digit is found,
+ * set is zeroed and -1 is returned.
+ * NOTE(review): a character following a number that is not '-', ',' or the
+ * end of the string is skipped without any error - confirm this is intended.
+ */
+int hwloc_bitmap_list_sscanf(struct hwloc_bitmap_s *set, const char * __hwloc_restrict string)
+{
+  const char * current = string;
+  char *next;
+  long begin = -1, val;
+
+  hwloc_bitmap_zero(set);
+
+  while (*current != '\0') {
+
+    /* ignore empty ranges */
+    while (*current == ',')
+      current++;
+
+    val = strtoul(current, &next, 0);
+    /* make sure we got at least one digit */
+    if (next == current)
+      goto failed;
+
+    if (begin != -1) {
+      /* finishing a range */
+      hwloc_bitmap_set_range(set, begin, val);
+      begin = -1;
+
+    } else if (*next == '-') {
+      /* starting a new range */
+      if (*(next+1) == '\0') {
+	/* infinite range */
+	hwloc_bitmap_set_range(set, val, -1);
+        break;
+      } else {
+	/* normal range, its end is parsed by the next iteration */
+	begin = val;
+      }
+
+    } else if (*next == ',' || *next == '\0') {
+      /* single digit */
+      hwloc_bitmap_set(set, val);
+    }
+
+    if (*next == '\0')
+      break;
+    current = next+1;
+  }
+
+  return 0;
+
+ failed:
+  /* failure to parse */
+  hwloc_bitmap_zero(set);
+  return -1;
+}
+/* Print a bitmap as a single hexadecimal number, most significant ulong first.
+ * An infinite bitmap starts with "0xf...f". The first printed ulong uses "0x%lx"
+ * when nothing was printed before it; every following ulong is zero-padded to
+ * the full width of an unsigned long.
+ * At most buflen bytes are written to buf.
+ * Returns the sum of the lengths reported by hwloc_snprintf() (which may exceed
+ * buflen when the output was truncated), or -1 if hwloc_snprintf() fails.
+ */
+int hwloc_bitmap_taskset_snprintf(char * __hwloc_restrict buf, size_t buflen, const struct hwloc_bitmap_s * __hwloc_restrict set)
+{
+  ssize_t size = buflen;
+  char *tmp = buf;
+  int res, ret = 0;
+  int started = 0;
+  int i;
+
+  HWLOC__BITMAP_CHECK(set);
+
+  /* mark the end in case we do nothing later */
+  if (buflen > 0)
+    tmp[0] = '\0';
+
+  if (set->infinite) {
+    res = hwloc_snprintf(tmp, size, "0xf...f");
+    started = 1;
+    if (res < 0)
+      return -1;
+    ret += res;
+    if (res >= size)
+      res = size>0 ? size - 1 : 0;
+    tmp += res;
+    size -= res;
+  }
+
+  i=set->ulongs_count-1;
+
+  if (set->infinite) {
+    /* ignore starting FULL since we have 0xf...f already */
+    while (i>=0 && set->ulongs[i] == HWLOC_SUBBITMAP_FULL)
+      i--;
+  } else {
+    /* ignore starting ZERO except the last one */
+    while (i>=1 && set->ulongs[i] == HWLOC_SUBBITMAP_ZERO)
+      i--;
+  }
+
+  while (i>=0) {
+    unsigned long val = set->ulongs[i--];
+    if (started) {
+      /* print the whole subset */
+#if HWLOC_BITS_PER_LONG == 64
+      res = hwloc_snprintf(tmp, size, "%016lx", val);
+#else
+      res = hwloc_snprintf(tmp, size, "%08lx", val);
+#endif
+    } else if (val || i == -1) {
+      res = hwloc_snprintf(tmp, size, "0x%lx", val);
+      started = 1;
+    } else {
+      res = 0;
+    }
+    if (res < 0)
+      return -1;
+    ret += res;
+    if (res >= size)
+      res = size>0 ? size - 1 : 0;
+    tmp += res;
+    size -= res;
+  }
+
+  /* if didn't display anything, display 0x0 */
+  if (!ret) {
+    res = hwloc_snprintf(tmp, size, "0x0");
+    if (res < 0)
+      return -1;
+    ret += res;
+  }
+
+  return ret;
+}
+
+/* Print a bitmap into a newly allocated string, in hwloc_bitmap_taskset_snprintf() format.
+ * On success *strp points to the string, which the caller must release with free(),
+ * and the length reported by hwloc_bitmap_taskset_snprintf() is returned.
+ * On failure (formatting error or out of memory) *strp is set to NULL and -1 is returned.
+ */
+int hwloc_bitmap_taskset_asprintf(char ** strp, const struct hwloc_bitmap_s * __hwloc_restrict set)
+{
+  int len;
+  char *buf;
+
+  HWLOC__BITMAP_CHECK(set);
+
+  /* first pass only computes the required length */
+  len = hwloc_bitmap_taskset_snprintf(NULL, 0, set);
+  if (len < 0) {
+    *strp = NULL;
+    return -1;
+  }
+
+  buf = malloc(len+1);
+  if (!buf) {
+    /* never hand a NULL buffer with a non-zero size to the formatter */
+    *strp = NULL;
+    return -1;
+  }
+
+  *strp = buf;
+  return hwloc_bitmap_taskset_snprintf(buf, len+1, set);
+}
+/* Parse a single hexadecimal number into set.
+ * A leading "0xf...f" makes the bitmap infinite; otherwise a leading "0x" is optional.
+ * "0xf...f" alone gives a full bitmap and "" or "0x" alone gives an empty one.
+ * The remaining digits are cut into groups of HWLOC_BITS_PER_LONG/4 characters,
+ * the first group taking the remainder, and each group fills one ulong.
+ * Returns 0 on success; if a group is not entirely consumed by strtoul(),
+ * set is zeroed and -1 is returned.
+ */
+int hwloc_bitmap_taskset_sscanf(struct hwloc_bitmap_s *set, const char * __hwloc_restrict string)
+{
+  const char * current = string;
+  int chars;
+  int count;
+  int infinite = 0;
+
+  current = string;
+  if (!strncmp("0xf...f", current, 7)) {
+    /* infinite bitmap */
+    infinite = 1;
+    current += 7;
+    if (*current == '\0') {
+      /* special case for infinite/full bitmap */
+      hwloc_bitmap_fill(set);
+      return 0;
+    }
+  } else {
+    /* finite bitmap */
+    if (!strncmp("0x", current, 2))
+      current += 2;
+    if (*current == '\0') {
+      /* special case for empty bitmap */
+      hwloc_bitmap_zero(set);
+      return 0;
+    }
+  }
+  /* we know there are other characters now */
+
+  chars = strlen(current);
+  count = (chars * 4 + HWLOC_BITS_PER_LONG - 1) / HWLOC_BITS_PER_LONG;
+
+  hwloc_bitmap_reset_by_ulongs(set, count);
+  set->infinite = 0;
+
+  while (*current != '\0') {
+    int tmpchars;
+    char ustr[17];
+    unsigned long val;
+    char *next;
+
+    tmpchars = chars % (HWLOC_BITS_PER_LONG/4);
+    if (!tmpchars)
+      tmpchars = (HWLOC_BITS_PER_LONG/4);
+
+    memcpy(ustr, current, tmpchars);
+    ustr[tmpchars] = '\0';
+    val = strtoul(ustr, &next, 16);
+    if (*next != '\0')
+      goto failed;
+
+    set->ulongs[count-1] = val;
+
+    current += tmpchars;
+    chars -= tmpchars;
+    count--;
+  }
+
+  set->infinite = infinite; /* set at the end, to avoid spurious realloc with filled new ulongs */
+
+  return 0;
+
+ failed:
+  /* failure to parse */
+  hwloc_bitmap_zero(set);
+  return -1;
+}
+
+/* Clear every valid ulong and the infinite flag, without resizing the bitmap. */
+static void hwloc_bitmap__zero(struct hwloc_bitmap_s *set)
+{
+	unsigned long *word = set->ulongs;
+	unsigned long *end = word + set->ulongs_count;
+
+	while (word < end)
+		*word++ = HWLOC_SUBBITMAP_ZERO;
+	set->infinite = 0;
+}
+
+/* Empty the bitmap: it is reduced to a single zeroed ulong and is no longer infinite. */
+void hwloc_bitmap_zero(struct hwloc_bitmap_s * set)
+{
+	HWLOC__BITMAP_CHECK(set);
+
+	/* exactly one ulong is valid after the reset, so clearing it clears everything */
+	hwloc_bitmap_reset_by_ulongs(set, 1);
+	set->ulongs[0] = HWLOC_SUBBITMAP_ZERO;
+	set->infinite = 0;
+}
+
+/* Set every valid ulong to full and raise the infinite flag, without resizing the bitmap. */
+static void hwloc_bitmap__fill(struct hwloc_bitmap_s * set)
+{
+	unsigned long *word = set->ulongs;
+	unsigned long *end = word + set->ulongs_count;
+
+	while (word < end)
+		*word++ = HWLOC_SUBBITMAP_FULL;
+	set->infinite = 1;
+}
+
+/* Set every bit: the bitmap is reduced to a single full ulong and becomes infinite. */
+void hwloc_bitmap_fill(struct hwloc_bitmap_s * set)
+{
+	HWLOC__BITMAP_CHECK(set);
+
+	/* exactly one ulong is valid after the reset, so filling it fills everything */
+	hwloc_bitmap_reset_by_ulongs(set, 1);
+	set->ulongs[0] = HWLOC_SUBBITMAP_FULL;
+	set->infinite = 1;
+}
+
+/* Replace the bitmap contents with mask as its first and only valid ulong;
+ * all other bits are unset.
+ */
+void hwloc_bitmap_from_ulong(struct hwloc_bitmap_s *set, unsigned long mask)
+{
+	HWLOC__BITMAP_CHECK(set);
+
+	hwloc_bitmap_reset_by_ulongs(set, 1);
+	set->infinite = 0;
+	/* the array always has room for at least one ulong */
+	set->ulongs[0] = mask;
+}
+
+/* Replace the bitmap contents with mask as its i-th ulong;
+ * all lower ulongs are zeroed and the bitmap is not infinite.
+ */
+void hwloc_bitmap_from_ith_ulong(struct hwloc_bitmap_s *set, unsigned i, unsigned long mask)
+{
+	unsigned below;
+
+	HWLOC__BITMAP_CHECK(set);
+
+	/* i+1 ulongs become valid, all of them are written below */
+	hwloc_bitmap_reset_by_ulongs(set, i+1);
+	for(below=0; below<i; below++)
+		set->ulongs[below] = HWLOC_SUBBITMAP_ZERO;
+	set->ulongs[i] = mask;
+	set->infinite = 0;
+}
+
+/* Return the first ulong of the bitmap (bits 0 to HWLOC_BITS_PER_LONG-1). */
+unsigned long hwloc_bitmap_to_ulong(const struct hwloc_bitmap_s *set)
+{
+	const unsigned long *words;
+
+	HWLOC__BITMAP_CHECK(set);
+
+	/* ulongs_count is always >= 1, so the first entry can be read directly */
+	words = set->ulongs;
+	return words[0];
+}
+
+/* Return the i-th ulong of the bitmap.
+ * Beyond the valid ulongs the value follows the infinite flag: full or zero.
+ */
+unsigned long hwloc_bitmap_to_ith_ulong(const struct hwloc_bitmap_s *set, unsigned i)
+{
+	HWLOC__BITMAP_CHECK(set);
+
+	if (i < set->ulongs_count)
+		return set->ulongs[i];
+
+	return set->infinite ? HWLOC_SUBBITMAP_FULL : HWLOC_SUBBITMAP_ZERO;
+}
+
+/* Make cpu the only bit set in the bitmap. */
+void hwloc_bitmap_only(struct hwloc_bitmap_s * set, unsigned cpu)
+{
+	unsigned word = HWLOC_SUBBITMAP_INDEX(cpu);
+
+	HWLOC__BITMAP_CHECK(set);
+
+	/* size the bitmap so that the ulong holding cpu is the last valid one */
+	hwloc_bitmap_reset_by_cpu_index(set, cpu);
+	hwloc_bitmap__zero(set);
+	/* the ulong was just zeroed, so storing the mask equals or-ing it in */
+	set->ulongs[word] = HWLOC_SUBBITMAP_CPU(cpu);
+}
+
+/* Set every bit of the bitmap except cpu. */
+void hwloc_bitmap_allbut(struct hwloc_bitmap_s * set, unsigned cpu)
+{
+	unsigned word = HWLOC_SUBBITMAP_INDEX(cpu);
+
+	HWLOC__BITMAP_CHECK(set);
+
+	/* size the bitmap so that the ulong holding cpu is the last valid one */
+	hwloc_bitmap_reset_by_cpu_index(set, cpu);
+	hwloc_bitmap__fill(set);
+	/* the ulong was just filled, so storing the inverted mask equals and-ing it in */
+	set->ulongs[word] = ~HWLOC_SUBBITMAP_CPU(cpu);
+}
+
+/* Set bit cpu, growing the bitmap if the bit is beyond the valid ulongs. */
+void hwloc_bitmap_set(struct hwloc_bitmap_s * set, unsigned cpu)
+{
+	unsigned word = HWLOC_SUBBITMAP_INDEX(cpu);
+
+	HWLOC__BITMAP_CHECK(set);
+
+	/* a bit inside the infinite part of the bitmap is set already */
+	if (!set->infinite || cpu < set->ulongs_count * HWLOC_BITS_PER_LONG) {
+		hwloc_bitmap_realloc_by_cpu_index(set, cpu);
+		set->ulongs[word] |= HWLOC_SUBBITMAP_CPU(cpu);
+	}
+}
+
+/* Set every bit from begincpu to _endcpu (both included).
+ * _endcpu == -1 means the range never ends: the bitmap becomes infinite.
+ * Nothing happens when the range is empty (_endcpu < begincpu) or when it lies
+ * entirely inside the already-set infinite part of the bitmap.
+ */
+void hwloc_bitmap_set_range(struct hwloc_bitmap_s * set, unsigned begincpu, int _endcpu)
+{
+	unsigned i;
+	unsigned beginset,endset;
+	/* -1 converts to the largest unsigned, which is larger than any begincpu */
+	unsigned endcpu = (unsigned) _endcpu;
+
+	HWLOC__BITMAP_CHECK(set);
+
+	if (endcpu < begincpu)
+		return;
+	if (set->infinite && begincpu >= set->ulongs_count * HWLOC_BITS_PER_LONG)
+		/* setting only in the already-set infinite part, nothing to do */
+		return;
+
+	if (_endcpu == -1) {
+		/* infinite range */
+
+		/* make the ulong holding begincpu valid BEFORE raising the infinite flag,
+		 * otherwise the bits between the end of the array and begincpu
+		 * would wrongly become part of the infinitely-set area
+		 */
+		hwloc_bitmap_realloc_by_cpu_index(set, begincpu);
+
+		beginset = HWLOC_SUBBITMAP_INDEX(begincpu);
+		set->ulongs[beginset] |= HWLOC_SUBBITMAP_ULBIT_FROM(HWLOC_SUBBITMAP_CPU_ULBIT(begincpu));
+		for(i=beginset+1; i<set->ulongs_count; i++)
+			set->ulongs[i] = HWLOC_SUBBITMAP_FULL;
+		set->infinite = 1;
+		return;
+	}
+
+	/* finite range */
+
+	/* truncate the range according to the infinite part of the bitmap */
+	if (set->infinite && endcpu >= set->ulongs_count * HWLOC_BITS_PER_LONG)
+		endcpu = set->ulongs_count * HWLOC_BITS_PER_LONG - 1;
+
+	hwloc_bitmap_realloc_by_cpu_index(set, endcpu);
+
+	beginset = HWLOC_SUBBITMAP_INDEX(begincpu);
+	endset = HWLOC_SUBBITMAP_INDEX(endcpu);
+	for(i=beginset+1; i<endset; i++)
+		set->ulongs[i] = HWLOC_SUBBITMAP_FULL;
+	if (beginset == endset) {
+		set->ulongs[beginset] |= HWLOC_SUBBITMAP_ULBIT_FROMTO(HWLOC_SUBBITMAP_CPU_ULBIT(begincpu), HWLOC_SUBBITMAP_CPU_ULBIT(endcpu));
+	} else {
+		set->ulongs[beginset] |= HWLOC_SUBBITMAP_ULBIT_FROM(HWLOC_SUBBITMAP_CPU_ULBIT(begincpu));
+		set->ulongs[endset] |= HWLOC_SUBBITMAP_ULBIT_TO(HWLOC_SUBBITMAP_CPU_ULBIT(endcpu));
+	}
+}
+/* Overwrite the i-th ulong of the bitmap with mask.
+ * The bitmap is grown if needed; ulongs made valid on the way are
+ * initialized from the infinite flag by hwloc_bitmap_realloc_by_ulongs().
+ */
+void hwloc_bitmap_set_ith_ulong(struct hwloc_bitmap_s *set, unsigned i, unsigned long mask)
+{
+	HWLOC__BITMAP_CHECK(set);
+
+	hwloc_bitmap_realloc_by_ulongs(set, i+1);
+	set->ulongs[i] = mask;
+}
+
+/* Clear bit cpu, growing the bitmap if the bit is beyond the valid ulongs. */
+void hwloc_bitmap_clr(struct hwloc_bitmap_s * set, unsigned cpu)
+{
+	unsigned word = HWLOC_SUBBITMAP_INDEX(cpu);
+
+	HWLOC__BITMAP_CHECK(set);
+
+	/* a bit inside the infinitely-unset part of the bitmap is clear already */
+	if (set->infinite || cpu < set->ulongs_count * HWLOC_BITS_PER_LONG) {
+		hwloc_bitmap_realloc_by_cpu_index(set, cpu);
+		set->ulongs[word] &= ~HWLOC_SUBBITMAP_CPU(cpu);
+	}
+}
+
+/* Clear every bit from begincpu to _endcpu (both included).
+ * _endcpu == -1 means the range never ends: the bitmap stops being infinite.
+ * Nothing happens when the range is empty (_endcpu < begincpu) or when it lies
+ * entirely inside the already-unset infinite part of the bitmap.
+ */
+void hwloc_bitmap_clr_range(struct hwloc_bitmap_s * set, unsigned begincpu, int _endcpu)
+{
+	unsigned i;
+	unsigned beginset,endset;
+	/* -1 converts to the largest unsigned, which is larger than any begincpu */
+	unsigned endcpu = (unsigned) _endcpu;
+
+	HWLOC__BITMAP_CHECK(set);
+
+	if (endcpu < begincpu)
+		return;
+	if (!set->infinite && begincpu >= set->ulongs_count * HWLOC_BITS_PER_LONG)
+		/* clearing only in the already-unset infinite part, nothing to do */
+		return;
+
+	if (_endcpu == -1) {
+		/* infinite range */
+
+		/* make the ulong holding begincpu valid BEFORE dropping the infinite flag,
+		 * otherwise the set bits between the end of the array and begincpu
+		 * would wrongly be cleared as well
+		 */
+		hwloc_bitmap_realloc_by_cpu_index(set, begincpu);
+
+		beginset = HWLOC_SUBBITMAP_INDEX(begincpu);
+		set->ulongs[beginset] &= ~HWLOC_SUBBITMAP_ULBIT_FROM(HWLOC_SUBBITMAP_CPU_ULBIT(begincpu));
+		for(i=beginset+1; i<set->ulongs_count; i++)
+			set->ulongs[i] = HWLOC_SUBBITMAP_ZERO;
+		set->infinite = 0;
+		return;
+	}
+
+	/* finite range */
+
+	/* truncate the range according to the infinitely-unset part of the bitmap */
+	if (!set->infinite && endcpu >= set->ulongs_count * HWLOC_BITS_PER_LONG)
+		endcpu = set->ulongs_count * HWLOC_BITS_PER_LONG - 1;
+
+	hwloc_bitmap_realloc_by_cpu_index(set, endcpu);
+
+	beginset = HWLOC_SUBBITMAP_INDEX(begincpu);
+	endset = HWLOC_SUBBITMAP_INDEX(endcpu);
+	for(i=beginset+1; i<endset; i++)
+		set->ulongs[i] = HWLOC_SUBBITMAP_ZERO;
+	if (beginset == endset) {
+		set->ulongs[beginset] &= ~HWLOC_SUBBITMAP_ULBIT_FROMTO(HWLOC_SUBBITMAP_CPU_ULBIT(begincpu), HWLOC_SUBBITMAP_CPU_ULBIT(endcpu));
+	} else {
+		set->ulongs[beginset] &= ~HWLOC_SUBBITMAP_ULBIT_FROM(HWLOC_SUBBITMAP_CPU_ULBIT(begincpu));
+		set->ulongs[endset] &= ~HWLOC_SUBBITMAP_ULBIT_TO(HWLOC_SUBBITMAP_CPU_ULBIT(endcpu));
+	}
+}
+
+/* Return 1 if bit cpu is set, 0 otherwise.
+ * Bits beyond the valid ulongs follow the infinite flag.
+ */
+int hwloc_bitmap_isset(const struct hwloc_bitmap_s * set, unsigned cpu)
+{
+	unsigned long word;
+
+	HWLOC__BITMAP_CHECK(set);
+
+	word = HWLOC_SUBBITMAP_READULONG(set, HWLOC_SUBBITMAP_INDEX(cpu));
+	return (word & HWLOC_SUBBITMAP_CPU(cpu)) != 0;
+}
+
+/* Return 1 if no bit is set at all, 0 otherwise. */
+int hwloc_bitmap_iszero(const struct hwloc_bitmap_s *set)
+{
+	unsigned idx = 0;
+
+	HWLOC__BITMAP_CHECK(set);
+
+	/* an infinite bitmap always has bits set beyond its valid ulongs */
+	if (set->infinite)
+		return 0;
+
+	while (idx < set->ulongs_count) {
+		if (set->ulongs[idx] != HWLOC_SUBBITMAP_ZERO)
+			return 0;
+		idx++;
+	}
+	return 1;
+}
+
+/* Return 1 if every bit is set, 0 otherwise. */
+int hwloc_bitmap_isfull(const struct hwloc_bitmap_s *set)
+{
+	unsigned idx = 0;
+
+	HWLOC__BITMAP_CHECK(set);
+
+	/* a finite bitmap always has bits unset beyond its valid ulongs */
+	if (!set->infinite)
+		return 0;
+
+	while (idx < set->ulongs_count) {
+		if (set->ulongs[idx] != HWLOC_SUBBITMAP_FULL)
+			return 0;
+		idx++;
+	}
+	return 1;
+}
+
+/* Return 1 if both bitmaps contain exactly the same bits, 0 otherwise.
+ * The bitmaps may have different numbers of valid ulongs.
+ */
+int hwloc_bitmap_isequal (const struct hwloc_bitmap_s *set1, const struct hwloc_bitmap_s *set2)
+{
+	unsigned count1 = set1->ulongs_count;
+	unsigned count2 = set2->ulongs_count;
+	unsigned common = count1 < count2 ? count1 : count2;
+	/* what each bitmap implicitly contains beyond its own valid ulongs */
+	unsigned long beyond1, beyond2;
+	unsigned i;
+
+	HWLOC__BITMAP_CHECK(set1);
+	HWLOC__BITMAP_CHECK(set2);
+
+	for(i=0; i<common; i++) {
+		if (set1->ulongs[i] != set2->ulongs[i])
+			return 0;
+	}
+
+	/* the extra ulongs of the longer bitmap must match the implicit part of the
+	 * shorter one; at most one of these loops iterates */
+	beyond1 = set1->infinite ? HWLOC_SUBBITMAP_FULL : HWLOC_SUBBITMAP_ZERO;
+	beyond2 = set2->infinite ? HWLOC_SUBBITMAP_FULL : HWLOC_SUBBITMAP_ZERO;
+	for(i=common; i<count1; i++) {
+		if (set1->ulongs[i] != beyond2)
+			return 0;
+	}
+	for(i=common; i<count2; i++) {
+		if (set2->ulongs[i] != beyond1)
+			return 0;
+	}
+
+	return set1->infinite == set2->infinite;
+}
+
+/* Return 1 if at least one bit is set in both bitmaps, 0 otherwise. */
+int hwloc_bitmap_intersects (const struct hwloc_bitmap_s *set1, const struct hwloc_bitmap_s *set2)
+{
+	unsigned count1 = set1->ulongs_count;
+	unsigned count2 = set2->ulongs_count;
+	unsigned common = count1 < count2 ? count1 : count2;
+	unsigned i;
+
+	HWLOC__BITMAP_CHECK(set1);
+	HWLOC__BITMAP_CHECK(set2);
+
+	for(i=0; i<common; i++) {
+		if (set1->ulongs[i] & set2->ulongs[i])
+			return 1;
+	}
+
+	/* beyond the common part, an explicit bit of one bitmap only intersects
+	 * the infinite part of the other; at most one of these loops iterates */
+	if (set2->infinite) {
+		for(i=common; i<count1; i++) {
+			if (set1->ulongs[i])
+				return 1;
+		}
+	}
+	if (set1->infinite) {
+		for(i=common; i<count2; i++) {
+			if (set2->ulongs[i])
+				return 1;
+		}
+	}
+
+	/* two infinite parts always overlap */
+	return set1->infinite && set2->infinite;
+}
+
+/* Return 1 if every bit of sub_set is also set in super_set, 0 otherwise. */
+int hwloc_bitmap_isincluded (const struct hwloc_bitmap_s *sub_set, const struct hwloc_bitmap_s *super_set)
+{
+	unsigned super_count = super_set->ulongs_count;
+	unsigned sub_count = sub_set->ulongs_count;
+	unsigned common = super_count < sub_count ? super_count : sub_count;
+	unsigned i;
+
+	HWLOC__BITMAP_CHECK(sub_set);
+	HWLOC__BITMAP_CHECK(super_set);
+
+	/* no bit of sub_set may lie outside super_set in the common part */
+	for(i=0; i<common; i++) {
+		if (sub_set->ulongs[i] & ~super_set->ulongs[i])
+			return 0;
+	}
+
+	/* extra ulongs of sub_set must be empty unless super_set is infinite */
+	if (!super_set->infinite) {
+		for(i=common; i<sub_count; i++) {
+			if (sub_set->ulongs[i])
+				return 0;
+		}
+	}
+	/* extra ulongs of super_set must be full if sub_set is infinite */
+	if (sub_set->infinite) {
+		for(i=common; i<super_count; i++) {
+			if (super_set->ulongs[i] != HWLOC_SUBBITMAP_FULL)
+				return 0;
+		}
+	}
+
+	/* an infinite sub_set only fits inside an infinite super_set */
+	return !sub_set->infinite || super_set->infinite;
+}
+
+/* Store the union of set1 and set2 into res. res may be set1 or set2. */
+void hwloc_bitmap_or (struct hwloc_bitmap_s *res, const struct hwloc_bitmap_s *set1, const struct hwloc_bitmap_s *set2)
+{
+	/* snapshot the counts first: res may alias an input and is resized below */
+	const unsigned count1 = set1->ulongs_count;
+	const unsigned count2 = set2->ulongs_count;
+	const unsigned max_count = count1 > count2 ? count1 : count2;
+	const unsigned min_count = count1 > count2 ? count2 : count1;
+	unsigned i;
+
+	HWLOC__BITMAP_CHECK(res);
+	HWLOC__BITMAP_CHECK(set1);
+	HWLOC__BITMAP_CHECK(set2);
+
+	hwloc_bitmap_reset_by_ulongs(res, max_count);
+
+	for(i=0; i<min_count; i++)
+		res->ulongs[i] = set1->ulongs[i] | set2->ulongs[i];
+
+	if (count1 > count2) {
+		if (set2->infinite) {
+			/* the tail of set1 is absorbed by the infinite part of set2 */
+			res->ulongs_count = min_count;
+		} else {
+			for(; i<max_count; i++)
+				res->ulongs[i] = set1->ulongs[i];
+		}
+	} else if (count2 > count1) {
+		if (set1->infinite) {
+			/* the tail of set2 is absorbed by the infinite part of set1 */
+			res->ulongs_count = min_count;
+		} else {
+			for(; i<max_count; i++)
+				res->ulongs[i] = set2->ulongs[i];
+		}
+	}
+
+	res->infinite = set1->infinite || set2->infinite;
+}
+
+/* Store the intersection of set1 and set2 into res. res may be set1 or set2. */
+void hwloc_bitmap_and (struct hwloc_bitmap_s *res, const struct hwloc_bitmap_s *set1, const struct hwloc_bitmap_s *set2)
+{
+	/* snapshot the counts first: res may alias an input and is resized below */
+	const unsigned count1 = set1->ulongs_count;
+	const unsigned count2 = set2->ulongs_count;
+	const unsigned max_count = count1 > count2 ? count1 : count2;
+	const unsigned min_count = count1 > count2 ? count2 : count1;
+	unsigned i;
+
+	HWLOC__BITMAP_CHECK(res);
+	HWLOC__BITMAP_CHECK(set1);
+	HWLOC__BITMAP_CHECK(set2);
+
+	hwloc_bitmap_reset_by_ulongs(res, max_count);
+
+	for(i=0; i<min_count; i++)
+		res->ulongs[i] = set1->ulongs[i] & set2->ulongs[i];
+
+	if (count1 > count2) {
+		if (set2->infinite) {
+			/* the infinite part of set2 lets the tail of set1 through */
+			for(; i<max_count; i++)
+				res->ulongs[i] = set1->ulongs[i];
+		} else {
+			res->ulongs_count = min_count;
+		}
+	} else if (count2 > count1) {
+		if (set1->infinite) {
+			/* the infinite part of set1 lets the tail of set2 through */
+			for(; i<max_count; i++)
+				res->ulongs[i] = set2->ulongs[i];
+		} else {
+			res->ulongs_count = min_count;
+		}
+	}
+
+	res->infinite = set1->infinite && set2->infinite;
+}
+
+/* Store into res the bits of set1 that are not set in set2. res may be set1 or set2. */
+void hwloc_bitmap_andnot (struct hwloc_bitmap_s *res, const struct hwloc_bitmap_s *set1, const struct hwloc_bitmap_s *set2)
+{
+	/* snapshot the counts first: res may alias an input and is resized below */
+	const unsigned count1 = set1->ulongs_count;
+	const unsigned count2 = set2->ulongs_count;
+	const unsigned max_count = count1 > count2 ? count1 : count2;
+	const unsigned min_count = count1 > count2 ? count2 : count1;
+	unsigned i;
+
+	HWLOC__BITMAP_CHECK(res);
+	HWLOC__BITMAP_CHECK(set1);
+	HWLOC__BITMAP_CHECK(set2);
+
+	hwloc_bitmap_reset_by_ulongs(res, max_count);
+
+	for(i=0; i<min_count; i++)
+		res->ulongs[i] = set1->ulongs[i] & ~set2->ulongs[i];
+
+	if (count1 > count2) {
+		if (set2->infinite) {
+			/* the infinite part of set2 removes the whole tail of set1 */
+			res->ulongs_count = min_count;
+		} else {
+			for(; i<max_count; i++)
+				res->ulongs[i] = set1->ulongs[i];
+		}
+	} else if (count2 > count1) {
+		if (set1->infinite) {
+			/* the infinite part of set1 keeps whatever the tail of set2 does not hold */
+			for(; i<max_count; i++)
+				res->ulongs[i] = ~set2->ulongs[i];
+		} else {
+			res->ulongs_count = min_count;
+		}
+	}
+
+	res->infinite = set1->infinite && !set2->infinite;
+}
+
+/* Store into res the bits set in exactly one of set1 and set2. res may be set1 or set2. */
+void hwloc_bitmap_xor (struct hwloc_bitmap_s *res, const struct hwloc_bitmap_s *set1, const struct hwloc_bitmap_s *set2)
+{
+	/* snapshot the counts first: res may alias an input and is resized below */
+	const unsigned count1 = set1->ulongs_count;
+	const unsigned count2 = set2->ulongs_count;
+	const unsigned max_count = count1 > count2 ? count1 : count2;
+	const unsigned min_count = count1 > count2 ? count2 : count1;
+	unsigned i;
+
+	HWLOC__BITMAP_CHECK(res);
+	HWLOC__BITMAP_CHECK(set1);
+	HWLOC__BITMAP_CHECK(set2);
+
+	hwloc_bitmap_reset_by_ulongs(res, max_count);
+
+	for(i=0; i<min_count; i++)
+		res->ulongs[i] = set1->ulongs[i] ^ set2->ulongs[i];
+
+	/* the tail of the longer bitmap is combined with the implicit part of the shorter one */
+	if (count1 > count2) {
+		const unsigned long beyond2 = set2->infinite ? HWLOC_SUBBITMAP_FULL : HWLOC_SUBBITMAP_ZERO;
+		for(; i<max_count; i++)
+			res->ulongs[i] = set1->ulongs[i] ^ beyond2;
+	} else if (count2 > count1) {
+		const unsigned long beyond1 = set1->infinite ? HWLOC_SUBBITMAP_FULL : HWLOC_SUBBITMAP_ZERO;
+		for(; i<max_count; i++)
+			res->ulongs[i] = set2->ulongs[i] ^ beyond1;
+	}
+
+	res->infinite = (!set1->infinite) != (!set2->infinite);
+}
+
+/* Store the complement of set into res. res may be set itself. */
+void hwloc_bitmap_not (struct hwloc_bitmap_s *res, const struct hwloc_bitmap_s *set)
+{
+	/* snapshot the count first: res may alias set and is resized below */
+	const unsigned count = set->ulongs_count;
+	unsigned i = 0;
+
+	HWLOC__BITMAP_CHECK(res);
+	HWLOC__BITMAP_CHECK(set);
+
+	hwloc_bitmap_reset_by_ulongs(res, count);
+
+	while (i < count) {
+		res->ulongs[i] = ~set->ulongs[i];
+		i++;
+	}
+
+	res->infinite = !set->infinite;
+}
+
+/* Return the index of the lowest bit set in the bitmap, or -1 if it is empty. */
+int hwloc_bitmap_first(const struct hwloc_bitmap_s * set)
+{
+	unsigned i = 0;
+
+	HWLOC__BITMAP_CHECK(set);
+
+	while (i < set->ulongs_count) {
+		/* subsets are unsigned longs, use ffsl */
+		unsigned long w = set->ulongs[i];
+		if (w)
+			return hwloc_ffsl(w) - 1 + HWLOC_BITS_PER_LONG*i;
+		i++;
+	}
+
+	if (!set->infinite)
+		return -1;
+
+	/* all valid ulongs are empty, the first set bit is the first implicit one */
+	return set->ulongs_count * HWLOC_BITS_PER_LONG;
+}
+
+/* Return the index of the highest bit set in the bitmap,
+ * or -1 if the bitmap is empty or infinite (no highest bit exists).
+ */
+int hwloc_bitmap_last(const struct hwloc_bitmap_s * set)
+{
+	int i;
+
+	HWLOC__BITMAP_CHECK(set);
+
+	if (!set->infinite) {
+		/* walk the valid ulongs from the most significant one */
+		i = set->ulongs_count-1;
+		while (i >= 0) {
+			/* subsets are unsigned longs, use flsl */
+			unsigned long w = set->ulongs[i];
+			if (w)
+				return hwloc_flsl(w) - 1 + HWLOC_BITS_PER_LONG*i;
+			i--;
+		}
+	}
+
+	return -1;
+}
+
+int hwloc_bitmap_next(const struct hwloc_bitmap_s * set, int prev_cpu)
+{
+	unsigned i = HWLOC_SUBBITMAP_INDEX(prev_cpu + 1);
+
+	HWLOC__BITMAP_CHECK(set);
+
+	if (i >= set->ulongs_count) {
+		if (set->infinite)
+			return prev_cpu + 1;
+		else
+			return -1;
+	}
+
+	for(; i<set->ulongs_count; i++) {
+		/* subsets are unsigned longs, use ffsl */
+		unsigned long w = set->ulongs[i];
+
+		/* if the prev cpu is in the same word as the possible next one,
+		   we need to mask out previous cpus */
+		if (prev_cpu >= 0 && HWLOC_SUBBITMAP_INDEX((unsigned) prev_cpu) == i)
+			w &= ~HWLOC_SUBBITMAP_ULBIT_TO(HWLOC_SUBBITMAP_CPU_ULBIT(prev_cpu));
+
+		if (w)
+			return hwloc_ffsl(w) - 1 + HWLOC_BITS_PER_LONG*i;
+	}
+
+	if (set->infinite)
+		return set->ulongs_count * HWLOC_BITS_PER_LONG;
+
+	return -1;
+}
+
+void hwloc_bitmap_singlify(struct hwloc_bitmap_s * set)
+{
+	unsigned i;
+	int found = 0;
+
+	HWLOC__BITMAP_CHECK(set);
+
+	for(i=0; i<set->ulongs_count; i++) {
+		if (found) {
+			set->ulongs[i] = HWLOC_SUBBITMAP_ZERO;
+			continue;
+		} else {
+			/* subsets are unsigned longs, use ffsl */
+			unsigned long w = set->ulongs[i];
+			if (w) {
+				int _ffs = hwloc_ffsl(w);
+				set->ulongs[i] = HWLOC_SUBBITMAP_CPU(_ffs-1);
+				found = 1;
+			}
+		}
+	}
+
+	if (set->infinite) {
+		if (found) {
+			set->infinite = 0;
+		} else {
+			/* set the first non allocated bit */
+			unsigned first = set->ulongs_count * HWLOC_BITS_PER_LONG;
+			set->infinite = 0; /* do not let realloc fill the newly allocated sets */
+			hwloc_bitmap_set(set, first);
+		}
+	}
+}
+
+int hwloc_bitmap_compare_first(const struct hwloc_bitmap_s * set1, const struct hwloc_bitmap_s * set2)
+{
+	unsigned count1 = set1->ulongs_count;
+	unsigned count2 = set2->ulongs_count;
+	unsigned max_count = count1 > count2 ? count1 : count2;
+	unsigned min_count = count1 + count2 - max_count;
+	unsigned i;
+
+	HWLOC__BITMAP_CHECK(set1);
+	HWLOC__BITMAP_CHECK(set2);
+
+	for(i=0; i<min_count; i++) {
+		unsigned long w1 = set1->ulongs[i];
+		unsigned long w2 = set2->ulongs[i];
+		if (w1 || w2) {
+			int _ffs1 = hwloc_ffsl(w1);
+			int _ffs2 = hwloc_ffsl(w2);
+			/* if both have a bit set, compare for real */
+			if (_ffs1 && _ffs2)
+				return _ffs1-_ffs2;
+			/* one is empty, and it is considered higher, so reverse-compare them */
+			return _ffs2-_ffs1;
+		}
+	}
+
+	if (count1 != count2) {
+		if (min_count < count2) {
+			for(i=min_count; i<count2; i++) {
+				unsigned long w2 = set2->ulongs[i];
+				if (set1->infinite)
+					return -!(w2 & 1);
+				else if (w2)
+					return 1;
+			}
+		} else {
+			for(i=min_count; i<count1; i++) {
+				unsigned long w1 = set1->ulongs[i];
+				if (set2->infinite)
+					return !(w1 & 1);
+				else if (w1)
+					return -1;
+			}
+		}
+	}
+
+	return !!set1->infinite - !!set2->infinite;
+}
+
+int hwloc_bitmap_compare(const struct hwloc_bitmap_s * set1, const struct hwloc_bitmap_s * set2)
+{
+	unsigned count1 = set1->ulongs_count;
+	unsigned count2 = set2->ulongs_count;
+	unsigned max_count = count1 > count2 ? count1 : count2;
+	unsigned min_count = count1 + count2 - max_count;
+	int i;
+
+	HWLOC__BITMAP_CHECK(set1);
+	HWLOC__BITMAP_CHECK(set2);
+
+	if ((!set1->infinite) != (!set2->infinite))
+		return !!set1->infinite - !!set2->infinite;
+
+	if (count1 != count2) {
+		if (min_count < count2) {
+			unsigned long val1 = set1->infinite ? HWLOC_SUBBITMAP_FULL :  HWLOC_SUBBITMAP_ZERO;
+			for(i=max_count-1; i>=(signed) min_count; i--) {
+				unsigned long val2 = set2->ulongs[i];
+				if (val1 == val2)
+					continue;
+				return val1 < val2 ? -1 : 1;
+			}
+		} else {
+			unsigned long val2 = set2->infinite ? HWLOC_SUBBITMAP_FULL :  HWLOC_SUBBITMAP_ZERO;
+			for(i=max_count-1; i>=(signed) min_count; i--) {
+				unsigned long val1 = set1->ulongs[i];
+				if (val1 == val2)
+					continue;
+				return val1 < val2 ? -1 : 1;
+			}
+		}
+	}
+
+	for(i=min_count-1; i>=0; i--) {
+		unsigned long val1 = set1->ulongs[i];
+		unsigned long val2 = set2->ulongs[i];
+		if (val1 == val2)
+			continue;
+		return val1 < val2 ? -1 : 1;
+	}
+
+	return 0;
+}
+
+int hwloc_bitmap_weight(const struct hwloc_bitmap_s * set)
+{
+	int weight = 0;
+	unsigned i;
+
+	HWLOC__BITMAP_CHECK(set);
+
+	if (set->infinite)
+		return -1;
+
+	for(i=0; i<set->ulongs_count; i++)
+		weight += hwloc_weight_long(set->ulongs[i]);
+	return weight;
+}
+
+int hwloc_bitmap_compare_inclusion(const struct hwloc_bitmap_s * set1, const struct hwloc_bitmap_s * set2)
+{
+	unsigned max_count = set1->ulongs_count > set2->ulongs_count ? set1->ulongs_count : set2->ulongs_count;
+	int result = HWLOC_BITMAP_EQUAL; /* means empty sets return equal */
+	int empty1 = 1;
+	int empty2 = 1;
+	unsigned i;
+
+	HWLOC__BITMAP_CHECK(set1);
+	HWLOC__BITMAP_CHECK(set2);
+
+	for(i=0; i<max_count; i++) {
+	  unsigned long val1 = HWLOC_SUBBITMAP_READULONG(set1, (unsigned) i);
+	  unsigned long val2 = HWLOC_SUBBITMAP_READULONG(set2, (unsigned) i);
+
+	  if (!val1) {
+	    if (!val2)
+	      /* both empty, no change */
+	      continue;
+
+	    /* val1 empty, val2 not */
+	    if (result == HWLOC_BITMAP_CONTAINS) {
+	      if (!empty2)
+		return HWLOC_BITMAP_INTERSECTS;
+	      result = HWLOC_BITMAP_DIFFERENT;
+	    } else if (result == HWLOC_BITMAP_EQUAL) {
+	      result = HWLOC_BITMAP_INCLUDED;
+	    }
+	    /* no change otherwise */
+
+	  } else if (!val2) {
+	    /* val2 empty, val1 not */
+	    if (result == HWLOC_BITMAP_INCLUDED) {
+	      if (!empty1)
+		return HWLOC_BITMAP_INTERSECTS;
+	      result = HWLOC_BITMAP_DIFFERENT;
+	    } else if (result == HWLOC_BITMAP_EQUAL) {
+	      result = HWLOC_BITMAP_CONTAINS;
+	    }
+	    /* no change otherwise */
+
+	  } else if (val1 == val2) {
+	    /* equal and not empty */
+	    if (result == HWLOC_BITMAP_DIFFERENT)
+	      return HWLOC_BITMAP_INTERSECTS;
+	    /* equal/contains/included unchanged */
+
+	  } else if ((val1 & val2) == val1) {
+	    /* included and not empty */
+	    if (result == HWLOC_BITMAP_CONTAINS || result == HWLOC_BITMAP_DIFFERENT)
+	      return HWLOC_BITMAP_INTERSECTS;
+	    /* equal/included unchanged */
+	    result = HWLOC_BITMAP_INCLUDED;
+
+	  } else if ((val1 & val2) == val2) {
+	    /* contains and not empty */
+	    if (result == HWLOC_BITMAP_INCLUDED || result == HWLOC_BITMAP_DIFFERENT)
+	      return HWLOC_BITMAP_INTERSECTS;
+	    /* equal/contains unchanged */
+	    result = HWLOC_BITMAP_CONTAINS;
+
+	  } else if ((val1 & val2) != 0) {
+	    /* intersects and not empty */
+	    return HWLOC_BITMAP_INTERSECTS;
+
+	  } else {
+	    /* different and not empty */
+
+	    /* equal/included/contains with non-empty sets means intersects */
+	    if (result == HWLOC_BITMAP_EQUAL && !empty1 /* implies !empty2 */)
+	      return HWLOC_BITMAP_INTERSECTS;
+	    if (result == HWLOC_BITMAP_INCLUDED && !empty1)
+	      return HWLOC_BITMAP_INTERSECTS;
+	    if (result == HWLOC_BITMAP_CONTAINS && !empty2)
+	      return HWLOC_BITMAP_INTERSECTS;
+	    /* otherwise means different */
+	    result = HWLOC_BITMAP_DIFFERENT;
+	  }
+
+	  empty1 &= !val1;
+	  empty2 &= !val2;
+	}
+
+	if (!set1->infinite) {
+	  if (set2->infinite) {
+	    /* set2 infinite only */
+	    if (result == HWLOC_BITMAP_CONTAINS) {
+	      if (!empty2)
+		return HWLOC_BITMAP_INTERSECTS;
+	      result = HWLOC_BITMAP_DIFFERENT;
+	    } else if (result == HWLOC_BITMAP_EQUAL) {
+	      result = HWLOC_BITMAP_INCLUDED;
+	    }
+	    /* no change otherwise */
+	  }
+	} else if (!set2->infinite) {
+	  /* set1 infinite only */
+	  if (result == HWLOC_BITMAP_INCLUDED) {
+	    if (!empty1)
+	      return HWLOC_BITMAP_INTERSECTS;
+	    result = HWLOC_BITMAP_DIFFERENT;
+	  } else if (result == HWLOC_BITMAP_EQUAL) {
+	    result = HWLOC_BITMAP_CONTAINS;
+	  }
+	  /* no change otherwise */
+	} else {
+	  /* both infinite */
+	  if (result == HWLOC_BITMAP_DIFFERENT)
+	    return HWLOC_BITMAP_INTERSECTS;
+	  /* equal/contains/included unchanged */
+	}
+
+	return result;
+}
diff --git a/ext/hwloc/hwloc/components.c b/ext/hwloc/hwloc/components.c
new file mode 100644
index 0000000..7aa3b9d
--- /dev/null
+++ b/ext/hwloc/hwloc/components.c
@@ -0,0 +1,792 @@
+/*
+ * Copyright © 2009-2015 Inria.  All rights reserved.
+ * Copyright © 2012 Université Bordeau 1
+ * See COPYING in top-level directory.
+ */
+
+#include <private/autogen/config.h>
+#include <hwloc.h>
+#include <private/private.h>
+#include <private/xml.h>
+
+#define HWLOC_COMPONENT_STOP_NAME "stop"
+#define HWLOC_COMPONENT_EXCLUDE_CHAR '-'
+#define HWLOC_COMPONENT_SEPS ","
+
+/* list of all registered discovery components, sorted by priority, higher priority first.
+ * noos is last because its priority is 0.
+ * others' priority is 10.
+ */
+static struct hwloc_disc_component * hwloc_disc_components = NULL;
+
+static unsigned hwloc_components_users = 0; /* number of current users: the first one initializes, the last one destroys */
+
+static int hwloc_components_verbose = 0;
+#ifdef HWLOC_HAVE_PLUGINS
+static int hwloc_plugins_verbose = 0;
+#endif
+
+/* hwloc_components_mutex serializes:
+ * - loading/unloading plugins, and modifications of the hwloc_plugins list
+ * - calls to ltdl, including in hwloc_check_plugin_namespace()
+ * - registration of components with hwloc_disc_component_register()
+ *   and hwloc_xml_callbacks_register()
+ */
+#ifdef HWLOC_WIN_SYS
+/* Basic mutex on top of InterlockedCompareExchange() on Windows.
+ * Far from perfect, but easy to maintain, and more than enough given that this code will never be needed for real. */
+#include <windows.h>
+static LONG hwloc_components_mutex = 0;
+#define HWLOC_COMPONENTS_LOCK() do {						\
+  while (InterlockedCompareExchange(&hwloc_components_mutex, 1, 0) != 0)	\
+    SwitchToThread();								\
+} while (0)
+#define HWLOC_COMPONENTS_UNLOCK() do {						\
+  assert(hwloc_components_mutex == 1);						\
+  hwloc_components_mutex = 0;							\
+} while (0)
+
+#elif defined HWLOC_HAVE_PTHREAD_MUTEX
+/* pthread mutex if available (except on windows) */
+#include <pthread.h>
+static pthread_mutex_t hwloc_components_mutex = PTHREAD_MUTEX_INITIALIZER;
+#define HWLOC_COMPONENTS_LOCK() pthread_mutex_lock(&hwloc_components_mutex)
+#define HWLOC_COMPONENTS_UNLOCK() pthread_mutex_unlock(&hwloc_components_mutex)
+
+#else /* HWLOC_WIN_SYS || HWLOC_HAVE_PTHREAD_MUTEX */
+#error No mutex implementation available
+#endif
+
+
+#ifdef HWLOC_HAVE_PLUGINS
+
+#include <ltdl.h>
+
+/* array of pointers to dynamically loaded plugins */
+static struct hwloc__plugin_desc {
+  char *name;
+  struct hwloc_component *component;
+  char *filename;
+  lt_dlhandle handle;
+  struct hwloc__plugin_desc *next;
+} *hwloc_plugins = NULL;
+
+/* lt_dlforeachfile() callback: try to load `filename' as a hwloc plugin and append it to hwloc_plugins.
+ * Always returns 0, even when the file is rejected or an allocation fails. */
+static int
+hwloc__dlforeach_cb(const char *filename, void *_data __hwloc_attribute_unused)
+{
+  const char *basename;
+  lt_dlhandle handle;
+  char *componentsymbolname = NULL;
+  struct hwloc_component *component;
+  struct hwloc__plugin_desc *desc, **prevdesc;
+
+  if (hwloc_plugins_verbose)
+    fprintf(stderr, "Plugin dlforeach found `%s'\n", filename);
+
+  basename = strrchr(filename, '/');
+  if (!basename)
+    basename = filename;
+  else
+    basename++;
+
+  /* dlopen and get the component structure */
+  handle = lt_dlopenext(filename);
+  if (!handle) {
+    if (hwloc_plugins_verbose)
+      fprintf(stderr, "Failed to load plugin: %s\n", lt_dlerror());
+    goto out;
+  }
+  componentsymbolname = malloc(strlen(basename) + sizeof("_component")); /* sizeof counts the ending \0 */
+  if (!componentsymbolname)
+    goto out_with_handle; /* cannot even build the symbol name, drop this plugin */
+  sprintf(componentsymbolname, "%s_component", basename);
+  component = lt_dlsym(handle, componentsymbolname);
+  if (!component) {
+    if (hwloc_plugins_verbose)
+      fprintf(stderr, "Failed to find component symbol `%s'\n", componentsymbolname);
+    goto out_with_handle;
+  }
+  if (component->abi != HWLOC_COMPONENT_ABI) {
+    if (hwloc_plugins_verbose)
+      fprintf(stderr, "Plugin symbol ABI %u instead of %u\n", component->abi, HWLOC_COMPONENT_ABI);
+    goto out_with_handle;
+  }
+  if (hwloc_plugins_verbose)
+    fprintf(stderr, "Plugin contains expected symbol `%s'\n", componentsymbolname);
+  free(componentsymbolname);
+  componentsymbolname = NULL;
+
+  if (HWLOC_COMPONENT_TYPE_DISC == component->type) {
+    if (strncmp(basename, "hwloc_", 6)) {
+      if (hwloc_plugins_verbose)
+	fprintf(stderr, "Plugin name `%s' doesn't match its type DISCOVERY\n", basename);
+      goto out_with_handle;
+    }
+  } else if (HWLOC_COMPONENT_TYPE_XML == component->type) {
+    if (strncmp(basename, "hwloc_xml_", 10)) {
+      if (hwloc_plugins_verbose)
+	fprintf(stderr, "Plugin name `%s' doesn't match its type XML\n", basename);
+      goto out_with_handle;
+    }
+  } else {
+    if (hwloc_plugins_verbose)
+      fprintf(stderr, "Plugin name `%s' has invalid type %u\n", basename, (unsigned) component->type);
+    goto out_with_handle;
+  }
+
+  /* allocate a plugin_desc and queue it */
+  desc = malloc(sizeof(*desc));
+  if (!desc)
+    goto out_with_handle;
+  desc->name = strdup(basename);
+  desc->filename = strdup(filename);
+  desc->component = component;
+  desc->handle = handle;
+  desc->next = NULL;
+  if (hwloc_plugins_verbose)
+    fprintf(stderr, "Plugin descriptor `%s' ready\n", basename);
+
+  /* append to the list */
+  prevdesc = &hwloc_plugins;
+  while (*prevdesc)
+    prevdesc = &((*prevdesc)->next);
+  *prevdesc = desc;
+  if (hwloc_plugins_verbose)
+    fprintf(stderr, "Plugin descriptor `%s' queued\n", basename);
+  return 0;
+
+ out_with_handle:
+  lt_dlclose(handle);
+  free(componentsymbolname); /* NULL if already freed */
+ out:
+  return 0;
+}
+
+static void
+hwloc_plugins_exit(void)
+{
+  struct hwloc__plugin_desc *desc, *next;
+
+  if (hwloc_plugins_verbose)
+    fprintf(stderr, "Closing all plugins\n");
+
+  desc = hwloc_plugins;
+  while (desc) {
+    next = desc->next;
+    lt_dlclose(desc->handle);
+    free(desc->name);
+    free(desc->filename);
+    free(desc);
+    desc = next;
+  }
+  hwloc_plugins = NULL;
+
+  lt_dlexit();
+}
+
+/* Initialize ltdl and load all plugins found in the plugin path (HWLOC_PLUGINS_PATH envvar overrides the default). Returns 0 or -1. */
+static int
+hwloc_plugins_init(void)
+{
+  const char *verboseenv, *env;
+  const char *path = HWLOC_PLUGINS_PATH; /* const since it may point to the getenv() result below */
+  int err;
+
+  verboseenv = getenv("HWLOC_PLUGINS_VERBOSE");
+  hwloc_plugins_verbose = verboseenv ? atoi(verboseenv) : 0;
+
+  err = lt_dlinit();
+  if (err)
+    goto out;
+
+  env = getenv("HWLOC_PLUGINS_PATH");
+  if (env)
+    path = env;
+
+  hwloc_plugins = NULL;
+
+  if (hwloc_plugins_verbose)
+    fprintf(stderr, "Starting plugin dlforeach in %s\n", path);
+  err = lt_dlforeachfile(path, hwloc__dlforeach_cb, NULL);
+  if (err)
+    goto out_with_init;
+
+  return 0;
+
+ out_with_init:
+  hwloc_plugins_exit();
+ out:
+  return -1;
+}
+
+#endif /* HWLOC_HAVE_PLUGINS */
+
+static const char *
+hwloc_disc_component_type_string(hwloc_disc_component_type_t type)
+{
+  switch (type) {
+  case HWLOC_DISC_COMPONENT_TYPE_CPU: return "cpu";
+  case HWLOC_DISC_COMPONENT_TYPE_GLOBAL: return "global";
+  case HWLOC_DISC_COMPONENT_TYPE_MISC: return "misc";
+  default: return "**unknown**";
+  }
+}
+
+/* Insert `component' in the hwloc_disc_components list, which is kept sorted by decreasing priority.
+ * Returns -1 if its name or type is invalid, or if a same-name component with higher or equal priority exists. */
+static int
+hwloc_disc_component_register(struct hwloc_disc_component *component, const char *filename)
+{
+  struct hwloc_disc_component **prev;
+
+  /* check that the component name is valid */
+  if (!strcmp(component->name, HWLOC_COMPONENT_STOP_NAME)) {
+    if (hwloc_components_verbose)
+      fprintf(stderr, "Cannot register discovery component with reserved name `" HWLOC_COMPONENT_STOP_NAME "'\n");
+    return -1;
+  }
+  if (strchr(component->name, HWLOC_COMPONENT_EXCLUDE_CHAR)
+      || strcspn(component->name, HWLOC_COMPONENT_SEPS) != strlen(component->name)) {
+    if (hwloc_components_verbose)
+      fprintf(stderr, "Cannot register discovery component with name `%s' containing reserved characters `%c" HWLOC_COMPONENT_SEPS "'\n",
+	      component->name, HWLOC_COMPONENT_EXCLUDE_CHAR);
+    return -1;
+  }
+  /* check that the component type is valid */
+  switch ((unsigned) component->type) {
+  case HWLOC_DISC_COMPONENT_TYPE_CPU:
+  case HWLOC_DISC_COMPONENT_TYPE_GLOBAL:
+  case HWLOC_DISC_COMPONENT_TYPE_MISC:
+    break;
+  default:
+    fprintf(stderr, "Cannot register discovery component `%s' with unknown type %u\n", component->name, (unsigned) component->type);
+    return -1;
+  }
+
+  prev = &hwloc_disc_components;
+  while (NULL != *prev) {
+    if (!strcmp((*prev)->name, component->name)) {
+      /* if two components have the same name, only keep the highest priority one */
+      if ((*prev)->priority < component->priority) {
+	/* drop the existing component, and stay on this link since it now holds the next component (or NULL) */
+	if (hwloc_components_verbose)
+	  fprintf(stderr, "Dropping previously registered discovery component `%s', priority %u lower than new one %u\n",
+		  (*prev)->name, (*prev)->priority, component->priority);
+	*prev = (*prev)->next;
+      } else {
+	/* drop the new one */
+	if (hwloc_components_verbose)
+	  fprintf(stderr, "Ignoring new discovery component `%s', priority %u lower than previously registered one %u\n",
+		  component->name, component->priority, (*prev)->priority);
+	return -1;
+      }
+    } else
+      prev = &((*prev)->next);
+  }
+  if (hwloc_components_verbose)
+    fprintf(stderr, "Registered %s discovery component `%s' with priority %u (%s%s)\n",
+	    hwloc_disc_component_type_string(component->type), component->name, component->priority,
+	    filename ? "from plugin " : "statically build", filename ? filename : "");
+
+  prev = &hwloc_disc_components;
+  while (NULL != *prev) {
+    if ((*prev)->priority < component->priority)
+      break;
+    prev = &((*prev)->next);
+  }
+  component->next = *prev;
+  *prev = component;
+  return 0;
+}
+
+#include <static-components.h>
+
+static void (**hwloc_component_finalize_cbs)(unsigned long);
+static unsigned hwloc_component_finalize_cb_count;
+
+void
+hwloc_components_init(struct hwloc_topology *topology __hwloc_attribute_unused)
+{
+#ifdef HWLOC_HAVE_PLUGINS
+  struct hwloc__plugin_desc *desc;
+#endif
+  const char *verboseenv;
+  unsigned i;
+
+  HWLOC_COMPONENTS_LOCK();
+  assert((unsigned) -1 != hwloc_components_users);
+  if (0 != hwloc_components_users++) {
+    HWLOC_COMPONENTS_UNLOCK();
+    goto ok;
+  }
+
+  verboseenv = getenv("HWLOC_COMPONENTS_VERBOSE");
+  hwloc_components_verbose = verboseenv ? atoi(verboseenv) : 0;
+
+#ifdef HWLOC_HAVE_PLUGINS
+  hwloc_plugins_init();
+#endif
+
+  hwloc_component_finalize_cbs = NULL;
+  hwloc_component_finalize_cb_count = 0;
+  /* count the max number of finalize callbacks */
+  for(i=0; NULL != hwloc_static_components[i]; i++)
+    hwloc_component_finalize_cb_count++;
+#ifdef HWLOC_HAVE_PLUGINS
+  for(desc = hwloc_plugins; NULL != desc; desc = desc->next)
+    hwloc_component_finalize_cb_count++;
+#endif
+  if (hwloc_component_finalize_cb_count) {
+    hwloc_component_finalize_cbs = calloc(hwloc_component_finalize_cb_count,
+					  sizeof(*hwloc_component_finalize_cbs));
+    assert(hwloc_component_finalize_cbs);
+    /* forget that max number and recompute the real one below */
+    hwloc_component_finalize_cb_count = 0;
+  }
+
+  /* hwloc_static_components is created by configure in static-components.h */
+  for(i=0; NULL != hwloc_static_components[i]; i++) {
+    if (hwloc_static_components[i]->flags) {
+      fprintf(stderr, "Ignoring static component with invalid flags %lx\n",
+	      hwloc_static_components[i]->flags);
+      continue;
+    }
+
+    /* initialize the component */
+    if (hwloc_static_components[i]->init && hwloc_static_components[i]->init(0) < 0) {
+      if (hwloc_components_verbose)
+	fprintf(stderr, "Ignoring static component, failed to initialize\n");
+      continue;
+    }
+    /* queue ->finalize() callback if any */
+    if (hwloc_static_components[i]->finalize)
+      hwloc_component_finalize_cbs[hwloc_component_finalize_cb_count++] = hwloc_static_components[i]->finalize;
+
+    /* register for real now */
+    if (HWLOC_COMPONENT_TYPE_DISC == hwloc_static_components[i]->type)
+      hwloc_disc_component_register(hwloc_static_components[i]->data, NULL);
+    /*else if (HWLOC_COMPONENT_TYPE_XML == hwloc_static_components[i]->type)
+      hwloc_xml_callbacks_register(hwloc_static_components[i]->data);*/
+    else
+      assert(0);
+  }
+
+  /* dynamic plugins */
+#ifdef HWLOC_HAVE_PLUGINS
+  for(desc = hwloc_plugins; NULL != desc; desc = desc->next) {
+    if (desc->component->flags) {
+      fprintf(stderr, "Ignoring plugin `%s' component with invalid flags %lx\n",
+	      desc->name, desc->component->flags);
+      continue;
+    }
+
+    /* initialize the component */
+    if (desc->component->init && desc->component->init(0) < 0) {
+      if (hwloc_components_verbose)
+	fprintf(stderr, "Ignoring plugin `%s', failed to initialize\n", desc->name);
+      continue;
+    }
+    /* queue ->finalize() callback if any */
+    if (desc->component->finalize)
+      hwloc_component_finalize_cbs[hwloc_component_finalize_cb_count++] = desc->component->finalize;
+
+    /* register for real now */
+    if (HWLOC_COMPONENT_TYPE_DISC == desc->component->type)
+      hwloc_disc_component_register(desc->component->data, desc->filename);
+    /*else if (HWLOC_COMPONENT_TYPE_XML == desc->component->type)
+      hwloc_xml_callbacks_register(desc->component->data);*/
+    else
+      assert(0);
+  }
+#endif
+
+  HWLOC_COMPONENTS_UNLOCK();
+
+ ok:
+  topology->backends = NULL;
+}
+
+static struct hwloc_disc_component *
+hwloc_disc_component_find(int type /* hwloc_disc_component_type_t or -1 if any */,
+			       const char *name /* name or NULL if any */)
+{
+  struct hwloc_disc_component *comp = hwloc_disc_components;
+  while (NULL != comp) {
+    if ((-1 == type || type == (int) comp->type)
+       && (NULL == name || !strcmp(name, comp->name)))
+      return comp;
+    comp = comp->next;
+  }
+  return NULL;
+}
+
+/* used by set_xml(), set_synthetic(), ... environment variables, ... to force the first backend */
+int
+hwloc_disc_component_force_enable(struct hwloc_topology *topology,
+				  int envvar_forced,
+				  int type, const char *name,
+				  const void *data1, const void *data2, const void *data3)
+{
+  struct hwloc_disc_component *comp;
+  struct hwloc_backend *backend;
+
+  if (topology->is_loaded) {
+    errno = EBUSY;
+    return -1;
+  }
+
+  comp = hwloc_disc_component_find(type, name);
+  if (!comp) {
+    errno = ENOSYS;
+    return -1;
+  }
+
+  backend = comp->instantiate(comp, data1, data2, data3);
+  if (backend) {
+    backend->envvar_forced = envvar_forced;
+    if (topology->backends)
+      hwloc_backends_disable_all(topology);
+    return hwloc_backend_enable(topology, backend);
+  } else
+    return -1;
+}
+
+static int
+hwloc_disc_component_try_enable(struct hwloc_topology *topology,
+				struct hwloc_disc_component *comp,
+				const char *comparg,
+				unsigned *excludes,
+				int envvar_forced,
+				int verbose_errors)
+{
+  struct hwloc_backend *backend;
+  int err;
+
+  if ((*excludes) & comp->type) {
+    if (hwloc_components_verbose || verbose_errors)
+      fprintf(stderr, "Excluding %s discovery component `%s', conflicts with excludes 0x%x\n",
+	      hwloc_disc_component_type_string(comp->type), comp->name, *excludes);
+    return -1;
+  }
+
+  backend = comp->instantiate(comp, comparg, NULL, NULL);
+  if (!backend) {
+    if (hwloc_components_verbose || verbose_errors)
+      fprintf(stderr, "Failed to instantiate discovery component `%s'\n", comp->name);
+    return -1;
+  }
+
+  backend->envvar_forced = envvar_forced;
+  err = hwloc_backend_enable(topology, backend);
+  if (err < 0)
+    return -1;
+
+  *excludes |= comp->excludes;
+
+  return 0;
+}
+
+/* Enable the discovery components listed in the HWLOC_COMPONENTS environment variable, then
+ * (unless the list contains "stop") all other registered components that are not excluded with '-'. */
+void
+hwloc_disc_components_enable_others(struct hwloc_topology *topology)
+{
+  struct hwloc_disc_component *comp;
+  struct hwloc_backend *backend;
+  unsigned excludes = 0;
+  int tryall = 1;
+  const char *_env;
+  char *env; /* we'll modify the env value, so duplicate it */
+
+  _env = getenv("HWLOC_COMPONENTS");
+  env = _env ? strdup(_env) : NULL;
+
+  /* compute current excludes */
+  backend = topology->backends;
+  while (backend) {
+    excludes |= backend->component->excludes;
+    backend = backend->next;
+  }
+
+  /* enable explicitly listed components */
+  if (env) {
+    char *curenv = env;
+    size_t s;
+
+    if (topology->backends) {
+      hwloc_backends_disable_all(topology);
+      excludes = 0;
+    }
+
+    while (*curenv) {
+      s = strcspn(curenv, HWLOC_COMPONENT_SEPS);
+      if (s) {
+	char c;
+
+	if (curenv[0] == HWLOC_COMPONENT_EXCLUDE_CHAR)
+	  goto nextname;
+
+	if (s == strlen(HWLOC_COMPONENT_STOP_NAME) && !strncmp(curenv, HWLOC_COMPONENT_STOP_NAME, s)) {
+	  tryall = 0;
+	  break;
+	}
+
+	/* save the last char and replace with \0 */
+	c = curenv[s];
+	curenv[s] = '\0';
+
+	comp = hwloc_disc_component_find(-1, curenv);
+	if (comp) {
+	  hwloc_disc_component_try_enable(topology, comp, NULL, &excludes, 1 /* envvar forced */, 1 /* envvar forced need warnings */);
+	} else {
+	  fprintf(stderr, "Cannot find discovery component `%s'\n", curenv);
+	}
+
+	/* restore chars (the second loop below needs env to be unmodified) */
+	curenv[s] = c;
+      }
+
+nextname:
+      curenv += s;
+      if (*curenv)
+	/* Skip comma */
+	curenv++;
+    }
+  }
+
+  /* env is still the same, the above loop didn't modify it */
+
+  /* now enable remaining components (except the explicitly '-'-listed ones) */
+  if (tryall) {
+    comp = hwloc_disc_components;
+    while (NULL != comp) {
+      /* check if this component was explicitly excluded in env */
+      if (env) {
+	char *curenv = env;
+	while (*curenv) {
+	  size_t s = strcspn(curenv, HWLOC_COMPONENT_SEPS);
+	  if (curenv[0] == HWLOC_COMPONENT_EXCLUDE_CHAR && strlen(comp->name) == s-1 && !strncmp(curenv+1, comp->name, s-1)) {
+	    if (hwloc_components_verbose)
+	      fprintf(stderr, "Excluding %s discovery component `%s' because of HWLOC_COMPONENTS environment variable\n", hwloc_disc_component_type_string(comp->type), comp->name);
+	    goto nextcomp;
+	  }
+	  curenv += s;
+	  if (*curenv)
+	    /* Skip comma */
+	    curenv++;
+	}
+      }
+      hwloc_disc_component_try_enable(topology, comp, NULL, &excludes, 0 /* defaults, not envvar forced */, 0 /* defaults don't need warnings on conflicts */);
+nextcomp:
+      comp = comp->next;
+    }
+  }
+
+  if (hwloc_components_verbose) {
+    /* print a summary */
+    int first = 1;
+    backend = topology->backends;
+    fprintf(stderr, "Final list of enabled discovery components: ");
+    while (backend != NULL) {
+      fprintf(stderr, "%s%s", first ? "" : ",", backend->component->name);
+      backend = backend->next;
+      first = 0;
+    }
+    fprintf(stderr, "\n");
+  }
+
+  free(env);
+}
+
+void
+hwloc_components_destroy_all(struct hwloc_topology *topology __hwloc_attribute_unused)
+{
+  unsigned i;
+
+  HWLOC_COMPONENTS_LOCK();
+  assert(0 != hwloc_components_users);
+  if (0 != --hwloc_components_users) {
+    HWLOC_COMPONENTS_UNLOCK();
+    return;
+  }
+
+  for(i=0; i<hwloc_component_finalize_cb_count; i++)
+    hwloc_component_finalize_cbs[hwloc_component_finalize_cb_count-i-1](0);
+  free(hwloc_component_finalize_cbs);
+  hwloc_component_finalize_cbs = NULL;
+  hwloc_component_finalize_cb_count = 0;
+
+  /* no need to unlink/free the list of components, they'll be unloaded below */
+
+  hwloc_disc_components = NULL;
+//  hwloc_xml_callbacks_reset();
+
+#ifdef HWLOC_HAVE_PLUGINS
+  hwloc_plugins_exit();
+#endif
+
+  HWLOC_COMPONENTS_UNLOCK();
+}
+
+struct hwloc_backend *
+hwloc_backend_alloc(struct hwloc_disc_component *component)
+{
+  struct hwloc_backend * backend = malloc(sizeof(*backend));
+  if (!backend) {
+    errno = ENOMEM;
+    return NULL;
+  }
+  backend->component = component;
+  backend->flags = 0;
+  backend->discover = NULL;
+  backend->get_obj_cpuset = NULL;
+  backend->notify_new_object = NULL;
+  backend->disable = NULL;
+  backend->is_thissystem = -1;
+  backend->next = NULL;
+  backend->envvar_forced = 0;
+  return backend;
+}
+
+static void
+hwloc_backend_disable(struct hwloc_backend *backend)
+{
+  if (backend->disable)
+    backend->disable(backend);
+  free(backend);
+}
+
+/* Append `backend' to the list of enabled backends of `topology'. Returns 0 on success.
+ * On failure (-1), the backend is disabled and freed here, so the caller must not use it anymore. */
+int
+hwloc_backend_enable(struct hwloc_topology *topology, struct hwloc_backend *backend)
+{
+  struct hwloc_backend **pprev;
+
+  /* check backend flags */
+  if (backend->flags & (~(HWLOC_BACKEND_FLAG_NEED_LEVELS))) {
+    fprintf(stderr, "Cannot enable %s discovery component `%s' with unknown flags %lx\n",
+	    hwloc_disc_component_type_string(backend->component->type), backend->component->name, backend->flags);
+    hwloc_backend_disable(backend);
+    return -1;
+  }
+
+  /* make sure we didn't already enable this backend, we don't want duplicates */
+  pprev = &topology->backends;
+  while (NULL != *pprev) {
+    if ((*pprev)->component == backend->component) {
+      if (hwloc_components_verbose)
+	fprintf(stderr, "Cannot enable %s discovery component `%s' twice\n",
+		hwloc_disc_component_type_string(backend->component->type), backend->component->name);
+      hwloc_backend_disable(backend);
+      errno = EBUSY;
+      return -1;
+    }
+    pprev = &((*pprev)->next);
+  }
+
+  if (hwloc_components_verbose)
+    fprintf(stderr, "Enabling %s discovery component `%s'\n",
+	    hwloc_disc_component_type_string(backend->component->type), backend->component->name);
+
+  /* enqueue at the end, the loop above left pprev on the ending NULL link */
+  backend->next = *pprev;
+  *pprev = backend;
+
+  backend->topology = topology;
+
+  return 0;
+}
+
+void
+hwloc_backends_is_thissystem(struct hwloc_topology *topology)
+{
+  struct hwloc_backend *backend;
+  const char *local_env;
+
+  /* Apply is_thissystem topology flag before we enforce envvar backends.
+   * If the application changed the backend with set_foo(),
+   * it may use set_flags() to update the is_thissystem flag here.
+   * If it changes the backend with environment variables below,
+   * it may use HWLOC_THISSYSTEM envvar below as well.
+   */
+
+  topology->is_thissystem = 1;
+
+  /* apply thissystem from normally-given backends (envvar_forced=0, either set_foo() or defaults) */
+  backend = topology->backends;
+  while (backend != NULL) {
+    if (backend->envvar_forced == 0 && backend->is_thissystem != -1) {
+      assert(backend->is_thissystem == 0);
+      topology->is_thissystem = 0;
+    }
+    backend = backend->next;
+  }
+
+  /* override set_foo() with flags */
+  if (topology->flags & HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM)
+    topology->is_thissystem = 1;
+
+  /* now apply envvar-forced backend (envvar_forced=1) */
+  backend = topology->backends;
+  while (backend != NULL) {
+    if (backend->envvar_forced == 1 && backend->is_thissystem != -1) {
+      assert(backend->is_thissystem == 0);
+      topology->is_thissystem = 0;
+    }
+    backend = backend->next;
+  }
+
+  /* override with envvar-given flag */
+  local_env = getenv("HWLOC_THISSYSTEM");
+  if (local_env)
+    topology->is_thissystem = atoi(local_env);
+}
+
+int
+hwloc_backends_get_obj_cpuset(struct hwloc_backend *caller, struct hwloc_obj *obj, hwloc_bitmap_t cpuset)
+{
+  struct hwloc_topology *topology = caller->topology;
+  struct hwloc_backend *backend = topology->backends;
+  /* use the first backend's get_obj_cpuset callback */
+  while (backend != NULL) {
+    if (backend->get_obj_cpuset)
+      return backend->get_obj_cpuset(backend, caller, obj, cpuset);
+    backend = backend->next;
+  }
+  return -1;
+}
+
+int
+hwloc_backends_notify_new_object(struct hwloc_backend *caller, struct hwloc_obj *obj)
+{
+  struct hwloc_backend *backend;
+  int res = 0;
+
+  backend = caller->topology->backends;
+  while (NULL != backend) {
+    if (backend != caller && backend->notify_new_object)
+      res += backend->notify_new_object(backend, caller, obj);
+    backend = backend->next;
+  }
+
+  return res;
+}
+
+void
+hwloc_backends_disable_all(struct hwloc_topology *topology)
+{
+  struct hwloc_backend *backend;
+
+  while (NULL != (backend = topology->backends)) {
+    struct hwloc_backend *next = backend->next;
+    if (hwloc_components_verbose)
+      fprintf(stderr, "Disabling %s discovery component `%s'\n",
+	      hwloc_disc_component_type_string(backend->component->type), backend->component->name);
+    hwloc_backend_disable(backend);
+    topology->backends = next;
+  }
+  topology->backends = NULL;
+}
diff --git a/ext/hwloc/hwloc/diff.c b/ext/hwloc/hwloc/diff.c
new file mode 100644
index 0000000..ee401d2
--- /dev/null
+++ b/ext/hwloc/hwloc/diff.c
@@ -0,0 +1,426 @@
+/*
+ * Copyright © 2013-2015 Inria.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#include <private/autogen/config.h>
+#include <private/private.h>
+#include <private/misc.h>
+
+/* Free a whole list of diff entries.
+ * For object-attribute entries of the NAME or INFO kind, the three strings
+ * (name, oldvalue, newvalue) are released before the entry itself.
+ * The topology argument is unused. Always returns 0.
+ */
+int hwloc_topology_diff_destroy(hwloc_topology_t topology __hwloc_attribute_unused,
+				hwloc_topology_diff_t diff)
+{
+	while (diff) {
+		/* grab the link before the entry goes away */
+		hwloc_topology_diff_t following = diff->generic.next;
+
+		if (diff->generic.type == HWLOC_TOPOLOGY_DIFF_OBJ_ATTR
+		    && (diff->obj_attr.diff.generic.type == HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_NAME
+			|| diff->obj_attr.diff.generic.type == HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_INFO)) {
+			/* only string-valued attribute entries carry strings */
+			free(diff->obj_attr.diff.string.name);
+			free(diff->obj_attr.diff.string.oldvalue);
+			free(diff->obj_attr.diff.string.newvalue);
+		}
+
+		free(diff);
+		diff = following;
+	}
+	return 0;
+}
+
+/************************
+ * Computing diffs
+ */
+
+/* Append newdiff at the tail of the list described by *firstdiffp/*lastdiffp.
+ * *lastdiffp is only read when the list is non-empty (*firstdiffp != NULL),
+ * so it may be left uninitialized for an empty list.
+ */
+static void hwloc_append_diff(hwloc_topology_diff_t newdiff,
+			      hwloc_topology_diff_t *firstdiffp,
+			      hwloc_topology_diff_t *lastdiffp)
+{
+	if (*firstdiffp == NULL) {
+		/* empty list, the new entry becomes the head */
+		*firstdiffp = newdiff;
+	} else {
+		/* chain behind the current tail */
+		(*lastdiffp)->generic.next = newdiff;
+	}
+	*lastdiffp = newdiff;
+	newdiff->generic.next = NULL;
+}
+
+/* Allocate a TOO_COMPLEX entry identifying obj1 by depth and logical index
+ * and append it to the diff list.
+ * Returns 0 on success, -1 if the entry could not be allocated (the list is
+ * left untouched in that case).
+ */
+static int hwloc_append_diff_too_complex(hwloc_obj_t obj1,
+					 hwloc_topology_diff_t *firstdiffp,
+					 hwloc_topology_diff_t *lastdiffp)
+{
+	hwloc_topology_diff_t entry = malloc(sizeof(*entry));
+
+	if (entry == NULL)
+		return -1;
+
+	entry->too_complex.type = HWLOC_TOPOLOGY_DIFF_TOO_COMPLEX;
+	entry->too_complex.obj_depth = obj1->depth;
+	entry->too_complex.obj_index = obj1->logical_index;
+
+	hwloc_append_diff(entry, firstdiffp, lastdiffp);
+	return 0;
+}
+
+/* Allocate a string-valued object-attribute entry for obj and append it to
+ * the diff list.
+ *
+ * type is stored as the attribute kind. name, oldvalue and newvalue are
+ * duplicated; each of them may be NULL, in which case NULL is stored.
+ *
+ * Returns 0 on success, -1 if the entry or one of the string copies could
+ * not be allocated (nothing is appended and nothing is leaked then).
+ */
+static int hwloc_append_diff_obj_attr_string(hwloc_obj_t obj,
+					     hwloc_topology_diff_obj_attr_type_t type,
+					     const char *name,
+					     const char *oldvalue,
+					     const char *newvalue,
+					     hwloc_topology_diff_t *firstdiffp,
+					     hwloc_topology_diff_t *lastdiffp)
+{
+	hwloc_topology_diff_t newdiff;
+	char *namecopy = NULL, *oldcopy = NULL, *newcopy = NULL;
+
+	newdiff = malloc(sizeof(*newdiff));
+	if (!newdiff)
+		return -1;
+
+	/* copy the strings first: a failed copy must not be mistaken
+	 * for a value that was NULL in the first place */
+	if ((name && !(namecopy = strdup(name)))
+	    || (oldvalue && !(oldcopy = strdup(oldvalue)))
+	    || (newvalue && !(newcopy = strdup(newvalue)))) {
+		free(namecopy);
+		free(oldcopy);
+		free(newcopy);
+		free(newdiff);
+		return -1;
+	}
+
+	newdiff->obj_attr.type = HWLOC_TOPOLOGY_DIFF_OBJ_ATTR;
+	newdiff->obj_attr.obj_depth = obj->depth;
+	newdiff->obj_attr.obj_index = obj->logical_index;
+	newdiff->obj_attr.diff.string.type = type;
+	newdiff->obj_attr.diff.string.name = namecopy;
+	newdiff->obj_attr.diff.string.oldvalue = oldcopy;
+	newdiff->obj_attr.diff.string.newvalue = newcopy;
+	hwloc_append_diff(newdiff, firstdiffp, lastdiffp);
+	return 0;
+}
+
+/* Allocate a 64-bit-valued object-attribute entry for obj and append it to
+ * the diff list. type is stored as the attribute kind, idx as its index,
+ * oldvalue/newvalue as the two values.
+ * Returns 0 on success, -1 if the entry could not be allocated.
+ */
+static int hwloc_append_diff_obj_attr_uint64(hwloc_obj_t obj,
+					     hwloc_topology_diff_obj_attr_type_t type,
+					     hwloc_uint64_t idx,
+					     hwloc_uint64_t oldvalue,
+					     hwloc_uint64_t newvalue,
+					     hwloc_topology_diff_t *firstdiffp,
+					     hwloc_topology_diff_t *lastdiffp)
+{
+	hwloc_topology_diff_t entry = malloc(sizeof(*entry));
+
+	if (entry == NULL)
+		return -1;
+
+	/* which object the entry is about */
+	entry->obj_attr.type = HWLOC_TOPOLOGY_DIFF_OBJ_ATTR;
+	entry->obj_attr.obj_depth = obj->depth;
+	entry->obj_attr.obj_index = obj->logical_index;
+
+	/* what changed */
+	entry->obj_attr.diff.uint64.type = type;
+	entry->obj_attr.diff.uint64.index = idx;
+	entry->obj_attr.diff.uint64.oldvalue = oldvalue;
+	entry->obj_attr.diff.uint64.newvalue = newvalue;
+
+	hwloc_append_diff(entry, firstdiffp, lastdiffp);
+	return 0;
+}
+
+/* Recursively compare the subtrees rooted at obj1 (in topo1) and obj2 (in
+ * topo2) and append the differences to the list *firstdiffp/*lastdiffp.
+ *
+ * Differences in name, local memory size and info values are recorded as
+ * object-attribute entries. Any other difference checked here (depth, type,
+ * os_index, cpusets/nodesets, type-specific attributes, distances, info
+ * count or info names, number of children) is recorded as one TOO_COMPLEX
+ * entry for obj1 and the comparison of this subtree stops there.
+ *
+ * Returns 0 on success (including when TOO_COMPLEX entries were appended),
+ * or a negative value when an entry could not be allocated.
+ */
+static int
+hwloc_diff_trees(hwloc_topology_t topo1, hwloc_obj_t obj1,
+		 hwloc_topology_t topo2, hwloc_obj_t obj2,
+		 unsigned flags,
+		 hwloc_topology_diff_t *firstdiffp, hwloc_topology_diff_t *lastdiffp)
+{
+	unsigned i;
+	int err;
+	hwloc_obj_t child1, child2;
+
+	if (obj1->depth != obj2->depth)
+		goto out_too_complex;
+	if (obj1->type != obj2->type)
+		goto out_too_complex;
+
+	if (obj1->os_index != obj2->os_index)
+		/* we could allow different os_index for non-PU non-NUMAnode objects
+		 * but it's likely useless anyway */
+		goto out_too_complex;
+
+/* two sets differ when exactly one of them is NULL, or when both exist and are not equal */
+#define _SETS_DIFFERENT(_set1, _set2) \
+ (   ( !(_set1) != !(_set2) ) \
+  || ( (_set1) && !hwloc_bitmap_isequal(_set1, _set2) ) )
+#define SETS_DIFFERENT(_set, _obj1, _obj2) _SETS_DIFFERENT((_obj1)->_set, (_obj2)->_set)
+	if (SETS_DIFFERENT(cpuset, obj1, obj2)
+	    || SETS_DIFFERENT(complete_cpuset, obj1, obj2)
+	    || SETS_DIFFERENT(allowed_cpuset, obj1, obj2)
+	    || SETS_DIFFERENT(nodeset, obj1, obj2)
+	    || SETS_DIFFERENT(complete_nodeset, obj1, obj2)
+	    || SETS_DIFFERENT(allowed_nodeset, obj1, obj2))
+		goto out_too_complex;
+
+	/* no need to check logical_index, sibling_rank, symmetric_subtree,
+	 * the parents did it */
+
+	/* name: differs when only one side has one, or when both have different ones */
+	if ((!obj1->name) != (!obj2->name)
+	    || (obj1->name && strcmp(obj1->name, obj2->name))) {
+		err = hwloc_append_diff_obj_attr_string(obj1,
+						       HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_NAME,
+						       NULL,
+						       obj1->name,
+						       obj2->name,
+						       firstdiffp, lastdiffp);
+		if (err < 0)
+			return err;
+	}
+
+	/* memory */
+	if (obj1->memory.local_memory != obj2->memory.local_memory) {
+		err = hwloc_append_diff_obj_attr_uint64(obj1,
+						       HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_SIZE,
+						       0,
+						       obj1->memory.local_memory,
+						       obj2->memory.local_memory,
+						       firstdiffp, lastdiffp);
+		if (err < 0)
+			return err;
+	}
+	/* ignore memory page_types */
+
+	/* type-specific attrs: compared bytewise over the union member of that type */
+	switch (obj1->type) {
+	default:
+		break;
+	case HWLOC_OBJ_CACHE:
+		if (memcmp(obj1->attr, obj2->attr, sizeof(obj1->attr->cache)))
+			goto out_too_complex;
+		break;
+	case HWLOC_OBJ_GROUP:
+		if (memcmp(obj1->attr, obj2->attr, sizeof(obj1->attr->group)))
+			goto out_too_complex;
+		break;
+	case HWLOC_OBJ_PCI_DEVICE:
+		if (memcmp(obj1->attr, obj2->attr, sizeof(obj1->attr->pcidev)))
+			goto out_too_complex;
+		break;
+	case HWLOC_OBJ_BRIDGE:
+		if (memcmp(obj1->attr, obj2->attr, sizeof(obj1->attr->bridge)))
+			goto out_too_complex;
+		break;
+	case HWLOC_OBJ_OS_DEVICE:
+		if (memcmp(obj1->attr, obj2->attr, sizeof(obj1->attr->osdev)))
+			goto out_too_complex;
+		break;
+	}
+
+	/* distances */
+	if (obj1->distances_count != obj2->distances_count)
+		goto out_too_complex;
+	for(i=0; i<obj1->distances_count; i++) {
+		struct hwloc_distances_s *d1 = obj1->distances[i], *d2 = obj2->distances[i];
+		/* the latency matrices are only compared once nbobjs is known to match */
+		if (d1->relative_depth != d2->relative_depth
+		    || d1->nbobjs != d2->nbobjs
+		    || d1->latency_max != d2->latency_max
+		    || d1->latency_base != d2->latency_base
+		    || memcmp(d1->latency, d2->latency, d1->nbobjs * d1->nbobjs * sizeof(*d1->latency)))
+			goto out_too_complex;
+	}
+
+	/* infos: same count and same names in the same order are required,
+	 * only value changes are expressible as attribute entries */
+	if (obj1->infos_count != obj2->infos_count)
+		goto out_too_complex;
+	for(i=0; i<obj1->infos_count; i++) {
+		if (strcmp(obj1->infos[i].name, obj2->infos[i].name))
+			goto out_too_complex;
+		if (strcmp(obj1->infos[i].value, obj2->infos[i].value)) {
+			err = hwloc_append_diff_obj_attr_string(obj1,
+							       HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_INFO,
+							       obj1->infos[i].name,
+							       obj1->infos[i].value,
+							       obj2->infos[i].value,
+							       firstdiffp, lastdiffp);
+			if (err < 0)
+				return err;
+		}
+	}
+
+	/* ignore userdata */
+
+	/* children: walk both sibling lists in lockstep */
+	for(child1 = obj1->first_child, child2 = obj2->first_child;
+	    child1 != NULL && child2 != NULL;
+	    child1 = child1->next_sibling, child2 = child2->next_sibling) {
+		err = hwloc_diff_trees(topo1, child1,
+				       topo2, child2,
+				       flags,
+				       firstdiffp, lastdiffp);
+		if (err < 0)
+			return err;
+	}
+	/* one list is longer than the other */
+	if (child1 || child2)
+		goto out_too_complex;
+
+	/* I/O children */
+	for(child1 = obj1->io_first_child, child2 = obj2->io_first_child;
+	    child1 != NULL && child2 != NULL;
+	    child1 = child1->next_sibling, child2 = child2->next_sibling) {
+		err = hwloc_diff_trees(topo1, child1,
+				       topo2, child2,
+				       flags,
+				       firstdiffp, lastdiffp);
+		if (err < 0)
+			return err;
+	}
+	if (child1 || child2)
+		goto out_too_complex;
+
+	/* misc children */
+	for(child1 = obj1->misc_first_child, child2 = obj2->misc_first_child;
+	    child1 != NULL && child2 != NULL;
+	    child1 = child1->next_sibling, child2 = child2->next_sibling) {
+		err = hwloc_diff_trees(topo1, child1,
+				       topo2, child2,
+				       flags,
+				       firstdiffp, lastdiffp);
+		if (err < 0)
+			return err;
+	}
+	if (child1 || child2)
+		goto out_too_complex;
+
+	return 0;
+
+out_too_complex:
+	/* propagate an allocation failure like the attribute entries above do:
+	 * returning 0 without an entry would make the subtrees look identical */
+	return hwloc_append_diff_too_complex(obj1, firstdiffp, lastdiffp);
+}
+
+/* Compute the list of differences between topo1 and topo2 into *diffp,
+ * starting from both root objects.
+ *
+ * flags must be 0, otherwise errno is set to EINVAL and -1 is returned.
+ * A non-zero result of the tree comparison is returned as is. Otherwise
+ * returns 1 when the list contains at least one TOO_COMPLEX entry and
+ * 0 when it contains none.
+ */
+int hwloc_topology_diff_build(hwloc_topology_t topo1,
+			      hwloc_topology_t topo2,
+			      unsigned long flags,
+			      hwloc_topology_diff_t *diffp)
+{
+	hwloc_topology_diff_t lastdiff, cur;
+	int err;
+
+	if (flags != 0) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	*diffp = NULL;
+	err = hwloc_diff_trees(topo1, hwloc_get_root_obj(topo1),
+			       topo2, hwloc_get_root_obj(topo2),
+			       flags,
+			       diffp, &lastdiff);
+	if (err)
+		return err;
+
+	/* tell the caller when part of the difference could not be expressed */
+	for (cur = *diffp; cur != NULL; cur = cur->generic.next)
+		if (cur->generic.type == HWLOC_TOPOLOGY_DIFF_TOO_COMPLEX)
+			return 1;
+
+	return 0;
+}
+
+/********************
+ * Applying diffs
+ */
+
+/* Apply a single diff entry to the topology.
+ *
+ * When HWLOC_TOPOLOGY_DIFF_APPLY_REVERSE is set in flags, the entry's old
+ * and new values are swapped so that the change is undone instead.
+ * Only object-attribute entries (SIZE, NAME, INFO) are supported.
+ *
+ * Returns 0 on success. Returns -1, leaving the object unchanged, when the
+ * entry kind is unsupported, the target object does not exist, the object's
+ * current value does not match the expected old value, or a replacement
+ * string cannot be allocated.
+ */
+static int
+hwloc_apply_diff_one(hwloc_topology_t topology,
+		     hwloc_topology_diff_t diff,
+		     unsigned long flags)
+{
+	int reverse = !!(flags & HWLOC_TOPOLOGY_DIFF_APPLY_REVERSE);
+
+	switch (diff->generic.type) {
+	case HWLOC_TOPOLOGY_DIFF_OBJ_ATTR: {
+		struct hwloc_topology_diff_obj_attr_s *obj_attr = &diff->obj_attr;
+		hwloc_obj_t obj = hwloc_get_obj_by_depth(topology, obj_attr->obj_depth, obj_attr->obj_index);
+		if (!obj)
+			return -1;
+
+		switch (obj_attr->diff.generic.type) {
+		case HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_SIZE: {
+			hwloc_obj_t tmpobj;
+			hwloc_uint64_t oldvalue = reverse ? obj_attr->diff.uint64.newvalue : obj_attr->diff.uint64.oldvalue;
+			hwloc_uint64_t newvalue = reverse ? obj_attr->diff.uint64.oldvalue : obj_attr->diff.uint64.newvalue;
+			/* unsigned difference: adding it wraps around to the right total when shrinking */
+			hwloc_uint64_t valuediff = newvalue - oldvalue;
+			if (obj->memory.local_memory != oldvalue)
+				return -1;
+			obj->memory.local_memory = newvalue;
+			/* the object and all its ancestors account for this memory in their total */
+			tmpobj = obj;
+			while (tmpobj) {
+				tmpobj->memory.total_memory += valuediff;
+				tmpobj = tmpobj->parent;
+			}
+			break;
+		}
+		case HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_NAME: {
+			const char *oldvalue = reverse ? obj_attr->diff.string.newvalue : obj_attr->diff.string.oldvalue;
+			const char *newvalue = reverse ? obj_attr->diff.string.oldvalue : obj_attr->diff.string.newvalue;
+			char *newname = NULL;
+			/* either value may be NULL: hwloc_diff_trees() records an object
+			 * gaining or losing its name that way */
+			if ((!obj->name) != (!oldvalue)
+			    || (obj->name && strcmp(obj->name, oldvalue)))
+				return -1;
+			if (newvalue) {
+				/* copy before freeing so that a failure leaves the object untouched */
+				newname = strdup(newvalue);
+				if (!newname)
+					return -1;
+			}
+			free(obj->name);
+			obj->name = newname;
+			break;
+		}
+		case HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_INFO: {
+			const char *name = obj_attr->diff.string.name;
+			const char *oldvalue = reverse ? obj_attr->diff.string.newvalue : obj_attr->diff.string.oldvalue;
+			const char *newvalue = reverse ? obj_attr->diff.string.oldvalue : obj_attr->diff.string.newvalue;
+			unsigned i;
+			int found = 0;
+			/* an info entry without all three strings cannot be matched or applied */
+			if (!name || !oldvalue || !newvalue)
+				return -1;
+			/* replace the first info pair matching both the name and the expected old value */
+			for(i=0; i<obj->infos_count; i++) {
+				if (!strcmp(obj->infos[i].name, name)
+				    && !strcmp(obj->infos[i].value, oldvalue)) {
+					char *newcopy = strdup(newvalue);
+					if (!newcopy)
+						return -1;
+					free(obj->infos[i].value);
+					obj->infos[i].value = newcopy;
+					found = 1;
+					break;
+				}
+			}
+			if (!found)
+				return -1;
+			break;
+		}
+		default:
+			return -1;
+		}
+
+		break;
+	}
+	default:
+		return -1;
+	}
+
+	return 0;
+}
+
+/* Apply every entry of the diff list to the topology, in list order.
+ *
+ * The only accepted flag is HWLOC_TOPOLOGY_DIFF_APPLY_REVERSE; any other
+ * bit makes the call fail with errno set to EINVAL and a return value of -1.
+ *
+ * Returns 0 when all entries were applied. When an entry fails, the entries
+ * applied before it are re-applied with the REVERSE flag toggled, errno is
+ * set to EINVAL and the negated 1-based position of the failing entry is
+ * returned.
+ */
+int hwloc_topology_diff_apply(hwloc_topology_t topology,
+			      hwloc_topology_diff_t diff,
+			      unsigned long flags)
+{
+	hwloc_topology_diff_t cur, failed;
+	int position = 0;
+
+	if (flags & ~HWLOC_TOPOLOGY_DIFF_APPLY_REVERSE) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	for (cur = diff; cur != NULL; cur = cur->generic.next) {
+		position++;
+		if (hwloc_apply_diff_one(topology, cur, flags) < 0)
+			break;
+	}
+	if (cur == NULL)
+		/* reached the end of the list without any failure */
+		return 0;
+
+	/* undo what was applied before the failing entry, from the list head */
+	failed = cur;
+	for (cur = diff; cur != failed; cur = cur->generic.next)
+		hwloc_apply_diff_one(topology, cur, flags ^ HWLOC_TOPOLOGY_DIFF_APPLY_REVERSE);
+
+	errno = EINVAL;
+	return -position; /* index (starting at 1) of the first element that couldn't be applied */
+}
diff --git a/ext/hwloc/hwloc/distances.c b/ext/hwloc/hwloc/distances.c
new file mode 100644
index 0000000..51382b1
--- /dev/null
+++ b/ext/hwloc/hwloc/distances.c
@@ -0,0 +1,995 @@
+/*
+ * Copyright © 2010-2015 Inria.  All rights reserved.
+ * Copyright © 2011-2012 Université Bordeaux
+ * Copyright © 2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#include <private/autogen/config.h>
+#include <hwloc.h>
+#include <private/private.h>
+#include <private/debug.h>
+
+#include <float.h>
+#include <math.h>
+
+/**************************
+ * Main Init/Clear/Destroy
+ */
+
+/* Reset the topology's list of OS distance matrices to empty.
+ * Called during topology init.
+ */
+void hwloc_distances_init(struct hwloc_topology *topology)
+{
+  topology->last_osdist = NULL;
+  topology->first_osdist = NULL;
+}
+
+/* Free every OS distance element of the topology together with its
+ * indexes, objs and distances arrays, then leave the list empty.
+ * Called during topology destroy.
+ */
+void hwloc_distances_destroy(struct hwloc_topology * topology)
+{
+  struct hwloc_os_distances_s *cur = topology->first_osdist;
+
+  while (cur != NULL) {
+    /* remember the link, the element is about to be freed */
+    struct hwloc_os_distances_s *following = cur->next;
+
+    free(cur->indexes);
+    free(cur->objs);
+    free(cur->distances);
+    free(cur);
+
+    cur = following;
+  }
+
+  topology->first_osdist = NULL;
+  topology->last_osdist = NULL;
+}
+
+/******************************************************
+ * Inserting distances in the topology
+ * from a backend, from the environment or by the user
+ */
+
+/* insert a distance matrix in the topology.
+ * the caller gives us those pointers, we take care of freeing them later and so on.
+ *
+ * Existing elements of the same type are handled first:
+ * - if one of them is forced and the new one is not, the new arrays are
+ *   freed and nothing is inserted;
+ * - if the new one is forced, all existing elements of that type are removed.
+ * With nbobjs == 0 the call only performs that removal step.
+ * If the new list element cannot be allocated, the given arrays are freed
+ * and nothing is inserted.
+ */
+void hwloc_distances_set(hwloc_topology_t __hwloc_restrict topology, hwloc_obj_type_t type,
+			 unsigned nbobjs, unsigned *indexes, hwloc_obj_t *objs, float *distances,
+			 int force)
+{
+  struct hwloc_os_distances_s *osdist, *next = topology->first_osdist;
+  /* look for existing distances for the same type */
+  while ((osdist = next) != NULL) {
+    /* read the link now, osdist may be freed below */
+    next = osdist->next;
+    if (osdist->type == type) {
+      if (osdist->forced && !force) {
+        /* there is a forced distance element, ignore the new non-forced one */
+        free(indexes);
+        free(objs);
+        free(distances);
+        return;
+      } else if (force) {
+        /* we're forcing a new distance, remove the old ones */
+        free(osdist->indexes);
+        free(osdist->objs);
+        free(osdist->distances);
+        /* remove current object */
+        if (osdist->prev)
+          osdist->prev->next = next;
+        else
+          topology->first_osdist = next;
+        if (next)
+          next->prev = osdist->prev;
+        else
+          topology->last_osdist = osdist->prev;
+        /* free current object */
+        free(osdist);
+      }
+    }
+  }
+
+  if (!nbobjs)
+    /* we're just clearing, return now */
+    return;
+
+  /* create the new element */
+  osdist = malloc(sizeof(struct hwloc_os_distances_s));
+  if (!osdist) {
+    /* we own the arrays, do not leak them when the element cannot be created */
+    free(indexes);
+    free(objs);
+    free(distances);
+    return;
+  }
+  osdist->nbobjs = nbobjs;
+  osdist->indexes = indexes;
+  osdist->objs = objs;
+  osdist->distances = distances;
+  osdist->forced = force;
+  osdist->type = type;
+  /* insert it at the tail */
+  osdist->next = NULL;
+  osdist->prev = topology->last_osdist;
+  if (topology->last_osdist)
+    topology->last_osdist->next = osdist;
+  else
+    topology->first_osdist = osdist;
+  topology->last_osdist = osdist;
+}
+
+/* Sanity-check a user-given distance matrix.
+ * The only check performed is that no value appears twice in indexes[];
+ * topology, type, objs and distances are unused.
+ * Returns 0 if the check passes, -1 with errno set to EINVAL otherwise.
+ */
+static int hwloc_distances__check_matrix(hwloc_topology_t __hwloc_restrict topology __hwloc_attribute_unused, hwloc_obj_type_t type __hwloc_attribute_unused,
+					 unsigned nbobjs, unsigned *indexes, hwloc_obj_t *objs __hwloc_attribute_unused, float *distances __hwloc_attribute_unused)
+{
+  unsigned first, second;
+
+  /* compare every pair of indexes once */
+  for(first=0; first<nbobjs; first++) {
+    for(second=first+1; second<nbobjs; second++) {
+      if (indexes[first] != indexes[second])
+        continue;
+      errno = EINVAL;
+      return -1;
+    }
+  }
+  return 0;
+}
+
+/* Parse a distance specification (as found in an environment variable) and
+ * store the result in the topology as a forced distance matrix for type.
+ *
+ * The string "none" removes existing distances for that type.
+ * Invalid strings are ignored; most rejections print a message on stderr,
+ * allocation failures are silent. Nothing is stored in either case.
+ */
+static void hwloc_distances__set_from_string(struct hwloc_topology *topology,
+					     hwloc_obj_type_t type, const char *string)
+{
+  /* the string format is: "index[0],...,index[N-1]:distance[0],...,distance[N*N-1]"
+   * or "index[0],...,index[N-1]:X*Y" or "index[0],...,index[N-1]:X*Y*Z"
+   * the index list may also be given as a range "first-last"
+   */
+  const char *tmp = string, *next;
+  unsigned *indexes;
+  float *distances;
+  unsigned nbobjs = 0, i, j, x, y, z;
+
+  if (!strcmp(string, "none")) {
+    hwloc_distances_set(topology, type, 0, NULL, NULL, NULL, 1 /* force */);
+    return;
+  }
+
+  if (sscanf(string, "%u-%u:", &i, &j) == 2) {
+    /* range i-j */
+    tmp = strchr(string, ':');
+    if (!tmp) {
+      /* sscanf() only counts conversions, so "i-j" without a colon lands here too */
+      fprintf(stderr, "Ignoring %s distances from environment variable, missing colon\n",
+              hwloc_obj_type_string(type));
+      return;
+    }
+    tmp++;
+
+    /* reject reversed ranges and ranges whose N or N*N does not fit in an unsigned,
+     * otherwise the arrays below would be smaller than what the loops write */
+    if (j < i || j-i == UINT_MAX || j-i+1 > UINT_MAX/(j-i+1)) {
+      fprintf(stderr, "Ignoring %s distances from environment variable, invalid range %u-%u\n",
+              hwloc_obj_type_string(type), i, j);
+      return;
+    }
+    nbobjs = j-i+1;
+    indexes = calloc(nbobjs, sizeof(unsigned));
+    distances = calloc(nbobjs*nbobjs, sizeof(float));
+    /* make sure the user didn't give a veeeeery large range */
+    if (!indexes || !distances) {
+      free(indexes);
+      free(distances);
+      return;
+    }
+    for(j=0; j<nbobjs; j++)
+      indexes[j] = j+i;
+
+  } else {
+    /* explicit list of indexes, count them */
+    while (1) {
+      size_t size = strspn(tmp, "0123456789");
+      if (tmp[size] != ',') {
+        /* last element */
+        tmp += size;
+        nbobjs++;
+        break;
+      }
+      /* another index */
+      tmp += size+1;
+      nbobjs++;
+    }
+
+    if (*tmp != ':') {
+      fprintf(stderr, "Ignoring %s distances from environment variable, missing colon\n",
+              hwloc_obj_type_string(type));
+      return;
+    }
+
+    /* nbobjs is at least 1 here; N*N must fit in an unsigned */
+    if (nbobjs > UINT_MAX/nbobjs) {
+      fprintf(stderr, "Ignoring %s distances from environment variable, too many indexes (%u)\n",
+              hwloc_obj_type_string(type), nbobjs);
+      return;
+    }
+
+    indexes = calloc(nbobjs, sizeof(unsigned));
+    distances = calloc(nbobjs*nbobjs, sizeof(float));
+    if (!indexes || !distances) {
+      free(indexes);
+      free(distances);
+      return;
+    }
+    tmp = string;
+
+    /* parse indexes, each one is followed by a single separator (',' or the final ':') */
+    for(i=0; i<nbobjs; i++) {
+      indexes[i] = strtoul(tmp, (char **) &next, 0);
+      tmp = next+1;
+    }
+  }
+
+
+  /* parse distances */
+  z=1; /* default if sscanf finds only 2 values below */
+  if (sscanf(tmp, "%u*%u*%u", &x, &y, &z) >= 2) {
+    /* generate the matrix to create x groups of y elements */
+    if (x*y*z != nbobjs) {
+      fprintf(stderr, "Ignoring %s distances from environment variable, invalid grouping (%u*%u*%u=%u instead of %u)\n",
+              hwloc_obj_type_string(type), x, y, z, x*y*z, nbobjs);
+      free(indexes);
+      free(distances);
+      return;
+    }
+    /* 1 on the diagonal, 2 inside the same group of z, 4 inside the same
+     * group of y*z, 8 everywhere else */
+    for(i=0; i<nbobjs; i++)
+      for(j=0; j<nbobjs; j++)
+        if (i==j)
+          distances[i*nbobjs+j] = 1;
+        else if (i/z == j/z)
+          distances[i*nbobjs+j] = 2;
+        else if (i/z/y == j/z/y)
+          distances[i*nbobjs+j] = 4;
+        else
+          distances[i*nbobjs+j] = 8;
+
+  } else {
+    /* parse a comma separated list of distances */
+    for(i=0; i<nbobjs*nbobjs; i++) {
+      distances[i] = (float) atof(tmp);
+      next = strchr(tmp, ',');
+      if (next) {
+        tmp = next+1;
+      } else if (i!=nbobjs*nbobjs-1) {
+        fprintf(stderr, "Ignoring %s distances from environment variable, not enough values (%u out of %u)\n",
+                hwloc_obj_type_string(type), i+1, nbobjs*nbobjs);
+        free(indexes);
+        free(distances);
+        return;
+      }
+    }
+  }
+
+  if (hwloc_distances__check_matrix(topology, type, nbobjs, indexes, NULL, distances) < 0) {
+    fprintf(stderr, "Ignoring invalid %s distances from environment variable\n", hwloc_obj_type_string(type));
+    free(indexes);
+    free(distances);
+    return;
+  }
+
+  /* the topology takes ownership of both arrays */
+  hwloc_distances_set(topology, type, nbobjs, indexes, NULL, distances, 1 /* force */);
+}
+
+/* For every object type from HWLOC_OBJ_SYSTEM up to (excluding)
+ * HWLOC_OBJ_TYPE_MAX, look up the environment variable
+ * HWLOC_<type string>_DISTANCES and, when it is set, hand its value to
+ * hwloc_distances__set_from_string().
+ * The distances are stored as is; they are converted into objects later,
+ * once the tree is filled.
+ */
+void hwloc_distances_set_from_env(struct hwloc_topology *topology)
+{
+  hwloc_obj_type_t type;
+
+  for(type = HWLOC_OBJ_SYSTEM; type < HWLOC_OBJ_TYPE_MAX; type++) {
+    char varname[64];
+    const char *value;
+
+    snprintf(varname, sizeof(varname), "HWLOC_%s_DISTANCES", hwloc_obj_type_string(type));
+    value = getenv(varname);
+    if (value == NULL)
+      continue;
+
+    {
+      /* the localeswitch macros bracket the parsing; presumably the first one
+       * expands to declarations, hence the block of its own */
+      hwloc_localeswitch_declare;
+      hwloc_localeswitch_init();
+      hwloc_distances__set_from_string(topology, type, value);
+      hwloc_localeswitch_fini();
+    }
+  }
+}
+
+/* The actual set() function exported to the user
+ *
+ * take the given distance, store them as is in the topology.
+ * we'll convert them into object later once the tree is filled.
+ *
+ * Passing nbobjs == 0 with NULL indexes and distances removes the existing
+ * distances for type and returns 0.
+ * Otherwise the input arrays are copied (the caller keeps ownership of its
+ * own arrays) and the copies are stored as a forced distance matrix.
+ *
+ * Returns 0 on success, -1 when the arguments are inconsistent, when
+ * indexes contains duplicates (errno is EINVAL), when nbobjs*nbobjs does
+ * not fit in an unsigned (errno is EINVAL), or when the copies cannot be
+ * allocated (errno is ENOMEM).
+ */
+int hwloc_topology_set_distance_matrix(hwloc_topology_t __hwloc_restrict topology, hwloc_obj_type_t type,
+				       unsigned nbobjs, unsigned *indexes, float *distances)
+{
+  unsigned *_indexes;
+  float *_distances;
+
+  if (!nbobjs && !indexes && !distances) {
+    hwloc_distances_set(topology, type, 0, NULL, NULL, NULL, 1 /* force */);
+    return 0;
+  }
+
+  if (!nbobjs || !indexes || !distances)
+    return -1;
+
+  if (hwloc_distances__check_matrix(topology, type, nbobjs, indexes, NULL, distances) < 0)
+    return -1;
+
+  /* the matrix holds nbobjs*nbobjs values, refuse sizes where that product wraps */
+  if (nbobjs > UINT_MAX / nbobjs) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  /* copy the input arrays and give them to the topology;
+   * calloc() also guards the multiplication by the element size */
+  _indexes = calloc(nbobjs, sizeof(*_indexes));
+  _distances = calloc(nbobjs*nbobjs, sizeof(*_distances));
+  if (!_indexes || !_distances) {
+    free(_indexes);
+    free(_distances);
+    errno = ENOMEM;
+    return -1;
+  }
+  memcpy(_indexes, indexes, nbobjs*sizeof(unsigned));
+  memcpy(_distances, distances, (size_t)nbobjs*nbobjs*sizeof(float));
+  hwloc_distances_set(topology, type, nbobjs, _indexes, NULL, _distances, 1 /* force */);
+
+  return 0;
+}
+
+/************************
+ * Restricting distances
+ */
+
+/* Drop the objs array of every OS distance element of the topology.
+ * Called when some objects have been removed because empty/ignored/cgroup/restrict:
+ * the arrays get rebuilt from the indexes (in hwloc_distances_finalize_os()),
+ * depending on the remaining objects.
+ */
+void hwloc_distances_restrict_os(struct hwloc_topology *topology)
+{
+  struct hwloc_os_distances_s *cur = topology->first_osdist;
+
+  while (cur != NULL) {
+    free(cur->objs);
+    /* a NULL objs array is what marks the element as needing a rebuild */
+    cur->objs = NULL;
+    cur = cur->next;
+  }
+}
+
+
+/* Clean up what was created from distances so that it may be rebuilt
+ * at the end of restrict().
+ * With HWLOC_RESTRICT_FLAG_ADAPT_DISTANCES in flags only the objs arrays
+ * are cleared; without it all OS distances are destroyed.
+ */
+void hwloc_distances_restrict(struct hwloc_topology *topology, unsigned long flags)
+{
+  if (!(flags & HWLOC_RESTRICT_FLAG_ADAPT_DISTANCES)) {
+    /* not adapting distances, drop everything */
+    hwloc_distances_destroy(topology);
+    return;
+  }
+
+  /* some objects may have been removed, clear the objs arrays so that
+   * finalize_os rebuilds them properly */
+  hwloc_distances_restrict_os(topology);
+}
+
+/**************************************************************
+ * Convert user/env given array of indexes into actual objects
+ */
+
+/* Depth-first search, through first_child/next_sibling links, for the first
+ * object below (or equal to) root whose type and os_index both match.
+ * Returns NULL when there is none.
+ */
+static hwloc_obj_t hwloc_find_obj_by_type_and_os_index(hwloc_obj_t root, hwloc_obj_type_t type, unsigned os_index)
+{
+  hwloc_obj_t kid, match;
+
+  if (root->type == type && root->os_index == os_index)
+    return root;
+
+  for (kid = root->first_child; kid != NULL; kid = kid->next_sibling) {
+    match = hwloc_find_obj_by_type_and_os_index(kid, type, os_index);
+    if (match != NULL)
+      return match;
+  }
+  return NULL;
+}
+
+/* convert distance indexes that were previously stored in the topology
+ * into actual objects if not done already.
+ * it's already done when distances come from backends (this function should not be called then).
+ * it's not done when distances come from the user.
+ *
+ * Indexes for which no object of the right type exists are dropped: the
+ * corresponding row and column are removed from the matrix in place and
+ * osdist->nbobjs is reduced accordingly.
+ *
+ * returns -1 if the matrix was invalid (no index matched any object) or if
+ * the objs array could not be allocated; osdist->objs is left NULL then and
+ * the caller is expected to remove this distances element.
+ */
+static int
+hwloc_distances__finalize_os(struct hwloc_topology *topology, struct hwloc_os_distances_s *osdist)
+{
+  unsigned nbobjs = osdist->nbobjs;
+  unsigned *indexes = osdist->indexes;
+  float *distances = osdist->distances;
+  unsigned i, j;
+  hwloc_obj_type_t type = osdist->type;
+  hwloc_obj_t *objs = calloc(nbobjs, sizeof(hwloc_obj_t));
+
+  assert(!osdist->objs);
+
+  if (!objs)
+    /* nothing to store the objects in, let the caller remove this distances */
+    return -1;
+
+  /* traverse the topology and look for the relevant objects */
+  for(i=0; i<nbobjs; i++) {
+    hwloc_obj_t obj = hwloc_find_obj_by_type_and_os_index(topology->levels[0][0], type, indexes[i]);
+    if (!obj) {
+
+      /* shift the matrix */
+#define OLDPOS(i,j) (distances+(i)*nbobjs+(j))
+#define NEWPOS(i,j) (distances+(i)*(nbobjs-1)+(j))
+      if (i>0) {
+        /* no need to move beginning of 0th line */
+        for(j=0; j<i-1; j++)
+          /* move end of jth line + beginning of (j+1)th line */
+          memmove(NEWPOS(j,i), OLDPOS(j,i+1), (nbobjs-1)*sizeof(*distances));
+        /* move end of (i-1)th line */
+        memmove(NEWPOS(i-1,i), OLDPOS(i-1,i+1), (nbobjs-i-1)*sizeof(*distances));
+      }
+      if (i<nbobjs-1) {
+        /* move beginning of (i+1)th line */
+        memmove(NEWPOS(i,0), OLDPOS(i+1,0), i*sizeof(*distances));
+        /* move end of jth line + beginning of (j+1)th line */
+        for(j=i; j<nbobjs-2; j++)
+          memmove(NEWPOS(j,i), OLDPOS(j+1,i+1), (nbobjs-1)*sizeof(*distances));
+        /* move end of (nbobjs-2)th line */
+        memmove(NEWPOS(nbobjs-2,i), OLDPOS(nbobjs-1,i+1), (nbobjs-i-1)*sizeof(*distances));
+      }
+
+      /* shift the indexes array */
+      memmove(indexes+i, indexes+i+1, (nbobjs-i-1)*sizeof(*indexes));
+
+      /* update counters: one object less, and look at the same position again
+       * (for i == 0 the unsigned decrement wraps and the loop increment brings it back to 0) */
+      nbobjs--;
+      i--;
+      continue;
+    }
+    objs[i] = obj;
+  }
+
+  osdist->nbobjs = nbobjs;
+  if (!nbobjs) {
+    /* the whole matrix was invalid, let the caller remove this distances */
+    free(objs);
+    return -1;
+  }
+
+  /* setup the objs array */
+  osdist->objs = objs;
+  return 0;
+}
+
+
+/* Make sure every OS distance element of the topology has its objs array.
+ * Elements that already have one are left alone. The others are converted
+ * with hwloc_distances__finalize_os(); an element whose conversion fails is
+ * unlinked from the list and freed together with its indexes and distances.
+ */
+void hwloc_distances_finalize_os(struct hwloc_topology *topology)
+{
+  struct hwloc_os_distances_s *cur, *following;
+
+  for (cur = topology->first_osdist; cur != NULL; cur = following) {
+    /* read the link now, cur may be freed below */
+    following = cur->next;
+
+    /* keep the element if it was already converted or if converting it now works;
+     * the conversion is only attempted when objs is missing */
+    if (cur->objs || !hwloc_distances__finalize_os(topology, cur))
+      continue;
+
+    /* conversion failed: release the arrays */
+    free(cur->indexes);
+    free(cur->distances);
+
+    /* unlink from the doubly-linked list */
+    if (cur->prev)
+      cur->prev->next = following;
+    else
+      topology->first_osdist = following;
+    if (following)
+      following->prev = cur->prev;
+    else
+      topology->last_osdist = cur->prev;
+
+    /* and release the element itself */
+    free(cur);
+  }
+}
+
+/***********************************************************
+ * Convert internal distances given by the backend/env/user
+ * into exported logical distances attached to objects
+ */
+
+/* Attach one OS distance matrix to the topology as a logically-ordered,
+ * normalized latency matrix.
+ *
+ * objs[] are the nbobjs objects the matrix is about and osmatrix the
+ * nbobjs*nbobjs values in objs[] order. The matrix is attached to the
+ * deepest object ("root") covering the cpusets and nodesets of all objs[].
+ * Values are divided by the smallest value of the matrix and stored at
+ * positions given by the objects' logical indexes.
+ *
+ * The matrix is silently ignored when no root is found (an error banner is
+ * printed unless errors are hidden), when root is not above objs[0], when
+ * root does not cover exactly nbobjs objects at that depth, when the
+ * smallest value is 0, or when memory cannot be allocated. root is left
+ * with its previous distances in all those cases.
+ */
+static void
+hwloc_distances__finalize_logical(struct hwloc_topology *topology,
+				  unsigned nbobjs,
+				  hwloc_obj_t *objs, float *osmatrix)
+{
+  unsigned i, j, li, lj, minl;
+  /* -FLT_MAX is the lowest finite float; FLT_MIN is the smallest positive
+   * one and would be a wrong starting point for a maximum */
+  float min = FLT_MAX, max = -FLT_MAX;
+  hwloc_obj_t root;
+  float *matrix;
+  hwloc_cpuset_t cpuset, complete_cpuset;
+  hwloc_nodeset_t nodeset, complete_nodeset;
+  unsigned relative_depth;
+  struct hwloc_distances_s *newdist;
+  struct hwloc_distances_s **newarray;
+  int idx;
+
+  /* find the root */
+  cpuset = hwloc_bitmap_alloc();
+  complete_cpuset = hwloc_bitmap_alloc();
+  nodeset = hwloc_bitmap_alloc();
+  complete_nodeset = hwloc_bitmap_alloc();
+  for(i=0; i<nbobjs; i++) {
+    hwloc_bitmap_or(cpuset, cpuset, objs[i]->cpuset);
+    hwloc_bitmap_or(complete_cpuset, complete_cpuset, objs[i]->complete_cpuset);
+    hwloc_bitmap_or(nodeset, nodeset, objs[i]->nodeset);
+    hwloc_bitmap_or(complete_nodeset, complete_nodeset, objs[i]->complete_nodeset);
+  }
+  /* find the object covering cpuset, we'll take care of the nodeset later */
+  root = hwloc_get_obj_covering_cpuset(topology, cpuset);
+  /* walk up to find a parent that also covers the nodeset and complete sets */
+  while (root &&
+         (!hwloc_bitmap_isincluded(nodeset, root->nodeset)
+          || !hwloc_bitmap_isincluded(complete_nodeset, root->complete_nodeset)
+          || !hwloc_bitmap_isincluded(complete_cpuset, root->complete_cpuset)))
+    root = root->parent;
+  if (!root) {
+    /* should not happen, ignore the distance matrix and report an error. */
+    if (!hwloc_hide_errors()) {
+      char *a, *b;
+      hwloc_bitmap_asprintf(&a, cpuset);
+      hwloc_bitmap_asprintf(&b, nodeset);
+      fprintf(stderr, "****************************************************************************\n");
+      fprintf(stderr, "* hwloc %s has encountered an error when adding a distance matrix to the topology.\n", HWLOC_VERSION);
+      fprintf(stderr, "*\n");
+      fprintf(stderr, "* hwloc_distances__finalize_logical() could not find any object covering\n");
+      fprintf(stderr, "* cpuset %s and nodeset %s\n", a, b);
+      fprintf(stderr, "*\n");
+      fprintf(stderr, "* Please report this error message to the hwloc user's mailing list,\n");
+#ifdef HWLOC_LINUX_SYS
+      fprintf(stderr, "* along with the output from the hwloc-gather-topology script.\n");
+#else
+      fprintf(stderr, "* along with any relevant topology information from your platform.\n");
+#endif
+      fprintf(stderr, "****************************************************************************\n");
+      free(a);
+      free(b);
+    }
+    hwloc_bitmap_free(cpuset);
+    hwloc_bitmap_free(complete_cpuset);
+    hwloc_bitmap_free(nodeset);
+    hwloc_bitmap_free(complete_nodeset);
+    return;
+  }
+  /* ideally, root has the exact cpuset and nodeset.
+   * but ignoring or other things that remove objects may cause the object array to reduce */
+  assert(hwloc_bitmap_isincluded(cpuset, root->cpuset));
+  assert(hwloc_bitmap_isincluded(complete_cpuset, root->complete_cpuset));
+  assert(hwloc_bitmap_isincluded(nodeset, root->nodeset));
+  assert(hwloc_bitmap_isincluded(complete_nodeset, root->complete_nodeset));
+  hwloc_bitmap_free(cpuset);
+  hwloc_bitmap_free(complete_cpuset);
+  hwloc_bitmap_free(nodeset);
+  hwloc_bitmap_free(complete_nodeset);
+  if (root->depth >= objs[0]->depth) {
+    /* strange topology led us to find invalid relative depth, ignore */
+    return;
+  }
+  relative_depth = objs[0]->depth - root->depth; /* this assume that we have distances between objects of the same level */
+
+  if (nbobjs != hwloc_get_nbobjs_inside_cpuset_by_depth(topology, root->cpuset, root->depth + relative_depth))
+    /* the root does not cover the right number of objects, maybe we failed to insert a root (bad intersect or so). */
+    return;
+
+  /* get the logical index offset, it's the min of all logical indexes */
+  minl = UINT_MAX;
+  for(i=0; i<nbobjs; i++)
+    if (minl > objs[i]->logical_index)
+      minl = objs[i]->logical_index;
+
+  /* compute/check min/max values */
+  for(i=0; i<nbobjs; i++)
+    for(j=0; j<nbobjs; j++) {
+      float val = osmatrix[i*nbobjs+j];
+      if (val < min)
+        min = val;
+      if (val > max)
+        max = val;
+    }
+  if (!min) {
+    /* Linux up to 2.6.36 reports ACPI SLIT distances, which should be memory latencies.
+     * Except of SGI IP27 (SGI Origin 200/2000 with MIPS processors) where the distances
+     * are the number of hops between routers.
+     */
+    hwloc_debug("%s", "minimal distance is 0, matrix does not seem to contain latencies, ignoring\n");
+    return;
+  }
+
+  /* allocate everything before publishing anything in root, so that a
+   * failure leaves root with its previous, consistent set of distances */
+  newdist = malloc(sizeof(*newdist));
+  matrix = malloc(nbobjs*nbobjs*sizeof(float));
+  newarray = realloc(root->distances, (root->distances_count+1) * sizeof(struct hwloc_distances_s *));
+  if (newarray)
+    /* the old block is gone once realloc() succeeded, keep the new one
+     * even if we bail out below (distances_count still describes it) */
+    root->distances = newarray;
+  if (!newdist || !matrix || !newarray) {
+    free(newdist);
+    free(matrix);
+    return;
+  }
+
+  /* store the normalized latency matrix in the root object */
+  idx = root->distances_count++;
+  root->distances[idx] = newdist;
+  newdist->relative_depth = relative_depth;
+  newdist->nbobjs = nbobjs;
+  newdist->latency = matrix;
+  newdist->latency_base = (float) min;
+#define NORMALIZE_LATENCY(d) ((d)/(min))
+  newdist->latency_max = NORMALIZE_LATENCY(max);
+  /* osmatrix is in objs[] order, matrix is in logical-index order */
+  for(i=0; i<nbobjs; i++) {
+    li = objs[i]->logical_index - minl;
+    matrix[li*nbobjs+li] = NORMALIZE_LATENCY(osmatrix[i*nbobjs+i]);
+    for(j=i+1; j<nbobjs; j++) {
+      lj = objs[j]->logical_index - minl;
+      matrix[li*nbobjs+lj] = NORMALIZE_LATENCY(osmatrix[i*nbobjs+j]);
+      matrix[lj*nbobjs+li] = NORMALIZE_LATENCY(osmatrix[j*nbobjs+i]);
+    }
+  }
+}
+
+/* Convert the internal OS distances of the topology into logically-ordered
+ * distances that can be exposed in the API.
+ * An element is skipped when it has no objects, when the depth of its type
+ * is unknown or multiple, or when its objs array is not set.
+ */
+void
+hwloc_distances_finalize_logical(struct hwloc_topology *topology)
+{
+  struct hwloc_os_distances_s *cur;
+
+  for (cur = topology->first_osdist; cur != NULL; cur = cur->next) {
+    int depth;
+
+    if (cur->nbobjs == 0)
+      continue;
+
+    /* the type must live on exactly one known level */
+    depth = hwloc_get_type_depth(topology, cur->type);
+    if (depth == HWLOC_TYPE_DEPTH_UNKNOWN || depth == HWLOC_TYPE_DEPTH_MULTIPLE)
+      continue;
+
+    if (!cur->objs)
+      continue;
+
+    assert(cur->distances);
+    hwloc_distances__finalize_logical(topology, cur->nbobjs,
+                                      cur->objs,
+                                      cur->distances);
+  }
+}
+
+/***************************************************
+ * Destroying logical distances attached to objects
+ */
+
+/* destroy one distances structure: its latency matrix first, then the structure itself */
+void hwloc_clear_object_distances_one(struct hwloc_distances_s * distances)
+{
+  void *matrix = distances->latency;
+  free(matrix);
+  free(distances);
+}
+
+/* Free every distances structure attached to obj, then the array itself, and reset obj to "none". */
+void hwloc_clear_object_distances(hwloc_obj_t obj)
+{
+  unsigned idx = 0;
+  while (idx < obj->distances_count)
+    hwloc_clear_object_distances_one(obj->distances[idx++]);
+  free(obj->distances);
+  obj->distances = NULL;
+  obj->distances_count = 0;
+}
+
+/******************************************
+ * Grouping objects according to distances
+ */
+
+static void hwloc_report_user_distance_error(const char *msg, int line)
+{
+    static int reported = 0;
+
+    if (!reported && !hwloc_hide_errors()) {
+        fprintf(stderr, "****************************************************************************\n");
+        fprintf(stderr, "* hwloc %s has encountered what looks like an error from user-given distances.\n", HWLOC_VERSION);
+        fprintf(stderr, "*\n");
+        fprintf(stderr, "* %s\n", msg);
+        fprintf(stderr, "* Error occurred in topology.c line %d\n", line);
+        fprintf(stderr, "*\n");
+        fprintf(stderr, "* Please make sure that distances given through the interface or environment\n");
+        fprintf(stderr, "* variables do not contradict any other topology information.\n");
+        fprintf(stderr, "****************************************************************************\n");
+        reported = 1;
+    }
+}
+
+/* Three-way compare of a and b: 0 if equal, or if accuracy is non-zero and |a-b| < a*accuracy; otherwise -1 when a < b, else 1. */
+static int hwloc_compare_distances(float a, float b, float accuracy)
+{
+  int within_tolerance = (accuracy != 0.0 && fabsf(a-b) < a * accuracy);
+  return (within_tolerance || a == b) ? 0 : (a < b ? -1 : 1);
+}
+
+/* Place objects in groups if they are in a transitive graph of minimal distances.
+ * groupids[i] is set to 0 if object i is left ungrouped, or to its group id (starting at 1).
+ * Return how many groups were created; 0 is also returned when no minimal distance is found
+ * or when a single group contains all objects. */
+static unsigned
+hwloc__find_groups_by_min_distance(unsigned nbobjs,
+				   float *_distances,
+				   float accuracy,
+				   unsigned *groupids,
+				   int verbose)
+{
+  float min_distance = FLT_MAX;
+  unsigned groupid = 1; /* id of the group being built; 0 in groupids[] means "not grouped" */
+  unsigned i,j,k;
+  unsigned skipped = 0; /* number of objects whose group of size 1 was cancelled */
+  /* row-major access to the nbobjs x nbobjs matrix; the macro stays defined for the code below */
+#define DISTANCE(i, j) _distances[(i) * nbobjs + (j)]
+
+  memset(groupids, 0, nbobjs*sizeof(*groupids));
+
+  /* find the minimal distance */
+  for(i=0; i<nbobjs; i++)
+    for(j=0; j<nbobjs; j++) /* check the entire matrix, it may not be perfectly symmetric depending on the accuracy */
+      if (i != j && DISTANCE(i, j) < min_distance) /* no accuracy here, we want the real minimal */
+        min_distance = DISTANCE(i, j);
+  hwloc_debug("found minimal distance %f between objects\n", min_distance);
+
+  if (min_distance == FLT_MAX)
+    return 0;
+
+  /* build groups of objects connected with this distance */
+  for(i=0; i<nbobjs; i++) {
+    unsigned size;
+    int firstfound;
+
+    /* if already grouped, skip */
+    if (groupids[i])
+      continue;
+
+    /* start a new group */
+    groupids[i] = groupid;
+    size = 1;
+    firstfound = i;
+
+    while (firstfound != -1) {
+      /* we added new objects to the group, the first one was firstfound.
+       * rescan all connections from these new objects (starting at first found) to any other objects,
+       * so as to find new objects minimally-connected by transitivity.
+       */
+      int newfirstfound = -1;
+      for(j=firstfound; j<nbobjs; j++)
+	if (groupids[j] == groupid)
+	  for(k=0; k<nbobjs; k++)
+              if (!groupids[k] && !hwloc_compare_distances(DISTANCE(j, k), min_distance, accuracy)) {
+	      groupids[k] = groupid;
+	      size++;
+	      if (newfirstfound == -1)
+		newfirstfound = k;
+	      if (i == j)
+		hwloc_debug("object %u is minimally connected to %u\n", k, i);
+	      else
+	        hwloc_debug("object %u is minimally connected to %u through %u\n", k, i, j);
+	    }
+      firstfound = newfirstfound;
+    }
+
+    if (size == 1) {
+      /* cancel this useless group, ignore this object and try from the next one */
+      groupids[i] = 0;
+      skipped++;
+      continue;
+    }
+
+    /* valid this group */
+    groupid++;
+    if (verbose)
+      fprintf(stderr, "Found transitive graph with %u objects with minimal distance %f accuracy %f\n",
+	      size, min_distance, accuracy);
+  }
+
+  if (groupid == 2 && !skipped)
+    /* we created a single group containing all objects, ignore it */
+    return 0;
+
+  /* return the last id, since it's also the number of used group ids */
+  return groupid-1;
+}
+
+/* Validate a grouping matrix under the given accuracy: for every pair row < col, the two
+ * mirrored entries must compare equal and the entry must compare strictly above row's diagonal.
+ * Returns 0 when the matrix is usable, -1 otherwise (with a message on stderr if verbose). */
+static int
+hwloc__check_grouping_matrix(unsigned nbobjs, float *_distances, float accuracy, int verbose)
+{
+  unsigned row, col;
+  for(row=0; row<nbobjs; row++)
+    for(col=row+1; col<nbobjs; col++) {
+      float above = DISTANCE(row, col), below = DISTANCE(col, row), self = DISTANCE(row, row);
+      if (hwloc_compare_distances(above, below, accuracy) != 0) {
+        if (verbose)
+          fprintf(stderr, "Distance matrix asymmetric ([%u,%u]=%f != [%u,%u]=%f), aborting\n",
+                  row, col, above, col, row, below);
+        return -1;
+      }
+      if (hwloc_compare_distances(above, self, accuracy) <= 0) {
+        if (verbose)
+          fprintf(stderr, "Distance to self not strictly minimal ([%u,%u]=%f <= [%u,%u]=%f), aborting\n",
+                  row, col, above, row, row, self);
+        return -1;
+      }
+    }
+  return 0;
+}
+
+/* Look at object physical distances to group them: try each accuracy until groups are found,
+ * insert one Group object per group, then recurse on the averaged distances between groups.
+ * Nothing is done when nbobjs <= 2, when no group is found or when an allocation fails. */
+static void
+hwloc__groups_by_distances(struct hwloc_topology *topology,
+			   unsigned nbobjs,
+			   struct hwloc_obj **objs,
+			   float *_distances,
+			   unsigned nbaccuracies, float *accuracies,
+			   int fromuser,
+			   int needcheck,
+			   int verbose)
+{
+  unsigned *groupids = NULL;
+  unsigned nbgroups = 0;
+  unsigned i,j;
+
+  if (nbobjs <= 2) {
+      return;
+  }
+
+  groupids = malloc(sizeof(unsigned) * nbobjs);
+  if (NULL == groupids) {
+      return;
+  }
+
+  for(i=0; i<nbaccuracies; i++) {
+    if (verbose)
+      fprintf(stderr, "Trying to group %u %s objects according to physical distances with accuracy %f\n",
+	      nbobjs, hwloc_obj_type_string(objs[0]->type), accuracies[i]);
+    if (needcheck && hwloc__check_grouping_matrix(nbobjs, _distances, accuracies[i], verbose) < 0)
+      continue;
+    nbgroups = hwloc__find_groups_by_min_distance(nbobjs, _distances, accuracies[i], groupids, verbose);
+    if (nbgroups)
+      break;
+  }
+  if (!nbgroups)
+    goto outter_free;
+
+  /* For convenience, put these declarations inside a block.  It's a
+     crying shame we can't use C99 syntax here, and have to do a bunch
+     of mallocs. :-( */
+  {
+      hwloc_obj_t *groupobjs = NULL;
+      unsigned *groupsizes = NULL;
+      float *groupdistances = NULL;
+      unsigned failed = 0; /* number of Group objects that could not be inserted */
+
+      groupobjs = malloc(sizeof(hwloc_obj_t) * nbgroups);
+      groupsizes = malloc(sizeof(unsigned) * nbgroups);
+      groupdistances = malloc(sizeof(float) * nbgroups * nbgroups);
+      if (NULL == groupobjs || NULL == groupsizes || NULL == groupdistances) {
+          goto inner_free;
+      }
+      /* create new Group objects and record their size */
+      memset(&(groupsizes[0]), 0, sizeof(groupsizes[0]) * nbgroups);
+      for(i=0; i<nbgroups; i++) {
+          /* create the Group object */
+          hwloc_obj_t group_obj, res_obj;
+          group_obj = hwloc_alloc_setup_object(HWLOC_OBJ_GROUP, -1);
+          group_obj->cpuset = hwloc_bitmap_alloc();
+          group_obj->attr->group.depth = topology->next_group_depth;
+          for (j=0; j<nbobjs; j++)
+	    if (groupids[j] == i+1) {
+	      /* assemble the group sets */
+	      hwloc_obj_add_other_obj_sets(group_obj, objs[j]);
+              groupsizes[i]++;
+            }
+          hwloc_debug_1arg_bitmap("adding Group object with %u objects and cpuset %s\n",
+                                  groupsizes[i], group_obj->cpuset);
+          res_obj = hwloc__insert_object_by_cpuset(topology, group_obj,
+						   fromuser ? hwloc_report_user_distance_error : hwloc_report_os_error);
+	  /* res_obj may be NULL on failure to insert. */
+	  if (!res_obj)
+	    failed++;
+	  /* or it may be different from groupobjs if we got groups from XML import before grouping */
+          groupobjs[i] = res_obj;
+      }
+
+      if (failed)
+	/* don't try to group above if we got a NULL group here, just keep this incomplete level */
+	goto inner_free;
+
+      /* factorize distances: sum the object distances per pair of groups, then average them */
+      memset(&(groupdistances[0]), 0, sizeof(groupdistances[0]) * nbgroups * nbgroups);
+#undef DISTANCE
+#define DISTANCE(i, j) _distances[(i) * nbobjs + (j)]
+#define GROUP_DISTANCE(i, j) groupdistances[(i) * nbgroups + (j)]
+      for(i=0; i<nbobjs; i++)
+	if (groupids[i])
+	  for(j=0; j<nbobjs; j++)
+	    if (groupids[j])
+                GROUP_DISTANCE(groupids[i]-1, groupids[j]-1) += DISTANCE(i, j);
+      for(i=0; i<nbgroups; i++)
+          for(j=0; j<nbgroups; j++) {
+              unsigned groupsize = groupsizes[i]*groupsizes[j];
+              float groupsizef = (float) groupsize;
+              GROUP_DISTANCE(i, j) /= groupsizef;
+          }
+#ifdef HWLOC_DEBUG
+      hwloc_debug("%s", "generated new distance matrix between groups:\n");
+      hwloc_debug("%s", "  index");
+      for(j=0; j<nbgroups; j++)
+	hwloc_debug(" % 5d", (int) j); /* print index because os_index is -1 for Groups */
+      hwloc_debug("%s", "\n");
+      for(i=0; i<nbgroups; i++) {
+	hwloc_debug("  % 5d", (int) i);
+	for(j=0; j<nbgroups; j++)
+	  hwloc_debug(" %2.3f", GROUP_DISTANCE(i, j));
+	hwloc_debug("%s", "\n");
+      }
+#endif
+
+      topology->next_group_depth++;
+      hwloc__groups_by_distances(topology, nbgroups, groupobjs, (float*) groupdistances, nbaccuracies, accuracies, fromuser, 0 /* no need to check generated matrix */, verbose);
+
+  inner_free:
+      /* Safely free everything */
+      if (NULL != groupobjs) {
+          free(groupobjs);
+      }
+      if (NULL != groupsizes) {
+          free(groupsizes);
+      }
+      if (NULL != groupdistances) {
+          free(groupdistances);
+      }
+  }
+
+ outter_free:
+  if (NULL != groupids) {
+      free(groupids);
+  }
+}
+
+void
+hwloc_group_by_distances(struct hwloc_topology *topology)
+{
+  unsigned nbobjs;
+  struct hwloc_os_distances_s * osdist;
+  const char *env;
+  float accuracies[5] = { 0.0f, 0.01f, 0.02f, 0.05f, 0.1f };
+  unsigned nbaccuracies = 5;
+  hwloc_obj_t group_obj;
+  int verbose = 0;
+  unsigned i;
+  hwloc_localeswitch_declare;
+#ifdef HWLOC_DEBUG
+  unsigned j;
+#endif
+  /* Group the objects of every OS distance matrix of the topology; HWLOC_GROUPING=0 disables this. */
+  env = getenv("HWLOC_GROUPING");
+  if (env && !atoi(env))
+    return;
+  /* backward compat with v1.2 */
+  if (getenv("HWLOC_IGNORE_DISTANCES"))
+    return;
+
+  hwloc_localeswitch_init();
+  env = getenv("HWLOC_GROUPING_ACCURACY");
+  if (!env) {
+    /* only use 0.0 */
+    nbaccuracies = 1;
+  } else if (strcmp(env, "try")) {
+    /* use the given value */
+    nbaccuracies = 1;
+    accuracies[0] = (float) atof(env);
+  } /* otherwise try all values */
+  hwloc_localeswitch_fini();
+
+#ifdef HWLOC_DEBUG
+  verbose = 1;
+#else
+  env = getenv("HWLOC_GROUPING_VERBOSE");
+  if (env)
+    verbose = atoi(env);
+#endif
+
+  for(osdist = topology->first_osdist; osdist; osdist = osdist->next) {
+
+    nbobjs = osdist->nbobjs;
+    if (!nbobjs)
+      continue;
+
+    if (osdist->objs) {
+      /* if we have objs, we must have distances as well,
+       * thanks to hwloc_convert_distances_indexes_into_objects()
+       */
+      assert(osdist->distances);
+
+#ifdef HWLOC_DEBUG
+      hwloc_debug("%s", "trying to group objects using distance matrix:\n");
+      hwloc_debug("%s", "  index");
+      for(j=0; j<nbobjs; j++)
+	hwloc_debug(" % 5d", (int) osdist->objs[j]->os_index);
+      hwloc_debug("%s", "\n");
+      for(i=0; i<nbobjs; i++) {
+	hwloc_debug("  % 5d", (int) osdist->objs[i]->os_index);
+	for(j=0; j<nbobjs; j++)
+	  hwloc_debug(" %2.3f", osdist->distances[i*nbobjs + j]);
+	hwloc_debug("%s", "\n");
+      }
+#endif
+
+      hwloc__groups_by_distances(topology, nbobjs,
+				 osdist->objs,
+				 osdist->distances,
+				 nbaccuracies, accuracies,
+				 osdist->indexes != NULL, /* passed as the fromuser argument */
+				 1 /* check the first matrix */,
+				 verbose);
+
+      /* add a final group object covering everybody so that the distance matrix can be stored somewhere.
+       * this group will be merged into a regular object if the matrix isn't strangely incomplete
+       */
+      group_obj = hwloc_alloc_setup_object(HWLOC_OBJ_GROUP, -1);
+      group_obj->attr->group.depth = (unsigned) -1;
+      group_obj->cpuset = hwloc_bitmap_alloc();
+      for(i=0; i<nbobjs; i++) {
+	/* assemble the group sets */
+	hwloc_obj_add_other_obj_sets(group_obj, osdist->objs[i]);
+      }
+      hwloc_debug_1arg_bitmap("adding Group object (as root of distance matrix with %u objects) with cpuset %s\n",
+			      nbobjs, group_obj->cpuset);
+      hwloc__insert_object_by_cpuset(topology, group_obj,
+				     osdist->indexes != NULL ? hwloc_report_user_distance_error : hwloc_report_os_error);
+    }
+  }
+}
diff --git a/ext/hwloc/hwloc/dolib.c b/ext/hwloc/hwloc/dolib.c
new file mode 100644
index 0000000..0b2835a
--- /dev/null
+++ b/ext/hwloc/hwloc/dolib.c
@@ -0,0 +1,47 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009 inria.  All rights reserved.
+ * Copyright © 2009, 2012 Université Bordeaux
+ * See COPYING in top-level directory.
+ */
+
+/* Wrapper to avoid msys' tendency to turn / into \ and : into ;  */
+
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#include <stdio.h>
+#include <stdlib.h>
+
+/* usage: dolib <prog> <arch> <def> <current:revision:age> <lib>; runs "prog" /machine: /def: /name: /out: */
+int main(int argc, char *argv[]) {
+  char *prog, *arch, *def, *version, *lib;
+  char s[1024];
+  char name[16];
+  int current, age, revision, len;
+  if (argc != 6) {
+    fprintf(stderr,"bad number of arguments\n");
+    exit(EXIT_FAILURE);
+  }
+  prog = argv[1];
+  arch = argv[2];
+  def = argv[3];
+  version = argv[4];
+  lib = argv[5];
+  if (sscanf(version, "%d:%d:%d", &current, &revision, &age) != 3)
+    exit(EXIT_FAILURE);
+  /* _snprintf() does not null-terminate when the output fills or overflows the buffer: bail out */
+  len = _snprintf(name, sizeof(name), "libhwloc-%d", current - age);
+  if (len < 0 || (size_t) len >= sizeof(name))
+    exit(EXIT_FAILURE);
+  printf("using soname %s\n", name);
+  len = _snprintf(s, sizeof(s), "\"%s\" /machine:%s /def:%s /name:%s /out:%s",
+      prog, arch, def, name, lib);
+  if (len < 0 || (size_t) len >= sizeof(s))
+    exit(EXIT_FAILURE);
+  if (system(s)) {
+    fprintf(stderr, "%s failed\n", s);
+    exit(EXIT_FAILURE);
+  }
+  exit(EXIT_SUCCESS);
+}
diff --git a/ext/hwloc/hwloc/misc.c b/ext/hwloc/hwloc/misc.c
new file mode 100644
index 0000000..3da6687
--- /dev/null
+++ b/ext/hwloc/hwloc/misc.c
@@ -0,0 +1,166 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2014 Inria.  All rights reserved.
+ * Copyright © 2009-2010 Université Bordeaux
+ * Copyright © 2009-2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#include <private/autogen/config.h>
+#include <private/private.h>
+#include <private/misc.h>
+
+#include <stdarg.h>
+#ifdef HAVE_SYS_UTSNAME_H
+#include <sys/utsname.h>
+#endif
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <errno.h>
+#include <ctype.h>
+
+#ifdef HAVE_PROGRAM_INVOCATION_NAME
+#include <errno.h>
+extern char *program_invocation_name;
+#endif
+#ifdef HAVE___PROGNAME
+extern char *__progname;
+#endif
+
+int hwloc_snprintf(char *str, size_t size, const char *format, ...)
+{
+  int ret;
+  va_list ap;
+  static char bin; /* one-byte destination used when the caller passes size == 0 */
+  size_t fakesize;
+  char *fakestr;
+  /* snprintf() wrapper: retries in a growing temporary buffer when vsnprintf() returns size-1 or a negative value */
+  /* Some systems crash on str == NULL */
+  if (!size) {
+    str = &bin;
+    size = 1;
+  }
+
+  va_start(ap, format);
+  ret = vsnprintf(str, size, format, ap);
+  va_end(ap);
+
+  if (ret >= 0 && (size_t) ret != size-1)
+    return ret;
+
+  /* vsnprintf returned size-1 or -1. That could be a system which reports the
+   * written data and not the actually required room. Try increasing buffer
+   * size to get the latter. */
+
+  fakesize = size;
+  fakestr = NULL;
+  do {
+    fakesize *= 2;
+    free(fakestr); /* drop the previous attempt (NULL on the first pass) */
+    fakestr = malloc(fakesize);
+    if (NULL == fakestr)
+      return -1;
+    va_start(ap, format);
+    errno = 0;
+    ret = vsnprintf(fakestr, fakesize, format, ap);
+    va_end(ap);
+  } while ((size_t) ret == fakesize-1 || (ret < 0 && (!errno || errno == ERANGE)));
+  /* copy what fits (at most size-1 characters) into the caller's buffer and terminate it */
+  if (ret >= 0 && size) {
+    if (size > (size_t) ret+1)
+      size = ret+1;
+    memcpy(str, fakestr, size-1);
+    str[size-1] = 0;
+  }
+  free(fakestr);
+
+  return ret;
+}
+
+/* Walk haystack up to its first ':' (or its end), comparing each character case-insensitively with
+ * the matching character of needle. Return 1 on the first mismatch; otherwise 0 if at least n
+ * characters were compared, 1 if fewer. */
+int hwloc_namecoloncmp(const char *haystack, const char *needle, size_t n)
+{
+  size_t i = 0;
+  while (*haystack && *haystack != ':') {
+    int low_h = tolower((unsigned char) *haystack++); /* tolower() is undefined for negative char values */
+    int low_n = tolower((unsigned char) *needle++);
+    if (low_h != low_n)
+      return 1;
+    i++;
+  }
+  return i < n;
+}
+
+/* Add the non-empty OSName, OSRelease, OSVersion, HostName and Architecture info to the root
+ * object (levels[0][0]), using cached_uname when given and uname() otherwise. */
+void hwloc_add_uname_info(struct hwloc_topology *topology __hwloc_attribute_unused,
+			  void *cached_uname __hwloc_attribute_unused)
+{
+#ifdef HAVE_UNAME
+  struct hwloc_obj *root = topology->levels[0][0];
+  struct utsname local_uts;
+  struct utsname *uts = (struct utsname *) cached_uname;
+
+  /* don't annotate twice */
+  if (hwloc_obj_get_info_by_name(root, "OSName"))
+    return;
+  if (!uts) {
+    uts = &local_uts;
+    if (uname(uts) < 0)
+      return;
+  }
+  if (*uts->sysname)
+    hwloc_obj_add_info(root, "OSName", uts->sysname);
+  if (*uts->release)
+    hwloc_obj_add_info(root, "OSRelease", uts->release);
+  if (*uts->version)
+    hwloc_obj_add_info(root, "OSVersion", uts->version);
+  if (*uts->nodename)
+    hwloc_obj_add_info(root, "HostName", uts->nodename);
+  if (*uts->machine)
+    hwloc_obj_add_info(root, "Architecture", uts->machine);
+#endif /* HAVE_UNAME */
+}
+
+char *
+hwloc_progname(struct hwloc_topology *topology __hwloc_attribute_unused)
+{
+#if HAVE_DECL_GETMODULEFILENAME
+  char name[256], *basename;
+  unsigned res = GetModuleFileName(NULL, name, sizeof(name));
+  if (res == sizeof(name) || !res) /* give up when the call returned 0 or filled the whole buffer */
+    return NULL;
+  basename = strrchr(name, '\\'); /* keep only what follows the last backslash */
+  if (!basename)
+    basename = name;
+  else
+    basename++;
+  return strdup(basename);
+#else /* !HAVE_DECL_GETMODULEFILENAME */
+  const char *name, *basename;
+#if HAVE_DECL_GETPROGNAME
+  name = getprogname(); /* FreeBSD, NetBSD, some Solaris */
+#elif HAVE_DECL_GETEXECNAME
+  name = getexecname(); /* Solaris */
+#elif defined HAVE_PROGRAM_INVOCATION_NAME
+  name = program_invocation_name; /* Glibc. BGQ CNK. */
+  /* could use program_invocation_short_name directly, but we have the code to remove the path below anyway */
+#elif defined HAVE___PROGNAME
+  name = __progname; /* fallback for most unix, used for OpenBSD */
+#else
+  /* TODO: _NSGetExecutablePath(path, &size) on Darwin */
+  /* TODO: AIX, HPUX, OSF */
+  name = NULL;
+#endif
+  if (!name)
+    return NULL;
+  basename = strrchr(name, '/'); /* keep only what follows the last slash */
+  if (!basename)
+    basename = name;
+  else
+    basename++;
+  return strdup(basename); /* strdup() copy of the program name without its directory part */
+#endif /* !HAVE_DECL_GETMODULEFILENAME */
+}
diff --git a/ext/hwloc/hwloc/pci-common.c b/ext/hwloc/hwloc/pci-common.c
new file mode 100644
index 0000000..1000ca1
--- /dev/null
+++ b/ext/hwloc/hwloc/pci-common.c
@@ -0,0 +1,482 @@
+/*
+ * Copyright © 2009-2015 Inria.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#include <private/autogen/config.h>
+#include <hwloc.h>
+#include <hwloc/plugins.h>
+#include <private/debug.h>
+
+#ifdef HWLOC_DEBUG
+/* Debug-only traversal callback: print one line describing pcidev,
+ * indented by two spaces per ancestor. */
+static void
+hwloc_pci_traverse_print_cb(void * cbdata __hwloc_attribute_unused,
+			    struct hwloc_obj *pcidev)
+{
+  char busid[14];
+  hwloc_obj_t parent;
+
+  /* indent */
+  for (parent = pcidev->parent; parent; parent = parent->parent)
+    hwloc_debug("%s", "  ");
+
+  snprintf(busid, sizeof(busid), "%04x:%02x:%02x.%01x",
+           pcidev->attr->pcidev.domain, pcidev->attr->pcidev.bus, pcidev->attr->pcidev.dev, pcidev->attr->pcidev.func);
+
+  if (pcidev->type == HWLOC_OBJ_BRIDGE) {
+    if (pcidev->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_HOST)
+      hwloc_debug("%s", "HostBridge");
+    else
+      /* busid is a string and needs its own %s, as in the Device case below */
+      hwloc_debug("%s Bridge [%04x:%04x]", busid,
+		  pcidev->attr->pcidev.vendor_id, pcidev->attr->pcidev.device_id);
+    hwloc_debug(" to %04x:[%02x:%02x]\n",
+		pcidev->attr->bridge.downstream.pci.domain, pcidev->attr->bridge.downstream.pci.secondary_bus, pcidev->attr->bridge.downstream.pci.subordinate_bus);
+  } else
+    hwloc_debug("%s Device [%04x:%04x (%04x:%04x) rev=%02x class=%04x]\n", busid,
+		pcidev->attr->pcidev.vendor_id, pcidev->attr->pcidev.device_id,
+		pcidev->attr->pcidev.subvendor_id, pcidev->attr->pcidev.subdevice_id,
+		pcidev->attr->pcidev.revision, pcidev->attr->pcidev.class_id);
+}
+#endif /* HWLOC_DEBUG */
+
+/* Traversal callback: pass every non-bridge object to hwloc_backends_notify_new_object().
+ * cbdata is used as the struct hwloc_backend pointer given to that call. */
+static void
+hwloc_pci_traverse_lookuposdevices_cb(void * cbdata,
+				      struct hwloc_obj *pcidev)
+{
+  int is_bridge = (pcidev->type == HWLOC_OBJ_BRIDGE);
+
+  if (!is_bridge)
+    hwloc_backends_notify_new_object((struct hwloc_backend *) cbdata, pcidev);
+}
+
+/* Pre-order walk of root's io_first_child list: call cb on each child, then recurse into bridges. */
+static void
+hwloc_pci__traverse(void * cbdata, struct hwloc_obj *root,
+		    void (*cb)(void * cbdata, struct hwloc_obj *))
+{
+  struct hwloc_obj *cur;
+  for (cur = root->io_first_child; cur != NULL; cur = cur->next_sibling) {
+    cb(cbdata, cur);
+    if (HWLOC_OBJ_BRIDGE == cur->type)
+      hwloc_pci__traverse(cbdata, cur, cb);
+  }
+}
+
+static void
+hwloc_pci_traverse(void * cbdata, struct hwloc_obj *root,
+		   void (*cb)(void * cbdata, struct hwloc_obj *))
+{
+  hwloc_pci__traverse(cbdata, root, cb); /* entry point: simply forwards to the recursive helper */
+}
+
+enum hwloc_pci_busid_comparison_e { /* result of hwloc_pci_compare_busids(a, b) */
+  HWLOC_PCI_BUSID_LOWER, /* a sorts before b */
+  HWLOC_PCI_BUSID_HIGHER, /* a sorts after b */
+  HWLOC_PCI_BUSID_INCLUDED, /* b is a bridge whose downstream bus range contains a's bus */
+  HWLOC_PCI_BUSID_SUPERSET /* a is a bridge whose downstream bus range contains b's bus */
+};
+
+/* Compare the bus ids of two PCI objects: domain first, then bus, dev and func.
+ * Before plain bus numbers are compared, a bridge whose [secondary_bus, subordinate_bus]
+ * range holds the other object's bus gives SUPERSET (a is that bridge) or INCLUDED (b is).
+ * Identical bus ids are not expected and terminate the program.
+ */
+static enum hwloc_pci_busid_comparison_e
+hwloc_pci_compare_busids(struct hwloc_obj *a, struct hwloc_obj *b)
+{
+  int a_is_bridge = (a->type == HWLOC_OBJ_BRIDGE);
+  int b_is_bridge = (b->type == HWLOC_OBJ_BRIDGE);
+
+  if (a_is_bridge)
+    assert(a->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_PCI);
+  if (b_is_bridge)
+    assert(b->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_PCI);
+
+  if (a->attr->pcidev.domain != b->attr->pcidev.domain)
+    return a->attr->pcidev.domain < b->attr->pcidev.domain
+      ? HWLOC_PCI_BUSID_LOWER : HWLOC_PCI_BUSID_HIGHER;
+
+  if (a_is_bridge
+      && a->attr->bridge.downstream.pci.secondary_bus <= b->attr->pcidev.bus
+      && a->attr->bridge.downstream.pci.subordinate_bus >= b->attr->pcidev.bus)
+    return HWLOC_PCI_BUSID_SUPERSET;
+  if (b_is_bridge
+      && b->attr->bridge.downstream.pci.secondary_bus <= a->attr->pcidev.bus
+      && b->attr->bridge.downstream.pci.subordinate_bus >= a->attr->pcidev.bus)
+    return HWLOC_PCI_BUSID_INCLUDED;
+
+  if (a->attr->pcidev.bus != b->attr->pcidev.bus)
+    return a->attr->pcidev.bus < b->attr->pcidev.bus
+      ? HWLOC_PCI_BUSID_LOWER : HWLOC_PCI_BUSID_HIGHER;
+  if (a->attr->pcidev.dev != b->attr->pcidev.dev)
+    return a->attr->pcidev.dev < b->attr->pcidev.dev
+      ? HWLOC_PCI_BUSID_LOWER : HWLOC_PCI_BUSID_HIGHER;
+  if (a->attr->pcidev.func != b->attr->pcidev.func)
+    return a->attr->pcidev.func < b->attr->pcidev.func
+      ? HWLOC_PCI_BUSID_LOWER : HWLOC_PCI_BUSID_HIGHER;
+
+  assert(0); /* should never reach here: abort on both debug builds and non-debug builds */
+  fprintf(stderr, "Bad assertion in hwloc %s:%d (aborting)\n", __FILE__, __LINE__);
+  exit(1);
+}
+
+static void
+hwloc_pci_add_object(struct hwloc_obj *root, struct hwloc_obj *new)
+{
+  struct hwloc_obj **curp, **childp;
+  /* insert new into the io child list of root (kept ordered by busid), or below the child bridge that includes it */
+  curp = &root->io_first_child;
+  while (*curp) {
+    enum hwloc_pci_busid_comparison_e comp = hwloc_pci_compare_busids(new, *curp);
+    switch (comp) {
+    case HWLOC_PCI_BUSID_HIGHER:
+      /* go further */
+      curp = &(*curp)->next_sibling;
+      continue;
+    case HWLOC_PCI_BUSID_INCLUDED:
+      /* insert new below current bridge */
+      hwloc_pci_add_object(*curp, new);
+      return;
+    case HWLOC_PCI_BUSID_LOWER:
+    case HWLOC_PCI_BUSID_SUPERSET: {
+      /* insert new before current */
+      new->next_sibling = *curp;
+      *curp = new;
+      new->parent = root;
+      if (new->type == HWLOC_OBJ_BRIDGE) {
+	/* look at remaining siblings and move some below new */
+	childp = &new->io_first_child;
+	curp = &new->next_sibling;
+	while (*curp) {
+	  if (hwloc_pci_compare_busids(new, *curp) == HWLOC_PCI_BUSID_LOWER) {
+	    /* this sibling remains under root, after new */
+	    curp = &(*curp)->next_sibling;
+	    /* even if the list is sorted by busid, we can't break because the current bridge creates a bus that may be higher. some object may have to go there */
+	  } else {
+	    /* this sibling goes under new: unlink it from root's list and append it to new's children */
+	    *childp = *curp;
+	    *curp = (*curp)->next_sibling;
+	    (*childp)->parent = new;
+	    (*childp)->next_sibling = NULL;
+	    childp = &(*childp)->next_sibling;
+	  }
+	}
+      }
+      return;
+    }
+    }
+  }
+  /* add to the end of the list if higher than everybody */
+  new->parent = root;
+  new->next_sibling = NULL;
+  *curp = new;
+}
+
+/* Xeon E5v3 in cluster-on-die mode only have PCI on the first NUMA node of each package, but many
+ * dual-processor hosts report the second PCI hierarchy on the 2nd NUMA node of the first package.
+ * For that layout with a "Xeon" CPUModel, warn and return the next package's first child; else return parent. */
+static struct hwloc_obj *
+hwloc_pci_fixup_hostbridge_parent(struct hwloc_topology *topology __hwloc_attribute_unused,
+				  struct hwloc_obj *hostbridge,
+				  struct hwloc_obj *parent)
+{
+  const char *cpumodel;
+
+  if (parent->depth < 2
+      || parent->type != HWLOC_OBJ_NUMANODE
+      || parent->sibling_rank != 1 || parent->parent->arity != 2
+      || parent->parent->type != HWLOC_OBJ_PACKAGE
+      || parent->parent->sibling_rank != 0 || parent->parent->parent->arity != 2)
+    return parent;
+  cpumodel = hwloc_obj_get_info_by_name(parent->parent, "CPUModel");
+  if (!cpumodel || !strstr(cpumodel, "Xeon"))
+    return parent;
+  if (!hwloc_hide_errors()) {
+    fprintf(stderr, "****************************************************************************\n");
+    fprintf(stderr, "* hwloc %s has encountered an incorrect PCI locality information.\n", HWLOC_VERSION);
+    fprintf(stderr, "* PCI bus %04x:%02x is supposedly close to 2nd NUMA node of 1st package,\n",
+            hostbridge->io_first_child->attr->pcidev.domain, hostbridge->io_first_child->attr->pcidev.bus);
+    fprintf(stderr, "* however hwloc believes this is impossible on this architecture.\n");
+    fprintf(stderr, "* Therefore the PCI bus will be moved to 1st NUMA node of 2nd package.\n");
+    fprintf(stderr, "*\n");
+    fprintf(stderr, "* If you feel this fixup is wrong, disable it by setting in your environment\n");
+    fprintf(stderr, "* HWLOC_PCI_%04x_%02x_LOCALCPUS= (empty value), and report the problem\n",
+            hostbridge->io_first_child->attr->pcidev.domain, hostbridge->io_first_child->attr->pcidev.bus);
+    fprintf(stderr, "* to the hwloc's user mailing list together with the XML output of lstopo.\n");
+    fprintf(stderr, "*\n");
+    fprintf(stderr, "* You may silence this message by setting HWLOC_HIDE_ERRORS=1 in your environment.\n");
+    fprintf(stderr, "****************************************************************************\n");
+  }
+  return parent->parent->next_sibling->first_child;
+}
+
+static struct hwloc_obj *
+hwloc_pci_find_hostbridge_parent(struct hwloc_topology *topology, struct hwloc_backend *backend,
+				 struct hwloc_obj *hostbridge)
+{
+  hwloc_bitmap_t cpuset = hwloc_bitmap_alloc();
+  hwloc_obj_t group_obj, parent;
+  const char *env;
+  int err;
+  /* Return the object below which hostbridge is to be attached; the root object is the fallback. */
+  /* override the cpuset with the environment if given */
+  int forced = 0;
+  char envname[256];
+  snprintf(envname, sizeof(envname), "HWLOC_PCI_%04x_%02x_LOCALCPUS",
+	   hostbridge->io_first_child->attr->pcidev.domain, hostbridge->io_first_child->attr->pcidev.bus);
+  env = getenv(envname);
+  if (env)
+    /* if env exists but is empty, don't let quirks change what the OS reports */
+    forced = 1;
+  if (env && *env) {
+    /* force the hostbridge cpuset */
+    hwloc_debug("Overriding localcpus using %s in the environment\n", envname);
+    hwloc_bitmap_sscanf(cpuset, env);
+  } else {
+    /* get the hostbridge cpuset by asking the OS backend.
+     * it's not a PCI device, so we use its first child locality info.
+     */
+    err = hwloc_backends_get_obj_cpuset(backend, hostbridge->io_first_child, cpuset);
+    if (err < 0)
+      /* if we got nothing, assume the hostbridge is attached to the top of hierarchy */
+      hwloc_bitmap_copy(cpuset, hwloc_topology_get_topology_cpuset(topology));
+  }
+
+  hwloc_debug_bitmap("Attaching hostbridge to cpuset %s\n", cpuset);
+
+  /* restrict to the existing complete cpuset to avoid errors later */
+  hwloc_bitmap_and(cpuset, cpuset, hwloc_topology_get_complete_cpuset(topology));
+
+  /* if the remaining cpuset is empty, take the root */
+  if (hwloc_bitmap_iszero(cpuset))
+    hwloc_bitmap_copy(cpuset, hwloc_topology_get_complete_cpuset(topology));
+
+  group_obj = hwloc_alloc_setup_object(HWLOC_OBJ_GROUP, -1);
+  if (group_obj) {
+    group_obj->complete_cpuset = hwloc_bitmap_dup(cpuset);
+    hwloc_bitmap_and(cpuset, cpuset, hwloc_topology_get_topology_cpuset(topology));
+    group_obj->cpuset = hwloc_bitmap_dup(cpuset);
+    group_obj->attr->group.depth = (unsigned) -1;
+    parent = hwloc__insert_object_by_cpuset(topology, group_obj, hwloc_report_os_error);
+    if (parent == group_obj) {
+      /* group inserted without being merged, setup its sets */
+      hwloc_obj_add_children_sets(group_obj);
+    } else if (!parent) {
+      /* Failed to insert the parent, maybe a conflicting cpuset, attach to the root object instead */
+      parent = hwloc_get_root_obj(topology);
+    } else {
+      /* Got merged. This object has the right cpuset, but it could be a cache or so,
+       * go up as long as the (complete)cpuset is the same.
+       */
+      while (parent->parent) {
+	if (parent->complete_cpuset && parent->parent->complete_cpuset) {
+	  if (!hwloc_bitmap_isequal(parent->complete_cpuset, parent->parent->complete_cpuset))
+	    break;
+	} else {
+	  if (!hwloc_bitmap_isequal(parent->cpuset, parent->parent->cpuset))
+	    break;
+	}
+	parent = parent->parent;
+      }
+      /* the locality quirk is only applied when the environment variable is not set at all */
+      if (!forced)
+	parent = hwloc_pci_fixup_hostbridge_parent(topology, hostbridge, parent);
+    }
+  } else {
+    /* Failed to create the Group, attach to the root object instead */
+    parent = hwloc_get_root_obj(topology);
+  }
+
+  hwloc_bitmap_free(cpuset);
+
+  return parent;
+}
+
+int
+hwloc_insert_pci_device_list(struct hwloc_backend *backend,
+			     struct hwloc_obj *first_obj)
+{
+  /* Regroup the flat first_obj sibling list into one hostbridge subtree per
+   * upstream bus and attach each hostbridge to the topology.
+   * Returns 0 when the list is empty, 1 otherwise. */
+  struct hwloc_topology *topology = backend->topology;
+  struct hwloc_obj fakeroot; /* stack-only anchor: only parent and io_first_child are initialised here */
+  struct hwloc_obj *item;
+  unsigned hostbridge_idx = 0;
+
+  if (!first_obj)
+    return 0; /* found nothing, exit */
+
+  /* step 1: organise the objects as a tree under the fake root */
+  fakeroot.parent = NULL;
+  fakeroot.io_first_child = NULL;
+  while ((item = first_obj) != NULL) {
+    first_obj = item->next_sibling; /* fetch the successor before handing item over */
+    hwloc_pci_add_object(&fakeroot, item);
+  }
+
+#ifdef HWLOC_DEBUG
+  hwloc_debug("%s", "\nPCI hierarchy under fake parent:\n");
+  hwloc_pci_traverse(NULL, &fakeroot, hwloc_pci_traverse_print_cb);
+  hwloc_debug("%s", "\n");
+#endif
+
+  /* step 2: walk the hierarchy, and lookup OS devices */
+  hwloc_pci_traverse(backend, &fakeroot, hwloc_pci_traverse_lookuposdevices_cb);
+
+  /*
+   * step 3: the fake root lists all objects connected to any upstream bus in
+   * the machine. Create one real hostbridge object per upstream bus; a
+   * hostbridge is not actually a PCI device so we have to create it.
+   */
+  while (fakeroot.io_first_child) {
+    /* start a new host bridge, seeded by the first object still under the fake root */
+    struct hwloc_obj *hostbridge = hwloc_alloc_setup_object(HWLOC_OBJ_BRIDGE, hostbridge_idx++);
+    struct hwloc_obj **headp = &fakeroot.io_first_child;
+    struct hwloc_obj **tailp = &hostbridge->io_first_child;
+    struct hwloc_obj *cur = *headp;
+    struct hwloc_obj *parent;
+    unsigned short current_domain = cur->attr->pcidev.domain;
+    unsigned char current_bus = cur->attr->pcidev.bus;
+    unsigned char current_subordinate = current_bus;
+
+    hwloc_debug("Starting new PCI hostbridge %04x:%02x\n", current_domain, current_bus);
+
+    /* move the leading run of children sharing this domain/bus over to the hostbridge */
+    do {
+      /* remove the child from the fake root */
+      *headp = cur->next_sibling;
+      /* append it to the hostbridge */
+      *tailp = cur;
+      cur->parent = hostbridge;
+      cur->next_sibling = NULL;
+      tailp = &cur->next_sibling;
+
+      /* compute hostbridge secondary/subordinate buses */
+      if (cur->type == HWLOC_OBJ_BRIDGE
+	  && cur->attr->bridge.downstream.pci.subordinate_bus > current_subordinate)
+	current_subordinate = cur->attr->bridge.downstream.pci.subordinate_bus;
+
+      /* use the next child if it has the same domain/bus */
+      cur = *headp;
+    } while (cur
+	     && cur->attr->pcidev.domain == current_domain
+	     && cur->attr->pcidev.bus == current_bus);
+
+    /* finish setting up this hostbridge */
+    hostbridge->attr->bridge.upstream_type = HWLOC_OBJ_BRIDGE_HOST;
+    hostbridge->attr->bridge.downstream_type = HWLOC_OBJ_BRIDGE_PCI;
+    hostbridge->attr->bridge.downstream.pci.domain = current_domain;
+    hostbridge->attr->bridge.downstream.pci.secondary_bus = current_bus;
+    hostbridge->attr->bridge.downstream.pci.subordinate_bus = current_subordinate;
+    hwloc_debug("New PCI hostbridge %04x:[%02x-%02x]\n",
+		current_domain, current_bus, current_subordinate);
+
+    /* attach the hostbridge where it belongs */
+    parent = hwloc_pci_find_hostbridge_parent(topology, backend, hostbridge);
+    hwloc_insert_object_by_parent(topology, parent, hostbridge);
+  }
+
+  return 1;
+}
+
+#define HWLOC_PCI_STATUS 0x06
+#define HWLOC_PCI_STATUS_CAP_LIST 0x10
+#define HWLOC_PCI_CAPABILITY_LIST 0x34
+#define HWLOC_PCI_CAP_LIST_ID 0
+#define HWLOC_PCI_CAP_LIST_NEXT 1
+
+/* Walk the capability list of a 256-byte PCI config space.
+ * Returns the offset of the first capability whose id equals cap, or 0 if none is found. */
+unsigned
+hwloc_pci_find_cap(const unsigned char *config, unsigned cap)
+{
+  unsigned char visited[256] = { 0 }; /* offsets already walked, used to detect cycles */
+  unsigned char pos; /* unsigned char to make sure we stay within the 256-byte config space */
+
+  /* the status register tells whether a capability list exists at all */
+  if ((config[HWLOC_PCI_STATUS] & HWLOC_PCI_STATUS_CAP_LIST) == 0)
+    return 0;
+
+  pos = config[HWLOC_PCI_CAPABILITY_LIST] & ~3;
+  while (pos != 0 && !visited[pos]) { /* stop at the end of the list (0) or when looping around */
+    unsigned char capid = config[pos + HWLOC_PCI_CAP_LIST_ID];
+
+    visited[pos] = 1;
+    if (capid == cap)
+      return pos;
+    if (capid == 0xff) /* bail out on id 0xff (note: an id of 0 does NOT stop the walk) */
+      break;
+    /* follow the next pointer, ignoring its 2 low bits */
+    pos = config[pos + HWLOC_PCI_CAP_LIST_NEXT] & ~3;
+  }
+
+  return 0;
+}
+
+#define HWLOC_PCI_EXP_LNKSTA 0x12
+#define HWLOC_PCI_EXP_LNKSTA_SPEED 0x000f
+#define HWLOC_PCI_EXP_LNKSTA_WIDTH 0x03f0
+
+/* Compute the bandwidth (GB/s) advertised by the Link Status register of the
+ * PCI Express capability located at config[offset], and store it in *linkspeed. Always returns 0. */
+int
+hwloc_pci_find_linkspeed(const unsigned char *config,
+			 unsigned offset, float *linkspeed)
+{
+  /* Link Status is 16 bits wide and little-endian: build it bytewise so that host byte order does not matter */
+  unsigned linksta = config[offset + HWLOC_PCI_EXP_LNKSTA] | ((unsigned) config[offset + HWLOC_PCI_EXP_LNKSTA + 1] << 8);
+  unsigned speed = linksta & HWLOC_PCI_EXP_LNKSTA_SPEED; /* PCIe generation */
+  unsigned width = (linksta & HWLOC_PCI_EXP_LNKSTA_WIDTH) >> 4; /* how many lanes */
+  /* Gen1 = 2.5GT/s and Gen2 = 5GT/s with 8/10 encoding, Gen3 = 8GT/s with 128/130 encoding (0.25, 0.5, 1 GB/s per lane).
+   * Gen4 and Gen5 keep the 128/130 encoding and double the rate each time; anything above is conservatively reported as Gen5. */
+  float lanespeed = speed <= 2 ? 2.5 * speed * 0.8 /* Gbit/s per lane */
+			       : 8.0 * (1U << ((speed < 5 ? speed : 5) - 3)) * 128/130;
+
+  *linkspeed = lanespeed * width / 8; /* GB/s */
+  return 0;
+}
+
+#define HWLOC_PCI_HEADER_TYPE 0x0e
+#define HWLOC_PCI_HEADER_TYPE_BRIDGE 1
+#define HWLOC_PCI_CLASS_BRIDGE_PCI 0x0604
+#define HWLOC_PCI_PRIMARY_BUS 0x18
+#define HWLOC_PCI_SECONDARY_BUS 0x19
+#define HWLOC_PCI_SUBORDINATE_BUS 0x1a
+
+/* Turn the PCI device obj into a PCI-to-PCI bridge object when its class id and
+ * config-space header type say so; other devices are left untouched.
+ * Always returns 0. */
+int
+hwloc_pci_prepare_bridge(hwloc_obj_t obj,
+			 const unsigned char *config)
+{
+  struct hwloc_pcidev_attr_s *pattr = &obj->attr->pcidev;
+  struct hwloc_bridge_attr_s *battr = &obj->attr->bridge;
+  unsigned char headertype = config[HWLOC_PCI_HEADER_TYPE] & 0x7f;
+
+  /* both the class and the header type must designate a PCI bridge */
+  if (pattr->class_id != HWLOC_PCI_CLASS_BRIDGE_PCI
+      || headertype != HWLOC_PCI_HEADER_TYPE_BRIDGE)
+    return 0;
+
+  if (config[HWLOC_PCI_PRIMARY_BUS] != pattr->bus)
+    hwloc_debug("  %04x:%02x:%02x.%01x bridge with (ignored) invalid PCI_PRIMARY_BUS %02x\n",
+		pattr->domain, pattr->bus, pattr->dev, pattr->func, config[HWLOC_PCI_PRIMARY_BUS]);
+
+  /* NOTE(review): pattr and battr both point into obj->attr and presumably overlap;
+   * the statement order of the original is kept as is. */
+  obj->type = HWLOC_OBJ_BRIDGE;
+  battr->upstream_type = HWLOC_OBJ_BRIDGE_PCI;
+  battr->downstream_type = HWLOC_OBJ_BRIDGE_PCI;
+  battr->downstream.pci.domain = pattr->domain;
+  battr->downstream.pci.secondary_bus = config[HWLOC_PCI_SECONDARY_BUS];
+  battr->downstream.pci.subordinate_bus = config[HWLOC_PCI_SUBORDINATE_BUS];
+
+  return 0;
+}
diff --git a/ext/hwloc/hwloc/topology-bgq.cb b/ext/hwloc/hwloc/topology-bgq.cb
new file mode 100644
index 0000000..3998f31
--- /dev/null
+++ b/ext/hwloc/hwloc/topology-bgq.cb
@@ -0,0 +1,246 @@
+/*
+ * Copyright © 2013-2015 Inria.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#include <private/autogen/config.h>
+
+#include <hwloc.h>
+#include <private/private.h>
+#include <private/debug.h>
+
+#include <stdlib.h>
+#include <pthread.h>
+#include <sys/utsname.h>
+#include <spi/include/kernel/location.h>
+#include <spi/include/kernel/process.h>
+/* Discovery callback of the BGQ backend: builds the whole object tree when the root object has no cpuset yet, then tags the root. Always returns 1. */
+static int
+hwloc_look_bgq(struct hwloc_backend *backend)
+{
+  struct hwloc_topology *topology = backend->topology;
+  unsigned i;
+  const char *env;
+
+  if (!topology->levels[0][0]->cpuset) {
+    /* Nobody created objects yet (the root object has no cpuset), setup everything */
+    hwloc_bitmap_t set;
+    hwloc_obj_t obj;
+
+#define HWLOC_BGQ_CORES 17 /* spare core ignored for now */
+
+    hwloc_alloc_obj_cpusets(topology->levels[0][0]);
+    /* mark the 17th core (OS-reserved) as disallowed: clear the 4 PU bits of the last core */
+    hwloc_bitmap_clr_range(topology->levels[0][0]->allowed_cpuset, (HWLOC_BGQ_CORES-1)*4, HWLOC_BGQ_CORES*4-1);
+
+    env = getenv("BG_THREADMODEL");
+    if (!env || atoi(env) != 2) {
+      /* process cannot use cores/threads outside of its Kernel_ThreadMask() */
+      uint64_t bgmask = Kernel_ThreadMask(Kernel_MyTcoord());
+      /* the mask is reversed (bit i of bgmask describes PU 63-i), manually reverse it */
+      for(i=0; i<64; i++)
+	if (((bgmask >> i) & 1) == 0)
+	  hwloc_bitmap_clr(topology->levels[0][0]->allowed_cpuset, 63-i);
+    }
+
+    /* a single memory bank: one NUMA node covering all PUs, with nodeset {0} */
+    obj = hwloc_alloc_setup_object(HWLOC_OBJ_NUMANODE, 0);
+    set = hwloc_bitmap_alloc();
+    hwloc_bitmap_set_range(set, 0, HWLOC_BGQ_CORES*4-1);
+    obj->cpuset = set;
+    set = hwloc_bitmap_alloc();
+    hwloc_bitmap_set(set, 0);
+    obj->nodeset = set;
+    obj->memory.local_memory = 16ULL*1024*1024*1024ULL; /* 16GB, hardwired */
+    hwloc_insert_object_by_cpuset(topology, obj);
+
+    /* package: a single one covering all PUs */
+    obj = hwloc_alloc_setup_object(HWLOC_OBJ_PACKAGE, 0);
+    set = hwloc_bitmap_alloc();
+    hwloc_bitmap_set_range(set, 0, HWLOC_BGQ_CORES*4-1);
+    obj->cpuset = set;
+    hwloc_obj_add_info(obj, "CPUModel", "IBM PowerPC A2");
+    hwloc_insert_object_by_cpuset(topology, obj);
+
+    /* shared L2: same cpuset as the package (set still refers to the package bitmap here) */
+    obj = hwloc_alloc_setup_object(HWLOC_OBJ_CACHE, -1);
+    obj->cpuset = hwloc_bitmap_dup(set);
+    obj->attr->cache.type = HWLOC_OBJ_CACHE_UNIFIED;
+    obj->attr->cache.depth = 2;
+    obj->attr->cache.size = 32*1024*1024;
+    obj->attr->cache.linesize = 128;
+    obj->attr->cache.associativity = 16;
+    hwloc_insert_object_by_cpuset(topology, obj);
+
+    /* Cores: 4 consecutive PUs each, with private L1d and L1i caches */
+    for(i=0; i<HWLOC_BGQ_CORES; i++) {
+      /* Core */
+      obj = hwloc_alloc_setup_object(HWLOC_OBJ_CORE, i);
+      set = hwloc_bitmap_alloc();
+      hwloc_bitmap_set_range(set, i*4, i*4+3);
+      obj->cpuset = set;
+      hwloc_insert_object_by_cpuset(topology, obj);
+      /* L1d (16kB, 64B lines, 8-way) */
+      obj = hwloc_alloc_setup_object(HWLOC_OBJ_CACHE, -1);
+      obj->cpuset = hwloc_bitmap_dup(set);
+      obj->attr->cache.type = HWLOC_OBJ_CACHE_DATA;
+      obj->attr->cache.depth = 1;
+      obj->attr->cache.size = 16*1024;
+      obj->attr->cache.linesize = 64;
+      obj->attr->cache.associativity = 8;
+      hwloc_insert_object_by_cpuset(topology, obj);
+      /* L1i (16kB, 64B lines, 4-way) */
+      obj = hwloc_alloc_setup_object(HWLOC_OBJ_CACHE, -1);
+      obj->cpuset = hwloc_bitmap_dup(set);
+      obj->attr->cache.type = HWLOC_OBJ_CACHE_INSTRUCTION;
+      obj->attr->cache.depth = 1;
+      obj->attr->cache.size = 16*1024;
+      obj->attr->cache.linesize = 64;
+      obj->attr->cache.associativity = 4;
+      hwloc_insert_object_by_cpuset(topology, obj);
+      /* there's also a L1p "prefetch cache" of 4kB with 128B lines (not modelled here) */
+    }
+
+    /* PUs: 4 per core, including those of the 17th core */
+    hwloc_setup_pu_level(topology, HWLOC_BGQ_CORES*4);
+  }
+
+  /* Add BGQ specific information (done even when another backend built the objects) */
+
+  hwloc_obj_add_info(topology->levels[0][0], "Backend", "BGQ");
+  if (topology->is_thissystem)
+    hwloc_add_uname_info(topology, NULL);
+  return 1;
+}
+/* Report the binding of thread as the single first PU (among PUs 0-63) found in its affinity mask. */
+static int
+hwloc_bgq_get_thread_cpubind(hwloc_topology_t topology, pthread_t thread, hwloc_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
+{
+  cpu_set_t affinity;
+  unsigned idx = 0;
+  int rc;
+
+  if (topology->pid) {
+    errno = ENOSYS; /* topology->pid is set: not supported by this hook */
+    return -1;
+  }
+  rc = pthread_getaffinity_np(thread, sizeof(affinity), &affinity);
+  if (rc) {
+    errno = rc; /* the return value is the error number */
+    return -1;
+  }
+
+  /* the binding cannot contain multiple PUs: stop at the first one that is set */
+  while (idx < 64 && !CPU_ISSET(idx, &affinity))
+    idx++;
+  if (idx < 64)
+    hwloc_bitmap_only(hwloc_set, idx);
+  return 0; /* hwloc_set is left untouched when no PU below 64 is set */
+}
+/* Report the binding of the calling thread as the single PU returned by Kernel_ProcessorID(). */
+static int
+hwloc_bgq_get_thisthread_cpubind(hwloc_topology_t topology, hwloc_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
+{
+  if (!topology->pid) {
+    hwloc_bitmap_only(hwloc_set, Kernel_ProcessorID());
+    return 0;
+  }
+  errno = ENOSYS; /* topology->pid is set: not supported by this hook */
+  return -1;
+}
+/* Bind thread to the first PU of hwloc_set.
+ * Fails with ENOSYS when topology->pid is set, or when STRICT is requested and the set does not contain exactly one PU. */
+static int
+hwloc_bgq_set_thread_cpubind(hwloc_topology_t topology, pthread_t thread, hwloc_const_bitmap_t hwloc_set, int flags)
+{
+  cpu_set_t target;
+  unsigned first;
+  int rc;
+
+  if (topology->pid) {
+    errno = ENOSYS;
+    return -1;
+  }
+  /* the binding cannot contain multiple PUs:
+   * error out if STRICT, keep the first PU only otherwise.
+   */
+  if (hwloc_bitmap_weight(hwloc_set) != 1 && (flags & HWLOC_CPUBIND_STRICT)) {
+    errno = ENOSYS;
+    return -1;
+  }
+  first = hwloc_bitmap_first(hwloc_set);
+  CPU_ZERO(&target);
+  CPU_SET(first, &target);
+  rc = pthread_setaffinity_np(thread, sizeof(target), &target);
+  if (rc) {
+    errno = rc; /* the return value is the error number */
+    return -1;
+  }
+  return 0;
+}
+/* Bind the calling thread: same as hwloc_bgq_set_thread_cpubind() applied to pthread_self(). */
+static int
+hwloc_bgq_set_thisthread_cpubind(hwloc_topology_t topology, hwloc_const_bitmap_t hwloc_set, int flags)
+{
+  return hwloc_bgq_set_thread_cpubind(topology, pthread_self(), hwloc_set, flags);
+}
+/* Install the BGQ thread-binding hooks into hooks; support is not touched. Only thread-level hooks are set, no process-level ones. */
+void
+hwloc_set_bgq_hooks(struct hwloc_binding_hooks *hooks __hwloc_attribute_unused,
+		    struct hwloc_topology_support *support __hwloc_attribute_unused)
+{
+  hooks->set_thisthread_cpubind = hwloc_bgq_set_thisthread_cpubind;
+  hooks->set_thread_cpubind = hwloc_bgq_set_thread_cpubind;
+  hooks->get_thisthread_cpubind = hwloc_bgq_get_thisthread_cpubind;
+  hooks->get_thread_cpubind = hwloc_bgq_get_thread_cpubind;
+  /* threads cannot be bound to more than one PU, so get_last_cpu_location == get_cpubind */
+  hooks->get_thisthread_last_cpu_location = hwloc_bgq_get_thisthread_cpubind;
+  /* left disabled in the original: hooks->get_thread_last_cpu_location = hwloc_bgq_get_thread_cpubind; */
+}
+/* Create the BGQ backend. Unless HWLOC_FORCE_BGQ is set to a non-zero value, refuse (NULL) when uname() does not report sysname CNK / machine BGQ. */
+static struct hwloc_backend *
+hwloc_bgq_component_instantiate(struct hwloc_disc_component *component,
+				const void *_data1 __hwloc_attribute_unused,
+				const void *_data2 __hwloc_attribute_unused,
+				const void *_data3 __hwloc_attribute_unused)
+{
+  struct utsname utsname;
+  struct hwloc_backend *backend;
+  const char *env;
+
+  env = getenv("HWLOC_FORCE_BGQ");
+  if (!env || !atoi(env)) {
+    /* a failed uname() leaves the fields indeterminate: clear those used below instead of comparing/printing garbage */
+    if (uname(&utsname) != 0)
+      utsname.sysname[0] = utsname.machine[0] = '\0';
+    if (strcmp(utsname.sysname, "CNK") || strcmp(utsname.machine, "BGQ")) {
+      fprintf(stderr, "*** Found unexpected uname sysname `%s' machine `%s'\n", utsname.sysname, utsname.machine);
+      fprintf(stderr, "*** The BGQ backend is only enabled on compute nodes by default (sysname=CNK machine=BGQ)\n");
+      fprintf(stderr, "*** Set HWLOC_FORCE_BGQ=1 in the environment to enforce the BGQ backend anyway.\n");
+      return NULL;
+    }
+  }
+
+  backend = hwloc_backend_alloc(component);
+  if (!backend)
+    return NULL;
+  backend->discover = hwloc_look_bgq;
+  return backend;
+}
+/* Discovery component descriptor of the BGQ backend (positional initializer, field meanings hedged below). */
+static struct hwloc_disc_component hwloc_bgq_disc_component = {
+  HWLOC_DISC_COMPONENT_TYPE_GLOBAL,
+  "bgq",
+  ~0, /* all bits set: presumably excludes every other component type - TODO confirm */
+  hwloc_bgq_component_instantiate,
+  50, /* presumably the priority - TODO confirm */
+  NULL
+};
+/* Generic component wrapper exporting the BGQ discovery component. */
+const struct hwloc_component hwloc_bgq_component = {
+  HWLOC_COMPONENT_ABI,
+  NULL, NULL, /* presumably the init/finalize callback slots, unused here - TODO confirm */
+  HWLOC_COMPONENT_TYPE_DISC,
+  0,
+  &hwloc_bgq_disc_component
+};
diff --git a/ext/hwloc/hwloc/topology-darwin.cb b/ext/hwloc/hwloc/topology-darwin.cb
new file mode 100644
index 0000000..1062a1d
--- /dev/null
+++ b/ext/hwloc/hwloc/topology-darwin.cb
@@ -0,0 +1,307 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2014 Inria.  All rights reserved.
+ * Copyright © 2009-2013 Université Bordeaux
+ * Copyright © 2009-2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/* Detect topology change: registering for power management changes and check
+ * if for example hw.activecpu changed */
+
+/* Apparently, Darwin people do not _want_ to provide binding functions.  */
+
+#include <private/autogen/config.h>
+
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#include <stdlib.h>
+#include <inttypes.h>
+
+#include <hwloc.h>
+#include <private/private.h>
+#include <private/debug.h>
+/* Discovery callback of the Darwin backend, driven by sysctl values. Returns 0 if objects already exist, -1 if hw.ncpu is unusable, 1 otherwise. */
+static int
+hwloc_look_darwin(struct hwloc_backend *backend)
+{
+  struct hwloc_topology *topology = backend->topology;
+  int64_t _nprocs;
+  unsigned nprocs;
+  int64_t _npackages;
+  unsigned i, j, cpu;
+  struct hwloc_obj *obj;
+  size_t size;
+  int64_t l1dcachesize, l1icachesize;
+  int64_t cacheways[2];
+  int64_t l2cachesize;
+  int64_t cachelinesize;
+  int64_t memsize;
+  char cpumodel[64];
+
+  if (topology->levels[0][0]->cpuset)
+    /* somebody discovered things */
+    return 0;
+
+  hwloc_alloc_obj_cpusets(topology->levels[0][0]);
+
+  if (hwloc_get_sysctlbyname("hw.ncpu", &_nprocs) || _nprocs <= 0)
+    return -1;
+  nprocs = _nprocs;
+  topology->support.discovery->pu = 1;
+
+  hwloc_debug("%u procs\n", nprocs);
+
+  size = sizeof(cpumodel);
+  if (sysctlbyname("machdep.cpu.brand_string", cpumodel, &size, NULL, 0))
+    cpumodel[0] = '\0';
+
+  if (!hwloc_get_sysctlbyname("hw.packages", &_npackages) && _npackages > 0) {
+    unsigned npackages = _npackages;
+    int64_t _cores_per_package;
+    int64_t _logical_per_package;
+    unsigned logical_per_package;
+
+    hwloc_debug("%u packages\n", npackages);
+
+    if (!hwloc_get_sysctlbyname("machdep.cpu.logical_per_package", &_logical_per_package) && _logical_per_package > 0)
+      logical_per_package = _logical_per_package;
+    else
+      /* Assume the trivial case: PUs are evenly spread across packages.  */
+      logical_per_package = nprocs / npackages;
+
+    hwloc_debug("%u threads per package\n", logical_per_package);
+
+    /* only create package objects when the numbers are consistent */
+    if (nprocs == npackages * logical_per_package)
+      for (i = 0; i < npackages; i++) {
+        obj = hwloc_alloc_setup_object(HWLOC_OBJ_PACKAGE, i);
+        obj->cpuset = hwloc_bitmap_alloc();
+        for (cpu = i*logical_per_package; cpu < (i+1)*logical_per_package; cpu++)
+          hwloc_bitmap_set(obj->cpuset, cpu);
+
+        hwloc_debug_1arg_bitmap("package %u has cpuset %s\n",
+                   i, obj->cpuset);
+
+        if (cpumodel[0] != '\0')
+          hwloc_obj_add_info(obj, "CPUModel", cpumodel);
+        hwloc_insert_object_by_cpuset(topology, obj);
+      }
+    else
+      if (cpumodel[0] != '\0')
+        hwloc_obj_add_info(topology->levels[0][0], "CPUModel", cpumodel);
+
+    if (!hwloc_get_sysctlbyname("machdep.cpu.cores_per_package", &_cores_per_package) && _cores_per_package > 0) {
+      unsigned cores_per_package = _cores_per_package;
+      hwloc_debug("%u cores per package\n", cores_per_package);
+
+      if (!(logical_per_package % cores_per_package))
+        for (i = 0; i < npackages * cores_per_package; i++) {
+          obj = hwloc_alloc_setup_object(HWLOC_OBJ_CORE, i);
+          obj->cpuset = hwloc_bitmap_alloc();
+          for (cpu = i*(logical_per_package/cores_per_package);
+               cpu < (i+1)*(logical_per_package/cores_per_package);
+               cpu++)
+            hwloc_bitmap_set(obj->cpuset, cpu);
+
+          hwloc_debug_1arg_bitmap("core %u has cpuset %s\n",
+                     i, obj->cpuset);
+          hwloc_insert_object_by_cpuset(topology, obj);
+        }
+    }
+  } else
+    if (cpumodel[0] != '\0')
+      hwloc_obj_add_info(topology->levels[0][0], "CPUModel", cpumodel);
+
+  if (hwloc_get_sysctlbyname("hw.l1dcachesize", &l1dcachesize))
+    l1dcachesize = 0;
+
+  if (hwloc_get_sysctlbyname("hw.l1icachesize", &l1icachesize))
+    l1icachesize = 0;
+
+  if (hwloc_get_sysctlbyname("hw.l2cachesize", &l2cachesize))
+    l2cachesize = 0;
+
+  if (hwloc_get_sysctlbyname("machdep.cpu.cache.L1_associativity", &cacheways[0]))
+    cacheways[0] = 0;
+  else if (cacheways[0] == 0xff)
+    cacheways[0] = -1;
+
+  if (hwloc_get_sysctlbyname("machdep.cpu.cache.L2_associativity", &cacheways[1]))
+    cacheways[1] = 0;
+  else if (cacheways[1] == 0xff)
+    cacheways[1] = -1;
+
+  if (hwloc_get_sysctlbyname("hw.cachelinesize", &cachelinesize))
+    cachelinesize = 0;
+
+  if (hwloc_get_sysctlbyname("hw.memsize", &memsize))
+    memsize = 0;
+
+  if (!sysctlbyname("hw.cacheconfig", NULL, &size, NULL, 0)) {
+    unsigned n = size / sizeof(uint32_t);
+    uint64_t *cacheconfig = NULL;
+    uint64_t *cachesize = NULL;
+    uint32_t *cacheconfig32 = NULL;
+
+    cacheconfig = malloc(sizeof(uint64_t) * n);
+    if (NULL == cacheconfig) {
+        goto out;
+    }
+    cachesize = malloc(sizeof(uint64_t) * n);
+    if (NULL == cachesize) {
+        goto out;
+    }
+    cacheconfig32 = malloc(sizeof(uint32_t) * n);
+    if (NULL == cacheconfig32) {
+        goto out;
+    }
+
+    if ((!sysctlbyname("hw.cacheconfig", cacheconfig, &size, NULL, 0))) {
+      /* Yeech. Darwin seemingly has changed from 32bit to 64bit integers for
+       * cacheconfig, with apparently no way for detection. Assume the machine
+       * won't have more than 4 billion cpus */
+      if (cacheconfig[0] > 0xFFFFFFFFUL) {
+        memcpy(cacheconfig32, cacheconfig, size);
+        for (i = 0 ; i < size / sizeof(uint32_t); i++)
+          cacheconfig[i] = cacheconfig32[i];
+      }
+
+      memset(cachesize, 0, sizeof(uint64_t) * n);
+      size = sizeof(uint64_t) * n;
+      if (sysctlbyname("hw.cachesize", cachesize, &size, NULL, 0)) {
+        if (n > 0)
+          cachesize[0] = memsize;
+        if (n > 1)
+          cachesize[1] = l1dcachesize;
+        if (n > 2)
+          cachesize[2] = l2cachesize;
+      }
+
+      hwloc_debug("%s", "caches");
+      for (i = 0; i < n && cacheconfig[i]; i++)
+        hwloc_debug(" %"PRIu64"(%"PRIu64"kB)", cacheconfig[i], cachesize[i] / 1024);
+
+      /* Now we know how many caches there are */
+      n = i;
+      hwloc_debug("\n%u cache levels\n", n - 1);
+
+      /* For each cache level (0 is memory) */
+      for (i = 0; i < n; i++) {
+        /* cacheconfig tells us how many cpus share it, let's iterate on each cache */
+        for (j = 0; j < (nprocs / cacheconfig[i]); j++) {
+          obj = hwloc_alloc_setup_object(i?HWLOC_OBJ_CACHE:HWLOC_OBJ_NUMANODE, j);
+          if (!i) {
+            obj->nodeset = hwloc_bitmap_alloc();
+            hwloc_bitmap_set(obj->nodeset, j);
+          }
+          obj->cpuset = hwloc_bitmap_alloc();
+          for (cpu = j*cacheconfig[i];
+               cpu < ((j+1)*cacheconfig[i]);
+               cpu++)
+            hwloc_bitmap_set(obj->cpuset, cpu);
+
+          if (i == 1 && l1icachesize) {
+            /* FIXME assuming that L1i and L1d are shared the same way. Darwin
+             * does not yet provide a way to know.  */
+            hwloc_obj_t l1i = hwloc_alloc_setup_object(HWLOC_OBJ_CACHE, j);
+            l1i->cpuset = hwloc_bitmap_dup(obj->cpuset);
+            hwloc_debug_1arg_bitmap("L1icache %u has cpuset %s\n",
+                j, l1i->cpuset);
+            l1i->attr->cache.depth = i;
+            l1i->attr->cache.size = l1icachesize;
+            l1i->attr->cache.linesize = cachelinesize;
+            l1i->attr->cache.associativity = 0;
+            l1i->attr->cache.type = HWLOC_OBJ_CACHE_INSTRUCTION;
+
+            hwloc_insert_object_by_cpuset(topology, l1i);
+          }
+          if (i) {
+            hwloc_debug_2args_bitmap("L%ucache %u has cpuset %s\n",
+                i, j, obj->cpuset);
+            obj->attr->cache.depth = i;
+            obj->attr->cache.size = cachesize[i];
+            obj->attr->cache.linesize = cachelinesize;
+            if (i <= sizeof(cacheways) / sizeof(cacheways[0]))
+              obj->attr->cache.associativity = cacheways[i-1];
+            else
+              obj->attr->cache.associativity = 0;
+            if (i == 1 && l1icachesize)
+              obj->attr->cache.type = HWLOC_OBJ_CACHE_DATA;
+            else
+              obj->attr->cache.type = HWLOC_OBJ_CACHE_UNIFIED;
+          } else {
+            hwloc_debug_1arg_bitmap("node %u has cpuset %s\n",
+                j, obj->cpuset);
+            obj->memory.local_memory = cachesize[i];
+            /* zeroed array of 2 page types; advertise none of them if the allocation fails */
+            obj->memory.page_types = calloc(2, sizeof(*obj->memory.page_types));
+            obj->memory.page_types_len = obj->memory.page_types ? 2 : 0;
+            if (obj->memory.page_types) {
+              obj->memory.page_types[0].size = hwloc_getpagesize();
+#ifdef HAVE__SC_LARGE_PAGESIZE
+              obj->memory.page_types[1].size = sysconf(_SC_LARGE_PAGESIZE);
+#endif
+            }
+          }
+
+          hwloc_insert_object_by_cpuset(topology, obj);
+        }
+      }
+    }
+  out:
+    if (NULL != cacheconfig) {
+        free(cacheconfig);
+    }
+    if (NULL != cachesize) {
+        free(cachesize);
+    }
+    if (NULL != cacheconfig32) {
+        free(cacheconfig32);
+    }
+  }
+
+  /* add PU objects */
+  hwloc_setup_pu_level(topology, nprocs);
+
+  hwloc_obj_add_info(topology->levels[0][0], "Backend", "Darwin");
+  if (topology->is_thissystem)
+    hwloc_add_uname_info(topology, NULL);
+  return 1;
+}
+/* No binding hook is installed on Darwin: the body is empty and both parameters are unused. */
+void
+hwloc_set_darwin_hooks(struct hwloc_binding_hooks *hooks __hwloc_attribute_unused,
+		       struct hwloc_topology_support *support __hwloc_attribute_unused)
+{
+}
+/* Allocate the Darwin backend and register hwloc_look_darwin as its discovery callback; the _data arguments are ignored. */
+static struct hwloc_backend *
+hwloc_darwin_component_instantiate(struct hwloc_disc_component *component,
+				   const void *_data1 __hwloc_attribute_unused,
+				   const void *_data2 __hwloc_attribute_unused,
+				   const void *_data3 __hwloc_attribute_unused)
+{
+  struct hwloc_backend *backend = hwloc_backend_alloc(component);
+
+  /* only hook the discovery callback when the allocation succeeded */
+  if (backend)
+    backend->discover = hwloc_look_darwin;
+  return backend; /* NULL when hwloc_backend_alloc() failed */
+}
+/* Discovery component descriptor of the Darwin backend (positional initializer, field meanings hedged below). */
+static struct hwloc_disc_component hwloc_darwin_disc_component = {
+  HWLOC_DISC_COMPONENT_TYPE_CPU,
+  "darwin",
+  HWLOC_DISC_COMPONENT_TYPE_GLOBAL, /* presumably the component types this one excludes - TODO confirm */
+  hwloc_darwin_component_instantiate,
+  50, /* presumably the priority - TODO confirm */
+  NULL
+};
+/* Generic component wrapper exporting the Darwin discovery component. */
+const struct hwloc_component hwloc_darwin_component = {
+  HWLOC_COMPONENT_ABI,
+  NULL, NULL, /* presumably the init/finalize callback slots, unused here - TODO confirm */
+  HWLOC_COMPONENT_TYPE_DISC,
+  0,
+  &hwloc_darwin_disc_component
+};
diff --git a/ext/hwloc/hwloc/topology-fake.c b/ext/hwloc/hwloc/topology-fake.c
new file mode 100644
index 0000000..e3e22a0
--- /dev/null
+++ b/ext/hwloc/hwloc/topology-fake.c
@@ -0,0 +1,61 @@
+/*
+ * Copyright © 2012-2014 Inria.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#include <private/autogen/config.h>
+#include <hwloc.h>
+#include <private/private.h>
+
+#include <stdlib.h>
+/* Instantiate hook of the fake component: optionally traces the call and never creates a backend. */
+static struct hwloc_backend *
+hwloc_fake_component_instantiate(struct hwloc_disc_component *component __hwloc_attribute_unused,
+				 const void *_data1 __hwloc_attribute_unused,
+				 const void *_data2 __hwloc_attribute_unused,
+				 const void *_data3 __hwloc_attribute_unused)
+{
+  if (getenv("HWLOC_DEBUG_FAKE_COMPONENT") != NULL)
+    printf("fake component instantiated\n");
+  return NULL; /* no backend is ever provided */
+}
+/* Discovery component descriptor of the fake (test) component. */
+static struct hwloc_disc_component hwloc_fake_disc_component = {
+  HWLOC_DISC_COMPONENT_TYPE_MISC, /* so that it's always enabled when using the OS discovery */
+  "fake",
+  0, /* nothing to exclude */
+  hwloc_fake_component_instantiate,
+  100, /* make sure it's loaded before anything conflicting excludes it */
+  NULL
+};
+/* Init hook of the fake component: returns -1 on non-zero flags or failed namespace check, 0 otherwise. */
+static int
+hwloc_fake_component_init(unsigned long flags)
+{
+  /* flags are tested first, the namespace check only runs when they are zero */
+  if (flags || hwloc_plugin_check_namespace("fake", "hwloc_backend_alloc") < 0)
+    return -1;
+
+  if (getenv("HWLOC_DEBUG_FAKE_COMPONENT") != NULL)
+    printf("fake component initialized\n");
+  return 0;
+}
+/* Finalize hook of the fake component: nothing is released here,
+ * the call is only traced when flags is zero and HWLOC_DEBUG_FAKE_COMPONENT is set. */
+static void
+hwloc_fake_component_finalize(unsigned long flags)
+{
+  /* getenv() is only consulted when flags is zero */
+  if (!flags && getenv("HWLOC_DEBUG_FAKE_COMPONENT"))
+    printf("fake component finalized\n");
+}
+/* Exported descriptor of the fake component; the extern declaration below gives it a prototype. */
+HWLOC_DECLSPEC extern const struct hwloc_component hwloc_fake_component; /* never linked statically in the core */
+
+const struct hwloc_component hwloc_fake_component = {
+  HWLOC_COMPONENT_ABI,
+  hwloc_fake_component_init, hwloc_fake_component_finalize,
+  HWLOC_COMPONENT_TYPE_DISC,
+  0,
+  &hwloc_fake_disc_component
+};
diff --git a/ext/hwloc/hwloc/topology-freebsd.cb b/ext/hwloc/hwloc/topology-freebsd.cb
new file mode 100644
index 0000000..d8d4c54
--- /dev/null
+++ b/ext/hwloc/hwloc/topology-freebsd.cb
@@ -0,0 +1,255 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2014 Inria.  All rights reserved.
+ * Copyright © 2009-2010, 2012 Université Bordeaux
+ * Copyright © 2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#include <private/autogen/config.h>
+
+#include <sys/types.h>
+#include <stdlib.h>
+#include <inttypes.h>
+#include <sys/param.h>
+#include <pthread.h>
+#ifdef HAVE_PTHREAD_NP_H
+#include <pthread_np.h>
+#endif
+#ifdef HAVE_SYS_CPUSET_H
+#include <sys/cpuset.h>
+#endif
+#ifdef HAVE_SYS_SYSCTL_H
+#include <sys/sysctl.h>
+#endif
+
+#include <hwloc.h>
+#include <private/private.h>
+#include <private/debug.h>
+
+#if defined(HAVE_SYS_CPUSET_H) && defined(HAVE_CPUSET_SETAFFINITY)
+static void
+hwloc_freebsd_bsd2hwloc(hwloc_bitmap_t hwloc_cpuset, const cpuset_t *cset)
+{ /* overwrite hwloc_cpuset with the CPUs (below CPU_SETSIZE) that are set in cset */
+  unsigned idx = 0;
+  hwloc_bitmap_zero(hwloc_cpuset);
+  for (; idx < CPU_SETSIZE; idx++)
+    if (CPU_ISSET(idx, cset))
+      hwloc_bitmap_set(hwloc_cpuset, idx);
+}
+/* Overwrite cset with the bits of hwloc_cpuset that lie below CPU_SETSIZE; higher bits are dropped. */
+static void
+hwloc_freebsd_hwloc2bsd(hwloc_const_bitmap_t hwloc_cpuset, cpuset_t *cset)
+{
+  unsigned idx = 0;
+  CPU_ZERO(cset);
+  for (; idx < CPU_SETSIZE; idx++)
+    if (hwloc_bitmap_isset(hwloc_cpuset, idx))
+      CPU_SET(idx, cset);
+}
+/* Common helper of the set_*_cpubind hooks: apply hwloc_cpuset to the entity (level, which, id)
+ * through cpuset_setaffinity(). Returns 0 on success, -1 on failure. */
+static int
+hwloc_freebsd_set_sth_affinity(hwloc_topology_t topology __hwloc_attribute_unused, cpulevel_t level, cpuwhich_t which, id_t id, hwloc_const_bitmap_t hwloc_cpuset, int flags __hwloc_attribute_unused)
+{
+  cpuset_t bsdset;
+
+  /* convert to the native representation first */
+  hwloc_freebsd_hwloc2bsd(hwloc_cpuset, &bsdset);
+
+  /* any non-zero return value is reported as -1 */
+  return cpuset_setaffinity(level, which, id, sizeof(bsdset), &bsdset) ? -1 : 0;
+}
+/* Common helper of the get_*_cpubind hooks: read the affinity of (level, which, id) into hwloc_cpuset. Returns 0 or -1. */
+static int
+hwloc_freebsd_get_sth_affinity(hwloc_topology_t topology __hwloc_attribute_unused, cpulevel_t level, cpuwhich_t which, id_t id, hwloc_bitmap_t hwloc_cpuset, int flags __hwloc_attribute_unused)
+{
+  cpuset_t bsdset;
+
+  if (cpuset_getaffinity(level, which, id, sizeof(bsdset), &bsdset) == 0) {
+    hwloc_freebsd_bsd2hwloc(hwloc_cpuset, &bsdset);
+    return 0;
+  }
+  return -1; /* hwloc_cpuset is left untouched on failure */
+}
+/* Bind the current process: CPU_WHICH_PID with id -1 (presumably "the caller", confirm in cpuset_setaffinity(2)). */
+static int
+hwloc_freebsd_set_thisproc_cpubind(hwloc_topology_t topology, hwloc_const_bitmap_t hwloc_cpuset, int flags)
+{
+  return hwloc_freebsd_set_sth_affinity(topology, CPU_LEVEL_WHICH, CPU_WHICH_PID, -1, hwloc_cpuset, flags);
+}
+/* Get the binding of the current process: CPU_WHICH_PID with id -1 (presumably "the caller", confirm in cpuset_getaffinity(2)). */
+static int
+hwloc_freebsd_get_thisproc_cpubind(hwloc_topology_t topology, hwloc_bitmap_t hwloc_cpuset, int flags)
+{
+  return hwloc_freebsd_get_sth_affinity(topology, CPU_LEVEL_WHICH, CPU_WHICH_PID, -1, hwloc_cpuset, flags);
+}
+/* Bind the current thread: CPU_WHICH_TID with id -1 (presumably "the caller", confirm in cpuset_setaffinity(2)). */
+static int
+hwloc_freebsd_set_thisthread_cpubind(hwloc_topology_t topology, hwloc_const_bitmap_t hwloc_cpuset, int flags)
+{
+  return hwloc_freebsd_set_sth_affinity(topology, CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, hwloc_cpuset, flags);
+}
+/* Get the binding of the current thread: CPU_WHICH_TID with id -1 (presumably "the caller", confirm in cpuset_getaffinity(2)). */
+static int
+hwloc_freebsd_get_thisthread_cpubind(hwloc_topology_t topology, hwloc_bitmap_t hwloc_cpuset, int flags)
+{
+  return hwloc_freebsd_get_sth_affinity(topology, CPU_LEVEL_WHICH, CPU_WHICH_TID, -1, hwloc_cpuset, flags);
+}
+/* Bind the process identified by pid (CPU_WHICH_PID). */
+static int
+hwloc_freebsd_set_proc_cpubind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_const_bitmap_t hwloc_cpuset, int flags)
+{
+  return hwloc_freebsd_set_sth_affinity(topology, CPU_LEVEL_WHICH, CPU_WHICH_PID, pid, hwloc_cpuset, flags);
+}
+/* Get the binding of the process identified by pid (CPU_WHICH_PID). */
+static int
+hwloc_freebsd_get_proc_cpubind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_bitmap_t hwloc_cpuset, int flags)
+{
+  return hwloc_freebsd_get_sth_affinity(topology, CPU_LEVEL_WHICH, CPU_WHICH_PID, pid, hwloc_cpuset, flags);
+}
+
+#ifdef hwloc_thread_t
+
+#if HAVE_DECL_PTHREAD_SETAFFINITY_NP
+#pragma weak pthread_setaffinity_np
+/* Bind the thread tid to hwloc_cpuset through pthread_setaffinity_np(). Returns 0, or -1 with errno set. */
+static int
+hwloc_freebsd_set_thread_cpubind(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_thread_t tid, hwloc_const_bitmap_t hwloc_cpuset, int flags __hwloc_attribute_unused)
+{
+  cpuset_t bsdset;
+  int rc;
+
+  /* the symbol is declared weak above: it is null when the function is not available */
+  if (!pthread_setaffinity_np) {
+    errno = ENOSYS;
+    return -1;
+  }
+
+  hwloc_freebsd_hwloc2bsd(hwloc_cpuset, &bsdset);
+  rc = pthread_setaffinity_np(tid, sizeof(bsdset), &bsdset);
+  if (rc) {
+    errno = rc; /* the return value is the error number */
+    return -1;
+  }
+
+  return 0;
+}
+#endif
+
+#if HAVE_DECL_PTHREAD_GETAFFINITY_NP
+#pragma weak pthread_getaffinity_np
+/* Read the binding of the thread tid through pthread_getaffinity_np(). Returns 0, or -1 with errno set. */
+static int
+hwloc_freebsd_get_thread_cpubind(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_thread_t tid, hwloc_bitmap_t hwloc_cpuset, int flags __hwloc_attribute_unused)
+{
+  cpuset_t bsdset;
+  int rc;
+
+  if (!pthread_getaffinity_np) { /* weak symbol, null when the function is not available */
+    errno = ENOSYS;
+    return -1;
+  }
+
+  rc = pthread_getaffinity_np(tid, sizeof(bsdset), &bsdset);
+  if (rc) {
+    errno = rc; /* the return value is the error number */
+    return -1;
+  }
+
+  hwloc_freebsd_bsd2hwloc(hwloc_cpuset, &bsdset);
+  return 0;
+}
+#endif
+#endif
+#endif
+
+#if (defined HAVE_SYSCTL) && (defined HAVE_SYS_SYSCTL_H)
+static void
+hwloc_freebsd_node_meminfo_info(struct hwloc_topology *topology)
+{
+       int mib[2] = { CTL_HW, HW_PHYSMEM };
+       unsigned long physmem; /* indeterminate unless sysctl() succeeds */
+       size_t len = sizeof(physmem);
+       if (sysctl(mib, 2, &physmem, &len, NULL, 0) == 0) /* store the physical memory size in the root object only on success */
+               topology->levels[0][0]->memory.local_memory = physmem;
+       /* we don't know anything about NUMA nodes in this backend.
+        * let another backend or the core move that memory to the right NUMA node */
+}
+#endif
+/* Discovery callback of the FreeBSD backend: creates the PU level if nobody did, then adds memory and uname information. Always returns 1. */
+static int
+hwloc_look_freebsd(struct hwloc_backend *backend)
+{
+  struct hwloc_topology *topology = backend->topology;
+  unsigned nbprocs = hwloc_fallback_nbprocessors(topology);
+
+  if (!topology->levels[0][0]->cpuset) {
+    /* Nobody (even the x86 backend) created objects yet, setup basic objects */
+    hwloc_alloc_obj_cpusets(topology->levels[0][0]);
+    hwloc_setup_pu_level(topology, nbprocs);
+  }
+
+  /* Add FreeBSD specific information (done even when another backend created the objects) */
+#if (defined HAVE_SYSCTL) && (defined HAVE_SYS_SYSCTL_H)
+  hwloc_freebsd_node_meminfo_info(topology);
+#endif
+  hwloc_obj_add_info(topology->levels[0][0], "Backend", "FreeBSD");
+  if (topology->is_thissystem)
+    hwloc_add_uname_info(topology, NULL);
+  return 1;
+}
+/* Install the FreeBSD binding hooks that the build configuration supports; nothing is installed without cpuset_setaffinity support. */
+void
+hwloc_set_freebsd_hooks(struct hwloc_binding_hooks *hooks __hwloc_attribute_unused,
+			struct hwloc_topology_support *support __hwloc_attribute_unused)
+{
+#if defined(HAVE_SYS_CPUSET_H) && defined(HAVE_CPUSET_SETAFFINITY)
+  hooks->set_thisproc_cpubind = hwloc_freebsd_set_thisproc_cpubind;
+  hooks->get_thisproc_cpubind = hwloc_freebsd_get_thisproc_cpubind;
+  hooks->set_thisthread_cpubind = hwloc_freebsd_set_thisthread_cpubind;
+  hooks->get_thisthread_cpubind = hwloc_freebsd_get_thisthread_cpubind;
+  hooks->set_proc_cpubind = hwloc_freebsd_set_proc_cpubind;
+  hooks->get_proc_cpubind = hwloc_freebsd_get_proc_cpubind;
+#ifdef hwloc_thread_t
+#if HAVE_DECL_PTHREAD_SETAFFINITY_NP
+  hooks->set_thread_cpubind = hwloc_freebsd_set_thread_cpubind;
+#endif
+#if HAVE_DECL_PTHREAD_GETAFFINITY_NP
+  hooks->get_thread_cpubind = hwloc_freebsd_get_thread_cpubind;
+#endif
+#endif
+#endif
+  /* TODO: get_last_cpu_location: find out ki_lastcpu */
+}
+/* Allocate the FreeBSD backend and register hwloc_look_freebsd as its discovery callback; the _data arguments are ignored. */
+static struct hwloc_backend *
+hwloc_freebsd_component_instantiate(struct hwloc_disc_component *component,
+				    const void *_data1 __hwloc_attribute_unused,
+				    const void *_data2 __hwloc_attribute_unused,
+				    const void *_data3 __hwloc_attribute_unused)
+{
+  struct hwloc_backend *backend = hwloc_backend_alloc(component);
+
+  /* only hook the discovery callback when the allocation succeeded */
+  if (backend)
+    backend->discover = hwloc_look_freebsd;
+  return backend; /* NULL when hwloc_backend_alloc() failed */
+}
+/* Discovery component descriptor of the FreeBSD backend (positional initializer, field meanings hedged below). */
+static struct hwloc_disc_component hwloc_freebsd_disc_component = {
+  HWLOC_DISC_COMPONENT_TYPE_CPU,
+  "freebsd",
+  HWLOC_DISC_COMPONENT_TYPE_GLOBAL, /* presumably the component types this one excludes - TODO confirm */
+  hwloc_freebsd_component_instantiate,
+  50, /* presumably the priority - TODO confirm */
+  NULL
+};
+/* Generic component wrapper exporting the FreeBSD discovery component. */
+const struct hwloc_component hwloc_freebsd_component = {
+  HWLOC_COMPONENT_ABI,
+  NULL, NULL, /* presumably the init/finalize callback slots, unused here - TODO confirm */
+  HWLOC_COMPONENT_TYPE_DISC,
+  0,
+  &hwloc_freebsd_disc_component
+};
diff --git a/ext/hwloc/hwloc/topology-linux.c b/ext/hwloc/hwloc/topology-linux.c
new file mode 100644
index 0000000..82423ff
--- /dev/null
+++ b/ext/hwloc/hwloc/topology-linux.c
@@ -0,0 +1,5133 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2015 Inria.  All rights reserved.
+ * Copyright © 2009-2013 Université Bordeaux
+ * Copyright © 2009-2014 Cisco Systems, Inc.  All rights reserved.
+ * Copyright © 2015 Intel, Inc.  All rights reserved.
+ * Copyright © 2010 IBM
+ * See COPYING in top-level directory.
+ */
+
+#include <private/autogen/config.h>
+#include <hwloc.h>
+#include <hwloc/linux.h>
+#include <private/misc.h>
+#include <private/private.h>
+#include <private/misc.h>
+#include <private/debug.h>
+
+#include <limits.h>
+#include <stdio.h>
+#include <fcntl.h>
+#include <errno.h>
+#include <assert.h>
+#ifdef HAVE_DIRENT_H
+#include <dirent.h>
+#endif
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#ifdef HAVE_LIBUDEV_H
+#include <libudev.h>
+#endif
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sched.h>
+#include <pthread.h>
+#include <sys/mman.h>
+#include <sys/syscall.h>
+#if defined HWLOC_HAVE_SET_MEMPOLICY || defined HWLOC_HAVE_MBIND
+#define migratepages migrate_pages /* workaround broken migratepages prototype in numaif.h before libnuma 2.0.2 */
+#include <numaif.h>
+#endif
+
+/* State carried by the Linux backend (filesystem root, optional udev handle,
+ * cached uname information and a few lazily-computed lookup flags). */
+struct hwloc_linux_backend_data_s {
+  int root_fd; /* The file descriptor for the file system root, used when browsing, e.g., Linux' sysfs and procfs. */
+  int is_real_fsroot; /* Boolean saying whether root_fd points to the real filesystem root of the system */
+#ifdef HAVE_LIBUDEV_H
+  struct udev *udev; /* Global udev context */
+#endif
+
+  struct utsname utsname; /* fields contain \0 when unknown */
+
+  int deprecated_classlinks_model; /* -2 if never tried, -1 if unknown, 0 if new (device contains class/name), 1 if old (device contains class:name) */
+  /* NOTE(review): the "not tried yet" value is not spelled out below; presumably -1 as for the next field -- confirm */
+  int mic_need_directlookup; /* if not tried yet, 0 if not needed, 1 if needed */
+  /* NOTE(review): this field is unsigned, so the -1 sentinel is stored as UINT_MAX */
+  unsigned mic_directlookup_id_max; /* -1 if not tried yet, 0 if none to lookup, maxid+1 otherwise */
+};
+
+
+
+/***************************
+ * Misc Abstraction layers *
+ ***************************/
+
+#if !(defined HWLOC_HAVE_SCHED_SETAFFINITY) && (defined HWLOC_HAVE__SYSCALL3)
+/* libc doesn't have support for sched_setaffinity, build system call
+ * ourselves: */
+#    include <linux/unistd.h>
+/* Provide the syscall number per architecture when the kernel headers lack it. */
+#    ifndef __NR_sched_setaffinity
+#       ifdef __i386__
+#         define __NR_sched_setaffinity 241
+#       elif defined(__x86_64__)
+#         define __NR_sched_setaffinity 203
+#       elif defined(__ia64__)
+#         define __NR_sched_setaffinity 1231
+#       elif defined(__hppa__)
+#         define __NR_sched_setaffinity 211
+#       elif defined(__alpha__)
+#         define __NR_sched_setaffinity 395
+#       elif defined(__s390__)
+#         define __NR_sched_setaffinity 239
+#       elif defined(__sparc__)
+#         define __NR_sched_setaffinity 261
+#       elif defined(__m68k__)
+#         define __NR_sched_setaffinity 311
+#       elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) || defined(__powerpc64__) || defined(__ppc64__)
+#         define __NR_sched_setaffinity 222
+#       elif defined(__arm__)
+#         define __NR_sched_setaffinity 241
+#       elif defined(__cris__)
+#         define __NR_sched_setaffinity 241
+/*#       elif defined(__mips__)
+  #         define __NR_sched_setaffinity TODO (32/64/nabi) */
+#       else
+#         warning "don't know the syscall number for sched_setaffinity on this architecture, will not support binding"
+/* Unknown architecture: make every call fail with ENOSYS. Defining the macro
+ * also disables the _syscall3 stub below through the #ifndef test. */
+#         define sched_setaffinity(pid, lg, mask) (errno = ENOSYS, -1)
+#       endif
+#    endif
+#    ifndef sched_setaffinity
+       _syscall3(int, sched_setaffinity, pid_t, pid, unsigned int, lg, const void *, mask)
+#    endif
+/* Same scheme for sched_getaffinity. */
+#    ifndef __NR_sched_getaffinity
+#       ifdef __i386__
+#         define __NR_sched_getaffinity 242
+#       elif defined(__x86_64__)
+#         define __NR_sched_getaffinity 204
+#       elif defined(__ia64__)
+#         define __NR_sched_getaffinity 1232
+#       elif defined(__hppa__)
+#         define __NR_sched_getaffinity 212
+#       elif defined(__alpha__)
+#         define __NR_sched_getaffinity 396
+#       elif defined(__s390__)
+#         define __NR_sched_getaffinity 240
+#       elif defined(__sparc__)
+#         define __NR_sched_getaffinity 260
+#       elif defined(__m68k__)
+#         define __NR_sched_getaffinity 312
+#       elif defined(__powerpc__) || defined(__ppc__) || defined(__PPC__) || defined(__powerpc64__) || defined(__ppc64__)
+#         define __NR_sched_getaffinity 223
+#       elif defined(__arm__)
+#         define __NR_sched_getaffinity 242
+#       elif defined(__cris__)
+#         define __NR_sched_getaffinity 242
+/*#       elif defined(__mips__)
+  #         define __NR_sched_getaffinity TODO (32/64/nabi) */
+#       else
+#         warning "don't know the syscall number for sched_getaffinity on this architecture, will not support getting binding"
+/* Unknown architecture: always fail with ENOSYS, and skip the stub below. */
+#         define sched_getaffinity(pid, lg, mask) (errno = ENOSYS, -1)
+#       endif
+#    endif
+#    ifndef sched_getaffinity
+       _syscall3(int, sched_getaffinity, pid_t, pid, unsigned int, lg, void *, mask)
+#    endif
+#endif
+
+/* Added for ntohl() */
+#include <arpa/inet.h>
+
+#ifdef HAVE_OPENAT
+/* Use our own filesystem functions if we have openat */
+
+/* Validate the root descriptor and turn path into a path relative to it.
+ * Returns a pointer inside path, past all its leading slashes, or NULL
+ * with errno set to EBADF when fsroot_fd is negative. */
+static const char *
+hwloc_checkat(const char *path, int fsroot_fd)
+{
+  /* a negative descriptor cannot be the base of an *at() call */
+  if (fsroot_fd < 0) {
+    errno = EBADF;
+    return NULL;
+  }
+
+  /* drop every leading '/' so that the result is relative to fsroot_fd */
+  while (*path == '/')
+    path++;
+
+  return path;
+}
+
+/* Open path read-only, interpreted relative to the root descriptor fsroot_fd.
+ * Returns the new file descriptor, or -1 on error (errno is set either by
+ * hwloc_checkat() or by openat()). */
+static int
+hwloc_openat(const char *path, int fsroot_fd)
+{
+  const char *rel = hwloc_checkat(path, fsroot_fd);
+
+  return rel ? openat (fsroot_fd, rel, O_RDONLY) : -1;
+}
+
+/* fopen()-like helper that resolves path relative to the root descriptor
+ * fsroot_fd. Only the "r" mode is supported; any other mode fails with
+ * errno set to ENOTSUP.
+ * Returns the stream, or NULL on error with errno set. */
+static FILE *
+hwloc_fopenat(const char *path, const char *mode, int fsroot_fd)
+{
+  FILE *file;
+  int fd;
+
+  if (strcmp(mode, "r")) {
+    errno = ENOTSUP;
+    return NULL;
+  }
+
+  fd = hwloc_openat (path, fsroot_fd);
+  if (fd == -1)
+    return NULL;
+
+  file = fdopen(fd, mode);
+  if (!file) {
+    /* fdopen() does not take ownership of fd when it fails:
+     * close it ourselves and keep the errno of fdopen() */
+    int saved_errno = errno;
+    close(fd);
+    errno = saved_errno;
+  }
+  return file;
+}
+
+/* access()-like helper: check path, relative to the root descriptor
+ * fsroot_fd, for the access rights given in mode.
+ * Returns what faccessat() returns, or -1 if the descriptor is invalid. */
+static int
+hwloc_accessat(const char *path, int mode, int fsroot_fd)
+{
+  const char *rel = hwloc_checkat(path, fsroot_fd);
+
+  if (rel == NULL)
+    return -1;
+  return faccessat(fsroot_fd, rel, mode, 0);
+}
+
+/* stat()-like helper: fill st for path, relative to the root descriptor
+ * fsroot_fd. flags is handed unchanged to fstatat().
+ * Returns what fstatat() returns, or -1 if the descriptor is invalid. */
+static int
+hwloc_fstatat(const char *path, struct stat *st, int flags, int fsroot_fd)
+{
+  const char *rel = hwloc_checkat(path, fsroot_fd);
+
+  if (rel == NULL)
+    return -1;
+  return fstatat(fsroot_fd, rel, st, flags);
+}
+
+/* opendir()-like helper that resolves path relative to the root descriptor
+ * fsroot_fd.
+ * Returns the directory stream, or NULL on error with errno set. */
+static DIR*
+hwloc_opendirat(const char *path, int fsroot_fd)
+{
+  DIR *dir;
+  int dir_fd;
+  const char *relative_path;
+
+  relative_path = hwloc_checkat(path, fsroot_fd);
+  if (!relative_path)
+    return NULL;
+
+  dir_fd = openat(fsroot_fd, relative_path, O_RDONLY | O_DIRECTORY);
+  if (dir_fd < 0)
+    return NULL;
+
+  dir = fdopendir(dir_fd);
+  if (!dir) {
+    /* fdopendir() leaves dir_fd open when it fails:
+     * close it ourselves and keep the errno of fdopendir() */
+    int saved_errno = errno;
+    close(dir_fd);
+    errno = saved_errno;
+  }
+  return dir;
+}
+
+#endif /* HAVE_OPENAT */
+
+/* Static inline version of fopen so that we can use openat if we have
+   it, but still preserve compiler parameter checking */
+static __hwloc_inline int
+hwloc_open(const char *p, int d __hwloc_attribute_unused)
+{
+#ifdef HAVE_OPENAT
+    return hwloc_openat(p, d);
+#else
+    return open(p, O_RDONLY);
+#endif
+}
+
+/* Static inline version of fopen: open p with mode m, relative to the root
+   descriptor d when HAVE_OPENAT is defined (in which case only mode "r" is
+   accepted by hwloc_fopenat), with plain fopen otherwise. */
+static __hwloc_inline FILE *
+hwloc_fopen(const char *p, const char *m, int d __hwloc_attribute_unused)
+{
+#ifdef HAVE_OPENAT
+    return hwloc_fopenat(p, m, d);
+#else
+    return fopen(p, m);
+#endif
+}
+
+/* Static inline version of access so that we can use openat if we have
+   it, but still preserve compiler parameter checking.
+   m is the access mode to test; d is the filesystem root descriptor,
+   ignored when HAVE_OPENAT is not defined. */
+static __hwloc_inline int
+hwloc_access(const char *p, int m, int d __hwloc_attribute_unused)
+{
+#ifdef HAVE_OPENAT
+    return hwloc_accessat(p, m, d);
+#else
+    return access(p, m);
+#endif
+}
+
+/* Static inline version of stat: fill st for p, relative to the root
+   descriptor d when HAVE_OPENAT is defined (flags 0, so symbolic links are
+   followed), with plain stat otherwise. */
+static __hwloc_inline int
+hwloc_stat(const char *p, struct stat *st, int d __hwloc_attribute_unused)
+{
+#ifdef HAVE_OPENAT
+    return hwloc_fstatat(p, st, 0, d);
+#else
+    return stat(p, st);
+#endif
+}
+
+/* Static inline version of lstat: like hwloc_stat, but a symbolic link is
+   not followed (AT_SYMLINK_NOFOLLOW when HAVE_OPENAT is defined, plain lstat
+   otherwise). */
+static __hwloc_inline int
+hwloc_lstat(const char *p, struct stat *st, int d __hwloc_attribute_unused)
+{
+#ifdef HAVE_OPENAT
+    return hwloc_fstatat(p, st, AT_SYMLINK_NOFOLLOW, d);
+#else
+    return lstat(p, st);
+#endif
+}
+
+/* Static inline version of opendir so that we can use openat if we have
+   it, but still preserve compiler parameter checking.
+   d is the filesystem root descriptor, ignored when HAVE_OPENAT is not
+   defined. Returns a directory stream or NULL. */
+static __hwloc_inline DIR *
+hwloc_opendir(const char *p, int d __hwloc_attribute_unused)
+{
+#ifdef HAVE_OPENAT
+    return hwloc_opendirat(p, d);
+#else
+    return opendir(p);
+#endif
+}
+
+
+/*****************************
+ ******* CpuBind Hooks *******
+ *****************************/
+
+/* Bind the thread tid to the CPUs listed in hwloc_set, using whichever
+ * sched_setaffinity() flavour was detected at build time.
+ * Returns 0 on success and -1 on error with errno set: EINVAL when
+ * hwloc_bitmap_last() reports no last bit for hwloc_set (dynamic cpu_set_t
+ * variant only), ENOMEM when the temporary cpu_set_t cannot be allocated,
+ * ENOSYS when no affinity interface is available, or whatever
+ * sched_setaffinity() reports. */
+int
+hwloc_linux_set_tid_cpubind(hwloc_topology_t topology __hwloc_attribute_unused, pid_t tid __hwloc_attribute_unused, hwloc_const_bitmap_t hwloc_set __hwloc_attribute_unused)
+{
+  /* The resulting binding is always strict */
+
+#if defined(HWLOC_HAVE_CPU_SET_S) && !defined(HWLOC_HAVE_OLD_SCHED_SETAFFINITY)
+  cpu_set_t *plinux_set;
+  unsigned cpu;
+  int last;
+  size_t setsize;
+  int err;
+
+  last = hwloc_bitmap_last(hwloc_set);
+  if (last == -1) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  /* dynamically-sized set, large enough for the highest requested CPU */
+  setsize = CPU_ALLOC_SIZE(last+1);
+  plinux_set = CPU_ALLOC(last+1);
+  if (!plinux_set) {
+    /* CPU_ALLOC() failed, do not dereference the NULL set below */
+    errno = ENOMEM;
+    return -1;
+  }
+
+  CPU_ZERO_S(setsize, plinux_set);
+  hwloc_bitmap_foreach_begin(cpu, hwloc_set)
+    CPU_SET_S(cpu, setsize, plinux_set);
+  hwloc_bitmap_foreach_end();
+
+  err = sched_setaffinity(tid, setsize, plinux_set);
+
+  CPU_FREE(plinux_set);
+  return err;
+#elif defined(HWLOC_HAVE_CPU_SET)
+  /* fixed-size cpu_set_t */
+  cpu_set_t linux_set;
+  unsigned cpu;
+
+  CPU_ZERO(&linux_set);
+  hwloc_bitmap_foreach_begin(cpu, hwloc_set)
+    CPU_SET(cpu, &linux_set);
+  hwloc_bitmap_foreach_end();
+
+#ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
+  return sched_setaffinity(tid, &linux_set);
+#else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
+  return sched_setaffinity(tid, sizeof(linux_set), &linux_set);
+#endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
+#elif defined(HWLOC_HAVE__SYSCALL3)
+  /* raw syscall stub: only the first unsigned long of the bitmap is passed */
+  unsigned long mask = hwloc_bitmap_to_ulong(hwloc_set);
+
+#ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
+  return sched_setaffinity(tid, (void*) &mask);
+#else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
+  return sched_setaffinity(tid, sizeof(mask), (void*) &mask);
+#endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
+#else /* !_SYSCALL3 */
+  errno = ENOSYS;
+  return -1;
+#endif /* !_SYSCALL3 */
+}
+
+#if defined(HWLOC_HAVE_CPU_SET_S) && !defined(HWLOC_HAVE_OLD_SCHED_SETAFFINITY)
+/* Parse a comma-separated list of CPU numbers and "first-last" ranges from
+ * file into set. The set is emptied first. Parsing stops at the first end of
+ * line, at end of file, or as soon as no number can be read.
+ * Returns 0 on success, or -1 with errno set to EINVAL on malformed input. */
+static int
+hwloc_linux_parse_cpuset_file(FILE *file, hwloc_bitmap_t set)
+{
+  unsigned long first, last;
+  int sep;
+
+  /* start from an empty set */
+  hwloc_bitmap_zero(set);
+
+  for (;;) {
+    if (fscanf(file, "%lu", &first) != 1)
+      break;
+
+    /* a lone number is a range of one element */
+    last = first;
+    sep = fgetc(file);
+
+    if (sep == '-') {
+      /* Range: a second number must follow */
+      if (fscanf(file, "%lu", &last) != 1) {
+        errno = EINVAL;
+        return -1;
+      }
+      sep = fgetc(file);
+    }
+
+    /* only a comma, an end of line or the end of file may follow an item */
+    if (sep != ',' && sep != '\n' && sep != EOF) {
+      errno = EINVAL;
+      return -1;
+    }
+
+    hwloc_bitmap_set_range(set, first, last);
+
+    if (sep != ',')
+      /* end of line or end of file: we are done */
+      break;
+  }
+
+  return 0;
+}
+
+/*
+ * On some kernels, sched_getaffinity requires the output size to be larger
+ * than the kernel cpu_set size (defined by CONFIG_NR_CPUS).
+ * Try sched_getaffinity on ourself until we find a nr_cpus value that makes
+ * the kernel happy.
+ *
+ * The result is cached in a function-local static, so the probing only
+ * happens on the first call. Returns the number of CPUs (bits) that was
+ * accepted by the kernel.
+ *
+ * NOTE(review): the CPU_ALLOC() result is not checked, and the loop has no
+ * exit when sched_getaffinity() keeps failing for a reason other than a
+ * too small set -- confirm whether that can happen on supported kernels.
+ */
+static int
+hwloc_linux_find_kernel_nr_cpus(hwloc_topology_t topology)
+{
+  static int _nr_cpus = -1;
+  int nr_cpus = _nr_cpus;
+  FILE *possible;
+
+  if (nr_cpus != -1)
+    /* already computed */
+    return nr_cpus;
+
+  if (topology->levels[0][0]->complete_cpuset)
+    /* start with a nr_cpus that may contain the whole topology */
+    nr_cpus = hwloc_bitmap_last(topology->levels[0][0]->complete_cpuset) + 1;
+  if (nr_cpus <= 0)
+    /* start from scratch, the topology isn't ready yet (complete_cpuset is missing (-1) or empty (0))*/
+    nr_cpus = 1;
+
+  /* this file is read with plain fopen(), i.e. from the real filesystem root */
+  possible = fopen("/sys/devices/system/cpu/possible", "r");
+  if (possible) {
+    hwloc_bitmap_t possible_bitmap = hwloc_bitmap_alloc();
+    if (hwloc_linux_parse_cpuset_file(possible, possible_bitmap) == 0) {
+      int max_possible = hwloc_bitmap_last(possible_bitmap);
+
+      hwloc_debug_bitmap("possible CPUs are %s\n", possible_bitmap);
+
+      /* never start below the highest possible CPU */
+      if (nr_cpus < max_possible + 1)
+        nr_cpus = max_possible + 1;
+    }
+    fclose(possible);
+    hwloc_bitmap_free(possible_bitmap);
+  }
+
+  /* double the size until the kernel accepts it */
+  while (1) {
+    cpu_set_t *set = CPU_ALLOC(nr_cpus);
+    size_t setsize = CPU_ALLOC_SIZE(nr_cpus);
+    int err = sched_getaffinity(0, setsize, set); /* always works, unless setsize is too small */
+    CPU_FREE(set);
+    nr_cpus = setsize * 8; /* that's the value that was actually tested */
+    if (!err)
+      /* found it */
+      return _nr_cpus = nr_cpus;
+    nr_cpus *= 2;
+  }
+}
+#endif
+
+/* Retrieve the CPU binding of the thread tid into hwloc_set, using whichever
+ * sched_getaffinity() flavour was detected at build time.
+ * Returns 0 on success and -1 on error with errno set (ENOMEM when the
+ * temporary cpu_set_t cannot be allocated, ENOSYS when no affinity interface
+ * is available, or whatever sched_getaffinity() reports). */
+int
+hwloc_linux_get_tid_cpubind(hwloc_topology_t topology __hwloc_attribute_unused, pid_t tid __hwloc_attribute_unused, hwloc_bitmap_t hwloc_set __hwloc_attribute_unused)
+{
+  int err __hwloc_attribute_unused;
+
+#if defined(HWLOC_HAVE_CPU_SET_S) && !defined(HWLOC_HAVE_OLD_SCHED_SETAFFINITY)
+  cpu_set_t *plinux_set;
+  unsigned cpu;
+  int last;
+  size_t setsize;
+  int kernel_nr_cpus;
+
+  /* find the kernel nr_cpus so as to use a large enough cpu_set size */
+  kernel_nr_cpus = hwloc_linux_find_kernel_nr_cpus(topology);
+  setsize = CPU_ALLOC_SIZE(kernel_nr_cpus);
+  plinux_set = CPU_ALLOC(kernel_nr_cpus);
+  if (!plinux_set) {
+    /* CPU_ALLOC() failed, do not hand a NULL set to the kernel or to CPU_ISSET_S() */
+    errno = ENOMEM;
+    return -1;
+  }
+
+  err = sched_getaffinity(tid, setsize, plinux_set);
+
+  if (err < 0) {
+    CPU_FREE(plinux_set);
+    return -1;
+  }
+
+  /* only report CPUs up to the last one known by the topology, if any */
+  last = -1;
+  if (topology->levels[0][0]->complete_cpuset)
+    last = hwloc_bitmap_last(topology->levels[0][0]->complete_cpuset);
+  if (last == -1)
+    /* round the maximal support number, the topology isn't ready yet (complete_cpuset is missing or empty)*/
+    last = kernel_nr_cpus-1;
+
+  hwloc_bitmap_zero(hwloc_set);
+  for(cpu=0; cpu<=(unsigned) last; cpu++)
+    if (CPU_ISSET_S(cpu, setsize, plinux_set))
+      hwloc_bitmap_set(hwloc_set, cpu);
+
+  CPU_FREE(plinux_set);
+#elif defined(HWLOC_HAVE_CPU_SET)
+  /* fixed-size cpu_set_t */
+  cpu_set_t linux_set;
+  unsigned cpu;
+
+#ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
+  err = sched_getaffinity(tid, &linux_set);
+#else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
+  err = sched_getaffinity(tid, sizeof(linux_set), &linux_set);
+#endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
+  if (err < 0)
+    return -1;
+
+  hwloc_bitmap_zero(hwloc_set);
+  for(cpu=0; cpu<CPU_SETSIZE; cpu++)
+    if (CPU_ISSET(cpu, &linux_set))
+      hwloc_bitmap_set(hwloc_set, cpu);
+#elif defined(HWLOC_HAVE__SYSCALL3)
+  /* raw syscall stub: a single unsigned long is retrieved */
+  unsigned long mask;
+
+#ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
+  err = sched_getaffinity(tid, (void*) &mask);
+#else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
+  err = sched_getaffinity(tid, sizeof(mask), (void*) &mask);
+#endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
+  if (err < 0)
+    return -1;
+
+  hwloc_bitmap_from_ulong(hwloc_set, mask);
+#else /* !_SYSCALL3 */
+  errno = ENOSYS;
+  return -1;
+#endif /* !_SYSCALL3 */
+
+  return 0;
+}
+
+/* Read the thread ids of a process from its already opened /proc task
+ * directory. On success, returns 0 and stores a malloc'ed array in *tidsp
+ * (to be freed by the caller) and its length in *nr_tidsp.
+ * On allocation failure, returns -1 with errno set to ENOMEM. */
+static int
+hwloc_linux_get_proc_tids(DIR *taskdir, unsigned *nr_tidsp, pid_t ** tidsp)
+{
+  struct stat st;
+  struct dirent *entry;
+  pid_t *list;
+  unsigned capacity = 32;
+  unsigned count = 0;
+
+  /* the link count of the directory is a good first guess for the array size */
+  if (fstat(dirfd(taskdir), &st) == 0)
+    capacity = st.st_nlink;
+
+  list = malloc(capacity*sizeof(pid_t));
+  if (!list) {
+    errno = ENOMEM;
+    return -1;
+  }
+
+  /* always restart from the first entry, the directory may have been read before */
+  rewinddir(taskdir);
+
+  for (entry = readdir(taskdir); entry != NULL; entry = readdir(taskdir)) {
+    if (count == capacity) {
+      /* array is full, make room for 8 more entries */
+      pid_t *bigger;
+      capacity += 8;
+      bigger = realloc(list, capacity*sizeof(pid_t));
+      if (!bigger) {
+        free(list);
+        errno = ENOMEM;
+        return -1;
+      }
+      list = bigger;
+    }
+    /* skip the "." and ".." entries, everything else is taken as a tid */
+    if (strcmp(entry->d_name, ".") == 0 || strcmp(entry->d_name, "..") == 0)
+      continue;
+    list[count++] = atoi(entry->d_name);
+  }
+
+  *tidsp = list;
+  *nr_tidsp = count;
+  return 0;
+}
+
+/* Per-tid callbacks.
+ * Invoked by hwloc_linux_foreach_proc_tid() once per thread: tid is the thread
+ * id, data is the opaque pointer given by the caller, and idx is the position
+ * of tid in the current thread list (0 for the first one).
+ * A negative return value marks the thread as failed; errno is then saved. */
+typedef int (*hwloc_linux_foreach_proc_tid_cb_t)(hwloc_topology_t topology, pid_t tid, void *data, int idx);
+
+/* Apply cb to every thread of process pid (the current process when pid is 0).
+ *
+ * The thread list is read from /proc/<pid>/task before and after each pass.
+ * The pass is redone when the list changed in between, or when the callback
+ * failed for some threads only; more than 10 retries give up with EAGAIN.
+ *
+ * Returns 0 on success, or -1 with errno set: EINVAL when the task directory
+ * does not exist, EAGAIN after too many retries, ENOMEM from the tid list
+ * allocation, or the errno of the last failed callback when all threads failed.
+ */
+static int
+hwloc_linux_foreach_proc_tid(hwloc_topology_t topology,
+			     pid_t pid, hwloc_linux_foreach_proc_tid_cb_t cb,
+			     void *data)
+{
+  char taskdir_path[128];
+  DIR *taskdir;
+  pid_t *tids, *newtids;
+  unsigned i, nr, newnr, failed = 0, failed_errno = 0;
+  unsigned retrynr = 0;
+  int err;
+
+  if (pid)
+    snprintf(taskdir_path, sizeof(taskdir_path), "/proc/%u/task", (unsigned) pid);
+  else
+    snprintf(taskdir_path, sizeof(taskdir_path), "/proc/self/task");
+
+  /* plain opendir(): the task directory is read from the real filesystem root */
+  taskdir = opendir(taskdir_path);
+  if (!taskdir) {
+    /* a missing directory is reported as EINVAL */
+    if (errno == ENOENT)
+      errno = EINVAL;
+    err = -1;
+    goto out;
+  }
+
+  /* read the current list of threads */
+  err = hwloc_linux_get_proc_tids(taskdir, &nr, &tids);
+  if (err < 0)
+    goto out_with_dir;
+
+ retry:
+  /* apply the callback to all threads */
+  failed=0;
+  for(i=0; i<nr; i++) {
+    err = cb(topology, tids[i], data, i);
+    if (err < 0) {
+      failed++;
+      failed_errno = errno;
+    }
+  }
+
+  /* re-read the list of thread */
+  err = hwloc_linux_get_proc_tids(taskdir, &newnr, &newtids);
+  if (err < 0)
+    goto out_with_tids;
+  /* retry if the list changed in the meantime, or we failed for *some* threads only.
+   * if we're really unlucky, all threads changed but we got the same set of tids. no way to support this.
+   */
+  if (newnr != nr || memcmp(newtids, tids, nr*sizeof(pid_t)) || (failed && failed != nr)) {
+    /* the new list replaces the old one for the next pass */
+    free(tids);
+    tids = newtids;
+    nr = newnr;
+    if (++retrynr > 10) {
+      /* we tried 10 times, it didn't work, the application is probably creating/destroying many threads, stop trying */
+      errno = EAGAIN;
+      err = -1;
+      goto out_with_tids;
+    }
+    goto retry;
+  } else {
+    free(newtids);
+  }
+
+  /* if all threads failed, return the last errno. */
+  if (failed) {
+    err = -1;
+    errno = failed_errno;
+    goto out_with_tids;
+  }
+
+  err = 0;
+ out_with_tids:
+  free(tids);
+ out_with_dir:
+  closedir(taskdir);
+ out:
+  return err;
+}
+
+/* Per-thread callback used to bind a whole process: binds tid to the
+ * bitmap carried by data (a hwloc_bitmap_t). The thread index is unused. */
+static int
+hwloc_linux_foreach_proc_tid_set_cpubind_cb(hwloc_topology_t topology, pid_t tid, void *data, int idx __hwloc_attribute_unused)
+{
+  hwloc_bitmap_t wanted = (hwloc_bitmap_t) data;
+
+  return hwloc_linux_set_tid_cpubind(topology, tid, wanted);
+}
+
+/* Bind every thread of process pid to hwloc_set by running the set_cpubind
+ * callback over its thread list. flags is ignored.
+ * Returns what hwloc_linux_foreach_proc_tid() returns. */
+static int
+hwloc_linux_set_pid_cpubind(hwloc_topology_t topology, pid_t pid, hwloc_const_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
+{
+  /* the callback only reads the bitmap, so dropping const here is harmless */
+  void *cb_data = (void*) hwloc_set;
+
+  return hwloc_linux_foreach_proc_tid(topology, pid,
+				      hwloc_linux_foreach_proc_tid_set_cpubind_cb,
+				      cb_data);
+}
+
+/* Per-tid proc_get_cpubind callback data, callback function and caller */
+struct hwloc_linux_foreach_proc_tid_get_cpubind_cb_data_s {
+  hwloc_bitmap_t cpuset; /* result accumulated over all threads */
+  hwloc_bitmap_t tidset; /* scratch bitmap receiving the binding of the current thread */
+  int flags; /* binding flags; the callback tests HWLOC_CPUBIND_STRICT */
+};
+
+/* Per-thread callback used to retrieve the binding of a whole process.
+ * Without HWLOC_CPUBIND_STRICT the thread bindings are ORed together; with it
+ * all threads must share the same binding, otherwise -1 is returned with
+ * errno set to EXDEV. Also returns -1 when the thread binding cannot be read. */
+static int
+hwloc_linux_foreach_proc_tid_get_cpubind_cb(hwloc_topology_t topology, pid_t tid, void *_data, int idx)
+{
+  struct hwloc_linux_foreach_proc_tid_get_cpubind_cb_data_s *cbdata = _data;
+  hwloc_bitmap_t result = cbdata->cpuset;
+  hwloc_bitmap_t current = cbdata->tidset;
+  int strict = cbdata->flags & HWLOC_CPUBIND_STRICT;
+
+  if (hwloc_linux_get_tid_cpubind(topology, tid, current))
+    return -1;
+
+  /* the first thread starts from an empty result */
+  if (idx == 0)
+    hwloc_bitmap_zero(result);
+
+  if (!strict) {
+    /* not STRICT: accumulate the union of all thread bindings */
+    hwloc_bitmap_or(result, result, current);
+    return 0;
+  }
+
+  /* STRICT: every thread must have the same binding */
+  if (idx == 0) {
+    /* the first thread defines the reference binding */
+    hwloc_bitmap_copy(result, current);
+    return 0;
+  }
+
+  if (hwloc_bitmap_isequal(result, current))
+    return 0;
+
+  /* a later thread is bound differently */
+  errno = EXDEV;
+  return -1;
+}
+
+/* Retrieve into hwloc_set the binding of process pid by combining the
+ * bindings of all its threads (see the get_cpubind callback for the role of
+ * flags). Returns what hwloc_linux_foreach_proc_tid() returns. */
+static int
+hwloc_linux_get_pid_cpubind(hwloc_topology_t topology, pid_t pid, hwloc_bitmap_t hwloc_set, int flags)
+{
+  struct hwloc_linux_foreach_proc_tid_get_cpubind_cb_data_s cbdata;
+  hwloc_bitmap_t scratch = hwloc_bitmap_alloc();
+  int err;
+
+  /* scratch receives each thread binding in turn, hwloc_set the final result */
+  cbdata.flags = flags;
+  cbdata.tidset = scratch;
+  cbdata.cpuset = hwloc_set;
+
+  err = hwloc_linux_foreach_proc_tid(topology, pid,
+				     hwloc_linux_foreach_proc_tid_get_cpubind_cb,
+				     (void*) &cbdata);
+
+  hwloc_bitmap_free(scratch);
+  return err;
+}
+
+/* set_proc_cpubind hook: bind process pid (topology->pid when pid is 0).
+ * With HWLOC_CPUBIND_THREAD, pid is handled as a single thread id;
+ * otherwise all threads of the process are bound. */
+static int
+hwloc_linux_set_proc_cpubind(hwloc_topology_t topology, pid_t pid, hwloc_const_bitmap_t hwloc_set, int flags)
+{
+  if (!pid)
+    pid = topology->pid;
+
+  if (!(flags & HWLOC_CPUBIND_THREAD))
+    return hwloc_linux_set_pid_cpubind(topology, pid, hwloc_set, flags);
+
+  return hwloc_linux_set_tid_cpubind(topology, pid, hwloc_set);
+}
+
+/* get_proc_cpubind hook: retrieve the binding of process pid (topology->pid
+ * when pid is 0). With HWLOC_CPUBIND_THREAD, pid is handled as a single
+ * thread id; otherwise the bindings of all threads are combined. */
+static int
+hwloc_linux_get_proc_cpubind(hwloc_topology_t topology, pid_t pid, hwloc_bitmap_t hwloc_set, int flags)
+{
+  if (!pid)
+    pid = topology->pid;
+
+  if (!(flags & HWLOC_CPUBIND_THREAD))
+    return hwloc_linux_get_pid_cpubind(topology, pid, hwloc_set, flags);
+
+  return hwloc_linux_get_tid_cpubind(topology, pid, hwloc_set);
+}
+
+/* set_thisproc_cpubind hook: bind all threads of the process designated by
+ * topology->pid. */
+static int
+hwloc_linux_set_thisproc_cpubind(hwloc_topology_t topology, hwloc_const_bitmap_t hwloc_set, int flags)
+{
+  pid_t self = topology->pid;
+
+  return hwloc_linux_set_pid_cpubind(topology, self, hwloc_set, flags);
+}
+
+/* get_thisproc_cpubind hook: retrieve the combined binding of all threads of
+ * the process designated by topology->pid. */
+static int
+hwloc_linux_get_thisproc_cpubind(hwloc_topology_t topology, hwloc_bitmap_t hwloc_set, int flags)
+{
+  pid_t self = topology->pid;
+
+  return hwloc_linux_get_pid_cpubind(topology, self, hwloc_set, flags);
+}
+
+/* set_thisthread_cpubind hook: bind the calling thread (tid 0).
+ * Fails with ENOSYS when topology->pid is set. flags is ignored. */
+static int
+hwloc_linux_set_thisthread_cpubind(hwloc_topology_t topology, hwloc_const_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
+{
+  if (!topology->pid)
+    return hwloc_linux_set_tid_cpubind(topology, 0, hwloc_set);
+
+  errno = ENOSYS;
+  return -1;
+}
+
+/* get_thisthread_cpubind hook: retrieve the binding of the calling thread
+ * (tid 0). Fails with ENOSYS when topology->pid is set. flags is ignored. */
+static int
+hwloc_linux_get_thisthread_cpubind(hwloc_topology_t topology, hwloc_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
+{
+  if (!topology->pid)
+    return hwloc_linux_get_tid_cpubind(topology, 0, hwloc_set);
+
+  errno = ENOSYS;
+  return -1;
+}
+
+#if HAVE_DECL_PTHREAD_SETAFFINITY_NP
+#pragma weak pthread_setaffinity_np
+#pragma weak pthread_self
+
+/* set_thread_cpubind hook: bind the pthread tid to hwloc_set.
+ * The calling thread is bound through hwloc_linux_set_tid_cpubind(); any
+ * other thread goes through pthread_setaffinity_np().
+ * Returns 0 on success and -1 on error with errno set: ENOSYS when
+ * topology->pid is set or when the (weak) pthread symbols are missing,
+ * EINVAL when hwloc_bitmap_last() reports no last bit for hwloc_set,
+ * ENOMEM when the temporary cpu_set_t cannot be allocated, or the error
+ * code returned by pthread_setaffinity_np(). flags is ignored. */
+static int
+hwloc_linux_set_thread_cpubind(hwloc_topology_t topology, pthread_t tid, hwloc_const_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
+{
+  int err;
+
+  if (topology->pid) {
+    errno = ENOSYS;
+    return -1;
+  }
+
+  if (!pthread_self) {
+    /* ?! Application uses set_thread_cpubind, but doesn't link against libpthread ?! */
+    errno = ENOSYS;
+    return -1;
+  }
+  if (tid == pthread_self())
+    return hwloc_linux_set_tid_cpubind(topology, 0, hwloc_set);
+
+  if (!pthread_setaffinity_np) {
+    errno = ENOSYS;
+    return -1;
+  }
+
+#if defined(HWLOC_HAVE_CPU_SET_S) && !defined(HWLOC_HAVE_OLD_SCHED_SETAFFINITY)
+  /* Use a separate block so that we can define specific variable
+     types here */
+  {
+     cpu_set_t *plinux_set;
+     unsigned cpu;
+     int last;
+     size_t setsize;
+
+     last = hwloc_bitmap_last(hwloc_set);
+     if (last == -1) {
+       errno = EINVAL;
+       return -1;
+     }
+
+     setsize = CPU_ALLOC_SIZE(last+1);
+     plinux_set = CPU_ALLOC(last+1);
+     if (!plinux_set) {
+       /* CPU_ALLOC() failed, do not dereference the NULL set below */
+       errno = ENOMEM;
+       return -1;
+     }
+
+     CPU_ZERO_S(setsize, plinux_set);
+     hwloc_bitmap_foreach_begin(cpu, hwloc_set)
+         CPU_SET_S(cpu, setsize, plinux_set);
+     hwloc_bitmap_foreach_end();
+
+     err = pthread_setaffinity_np(tid, setsize, plinux_set);
+
+     CPU_FREE(plinux_set);
+  }
+#elif defined(HWLOC_HAVE_CPU_SET)
+  /* Use a separate block so that we can define specific variable
+     types here */
+  {
+     cpu_set_t linux_set;
+     unsigned cpu;
+
+     CPU_ZERO(&linux_set);
+     hwloc_bitmap_foreach_begin(cpu, hwloc_set)
+         CPU_SET(cpu, &linux_set);
+     hwloc_bitmap_foreach_end();
+
+#ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
+     err = pthread_setaffinity_np(tid, &linux_set);
+#else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
+     err = pthread_setaffinity_np(tid, sizeof(linux_set), &linux_set);
+#endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
+  }
+#else /* CPU_SET */
+  /* Use a separate block so that we can define specific variable
+     types here */
+  {
+      unsigned long mask = hwloc_bitmap_to_ulong(hwloc_set);
+
+#ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
+      err = pthread_setaffinity_np(tid, (void*) &mask);
+#else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
+      err = pthread_setaffinity_np(tid, sizeof(mask), (void*) &mask);
+#endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
+  }
+#endif /* CPU_SET */
+
+  /* pthread functions return the error code instead of setting errno */
+  if (err) {
+    errno = err;
+    return -1;
+  }
+  return 0;
+}
+#endif /* HAVE_DECL_PTHREAD_SETAFFINITY_NP */
+
+#if HAVE_DECL_PTHREAD_GETAFFINITY_NP
+#pragma weak pthread_getaffinity_np
+#pragma weak pthread_self
+
+/* get_thread_cpubind hook: retrieve into hwloc_set the binding of the
+ * pthread tid.
+ * The calling thread is queried through hwloc_linux_get_tid_cpubind(); any
+ * other thread goes through pthread_getaffinity_np().
+ * Returns 0 on success and -1 on error with errno set: ENOSYS when
+ * topology->pid is set or when the (weak) pthread symbols are missing,
+ * ENOMEM when the temporary cpu_set_t cannot be allocated, or the error
+ * code returned by pthread_getaffinity_np(). flags is ignored. */
+static int
+hwloc_linux_get_thread_cpubind(hwloc_topology_t topology, pthread_t tid, hwloc_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
+{
+  int err;
+
+  if (topology->pid) {
+    errno = ENOSYS;
+    return -1;
+  }
+
+  if (!pthread_self) {
+    /* ?! Application uses get_thread_cpubind, but doesn't link against libpthread ?! */
+    errno = ENOSYS;
+    return -1;
+  }
+  if (tid == pthread_self())
+    return hwloc_linux_get_tid_cpubind(topology, 0, hwloc_set);
+
+  if (!pthread_getaffinity_np) {
+    errno = ENOSYS;
+    return -1;
+  }
+
+#if defined(HWLOC_HAVE_CPU_SET_S) && !defined(HWLOC_HAVE_OLD_SCHED_SETAFFINITY)
+  /* Use a separate block so that we can define specific variable
+     types here */
+  {
+     cpu_set_t *plinux_set;
+     unsigned cpu;
+     int last;
+     size_t setsize;
+
+     /* size the set after the last CPU of the topology */
+     last = hwloc_bitmap_last(topology->levels[0][0]->complete_cpuset);
+     assert (last != -1);
+
+     setsize = CPU_ALLOC_SIZE(last+1);
+     plinux_set = CPU_ALLOC(last+1);
+     if (!plinux_set) {
+       /* CPU_ALLOC() failed, do not hand a NULL set to pthread_getaffinity_np() */
+       errno = ENOMEM;
+       return -1;
+     }
+
+     err = pthread_getaffinity_np(tid, setsize, plinux_set);
+     if (err) {
+        CPU_FREE(plinux_set);
+        errno = err;
+        return -1;
+     }
+
+     hwloc_bitmap_zero(hwloc_set);
+     for(cpu=0; cpu<=(unsigned) last; cpu++)
+       if (CPU_ISSET_S(cpu, setsize, plinux_set))
+	 hwloc_bitmap_set(hwloc_set, cpu);
+
+     CPU_FREE(plinux_set);
+  }
+#elif defined(HWLOC_HAVE_CPU_SET)
+  /* Use a separate block so that we can define specific variable
+     types here */
+  {
+     cpu_set_t linux_set;
+     unsigned cpu;
+
+#ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
+     err = pthread_getaffinity_np(tid, &linux_set);
+#else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
+     err = pthread_getaffinity_np(tid, sizeof(linux_set), &linux_set);
+#endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
+     if (err) {
+        errno = err;
+        return -1;
+     }
+
+     hwloc_bitmap_zero(hwloc_set);
+     for(cpu=0; cpu<CPU_SETSIZE; cpu++)
+       if (CPU_ISSET(cpu, &linux_set))
+	 hwloc_bitmap_set(hwloc_set, cpu);
+  }
+#else /* CPU_SET */
+  /* Use a separate block so that we can define specific variable
+     types here */
+  {
+      unsigned long mask;
+
+#ifdef HWLOC_HAVE_OLD_SCHED_SETAFFINITY
+      err = pthread_getaffinity_np(tid, (void*) &mask);
+#else /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
+      err = pthread_getaffinity_np(tid, sizeof(mask), (void*) &mask);
+#endif /* HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
+      if (err) {
+        errno = err;
+        return -1;
+      }
+
+     hwloc_bitmap_from_ulong(hwloc_set, mask);
+  }
+#endif /* CPU_SET */
+
+  return 0;
+}
+#endif /* HAVE_DECL_PTHREAD_GETAFFINITY_NP */
+
+/* Store in set the single CPU on which the thread tid last ran, as reported
+ * by /proc/<tid>/stat. A tid of 0 designates the calling thread (resolved
+ * with the gettid syscall when SYS_gettid is available).
+ * Returns 0 on success, or -1 with errno set to ENOSYS when the information
+ * cannot be read or parsed. */
+int
+hwloc_linux_get_tid_last_cpu_location(hwloc_topology_t topology __hwloc_attribute_unused, pid_t tid, hwloc_bitmap_t set)
+{
+  /* read /proc/pid/stat.
+   * its second field contains the command name between parentheses,
+   * and the command itself may contain parentheses,
+   * so read the whole line and find the last closing parenthesis to find the third field.
+   */
+  char buf[1024] = "";
+  char name[64];
+  char *tmp;
+  FILE *file;
+  int i;
+
+  if (!tid) {
+#ifdef SYS_gettid
+    tid = syscall(SYS_gettid);
+#else
+    errno = ENOSYS;
+    return -1;
+#endif
+  }
+
+  snprintf(name, sizeof(name), "/proc/%lu/stat", (unsigned long) tid);
+  file = fopen(name, "r");
+  if (!file) {
+    errno = ENOSYS;
+    return -1;
+  }
+  tmp = fgets(buf, sizeof(buf), file);
+  fclose(file);
+  if (!tmp) {
+    errno = ENOSYS;
+    return -1;
+  }
+
+  tmp = strrchr(buf, ')');
+  if (!tmp || tmp[1] == '\0') {
+    /* no closing parenthesis, or the line ends right after it:
+     * skipping two characters would step over the terminating \0 */
+    errno = ENOSYS;
+    return -1;
+  }
+  /* skip ') ' to find the actual third field */
+  tmp += 2;
+
+  /* skip 36 fields (the 3rd to the 38th) */
+  for(i=0; i<36; i++) {
+    tmp = strchr(tmp, ' ');
+    if (!tmp) {
+      errno = ENOSYS;
+      return -1;
+    }
+    /* skip the ' ' itself */
+    tmp++;
+  }
+
+  /* read the last cpu in the 39th field now, a negative number is not a valid cpu */
+  if (sscanf(tmp, "%d ", &i) != 1 || i < 0) {
+    errno = ENOSYS;
+    return -1;
+  }
+
+  hwloc_bitmap_only(set, i);
+  return 0;
+}
+
+/* Per-tid proc_get_last_cpu_location callback data, callback function and caller */
+struct hwloc_linux_foreach_proc_tid_get_last_cpu_location_cb_data_s {
+  hwloc_bitmap_t cpuset; /* union of the last locations of all threads */
+  hwloc_bitmap_t tidset; /* scratch bitmap receiving the last location of the current thread */
+};
+
+/* Per-thread callback used to retrieve the last CPU location of a whole
+ * process: ORs the last location of tid into the accumulated cpuset.
+ * Returns -1 when the location of the thread cannot be read, 0 otherwise. */
+static int
+hwloc_linux_foreach_proc_tid_get_last_cpu_location_cb(hwloc_topology_t topology, pid_t tid, void *_data, int idx)
+{
+  struct hwloc_linux_foreach_proc_tid_get_last_cpu_location_cb_data_s *cbdata = _data;
+  hwloc_bitmap_t result = cbdata->cpuset;
+  hwloc_bitmap_t current = cbdata->tidset;
+
+  if (hwloc_linux_get_tid_last_cpu_location(topology, tid, current) != 0)
+    return -1;
+
+  /* the first thread starts from an empty result */
+  if (idx == 0)
+    hwloc_bitmap_zero(result);
+
+  hwloc_bitmap_or(result, result, current);
+  return 0;
+}
+
+/* Retrieve into hwloc_set the CPUs where the threads of process pid last
+ * ran, by combining the last location of each thread. flags is ignored.
+ * Returns what hwloc_linux_foreach_proc_tid() returns. */
+static int
+hwloc_linux_get_pid_last_cpu_location(hwloc_topology_t topology, pid_t pid, hwloc_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
+{
+  struct hwloc_linux_foreach_proc_tid_get_last_cpu_location_cb_data_s cbdata;
+  hwloc_bitmap_t scratch = hwloc_bitmap_alloc();
+  int err;
+
+  /* scratch receives each thread location in turn, hwloc_set the final result */
+  cbdata.tidset = scratch;
+  cbdata.cpuset = hwloc_set;
+
+  err = hwloc_linux_foreach_proc_tid(topology, pid,
+				     hwloc_linux_foreach_proc_tid_get_last_cpu_location_cb,
+				     &cbdata);
+
+  hwloc_bitmap_free(scratch);
+  return err;
+}
+
+/* get_proc_last_cpu_location hook: retrieve where process pid last ran
+ * (topology->pid when pid is 0). With HWLOC_CPUBIND_THREAD, pid is handled
+ * as a single thread id; otherwise all threads are combined. */
+static int
+hwloc_linux_get_proc_last_cpu_location(hwloc_topology_t topology, pid_t pid, hwloc_bitmap_t hwloc_set, int flags)
+{
+  if (!pid)
+    pid = topology->pid;
+
+  if (!(flags & HWLOC_CPUBIND_THREAD))
+    return hwloc_linux_get_pid_last_cpu_location(topology, pid, hwloc_set, flags);
+
+  return hwloc_linux_get_tid_last_cpu_location(topology, pid, hwloc_set);
+}
+
+/* get_thisproc_last_cpu_location hook: retrieve where the threads of the
+ * process designated by topology->pid last ran. */
+static int
+hwloc_linux_get_thisproc_last_cpu_location(hwloc_topology_t topology, hwloc_bitmap_t hwloc_set, int flags)
+{
+  pid_t self = topology->pid;
+
+  return hwloc_linux_get_pid_last_cpu_location(topology, self, hwloc_set, flags);
+}
+
+/* get_thisthread_last_cpu_location hook: retrieve where the calling thread
+ * (tid 0) last ran. Fails with ENOSYS when topology->pid is set.
+ * flags is ignored. */
+static int
+hwloc_linux_get_thisthread_last_cpu_location(hwloc_topology_t topology, hwloc_bitmap_t hwloc_set, int flags __hwloc_attribute_unused)
+{
+  if (!topology->pid)
+    return hwloc_linux_get_tid_last_cpu_location(topology, 0, hwloc_set);
+
+  errno = ENOSYS;
+  return -1;
+}
+
+
+
+/***************************
+ ****** Membind hooks ******
+ ***************************/
+
+#if defined HWLOC_HAVE_SET_MEMPOLICY || defined HWLOC_HAVE_MBIND
+/* Translate a hwloc memory binding policy into the matching Linux MPOL_*
+ * value, stored in *linuxpolicy. HWLOC_MEMBIND_BIND maps to MPOL_BIND when
+ * HWLOC_MEMBIND_STRICT is set in flags, and to MPOL_PREFERRED otherwise.
+ * Returns 0 on success, or -1 with errno set to ENOSYS for any other policy. */
+static int
+hwloc_linux_membind_policy_from_hwloc(int *linuxpolicy, hwloc_membind_policy_t policy, int flags)
+{
+  if (policy == HWLOC_MEMBIND_DEFAULT || policy == HWLOC_MEMBIND_FIRSTTOUCH) {
+    *linuxpolicy = MPOL_DEFAULT;
+  } else if (policy == HWLOC_MEMBIND_BIND) {
+    *linuxpolicy = (flags & HWLOC_MEMBIND_STRICT) ? MPOL_BIND : MPOL_PREFERRED;
+  } else if (policy == HWLOC_MEMBIND_INTERLEAVE) {
+    *linuxpolicy = MPOL_INTERLEAVE;
+  } else {
+    /* TODO: next-touch when (if?) patch applied upstream */
+    errno = ENOSYS;
+    return -1;
+  }
+  return 0;
+}
+
+/*
+ * Convert a hwloc nodeset into the array-of-unsigned-long mask expected by
+ * the Linux mempolicy system calls.
+ *
+ * On success returns 0, stores a calloc'ed mask in *linuxmaskp (to be freed
+ * by the caller) and its size in bits, rounded up to a multiple of
+ * HWLOC_BITS_PER_LONG, in *max_os_index_p.
+ * On allocation failure returns -1 with errno set to ENOMEM and leaves both
+ * outputs untouched.
+ */
+static int
+hwloc_linux_membind_mask_from_nodeset(hwloc_topology_t topology __hwloc_attribute_unused,
+				      hwloc_const_nodeset_t nodeset,
+				      unsigned *max_os_index_p, unsigned long **linuxmaskp)
+{
+  unsigned max_os_index = 0; /* highest os_index + 1 */
+  unsigned long *linuxmask;
+  unsigned i;
+  hwloc_nodeset_t linux_nodeset = NULL;
+
+  if (hwloc_bitmap_isfull(nodeset)) {
+    /* a full nodeset is replaced by a temporary set containing only node 0 */
+    linux_nodeset = hwloc_bitmap_alloc();
+    if (!linux_nodeset) {
+      /* do not hand a NULL bitmap to hwloc_bitmap_only() below */
+      errno = ENOMEM;
+      return -1;
+    }
+    hwloc_bitmap_only(linux_nodeset, 0);
+    nodeset = linux_nodeset;
+  }
+
+  max_os_index = hwloc_bitmap_last(nodeset);
+  if (max_os_index == (unsigned) -1)
+    max_os_index = 0;
+  /* add 1 to convert the last os_index into a max_os_index,
+   * and round up to the nearest multiple of BITS_PER_LONG */
+  max_os_index = (max_os_index + 1 + HWLOC_BITS_PER_LONG - 1) & ~(HWLOC_BITS_PER_LONG - 1);
+
+  linuxmask = calloc(max_os_index/HWLOC_BITS_PER_LONG, sizeof(*linuxmask));
+  if (!linuxmask) {
+    hwloc_bitmap_free(linux_nodeset);
+    errno = ENOMEM;
+    return -1;
+  }
+
+  for(i=0; i<max_os_index/HWLOC_BITS_PER_LONG; i++)
+    linuxmask[i] = hwloc_bitmap_to_ith_ulong(nodeset, i);
+
+  /* the temporary set (if any) is no longer needed once copied into the mask */
+  hwloc_bitmap_free(linux_nodeset);
+
+  *max_os_index_p = max_os_index;
+  *linuxmaskp = linuxmask;
+  return 0;
+}
+
+/*
+ * Rebuild a hwloc nodeset from a Linux node mask of max_os_index bits:
+ * the nodeset is cleared, then filled one unsigned long at a time.
+ */
+static void
+hwloc_linux_membind_mask_to_nodeset(hwloc_topology_t topology __hwloc_attribute_unused,
+				    hwloc_nodeset_t nodeset,
+				    unsigned max_os_index, const unsigned long *linuxmask)
+{
+  unsigned word;
+
+#ifdef HWLOC_DEBUG
+  /* max_os_index comes from hwloc_linux_find_kernel_max_numnodes() so it's a multiple of HWLOC_BITS_PER_LONG */
+  assert(!(max_os_index%HWLOC_BITS_PER_LONG));
+#endif
+
+  hwloc_bitmap_zero(nodeset);
+
+  word = 0;
+  while (word < max_os_index/HWLOC_BITS_PER_LONG) {
+    hwloc_bitmap_set_ith_ulong(nodeset, word, linuxmask[word]);
+    word++;
+  }
+}
+#endif /* HWLOC_HAVE_SET_MEMPOLICY || HWLOC_HAVE_MBIND */
+
+#ifdef HWLOC_HAVE_MBIND
+/*
+ * Bind the memory area [addr, addr+len) with mbind().
+ *
+ * The start address is moved back to a page boundary and len grown by the
+ * same amount before calling mbind(). For MPOL_DEFAULT no node mask is
+ * passed. With HWLOC_MEMBIND_MIGRATE, MPOL_MF_MOVE is requested when the
+ * macro exists (plus MPOL_MF_STRICT for strict requests); without it, a
+ * strict migrate request fails with ENOSYS.
+ *
+ * Returns 0 on success, a negative value on error.
+ */
+static int
+hwloc_linux_set_area_membind(hwloc_topology_t topology, const void *addr, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
+{
+  unsigned long *linuxmask;
+  unsigned max_os_index; /* highest os_index + 1 */
+  unsigned linuxflags = 0;
+  size_t misalign;
+  int linuxpolicy;
+  int rc;
+
+  /* offset of addr inside its page */
+  misalign = (uintptr_t) addr & (sysconf(_SC_PAGESIZE)-1);
+  addr = (char*) addr - misalign;
+  len += misalign;
+
+  rc = hwloc_linux_membind_policy_from_hwloc(&linuxpolicy, policy, flags);
+  if (rc < 0)
+    return rc;
+
+  if (linuxpolicy == MPOL_DEFAULT)
+    /* Some Linux kernels don't like being passed a set */
+    return mbind((void *) addr, len, linuxpolicy, NULL, 0, 0);
+
+  if (hwloc_linux_membind_mask_from_nodeset(topology, nodeset, &max_os_index, &linuxmask) < 0)
+    return -1;
+
+  if (flags & HWLOC_MEMBIND_MIGRATE) {
+#ifdef MPOL_MF_MOVE
+    linuxflags = MPOL_MF_MOVE;
+    if (flags & HWLOC_MEMBIND_STRICT)
+      linuxflags |= MPOL_MF_STRICT;
+#else
+    if (flags & HWLOC_MEMBIND_STRICT) {
+      errno = ENOSYS;
+      free(linuxmask);
+      return -1;
+    }
+#endif
+  }
+
+  rc = mbind((void *) addr, len, linuxpolicy, linuxmask, max_os_index+1, linuxflags);
+  free(linuxmask);
+
+  return rc < 0 ? -1 : 0;
+}
+
+/*
+ * Allocate len bytes with hwloc_alloc_mmap() and try to bind them with
+ * hwloc_linux_set_area_membind().
+ *
+ * Returns NULL when the mapping fails, or when binding fails and the caller
+ * asked for HWLOC_MEMBIND_STRICT (the mapping is released in that case).
+ * Without the strict flag a binding failure is tolerated and the unbound
+ * buffer is returned.
+ */
+static void *
+hwloc_linux_alloc_membind(hwloc_topology_t topology, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
+{
+  void *buffer;
+  int err;
+
+  buffer = hwloc_alloc_mmap(topology, len);
+  if (buffer == MAP_FAILED)
+    return NULL;
+
+  err = hwloc_linux_set_area_membind(topology, buffer, len, nodeset, policy, flags);
+  /* HWLOC_MEMBIND_STRICT is a flag bit: it must be looked up in flags, not in policy */
+  if (err < 0 && (flags & HWLOC_MEMBIND_STRICT)) {
+    munmap(buffer, len);
+    return NULL;
+  }
+
+  return buffer;
+}
+#endif /* HWLOC_HAVE_MBIND */
+
+#ifdef HWLOC_HAVE_SET_MEMPOLICY
+/*
+ * Set the memory policy of the calling thread with set_mempolicy().
+ *
+ * For MPOL_DEFAULT no node mask is passed. With HWLOC_MEMBIND_MIGRATE and
+ * migrate_pages() available, existing pages are first migrated from "any
+ * node" to the requested nodes; a migration failure is only fatal for strict
+ * requests. Without migrate_pages() a migrate request fails with ENOSYS.
+ *
+ * Returns 0 on success, a negative value on error.
+ */
+static int
+hwloc_linux_set_thisthread_membind(hwloc_topology_t topology, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
+{
+  unsigned max_os_index; /* highest os_index + 1 */
+  unsigned long *linuxmask;
+  int linuxpolicy;
+  int err;
+
+  err = hwloc_linux_membind_policy_from_hwloc(&linuxpolicy, policy, flags);
+  if (err < 0)
+    return err;
+
+  if (linuxpolicy == MPOL_DEFAULT)
+    /* Some Linux kernels don't like being passed a set */
+    return set_mempolicy(linuxpolicy, NULL, 0);
+
+  err = hwloc_linux_membind_mask_from_nodeset(topology, nodeset, &max_os_index, &linuxmask);
+  if (err < 0)
+    goto out;
+
+  if (flags & HWLOC_MEMBIND_MIGRATE) {
+#ifdef HWLOC_HAVE_MIGRATE_PAGES
+    size_t masksize = max_os_index/HWLOC_BITS_PER_LONG * sizeof(*linuxmask);
+    unsigned long *fullmask = malloc(masksize);
+    if (fullmask) {
+      /* source mask with every bit set: 0xff per byte (0xf would only
+       * select the four lowest nodes of each group of eight) */
+      memset(fullmask, 0xff, masksize);
+      err = migrate_pages(0, max_os_index+1, fullmask, linuxmask);
+      free(fullmask);
+    } else
+      err = -1;
+    if (err < 0 && (flags & HWLOC_MEMBIND_STRICT))
+      goto out_with_mask;
+#else
+    errno = ENOSYS;
+    goto out_with_mask;
+#endif
+  }
+
+  err = set_mempolicy(linuxpolicy, linuxmask, max_os_index+1);
+  if (err < 0)
+    goto out_with_mask;
+
+  free(linuxmask);
+  return 0;
+
+ out_with_mask:
+  free(linuxmask);
+ out:
+  return -1;
+}
+
+/*
+ * On some kernels, get_mempolicy requires the output size to be larger
+ * than the kernel MAX_NUMNODES (defined by CONFIG_NODES_SHIFT).
+ * Try get_mempolicy on ourself until we find a max_os_index value that
+ * makes the kernel happy.
+ *
+ * The value found is cached in a function-local static and returned directly
+ * by later calls. The cache is only written once the search is over, so it
+ * never holds an intermediate guess. If the probe buffer cannot be
+ * allocated, the current guess is returned without being cached.
+ */
+static int
+hwloc_linux_find_kernel_max_numnodes(hwloc_topology_t topology __hwloc_attribute_unused)
+{
+  static int max_numnodes = -1;
+  int candidate;
+  int linuxpolicy;
+
+  if (max_numnodes != -1)
+    /* already computed */
+    return max_numnodes;
+
+  /* start with a single ulong, it's the minimal and it's enough for most machines */
+  candidate = HWLOC_BITS_PER_LONG;
+  while (1) {
+    unsigned long *mask = malloc(candidate / HWLOC_BITS_PER_LONG * sizeof(*mask));
+    int err, saved_errno;
+
+    if (!mask)
+      /* cannot probe with this size, give the current guess back */
+      return candidate;
+
+    err = get_mempolicy(&linuxpolicy, mask, candidate, 0, 0);
+    /* read errno before free() gets a chance to modify it */
+    saved_errno = errno;
+    free(mask);
+
+    if (!err || saved_errno != EINVAL)
+      /* found it */
+      break;
+    candidate *= 2;
+  }
+
+  max_numnodes = candidate;
+  return max_numnodes;
+}
+
+/*
+ * Translate a Linux MPOL_* value into the hwloc policy stored in *policy.
+ * MPOL_PREFERRED and MPOL_BIND both map to HWLOC_MEMBIND_BIND.
+ * Returns 0 on success, or -1 with errno set to EINVAL for any other value.
+ */
+static int
+hwloc_linux_membind_policy_to_hwloc(int linuxpolicy, hwloc_membind_policy_t *policy)
+{
+  if (linuxpolicy == MPOL_DEFAULT) {
+    *policy = HWLOC_MEMBIND_FIRSTTOUCH;
+    return 0;
+  }
+
+  if (linuxpolicy == MPOL_PREFERRED || linuxpolicy == MPOL_BIND) {
+    *policy = HWLOC_MEMBIND_BIND;
+    return 0;
+  }
+
+  if (linuxpolicy == MPOL_INTERLEAVE) {
+    *policy = HWLOC_MEMBIND_INTERLEAVE;
+    return 0;
+  }
+
+  errno = EINVAL;
+  return -1;
+}
+
+/*
+ * Retrieve the memory policy of the calling thread with get_mempolicy().
+ *
+ * nodeset receives the topology nodeset for MPOL_DEFAULT, or the nodes
+ * reported by the kernel otherwise; *policy receives the hwloc equivalent
+ * of the Linux policy. Returns 0 on success and -1 on error.
+ */
+static int
+hwloc_linux_get_thisthread_membind(hwloc_topology_t topology, hwloc_nodeset_t nodeset, hwloc_membind_policy_t *policy, int flags __hwloc_attribute_unused)
+{
+  unsigned long *linuxmask;
+  unsigned max_os_index;
+  int linuxpolicy;
+  int status = -1;
+
+  max_os_index = hwloc_linux_find_kernel_max_numnodes(topology);
+
+  linuxmask = malloc(max_os_index/HWLOC_BITS_PER_LONG * sizeof(long));
+  if (!linuxmask) {
+    errno = ENOMEM;
+    return -1;
+  }
+
+  if (get_mempolicy(&linuxpolicy, linuxmask, max_os_index, 0, 0) >= 0) {
+    if (linuxpolicy != MPOL_DEFAULT)
+      hwloc_linux_membind_mask_to_nodeset(topology, nodeset, max_os_index, linuxmask);
+    else
+      hwloc_bitmap_copy(nodeset, hwloc_topology_get_topology_nodeset(topology));
+
+    /* nodeset is already filled at this point, even if the policy is unknown */
+    if (hwloc_linux_membind_policy_to_hwloc(linuxpolicy, policy) >= 0)
+      status = 0;
+  }
+
+  free(linuxmask);
+  return status;
+}
+
+/*
+ * Retrieve the memory policy of the area [addr, addr+len) by querying
+ * get_mempolicy(MPOL_F_ADDR) once per page.
+ *
+ * *policy receives the common policy of all pages, or HWLOC_MEMBIND_MIXED
+ * when pages disagree. nodeset receives the union of the per-page node
+ * masks, or the whole topology nodeset as soon as one page uses
+ * MPOL_DEFAULT.
+ *
+ * Returns 0 on success and -1 on error. An empty area (len == 0) has no page
+ * to query and is rejected with EINVAL instead of reporting a policy that
+ * was never read.
+ */
+static int
+hwloc_linux_get_area_membind(hwloc_topology_t topology, const void *addr, size_t len, hwloc_nodeset_t nodeset, hwloc_membind_policy_t *policy, int flags __hwloc_attribute_unused)
+{
+  unsigned max_os_index;
+  unsigned long *linuxmask, *globallinuxmask;
+  int linuxpolicy, globallinuxpolicy = 0;
+  int mixed = 0;
+  int full = 0;
+  int first = 1;
+  int pagesize = hwloc_getpagesize();
+  char *tmpaddr;
+  int err;
+  unsigned i;
+
+  if (!len) {
+    errno = EINVAL;
+    goto out;
+  }
+
+  max_os_index = hwloc_linux_find_kernel_max_numnodes(topology);
+
+  linuxmask = malloc(max_os_index/HWLOC_BITS_PER_LONG * sizeof(long));
+  if (!linuxmask) {
+    errno = ENOMEM;
+    goto out;
+  }
+  globallinuxmask = calloc(max_os_index/HWLOC_BITS_PER_LONG, sizeof(long));
+  if (!globallinuxmask) {
+    errno = ENOMEM;
+    goto out_with_masks;
+  }
+
+  /* walk the area page by page, starting from the page containing addr */
+  for(tmpaddr = (char *)((unsigned long)addr & ~(pagesize-1));
+      tmpaddr < (char *)addr + len;
+      tmpaddr += pagesize) {
+    err = get_mempolicy(&linuxpolicy, linuxmask, max_os_index, tmpaddr, MPOL_F_ADDR);
+    if (err < 0)
+      goto out_with_masks;
+
+    /* use the first found policy. if we find a different one later, set mixed to 1 */
+    if (first)
+      globallinuxpolicy = linuxpolicy;
+    else if (globallinuxpolicy != linuxpolicy)
+      mixed = 1;
+
+    /* aggregate masks, and set full to 1 if we ever find DEFAULT */
+    if (full || linuxpolicy == MPOL_DEFAULT) {
+      full = 1;
+    } else {
+      for(i=0; i<max_os_index/HWLOC_BITS_PER_LONG; i++)
+        globallinuxmask[i] |= linuxmask[i];
+    }
+
+    first = 0;
+  }
+
+  if (mixed) {
+    *policy = HWLOC_MEMBIND_MIXED;
+  } else {
+    /* all pages agree, so the policy recorded for the first page is the answer */
+    err = hwloc_linux_membind_policy_to_hwloc(globallinuxpolicy, policy);
+    if (err < 0)
+      goto out_with_masks;
+  }
+
+  if (full) {
+    hwloc_bitmap_copy(nodeset, hwloc_topology_get_topology_nodeset(topology));
+  } else {
+    hwloc_linux_membind_mask_to_nodeset(topology, nodeset, max_os_index, globallinuxmask);
+  }
+
+  free(globallinuxmask);
+  free(linuxmask);
+  return 0;
+
+ out_with_masks:
+  /* globallinuxmask is NULL here when its own allocation failed; free(NULL) is a no-op */
+  free(globallinuxmask);
+  free(linuxmask);
+ out:
+  return -1;
+}
+
+#endif /* HWLOC_HAVE_SET_MEMPOLICY */
+
+/*
+ * Install the Linux implementations of the binding hooks into hooks and
+ * mark the corresponding memory binding features in support->membind.
+ * Which memory hooks are installed depends on the build-time feature macros
+ * tested below.
+ */
+void
+hwloc_set_linuxfs_hooks(struct hwloc_binding_hooks *hooks,
+			struct hwloc_topology_support *support __hwloc_attribute_unused)
+{
+  /* CPU binding, setters */
+  hooks->set_thisproc_cpubind = hwloc_linux_set_thisproc_cpubind;
+  hooks->set_thisthread_cpubind = hwloc_linux_set_thisthread_cpubind;
+  hooks->set_proc_cpubind = hwloc_linux_set_proc_cpubind;
+#if HAVE_DECL_PTHREAD_SETAFFINITY_NP
+  hooks->set_thread_cpubind = hwloc_linux_set_thread_cpubind;
+#endif /* HAVE_DECL_PTHREAD_SETAFFINITY_NP */
+
+  /* CPU binding, getters */
+  hooks->get_thisproc_cpubind = hwloc_linux_get_thisproc_cpubind;
+  hooks->get_thisthread_cpubind = hwloc_linux_get_thisthread_cpubind;
+  hooks->get_proc_cpubind = hwloc_linux_get_proc_cpubind;
+#if HAVE_DECL_PTHREAD_GETAFFINITY_NP
+  hooks->get_thread_cpubind = hwloc_linux_get_thread_cpubind;
+#endif /* HAVE_DECL_PTHREAD_GETAFFINITY_NP */
+
+  /* last CPU location */
+  hooks->get_thisproc_last_cpu_location = hwloc_linux_get_thisproc_last_cpu_location;
+  hooks->get_thisthread_last_cpu_location = hwloc_linux_get_thisthread_last_cpu_location;
+  hooks->get_proc_last_cpu_location = hwloc_linux_get_proc_last_cpu_location;
+
+  /* memory binding of the current thread and policy queries */
+#ifdef HWLOC_HAVE_SET_MEMPOLICY
+  hooks->get_area_membind = hwloc_linux_get_area_membind;
+  hooks->get_thisthread_membind = hwloc_linux_get_thisthread_membind;
+  hooks->set_thisthread_membind = hwloc_linux_set_thisthread_membind;
+#endif /* HWLOC_HAVE_SET_MEMPOLICY */
+
+  /* memory binding of areas and bound allocation */
+#ifdef HWLOC_HAVE_MBIND
+  hooks->alloc = hwloc_alloc_mmap;
+  hooks->alloc_membind = hwloc_linux_alloc_membind;
+  hooks->free_membind = hwloc_free_mmap;
+  hooks->set_area_membind = hwloc_linux_set_area_membind;
+  support->membind->bind_membind = 1;
+  support->membind->firsttouch_membind = 1;
+  support->membind->interleave_membind = 1;
+#endif /* HWLOC_HAVE_MBIND */
+
+  /* migration needs either migrate_pages() or mbind() with MPOL_MF_MOVE */
+#if (defined HWLOC_HAVE_MIGRATE_PAGES) || ((defined HWLOC_HAVE_MBIND) && (defined MPOL_MF_MOVE))
+  support->membind->migrate_membind = 1;
+#endif
+}
+
+
+
+/*******************************************
+ *** Misc Helpers for Topology Discovery ***
+ *******************************************/
+
+/* cpuinfo array */
+/* One entry of the cpuinfo array: what is known about a single processor. */
+struct hwloc_linux_cpuinfo_proc {
+  /* physical processor index, set during hwloc_linux_parse_cpuinfo */
+  unsigned long Pproc;
+
+  /* physical core and package indexes,
+   * set during hwloc_linux_parse_cpuinfo or -1 if unknown */
+  long Pcore;
+  long Ppkg;
+
+  /* logical core and package indexes, set later, or -1 if unknown */
+  long Lcore;
+  long Lpkg;
+
+  /* custom info array and its length, set during hwloc_linux_parse_cpuinfo */
+  struct hwloc_obj_info_s *infos;
+  unsigned infos_count;
+};
+
+/*
+ * Read an unsigned decimal number from the first line of the file mappath
+ * (opened relative to fsroot_fd) into *value.
+ * Returns 0 on success. When the file cannot be opened or read, returns -1
+ * and stores -1 in *value.
+ */
+static int
+hwloc_parse_sysfs_unsigned(const char *mappath, unsigned *value, int fsroot_fd)
+{
+  char digits[11];
+  FILE *stream;
+  int status = -1;
+
+  stream = hwloc_fopen(mappath, "r", fsroot_fd);
+  if (stream) {
+    if (fgets(digits, 11, stream)) {
+      *value = strtoul(digits, NULL, 10);
+      status = 0;
+    }
+    fclose(stream);
+  }
+
+  if (status < 0)
+    *value = -1;
+
+  return status;
+}
+
+
+/* kernel cpumaps are composed of an array of 32bits cpumasks */
+#define KERNEL_CPU_MASK_BITS 32
+#define KERNEL_CPU_MAP_LEN (KERNEL_CPU_MASK_BITS/4+2)
+
+/*
+ * Parse a kernel cpumap (comma-separated hexadecimal masks, most significant
+ * first) from file into set.
+ *
+ * set is cleared first, so it is empty if nothing can be parsed.
+ * Returns 0 on success and -1 if memory cannot be allocated.
+ */
+int
+hwloc_linux_parse_cpumap_file(FILE *file, hwloc_bitmap_t set)
+{
+  unsigned long *maps;
+  unsigned long map;
+  int nr_maps = 0;
+  static int nr_maps_allocated = 8; /* only compute the power-of-two above the kernel cpumask size once */
+  /* work on a private copy of the capacity so that the size of our own
+   * buffer cannot change behind our back */
+  int nr_allocated = nr_maps_allocated;
+  int i;
+
+  /* reset to zero first */
+  hwloc_bitmap_zero(set);
+
+  maps = malloc(nr_allocated * sizeof(*maps));
+  if (!maps)
+    return -1;
+
+  /* parse the whole mask */
+  while (fscanf(file, "%lx,", &map) == 1) /* read one kernel cpu mask and the ending comma */
+    {
+      if (nr_maps == nr_allocated) {
+	/* keep the old buffer reachable until realloc is known to have succeeded */
+	unsigned long *bigger = realloc(maps, 2 * nr_allocated * sizeof(*maps));
+	if (!bigger) {
+	  free(maps);
+	  return -1;
+	}
+	maps = bigger;
+	nr_allocated *= 2;
+      }
+
+      if (!map && !nr_maps)
+	/* ignore the first map if it's empty */
+	continue;
+
+      /* masks come most significant first: shift what we have and insert at index 0 */
+      memmove(&maps[1], &maps[0], nr_maps*sizeof(*maps));
+      maps[0] = map;
+      nr_maps++;
+    }
+
+  /* remember the capacity that was needed for the next calls */
+  if (nr_allocated > nr_maps_allocated)
+    nr_maps_allocated = nr_allocated;
+
+  /* convert into a set */
+#if KERNEL_CPU_MASK_BITS == HWLOC_BITS_PER_LONG
+  for(i=0; i<nr_maps; i++)
+    hwloc_bitmap_set_ith_ulong(set, i, maps[i]);
+#else
+  /* two kernel masks are packed into each unsigned long */
+  for(i=0; i<(nr_maps+1)/2; i++) {
+    unsigned long mask;
+    mask = maps[2*i];
+    if (2*i+1<nr_maps)
+      mask |= maps[2*i+1] << KERNEL_CPU_MASK_BITS;
+    hwloc_bitmap_set_ith_ulong(set, i, mask);
+  }
+#endif
+
+  free(maps);
+
+  return 0;
+}
+
+/*
+ * Open mappath (relative to fsroot_fd) and parse it as a kernel cpumap.
+ * Returns a newly allocated bitmap that the caller must free, or NULL when
+ * the file cannot be opened, the bitmap cannot be allocated, or parsing
+ * fails.
+ */
+static hwloc_bitmap_t
+hwloc_parse_cpumap(const char *mappath, int fsroot_fd)
+{
+  hwloc_bitmap_t set;
+  FILE * file;
+
+  file = hwloc_fopen(mappath, "r", fsroot_fd);
+  if (!file)
+    return NULL;
+
+  set = hwloc_bitmap_alloc();
+  /* never pass a NULL bitmap to the parser, and do not return a set it could not fill */
+  if (set && hwloc_linux_parse_cpumap_file(file, set) < 0) {
+    hwloc_bitmap_free(set);
+    set = NULL;
+  }
+
+  fclose(file);
+  return set;
+}
+
+/*
+ * Return a malloc'ed copy of escapedpath in which every backslash followed
+ * by three octal digits (value 0 to 0377, e.g. \040 for a space, \011 for a
+ * tab, \012 for a newline, \134 for a backslash) is replaced by the
+ * character it encodes. A backslash that does not start such a sequence is
+ * copied as is.
+ *
+ * length must be at least the length of escapedpath; the result is never
+ * longer than the input. Returns NULL if memory cannot be allocated. The
+ * caller owns the returned string.
+ */
+static char *
+hwloc_strdup_mntpath(const char *escapedpath, size_t length)
+{
+  char *path = malloc(length+1);
+  const char *src = escapedpath, *tmp;
+  char *dst = path;
+
+  if (!path)
+    return NULL;
+
+  while ((tmp = strchr(src, '\\')) != NULL) {
+    /* copy what precedes the backslash verbatim */
+    memcpy(dst, src, tmp-src);
+    dst += tmp-src;
+    /* the tests short-circuit on the terminating NUL, so a truncated
+     * sequence at the end of the string is never read past its end */
+    if (tmp[1] >= '0' && tmp[1] <= '3'
+	&& tmp[2] >= '0' && tmp[2] <= '7'
+	&& tmp[3] >= '0' && tmp[3] <= '7') {
+      *dst++ = (char) (((tmp[1]-'0') << 6) | ((tmp[2]-'0') << 3) | (tmp[3]-'0'));
+      src = tmp+4;
+    } else {
+      *dst++ = '\\';
+      src = tmp+1;
+    }
+  }
+
+  strcpy(dst, src);
+
+  return path;
+}
+
+/*
+ * Scan /proc/mounts (opened relative to fsroot_fd) for the first cpuset
+ * mount point.
+ *
+ * Both outputs start as NULL. A "cpuset" filesystem, or a "cgroup"
+ * filesystem with both the "cpuset" and "noprefix" options, is reported in
+ * *cpuset_mntpnt. A "cgroup" filesystem with the "cpuset" option only is
+ * reported in *cgroup_mntpnt. The scan stops at the first match. The
+ * returned string is allocated by hwloc_strdup_mntpath().
+ */
+static void
+hwloc_find_linux_cpuset_mntpnt(char **cgroup_mntpnt, char **cpuset_mntpnt, int fsroot_fd)
+{
+#define PROC_MOUNT_LINE_LEN 512
+  char line[PROC_MOUNT_LINE_LEN];
+  FILE *fd;
+
+  *cgroup_mntpnt = NULL;
+  *cpuset_mntpnt = NULL;
+
+  /* ideally we should use setmntent, getmntent, hasmntopt and endmntent,
+   * but they do not support fsroot_fd.
+   */
+
+  fd = hwloc_fopen("/proc/mounts", "r", fsroot_fd);
+  if (!fd)
+    return;
+
+  while (fgets(line, sizeof(line), fd)) {
+    char *path;
+    char *type;
+    char *tmp;
+    size_t linelen = strlen(line);
+
+    /* remove the ending " 0 0\n" that the kernel always adds.
+     * compare lengths rather than pointers: computing line + linelen - 5
+     * for a line shorter than 5 characters would point before the array */
+    if (linelen < 5 || strcmp(line + linelen - 5, " 0 0\n"))
+      fprintf(stderr, "Unexpected end of /proc/mounts line `%s'\n", line);
+    else
+      line[linelen - 5] = '\0';
+
+    /* path is after first field and a space */
+    tmp = strchr(line, ' ');
+    if (!tmp)
+      continue;
+    path = tmp+1;
+
+    /* type is after path, which may not contain spaces since the kernel escaped them to \040
+     * (see the manpage of getmntent) */
+    tmp = strchr(path, ' ');
+    if (!tmp)
+      continue;
+    type = tmp+1;
+    /* mark the end of path to ease upcoming strdup */
+    *tmp = '\0';
+
+    if (!strncmp(type, "cpuset ", 7)) {
+      /* found a cpuset mntpnt */
+      hwloc_debug("Found cpuset mount point on %s\n", path);
+      *cpuset_mntpnt = hwloc_strdup_mntpath(path, type-path);
+      break;
+
+    } else if (!strncmp(type, "cgroup ", 7)) {
+      /* found a cgroup mntpnt */
+      char *opt, *opts;
+      int cpuset_opt = 0;
+      int noprefix_opt = 0;
+
+      /* find options */
+      tmp = strchr(type, ' ');
+      if (!tmp)
+	continue;
+      opts = tmp+1;
+
+      /* look at options */
+      while ((opt = strsep(&opts, ",")) != NULL) {
+	if (!strcmp(opt, "cpuset"))
+	  cpuset_opt = 1;
+	else if (!strcmp(opt, "noprefix"))
+	  noprefix_opt = 1;
+      }
+      if (!cpuset_opt)
+	continue;
+
+      if (noprefix_opt) {
+	hwloc_debug("Found cgroup emulating a cpuset mount point on %s\n", path);
+	*cpuset_mntpnt = hwloc_strdup_mntpath(path, type-path);
+      } else {
+	hwloc_debug("Found cgroup/cpuset mount point on %s\n", path);
+	*cgroup_mntpnt = hwloc_strdup_mntpath(path, type-path);
+      }
+      break;
+    }
+  }
+
+  fclose(fd);
+}
+
+/*
+ * Linux cpusets may be managed directly or through cgroup.
+ * With cgroup, /proc/<pid>/cgroup may contain a line of the form
+ * %d:cpuset:<name>. With plain cpusets, /proc/<pid>/cpuset contains <name>.
+ *
+ * The cgroup file is tried first, then the cpuset file. A pid of 0 means
+ * /proc/self. Returns a strdup'ed name that the caller must free, or NULL
+ * when nothing could be read.
+ */
+static char *
+hwloc_read_linux_cpuset_name(int fsroot_fd, hwloc_pid_t pid)
+{
+#define CPUSET_NAME_LEN 128
+  char cpuset_name[CPUSET_NAME_LEN];
+  FILE *stream;
+  char *eol;
+
+  /* check whether a cgroup-cpuset is enabled */
+  if (pid) {
+    char path[] = "/proc/XXXXXXXXXX/cgroup";
+    snprintf(path, sizeof(path), "/proc/%d/cgroup", pid);
+    stream = hwloc_fopen(path, "r", fsroot_fd);
+  } else {
+    stream = hwloc_fopen("/proc/self/cgroup", "r", fsroot_fd);
+  }
+
+  if (stream) {
+    /* find a cpuset line */
+#define CGROUP_LINE_LEN 256
+    char line[CGROUP_LINE_LEN];
+    while (fgets(line, sizeof(line), stream)) {
+      char *name;
+      char *colon = strchr(line, ':');
+
+      /* only lines whose first colon starts ":cpuset:" are of interest */
+      if (!colon || strncmp(colon, ":cpuset:", 8))
+	continue;
+
+      /* found a cgroup-cpuset line, return the name */
+      fclose(stream);
+      name = colon+8;
+      eol = strchr(colon, '\n');
+      if (eol)
+	*eol = '\0';
+      hwloc_debug("Found cgroup-cpuset %s\n", name);
+      return strdup(name);
+    }
+    fclose(stream);
+  }
+
+  /* check whether a cpuset is enabled */
+  if (pid) {
+    char path[] = "/proc/XXXXXXXXXX/cpuset";
+    snprintf(path, sizeof(path), "/proc/%d/cpuset", pid);
+    stream = hwloc_fopen(path, "r", fsroot_fd);
+  } else {
+    stream = hwloc_fopen("/proc/self/cpuset", "r", fsroot_fd);
+  }
+
+  if (!stream) {
+    /* found nothing */
+    hwloc_debug("%s", "No cgroup or cpuset found\n");
+    return NULL;
+  }
+
+  /* found a cpuset, return the name */
+  eol = fgets(cpuset_name, sizeof(cpuset_name), stream);
+  fclose(stream);
+  if (!eol)
+    return NULL;
+
+  eol = strchr(cpuset_name, '\n');
+  if (eol)
+    *eol = '\0';
+  hwloc_debug("Found cpuset %s\n", cpuset_name);
+  return strdup(cpuset_name);
+}
+
+/*
+ * Then, the cpuset description is available from either the cgroup or
+ * the cpuset filesystem (usually mounted in / or /dev) where there
+ * are cgroup<name>/cpuset.{cpus,mems} or cpuset<name>/{cpus,mems} files.
+ *
+ * Reads the first line of the attribute file attr_name of cpuset
+ * cpuset_name. The cgroup mount point is used when given, otherwise the
+ * cpuset mount point. Returns the line without its trailing newline in a
+ * buffer that the caller must free, or NULL when the file cannot be opened
+ * or read.
+ */
+static char *
+hwloc_read_linux_cpuset_mask(const char *cgroup_mntpnt, const char *cpuset_mntpnt, const char *cpuset_name, const char *attr_name, int fsroot_fd)
+{
+#define CPUSET_FILENAME_LEN 256
+  char cpuset_filename[CPUSET_FILENAME_LEN];
+  FILE *fd = NULL;
+  char *info = NULL, *tmp;
+  ssize_t ssize;
+  size_t size = 0;
+
+  if (cgroup_mntpnt) {
+    /* try to read the cpuset from cgroup */
+    snprintf(cpuset_filename, CPUSET_FILENAME_LEN, "%s%s/cpuset.%s", cgroup_mntpnt, cpuset_name, attr_name);
+    hwloc_debug("Trying to read cgroup file <%s>\n", cpuset_filename);
+    fd = hwloc_fopen(cpuset_filename, "r", fsroot_fd);
+  } else if (cpuset_mntpnt) {
+    /* try to read the cpuset directly */
+    snprintf(cpuset_filename, CPUSET_FILENAME_LEN, "%s%s/%s", cpuset_mntpnt, cpuset_name, attr_name);
+    hwloc_debug("Trying to read cpuset file <%s>\n", cpuset_filename);
+    fd = hwloc_fopen(cpuset_filename, "r", fsroot_fd);
+  }
+
+  if (!fd) {
+    /* found no cpuset description, ignore it */
+    hwloc_debug("Couldn't find cpuset <%s> description, ignoring\n", cpuset_name);
+    return NULL;
+  }
+
+  ssize = getline(&info, &size, fd);
+  fclose(fd);
+  if (ssize < 0) {
+    /* getline may have allocated a buffer even though it read nothing:
+     * release it instead of returning it as if it were a valid mask */
+    free(info);
+    return NULL;
+  }
+
+  tmp = strchr(info, '\n');
+  if (tmp)
+    *tmp = '\0';
+
+  return info;
+}
+
+/*
+ * Restrict admin_enabled_cpus_set to the indexes listed in the attribute
+ * attr_name of the given cpuset.
+ *
+ * The attribute is read as a comma-separated list of "N" or "N-M" segments.
+ * Every gap before and between segments is cleared from the set, then
+ * everything above the end of the last segment is removed. Nothing is done
+ * when the attribute cannot be read.
+ */
+static void
+hwloc_admin_disable_set_from_cpuset(struct hwloc_linux_backend_data_s *data,
+				    const char *cgroup_mntpnt, const char *cpuset_mntpnt, const char *cpuset_name,
+				    const char *attr_name,
+				    hwloc_bitmap_t admin_enabled_cpus_set)
+{
+  char *cpuset_mask;
+  char *segment, *sep, *endp;
+  int prevlast = -1;        /* end of the previous enabled-segment */
+  int nextfirst, nextlast;  /* bounds of the current enabled-segment */
+  hwloc_bitmap_t tmpset;
+
+  cpuset_mask = hwloc_read_linux_cpuset_mask(cgroup_mntpnt, cpuset_mntpnt, cpuset_name,
+					     attr_name, data->root_fd);
+  if (!cpuset_mask)
+    return;
+
+  hwloc_debug("found cpuset %s: %s\n", attr_name, cpuset_mask);
+
+  for (segment = cpuset_mask; ; segment = sep+1) {
+    /* cut the string at the next comma so that only one segment is parsed */
+    sep = strchr(segment, ',');
+    if (sep)
+      *sep = '\0';
+
+    /* a segment is either a single index or a "first-last" range */
+    nextfirst = strtoul(segment, &endp, 0);
+    nextlast = (*endp == '-') ? (int) strtoul(endp+1, NULL, 0) : nextfirst;
+
+    /* drop the gap between the previous segment and this one, if any */
+    if (prevlast+1 <= nextfirst-1) {
+      hwloc_debug("%s [%d:%d] excluded by cpuset\n", attr_name, prevlast+1, nextfirst-1);
+      hwloc_bitmap_clr_range(admin_enabled_cpus_set, prevlast+1, nextfirst-1);
+    }
+
+    prevlast = nextlast;
+    if (!sep)
+      break;
+  }
+
+  hwloc_debug("%s [%d:%d] excluded by cpuset\n", attr_name, prevlast+1, nextfirst-1);
+  /* no easy way to clear until the infinity */
+  tmpset = hwloc_bitmap_alloc();
+  hwloc_bitmap_set_range(tmpset, 0, prevlast);
+  hwloc_bitmap_and(admin_enabled_cpus_set, admin_enabled_cpus_set, tmpset);
+  hwloc_bitmap_free(tmpset);
+
+  free(cpuset_mask);
+}
+
+/*
+ * Parse a meminfo-style file (opened relative to data->root_fd).
+ *
+ * prefixlength characters are skipped at the beginning of each line before
+ * matching. "MemTotal:" is stored in *local_memory in bytes. Unless
+ * onlytotal is set, "Hugepagesize:" is stored in *meminfo_hugepages_size in
+ * bytes and "HugePages_Free:" in *meminfo_hugepages_count.
+ * meminfo_hugepages_count and meminfo_hugepages_size may be NULL when the
+ * caller is not interested in them. Outputs whose line is not found are left
+ * untouched. Nothing is done if the file cannot be opened.
+ */
+static void
+hwloc_parse_meminfo_info(struct hwloc_linux_backend_data_s *data,
+			 const char *path,
+			 int prefixlength,
+			 uint64_t *local_memory,
+			 uint64_t *meminfo_hugepages_count,
+			 uint64_t *meminfo_hugepages_size,
+			 int onlytotal)
+{
+  char string[64];
+  FILE *fd;
+
+  fd = hwloc_fopen(path, "r", data->root_fd);
+  if (!fd)
+    return;
+
+  while (fgets(string, sizeof(string), fd) && *string != '\0')
+    {
+      unsigned long long number;
+      const char *field;
+
+      if (strlen(string) < (size_t) prefixlength)
+        continue;
+      field = string + prefixlength;
+
+      if (sscanf(field, "MemTotal: %llu kB", &number) == 1) {
+	*local_memory = number << 10;
+	if (onlytotal)
+	  break;
+      }
+      else if (!onlytotal) {
+	if (sscanf(field, "Hugepagesize: %llu", &number) == 1) {
+	  /* a caller in this file passes NULL here for node-specific meminfo */
+	  if (meminfo_hugepages_size)
+	    *meminfo_hugepages_size = number << 10;
+	} else if (sscanf(field, "HugePages_Free: %llu", &number) == 1) {
+          /* these are free hugepages, not the total amount of huge pages */
+	  if (meminfo_hugepages_count)
+	    *meminfo_hugepages_count = number;
+	}
+      }
+    }
+
+  fclose(fd);
+}
+
+#define SYSFS_NUMA_NODE_PATH_LEN 128
+
+/*
+ * Fill memory->page_types[1..] from the hugepages-<size>kB subdirectories of
+ * dirpath (opened relative to data->root_fd).
+ *
+ * For each subdirectory whose nr_hugepages file can be read, one page type
+ * is recorded with its size and count, and the corresponding amount of
+ * memory is subtracted from *remaining_local_memory. On return
+ * memory->page_types_len is the number of entries in use, including entry 0
+ * which is left for the caller to fill.
+ *
+ * memory->page_types_len must hold the number of allocated entries on entry,
+ * as set by the callers in this file; no entry beyond it is written.
+ * Nothing is changed if dirpath cannot be opened.
+ */
+static void
+hwloc_parse_hugepages_info(struct hwloc_linux_backend_data_s *data,
+			   const char *dirpath,
+			   struct hwloc_obj_memory_s *memory,
+			   uint64_t *remaining_local_memory)
+{
+  DIR *dir;
+  struct dirent *dirent;
+  unsigned long index_ = 1;
+  unsigned long capacity = memory->page_types_len;
+  FILE *hpfd;
+  char line[64];
+  char path[SYSFS_NUMA_NODE_PATH_LEN];
+
+  dir = hwloc_opendir(dirpath, data->root_fd);
+  if (dir) {
+    while ((dirent = readdir(dir)) != NULL) {
+      int pathlen;
+
+      if (strncmp(dirent->d_name, "hugepages-", 10))
+        continue;
+      if (index_ >= capacity)
+        /* more hugepage sizes than allocated entries, ignore the remaining ones */
+        break;
+
+      pathlen = snprintf(path, sizeof(path), "%s/%s/nr_hugepages", dirpath, dirent->d_name);
+      if (pathlen < 0 || (size_t) pathlen >= sizeof(path))
+        /* the name does not fit in the buffer, do not open a truncated path */
+        continue;
+
+      memory->page_types[index_].size = strtoul(dirent->d_name+10, NULL, 0) * 1024ULL;
+      hpfd = hwloc_fopen(path, "r", data->root_fd);
+      if (hpfd) {
+        if (fgets(line, sizeof(line), hpfd)) {
+          /* these are the actual total amount of huge pages */
+          memory->page_types[index_].count = strtoull(line, NULL, 0);
+          *remaining_local_memory -= memory->page_types[index_].count * memory->page_types[index_].size;
+          index_++;
+        }
+	fclose(hpfd);
+      }
+    }
+    closedir(dir);
+    memory->page_types_len = index_;
+  }
+}
+
+/*
+ * Fill memory with the machine-wide information found in /proc/meminfo and,
+ * when present, /sys/kernel/mm/hugepages.
+ *
+ * memory->local_memory always comes from /proc/meminfo. The page_types array
+ * is only allocated when the page size is known, i.e. when the topology
+ * describes the local system or HWLOC_DEBUG_PAGESIZE is set in the
+ * environment. Entry 0 describes normal pages and the following entries
+ * describe huge pages. If the array cannot be allocated, only local_memory
+ * is reported.
+ */
+static void
+hwloc_get_procfs_meminfo_info(struct hwloc_topology *topology,
+			      struct hwloc_linux_backend_data_s *data,
+			      struct hwloc_obj_memory_s *memory)
+{
+  /* both start at 0 so that a meminfo file lacking one of the lines cannot
+   * leave a value uninitialized */
+  uint64_t meminfo_hugepages_count = 0, meminfo_hugepages_size = 0;
+  struct stat st;
+  int has_sysfs_hugepages = 0;
+  const char *pagesize_env = getenv("HWLOC_DEBUG_PAGESIZE");
+  int types = 2;
+  int err;
+
+  err = hwloc_stat("/sys/kernel/mm/hugepages", &st, data->root_fd);
+  if (!err) {
+    /* one entry for normal pages plus one per link beyond the first two
+     * (presumably "." and ".."). Entry 1 is written unconditionally below,
+     * so never allocate less than 2 entries. */
+    types = st.st_nlink > 2 ? 1 + (int) (st.st_nlink - 2) : 2;
+    has_sysfs_hugepages = 1;
+  }
+
+  if (topology->is_thissystem || pagesize_env) {
+    /* we cannot report any page_type info unless we have the page size.
+     * we'll take it either from the system if local, or from the debug env variable
+     */
+    memory->page_types = calloc(types, sizeof(*memory->page_types));
+    memory->page_types_len = memory->page_types ? types : 0;
+  }
+
+  if (topology->is_thissystem && memory->page_types) {
+    /* Get the page and hugepage sizes from sysconf */
+#ifdef HAVE__SC_LARGE_PAGESIZE
+    memory->page_types[1].size = sysconf(_SC_LARGE_PAGESIZE);
+#endif
+    memory->page_types[0].size = hwloc_getpagesize(); /* might be overwritten later by /proc/meminfo or sysfs */
+  }
+
+  hwloc_parse_meminfo_info(data, "/proc/meminfo", 0 /* no prefix */,
+			   &memory->local_memory,
+			   &meminfo_hugepages_count, &meminfo_hugepages_size,
+			   memory->page_types == NULL);
+
+  if (memory->page_types) {
+    uint64_t remaining_local_memory = memory->local_memory;
+    if (has_sysfs_hugepages) {
+      /* read from node%d/hugepages/hugepages-%skB/nr_hugepages */
+      hwloc_parse_hugepages_info(data, "/sys/kernel/mm/hugepages", memory, &remaining_local_memory);
+    } else {
+      /* use what we found in meminfo */
+      if (meminfo_hugepages_size) {
+        memory->page_types[1].size = meminfo_hugepages_size;
+        memory->page_types[1].count = meminfo_hugepages_count;
+        remaining_local_memory -= meminfo_hugepages_count * meminfo_hugepages_size;
+      } else {
+        memory->page_types_len = 1;
+      }
+    }
+
+    if (pagesize_env) {
+      /* We cannot get the pagesize if not thissystem, use the env-given one to experience the code during make check */
+      memory->page_types[0].size = strtoull(pagesize_env, NULL, 10);
+      /* If failed, use 4kB */
+      if (!memory->page_types[0].size)
+	memory->page_types[0].size = 4096;
+    }
+    assert(memory->page_types[0].size); /* from sysconf if local or from the env */
+    /* memory->page_types[1].size from sysconf if local, or from /proc/meminfo, or from sysfs,
+     * may be 0 if no hugepage support in the kernel */
+
+    memory->page_types[0].count = remaining_local_memory / memory->page_types[0].size;
+  }
+}
+
+/*
+ * Fill memory with the information of NUMA node number node, read from
+ * <syspath>/node<node>/meminfo and, when present,
+ * <syspath>/node<node>/hugepages.
+ *
+ * memory->local_memory always comes from the node meminfo file. The
+ * page_types array is only allocated when the topology describes the local
+ * system. If it cannot be allocated, only local_memory is reported.
+ */
+static void
+hwloc_sysfs_node_meminfo_info(struct hwloc_topology *topology,
+			      struct hwloc_linux_backend_data_s *data,
+			      const char *syspath, int node,
+			      struct hwloc_obj_memory_s *memory)
+{
+  char path[SYSFS_NUMA_NODE_PATH_LEN];
+  char meminfopath[SYSFS_NUMA_NODE_PATH_LEN];
+  uint64_t meminfo_hugepages_count = 0;
+  uint64_t meminfo_hugepages_size = 0;
+  uint64_t ignored_hugepages_size = 0;
+  struct stat st;
+  int has_sysfs_hugepages = 0;
+  int types = 2;
+  int err;
+
+  /* bounded formatting: a long syspath is truncated instead of overflowing the buffer */
+  snprintf(path, sizeof(path), "%s/node%d/hugepages", syspath, node);
+  err = hwloc_stat(path, &st, data->root_fd);
+  if (!err) {
+    /* one entry for normal pages plus one per link beyond the first two
+     * (presumably "." and ".."). Entry 1 may be written below, so never
+     * allocate less than 2 entries. */
+    types = st.st_nlink > 2 ? 1 + (int) (st.st_nlink - 2) : 2;
+    has_sysfs_hugepages = 1;
+  }
+
+  if (topology->is_thissystem) {
+    memory->page_types = calloc(types, sizeof(*memory->page_types));
+    memory->page_types_len = memory->page_types ? types : 0;
+  }
+
+  snprintf(meminfopath, sizeof(meminfopath), "%s/node%d/meminfo", syspath, node);
+  /* no hugepage size is expected in node-specific meminfo: give the parser
+   * a scratch variable rather than a NULL pointer it could dereference */
+  hwloc_parse_meminfo_info(data, meminfopath,
+			   snprintf(NULL, 0, "Node %d ", node),
+			   &memory->local_memory,
+			   &meminfo_hugepages_count, &ignored_hugepages_size,
+			   memory->page_types == NULL);
+
+  if (memory->page_types) {
+    uint64_t remaining_local_memory = memory->local_memory;
+    if (has_sysfs_hugepages) {
+      /* read from node%d/hugepages/hugepages-%skB/nr_hugepages */
+      hwloc_parse_hugepages_info(data, path, memory, &remaining_local_memory);
+    } else {
+      /* get hugepage size from machine-specific meminfo since there is no size in node-specific meminfo,
+       * hwloc_get_procfs_meminfo_info must have been called earlier */
+      struct hwloc_obj_memory_s *root_memory = &topology->levels[0][0]->memory;
+      /* NOTE(review): assumes the root array has at least 2 entries whenever it exists - confirm */
+      if (root_memory->page_types)
+        meminfo_hugepages_size = root_memory->page_types[1].size;
+      /* use what we found in meminfo */
+      if (meminfo_hugepages_size) {
+        memory->page_types[1].count = meminfo_hugepages_count;
+        memory->page_types[1].size = meminfo_hugepages_size;
+        remaining_local_memory -= meminfo_hugepages_count * meminfo_hugepages_size;
+      } else {
+        memory->page_types_len = 1;
+      }
+    }
+    /* update what's remaining as normal pages */
+    memory->page_types[0].size = hwloc_getpagesize();
+    memory->page_types[0].count = remaining_local_memory / memory->page_types[0].size;
+  }
+}
+
+/*
+ * Read up to nbnodes distances from the first line of distancepath (opened
+ * relative to fsroot_fd) into the distances array.
+ *
+ * Parsing stops at the first field that is not a number, at the end of the
+ * line, or once nbnodes values have been stored. Entries that could not be
+ * read are left untouched. Nothing is done if the file cannot be opened or
+ * read.
+ */
+static void
+hwloc_parse_node_distance(const char *distancepath, unsigned nbnodes, float *distances, int fsroot_fd)
+{
+  char string[4096]; /* enough for hundreds of nodes */
+  char *tmp, *next;
+  FILE * fd;
+
+  fd = hwloc_fopen(distancepath, "r", fsroot_fd);
+  if (!fd)
+    return;
+
+  if (!fgets(string, sizeof(string), fd)) {
+    fclose(fd);
+    return;
+  }
+
+  tmp = string;
+  /* testing nbnodes up front also covers nbnodes == 0, which must not store anything */
+  while (nbnodes) {
+    unsigned distance = strtoul(tmp, &next, 0);
+    if (next == tmp)
+      break;
+    *distances = (float) distance;
+    distances++;
+    nbnodes--;
+    if (*next == '\0')
+      /* end of the line: stepping over the terminator would read stale buffer contents */
+      break;
+    tmp = next+1;
+  }
+
+  fclose(fd);
+}
+
+/*
+ * Read the first line of the DMI attribute file dmi_name and attach it to
+ * obj as an info named hwloc_name.
+ *
+ * path holds the directory prefix, pathlen characters long; dmi_name is
+ * appended to it in place. Nothing is added when the file cannot be opened
+ * or its first line is empty.
+ */
+static void
+hwloc__get_dmi_id_one_info(struct hwloc_linux_backend_data_s *data,
+			   hwloc_obj_t obj,
+			   char *path, unsigned pathlen,
+			   const char *dmi_name, const char *hwloc_name)
+{
+  char dmi_line[64];
+  char *eol;
+  FILE *fd;
+  int got_line;
+
+  strcpy(path+pathlen, dmi_name);
+  fd = hwloc_fopen(path, "r", data->root_fd);
+  if (!fd)
+    return;
+
+  dmi_line[0] = '\0';
+  got_line = (fgets(dmi_line, sizeof(dmi_line), fd) != NULL);
+  fclose (fd);
+
+  if (!got_line || dmi_line[0] == '\0')
+    return;
+
+  /* strip the trailing newline, if any */
+  eol = strchr(dmi_line, '\n');
+  if (eol)
+    *eol = '\0';
+
+  hwloc_debug("found %s '%s'\n", hwloc_name, dmi_line);
+  hwloc_obj_add_info(obj, hwloc_name, dmi_line);
+}
+
+/*
+ * Attach the DMI identification strings found in sysfs to obj.
+ *
+ * /sys/devices/virtual/dmi/id is tried first, then /sys/class/dmi/id;
+ * nothing is done if neither directory can be opened. Each attribute file
+ * listed below is then read in order and added under its hwloc info name.
+ */
+static void
+hwloc__get_dmi_id_info(struct hwloc_linux_backend_data_s *data, hwloc_obj_t obj)
+{
+  static const char * const dmi_dirs[] = {
+    "/sys/devices/virtual/dmi/id",
+    "/sys/class/dmi/id"
+  };
+  static const struct {
+    const char *dmi_name;
+    const char *hwloc_name;
+  } dmi_fields[] = {
+    { "product_name", "DMIProductName" },
+    { "product_version", "DMIProductVersion" },
+    { "product_serial", "DMIProductSerial" },
+    { "product_uuid", "DMIProductUUID" },
+    { "board_vendor", "DMIBoardVendor" },
+    { "board_name", "DMIBoardName" },
+    { "board_version", "DMIBoardVersion" },
+    { "board_serial", "DMIBoardSerial" },
+    { "board_asset_tag", "DMIBoardAssetTag" },
+    { "chassis_vendor", "DMIChassisVendor" },
+    { "chassis_type", "DMIChassisType" },
+    { "chassis_version", "DMIChassisVersion" },
+    { "chassis_serial", "DMIChassisSerial" },
+    { "chassis_asset_tag", "DMIChassisAssetTag" },
+    { "bios_vendor", "DMIBIOSVendor" },
+    { "bios_version", "DMIBIOSVersion" },
+    { "bios_date", "DMIBIOSDate" },
+    { "sys_vendor", "DMISysVendor" }
+  };
+  char path[128];
+  unsigned pathlen;
+  DIR *dir = NULL;
+  unsigned i;
+
+  /* keep the first directory that can be opened */
+  for (i = 0; i < sizeof(dmi_dirs)/sizeof(dmi_dirs[0]); i++) {
+    strcpy(path, dmi_dirs[i]);
+    dir = hwloc_opendir(path, data->root_fd);
+    if (dir)
+      break;
+  }
+  if (!dir)
+    return;
+  closedir(dir);
+
+  /* turn the directory name into a prefix ending with a slash */
+  pathlen = strlen(path);
+  path[pathlen++] = '/';
+
+  for (i = 0; i < sizeof(dmi_fields)/sizeof(dmi_fields[0]); i++)
+    hwloc__get_dmi_id_one_info(data, obj, path, pathlen,
+			       dmi_fields[i].dmi_name, dmi_fields[i].hwloc_name);
+}
+
+/* Leading bytes of a raw DMI memory-device entry, as read with fread() from
+ * /sys/firmware/dmi/entries/17-<n>/raw by hwloc__get_firmware_dmi_memory_info().
+ * Every member is an unsigned char or an array of them.
+ * NOTE(review): presumably so that the struct has no padding and matches the
+ * layout of the file byte for byte -- confirm.
+ * The *_str_num members are compared against a string counter that starts
+ * at 1 for the first NUL-terminated string found after the header.
+ */
+struct hwloc_firmware_dmi_mem_device_header {
+  unsigned char type;
+  unsigned char length; /* the strings start at this offset in the raw file */
+  unsigned char handle[2];
+  unsigned char phy_mem_handle[2];
+  unsigned char mem_err_handle[2];
+  unsigned char tot_width[2];
+  unsigned char dat_width[2];
+  unsigned char size[2];
+  unsigned char ff;
+  unsigned char dev_set;
+  unsigned char dev_loc_str_num; /* string reported as "DeviceLocation" */
+  unsigned char bank_loc_str_num; /* string reported as "BankLocation" */
+  unsigned char mem_type;
+  unsigned char type_detail[2];
+  unsigned char speed[2];
+  unsigned char manuf_str_num; /* string reported as "Vendor" */
+  unsigned char serial_str_num; /* string reported as "SerialNumber" */
+  unsigned char asset_tag_str_num; /* string reported as "AssetTag" */
+  unsigned char part_num_str_num; /* string reported as "PartNumber" */
+  /* don't include the following fields since we don't need them,
+   * some old implementations may miss them.
+   */
+};
+
+/* Tell whether a DMI string carries any content.
+ * Returns 0 for an empty string and for a string made only of spaces
+ * (at least Dell uses this for empty memory slots), 1 otherwise.
+ */
+static int check_dmi_entry(const char *buffer)
+{
+  const char *cur = buffer;
+
+  /* skip leading spaces, the string is kept only if something follows them */
+  while (' ' == *cur)
+    cur++;
+  return '\0' != *cur;
+}
+
+/* Parse the strings that follow one DMI memory-device header and, if they
+ * describe an actual device, insert a Misc object below the root object.
+ *
+ * topology: topology in which the Misc object gets inserted.
+ * idx:      index passed to hwloc_alloc_setup_object() for the new object.
+ * path:     name of the raw file, only used in the error message.
+ * fd:       stream opened on the raw file; it is repositioned here and is
+ *           left open for the caller to close.
+ * header:   header already read from the beginning of the file.
+ *
+ * Strings are numbered from 1 and matched against the *_str_num fields of
+ * the header. Parsing stops at the first empty string or at the first
+ * string whose number is not referenced by any of these fields.
+ * When only location strings (or nothing) were found, no object is created.
+ */
+static void
+hwloc__get_firmware_dmi_memory_info_one(struct hwloc_topology *topology,
+					unsigned idx, const char *path, FILE *fd,
+					struct hwloc_firmware_dmi_mem_device_header *header)
+{
+  unsigned slen;
+  char buffer[256]; /* enough for memory device strings, or at least for each of them */
+  unsigned foff; /* offset in raw file */
+  unsigned boff; /* offset in buffer read from raw file */
+  unsigned i;
+  struct hwloc_obj_info_s *infos = NULL;
+  unsigned infos_count = 0;
+  hwloc_obj_t misc;
+  int foundinfo = 0;
+
+  hwloc__add_info(&infos, &infos_count, "Type", "MemoryModule");
+
+  /* start after the header */
+  foff = header->length;
+  i = 1;
+  while (1) {
+    /* read one buffer */
+    if (fseek(fd, foff, SEEK_SET) < 0)
+      break;
+    /* fgets() stops at a newline or at the end of the file and only writes a
+     * single terminator there. clear the buffer first so that walking past
+     * that point finds an empty string instead of uninitialized bytes. */
+    memset(buffer, 0, sizeof(buffer));
+    if (!fgets(buffer, sizeof(buffer), fd))
+      break;
+    /* read string at the beginning of the buffer */
+    boff = 0;
+    while (1) {
+      /* stop on empty string */
+      if (!buffer[boff])
+        goto done;
+      /* stop if this string goes to the end of the buffer */
+      slen = strlen(buffer+boff);
+      if (boff + slen+1 == sizeof(buffer))
+        break;
+      /* string didn't get truncated, should be OK */
+      if (i == header->manuf_str_num) {
+	if (check_dmi_entry(buffer+boff)) {
+	  hwloc__add_info(&infos, &infos_count, "Vendor", buffer+boff);
+	  foundinfo = 1;
+	}
+      }	else if (i == header->serial_str_num) {
+	if (check_dmi_entry(buffer+boff)) {
+	  hwloc__add_info(&infos, &infos_count, "SerialNumber", buffer+boff);
+	  foundinfo = 1;
+	}
+      } else if (i == header->asset_tag_str_num) {
+	if (check_dmi_entry(buffer+boff)) {
+	  hwloc__add_info(&infos, &infos_count, "AssetTag", buffer+boff);
+	  foundinfo = 1;
+	}
+      } else if (i == header->part_num_str_num) {
+	if (check_dmi_entry(buffer+boff)) {
+	  hwloc__add_info(&infos, &infos_count, "PartNumber", buffer+boff);
+	  foundinfo = 1;
+	}
+      } else if (i == header->dev_loc_str_num) {
+	if (check_dmi_entry(buffer+boff)) {
+	  hwloc__add_info(&infos, &infos_count, "DeviceLocation", buffer+boff);
+	  /* only a location, not an actual info about the device */
+	}
+      } else if (i == header->bank_loc_str_num) {
+	if (check_dmi_entry(buffer+boff)) {
+	  hwloc__add_info(&infos, &infos_count, "BankLocation", buffer+boff);
+	  /* only a location, not an actual info about the device */
+	}
+      } else {
+	goto done;
+      }
+      /* next string in buffer */
+      boff += slen+1;
+      i++;
+    }
+    /* couldn't read a single full string from that buffer, we're screwed */
+    if (!boff) {
+      fprintf(stderr, "hwloc could not read a DMI firmware entry #%u in %s\n",
+	      i, path);
+      break;
+    }
+    /* reread buffer after previous string */
+    foff += boff;
+  }
+
+done:
+  if (!foundinfo) {
+    /* found no actual info about the device. if there's only location info, the slot may be empty */
+    goto out_with_infos;
+  }
+
+  misc = hwloc_alloc_setup_object(HWLOC_OBJ_MISC, idx);
+  if (!misc)
+    goto out_with_infos;
+
+  hwloc__move_infos(&misc->infos, &misc->infos_count, &infos, &infos_count);
+  /* FIXME: find a way to identify the corresponding NUMA node and attach these objects there.
+   * but it means we need to parse DeviceLocation=DIMM_B4 but these vary significantly
+   * with the vendor, and it's hard to be 100% sure 'B' is second socket.
+   * Examples at http://sourceforge.net/p/edac-utils/code/HEAD/tree/trunk/src/etc/labels.db
+   * or https://github.com/grondo/edac-utils/blob/master/src/etc/labels.db
+   */
+  hwloc_insert_object_by_parent(topology, hwloc_get_root_obj(topology), misc);
+  return;
+
+ out_with_infos:
+  hwloc__free_infos(infos, infos_count);
+}
+
+/* Walk the DMI memory-device entries exported as
+ * /sys/firmware/dmi/entries/17-<i>/raw for i = 0, 1, ... and hand each of
+ * them to hwloc__get_firmware_dmi_memory_info_one().
+ * The walk stops at the first entry that cannot be opened, whose header
+ * cannot be read entirely, or whose declared length is shorter than the
+ * header fields we need.
+ */
+static void
+hwloc__get_firmware_dmi_memory_info(struct hwloc_topology *topology,
+				    struct hwloc_linux_backend_data_s *data)
+{
+  char path[128];
+  unsigned i;
+
+  for(i=0; ; i++) {
+    FILE *fd;
+    struct hwloc_firmware_dmi_mem_device_header header;
+    size_t nread;
+
+    snprintf(path, sizeof(path), "/sys/firmware/dmi/entries/17-%u/raw", i);
+    fd = hwloc_fopen(path, "r", data->root_fd);
+    if (!fd)
+      break;
+
+    nread = fread(&header, sizeof(header), 1, fd);
+    if (nread != 1 || header.length < sizeof(header)) {
+      /* short read, invalid, or too old entry/spec that doesn't contain what we need.
+       * the stream must be closed on this path too, otherwise it leaks. */
+      fclose(fd);
+      break;
+    }
+
+    hwloc__get_firmware_dmi_memory_info_one(topology, i, path, fd, &header);
+
+    fclose(fd);
+  }
+}
+
+
+/***********************************
+ ****** Device tree Discovery ******
+ ***********************************/
+
+/* Load a file into a freshly allocated buffer.
+ *
+ * The file name is "<p>/<p1>" and is opened with hwloc_open() and root_fd.
+ * The buffer is sized from the st_size reported by fstat() and filled with
+ * a single read(). On success the buffer is returned, to be released with
+ * free(), and *bytes_read (if bytes_read is not NULL) receives what read()
+ * returned. NULL is returned, and *bytes_read is left alone, when the file
+ * cannot be opened or stat'ed, when the allocation fails or when read() fails.
+ */
+static void *
+hwloc_read_raw(const char *p, const char *p1, size_t *bytes_read, int root_fd)
+{
+  char fname[256];
+  char *buf = NULL;
+  struct stat st;
+  int fd;
+
+  snprintf(fname, sizeof(fname), "%s/%s", p, p1);
+
+  fd = hwloc_open(fname, root_fd);
+  if (fd == -1)
+    return NULL;
+
+  if (0 == fstat(fd, &st)) {
+    buf = malloc(st.st_size);
+    if (buf != NULL) {
+      ssize_t got = read(fd, buf, st.st_size);
+      if (got == -1) {
+        /* nothing usable was read, do not hand out the buffer */
+        free(buf);
+        buf = NULL;
+      } else if (bytes_read != NULL) {
+        *bytes_read = got;
+      }
+    }
+  }
+
+  close(fd);
+  return buf;
+}
+
+/* Reads the entire file "<p>/<p1>" and returns it as a 0-terminated string.
+ * Returns NULL if the file cannot be read or if memory runs out.
+ * Returned pointer can be freed by using free().  */
+static char *
+hwloc_read_str(const char *p, const char *p1, int root_fd)
+{
+  size_t cb = 0;
+  char *ret = hwloc_read_raw(p, p1, &cb, root_fd);
+  char *grown;
+
+  if (NULL == ret)
+    return NULL;
+  /* the file contents already end with a terminator, nothing to add */
+  if ((0 < cb) && (0 == ret[cb-1]))
+    return ret;
+
+  /* make room for the terminator. this also covers an empty file (cb == 0),
+   * whose buffer has no byte that could be read as a string. */
+  grown = realloc(ret, cb + 1);
+  if (NULL == grown) {
+    /* realloc() left the original buffer allocated, release it */
+    free(ret);
+    return NULL;
+  }
+  grown[cb] = 0;
+  return grown;
+}
+
+/* Read a 32bit big-endian value from the file "<p>/<p1>".
+ * When hwloc_read_raw() returns exactly sizeof(*buf) bytes, they are stored
+ * in *buf after going through htonl() and sizeof(*buf) is returned.
+ * Otherwise *buf is left untouched, errno is set to EINVAL and -1 is returned.
+ */
+static ssize_t
+hwloc_read_unit32be(const char *p, const char *p1, uint32_t *buf, int root_fd)
+{
+  size_t len = 0;
+  uint32_t *raw = hwloc_read_raw(p, p1, &len, root_fd);
+  ssize_t rc;
+
+  if (len == sizeof(*buf)) {
+    *buf = htonl(*raw);
+    rc = sizeof(*buf);
+  } else {
+    /* raw is either NULL or contains something that is not a 32bit value */
+    errno = EINVAL;
+    rc = -1;
+  }
+  free(raw);
+  return rc;
+}
+
+/* Growable array describing the nodes found under /proc/device-tree/cpus
+ * by look_powerpc_device_tree(); filled by add_device_tree_cpus_node().
+ */
+typedef struct {
+  unsigned int n, allocated; /* entries used in p[], and entries p[] has room for */
+  struct {
+    hwloc_bitmap_t cpuset; /* CPUs of a "cpu" node, NULL for a "cache" node */
+    uint32_t phandle; /* handle of the node itself, (uint32_t) -1 if none was read */
+    uint32_t l2_cache; /* handle of the next cache level, (uint32_t) -1 if none was read */
+    char *name; /* strdup()'ed directory entry name of the node */
+  } *p;
+} device_tree_cpus_t;
+
+/* Append one device-tree node to the growable array in cpus.
+ *
+ * cpuset is duplicated when not NULL (NULL is stored as is), and name is
+ * duplicated with strdup(), so the caller keeps ownership of both.
+ * The array starts with room for 64 entries and doubles when full.
+ * If memory cannot be obtained, the node is dropped and cpus is left
+ * exactly as it was, so the entries already stored remain usable.
+ */
+static void
+add_device_tree_cpus_node(device_tree_cpus_t *cpus, hwloc_bitmap_t cpuset,
+    uint32_t l2_cache, uint32_t phandle, const char *name)
+{
+  char *name_copy;
+
+  if (cpus->n == cpus->allocated) {
+    unsigned int wanted = cpus->allocated ? 2 * cpus->allocated : 64;
+    void *grown = realloc(cpus->p, wanted * sizeof(cpus->p[0]));
+    if (NULL == grown)
+      /* cpus->p is still valid with its previous size, just skip this node */
+      return;
+    cpus->p = grown;
+    cpus->allocated = wanted;
+  }
+
+  /* the name is printed and freed later on, never store a NULL one */
+  name_copy = strdup(name);
+  if (NULL == name_copy)
+    return;
+
+  cpus->p[cpus->n].phandle = phandle;
+  cpus->p[cpus->n].cpuset = (NULL == cpuset)?NULL:hwloc_bitmap_dup(cpuset);
+  cpus->p[cpus->n].l2_cache = l2_cache;
+  cpus->p[cpus->n].name = name_copy;
+  ++cpus->n;
+}
+
+/* Collect the CPUs that sit below the cache identified by phandle.
+ *
+ * Every recorded node whose l2_cache field equals phandle is visited:
+ * a node with a cpuset gets it OR'ed into the cpuset argument, while a node
+ * without one (a nested cache) increments *level and is searched recursively
+ * with its own phandle.
+ * Returns 0 if at least one cpuset was accumulated, -1 otherwise, including
+ * when level or cpuset is NULL or when phandle is (uint32_t) -1.
+ */
+static int
+look_powerpc_device_tree_discover_cache(device_tree_cpus_t *cpus,
+    uint32_t phandle, unsigned int *level, hwloc_bitmap_t cpuset)
+{
+  unsigned int idx;
+  int found = -1;
+
+  if (NULL == level || NULL == cpuset || (uint32_t) -1 == phandle)
+    return -1;
+
+  for (idx = 0; idx < cpus->n; idx++) {
+    if (cpus->p[idx].l2_cache != phandle)
+      continue;
+
+    if (NULL == cpus->p[idx].cpuset) {
+      /* another cache points to this one, look for what is below it */
+      ++(*level);
+      if (look_powerpc_device_tree_discover_cache(cpus, cpus->p[idx].phandle,
+                                                  level, cpuset) == 0)
+        found = 0;
+    } else {
+      /* a cpu points to this cache */
+      hwloc_bitmap_or(cpuset, cpuset, cpus->p[idx].cpuset);
+      found = 0;
+    }
+  }
+  return found;
+}
+
+/* Create a cache object from values read in the device tree and insert it
+ * in the topology by cpuset.
+ *
+ * Nothing is inserted when cache_size is 0 or when the object cannot be
+ * allocated. A cache_sets value of 1 is considered bogus and treated as
+ * unknown. The associativity is cache_size / (cache_sets * cache_line_size)
+ * when both are known, 0 otherwise. cpuset is duplicated, the caller keeps
+ * its own copy.
+ */
+static void
+try__add_cache_from_device_tree_cpu(struct hwloc_topology *topology,
+				    unsigned int level, hwloc_obj_cache_type_t type,
+				    uint32_t cache_line_size, uint32_t cache_size, uint32_t cache_sets,
+				    hwloc_bitmap_t cpuset)
+{
+  struct hwloc_obj *c = NULL;
+
+  if (0 == cache_size)
+    return;
+
+  c = hwloc_alloc_setup_object(HWLOC_OBJ_CACHE, -1);
+  if (NULL == c)
+    return;
+  c->attr->cache.depth = level;
+  c->attr->cache.linesize = cache_line_size;
+  c->attr->cache.size = cache_size;
+  c->attr->cache.type = type;
+  if (cache_sets == 1)
+    /* likely wrong, make it unknown */
+    cache_sets = 0;
+  if (cache_sets && cache_line_size)
+    /* divide twice instead of dividing by the product: the result is the same
+     * for unsigned integers, and the 32bit product could wrap around to 0. */
+    c->attr->cache.associativity = cache_size / cache_sets / cache_line_size;
+  else
+    c->attr->cache.associativity = 0;
+  c->cpuset = hwloc_bitmap_dup(cpuset);
+  hwloc_debug_2args_bitmap("cache (%s) depth %d has cpuset %s\n",
+			   type == HWLOC_OBJ_CACHE_UNIFIED ? "unified" : (type == HWLOC_OBJ_CACHE_DATA ? "data" : "instruction"),
+			   level, c->cpuset);
+  hwloc_insert_object_by_cpuset(topology, c);
+}
+
+/* Read the cache properties of one device-tree node and add the matching
+ * cache object(s) at the given level for the given cpuset.
+ *
+ * Properties found in the node directory `cpu`:
+ *   d-cache-line-size, d-cache-size, d-cache-sets - read, sizes in bytes
+ *   i-cache-*                                     - same for instructions
+ *   cache-unified - only exists if data and instruction caches are unified
+ *   d-cache-block-size, d-tlb-sets, d-tlb-size (always 0 on power6),
+ *   i-tlb-*                                       - ignored
+ * A property that cannot be read keeps its default value of 0.
+ */
+static void
+try_add_cache_from_device_tree_cpu(struct hwloc_topology *topology,
+				   struct hwloc_linux_backend_data_s *data,
+				   const char *cpu, unsigned int level, hwloc_bitmap_t cpuset)
+{
+  uint32_t dline = 0, dsize = 0, dsets = 0;
+  uint32_t iline = 0, isize = 0, isets = 0;
+  char unified_path[1024];
+  struct stat st;
+  int unified;
+  int root_fd = data->root_fd;
+
+  snprintf(unified_path, sizeof(unified_path), "%s/cache-unified", cpu);
+  unified = (hwloc_stat(unified_path, &st, root_fd) == 0);
+
+  hwloc_read_unit32be(cpu, "d-cache-line-size", &dline, root_fd);
+  hwloc_read_unit32be(cpu, "d-cache-size", &dsize, root_fd);
+  hwloc_read_unit32be(cpu, "d-cache-sets", &dsets, root_fd);
+  hwloc_read_unit32be(cpu, "i-cache-line-size", &iline, root_fd);
+  hwloc_read_unit32be(cpu, "i-cache-size", &isize, root_fd);
+  hwloc_read_unit32be(cpu, "i-cache-sets", &isets, root_fd);
+
+  if (unified) {
+    /* a unified cache is described by the d-cache-* properties */
+    try__add_cache_from_device_tree_cpu(topology, level, HWLOC_OBJ_CACHE_UNIFIED,
+					dline, dsize, dsets, cpuset);
+  } else {
+    try__add_cache_from_device_tree_cpu(topology, level, HWLOC_OBJ_CACHE_INSTRUCTION,
+					iline, isize, isets, cpuset);
+    try__add_cache_from_device_tree_cpu(topology, level, HWLOC_OBJ_CACHE_DATA,
+					dline, dsize, dsets, cpuset);
+  }
+}
+
+/*
+ * Discovers L1/L2/L3 cache information on IBM PowerPC systems for old kernels (RHEL5.*)
+ * which provide NUMA nodes information without any details
+ *
+ * Only runs when the machine name starts with "ppc" and /proc/device-tree/cpus
+ * can be opened. A first pass over that directory records every "cpu" and
+ * "cache" node, and inserts a core and its L1 cache(s) for each "cpu" node.
+ * A second pass walks the recorded "cache" nodes to find their level and
+ * cpuset, and inserts them as L2 and above.
+ */
+static void
+look_powerpc_device_tree(struct hwloc_topology *topology,
+			 struct hwloc_linux_backend_data_s *data)
+{
+  device_tree_cpus_t cpus;
+  const char ofroot[] = "/proc/device-tree/cpus";
+  unsigned int i;
+  int root_fd = data->root_fd;
+  DIR *dt;
+  struct dirent *dirent;
+
+  /* only works for Power so far, and not useful on ARM.
+   * check this before opening the directory so that there is nothing to close. */
+  if (strncmp(data->utsname.machine, "ppc", 3))
+    return;
+
+  dt = hwloc_opendir(ofroot, root_fd);
+  if (NULL == dt)
+    return;
+
+  cpus.n = 0;
+  cpus.p = NULL;
+  cpus.allocated = 0;
+
+  while (NULL != (dirent = readdir(dt))) {
+    char cpu[256];
+    char *device_type;
+    /* -1 means that the property could not be read */
+    uint32_t reg = -1, l2_cache = -1, phandle = -1;
+
+    if ('.' == dirent->d_name[0])
+      continue;
+
+    snprintf(cpu, sizeof(cpu), "%s/%s", ofroot, dirent->d_name);
+
+    device_type = hwloc_read_str(cpu, "device_type", root_fd);
+    if (NULL == device_type)
+      continue;
+
+    hwloc_read_unit32be(cpu, "reg", &reg, root_fd);
+    /* the link to the next cache level and the node handle may be exported
+     * under several property names, take the first one that can be read */
+    if (hwloc_read_unit32be(cpu, "next-level-cache", &l2_cache, root_fd) == -1)
+      hwloc_read_unit32be(cpu, "l2-cache", &l2_cache, root_fd);
+    if (hwloc_read_unit32be(cpu, "phandle", &phandle, root_fd) == -1)
+      if (hwloc_read_unit32be(cpu, "ibm,phandle", &phandle, root_fd) == -1)
+        hwloc_read_unit32be(cpu, "linux,phandle", &phandle, root_fd);
+
+    if (0 == strcmp(device_type, "cache")) {
+      add_device_tree_cpus_node(&cpus, NULL, l2_cache, phandle, dirent->d_name);
+    }
+    else if (0 == strcmp(device_type, "cpu")) {
+      /* Found CPU */
+      hwloc_bitmap_t cpuset = NULL;
+      size_t cb = 0;
+      uint32_t *threads = hwloc_read_raw(cpu, "ibm,ppc-interrupt-server#s", &cb, root_fd);
+      uint32_t nthreads = cb / sizeof(threads[0]);
+
+      if (NULL != threads) {
+        /* one big-endian PU number per hardware thread, only keep known PUs */
+        cpuset = hwloc_bitmap_alloc();
+        for (i = 0; i < nthreads; ++i) {
+          if (hwloc_bitmap_isset(topology->levels[0][0]->complete_cpuset, ntohl(threads[i])))
+            hwloc_bitmap_set(cpuset, ntohl(threads[i]));
+        }
+        free(threads);
+      } else if ((unsigned int)-1 != reg) {
+        /* Doesn't work on ARM because cpu "reg" do not start at 0.
+	 * We know the first cpu "reg" is the lowest. The others are likely
+	 * in order assuming the device-tree shows objects in order.
+	 */
+        cpuset = hwloc_bitmap_alloc();
+        hwloc_bitmap_set(cpuset, reg);
+      }
+
+      if (NULL == cpuset) {
+        hwloc_debug("%s has no \"reg\" property, skipping\n", cpu);
+      } else {
+        struct hwloc_obj *core = NULL;
+        add_device_tree_cpus_node(&cpus, cpuset, l2_cache, phandle, dirent->d_name);
+
+        /* Add core */
+        core = hwloc_alloc_setup_object(HWLOC_OBJ_CORE, reg);
+        core->cpuset = hwloc_bitmap_dup(cpuset);
+        hwloc_insert_object_by_cpuset(topology, core);
+
+        /* Add L1 cache */
+        try_add_cache_from_device_tree_cpu(topology, data, cpu, 1, cpuset);
+
+        hwloc_bitmap_free(cpuset);
+      }
+    }
+    free(device_type);
+  }
+  closedir(dt);
+
+  /* No cores and L2 cache were found, exiting */
+  if (0 == cpus.n) {
+    hwloc_debug("No cores and L2 cache were found in %s, exiting\n", ofroot);
+    return;
+  }
+
+#ifdef HWLOC_DEBUG
+  for (i = 0; i < cpus.n; ++i) {
+    hwloc_debug("%i: %s  ibm,phandle=%08X l2_cache=%08X ",
+      i, cpus.p[i].name, cpus.p[i].phandle, cpus.p[i].l2_cache);
+    if (NULL == cpus.p[i].cpuset) {
+      hwloc_debug("%s\n", "no cpuset");
+    } else {
+      hwloc_debug_bitmap("cpuset %s\n", cpus.p[i].cpuset);
+    }
+  }
+#endif
+
+  /* Scan L2/L3/... caches */
+  for (i = 0; i < cpus.n; ++i) {
+    unsigned int level = 2;
+    hwloc_bitmap_t cpuset;
+    /* Skip real CPUs */
+    if (NULL != cpus.p[i].cpuset)
+      continue;
+
+    /* Calculate cache level and CPU mask */
+    cpuset = hwloc_bitmap_alloc();
+    if (0 == look_powerpc_device_tree_discover_cache(&cpus,
+          cpus.p[i].phandle, &level, cpuset)) {
+      char cpu[256];
+      snprintf(cpu, sizeof(cpu), "%s/%s", ofroot, cpus.p[i].name);
+      try_add_cache_from_device_tree_cpu(topology, data, cpu, level, cpuset);
+    }
+    hwloc_bitmap_free(cpuset);
+  }
+
+  /* Do cleanup */
+  for (i = 0; i < cpus.n; ++i) {
+    hwloc_bitmap_free(cpus.p[i].cpuset);
+    free(cpus.p[i].name);
+  }
+  free(cpus.p);
+}
+
+
+
+/**************************************
+ ****** Sysfs Topology Discovery ******
+ **************************************/
+
+/* Discover NUMA nodes from the "node<N>" entries of a sysfs directory.
+ *
+ * topology: topology in which NUMA node objects are inserted.
+ * data:     backend data, its root_fd is used for every file access.
+ * path:     directory that contains the node<N> entries.
+ * found:    set to 0 first, then to the number of usable nodes when more
+ *           than one node<N> entry exists.
+ *
+ * Returns -1 if path cannot be opened, 0 otherwise.
+ * Nothing is inserted when there is at most one node. Otherwise one object
+ * is inserted per node whose cpumap can be read; the distance matrix is only
+ * read and registered when every node could be created and inserted.
+ */
+static int
+look_sysfsnode(struct hwloc_topology *topology,
+	       struct hwloc_linux_backend_data_s *data,
+	       const char *path, unsigned *found)
+{
+  unsigned osnode;
+  unsigned nbnodes = 0;
+  DIR *dir;
+  struct dirent *dirent;
+  hwloc_bitmap_t nodeset;
+
+  *found = 0;
+
+  /* Get the list of nodes first */
+  dir = hwloc_opendir(path, data->root_fd);
+  if (dir)
+    {
+      nodeset = hwloc_bitmap_alloc();
+      while ((dirent = readdir(dir)) != NULL)
+	{
+	  if (strncmp(dirent->d_name, "node", 4))
+	    continue;
+	  osnode = strtoul(dirent->d_name+4, NULL, 0);
+	  hwloc_bitmap_set(nodeset, osnode);
+	  nbnodes++;
+	}
+      closedir(dir);
+    }
+  else
+    return -1;
+
+  if (nbnodes <= 1)
+    {
+      hwloc_bitmap_free(nodeset);
+      return 0;
+    }
+
+  /* For convenience, put these declarations inside a block. */
+
+  {
+      hwloc_obj_t * nodes = calloc(nbnodes, sizeof(hwloc_obj_t));
+      unsigned *indexes = calloc(nbnodes, sizeof(unsigned));
+      float * distances;
+      int failednodes = 0;
+      unsigned index_;
+
+      if (NULL == nodes || NULL == indexes) {
+          free(nodes);
+          free(indexes);
+          hwloc_bitmap_free(nodeset);
+          nbnodes = 0;
+          goto out;
+      }
+
+      /* Unsparsify node indexes.
+       * We'll need them later because Linux groups sparse distances
+       * and keeps them in order in the sysfs distance files.
+       * It'll simplify things in the meantime.
+       */
+      index_ = 0;
+      hwloc_bitmap_foreach_begin (osnode, nodeset) {
+	indexes[index_] = osnode;
+	index_++;
+      } hwloc_bitmap_foreach_end();
+      hwloc_bitmap_free(nodeset);
+
+#ifdef HWLOC_DEBUG
+      hwloc_debug("%s", "NUMA indexes: ");
+      for (index_ = 0; index_ < nbnodes; index_++) {
+	hwloc_debug(" %u", indexes[index_]);
+      }
+      hwloc_debug("%s", "\n");
+#endif
+
+      /* Create NUMA objects */
+      for (index_ = 0; index_ < nbnodes; index_++) {
+          char nodepath[SYSFS_NUMA_NODE_PATH_LEN];
+          hwloc_bitmap_t cpuset;
+          hwloc_obj_t node, res_obj;
+
+	  osnode = indexes[index_];
+
+          /* bounded, since path comes from the caller. a truncated name fails
+           * to parse below and the node is counted as failed. */
+          snprintf(nodepath, sizeof(nodepath), "%s/node%u/cpumap", path, osnode);
+          cpuset = hwloc_parse_cpumap(nodepath, data->root_fd);
+          if (!cpuset) {
+	    /* This NUMA object won't be inserted, we'll ignore distances */
+	    failednodes++;
+	    continue;
+	  }
+
+          node = hwloc_alloc_setup_object(HWLOC_OBJ_NUMANODE, osnode);
+          node->cpuset = cpuset;
+          node->nodeset = hwloc_bitmap_alloc();
+          hwloc_bitmap_set(node->nodeset, osnode);
+
+          hwloc_sysfs_node_meminfo_info(topology, data, path, osnode, &node->memory);
+
+          hwloc_debug_1arg_bitmap("os node %u has cpuset %s\n",
+                                  osnode, node->cpuset);
+          res_obj = hwloc_insert_object_by_cpuset(topology, node);
+	  if (node == res_obj) {
+	    nodes[index_] = node;
+	  } else {
+	    /* We got merged somehow, could be a buggy BIOS reporting wrong NUMA node cpuset.
+	     * This object disappeared, we'll ignore distances */
+	    failednodes++;
+	  }
+      }
+
+      if (failednodes) {
+	/* failed to read/create some nodes, don't bother reading/fixing
+	 * a distance matrix that would likely be wrong anyway.
+	 */
+	nbnodes -= failednodes;
+	distances = NULL;
+      } else {
+	distances = calloc(nbnodes*nbnodes, sizeof(float));
+      }
+
+      if (NULL == distances) {
+          free(nodes);
+          free(indexes);
+          goto out;
+      }
+
+      /* Get actual distances now */
+      for (index_ = 0; index_ < nbnodes; index_++) {
+          char nodepath[SYSFS_NUMA_NODE_PATH_LEN];
+
+	  osnode = indexes[index_];
+
+	  /* Linux nodeX/distance file contains distance from X to other localities (from ACPI SLIT table or so),
+	   * store them in slots X*N...X*N+N-1 */
+          snprintf(nodepath, sizeof(nodepath), "%s/node%u/distance", path, osnode);
+          hwloc_parse_node_distance(nodepath, nbnodes, distances+index_*nbnodes, data->root_fd);
+      }
+
+      /* NOTE(review): indexes, nodes and distances are not freed after this call,
+       * presumably hwloc_distances_set() keeps them -- confirm. */
+      hwloc_distances_set(topology, HWLOC_OBJ_NUMANODE, nbnodes, indexes, nodes, distances, 0 /* OS cannot force */);
+  }
+
+ out:
+  *found = nbnodes;
+  return 0;
+}
+
+/* Look at Linux' /sys/devices/system/cpu/cpu%d/topology/ */
+static int
+look_sysfscpu(struct hwloc_topology *topology,
+	      struct hwloc_linux_backend_data_s *data,
+	      const char *path,
+	      struct hwloc_linux_cpuinfo_proc * cpuinfo_Lprocs, unsigned cpuinfo_numprocs)
+{
+  hwloc_bitmap_t cpuset; /* Set of cpus for which we have topology information */
+  hwloc_bitmap_t unknownset; /* Set of cpus to clear */
+#define CPU_TOPOLOGY_STR_LEN 128
+  char str[CPU_TOPOLOGY_STR_LEN];
+  DIR *dir;
+  int i,j;
+  FILE *fd;
+  unsigned caches_added, merge_buggy_core_siblings;
+  hwloc_obj_t packages = NULL; /* temporary list of packages before actual insert in the tree */
+
+  /* fill the cpuset of interesting cpus */
+  dir = hwloc_opendir(path, data->root_fd);
+  if (!dir)
+    return -1;
+  else {
+    struct dirent *dirent;
+    cpuset = hwloc_bitmap_alloc();
+    unknownset = hwloc_bitmap_alloc();
+
+    while ((dirent = readdir(dir)) != NULL) {
+      unsigned long cpu;
+      char online[2];
+
+      if (strncmp(dirent->d_name, "cpu", 3))
+	continue;
+      cpu = strtoul(dirent->d_name+3, NULL, 0);
+
+      /* Maybe we don't have topology information but at least it exists */
+      hwloc_bitmap_set(topology->levels[0][0]->complete_cpuset, cpu);
+
+      /* check whether this processor is online */
+      sprintf(str, "%s/cpu%lu/online", path, cpu);
+      fd = hwloc_fopen(str, "r", data->root_fd);
+      if (fd) {
+	if (fgets(online, sizeof(online), fd)) {
+	  if (!atoi(online)) {
+	    fclose(fd);
+	    hwloc_debug("os proc %lu is offline\n", cpu);
+	    hwloc_bitmap_clr(topology->levels[0][0]->allowed_cpuset, cpu);
+	    hwloc_bitmap_set(unknownset, cpu);
+	    continue;
+	  }
+	}
+	fclose(fd);
+      }
+
+      /* check whether the kernel exports topology information for this cpu */
+      sprintf(str, "%s/cpu%lu/topology", path, cpu);
+      if (hwloc_access(str, X_OK, data->root_fd) < 0 && errno == ENOENT) {
+	hwloc_debug("os proc %lu has no accessible %s/cpu%lu/topology\n",
+		   cpu, path, cpu);
+	hwloc_bitmap_clr(topology->levels[0][0]->allowed_cpuset, cpu);
+	hwloc_bitmap_set(unknownset, cpu);
+	continue;
+      }
+
+      hwloc_bitmap_set(cpuset, cpu);
+    }
+    closedir(dir);
+  }
+
+  topology->support.discovery->pu = 1;
+  hwloc_debug_1arg_bitmap("found %d cpu topologies, cpuset %s\n",
+	     hwloc_bitmap_weight(cpuset), cpuset);
+
+  merge_buggy_core_siblings = (!strcmp(data->utsname.machine, "x86_64"))
+			   || (data->utsname.machine[0] == 'i' && !strcmp(data->utsname.machine+2, "86"));
+  caches_added = 0;
+  hwloc_bitmap_foreach_begin(i, cpuset)
+    {
+      hwloc_bitmap_t packageset, coreset, bookset, threadset, savedcoreset;
+      unsigned mypackageid, mycoreid, mybookid;
+      int threadwithcoreid = 0;
+
+      /* look at the package */
+      mypackageid = 0; /* shut-up the compiler */
+      sprintf(str, "%s/cpu%d/topology/physical_package_id", path, i);
+      hwloc_parse_sysfs_unsigned(str, &mypackageid, data->root_fd);
+
+      sprintf(str, "%s/cpu%d/topology/core_siblings", path, i);
+      packageset = hwloc_parse_cpumap(str, data->root_fd);
+      if (packageset) {
+       hwloc_bitmap_andnot(packageset, packageset, unknownset);
+       if (hwloc_bitmap_first(packageset) == i) {
+        /* first cpu in this package, add the package */
+	struct hwloc_obj *package;
+
+	if (merge_buggy_core_siblings) {
+	  /* check for another package with same physical_package_id */
+	  hwloc_obj_t curpackage = packages;
+	  while (curpackage) {
+	    if (curpackage->os_index == mypackageid) {
+	      /* found another package with same physical_package_id but different core_siblings.
+	       * looks like a buggy kernel on Intel Xeon E5 v3 processor with two rings.
+	       * merge these core_siblings to extend the existing first package object.
+	       */
+	      static int reported = 0;
+	      if (!reported && !hwloc_hide_errors()) {
+		char *a, *b;
+		hwloc_bitmap_asprintf(&a, curpackage->cpuset);
+		hwloc_bitmap_asprintf(&b, packageset);
+		fprintf(stderr, "****************************************************************************\n");
+		fprintf(stderr, "* hwloc %s has detected buggy sysfs package information: Two packages have\n", HWLOC_VERSION);
+		fprintf(stderr, "* the same physical package id %u but different core_siblings %s and %s\n",
+			mypackageid, a, b);
+		fprintf(stderr, "* hwloc is merging these packages into a single one assuming your Linux kernel\n");
+		fprintf(stderr, "* does not support this processor correctly.\n");
+		fprintf(stderr, "* You may hide this warning by setting HWLOC_HIDE_ERRORS=1 in the environment.\n");
+	        fprintf(stderr, "*\n");
+		fprintf(stderr, "* If hwloc does not report the right number of packages,\n");
+		fprintf(stderr, "* please report this error message to the hwloc user's mailing list,\n");
+		fprintf(stderr, "* along with the output+tarball generated by the hwloc-gather-topology script.\n");
+		fprintf(stderr, "****************************************************************************\n");
+		reported = 1;
+		free(a);
+		free(b);
+	      }
+	      hwloc_bitmap_or(curpackage->cpuset, curpackage->cpuset, packageset);
+	      goto package_done;
+	    }
+	    curpackage = curpackage->next_cousin;
+	  }
+	}
+
+	/* no package with same physical_package_id, create a new one */
+	package = hwloc_alloc_setup_object(HWLOC_OBJ_PACKAGE, mypackageid);
+	package->cpuset = packageset;
+	hwloc_debug_1arg_bitmap("os package %u has cpuset %s\n",
+				mypackageid, packageset);
+	/* add cpuinfo */
+	if (cpuinfo_Lprocs) {
+	  for(j=0; j<(int) cpuinfo_numprocs; j++)
+	    if ((int) cpuinfo_Lprocs[j].Pproc == i) {
+	      hwloc__move_infos(&package->infos, &package->infos_count,
+				&cpuinfo_Lprocs[j].infos, &cpuinfo_Lprocs[j].infos_count);
+	    }
+	}
+	/* insert in a temporary list in case we have to modify the cpuset by merging other core_siblings later.
+	 * we'll actually insert the tree at the end of the entire sysfs cpu loop.
+	 */
+	package->next_cousin = packages;
+	packages = package;
+
+	packageset = NULL; /* don't free it */
+       }
+      }
+package_done:
+      hwloc_bitmap_free(packageset);
+
+      /* look at the core */
+      mycoreid = 0; /* shut-up the compiler */
+      sprintf(str, "%s/cpu%d/topology/core_id", path, i);
+      hwloc_parse_sysfs_unsigned(str, &mycoreid, data->root_fd);
+
+      sprintf(str, "%s/cpu%d/topology/thread_siblings", path, i);
+      coreset = hwloc_parse_cpumap(str, data->root_fd);
+      savedcoreset = coreset; /* store it for later work-arounds */
+      if (coreset) {
+       hwloc_bitmap_andnot(coreset, coreset, unknownset);
+       if (hwloc_bitmap_weight(coreset) > 1) {
+	/* check if this is hyper-threading or different coreids */
+	unsigned siblingid, siblingcoreid;
+	hwloc_bitmap_t set = hwloc_bitmap_dup(coreset);
+	hwloc_bitmap_clr(set, i);
+	siblingid = hwloc_bitmap_first(set);
+	siblingcoreid = mycoreid;
+	sprintf(str, "%s/cpu%d/topology/core_id", path, siblingid);
+	hwloc_parse_sysfs_unsigned(str, &siblingcoreid, data->root_fd);
+	threadwithcoreid = (siblingcoreid != mycoreid);
+	hwloc_bitmap_free(set);
+       }
+       if (hwloc_bitmap_first(coreset) == i || threadwithcoreid) {
+	/* regular core */
+        struct hwloc_obj *core = hwloc_alloc_setup_object(HWLOC_OBJ_CORE, mycoreid);
+	if (threadwithcoreid) {
+	  /* amd multicore compute-unit, create one core per thread */
+	  core->cpuset = hwloc_bitmap_alloc();
+	  hwloc_bitmap_set(core->cpuset, i);
+	} else {
+	  core->cpuset = coreset;
+	}
+        hwloc_debug_1arg_bitmap("os core %u has cpuset %s\n",
+                     mycoreid, coreset);
+        hwloc_insert_object_by_cpuset(topology, core);
+        coreset = NULL; /* don't free it */
+       }
+      }
+
+      /* look at the books */
+      mybookid = 0; /* shut-up the compiler */
+      sprintf(str, "%s/cpu%d/topology/book_id", path, i);
+      if (hwloc_parse_sysfs_unsigned(str, &mybookid, data->root_fd) == 0) {
+        sprintf(str, "%s/cpu%d/topology/book_siblings", path, i);
+        bookset = hwloc_parse_cpumap(str, data->root_fd);
+	if (bookset) {
+	 hwloc_bitmap_andnot(bookset, bookset, unknownset);
+         if (bookset && hwloc_bitmap_first(bookset) == i) {
+          struct hwloc_obj *book = hwloc_alloc_setup_object(HWLOC_OBJ_GROUP, mybookid);
+          book->cpuset = bookset;
+          hwloc_debug_1arg_bitmap("os book %u has cpuset %s\n",
+                       mybookid, bookset);
+          hwloc_obj_add_info(book, "Type", "Book");
+          hwloc_insert_object_by_cpuset(topology, book);
+          bookset = NULL; /* don't free it */
+	 }
+        }
+      }
+
+      {
+      /* look at the thread */
+      struct hwloc_obj *thread = hwloc_alloc_setup_object(HWLOC_OBJ_PU, i);
+      threadset = hwloc_bitmap_alloc();
+      hwloc_bitmap_only(threadset, i);
+      thread->cpuset = threadset;
+      hwloc_debug_1arg_bitmap("thread %d has cpuset %s\n",
+		 i, threadset);
+      hwloc_insert_object_by_cpuset(topology, thread);
+      }
+
+      /* look at the caches */
+      for(j=0; j<10; j++) {
+#define SHARED_CPU_MAP_STRLEN 128
+	char mappath[SHARED_CPU_MAP_STRLEN];
+	char str2[20]; /* enough for a level number (one digit) or a type (Data/Instruction/Unified) */
+	hwloc_bitmap_t cacheset;
+	unsigned long kB = 0;
+	unsigned linesize = 0;
+	unsigned sets = 0, lines_per_tag = 1;
+	int depth; /* 0 for L1, .... */
+	hwloc_obj_cache_type_t type = HWLOC_OBJ_CACHE_UNIFIED; /* default */
+
+	/* get the cache level depth */
+	sprintf(mappath, "%s/cpu%d/cache/index%d/level", path, i, j);
+	fd = hwloc_fopen(mappath, "r", data->root_fd);
+	if (fd) {
+	  char *res = fgets(str2,sizeof(str2), fd);
+	  fclose(fd);
+	  if (res)
+	    depth = strtoul(str2, NULL, 10)-1;
+	  else
+	    continue;
+	} else
+	  continue;
+
+	/* cache type */
+	sprintf(mappath, "%s/cpu%d/cache/index%d/type", path, i, j);
+	fd = hwloc_fopen(mappath, "r", data->root_fd);
+	if (fd) {
+	  if (fgets(str2, sizeof(str2), fd)) {
+	    fclose(fd);
+	    if (!strncmp(str2, "Data", 4))
+	      type = HWLOC_OBJ_CACHE_DATA;
+	    else if (!strncmp(str2, "Unified", 7))
+	      type = HWLOC_OBJ_CACHE_UNIFIED;
+	    else if (!strncmp(str2, "Instruction", 11))
+	      type = HWLOC_OBJ_CACHE_INSTRUCTION;
+	    else
+	      continue;
+	  } else {
+	    fclose(fd);
+	    continue;
+	  }
+	} else
+	  continue;
+
+	/* get the cache size */
+	sprintf(mappath, "%s/cpu%d/cache/index%d/size", path, i, j);
+	fd = hwloc_fopen(mappath, "r", data->root_fd);
+	if (fd) {
+	  if (fgets(str2,sizeof(str2), fd))
+	    kB = atol(str2); /* in kB */
+	  fclose(fd);
+	}
+
+	/* get the line size */
+	sprintf(mappath, "%s/cpu%d/cache/index%d/coherency_line_size", path, i, j);
+	fd = hwloc_fopen(mappath, "r", data->root_fd);
+	if (fd) {
+	  if (fgets(str2,sizeof(str2), fd))
+	    linesize = atol(str2); /* in bytes */
+	  fclose(fd);
+	}
+
+	/* get the number of sets and lines per tag.
+	 * don't take the associativity directly in "ways_of_associativity" because
+	 * some archs (ia64, ppc) put 0 there when fully-associative, while others (x86) put something like -1 there.
+	 */
+	sprintf(mappath, "%s/cpu%d/cache/index%d/number_of_sets", path, i, j);
+	fd = hwloc_fopen(mappath, "r", data->root_fd);
+	if (fd) {
+	  if (fgets(str2,sizeof(str2), fd))
+	    sets = atol(str2);
+	  fclose(fd);
+	}
+	sprintf(mappath, "%s/cpu%d/cache/index%d/physical_line_partition", path, i, j);
+	fd = hwloc_fopen(mappath, "r", data->root_fd);
+	if (fd) {
+	  if (fgets(str2,sizeof(str2), fd))
+	    lines_per_tag = atol(str2);
+	  fclose(fd);
+	}
+
+	sprintf(mappath, "%s/cpu%d/cache/index%d/shared_cpu_map", path, i, j);
+	cacheset = hwloc_parse_cpumap(mappath, data->root_fd);
+        if (cacheset) {
+	  hwloc_bitmap_andnot(cacheset, cacheset, unknownset);
+          if (hwloc_bitmap_weight(cacheset) < 1) {
+            /* mask is wrong (useful for many itaniums) */
+            if (savedcoreset)
+              /* assume it's a core-specific cache */
+              hwloc_bitmap_copy(cacheset, savedcoreset);
+            else
+              /* assumes it's not shared */
+              hwloc_bitmap_only(cacheset, i);
+          }
+
+          if (hwloc_bitmap_first(cacheset) == i) {
+            /* first cpu in this cache, add the cache */
+            struct hwloc_obj *cache = hwloc_alloc_setup_object(HWLOC_OBJ_CACHE, -1);
+            cache->attr->cache.size = kB << 10;
+            cache->attr->cache.depth = depth+1;
+            cache->attr->cache.linesize = linesize;
+	    cache->attr->cache.type = type;
+	    if (!linesize || !lines_per_tag || !sets)
+	      cache->attr->cache.associativity = 0; /* unknown */
+	    else if (sets == 1)
+	      cache->attr->cache.associativity = 0; /* likely wrong, make it unknown */
+	    else
+	      cache->attr->cache.associativity = (kB << 10) / linesize / lines_per_tag / sets;
+            cache->cpuset = cacheset;
+            hwloc_debug_1arg_bitmap("cache depth %d has cpuset %s\n",
+                       depth, cacheset);
+            hwloc_insert_object_by_cpuset(topology, cache);
+            cacheset = NULL; /* don't free it */
+            ++caches_added;
+          }
+        }
+        hwloc_bitmap_free(cacheset);
+      }
+      hwloc_bitmap_free(coreset);
+    }
+  hwloc_bitmap_foreach_end();
+
+  /* actually insert in the tree now that package cpusets have been fixed-up */
+  while (packages) {
+    hwloc_obj_t next = packages->next_cousin;
+    packages->next_cousin = NULL;
+    hwloc_insert_object_by_cpuset(topology, packages);
+    packages = next;
+  }
+
+  if (0 == caches_added)
+    look_powerpc_device_tree(topology, data);
+
+  hwloc_bitmap_free(cpuset);
+  hwloc_bitmap_free(unknownset);
+
+  return 0;
+}
+
+
+
+/****************************************
+ ****** cpuinfo Topology Discovery ******
+ ****************************************/
+
+/* Handle one "prefix: value" pair of an x86 cpuinfo file.
+ * When the prefix is one of the known x86 fields, the value is passed to
+ * hwloc__add_info() under the matching hwloc key. Other prefixes are ignored.
+ * Always returns 0.
+ */
+static int
+hwloc_linux_parse_cpuinfo_x86(const char *prefix, const char *value,
+			      struct hwloc_obj_info_s **infos, unsigned *infos_count,
+			      int is_global __hwloc_attribute_unused)
+{
+  /* { cpuinfo field name, hwloc info key } */
+  static const char * const fields[][2] = {
+    { "vendor_id",  "CPUVendor" },
+    { "model name", "CPUModel" },
+    { "model",      "CPUModelNumber" },
+    { "cpu family", "CPUFamilyNumber" },
+    { "stepping",   "CPUStepping" }
+  };
+  unsigned k;
+
+  for (k = 0; k < sizeof(fields) / sizeof(fields[0]); k++) {
+    if (strcmp(fields[k][0], prefix) == 0) {
+      hwloc__add_info(infos, infos_count, fields[k][1], value);
+      break;
+    }
+  }
+  return 0;
+}
+
+/* Handle one "prefix: value" pair of an ia64 cpuinfo file.
+ * When the prefix is one of the known ia64 fields, the value is passed to
+ * hwloc__add_info() under the matching hwloc key. Other prefixes are ignored.
+ * Always returns 0.
+ */
+static int
+hwloc_linux_parse_cpuinfo_ia64(const char *prefix, const char *value,
+			       struct hwloc_obj_info_s **infos, unsigned *infos_count,
+			       int is_global __hwloc_attribute_unused)
+{
+  /* { cpuinfo field name, hwloc info key } */
+  static const char * const fields[][2] = {
+    { "vendor",     "CPUVendor" },
+    { "model name", "CPUModel" },
+    { "model",      "CPUModelNumber" },
+    { "family",     "CPUFamilyNumber" }
+  };
+  unsigned k;
+
+  for (k = 0; k < sizeof(fields) / sizeof(fields[0]); k++) {
+    if (strcmp(fields[k][0], prefix) == 0) {
+      hwloc__add_info(infos, infos_count, fields[k][1], value);
+      break;
+    }
+  }
+  return 0;
+}
+
+/* Handle one "prefix: value" pair of an ARM cpuinfo file.
+ * When the prefix is one of the known ARM fields, the value is passed to
+ * hwloc__add_info() under the matching hwloc key. Other prefixes are ignored.
+ * Always returns 0.
+ */
+static int
+hwloc_linux_parse_cpuinfo_arm(const char *prefix, const char *value,
+			      struct hwloc_obj_info_s **infos, unsigned *infos_count,
+			      int is_global __hwloc_attribute_unused)
+{
+  /* { cpuinfo field name, hwloc info key } */
+  static const char * const fields[][2] = {
+    { "Processor",        "CPUModel" },        /* old kernels with one Processor header */
+    { "model name",       "CPUModel" },        /* new kernels with one model name per core */
+    { "CPU implementer",  "CPUImplementer" },
+    { "CPU architecture", "CPUArchitecture" },
+    { "CPU variant",      "CPUVariant" },
+    { "CPU part",         "CPUPart" },
+    { "CPU revision",     "CPURevision" },
+    { "Hardware",         "HardwareName" },
+    { "Revision",         "HardwareRevision" },
+    { "Serial",           "HardwareSerial" }
+  };
+  unsigned k;
+
+  for (k = 0; k < sizeof(fields) / sizeof(fields[0]); k++) {
+    if (strcmp(fields[k][0], prefix) == 0) {
+      hwloc__add_info(infos, infos_count, fields[k][1], value);
+      break;
+    }
+  }
+  return 0;
+}
+
+/* Handle one "prefix: value" pair of a PowerPC cpuinfo file.
+ * "Board" and "Machine" replace the value stored in the "PlatformModel" slot;
+ * every other recognized prefix is passed to hwloc__add_info() under the
+ * matching hwloc key. is_global selects the key used for revision lines.
+ * Unrecognized prefixes are ignored. Always returns 0.
+ */
+static int
+hwloc_linux_parse_cpuinfo_ppc(const char *prefix, const char *value,
+			      struct hwloc_obj_info_s **infos, unsigned *infos_count,
+			      int is_global)
+{
+  const char *key = NULL;
+
+  if (!strcmp("Board", prefix) || !strcasecmp("Machine", prefix)) {
+    /* machine and board are similar (and often more precise) than "model",
+     * so they overwrite whatever PlatformModel currently holds.
+     * don't match 'board*' because there's also "board l2" on some platforms.
+     */
+    char **slot = hwloc__find_info_slot(infos, infos_count, "PlatformModel");
+    free(*slot);
+    *slot = strdup(value);
+    return 0;
+  }
+
+  /* common fields */
+  if (!strcmp("cpu", prefix))
+    key = "CPUModel";
+  else if (!strcmp("platform", prefix))
+    key = "PlatformName";
+  else if (!strcmp("model", prefix))
+    key = "PlatformModel";
+  /* platform-specific fields */
+  else if (!strcasecmp("vendor", prefix))
+    key = "PlatformVendor";
+  else if (!strcmp("Board ID", prefix))
+    key = "PlatformBoardID";
+  else if (!strcasecmp("Revision", prefix) || !strcmp("Hardware rev", prefix))
+    key = is_global ? "PlatformRevision" : "CPURevision";
+  else if (!strcmp("SVR", prefix))
+    key = "SystemVersionRegister";
+  else if (!strcmp("PVR", prefix))
+    key = "ProcessorVersionRegister";
+
+  if (key)
+    hwloc__add_info(infos, infos_count, key, value);
+  return 0;
+}
+
+/*
+ * avr32: "chip type\t:"			=> OK
+ * blackfin: "model name\t:"			=> OK
+ * h8300: "CPU:"				=> OK
+ * m68k: "CPU:"					=> OK
+ * mips: "cpu model\t\t:"			=> OK
+ * openrisc: "CPU:"				=> OK
+ * sparc: "cpu\t\t:"				=> OK
+ * tile: "model name\t:"			=> OK
+ * unicore32: "Processor\t:"			=> OK
+ * alpha: "cpu\t\t\t: Alpha" + "cpu model\t\t:"	=> "cpu" overwritten by "cpu model", no processor indexes
+ * cris: "cpu\t\t:" + "cpu model\t:"		=> only "cpu"
+ * frv: "CPU-Core:" + "CPU:"			=> only "CPU"
+ * mn10300: "cpu core   :" + "model name :"	=> only "model name"
+ * parisc: "cpu family\t:" + "cpu\t\t:"		=> only "cpu"
+ *
+ * not supported because of conflicts with other arch minor lines:
+ * m32r: "cpu family\t:"			=> KO (adding "cpu family" would break "blackfin")
+ * microblaze: "CPU-Family:"			=> KO
+ * sh: "cpu family\t:" + "cpu type\t:"		=> KO
+ * xtensa: "model\t\t:"				=> KO
+ */
+/* Fallback cpuinfo line handler for architectures without a dedicated one.
+ * Any of the known "model-like" prefixes replaces the value stored in the
+ * "CPUModel" slot; other prefixes are ignored. Always returns 0.
+ */
+static int
+hwloc_linux_parse_cpuinfo_generic(const char *prefix, const char *value,
+				  struct hwloc_obj_info_s **infos, unsigned *infos_count,
+				  int is_global __hwloc_attribute_unused)
+{
+  char **slot;
+  int is_model = !strcmp("model name", prefix)
+		 || !strcmp("Processor", prefix)
+		 || !strcmp("chip type", prefix)
+		 || !strcmp("cpu model", prefix)
+		 || !strcasecmp("cpu", prefix);
+
+  if (!is_model)
+    return 0;
+
+  /* keep the last one, assume it's more precise than the first one.
+   * we should have the Architecture keypair for basic information anyway.
+   */
+  slot = hwloc__find_info_slot(infos, infos_count, "CPUModel");
+  free(*slot);
+  *slot = strdup(value);
+  return 0;
+}
+
+/* Parse a cpuinfo-formatted file.
+ *
+ * Each "processor" line starts a new entry in a dynamically grown array;
+ * "physical id" and "core id" lines fill the current entry, and every other
+ * "prefix: value" line is handed to an architecture-specific handler (chosen
+ * from data->utsname.machine) which stores it either in the current entry or,
+ * when no processor entry is open, in the global infos.
+ *
+ * Returns the number of processor entries and stores the array in *Lprocs_p.
+ * Returns -1 if the file cannot be opened (then *Lprocs_p is untouched), or
+ * on a malformed numeric field or allocation failure (then everything
+ * allocated here is released and *Lprocs_p is set to NULL).
+ */
+static int
+hwloc_linux_parse_cpuinfo(struct hwloc_linux_backend_data_s *data,
+			  const char *path,
+			  struct hwloc_linux_cpuinfo_proc ** Lprocs_p,
+			  struct hwloc_obj_info_s **global_infos, unsigned *global_infos_count)
+{
+  FILE *fd;
+  char *str = NULL;
+  char *endptr;
+  unsigned len;
+  unsigned allocated_Lprocs = 0;
+  struct hwloc_linux_cpuinfo_proc * Lprocs = NULL;
+  unsigned numprocs = 0;
+  unsigned k;
+  int curproc = -1;
+  int (*parse_cpuinfo_func)(const char *, const char *, struct hwloc_obj_info_s **, unsigned *, int) = NULL;
+
+  if (!(fd=hwloc_fopen(path,"r", data->root_fd)))
+    {
+      hwloc_debug("could not open %s\n", path);
+      return -1;
+    }
+
+#      define PROCESSOR	"processor"
+#      define PACKAGEID "physical id" /* the longest one */
+#      define COREID "core id"
+  len = 128; /* vendor/model can be very long */
+  str = malloc(len);
+  if (!str)
+    goto err;
+  hwloc_debug("\n\n * Topology extraction from %s *\n\n", path);
+  while (fgets(str,len,fd)!=NULL) {
+    unsigned long Ppkg, Pcore, Pproc;
+    char *end, *dot, *prefix, *value;
+    int noend = 0;
+
+    /* remove the ending \n */
+    end = strchr(str, '\n');
+    if (end)
+      *end = 0;
+    else
+      noend = 1;
+    /* if empty line, skip and reset curproc */
+    if (!*str) {
+      curproc = -1;
+      continue;
+    }
+    /* skip lines with no colon */
+    dot = strchr(str, ':');
+    if (!dot)
+      continue;
+    /* skip lines not starting with a letter */
+    if ((*str > 'z' || *str < 'a')
+	&& (*str > 'Z' || *str < 'A'))
+      continue;
+
+    /* mark the end of the prefix.
+     * str[0] is a letter here, so the backward scan stops inside the buffer.
+     */
+    prefix = str;
+    end = dot;
+    while (end[-1] == ' ' || end[-1] == '\t') end--; /* need a strrspn() */
+    *end = 0;
+    /* find beginning of value, its end is already marked */
+    value = dot+1 + strspn(dot+1, " \t");
+
+    /* defines for parsing numbers */
+#   define getprocnb_begin(field, var)					\
+    if (!strcmp(field,prefix)) {					\
+      var = strtoul(value,&endptr,0);					\
+      if (endptr==value) {						\
+	hwloc_debug("no number in "field" field of %s\n", path);	\
+	goto err;							\
+      } else if (var==ULONG_MAX) {					\
+	hwloc_debug("too big "field" number in %s\n", path); 		\
+	goto err;							\
+      }									\
+      hwloc_debug(field " %lu\n", var)
+#   define getprocnb_end()						\
+    }
+    /* actually parse numbers */
+    getprocnb_begin(PROCESSOR, Pproc);
+    if (numprocs == allocated_Lprocs) {
+      /* grow through a temporary so the old array survives a failed realloc */
+      struct hwloc_linux_cpuinfo_proc *tmp;
+      unsigned newalloc = allocated_Lprocs ? 2 * allocated_Lprocs : 8;
+      tmp = realloc(Lprocs, newalloc * sizeof(*Lprocs));
+      if (!tmp)
+	goto err;
+      Lprocs = tmp;
+      allocated_Lprocs = newalloc;
+    }
+    curproc = numprocs++;
+    Lprocs[curproc].Pproc = Pproc;
+    Lprocs[curproc].Pcore = -1;
+    Lprocs[curproc].Ppkg = -1;
+    Lprocs[curproc].Lcore = -1;
+    Lprocs[curproc].Lpkg = -1;
+    Lprocs[curproc].infos = NULL;
+    Lprocs[curproc].infos_count = 0;
+    getprocnb_end() else
+    getprocnb_begin(PACKAGEID, Ppkg);
+    /* ignore the field if no processor entry is currently open */
+    if (curproc >= 0)
+      Lprocs[curproc].Ppkg = Ppkg;
+    getprocnb_end() else
+    getprocnb_begin(COREID, Pcore);
+    /* ignore the field if no processor entry is currently open */
+    if (curproc >= 0)
+      Lprocs[curproc].Pcore = Pcore;
+    getprocnb_end() else {
+
+      /* architecture specific or default routine for parsing cpumodel */
+      if (!parse_cpuinfo_func) {
+	parse_cpuinfo_func = hwloc_linux_parse_cpuinfo_generic;
+	if (*data->utsname.machine) {
+	  /* x86_32 x86_64 k1om => x86 */
+	  if (!strcmp(data->utsname.machine, "x86_64")
+	      || (data->utsname.machine[0] == 'i' && !strcmp(data->utsname.machine+2, "86"))
+	      || !strcmp(data->utsname.machine, "k1om"))
+	    parse_cpuinfo_func = hwloc_linux_parse_cpuinfo_x86;
+	  /* ia64 */
+	  else if (!strcmp(data->utsname.machine, "ia64"))
+	    parse_cpuinfo_func = hwloc_linux_parse_cpuinfo_ia64;
+	  /* arm */
+	  else if (!strncmp(data->utsname.machine, "arm", 3))
+	    parse_cpuinfo_func = hwloc_linux_parse_cpuinfo_arm;
+	  else if (!strncmp(data->utsname.machine, "ppc", 3)
+		   || !strncmp(data->utsname.machine, "power", 5))
+	    parse_cpuinfo_func = hwloc_linux_parse_cpuinfo_ppc;
+	}
+      }
+      /* we can't assume that we already got a processor index line:
+       * alpha/frv/h8300/m68k/microblaze/sparc have no processor lines at all, only a global entry.
+       * tile has a global section with model name before the list of processor lines.
+       */
+      parse_cpuinfo_func(prefix, value,
+			 curproc >= 0 ? &Lprocs[curproc].infos : global_infos,
+			 curproc >= 0 ? &Lprocs[curproc].infos_count : global_infos_count,
+			 curproc < 0);
+    }
+
+    if (noend) {
+      /* the line did not fit in the buffer, ignore its remaining part */
+      if (fscanf(fd,"%*[^\n]") == EOF)
+	break;
+      getc(fd);
+    }
+  }
+  fclose(fd);
+  free(str);
+
+  *Lprocs_p = Lprocs;
+  return numprocs;
+
+ err:
+  fclose(fd);
+  free(str);
+  /* release the infos already attached to the entries parsed so far */
+  for (k = 0; k < numprocs; k++)
+    hwloc__free_infos(Lprocs[k].infos, Lprocs[k].infos_count);
+  free(Lprocs);
+  *Lprocs_p = NULL;
+  return -1;
+}
+
+/* Release what a cpuinfo parse produced: the infos of each of the numprocs
+ * entries of Lprocs, the Lprocs array itself (when non-NULL), and the
+ * global infos.
+ */
+static void
+hwloc_linux_free_cpuinfo(struct hwloc_linux_cpuinfo_proc * Lprocs, unsigned numprocs,
+			 struct hwloc_obj_info_s *global_infos, unsigned global_infos_count)
+{
+  if (Lprocs != NULL) {
+    struct hwloc_linux_cpuinfo_proc *cur = Lprocs;
+    struct hwloc_linux_cpuinfo_proc *stop = Lprocs + numprocs;
+
+    while (cur != stop) {
+      hwloc__free_infos(cur->infos, cur->infos_count);
+      cur++;
+    }
+    free(Lprocs);
+  }
+
+  hwloc__free_infos(global_infos, global_infos_count);
+}
+
+/* Discover PUs, packages and cores from a cpuinfo-formatted file.
+ *
+ * The file is parsed with hwloc_linux_parse_cpuinfo(); its global infos are
+ * moved to the root object (even when no processor is found). One PU object
+ * is inserted per processor entry. Package (resp. core) objects are only
+ * inserted when every processor entry has a package (resp. core) number.
+ *
+ * Returns 0 on success, -1 when no processor was found or when the
+ * temporary index arrays cannot be allocated.
+ */
+static int
+look_cpuinfo(struct hwloc_topology *topology,
+	     struct hwloc_linux_backend_data_s *data,
+	     const char *path)
+{
+  struct hwloc_linux_cpuinfo_proc * Lprocs = NULL;
+  struct hwloc_obj_info_s *global_infos = NULL;
+  unsigned global_infos_count = 0;
+  /* P for physical/OS index, L for logical (i.e. in the order we get them, not in the final hwloc logical order) */
+  unsigned *Lcore_to_Pcore;
+  unsigned *Lcore_to_Ppkg; /* needed because Lcore is equivalent to Pcore+Ppkg, not to Pcore alone */
+  unsigned *Lpkg_to_Ppkg;
+  int _numprocs;
+  unsigned numprocs;
+  unsigned numpkgs=0;
+  unsigned numcores=0;
+  unsigned long Lproc;
+  unsigned missingpkg;
+  unsigned missingcore;
+  unsigned i,j;
+
+  /* parse the entire cpuinfo first, fill the Lprocs array and numprocs */
+  _numprocs = hwloc_linux_parse_cpuinfo(data, path, &Lprocs, &global_infos, &global_infos_count);
+
+
+  /* setup root info */
+  hwloc__move_infos(&hwloc_get_root_obj(topology)->infos, &hwloc_get_root_obj(topology)->infos_count,
+		    &global_infos, &global_infos_count);
+
+
+  if (_numprocs <= 0)
+    /* found no processor */
+    return -1;
+  numprocs = _numprocs;
+
+  /* initialize misc arrays, there can be at most numprocs entries */
+  Lcore_to_Pcore = malloc(numprocs * sizeof(*Lcore_to_Pcore));
+  Lcore_to_Ppkg = malloc(numprocs * sizeof(*Lcore_to_Ppkg));
+  Lpkg_to_Ppkg = malloc(numprocs * sizeof(*Lpkg_to_Ppkg));
+  if (!Lcore_to_Pcore || !Lcore_to_Ppkg || !Lpkg_to_Ppkg) {
+    /* out of memory: drop everything before any object is inserted */
+    free(Lcore_to_Pcore);
+    free(Lcore_to_Ppkg);
+    free(Lpkg_to_Ppkg);
+    hwloc_linux_free_cpuinfo(Lprocs, numprocs, global_infos, global_infos_count);
+    return -1;
+  }
+  for (i = 0; i < numprocs; i++) {
+    Lcore_to_Pcore[i] = -1;
+    Lcore_to_Ppkg[i] = -1;
+    Lpkg_to_Ppkg[i] = -1;
+  }
+
+  /* create PU objects */
+  for(Lproc=0; Lproc<numprocs; Lproc++) {
+    unsigned long Pproc = Lprocs[Lproc].Pproc;
+    hwloc_obj_t obj = hwloc_alloc_setup_object(HWLOC_OBJ_PU, Pproc);
+    obj->cpuset = hwloc_bitmap_alloc();
+    hwloc_bitmap_only(obj->cpuset, Pproc);
+    hwloc_debug_2args_bitmap("cpu %lu (os %lu) has cpuset %s\n",
+			     Lproc, Pproc, obj->cpuset);
+    hwloc_insert_object_by_cpuset(topology, obj);
+  }
+
+  topology->support.discovery->pu = 1;
+
+  hwloc_debug("%s", "\n * Topology summary *\n");
+  hwloc_debug("%u processors)\n", numprocs);
+
+  /* fill Lprocs[].Lpkg and Lpkg_to_Ppkg */
+  for(Lproc=0; Lproc<numprocs; Lproc++) {
+    long Ppkg = Lprocs[Lproc].Ppkg;
+    if (Ppkg != -1) {
+      unsigned long Pproc = Lprocs[Lproc].Pproc;
+      for (i=0; i<numpkgs; i++)
+	if ((unsigned) Ppkg == Lpkg_to_Ppkg[i])
+	  break;
+      Lprocs[Lproc].Lpkg = i;
+      hwloc_debug("%lu on package %u (%lx)\n", Pproc, i, Ppkg);
+      if (i==numpkgs) {
+	Lpkg_to_Ppkg[numpkgs] = Ppkg;
+	numpkgs++;
+      }
+    }
+  }
+  /* Some buggy Linuxes don't provide numbers for processor 0, which makes us
+   * provide bogus information. We should rather drop it. */
+  missingpkg=0;
+  for(j=0; j<numprocs; j++)
+    if (Lprocs[j].Ppkg == -1) { /* index with the loop variable j, not the stale i */
+      missingpkg=1;
+      break;
+    }
+  /* create package objects */
+  hwloc_debug("%u pkgs%s\n", numpkgs, missingpkg ? ", but some missing package" : "");
+  if (!missingpkg && numpkgs>0) {
+    for (i = 0; i < numpkgs; i++) {
+      struct hwloc_obj *obj = hwloc_alloc_setup_object(HWLOC_OBJ_PACKAGE, Lpkg_to_Ppkg[i]);
+      int doneinfos = 0;
+      obj->cpuset = hwloc_bitmap_alloc();
+      for(j=0; j<numprocs; j++)
+	if ((unsigned) Lprocs[j].Lpkg == i) {
+	  hwloc_bitmap_set(obj->cpuset, Lprocs[j].Pproc);
+	  if (!doneinfos) {
+	    /* the package takes the infos of its first processor entry */
+	    hwloc__move_infos(&obj->infos, &obj->infos_count, &Lprocs[j].infos, &Lprocs[j].infos_count);
+	    doneinfos = 1;
+	  }
+	}
+      hwloc_debug_1arg_bitmap("package %d has cpuset %s\n", i, obj->cpuset);
+      hwloc_insert_object_by_cpuset(topology, obj);
+    }
+    hwloc_debug("%s", "\n");
+  }
+
+  /* fill Lprocs[].Lcore, Lcore_to_Ppkg and Lcore_to_Pcore */
+  for(Lproc=0; Lproc<numprocs; Lproc++) {
+    long Pcore = Lprocs[Lproc].Pcore;
+    if (Pcore != -1) {
+      for (i=0; i<numcores; i++)
+	if ((unsigned) Pcore == Lcore_to_Pcore[i] && (unsigned) Lprocs[Lproc].Ppkg == Lcore_to_Ppkg[i])
+	  break;
+      Lprocs[Lproc].Lcore = i;
+      if (i==numcores) {
+	Lcore_to_Ppkg[numcores] = Lprocs[Lproc].Ppkg;
+	Lcore_to_Pcore[numcores] = Pcore;
+	numcores++;
+      }
+    }
+  }
+  /* Some buggy Linuxes don't provide numbers for processor 0, which makes us
+   * provide bogus information. We should rather drop it. */
+  missingcore=0;
+  for(j=0; j<numprocs; j++)
+    if (Lprocs[j].Pcore == -1) { /* index with the loop variable j, not the stale i */
+      missingcore=1;
+      break;
+    }
+  /* create Core objects */
+  hwloc_debug("%u cores%s\n", numcores, missingcore ? ", but some missing core" : "");
+  if (!missingcore && numcores>0) {
+    for (i = 0; i < numcores; i++) {
+      struct hwloc_obj *obj = hwloc_alloc_setup_object(HWLOC_OBJ_CORE, Lcore_to_Pcore[i]);
+      obj->cpuset = hwloc_bitmap_alloc();
+      for(j=0; j<numprocs; j++)
+	if ((unsigned) Lprocs[j].Lcore == i)
+	  hwloc_bitmap_set(obj->cpuset, Lprocs[j].Pproc);
+      hwloc_debug_1arg_bitmap("Core %d has cpuset %s\n", i, obj->cpuset);
+      hwloc_insert_object_by_cpuset(topology, obj);
+    }
+    hwloc_debug("%s", "\n");
+  }
+
+  free(Lcore_to_Pcore);
+  free(Lcore_to_Ppkg);
+  free(Lpkg_to_Ppkg);
+
+  hwloc_linux_free_cpuinfo(Lprocs, numprocs, global_infos, global_infos_count);
+
+  look_powerpc_device_tree(topology, data);
+  return 0;
+}
+
+
+
+/*************************************
+ ****** Main Topology Discovery ******
+ *************************************/
+
+/* Read the first line of /proc/elog and, if it looks like "Card <serial>:...",
+ * attach <serial> to the root object as the "MICSerialNumber" info.
+ * Does nothing when the file is missing or the line has another shape.
+ */
+static void
+hwloc__linux_get_mic_sn(struct hwloc_topology *topology, struct hwloc_linux_backend_data_s *data)
+{
+  char line[64];
+  FILE *file = hwloc_fopen("/proc/elog", "r", data->root_fd);
+
+  if (!file)
+    return;
+
+  if (fgets(line, sizeof(line), file) && !strncmp(line, "Card ", 5)) {
+    char *serial = line + 5;
+    char *colon = strchr(serial, ':');
+    if (colon) {
+      /* cut the line right after the serial number */
+      *colon = '\0';
+      hwloc_obj_add_info(hwloc_get_root_obj(topology), "MICSerialNumber", serial);
+    }
+  }
+
+  fclose(file);
+}
+
+/* Last-resort PU setup when no CPU topology could be read.
+ * Uses hwloc_fallback_nbprocessors() when looking at this very system,
+ * and a single processor otherwise.
+ */
+static void
+hwloc_linux_fallback_pu_level(struct hwloc_topology *topology)
+{
+  if (!topology->is_thissystem) {
+    /* fsys-root but not this system, no way, assume there's just 1
+     * processor :/ */
+    hwloc_setup_pu_level(topology, 1);
+    return;
+  }
+
+  hwloc_setup_pu_level(topology, hwloc_fallback_nbprocessors(topology));
+}
+
+/* Fill data->utsname.
+ * The structure is zeroed, then filled by uname() when looking at this
+ * system, then overridden field by field from the optional
+ * /proc/hwloc-nofile-info file. If the HWLOC_DUMP_NOFILE_INFO environment
+ * variable names a file, the non-empty fields are written there in the
+ * same "Key: value" format.
+ */
+static void
+hwloc_gather_system_info(struct hwloc_topology *topology,
+			 struct hwloc_linux_backend_data_s *data)
+{
+  FILE *file;
+  char line[128]; /* enough for utsname fields */
+  const char *dumppath;
+
+  /* initialize to something sane */
+  memset(&data->utsname, 0, sizeof(data->utsname));
+
+  /* read thissystem info */
+  if (topology->is_thissystem)
+    uname(&data->utsname);
+
+  /* overwrite with optional /proc/hwloc-nofile-info */
+  file = hwloc_fopen("/proc/hwloc-nofile-info", "r", data->root_fd);
+  if (file) {
+    while (fgets(line, sizeof(line), file)) {
+      char *eol = strchr(line, '\n');
+      char *dst = NULL;   /* utsname field selected by the line's key */
+      size_t dstlen = 0;  /* size of that field */
+      size_t skip = 0;    /* length of the "Key: " part */
+
+      if (!strncmp("OSName: ", line, 8)) {
+	dst = data->utsname.sysname;
+	dstlen = sizeof(data->utsname.sysname);
+	skip = 8;
+      } else if (!strncmp("OSRelease: ", line, 11)) {
+	dst = data->utsname.release;
+	dstlen = sizeof(data->utsname.release);
+	skip = 11;
+      } else if (!strncmp("OSVersion: ", line, 11)) {
+	dst = data->utsname.version;
+	dstlen = sizeof(data->utsname.version);
+	skip = 11;
+      } else if (!strncmp("HostName: ", line, 10)) {
+	dst = data->utsname.nodename;
+	dstlen = sizeof(data->utsname.nodename);
+	skip = 10;
+      } else if (!strncmp("Architecture: ", line, 14)) {
+	dst = data->utsname.machine;
+	dstlen = sizeof(data->utsname.machine);
+	skip = 14;
+      }
+
+      if (!dst) {
+	/* unknown key, ignored */
+	hwloc_debug("ignored /proc/hwloc-nofile-info line %s\n", line);
+	continue;
+      }
+
+      /* strip the newline, then copy with forced termination */
+      if (eol)
+	*eol = '\0';
+      strncpy(dst, line + skip, dstlen);
+      dst[dstlen - 1] = '\0';
+    }
+    fclose(file);
+  }
+
+  dumppath = getenv("HWLOC_DUMP_NOFILE_INFO");
+  if (!dumppath || !*dumppath)
+    return;
+
+  file = fopen(dumppath, "w");
+  if (!file)
+    return;
+
+  if (*data->utsname.sysname)
+    fprintf(file, "OSName: %s\n", data->utsname.sysname);
+  if (*data->utsname.release)
+    fprintf(file, "OSRelease: %s\n", data->utsname.release);
+  if (*data->utsname.version)
+    fprintf(file, "OSVersion: %s\n", data->utsname.version);
+  if (*data->utsname.nodename)
+    fprintf(file, "HostName: %s\n", data->utsname.nodename);
+  if (*data->utsname.machine)
+    fprintf(file, "Architecture: %s\n", data->utsname.machine);
+  fclose(file);
+}
+
+/* Linux discovery entry point for a backend.
+ *
+ * Returns 0 immediately when the root object already has a cpuset
+ * (somebody else discovered things). Otherwise gathers system info,
+ * admin-disabled cpus/mems, memory and NUMA information, the CPU topology
+ * (from cpuinfo or sysfs, with a PU-only fallback), DMI info and a few
+ * root infos, then returns 1.
+ */
+static int
+hwloc_look_linuxfs(struct hwloc_backend *backend)
+{
+  struct hwloc_topology *topology = backend->topology;
+  struct hwloc_linux_backend_data_s *data = backend->private_data;
+  char *cgroup_mntpnt, *cpuset_mntpnt;
+  char *cpuset_name = NULL;
+  unsigned numa_count;
+  int use_cpuinfo;
+
+  /* somebody discovered things */
+  if (topology->levels[0][0]->cpuset)
+    return 0;
+
+  hwloc_gather_system_info(topology, data);
+
+  hwloc_alloc_obj_cpusets(topology->levels[0][0]);
+
+  /* Gather the list of admin-disabled cpus and mems */
+  hwloc_find_linux_cpuset_mntpnt(&cgroup_mntpnt, &cpuset_mntpnt, data->root_fd);
+  if (cgroup_mntpnt || cpuset_mntpnt) {
+    cpuset_name = hwloc_read_linux_cpuset_name(data->root_fd, topology->pid);
+    if (cpuset_name) {
+      hwloc_admin_disable_set_from_cpuset(data, cgroup_mntpnt, cpuset_mntpnt, cpuset_name, "cpus", topology->levels[0][0]->allowed_cpuset);
+      hwloc_admin_disable_set_from_cpuset(data, cgroup_mntpnt, cpuset_mntpnt, cpuset_name, "mems", topology->levels[0][0]->allowed_nodeset);
+    }
+    free(cgroup_mntpnt);
+    free(cpuset_mntpnt);
+  }
+
+  /* Get the machine memory attributes */
+  hwloc_get_procfs_meminfo_info(topology, data, &topology->levels[0][0]->memory);
+
+  /* Gather NUMA information. Must be after hwloc_get_procfs_meminfo_info so that the hugepage size is known */
+  if (look_sysfsnode(topology, data, "/sys/bus/node/devices", &numa_count) < 0)
+    look_sysfsnode(topology, data, "/sys/devices/system/node", &numa_count);
+
+  /* if we found some numa nodes, the machine object has no local memory */
+  if (numa_count) {
+    unsigned t;
+    topology->levels[0][0]->memory.local_memory = 0;
+    if (topology->levels[0][0]->memory.page_types)
+      for (t = 0; t < topology->levels[0][0]->memory.page_types_len; t++)
+	topology->levels[0][0]->memory.page_types[t].count = 0;
+  }
+
+  /* Gather the list of cpus now.
+   * Revert to reading cpuinfo only if requested by the environment, or if
+   * /sys/.../topology is unavailable (before 2.6.16) or not containing
+   * anything interesting.
+   */
+  use_cpuinfo = (getenv("HWLOC_LINUX_USE_CPUINFO") != NULL);
+  if (!use_cpuinfo)
+    use_cpuinfo = (hwloc_access("/sys/devices/system/cpu/cpu0/topology/core_siblings", R_OK, data->root_fd) < 0
+		   && hwloc_access("/sys/devices/system/cpu/cpu0/topology/thread_siblings", R_OK, data->root_fd) < 0
+		   && hwloc_access("/sys/bus/cpu/devices/cpu0/topology/thread_siblings", R_OK, data->root_fd) < 0
+		   && hwloc_access("/sys/bus/cpu/devices/cpu0/topology/core_siblings", R_OK, data->root_fd) < 0);
+
+  if (use_cpuinfo) {
+    if (look_cpuinfo(topology, data, "/proc/cpuinfo") < 0)
+      hwloc_linux_fallback_pu_level(topology);
+
+  } else {
+    struct hwloc_linux_cpuinfo_proc * Lprocs = NULL;
+    struct hwloc_obj_info_s *global_infos = NULL;
+    unsigned global_infos_count = 0;
+    int numprocs = hwloc_linux_parse_cpuinfo(data, "/proc/cpuinfo", &Lprocs, &global_infos, &global_infos_count);
+
+    if (numprocs <= 0)
+      Lprocs = NULL;
+
+    if (look_sysfscpu(topology, data, "/sys/bus/cpu/devices", Lprocs, numprocs) < 0) {
+      if (look_sysfscpu(topology, data, "/sys/devices/system/cpu", Lprocs, numprocs) < 0) {
+	/* sysfs but we failed to read cpu topology, fallback */
+	hwloc_linux_fallback_pu_level(topology);
+      }
+    }
+
+    hwloc__move_infos(&hwloc_get_root_obj(topology)->infos, &hwloc_get_root_obj(topology)->infos_count,
+		      &global_infos, &global_infos_count);
+    hwloc_linux_free_cpuinfo(Lprocs, numprocs, global_infos, global_infos_count);
+  }
+
+  /* Gather DMI info */
+  hwloc__get_dmi_id_info(data, topology->levels[0][0]);
+  if (hwloc_topology_get_flags(topology) & (HWLOC_TOPOLOGY_FLAG_IO_DEVICES|HWLOC_TOPOLOGY_FLAG_WHOLE_IO))
+    hwloc__get_firmware_dmi_memory_info(topology, data);
+
+  hwloc_obj_add_info(topology->levels[0][0], "Backend", "Linux");
+  if (cpuset_name) {
+    hwloc_obj_add_info(topology->levels[0][0], "LinuxCgroup", cpuset_name);
+    free(cpuset_name);
+  }
+
+  hwloc__linux_get_mic_sn(topology, data);
+
+  /* data->utsname was filled with real uname or \0, we can safely pass it */
+  hwloc_add_uname_info(topology, &data->utsname);
+
+  return 1;
+}
+
+
+
+/****************************************
+ ***** Linux PCI backend callbacks ******
+ ****************************************
+ * Do not support changing the fsroot (use sysfs)
+ */
+
+/* Allocate an OS device object with the given type and a copy of name,
+ * insert it below pcidev, and return it.
+ */
+static hwloc_obj_t
+hwloc_linux_add_os_device(struct hwloc_backend *backend, struct hwloc_obj *pcidev, hwloc_obj_osdev_type_t type, const char *name)
+{
+  struct hwloc_obj *osdev = hwloc_alloc_setup_object(HWLOC_OBJ_OS_DEVICE, -1);
+
+  osdev->attr->osdev.type = type;
+  osdev->logical_index = -1;
+  osdev->name = strdup(name);
+
+  /* insert_object_by_parent() doesn't merge during insert, so osdev is still valid afterwards */
+  hwloc_insert_object_by_parent(backend->topology, pcidev, osdev);
+
+  return osdev;
+}
+
+typedef void (*hwloc_linux_class_fillinfos_t)(struct hwloc_backend *backend, struct hwloc_obj *osdev, const char *osdevpath);
+
+/* cannot be used in fsroot-aware code, would have to move to a per-topology variable */
+
+/* Detect which sysfs class-link layout is in use by probing network devices.
+ * Sets data->deprecated_classlinks_model to:
+ *   0 if some <netdev>/device/net/<netdev> exists,
+ *   1 if some <netdev>/device/net:<netdev> exists,
+ *  -1 if /sys/class/net cannot be opened or no device matched either form.
+ */
+static void
+hwloc_linux_check_deprecated_classlinks_model(struct hwloc_linux_backend_data_s *data)
+{
+  int root_fd = data->root_fd;
+  DIR *netdir;
+  struct dirent *entry;
+  char path[128];
+  struct stat st;
+
+  data->deprecated_classlinks_model = -1;
+
+  netdir = hwloc_opendir("/sys/class/net", root_fd);
+  if (!netdir)
+    return;
+
+  while ((entry = readdir(netdir)) != NULL) {
+    const char *name = entry->d_name;
+
+    if (!strcmp(name, ".") || !strcmp(name, "..") || !strcmp(name, "lo"))
+      continue;
+
+    /* <device>/<class>/<name> */
+    snprintf(path, sizeof(path), "/sys/class/net/%s/device/net/%s", name, name);
+    if (hwloc_stat(path, &st, root_fd) == 0) {
+      data->deprecated_classlinks_model = 0;
+      break;
+    }
+
+    /* <device>/<class>:<name> */
+    snprintf(path, sizeof(path), "/sys/class/net/%s/device/net:%s", name, name);
+    if (hwloc_stat(path, &st, root_fd) == 0) {
+      data->deprecated_classlinks_model = 1;
+      break;
+    }
+  }
+
+  closedir(netdir);
+}
+
+/* class objects that are immediately below pci devices:
+ * look for objects of the given classname below a sysfs (pcidev) directory.
+ *
+ * One OS device of the given type is added below pcidev for each entry found,
+ * and fillinfo (if not NULL) is called with the entry's sysfs path.
+ * Returns the number of OS devices added by the layout that could be opened,
+ * or 0 when neither layout could be read.
+ */
+static int
+hwloc_linux_class_readdir(struct hwloc_backend *backend,
+			  struct hwloc_obj *pcidev, const char *devicepath,
+			  hwloc_obj_osdev_type_t type, const char *classname,
+			  hwloc_linux_class_fillinfos_t fillinfo)
+{
+  struct hwloc_linux_backend_data_s *data = backend->private_data;
+  int root_fd = data->root_fd;
+  size_t classnamelen = strlen(classname);
+  char path[256];
+  DIR *dir;
+  struct dirent *entry;
+  hwloc_obj_t osdev;
+  int count = 0;
+
+  /* -2 means the layout has not been probed yet */
+  if (data->deprecated_classlinks_model == -2)
+    hwloc_linux_check_deprecated_classlinks_model(data);
+
+  if (data->deprecated_classlinks_model != 1) {
+    /* modern sysfs: <device>/<class>/<name> */
+    struct stat st;
+    snprintf(path, sizeof(path), "%s/%s", devicepath, classname);
+
+    /* some very old kernels (2.6.9/RHEL4) have a <device>/<class> symlink without any way to find <name>.
+     * make sure <device>/<class> is a directory to avoid this case,
+     * otherwise fall through to the deprecated layout below.
+     */
+    if (hwloc_lstat(path, &st, root_fd) >= 0 && S_ISDIR(st.st_mode)) {
+      dir = hwloc_opendir(path, root_fd);
+      if (dir) {
+	data->deprecated_classlinks_model = 0;
+	while ((entry = readdir(dir)) != NULL) {
+	  if (!strcmp(entry->d_name, ".") || !strcmp(entry->d_name, ".."))
+	    continue;
+	  osdev = hwloc_linux_add_os_device(backend, pcidev, type, entry->d_name);
+	  if (fillinfo) {
+	    snprintf(path, sizeof(path), "%s/%s/%s", devicepath, classname, entry->d_name);
+	    fillinfo(backend, osdev, path);
+	  }
+	  count++;
+	}
+	closedir(dir);
+	return count;
+      }
+    }
+  }
+
+  if (data->deprecated_classlinks_model != 0) {
+    /* deprecated sysfs: <device>/<class>:<name> */
+    dir = hwloc_opendir(devicepath, root_fd);
+    if (dir) {
+      while ((entry = readdir(dir)) != NULL) {
+	/* only keep entries named "<class>:<something>" */
+	if (strncmp(entry->d_name, classname, classnamelen) || entry->d_name[classnamelen] != ':')
+	  continue;
+	data->deprecated_classlinks_model = 1;
+	osdev = hwloc_linux_add_os_device(backend, pcidev, type, entry->d_name + classnamelen+1);
+	if (fillinfo) {
+	  snprintf(path, sizeof(path), "%s/%s", devicepath, entry->d_name);
+	  fillinfo(backend, osdev, path);
+	}
+	count++;
+      }
+      closedir(dir);
+      return count;
+    }
+  }
+
+  return 0;
+}
+
+/*
+ * look for net objects below a pcidev in sysfs
+ *
+ * Fill the info attributes of the network OS device obj from its sysfs
+ * directory osdevpath (opened relatively to the backend root_fd):
+ * - "Address" is the first line of <osdevpath>/address, without its newline.
+ * - "Port" is only added when <osdevpath>/device/infiniband exists; it is the
+ *   number parsed from <osdevpath>/dev_id plus one.
+ * Files that cannot be opened or read are silently skipped.
+ */
+static void
+hwloc_linux_net_class_fillinfos(struct hwloc_backend *backend,
+				struct hwloc_obj *obj, const char *osdevpath)
+{
+  struct hwloc_linux_backend_data_s *data = backend->private_data;
+  int root_fd = data->root_fd;
+  FILE *fd;
+  struct stat st;
+  char path[256];
+  snprintf(path, sizeof(path), "%s/address", osdevpath);
+  fd = hwloc_fopen(path, "r", root_fd);
+  if (fd) {
+    char address[128];
+    if (fgets(address, sizeof(address), fd)) {
+      char *eol = strchr(address, '\n');
+      if (eol)
+        *eol = 0;
+      hwloc_obj_add_info(obj, "Address", address);
+    }
+    fclose(fd);
+  }
+  snprintf(path, sizeof(path), "%s/device/infiniband", osdevpath);
+  if (!hwloc_stat(path, &st, root_fd)) {
+    snprintf(path, sizeof(path), "%s/dev_id", osdevpath);
+    fd = hwloc_fopen(path, "r", root_fd);
+    if (fd) {
+      char hexid[16];
+      if (fgets(hexid, sizeof(hexid), fd)) {
+	char *eoid;
+	unsigned long port;
+	/* base 0 lets strtoul() accept the 0x prefix as well as plain decimal */
+	port = strtoul(hexid, &eoid, 0);
+	if (eoid != hexid) {
+	  char portstr[16];
+	  /* port is unsigned long, so it must be printed with %lu (not %ld) */
+	  snprintf(portstr, sizeof(portstr), "%lu", port+1);
+	  hwloc_obj_add_info(obj, "Port", portstr);
+	}
+      }
+      fclose(fd);
+    }
+  }
+}
+
+/* Create the "net" class OS devices found below the sysfs directory of pcidev
+ * and fill their infos; returns what hwloc_linux_class_readdir() returns.
+ */
+static int
+hwloc_linux_lookup_net_class(struct hwloc_backend *backend,
+			     struct hwloc_obj *pcidev, const char *pcidevpath)
+{
+  int found;
+
+  found = hwloc_linux_class_readdir(backend, pcidev, pcidevpath,
+				    HWLOC_OBJ_OSDEV_NETWORK, "net",
+				    hwloc_linux_net_class_fillinfos);
+  return found;
+}
+
+/*
+ * look for infiniband objects below a pcidev in sysfs
+ *
+ * Fill the info attributes of the OpenFabrics OS device obj from its sysfs
+ * directory osdevpath (opened relatively to the backend root_fd):
+ * - "NodeGUID" and "SysImageGUID" from node_guid and sys_image_guid,
+ * - for each port i >= 1 whose ports/<i>/state file exists:
+ *   "Port<i>State", "Port<i>LID", "Port<i>LMC" and one "Port<i>GID<j>"
+ *   per initialized GID found in ports/<i>/gids/<j>.
+ * Values that do not have the expected length are ignored instead of
+ * aborting, since they come from files outside of our control.
+ */
+static void
+hwloc_linux_infiniband_class_fillinfos(struct hwloc_backend *backend,
+				       struct hwloc_obj *obj, const char *osdevpath)
+{
+  struct hwloc_linux_backend_data_s *data = backend->private_data;
+  int root_fd = data->root_fd;
+  FILE *fd;
+  char path[256];
+  unsigned i,j;
+
+  snprintf(path, sizeof(path), "%s/node_guid", osdevpath);
+  fd = hwloc_fopen(path, "r", root_fd);
+  if (fd) {
+    char guidvalue[20];
+    if (fgets(guidvalue, sizeof(guidvalue), fd)) {
+      size_t len;
+      len = strspn(guidvalue, "0123456789abcdefx:");
+      /* a GUID is 19 characters long; skip anything else */
+      if (len == 19) {
+	guidvalue[len] = '\0';
+	hwloc_obj_add_info(obj, "NodeGUID", guidvalue);
+      }
+    }
+    fclose(fd);
+  }
+
+  snprintf(path, sizeof(path), "%s/sys_image_guid", osdevpath);
+  fd = hwloc_fopen(path, "r", root_fd);
+  if (fd) {
+    char guidvalue[20];
+    if (fgets(guidvalue, sizeof(guidvalue), fd)) {
+      size_t len;
+      len = strspn(guidvalue, "0123456789abcdefx:");
+      /* a GUID is 19 characters long; skip anything else */
+      if (len == 19) {
+	guidvalue[len] = '\0';
+	hwloc_obj_add_info(obj, "SysImageGUID", guidvalue);
+      }
+    }
+    fclose(fd);
+  }
+
+  for(i=1; ; i++) {
+    snprintf(path, sizeof(path), "%s/ports/%u/state", osdevpath, i);
+    fd = hwloc_fopen(path, "r", root_fd);
+    if (fd) {
+      char statevalue[2];
+      if (fgets(statevalue, sizeof(statevalue), fd)) {
+	char statename[32];
+	statevalue[1] = '\0'; /* only keep the first byte/digit */
+	snprintf(statename, sizeof(statename), "Port%uState", i);
+	hwloc_obj_add_info(obj, statename, statevalue);
+      }
+      fclose(fd);
+    } else {
+      /* no such port */
+      break;
+    }
+
+    snprintf(path, sizeof(path), "%s/ports/%u/lid", osdevpath, i);
+    fd = hwloc_fopen(path, "r", root_fd);
+    if (fd) {
+      char lidvalue[11];
+      if (fgets(lidvalue, sizeof(lidvalue), fd)) {
+	char lidname[32];
+	size_t len;
+	/* cut the value at the first character that is not part of the number */
+	len = strspn(lidvalue, "0123456789abcdefx");
+	lidvalue[len] = '\0';
+	snprintf(lidname, sizeof(lidname), "Port%uLID", i);
+	hwloc_obj_add_info(obj, lidname, lidvalue);
+      }
+      fclose(fd);
+    }
+
+    snprintf(path, sizeof(path), "%s/ports/%u/lid_mask_count", osdevpath, i);
+    fd = hwloc_fopen(path, "r", root_fd);
+    if (fd) {
+      char lidvalue[11];
+      if (fgets(lidvalue, sizeof(lidvalue), fd)) {
+	char lidname[32];
+	size_t len;
+	len = strspn(lidvalue, "0123456789");
+	lidvalue[len] = '\0';
+	snprintf(lidname, sizeof(lidname), "Port%uLMC", i);
+	hwloc_obj_add_info(obj, lidname, lidvalue);
+      }
+      fclose(fd);
+    }
+
+    for(j=0; ; j++) {
+      snprintf(path, sizeof(path), "%s/ports/%u/gids/%u", osdevpath, i, j);
+      fd = hwloc_fopen(path, "r", root_fd);
+      if (fd) {
+	char gidvalue[40];
+	if (fgets(gidvalue, sizeof(gidvalue), fd)) {
+	  char gidname[32];
+	  size_t len;
+	  len = strspn(gidvalue, "0123456789abcdefx:");
+	  /* a GID is 39 characters long; the comparison below reads
+	   * gidvalue+20, so shorter values must be skipped */
+	  if (len == 39) {
+	    gidvalue[len] = '\0';
+	    if (strncmp(gidvalue+20, "0000:0000:0000:0000", 19)) {
+	      /* only keep initialized GIDs */
+	      snprintf(gidname, sizeof(gidname), "Port%uGID%u", i, j);
+	      hwloc_obj_add_info(obj, gidname, gidvalue);
+	    }
+	  }
+	}
+	fclose(fd);
+      } else {
+	/* no such gid */
+	break;
+      }
+    }
+  }
+}
+
+/* Create the "infiniband" class OS devices found below the sysfs directory of
+ * pcidev and fill their infos; returns what hwloc_linux_class_readdir() returns.
+ */
+static int
+hwloc_linux_lookup_openfabrics_class(struct hwloc_backend *backend,
+				     struct hwloc_obj *pcidev, const char *pcidevpath)
+{
+  int found;
+
+  found = hwloc_linux_class_readdir(backend, pcidev, pcidevpath,
+				    HWLOC_OBJ_OSDEV_OPENFABRICS, "infiniband",
+				    hwloc_linux_infiniband_class_fillinfos);
+  return found;
+}
+
+/* look for dma objects below a pcidev in sysfs.
+ * No fillinfo callback is given: only the OS device objects are created.
+ */
+static int
+hwloc_linux_lookup_dma_class(struct hwloc_backend *backend,
+			     struct hwloc_obj *pcidev, const char *pcidevpath)
+{
+  int found;
+
+  found = hwloc_linux_class_readdir(backend, pcidev, pcidevpath,
+				    HWLOC_OBJ_OSDEV_DMA, "dma", NULL);
+  return found;
+}
+
+/* look for drm objects below a pcidev in sysfs.
+ * No fillinfo callback is given: only the OS device objects are created.
+ *
+ * we could look at the "graphics" class too, but it doesn't help for proprietary drivers either.
+ *
+ * GPU devices (even with a proprietary driver) seem to have a boot_vga field in their PCI device directory (since 2.6.30),
+ * so we could create a OS device for each PCI devices with such a field.
+ * boot_vga is actually created when class >> 8 == VGA (it contains 1 for boot vga device), so it's trivial anyway.
+ */
+static int
+hwloc_linux_lookup_drm_class(struct hwloc_backend *backend,
+			     struct hwloc_obj *pcidev, const char *pcidevpath)
+{
+  int found;
+
+  found = hwloc_linux_class_readdir(backend, pcidev, pcidevpath,
+				    HWLOC_OBJ_OSDEV_GPU, "drm", NULL);
+  return found;
+}
+
+/*
+ * look for block objects below a pcidev in sysfs
+ *
+ * Fill the info attributes of the block OS device obj from its sysfs
+ * directory osdevpath:
+ * - "LinuxDeviceID" is the major:minor line read from <osdevpath>/dev.
+ * - "Vendor", "Model", "Revision", "SerialNumber" and "Type" come from the
+ *   udev properties ID_VENDOR, ID_MODEL, ID_REVISION, ID_SERIAL_SHORT and
+ *   ID_TYPE, obtained either through libudev (when data->udev is set) or by
+ *   parsing /run/udev/data/b<major>:<minor>.
+ * Returns early, possibly after adding only "LinuxDeviceID", when one of
+ * these sources is unavailable.
+ * Property values are truncated to the size of the local buffers.
+ */
+
+static void
+hwloc_linux_block_class_fillinfos(struct hwloc_backend *backend,
+				  struct hwloc_obj *obj, const char *osdevpath)
+{
+  struct hwloc_linux_backend_data_s *data = backend->private_data;
+  int root_fd = data->root_fd;
+  FILE *fd;
+  char path[256];
+  char line[128];
+  char vendor[64] = "";
+  char model[64] = "";
+  char serial[64] = "";
+  char revision[64] = "";
+  char blocktype[64] = "";
+  unsigned major_id, minor_id;
+  char *tmp;
+
+  snprintf(path, sizeof(path), "%s/dev", osdevpath);
+  fd = hwloc_fopen(path, "r", root_fd);
+  if (!fd)
+    return;
+
+  if (NULL == fgets(line, sizeof(line), fd)) {
+    fclose(fd);
+    return;
+  }
+  fclose(fd);
+
+  if (sscanf(line, "%u:%u", &major_id, &minor_id) != 2)
+    return;
+  tmp = strchr(line, '\n');
+  if (tmp)
+    *tmp = '\0';
+  hwloc_obj_add_info(obj, "LinuxDeviceID", line);
+
+  /* All copies below are bounded with snprintf(): the property values are
+   * not under our control and may be longer than the 64-byte buffers. */
+#ifdef HAVE_LIBUDEV_H
+  if (data->udev) {
+    struct udev_device *dev;
+    const char *prop;
+    dev = udev_device_new_from_subsystem_sysname(data->udev, "block", obj->name);
+    if (!dev)
+      return;
+    prop = udev_device_get_property_value(dev, "ID_VENDOR");
+    if (prop)
+      snprintf(vendor, sizeof(vendor), "%s", prop);
+    prop = udev_device_get_property_value(dev, "ID_MODEL");
+    if (prop)
+      snprintf(model, sizeof(model), "%s", prop);
+    prop = udev_device_get_property_value(dev, "ID_REVISION");
+    if (prop)
+      snprintf(revision, sizeof(revision), "%s", prop);
+    prop = udev_device_get_property_value(dev, "ID_SERIAL_SHORT");
+    if (prop)
+      snprintf(serial, sizeof(serial), "%s", prop);
+    prop = udev_device_get_property_value(dev, "ID_TYPE");
+    if (prop)
+      snprintf(blocktype, sizeof(blocktype), "%s", prop);
+
+    udev_device_unref(dev);
+  } else
+    /* fallback to reading files, works with any fsroot */
+#endif
+ {
+  snprintf(path, sizeof(path), "/run/udev/data/b%u:%u", major_id, minor_id);
+  fd = hwloc_fopen(path, "r", root_fd);
+  if (!fd)
+    return;
+
+  while (NULL != fgets(line, sizeof(line), fd)) {
+    tmp = strchr(line, '\n');
+    if (tmp)
+      *tmp = '\0';
+    if (!strncmp(line, "E:ID_VENDOR=", strlen("E:ID_VENDOR="))) {
+      snprintf(vendor, sizeof(vendor), "%s", line+strlen("E:ID_VENDOR="));
+    } else if (!strncmp(line, "E:ID_MODEL=", strlen("E:ID_MODEL="))) {
+      snprintf(model, sizeof(model), "%s", line+strlen("E:ID_MODEL="));
+    } else if (!strncmp(line, "E:ID_REVISION=", strlen("E:ID_REVISION="))) {
+      snprintf(revision, sizeof(revision), "%s", line+strlen("E:ID_REVISION="));
+    } else if (!strncmp(line, "E:ID_SERIAL_SHORT=", strlen("E:ID_SERIAL_SHORT="))) {
+      snprintf(serial, sizeof(serial), "%s", line+strlen("E:ID_SERIAL_SHORT="));
+    } else if (!strncmp(line, "E:ID_TYPE=", strlen("E:ID_TYPE="))) {
+      snprintf(blocktype, sizeof(blocktype), "%s", line+strlen("E:ID_TYPE="));
+    }
+  }
+  fclose(fd);
+ }
+
+  /* clear fake "ATA" vendor name */
+  if (!strcasecmp(vendor, "ATA"))
+    *vendor = '\0';
+  /* overwrite vendor name from model when possible */
+  if (!*vendor) {
+    if (!strncasecmp(model, "wd", 2))
+      strcpy(vendor, "Western Digital");
+    else if (!strncasecmp(model, "st", 2))
+      strcpy(vendor, "Seagate");
+    else if (!strncasecmp(model, "samsung", 7))
+      strcpy(vendor, "Samsung");
+    else if (!strncasecmp(model, "sandisk", 7))
+      strcpy(vendor, "SanDisk");
+    else if (!strncasecmp(model, "toshiba", 7))
+      strcpy(vendor, "Toshiba");
+  }
+
+  if (*vendor)
+    hwloc_obj_add_info(obj, "Vendor", vendor);
+  if (*model)
+    hwloc_obj_add_info(obj, "Model", model);
+  if (*revision)
+    hwloc_obj_add_info(obj, "Revision", revision);
+  if (*serial)
+    hwloc_obj_add_info(obj, "SerialNumber", serial);
+
+  if (!strcmp(blocktype, "disk"))
+    hwloc_obj_add_info(obj, "Type", "Disk");
+  else if (!strcmp(blocktype, "tape"))
+    hwloc_obj_add_info(obj, "Type", "Tape");
+  else if (!strcmp(blocktype, "cd") || !strcmp(blocktype, "floppy") || !strcmp(blocktype, "optical"))
+    hwloc_obj_add_info(obj, "Type", "Removable Media Device");
+  else /* generic, usb mass storage/rbc, usb mass storage/scsi */
+    hwloc_obj_add_info(obj, "Type", "Other");
+}
+
+/* block class objects are in
+ * host%d/target%d:%d:%d/%d:%d:%d:%d/
+ * or
+ * host%d/port-%d:%d/end_device-%d:%d/target%d:%d:%d/%d:%d:%d:%d/
+ * or
+ * ide%d/%d.%d/
+ * below pci devices
+ *
+ * Walk the host directory stored in path (whose current length is pathlen)
+ * and create the block OS devices found below its port-* and target*
+ * subdirectories. path is extended in place while descending and is restored
+ * to its initial content before returning.
+ * Returns the number of OS devices reported by the lookups, 0 if the
+ * directory cannot be opened.
+ * NOTE(review): the capacity of path is not passed to this function, so the
+ * appended names are not bounds-checked here -- confirm that callers always
+ * provide a large enough buffer.
+ */
+static int
+hwloc_linux_lookup_host_block_class(struct hwloc_backend *backend,
+				    struct hwloc_obj *pcidev, char *path, size_t pathlen)
+{
+  struct hwloc_linux_backend_data_s *data = backend->private_data;
+  int root_fd = data->root_fd;
+  DIR *hostdir, *portdir, *targetdir;
+  struct dirent *hostdirent, *portdirent, *targetdirent;
+  size_t hostdlen, portdlen, targetdlen;
+  int dummy;
+  int res = 0;
+
+  hostdir = hwloc_opendir(path, root_fd);
+  if (!hostdir)
+    return 0;
+
+  while ((hostdirent = readdir(hostdir)) != NULL) {
+    if (sscanf(hostdirent->d_name, "port-%d:%d", &dummy, &dummy) == 2)
+    {
+      /* found host%d/port-%d:%d */
+      path[pathlen] = '/';
+      strcpy(&path[pathlen+1], hostdirent->d_name);
+      pathlen += hostdlen = 1+strlen(hostdirent->d_name);
+      portdir = hwloc_opendir(path, root_fd);
+      if (!portdir) {
+	/* restore parent path, otherwise the next entries would be
+	 * appended to this unreadable directory name */
+	pathlen -= hostdlen;
+	path[pathlen] = '\0';
+	continue;
+      }
+      while ((portdirent = readdir(portdir)) != NULL) {
+	if (sscanf(portdirent->d_name, "end_device-%d:%d", &dummy, &dummy) == 2) {
+	  /* found host%d/port-%d:%d/end_device-%d:%d */
+	  path[pathlen] = '/';
+	  strcpy(&path[pathlen+1], portdirent->d_name);
+	  pathlen += portdlen = 1+strlen(portdirent->d_name);
+	  res += hwloc_linux_lookup_host_block_class(backend, pcidev, path, pathlen);
+	  /* restore parent path */
+	  pathlen -= portdlen;
+	  path[pathlen] = '\0';
+	}
+      }
+      closedir(portdir);
+      /* restore parent path */
+      pathlen -= hostdlen;
+      path[pathlen] = '\0';
+      continue;
+    } else if (sscanf(hostdirent->d_name, "target%d:%d:%d", &dummy, &dummy, &dummy) == 3) {
+      /* found host%d/target%d:%d:%d */
+      path[pathlen] = '/';
+      strcpy(&path[pathlen+1], hostdirent->d_name);
+      pathlen += hostdlen = 1+strlen(hostdirent->d_name);
+      targetdir = hwloc_opendir(path, root_fd);
+      if (!targetdir) {
+	/* restore parent path, otherwise the next entries would be
+	 * appended to this unreadable directory name */
+	pathlen -= hostdlen;
+	path[pathlen] = '\0';
+	continue;
+      }
+      while ((targetdirent = readdir(targetdir)) != NULL) {
+	if (sscanf(targetdirent->d_name, "%d:%d:%d:%d", &dummy, &dummy, &dummy, &dummy) != 4)
+	  continue;
+	/* found host%d/target%d:%d:%d/%d:%d:%d:%d */
+	path[pathlen] = '/';
+	strcpy(&path[pathlen+1], targetdirent->d_name);
+	pathlen += targetdlen = 1+strlen(targetdirent->d_name);
+	/* lookup block class for real */
+	res += hwloc_linux_class_readdir(backend, pcidev, path, HWLOC_OBJ_OSDEV_BLOCK, "block", hwloc_linux_block_class_fillinfos);
+	/* restore parent path */
+	pathlen -= targetdlen;
+	path[pathlen] = '\0';
+      }
+      closedir(targetdir);
+      /* restore parent path */
+      pathlen -= hostdlen;
+      path[pathlen] = '\0';
+    }
+  }
+  closedir(hostdir);
+
+  return res;
+}
+
+/* Look for block OS devices below the sysfs directory pcidevpath of pcidev.
+ * The ide%d, host%d and ata%d subdirectories are walked; the actual block
+ * class lookup is done by hwloc_linux_class_readdir(), either directly
+ * (ide) or through hwloc_linux_lookup_host_block_class() (host, ata).
+ * Returns the number of OS devices reported by these lookups, 0 if
+ * pcidevpath cannot be opened.
+ */
+static int
+hwloc_linux_lookup_block_class(struct hwloc_backend *backend,
+			       struct hwloc_obj *pcidev, const char *pcidevpath)
+{
+  struct hwloc_linux_backend_data_s *data = backend->private_data;
+  int root_fd = data->root_fd;
+  size_t pathlen;
+  DIR *devicedir, *hostdir;
+  struct dirent *devicedirent, *hostdirent;
+  size_t devicedlen, hostdlen;
+  char path[256];
+  int dummy;
+  int res = 0;
+
+  /* bounded copy: pcidevpath is given by the caller */
+  snprintf(path, sizeof(path), "%s", pcidevpath);
+  pathlen = strlen(path);
+
+  devicedir = hwloc_opendir(pcidevpath, root_fd);
+  if (!devicedir)
+    return 0;
+
+  while ((devicedirent = readdir(devicedir)) != NULL) {
+    if (sscanf(devicedirent->d_name, "ide%d", &dummy) == 1) {
+      /* found ide%d */
+      path[pathlen] = '/';
+      strcpy(&path[pathlen+1], devicedirent->d_name);
+      pathlen += devicedlen = 1+strlen(devicedirent->d_name);
+      hostdir = hwloc_opendir(path, root_fd);
+      if (!hostdir) {
+	/* restore parent path, otherwise the next entries would be
+	 * appended to this unreadable directory name */
+	pathlen -= devicedlen;
+	path[pathlen] = '\0';
+	continue;
+      }
+      while ((hostdirent = readdir(hostdir)) != NULL) {
+	if (sscanf(hostdirent->d_name, "%d.%d", &dummy, &dummy) == 2) {
+	  /* found ide%d/%d.%d */
+	  path[pathlen] = '/';
+	  strcpy(&path[pathlen+1], hostdirent->d_name);
+	  pathlen += hostdlen = 1+strlen(hostdirent->d_name);
+	  /* lookup block class for real */
+	  res += hwloc_linux_class_readdir(backend, pcidev, path, HWLOC_OBJ_OSDEV_BLOCK, "block", NULL);
+	  /* restore parent path */
+	  pathlen -= hostdlen;
+	  path[pathlen] = '\0';
+	}
+      }
+      closedir(hostdir);
+      /* restore parent path */
+      pathlen -= devicedlen;
+      path[pathlen] = '\0';
+    } else if (sscanf(devicedirent->d_name, "host%d", &dummy) == 1) {
+      /* found host%d */
+      path[pathlen] = '/';
+      strcpy(&path[pathlen+1], devicedirent->d_name);
+      pathlen += devicedlen = 1+strlen(devicedirent->d_name);
+      res += hwloc_linux_lookup_host_block_class(backend, pcidev, path, pathlen);
+      /* restore parent path */
+      pathlen -= devicedlen;
+      path[pathlen] = '\0';
+    } else if (sscanf(devicedirent->d_name, "ata%d", &dummy) == 1) {
+      /* found ata%d */
+      path[pathlen] = '/';
+      strcpy(&path[pathlen+1], devicedirent->d_name);
+      pathlen += devicedlen = 1+strlen(devicedirent->d_name);
+      hostdir = hwloc_opendir(path, root_fd);
+      if (!hostdir) {
+	/* restore parent path, otherwise the next entries would be
+	 * appended to this unreadable directory name */
+	pathlen -= devicedlen;
+	path[pathlen] = '\0';
+	continue;
+      }
+      while ((hostdirent = readdir(hostdir)) != NULL) {
+	if (sscanf(hostdirent->d_name, "host%d", &dummy) == 1) {
+	  /* found ata%d/host%d */
+	  path[pathlen] = '/';
+	  strcpy(&path[pathlen+1], hostdirent->d_name);
+	  pathlen += hostdlen = 1+strlen(hostdirent->d_name);
+	  /* lookup block class for real */
+	  res += hwloc_linux_lookup_host_block_class(backend, pcidev, path, pathlen);
+	  /* restore parent path */
+	  pathlen -= hostdlen;
+	  path[pathlen] = '\0';
+	}
+      }
+      closedir(hostdir);
+      /* restore parent path */
+      pathlen -= devicedlen;
+      path[pathlen] = '\0';
+    }
+  }
+  closedir(devicedir);
+
+  return res;
+}
+
+/* Fill the info attributes of the MIC coprocessor OS device obj from its
+ * sysfs directory osdevpath.
+ * "CoProcType" is always set to "MIC". "MICFamily", "MICSKU" and
+ * "MICSerialNumber" are the first line of the family, sku and serialnumber
+ * files. "MICActiveCores" and "MICMemorySize" are parsed as hexadecimal from
+ * active_cores and memsize and stored in decimal.
+ * Files that cannot be opened or read are silently skipped.
+ */
+static void
+hwloc_linux_mic_class_fillinfos(struct hwloc_backend *backend,
+				struct hwloc_obj *obj, const char *osdevpath)
+{
+  struct hwloc_linux_backend_data_s *priv = backend->private_data;
+  const int root_fd = priv->root_fd;
+  char attrpath[256];
+  FILE *fp;
+
+  hwloc_obj_add_info(obj, "CoProcType", "MIC");
+
+  /* family: first line, newline removed */
+  snprintf(attrpath, sizeof(attrpath), "%s/family", osdevpath);
+  fp = hwloc_fopen(attrpath, "r", root_fd);
+  if (fp != NULL) {
+    char family[64];
+    if (fgets(family, sizeof(family), fp) != NULL) {
+      family[strcspn(family, "\n")] = '\0';
+      hwloc_obj_add_info(obj, "MICFamily", family);
+    }
+    fclose(fp);
+  }
+
+  /* sku: first line, newline removed */
+  snprintf(attrpath, sizeof(attrpath), "%s/sku", osdevpath);
+  fp = hwloc_fopen(attrpath, "r", root_fd);
+  if (fp != NULL) {
+    char sku[64];
+    if (fgets(sku, sizeof(sku), fp) != NULL) {
+      sku[strcspn(sku, "\n")] = '\0';
+      hwloc_obj_add_info(obj, "MICSKU", sku);
+    }
+    fclose(fp);
+  }
+
+  /* serialnumber: first line, newline removed */
+  snprintf(attrpath, sizeof(attrpath), "%s/serialnumber", osdevpath);
+  fp = hwloc_fopen(attrpath, "r", root_fd);
+  if (fp != NULL) {
+    char sn[64];
+    if (fgets(sn, sizeof(sn), fp) != NULL) {
+      sn[strcspn(sn, "\n")] = '\0';
+      hwloc_obj_add_info(obj, "MICSerialNumber", sn);
+    }
+    fclose(fp);
+  }
+
+  /* active_cores: hexadecimal in the file, decimal in the info;
+   * the same buffer is reused for the converted value */
+  snprintf(attrpath, sizeof(attrpath), "%s/active_cores", osdevpath);
+  fp = hwloc_fopen(attrpath, "r", root_fd);
+  if (fp != NULL) {
+    char string[10];
+    if (fgets(string, sizeof(string), fp) != NULL) {
+      unsigned long cores = strtoul(string, NULL, 16);
+      snprintf(string, sizeof(string), "%lu", cores);
+      hwloc_obj_add_info(obj, "MICActiveCores", string);
+    }
+    fclose(fp);
+  }
+
+  /* memsize: hexadecimal in the file, decimal in the info;
+   * the same buffer is reused for the converted value */
+  snprintf(attrpath, sizeof(attrpath), "%s/memsize", osdevpath);
+  fp = hwloc_fopen(attrpath, "r", root_fd);
+  if (fp != NULL) {
+    char string[20];
+    if (fgets(string, sizeof(string), fp) != NULL) {
+      unsigned long memsize = strtoul(string, NULL, 16);
+      snprintf(string, sizeof(string), "%lu", memsize);
+      hwloc_obj_add_info(obj, "MICMemorySize", string);
+    }
+    fclose(fp);
+  }
+}
+
+/* Create the "mic" class OS devices found below the sysfs directory of pcidev
+ * and fill their infos; returns what hwloc_linux_class_readdir() returns.
+ */
+static int
+hwloc_linux_lookup_mic_class(struct hwloc_backend *backend,
+			     struct hwloc_obj *pcidev, const char *pcidevpath)
+{
+  int found;
+
+  found = hwloc_linux_class_readdir(backend, pcidev, pcidevpath,
+				    HWLOC_OBJ_OSDEV_COPROC, "mic",
+				    hwloc_linux_mic_class_fillinfos);
+  return found;
+}
+
+/* Look for MIC OS devices of pcidev starting from the mic class instead of
+ * the PCI device directory.
+ * On the first call, the mic class directory is scanned once to find the
+ * highest mic%u index, which is cached in data->mic_directlookup_id_max
+ * (0 meaning that nothing was found and that no lookup is needed anymore).
+ * Then every /sys/class/mic/mic<idx> containing a pci_<bus>:<dev>.<func>
+ * entry matching pcidev gets a coprocessor OS device.
+ * Returns the number of OS devices created.
+ */
+static int
+hwloc_linux_directlookup_mic_class(struct hwloc_backend *backend,
+				   struct hwloc_obj *pcidev)
+{
+  struct hwloc_linux_backend_data_s *data = backend->private_data;
+  int root_fd = data->root_fd;
+  char path[256];
+  struct stat st;
+  hwloc_obj_t obj;
+  unsigned idx;
+  int res = 0;
+
+  if (!data->mic_directlookup_id_max)
+    /* already tried, nothing to do */
+    return 0;
+
+  if (data->mic_directlookup_id_max == (unsigned) -1) {
+    /* never tried, find out the max id */
+    DIR *dir;
+    struct dirent *dirent;
+
+    /* make sure we never do this lookup again */
+    data->mic_directlookup_id_max = 0;
+
+    /* read the entire class and find the max id of mic%u dirents */
+    dir = hwloc_opendir("/sys/devices/virtual/mic", root_fd);
+    if (!dir) {
+      /* go through hwloc_opendir() here as well, so that this fallback
+       * is looked up below root_fd like every other path in this function */
+      dir = hwloc_opendir("/sys/class/mic", root_fd);
+      if (!dir)
+	return 0;
+    }
+    while ((dirent = readdir(dir)) != NULL) {
+      if (!strcmp(dirent->d_name, ".") || !strcmp(dirent->d_name, ".."))
+	continue;
+      if (sscanf(dirent->d_name, "mic%u", &idx) != 1)
+	continue;
+      if (idx >= data->mic_directlookup_id_max)
+	data->mic_directlookup_id_max = idx+1;
+    }
+    closedir(dir);
+  }
+
+  /* now iterate over the mic ids and see if one matches our pcidev */
+  for(idx=0; idx<data->mic_directlookup_id_max; idx++) {
+    snprintf(path, sizeof(path), "/sys/class/mic/mic%u/pci_%02x:%02x.%02x",
+	     idx, pcidev->attr->pcidev.bus,  pcidev->attr->pcidev.dev,  pcidev->attr->pcidev.func);
+    if (hwloc_stat(path, &st, root_fd) < 0)
+      continue;
+    /* path is reused: first as the OS device name, then as its sysfs directory */
+    snprintf(path, sizeof(path), "mic%u", idx);
+    obj = hwloc_linux_add_os_device(backend, pcidev, HWLOC_OBJ_OSDEV_COPROC, path);
+    snprintf(path, sizeof(path), "/sys/class/mic/mic%u", idx);
+    hwloc_linux_mic_class_fillinfos(backend, obj, path);
+    res++;
+  }
+
+  return res;
+}
+
+/*
+ * backend callback for inserting objects inside a pci device
+ *
+ * Runs every class lookup (net, openfabrics, dma, drm, block, mic) on the
+ * sysfs directory of the PCI device obj and returns the sum of their results.
+ */
+static int
+hwloc_linux_backend_notify_new_object(struct hwloc_backend *backend, struct hwloc_backend *caller __hwloc_attribute_unused,
+				      struct hwloc_obj *obj)
+{
+  struct hwloc_linux_backend_data_s *priv = backend->private_data;
+  char devpath[256];
+  int added;
+
+  /* this callback is only used in the libpci backend for now */
+  assert(obj->type == HWLOC_OBJ_PCI_DEVICE);
+
+  snprintf(devpath, sizeof(devpath), "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/",
+	   obj->attr->pcidev.domain, obj->attr->pcidev.bus,
+	   obj->attr->pcidev.dev, obj->attr->pcidev.func);
+
+  added = hwloc_linux_lookup_net_class(backend, obj, devpath);
+  added += hwloc_linux_lookup_openfabrics_class(backend, obj, devpath);
+  added += hwloc_linux_lookup_dma_class(backend, obj, devpath);
+  added += hwloc_linux_lookup_drm_class(backend, obj, devpath);
+  added += hwloc_linux_lookup_block_class(backend, obj, devpath);
+
+  if (priv->mic_need_directlookup == -1) {
+    /* first call: decide once which mic lookup must be used.
+     * hwloc_linux_lookup_mic_class will fail when pcidev sysfs directories
+     * do not have mic/mic%u symlinks to mic devices (old mic driver).
+     * if so, try from the mic class.
+     */
+    struct stat st;
+    int old_driver = hwloc_stat("/sys/class/mic/mic0", &st, priv->root_fd) == 0
+	&& hwloc_stat("/sys/class/mic/mic0/device/mic/mic0", &st, priv->root_fd) == -1;
+    priv->mic_need_directlookup = old_driver ? 1 : 0;
+  }
+
+  added += priv->mic_need_directlookup
+    ? hwloc_linux_directlookup_mic_class(backend, obj)
+    : hwloc_linux_lookup_mic_class(backend, obj, devpath);
+
+  return added;
+}
+
+/*
+ * backend callback for retrieving the location of a pci device
+ *
+ * Parses the local_cpus sysfs file of the PCI device (or PCI bridge) obj
+ * into cpuset. Returns 0 if the file was parsed into a non-empty cpuset,
+ * -1 otherwise.
+ */
+static int
+hwloc_linux_backend_get_obj_cpuset(struct hwloc_backend *backend,
+				   struct hwloc_backend *caller __hwloc_attribute_unused,
+				   struct hwloc_obj *obj, hwloc_bitmap_t cpuset)
+{
+  struct hwloc_linux_backend_data_s *priv = backend->private_data;
+  char cpuspath[256];
+  FILE *fp;
+  int parse_err;
+
+  /* this callback is only used in the libpci backend for now */
+  assert(obj->type == HWLOC_OBJ_PCI_DEVICE
+	 || (obj->type == HWLOC_OBJ_BRIDGE && obj->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_PCI));
+
+  snprintf(cpuspath, sizeof(cpuspath), "/sys/bus/pci/devices/%04x:%02x:%02x.%01x/local_cpus",
+	   obj->attr->pcidev.domain, obj->attr->pcidev.bus,
+	   obj->attr->pcidev.dev, obj->attr->pcidev.func);
+
+  fp = hwloc_fopen(cpuspath, "r", priv->root_fd);
+  if (fp == NULL)
+    return -1;
+
+  parse_err = hwloc_linux_parse_cpumap_file(fp, cpuset);
+  fclose(fp);
+
+  /* an empty cpuset is reported as a failure too */
+  if (parse_err || hwloc_bitmap_iszero(cpuset))
+    return -1;
+  return 0;
+}
+
+
+
+/*******************************
+ ******* Linux component *******
+ *******************************/
+
+/* Backend disable callback: release everything owned by the private data
+ * (fsroot descriptor, udev context) and the private data itself.
+ */
+static void
+hwloc_linux_backend_disable(struct hwloc_backend *backend)
+{
+  struct hwloc_linux_backend_data_s *priv = backend->private_data;
+
+#ifdef HAVE_OPENAT
+  close(priv->root_fd);
+#endif
+#ifdef HAVE_LIBUDEV_H
+  if (priv->udev != NULL)
+    udev_unref(priv->udev);
+#endif
+  free(priv);
+}
+
+/* Instantiate the linux backend.
+ * _data1 is the path of the filesystem root to use, or NULL for "/".
+ * _data2 and _data3 are unused.
+ * Returns the new backend, or NULL on failure (allocation failure, fsroot
+ * that cannot be opened, or non-"/" fsroot without openat() support).
+ */
+static struct hwloc_backend *
+hwloc_linux_component_instantiate(struct hwloc_disc_component *component,
+				  const void *_data1,
+				  const void *_data2 __hwloc_attribute_unused,
+				  const void *_data3 __hwloc_attribute_unused)
+{
+  struct hwloc_backend *backend;
+  struct hwloc_linux_backend_data_s *data;
+  const char * fsroot_path = _data1;
+  int flags, root = -1;
+
+  backend = hwloc_backend_alloc(component);
+  if (!backend)
+    goto out;
+
+  data = malloc(sizeof(*data));
+  if (!data) {
+    errno = ENOMEM;
+    goto out_with_backend;
+  }
+
+  /* install the private data and the callbacks implemented in this file */
+  backend->private_data = data;
+  backend->discover = hwloc_look_linuxfs;
+  backend->get_obj_cpuset = hwloc_linux_backend_get_obj_cpuset;
+  backend->notify_new_object = hwloc_linux_backend_notify_new_object;
+  backend->disable = hwloc_linux_backend_disable;
+
+  /* default values */
+  data->is_real_fsroot = 1;
+  if (!fsroot_path)
+    fsroot_path = "/";
+
+#ifdef HAVE_OPENAT
+  root = open(fsroot_path, O_RDONLY | O_DIRECTORY);
+  if (root < 0)
+    goto out_with_data;
+
+  /* a custom fsroot does not describe the machine we are running on */
+  if (strcmp(fsroot_path, "/")) {
+    backend->is_thissystem = 0;
+    data->is_real_fsroot = 0;
+  }
+
+  /* Since this fd stays open after hwloc returns, mark it as
+     close-on-exec so that children don't inherit it.  Stevens says
+     that we should GETFD before we SETFD, so we do. */
+  flags = fcntl(root, F_GETFD, 0);
+  if (-1 == flags ||
+      -1 == fcntl(root, F_SETFD, FD_CLOEXEC | flags)) {
+      close(root);
+      root = -1;
+      goto out_with_data;
+  }
+#else
+  /* without openat(), only the real root can be used */
+  if (strcmp(fsroot_path, "/")) {
+    errno = ENOSYS;
+    goto out_with_data;
+  }
+#endif
+  /* root is still -1 here when openat() is not available */
+  data->root_fd = root;
+
+#ifdef HAVE_LIBUDEV_H
+  /* a udev context is only created for the real fsroot */
+  data->udev = NULL;
+  if (data->is_real_fsroot) {
+    data->udev = udev_new();
+  }
+#endif
+
+  data->deprecated_classlinks_model = -2; /* never tried */
+  data->mic_need_directlookup = -1; /* not initialized */
+  data->mic_directlookup_id_max = -1; /* not initialized */
+
+  return backend;
+
+  /* error path: release what was allocated, in reverse order */
+ out_with_data:
+  free(data);
+ out_with_backend:
+  free(backend);
+ out:
+  return NULL;
+}
+
+/* Discovery component descriptor of the "linux" backend.
+ * NOTE(review): the fields are initialized positionally; their meaning is
+ * defined by struct hwloc_disc_component, declared elsewhere -- confirm there.
+ */
+static struct hwloc_disc_component hwloc_linux_disc_component = {
+  HWLOC_DISC_COMPONENT_TYPE_CPU,
+  "linux",
+  HWLOC_DISC_COMPONENT_TYPE_GLOBAL,
+  hwloc_linux_component_instantiate,
+  50, /* presumably the component priority -- TODO confirm */
+  NULL
+};
+
+/* Exported generic component wrapping hwloc_linux_disc_component.
+ * NOTE(review): the fields are initialized positionally; their meaning is
+ * defined by struct hwloc_component, declared elsewhere -- confirm there.
+ */
+const struct hwloc_component hwloc_linux_component = {
+  HWLOC_COMPONENT_ABI,
+  NULL, NULL,
+  HWLOC_COMPONENT_TYPE_DISC,
+  0,
+  &hwloc_linux_disc_component
+};
+
+
+
+
+#ifdef HWLOC_HAVE_LINUXPCI
+
+/***********************************
+ ******* Linux PCI component *******
+ ***********************************/
+
+#define HWLOC_PCI_REVISION_ID 0x08
+#define HWLOC_PCI_CAP_ID_EXP 0x10
+#define HWLOC_PCI_CLASS_NOT_DEFINED 0x0000
+
+/* Read the sysfs attribute /sys/bus/pci/devices/<devname>/<attrname> (below
+ * root_fd) and parse its beginning as a hexadecimal number.
+ * Returns 0 and stores the number in *valuep if at least one byte was read,
+ * -1 (leaving *valuep untouched) otherwise.
+ */
+static int
+hwloc_linuxfs_pci_read_hexattr(const char *devname, const char *attrname,
+			       int root_fd, unsigned long *valuep)
+{
+  char path[64];
+  char value[16];
+  size_t nread;
+  FILE *file;
+
+  snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/%s", devname, attrname);
+  file = hwloc_fopen(path, "r", root_fd);
+  if (!file)
+    return -1;
+  /* keep one byte for the terminator: fread() does not add any,
+   * and strtoul() must not read past what was actually read */
+  nread = fread(value, 1, sizeof(value)-1, file);
+  fclose(file);
+  if (!nread)
+    return -1;
+  value[nread] = '\0';
+  *valuep = strtoul(value, NULL, 16);
+  return 0;
+}
+
+/* Discovery callback of the linuxpci backend.
+ * Does nothing unless I/O discovery is enabled in the topology flags and no
+ * PCI device was added by another backend yet.
+ * Builds one PCI device object per entry of /sys/bus/pci/devices/ (ids,
+ * class, revision, bridge attributes and link speed), attaches the "PCISlot"
+ * info found in /sys/bus/pci/slots/, and hands the resulting list to
+ * hwloc_insert_pci_device_list(), whose return value is returned.
+ * Returns 0 when nothing is done or when the devices directory cannot be
+ * opened.
+ */
+static int
+hwloc_look_linuxfs_pci(struct hwloc_backend *backend)
+{
+  struct hwloc_topology *topology = backend->topology;
+  struct hwloc_backend *tmpbackend;
+  hwloc_obj_t first_obj = NULL, last_obj = NULL;
+  int root_fd = -1;
+  DIR *dir;
+  struct dirent *dirent;
+  int res = 0;
+
+  if (!(hwloc_topology_get_flags(topology) & (HWLOC_TOPOLOGY_FLAG_IO_DEVICES|HWLOC_TOPOLOGY_FLAG_WHOLE_IO)))
+    return 0;
+
+  if (hwloc_get_next_pcidev(topology, NULL)) {
+    hwloc_debug("%s", "PCI objects already added, ignoring linuxpci backend.\n");
+    return 0;
+  }
+
+  /* hackily find the linux backend to steal its fsroot */
+  tmpbackend = topology->backends;
+  while (tmpbackend) {
+    if (tmpbackend->component == &hwloc_linux_disc_component) {
+      root_fd = ((struct hwloc_linux_backend_data_s *) tmpbackend->private_data)->root_fd;
+      hwloc_debug("linuxpci backend stole linux backend root_fd %d\n", root_fd);
+      break;
+    }
+    tmpbackend = tmpbackend->next;
+  }
+  /* take our own descriptor, either pointing to linux fsroot, or to / if not found */
+  if (root_fd >= 0)
+    root_fd = dup(root_fd);
+  else
+    root_fd = open("/", O_RDONLY | O_DIRECTORY);
+
+  dir = hwloc_opendir("/sys/bus/pci/devices/", root_fd);
+  if (!dir)
+    goto out_with_rootfd;
+
+  while ((dirent = readdir(dir)) != NULL) {
+    unsigned domain, bus, dev, func;
+    hwloc_obj_t obj;
+    struct hwloc_pcidev_attr_s *attr;
+    unsigned os_index;
+    unsigned long hexvalue;
+    char path[64];
+    size_t read;
+    FILE *file;
+
+    if (sscanf(dirent->d_name, "%04x:%02x:%02x.%01x", &domain, &bus, &dev, &func) != 4)
+      continue;
+
+    os_index = (domain << 20) + (bus << 12) + (dev << 4) + func;
+    obj = hwloc_alloc_setup_object(HWLOC_OBJ_PCI_DEVICE, os_index);
+    if (!obj)
+      break;
+    attr = &obj->attr->pcidev;
+
+    attr->domain = domain;
+    attr->bus = bus;
+    attr->dev = dev;
+    attr->func = func;
+
+    /* default (unknown) values */
+    attr->vendor_id = 0;
+    attr->device_id = 0;
+    attr->class_id = HWLOC_PCI_CLASS_NOT_DEFINED;
+    attr->revision = 0;
+    attr->subvendor_id = 0;
+    attr->subdevice_id = 0;
+    attr->linkspeed = 0;
+
+    /* the defaults above are kept for attributes that cannot be read */
+    if (!hwloc_linuxfs_pci_read_hexattr(dirent->d_name, "vendor", root_fd, &hexvalue))
+      attr->vendor_id = hexvalue;
+    if (!hwloc_linuxfs_pci_read_hexattr(dirent->d_name, "device", root_fd, &hexvalue))
+      attr->device_id = hexvalue;
+    if (!hwloc_linuxfs_pci_read_hexattr(dirent->d_name, "class", root_fd, &hexvalue))
+      attr->class_id = hexvalue >> 8; /* drop the lowest byte of the class file */
+    if (!hwloc_linuxfs_pci_read_hexattr(dirent->d_name, "subsystem_vendor", root_fd, &hexvalue))
+      attr->subvendor_id = hexvalue;
+    if (!hwloc_linuxfs_pci_read_hexattr(dirent->d_name, "subsystem_device", root_fd, &hexvalue))
+      attr->subdevice_id = hexvalue;
+
+    snprintf(path, sizeof(path), "/sys/bus/pci/devices/%s/config", dirent->d_name);
+    file = hwloc_fopen(path, "r", root_fd);
+    if (file) {
+#define CONFIG_SPACE_CACHESIZE 256
+      unsigned char config_space_cache[CONFIG_SPACE_CACHESIZE];
+      unsigned offset;
+
+      /* initialize the config space in case we fail to read it (missing permissions, etc). */
+      memset(config_space_cache, 0xff, CONFIG_SPACE_CACHESIZE);
+      read = fread(config_space_cache, 1, CONFIG_SPACE_CACHESIZE, file);
+      (void) read; /* we initialized config_space_cache in case we don't read enough, ignore the read length */
+      fclose(file);
+
+      /* is this a bridge? */
+      hwloc_pci_prepare_bridge(obj, config_space_cache);
+
+      /* get the revision */
+      attr->revision = config_space_cache[HWLOC_PCI_REVISION_ID];
+
+      /* try to get the link speed */
+      offset = hwloc_pci_find_cap(config_space_cache, HWLOC_PCI_CAP_ID_EXP);
+      if (offset > 0 && offset + 20 /* size of PCI express block up to link status */ <= CONFIG_SPACE_CACHESIZE)
+	hwloc_pci_find_linkspeed(config_space_cache, offset, &attr->linkspeed);
+    }
+
+    /* append to the temporary list chained through next_sibling */
+    if (first_obj)
+      last_obj->next_sibling = obj;
+    else
+      first_obj = obj;
+    last_obj = obj;
+  }
+
+  closedir(dir);
+
+  dir = hwloc_opendir("/sys/bus/pci/slots/", root_fd);
+  if (dir) {
+    while ((dirent = readdir(dir)) != NULL) {
+      char path[64];
+      FILE *file;
+      if (dirent->d_name[0] == '.')
+	continue;
+      snprintf(path, sizeof(path), "/sys/bus/pci/slots/%s/address", dirent->d_name);
+      file = hwloc_fopen(path, "r", root_fd);
+      if (file) {
+	unsigned domain, bus, dev;
+	if (fscanf(file, "%x:%x:%x", &domain, &bus, &dev) == 3) {
+	  /* the slot name is attached to function 0 of the matching device */
+	  hwloc_obj_t obj = first_obj;
+	  while (obj) {
+	    if (obj->attr->pcidev.domain == domain
+		&& obj->attr->pcidev.bus == bus
+		&& obj->attr->pcidev.dev == dev
+		&& obj->attr->pcidev.func == 0) {
+	      hwloc_obj_add_info(obj, "PCISlot", dirent->d_name);
+	      break;
+	    }
+	    obj = obj->next_sibling;
+	  }
+	}
+	fclose(file);
+      }
+    }
+    closedir(dir);
+  }
+
+  res = hwloc_insert_pci_device_list(backend, first_obj);
+
+ out_with_rootfd:
+  close(root_fd);
+  return res;
+}
+
+/* Instantiate the linuxpci backend; the three data arguments are unused.
+ * Returns the new backend, or NULL if it cannot be allocated.
+ */
+static struct hwloc_backend *
+hwloc_linuxpci_component_instantiate(struct hwloc_disc_component *component,
+				     const void *_data1 __hwloc_attribute_unused,
+				     const void *_data2 __hwloc_attribute_unused,
+				     const void *_data3 __hwloc_attribute_unused)
+{
+  /* thissystem may not be fully initialized yet, we'll check flags in discover() */
+  struct hwloc_backend *pci_backend = hwloc_backend_alloc(component);
+
+  if (pci_backend != NULL) {
+    pci_backend->flags = HWLOC_BACKEND_FLAG_NEED_LEVELS;
+    pci_backend->discover = hwloc_look_linuxfs_pci;
+  }
+  return pci_backend;
+}
+
+/* Discovery component descriptor of the "linuxpci" backend.
+ * NOTE(review): the fields are initialized positionally; their meaning is
+ * defined by struct hwloc_disc_component, declared elsewhere -- confirm there.
+ */
+static struct hwloc_disc_component hwloc_linuxpci_disc_component = {
+  HWLOC_DISC_COMPONENT_TYPE_MISC,
+  "linuxpci",
+  HWLOC_DISC_COMPONENT_TYPE_GLOBAL,
+  hwloc_linuxpci_component_instantiate,
+  19, /* after pci */
+  NULL
+};
+
+/* Exported generic component wrapping hwloc_linuxpci_disc_component.
+ * NOTE(review): the fields are initialized positionally; their meaning is
+ * defined by struct hwloc_component, declared elsewhere -- confirm there.
+ */
+const struct hwloc_component hwloc_linuxpci_component = {
+  HWLOC_COMPONENT_ABI,
+  NULL, NULL,
+  HWLOC_COMPONENT_TYPE_DISC,
+  0,
+  &hwloc_linuxpci_disc_component
+};
+
+#endif /* HWLOC_HAVE_LINUXPCI */
diff --git a/ext/hwloc/hwloc/topology-noos.c b/ext/hwloc/hwloc/topology-noos.c
new file mode 100644
index 0000000..a926428
--- /dev/null
+++ b/ext/hwloc/hwloc/topology-noos.c
@@ -0,0 +1,58 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2014 Inria.  All rights reserved.
+ * Copyright © 2009-2012 Université Bordeaux
+ * Copyright © 2009-2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#include <private/autogen/config.h>
+#include <hwloc.h>
+#include <private/private.h>
+
+/*
+ * Fallback discovery callback.
+ * Returns 0 without touching anything when the root object already has a
+ * cpuset, i.e. another backend discovered the topology.  Otherwise it
+ * allocates the root cpusets, sets up the PU level from
+ * hwloc_fallback_nbprocessors(), adds uname info when the topology
+ * describes this system, and returns 1.
+ */
+static int
+hwloc_look_noos(struct hwloc_backend *backend)
+{
+  struct hwloc_topology *topo = backend->topology;
+
+  if (!topo->levels[0][0]->cpuset) {
+    hwloc_alloc_obj_cpusets(topo->levels[0][0]);
+    hwloc_setup_pu_level(topo, hwloc_fallback_nbprocessors(topo));
+    if (topo->is_thissystem)
+      hwloc_add_uname_info(topo, NULL);
+    return 1;
+  }
+
+  /* somebody discovered things */
+  return 0;
+}
+
+/*
+ * Instantiate the "no_os" discovery backend for `component`.
+ * The three opaque data arguments are ignored.
+ * Returns the new backend, or NULL if hwloc_backend_alloc() fails.
+ */
+static struct hwloc_backend *
+hwloc_noos_component_instantiate(struct hwloc_disc_component *component,
+				 const void *_data1 __hwloc_attribute_unused,
+				 const void *_data2 __hwloc_attribute_unused,
+				 const void *_data3 __hwloc_attribute_unused)
+{
+  struct hwloc_backend *new_backend = hwloc_backend_alloc(component);
+
+  if (new_backend)
+    new_backend->discover = hwloc_look_noos;
+  return new_backend;
+}
+
+/* Discovery-component descriptor for the "no_os" fallback backend.
+ * The initializer is positional; the field layout comes from
+ * struct hwloc_disc_component, which is declared outside this file.
+ */
+static struct hwloc_disc_component hwloc_noos_disc_component = {
+  HWLOC_DISC_COMPONENT_TYPE_CPU,
+  "no_os",
+  HWLOC_DISC_COMPONENT_TYPE_GLOBAL,
+  hwloc_noos_component_instantiate, /* creates the backend for this component */
+  40, /* lower than native OS component, higher than globals */
+  NULL
+};
+
+/* Top-level component descriptor exposing hwloc_noos_disc_component
+ * as a HWLOC_COMPONENT_TYPE_DISC component.
+ */
+const struct hwloc_component hwloc_noos_component = {
+  HWLOC_COMPONENT_ABI,
+  NULL, NULL,
+  HWLOC_COMPONENT_TYPE_DISC,
+  0,
+  &hwloc_noos_disc_component
+};
diff --git a/ext/hwloc/hwloc/topology-opencl.cb b/ext/hwloc/hwloc/topology-opencl.cb
new file mode 100644
index 0000000..85057c7
--- /dev/null
+++ b/ext/hwloc/hwloc/topology-opencl.cb
@@ -0,0 +1,346 @@
+/*
+ * Copyright © 2012-2014 Inria.  All rights reserved.
+ * Copyright © 2013 Université Bordeaux.  All right reserved.
+ * See COPYING in top-level directory.
+ */
+
+#include <private/autogen/config.h>
+#include <hwloc.h>
+#include <hwloc/plugins.h>
+
+/* private headers allowed for convenience because this plugin is built within hwloc */
+#include <private/misc.h>
+#include <private/debug.h>
+
+#include <CL/cl_ext.h>
+
+/* Kind of vendor-specific information stored for an OpenCL device;
+ * AMD is the only kind defined here.
+ */
+typedef enum hwloc_opencl_device_type_e {
+  HWLOC_OPENCL_DEVICE_AMD
+} hwloc_opencl_device_type_t;
+
+/* Private state of the OpenCL backend: the OpenCL devices for which a PCI
+ * location is known.  It is filled by hwloc_opencl_query_devices() on the
+ * first new-object notification.
+ */
+struct hwloc_opencl_backend_data_s {
+  unsigned nr_devices; /* -1 when unknown yet, first callback will setup */
+  struct hwloc_opencl_device_info_s {
+    hwloc_opencl_device_type_t type; /* tells which member of `specific` is filled */
+
+    unsigned platformidx; /* index of the device's platform in the queried platform list */
+    char platformname[64]; /* CL_PLATFORM_NAME, empty string when unknown */
+    unsigned platformdeviceidx; /* index of the device among the devices of its platform */
+    char devicename[64]; /* CL_DEVICE_BOARD_NAME_AMD when available, CL_DEVICE_NAME otherwise */
+    char devicevendor[64]; /* CL_DEVICE_VENDOR */
+    char devicetype[64]; /* "CPU", "GPU", "Accelerator" or "Unknown" */
+
+    unsigned computeunits; /* CL_DEVICE_MAX_COMPUTE_UNITS */
+    unsigned long long globalmemsize; /* CL_DEVICE_GLOBAL_MEM_SIZE divided by 1024 */
+
+    union hwloc_opencl_device_info_u {
+      struct hwloc_opencl_device_info_amd_s {
+        unsigned pcidomain, pcibus, pcidev, pcifunc; /* PCI location reported by the AMD topology extension */
+      } amd;
+    } specific;
+  } * devices; /* allocated array; only the first nr_devices entries are valid */
+};
+
+/*
+ * Enumerate the OpenCL devices of every platform and store in data->devices
+ * those whose PCI location is reported by the AMD topology extension.
+ *
+ * data->nr_devices is reset to 0 first, so that a failure is not retried by
+ * later callbacks, and is incremented for each validated device.
+ * On allocation or platform-query failure data->devices is left NULL.
+ *
+ * A platform whose device query fails (for instance CL_DEVICE_NOT_FOUND when
+ * it has no device) is skipped instead of aborting the whole discovery.
+ */
+static void
+hwloc_opencl_query_devices(struct hwloc_opencl_backend_data_s *data)
+{
+  cl_platform_id *platform_ids = NULL;
+  cl_uint nr_platforms;
+  cl_device_id *device_ids = NULL;
+  cl_uint nr_devices, nr_total_devices, tmp;
+  cl_int clret;
+  unsigned curpfidx, curpfdvidx, i;
+
+  /* mark the number of devices as 0 in case we fail below,
+   * so that we don't try again later.
+   */
+  data->nr_devices = 0;
+
+  /* count platforms, allocate and get them */
+  clret = clGetPlatformIDs(0, NULL, &nr_platforms);
+  if (CL_SUCCESS != clret || !nr_platforms)
+    goto out;
+  hwloc_debug("%u OpenCL platforms\n", nr_platforms);
+  platform_ids = malloc(nr_platforms * sizeof(*platform_ids));
+  if (!platform_ids)
+    goto out;
+  clret = clGetPlatformIDs(nr_platforms, platform_ids, &nr_platforms);
+  if (CL_SUCCESS != clret || !nr_platforms)
+    goto out_with_platform_ids;
+
+  /* how many devices, total? */
+  tmp = 0;
+  for(i=0; i<nr_platforms; i++) {
+    clret = clGetDeviceIDs(platform_ids[i], CL_DEVICE_TYPE_ALL, 0, NULL, &nr_devices);
+    if (CL_SUCCESS != clret)
+      /* this platform has no usable device, keep looking at the others */
+      continue;
+    tmp += nr_devices;
+  }
+  nr_total_devices = tmp;
+  hwloc_debug("%u OpenCL devices total\n", nr_total_devices);
+  if (!nr_total_devices)
+    /* nothing to store, and avoid zero-sized allocations below */
+    goto out_with_platform_ids;
+
+  /* allocate structs */
+  device_ids = malloc(nr_total_devices * sizeof(*device_ids));
+  data->devices = malloc(nr_total_devices * sizeof(*data->devices));
+  if (!data->devices || !device_ids)
+    goto out_with_device_ids;
+
+  /* actually query device ids */
+  tmp = 0;
+  for(i=0; i<nr_platforms && tmp<nr_total_devices; i++) {
+    clret = clGetDeviceIDs(platform_ids[i], CL_DEVICE_TYPE_ALL, nr_total_devices - tmp, device_ids + tmp, &nr_devices);
+    if (CL_SUCCESS != clret)
+      continue;
+    /* the reported count may exceed the room we offered, only that many ids were written */
+    if (nr_devices > nr_total_devices - tmp)
+      nr_devices = nr_total_devices - tmp;
+    tmp += nr_devices;
+  }
+  /* only look at the ids that were actually written */
+  nr_total_devices = tmp;
+
+  /* query individual devices */
+  curpfidx = 0;
+  curpfdvidx = 0;
+  for(i=0; i<nr_total_devices; i++) {
+    struct hwloc_opencl_device_info_s *info = &data->devices[data->nr_devices];
+    cl_platform_id platform_id = 0;
+    /* zero-initialized so that a failing clGetDeviceInfo() below yields
+     * "Unknown"/0 instead of an indeterminate value */
+    cl_device_type type = 0;
+#ifdef CL_DEVICE_TOPOLOGY_AMD
+    cl_device_topology_amd amdtopo;
+#endif
+    cl_ulong globalmemsize = 0;
+    cl_uint computeunits = 0;
+    unsigned pfidx;
+
+    hwloc_debug("Looking device %p\n", device_ids[i]);
+
+    info->platformname[0] = '\0';
+    clret = clGetDeviceInfo(device_ids[i], CL_DEVICE_PLATFORM, sizeof(platform_id), &platform_id, NULL);
+    if (CL_SUCCESS != clret)
+      continue;
+    clGetPlatformInfo(platform_id, CL_PLATFORM_NAME, sizeof(info->platformname), info->platformname, NULL);
+
+    info->devicename[0] = '\0';
+#ifdef CL_DEVICE_BOARD_NAME_AMD
+    clGetDeviceInfo(device_ids[i], CL_DEVICE_BOARD_NAME_AMD, sizeof(info->devicename), info->devicename, NULL);
+#else
+    clGetDeviceInfo(device_ids[i], CL_DEVICE_NAME, sizeof(info->devicename), info->devicename, NULL);
+#endif
+    info->devicevendor[0] = '\0';
+    clGetDeviceInfo(device_ids[i], CL_DEVICE_VENDOR, sizeof(info->devicevendor), info->devicevendor, NULL);
+
+    clGetDeviceInfo(device_ids[i], CL_DEVICE_TYPE, sizeof(type), &type, NULL);
+    switch (type) {
+    case CL_DEVICE_TYPE_CPU: /* FIXME: cannot happen in PCI devices? */
+      strcpy(info->devicetype, "CPU");
+      break;
+    case CL_DEVICE_TYPE_GPU:
+      strcpy(info->devicetype, "GPU");
+      break;
+    case CL_DEVICE_TYPE_ACCELERATOR:
+      strcpy(info->devicetype, "Accelerator");
+      break;
+    default:
+      strcpy(info->devicetype, "Unknown");
+      break;
+    }
+
+    clGetDeviceInfo(device_ids[i], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(globalmemsize), &globalmemsize, NULL);
+    info->globalmemsize = globalmemsize / 1024;
+
+    clGetDeviceInfo(device_ids[i], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(computeunits), &computeunits, NULL);
+    info->computeunits = computeunits;
+
+    hwloc_debug("platform %s device %s vendor %s type %s\n", info->platformname, info->devicename, info->devicevendor, info->devicetype);
+
+    /* find our indexes: devices were stored platform after platform, so the
+     * platform is searched forward from the current one, without ever
+     * reading past the end of platform_ids */
+    pfidx = curpfidx;
+    while (pfidx < nr_platforms && platform_id != platform_ids[pfidx])
+      pfidx++;
+    if (pfidx == nr_platforms) {
+      hwloc_debug("%s", "device platform not found in platform list, ignoring device\n");
+      continue;
+    }
+    if (pfidx != curpfidx) {
+      /* first device of a new platform */
+      curpfidx = pfidx;
+      curpfdvidx = 0;
+    }
+    info->platformidx = curpfidx;
+    info->platformdeviceidx = curpfdvidx;
+    curpfdvidx++;
+
+    hwloc_debug("This is opencl%dd%d\n", info->platformidx, info->platformdeviceidx);
+
+#ifdef CL_DEVICE_TOPOLOGY_AMD
+    clret = clGetDeviceInfo(device_ids[i], CL_DEVICE_TOPOLOGY_AMD, sizeof(amdtopo), &amdtopo, NULL);
+    if (CL_SUCCESS != clret) {
+      hwloc_debug("no AMD-specific device information: %d\n", clret);
+      continue;
+    }
+    if (CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD != amdtopo.raw.type) {
+      hwloc_debug("not a PCIe device: %u\n", amdtopo.raw.type);
+      continue;
+    }
+
+    info->type = HWLOC_OPENCL_DEVICE_AMD;
+    info->specific.amd.pcidomain = 0;
+    info->specific.amd.pcibus = amdtopo.pcie.bus;
+    info->specific.amd.pcidev = amdtopo.pcie.device;
+    info->specific.amd.pcifunc = amdtopo.pcie.function;
+
+    hwloc_debug("OpenCL device on PCI 0000:%02x:%02x.%u\n", amdtopo.pcie.bus, amdtopo.pcie.device, amdtopo.pcie.function);
+
+    /* validate this device: the slot is only kept when we get here,
+     * otherwise the next device overwrites it */
+    data->nr_devices++;
+#endif /* CL_DEVICE_TOPOLOGY_AMD */
+  }
+  free(device_ids);
+  free(platform_ids);
+  return;
+
+out_with_device_ids:
+  free(device_ids);
+  free(data->devices);
+  data->devices = NULL;
+out_with_platform_ids:
+  free(platform_ids);
+out:
+  return;
+}
+
+/*
+ * New-object callback of the OpenCL backend.
+ *
+ * Ignores the object (returns 0) unless I/O discovery is enabled in the
+ * topology flags, the topology is this system, and `pcidev` is a PCI device.
+ * The OpenCL device list is queried on the first eligible call.  If a known
+ * OpenCL device sits at the PCI location of `pcidev`, an OS device object
+ * describing it is inserted below `pcidev` and 1 is returned; otherwise 0.
+ */
+static int
+hwloc_opencl_backend_notify_new_object(struct hwloc_backend *backend, struct hwloc_backend *caller __hwloc_attribute_unused,
+				       struct hwloc_obj *pcidev)
+{
+  struct hwloc_topology *topology = backend->topology;
+  struct hwloc_opencl_backend_data_s *priv = backend->private_data;
+  unsigned idx;
+
+  if (!(hwloc_topology_get_flags(topology) & (HWLOC_TOPOLOGY_FLAG_IO_DEVICES|HWLOC_TOPOLOGY_FLAG_WHOLE_IO)))
+    return 0;
+
+  if (!hwloc_topology_is_thissystem(topology)) {
+    hwloc_debug("%s", "\nno OpenCL detection (not thissystem)\n");
+    return 0;
+  }
+
+  if (HWLOC_OBJ_PCI_DEVICE != pcidev->type)
+    return 0;
+
+  /* first call, lookup all devices.
+   * if it fails, nr_devices = 0 so nothing happens below and in next callbacks
+   */
+  if (priv->nr_devices == (unsigned) -1)
+    hwloc_opencl_query_devices(priv);
+
+  /* found no devices */
+  if (!priv->nr_devices)
+    return 0;
+
+  /* now the devices array is ready to use */
+  for(idx=0; idx<priv->nr_devices; idx++) {
+    struct hwloc_opencl_device_info_s *dev = &priv->devices[idx];
+    hwloc_obj_t osdev;
+    char text[64];
+
+    assert(dev->type == HWLOC_OPENCL_DEVICE_AMD);
+
+    /* skip OpenCL devices that are not at this PCI location */
+    if (dev->specific.amd.pcidomain != pcidev->attr->pcidev.domain
+	|| dev->specific.amd.pcibus != pcidev->attr->pcidev.bus
+	|| dev->specific.amd.pcidev != pcidev->attr->pcidev.dev
+	|| dev->specific.amd.pcifunc != pcidev->attr->pcidev.func)
+      continue;
+
+    /* build the OS device object named after platform and device indexes */
+    osdev = hwloc_alloc_setup_object(HWLOC_OBJ_OS_DEVICE, -1);
+    snprintf(text, sizeof(text), "opencl%dd%d", dev->platformidx, dev->platformdeviceidx);
+    osdev->name = strdup(text);
+    osdev->depth = (unsigned) HWLOC_TYPE_DEPTH_UNKNOWN;
+    osdev->attr->osdev.type = HWLOC_OBJ_OSDEV_COPROC;
+
+    /* attach the info attributes, in the same order as always */
+    hwloc_obj_add_info(osdev, "CoProcType", "OpenCL");
+    hwloc_obj_add_info(osdev, "Backend", "OpenCL");
+    hwloc_obj_add_info(osdev, "OpenCLDeviceType", dev->devicetype);
+
+    if (dev->devicevendor[0] != '\0')
+      hwloc_obj_add_info(osdev, "GPUVendor", dev->devicevendor);
+    if (dev->devicename[0] != '\0')
+      hwloc_obj_add_info(osdev, "GPUModel", dev->devicename);
+
+    snprintf(text, sizeof(text), "%u", dev->platformidx);
+    hwloc_obj_add_info(osdev, "OpenCLPlatformIndex", text);
+    if (dev->platformname[0] != '\0')
+      hwloc_obj_add_info(osdev, "OpenCLPlatformName", dev->platformname);
+
+    snprintf(text, sizeof(text), "%u", dev->platformdeviceidx);
+    hwloc_obj_add_info(osdev, "OpenCLPlatformDeviceIndex", text);
+
+    snprintf(text, sizeof(text), "%u", dev->computeunits);
+    hwloc_obj_add_info(osdev, "OpenCLComputeUnits", text);
+
+    snprintf(text, sizeof(text), "%llu", dev->globalmemsize);
+    hwloc_obj_add_info(osdev, "OpenCLGlobalMemorySize", text);
+
+    hwloc_insert_object_by_parent(topology, pcidev, osdev);
+    return 1;
+  }
+
+  return 0;
+}
+
+/* Disable callback: release the device array and the private data itself. */
+static void
+hwloc_opencl_backend_disable(struct hwloc_backend *backend)
+{
+  struct hwloc_opencl_backend_data_s *priv = backend->private_data;
+
+  free(priv->devices);
+  free(priv);
+}
+
+/*
+ * Instantiate the OpenCL backend for `component`.
+ * Allocates the backend and its private data, marks the device list as not
+ * queried yet, and registers the disable and new-object callbacks.
+ * The three opaque data arguments are ignored.
+ * Returns NULL if either allocation fails.
+ */
+static struct hwloc_backend *
+hwloc_opencl_component_instantiate(struct hwloc_disc_component *component,
+				   const void *_data1 __hwloc_attribute_unused,
+				   const void *_data2 __hwloc_attribute_unused,
+				   const void *_data3 __hwloc_attribute_unused)
+{
+  struct hwloc_opencl_backend_data_s *priv;
+  struct hwloc_backend *new_backend;
+
+  /* thissystem may not be fully initialized yet, we'll check flags in discover() */
+
+  new_backend = hwloc_backend_alloc(component);
+  if (!new_backend)
+    return NULL;
+
+  priv = malloc(sizeof(*priv));
+  if (!priv) {
+    free(new_backend);
+    return NULL;
+  }
+
+  /* the first callback will initialize those */
+  priv->devices = NULL;
+  priv->nr_devices = (unsigned) -1; /* unknown yet */
+
+  new_backend->private_data = priv;
+  new_backend->disable = hwloc_opencl_backend_disable;
+  new_backend->notify_new_object = hwloc_opencl_backend_notify_new_object;
+
+  return new_backend;
+}
+
+/* Discovery-component descriptor for the "opencl" backend.
+ * The initializer is positional; the field layout comes from
+ * struct hwloc_disc_component, which is declared outside this file.
+ */
+static struct hwloc_disc_component hwloc_opencl_disc_component = {
+  HWLOC_DISC_COMPONENT_TYPE_MISC,
+  "opencl",
+  HWLOC_DISC_COMPONENT_TYPE_GLOBAL,
+  hwloc_opencl_component_instantiate, /* creates the backend for this component */
+  10, /* after pci */
+  NULL
+};
+
+/*
+ * Component init hook.
+ * Fails with -1 when any flag is passed or when the plugin namespace check
+ * for "hwloc_backend_alloc" fails; returns 0 otherwise.
+ */
+static int
+hwloc_opencl_component_init(unsigned long flags)
+{
+  /* flags are tested first, the namespace is only checked when none is set */
+  if (flags || hwloc_plugin_check_namespace("opencl", "hwloc_backend_alloc") < 0)
+    return -1;
+  return 0;
+}
+
+#ifdef HWLOC_INSIDE_PLUGIN
+HWLOC_DECLSPEC extern const struct hwloc_component hwloc_opencl_component;
+#endif
+
+/* Top-level component descriptor exposing hwloc_opencl_disc_component as a
+ * HWLOC_COMPONENT_TYPE_DISC component, with hwloc_opencl_component_init as
+ * its first hook.
+ */
+const struct hwloc_component hwloc_opencl_component = {
+  HWLOC_COMPONENT_ABI,
+  hwloc_opencl_component_init, NULL,
+  HWLOC_COMPONENT_TYPE_DISC,
+  0,
+  &hwloc_opencl_disc_component
+};
diff --git a/ext/hwloc/hwloc/topology-osf.cb b/ext/hwloc/hwloc/topology-osf.cb
new file mode 100644
index 0000000..5715888
--- /dev/null
+++ b/ext/hwloc/hwloc/topology-osf.cb
@@ -0,0 +1,392 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2014 Inria.  All rights reserved.
+ * Copyright © 2009-2011 Université Bordeaux
+ * Copyright © 2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#include <private/autogen/config.h>
+
+#include <sys/types.h>
+#ifdef HAVE_DIRENT_H
+#include <dirent.h>
+#endif
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#include <string.h>
+#include <errno.h>
+#include <stdio.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <pthread.h>
+
+#include <hwloc.h>
+#include <private/private.h>
+#include <private/debug.h>
+
+#include <numa.h>
+#include <radset.h>
+#include <cpuset.h>
+#include <sys/mman.h>
+
+/*
+ * TODO
+ *
+ * nsg_init(), nsg_attach_pid(), RAD_MIGRATE/RAD_WAIT
+ * assign_pid_to_pset()
+ *
+ * pthread_use_only_cpu too?
+ */
+
+/*
+ * Look for the RAD whose CPU set is exactly `hwloc_set`.
+ *
+ * On success a radset containing only that RAD is created in *radset (the
+ * caller has to destroy it), errno is set to 0 and 1 is returned.
+ * If no RAD matches, *radset is not created, errno is set to EXDEV and 0 is
+ * returned.  RADs whose CPU set cannot be read are reported on stderr and
+ * skipped.
+ */
+static int
+prepare_radset(hwloc_topology_t topology __hwloc_attribute_unused, radset_t *radset, hwloc_const_bitmap_t hwloc_set)
+{
+  int nr_rads = rad_get_num();
+  cpuset_t wanted, rad_cpus, diff;
+  radid_t rad;
+  unsigned cpu;
+  int found = 0;
+
+  /* convert the hwloc bitmap into an OSF cpuset */
+  cpusetcreate(&wanted);
+  cpuemptyset(wanted);
+  hwloc_bitmap_foreach_begin(cpu, hwloc_set)
+    cpuaddset(wanted, cpu);
+  hwloc_bitmap_foreach_end();
+
+  cpusetcreate(&rad_cpus);
+  cpusetcreate(&diff);
+  for (rad = 0; !found && rad < nr_rads; rad++) {
+    cpuemptyset(rad_cpus);
+    if (rad_get_cpus(rad, rad_cpus)==-1) {
+      fprintf(stderr,"rad_get_cpus(%d) failed: %s\n",rad,strerror(errno));
+      continue;
+    }
+    /* both sets are equal when their symmetric difference is empty */
+    cpuxorset(wanted, rad_cpus, diff);
+    if (cpucountset(diff) != 0)
+      continue;
+
+    /* Found it */
+    radsetcreate(radset);
+    rademptyset(*radset);
+    radaddset(*radset, rad);
+    found = 1;
+  }
+
+  cpusetdestroy(&wanted);
+  cpusetdestroy(&rad_cpus);
+  cpusetdestroy(&diff);
+
+  /* EXDEV: radset containing exactly this set of CPUs not found */
+  errno = found ? 0 : EXDEV;
+  return found;
+}
+
+/* Note: get_cpubind not available on OSF */
+
+/*
+ * Bind `thread` to the CPUs of `hwloc_set`.
+ *
+ * Binding to the complete cpuset detaches the thread from its RAD.
+ * Otherwise `hwloc_set` must match exactly one RAD (see prepare_radset());
+ * the thread is bound with RAD_INSIST when HWLOC_CPUBIND_STRICT is given and
+ * merely attached otherwise.  HWLOC_CPUBIND_NOMEMBIND is refused with ENOSYS.
+ *
+ * Returns 0 on success, -1 with errno set on failure.  The temporary radset
+ * is destroyed on every path, including when binding fails.
+ */
+static int
+hwloc_osf_set_thread_cpubind(hwloc_topology_t topology, hwloc_thread_t thread, hwloc_const_bitmap_t hwloc_set, int flags)
+{
+  radset_t radset;
+  int err;
+
+  if (hwloc_bitmap_isequal(hwloc_set, hwloc_topology_get_complete_cpuset(topology))) {
+    if ((errno = pthread_rad_detach(thread)))
+      return -1;
+    return 0;
+  }
+
+  /* Apparently OSF migrates pages */
+  if (flags & HWLOC_CPUBIND_NOMEMBIND) {
+    errno = ENOSYS;
+    return -1;
+  }
+
+  if (!prepare_radset(topology, &radset, hwloc_set))
+    return -1;
+
+  /* the pthread_rad_*() calls return the error code instead of setting errno */
+  if (flags & HWLOC_CPUBIND_STRICT)
+    err = pthread_rad_bind(thread, radset, RAD_INSIST | RAD_WAIT);
+  else
+    err = pthread_rad_attach(thread, radset, RAD_WAIT);
+
+  /* release the radset whether binding worked or not */
+  radsetdestroy(&radset);
+
+  /* set errno last so that radsetdestroy() cannot clobber the reported error */
+  errno = err;
+  return err ? -1 : 0;
+}
+
+/*
+ * Bind process `pid` to the CPUs of `hwloc_set`.
+ *
+ * Binding to the complete cpuset detaches the process from its RAD.
+ * Otherwise `hwloc_set` must match exactly one RAD (see prepare_radset());
+ * the process is bound with RAD_INSIST when HWLOC_CPUBIND_STRICT is given
+ * and merely attached otherwise.  HWLOC_CPUBIND_NOMEMBIND is refused with
+ * ENOSYS.
+ *
+ * Returns 0 on success, -1 on failure.  The temporary radset is destroyed
+ * on every path, including when binding fails.
+ */
+static int
+hwloc_osf_set_proc_cpubind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_const_bitmap_t hwloc_set, int flags)
+{
+  radset_t radset;
+  int ret, saved_errno;
+
+  if (hwloc_bitmap_isequal(hwloc_set, hwloc_topology_get_complete_cpuset(topology))) {
+    if (rad_detach_pid(pid))
+      return -1;
+    return 0;
+  }
+
+  /* Apparently OSF migrates pages */
+  if (flags & HWLOC_CPUBIND_NOMEMBIND) {
+    errno = ENOSYS;
+    return -1;
+  }
+
+  if (!prepare_radset(topology, &radset, hwloc_set))
+    return -1;
+
+  if (flags & HWLOC_CPUBIND_STRICT)
+    ret = rad_bind_pid(pid, radset, RAD_INSIST | RAD_WAIT);
+  else
+    ret = rad_attach_pid(pid, radset, RAD_WAIT);
+
+  /* release the radset whether binding worked or not, keeping the
+   * errno of the binding call intact for the caller */
+  saved_errno = errno;
+  radsetdestroy(&radset);
+
+  if (ret) {
+    errno = saved_errno;
+    return -1;
+  }
+  return 0;
+}
+
+/* Bind the calling thread: forwards to hwloc_osf_set_thread_cpubind() with pthread_self(). */
+static int
+hwloc_osf_set_thisthread_cpubind(hwloc_topology_t topology, hwloc_const_bitmap_t hwloc_set, int flags)
+{
+  int ret;
+
+  ret = hwloc_osf_set_thread_cpubind(topology, pthread_self(), hwloc_set, flags);
+  return ret;
+}
+
+/* Bind the calling process: forwards to hwloc_osf_set_proc_cpubind() with getpid(). */
+static int
+hwloc_osf_set_thisproc_cpubind(hwloc_topology_t topology, hwloc_const_bitmap_t hwloc_set, int flags)
+{
+  int ret;
+
+  ret = hwloc_osf_set_proc_cpubind(topology, getpid(), hwloc_set, flags);
+  return ret;
+}
+
+/*
+ * Fill *mattr with the OSF memory allocation attributes matching the hwloc
+ * `policy` and `nodeset`.
+ *
+ * Policy mapping: FIRSTTOUCH -> MPOL_THREAD, DEFAULT and BIND ->
+ * MPOL_DIRECTED, INTERLEAVE -> MPOL_STRIPPED, REPLICATE -> MPOL_REPLICATED.
+ * Any other policy fails with -1 and errno ENOSYS, leaving *mattr untouched.
+ *
+ * On success (return 0) mattr->mattr_radset has been created and holds every
+ * node of `nodeset`; the caller has to destroy it.
+ */
+static int
+hwloc_osf_prepare_mattr(hwloc_topology_t topology __hwloc_attribute_unused, memalloc_attr_t *mattr, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags __hwloc_attribute_unused)
+{
+  unsigned long mpol;
+  int node;
+
+  if (policy == HWLOC_MEMBIND_FIRSTTOUCH) {
+    mpol = MPOL_THREAD;
+  } else if (policy == HWLOC_MEMBIND_DEFAULT || policy == HWLOC_MEMBIND_BIND) {
+    mpol = MPOL_DIRECTED;
+  } else if (policy == HWLOC_MEMBIND_INTERLEAVE) {
+    mpol = MPOL_STRIPPED;
+  } else if (policy == HWLOC_MEMBIND_REPLICATE) {
+    mpol = MPOL_REPLICATED;
+  } else {
+    errno = ENOSYS;
+    return -1;
+  }
+
+  memset(mattr, 0, sizeof(*mattr));
+  mattr->mattr_policy = mpol;
+  mattr->mattr_rad = RAD_NONE;
+
+  /* build the radset from the nodes of the hwloc nodeset */
+  radsetcreate(&mattr->mattr_radset);
+  rademptyset(mattr->mattr_radset);
+  hwloc_bitmap_foreach_begin(node, nodeset)
+    radaddset(mattr->mattr_radset, node);
+  hwloc_bitmap_foreach_end();
+
+  return 0;
+}
+
+/*
+ * Apply a memory binding policy to the area [addr, addr+len).
+ *
+ * The hwloc flags select the nmadvise() behavior: HWLOC_MEMBIND_MIGRATE adds
+ * MADV_CURRENT and HWLOC_MEMBIND_STRICT adds MADV_INSIST.
+ *
+ * Returns -1 if the policy cannot be translated (see
+ * hwloc_osf_prepare_mattr()), otherwise the return value of nmadvise().
+ */
+static int
+hwloc_osf_set_area_membind(hwloc_topology_t topology, const void *addr, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
+{
+  memalloc_attr_t mattr;
+  int behavior = 0;
+  int ret;
+
+  if (flags & HWLOC_MEMBIND_MIGRATE)
+    behavior |= MADV_CURRENT;
+  if (flags & HWLOC_MEMBIND_STRICT)
+    behavior |= MADV_INSIST;
+
+  if (hwloc_osf_prepare_mattr(topology, &mattr, nodeset, policy, flags))
+    return -1;
+
+  /* pass the behavior derived from the flags; it used to be computed and
+   * then ignored in favor of a hard-coded MADV_CURRENT */
+  ret = nmadvise(addr, len, behavior, &mattr);
+  radsetdestroy(&mattr.mattr_radset);
+  return ret;
+}
+
+/*
+ * Allocate `len` bytes of anonymous private memory bound according to
+ * `nodeset` and `policy`.
+ *
+ * If the policy cannot be translated, the result of hwloc_alloc_or_fail()
+ * is returned instead.  Returns NULL when the mapping fails.
+ */
+static void *
+hwloc_osf_alloc_membind(hwloc_topology_t topology, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
+{
+  memalloc_attr_t mattr;
+  void *ptr;
+
+  if (hwloc_osf_prepare_mattr(topology, &mattr, nodeset, policy, flags))
+    return hwloc_alloc_or_fail(topology, len, flags);
+
+  /* TODO: rather use acreate/amalloc ? */
+  ptr = nmmap(NULL, len, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1,
+               0, &mattr);
+  radsetdestroy(&mattr.mattr_radset);
+
+  /* a failed mapping is reported as MAP_FAILED, callers expect NULL */
+  if (ptr == MAP_FAILED)
+    return NULL;
+  return ptr;
+}
+
+/*
+ * Discovery callback of the OSF backend.
+ *
+ * Returns 0 without doing anything when the root object already has a
+ * cpuset.  Otherwise, for each RAD reported by rad_get_num(), a NUMA node
+ * object is created with its cpuset, local memory size and two page types,
+ * and inserted in the topology; a nbnodes x nbnodes distance matrix is
+ * filled from nloc() and handed to hwloc_distances_set().  The PU level is
+ * then set up, "Backend"="OSF" is recorded on the root object, uname info is
+ * added when the topology is this system, and 1 is returned.
+ */
+static int
+hwloc_look_osf(struct hwloc_backend *backend)
+{
+  struct hwloc_topology *topology = backend->topology;
+  cpu_cursor_t cursor;
+  unsigned nbnodes;
+  radid_t radid, radid2;
+  radset_t radset, radset2;
+  cpuid_t cpuid;
+  cpuset_t cpuset;
+  struct hwloc_obj *obj;
+  unsigned distance;
+
+  if (topology->levels[0][0]->cpuset)
+    /* somebody discovered things */
+    return 0;
+
+  hwloc_alloc_obj_cpusets(topology->levels[0][0]);
+
+  nbnodes = rad_get_num();
+
+  cpusetcreate(&cpuset);
+  radsetcreate(&radset);
+  radsetcreate(&radset2);
+  {
+    /* NOTE(review): none of these allocations is checked for failure */
+    hwloc_obj_t *nodes = calloc(nbnodes, sizeof(hwloc_obj_t));
+    unsigned *indexes = calloc(nbnodes, sizeof(unsigned));
+    float *distances = calloc(nbnodes*nbnodes, sizeof(float));
+    unsigned nfound;
+    numa_attr_t attr;
+
+    /* nloc() query descriptor; radset is refilled with the current RAD below */
+    attr.nattr_type = R_RAD;
+    attr.nattr_descr.rd_radset = radset;
+    attr.nattr_flags = 0;
+
+    for (radid = 0; radid < (radid_t) nbnodes; radid++) {
+      rademptyset(radset);
+      radaddset(radset, radid);
+      cpuemptyset(cpuset);
+      if (rad_get_cpus(radid, cpuset)==-1) {
+	fprintf(stderr,"rad_get_cpus(%d) failed: %s\n",radid,strerror(errno));
+	/* NOTE(review): nodes[radid] stays NULL for this RAD but the arrays are still passed to hwloc_distances_set() */
+	continue;
+      }
+
+      indexes[radid] = radid;
+      nodes[radid] = obj = hwloc_alloc_setup_object(HWLOC_OBJ_NUMANODE, radid);
+      obj->nodeset = hwloc_bitmap_alloc();
+      hwloc_bitmap_set(obj->nodeset, radid);
+      obj->cpuset = hwloc_bitmap_alloc();
+      obj->memory.local_memory = rad_get_physmem(radid) * hwloc_getpagesize();
+      obj->memory.page_types_len = 2;
+      /* NOTE(review): malloc result is used without a NULL check */
+      obj->memory.page_types = malloc(2*sizeof(*obj->memory.page_types));
+      memset(obj->memory.page_types, 0, 2*sizeof(*obj->memory.page_types));
+      obj->memory.page_types[0].size = hwloc_getpagesize();
+#ifdef HAVE__SC_LARGE_PAGESIZE
+      obj->memory.page_types[1].size = sysconf(_SC_LARGE_PAGESIZE);
+#endif
+
+      /* copy the CPUs of this RAD into the hwloc cpuset */
+      cursor = SET_CURSOR_INIT;
+      while((cpuid = cpu_foreach(cpuset, 0, &cursor)) != CPU_NONE)
+	hwloc_bitmap_set(obj->cpuset, cpuid);
+
+      hwloc_debug_1arg_bitmap("node %d has cpuset %s\n",
+		 radid, obj->cpuset);
+
+      hwloc_insert_object_by_cpuset(topology, obj);
+
+      /* fill row radid of the distance matrix: everything starts as remote,
+       * then each node gets the smallest distance at which nloc() reports it */
+      nfound = 0;
+      for (radid2 = 0; radid2 < (radid_t) nbnodes; radid2++)
+	distances[radid*nbnodes+radid2] = RAD_DIST_REMOTE;
+      for (distance = RAD_DIST_LOCAL; distance < RAD_DIST_REMOTE; distance++) {
+	attr.nattr_distance = distance;
+	/* get set of NUMA nodes at distance <= DISTANCE */
+	if (nloc(&attr, radset2)) {
+	  fprintf(stderr,"nloc failed: %s\n", strerror(errno));
+	  continue;
+	}
+	cursor = SET_CURSOR_INIT;
+	while ((radid2 = rad_foreach(radset2, 0, &cursor)) != RAD_NONE) {
+	  if (distances[radid*nbnodes+radid2] == RAD_DIST_REMOTE) {
+            distances[radid*nbnodes+radid2] = (float) distance;
+	    nfound++;
+	  }
+	}
+	if (nfound == nbnodes)
+	  /* Finished finding distances, no need to go up to RAD_DIST_REMOTE */
+	  break;
+      }
+    }
+
+    /* the three arrays are not freed here; presumably hwloc_distances_set() takes ownership - TODO confirm */
+    hwloc_distances_set(topology, HWLOC_OBJ_NUMANODE, nbnodes, indexes, nodes, distances, 0 /* OS cannot force */);
+  }
+  radsetdestroy(&radset2);
+  radsetdestroy(&radset);
+  cpusetdestroy(&cpuset);
+
+  /* add PU objects */
+  hwloc_setup_pu_level(topology, hwloc_fallback_nbprocessors(topology));
+
+  hwloc_obj_add_info(topology->levels[0][0], "Backend", "OSF");
+  if (topology->is_thissystem)
+    hwloc_add_uname_info(topology, NULL);
+  return 1;
+}
+
+/*
+ * Register the OSF binding hooks in `hooks` and advertise the supported
+ * memory binding policies in `support`.
+ */
+void
+hwloc_set_osf_hooks(struct hwloc_binding_hooks *hooks,
+		    struct hwloc_topology_support *support)
+{
+  /* CPU binding, calling thread/process first, then arbitrary ones */
+  hooks->set_thisthread_cpubind = hwloc_osf_set_thisthread_cpubind;
+  hooks->set_thisproc_cpubind = hwloc_osf_set_thisproc_cpubind;
+  hooks->set_thread_cpubind = hwloc_osf_set_thread_cpubind;
+  hooks->set_proc_cpubind = hwloc_osf_set_proc_cpubind;
+
+  /* memory binding and allocation */
+  hooks->set_area_membind = hwloc_osf_set_area_membind;
+  hooks->alloc_membind = hwloc_osf_alloc_membind;
+  hooks->alloc = hwloc_alloc_mmap;
+  hooks->free_membind = hwloc_free_mmap;
+
+  /* policies handled by hwloc_osf_prepare_mattr() */
+  support->membind->firsttouch_membind = 1;
+  support->membind->bind_membind = 1;
+  support->membind->interleave_membind = 1;
+  support->membind->replicate_membind = 1;
+}
+
+/*
+ * Instantiate the "osf" discovery backend for `component`.
+ * The three opaque data arguments are ignored.
+ * Returns the new backend, or NULL if hwloc_backend_alloc() fails.
+ */
+static struct hwloc_backend *
+hwloc_osf_component_instantiate(struct hwloc_disc_component *component,
+				const void *_data1 __hwloc_attribute_unused,
+				const void *_data2 __hwloc_attribute_unused,
+				const void *_data3 __hwloc_attribute_unused)
+{
+  struct hwloc_backend *new_backend = hwloc_backend_alloc(component);
+
+  if (new_backend)
+    new_backend->discover = hwloc_look_osf;
+  return new_backend;
+}
+
+/* Discovery-component descriptor for the "osf" backend.
+ * The initializer is positional; the field layout comes from
+ * struct hwloc_disc_component, which is declared outside this file.
+ */
+static struct hwloc_disc_component hwloc_osf_disc_component = {
+  HWLOC_DISC_COMPONENT_TYPE_CPU,
+  "osf",
+  HWLOC_DISC_COMPONENT_TYPE_GLOBAL,
+  hwloc_osf_component_instantiate, /* creates the backend for this component */
+  50,
+  NULL
+};
+
+/* Top-level component descriptor exposing hwloc_osf_disc_component
+ * as a HWLOC_COMPONENT_TYPE_DISC component.
+ */
+const struct hwloc_component hwloc_osf_component = {
+  HWLOC_COMPONENT_ABI,
+  NULL, NULL,
+  HWLOC_COMPONENT_TYPE_DISC,
+  0,
+  &hwloc_osf_disc_component
+};
diff --git a/ext/hwloc/hwloc/topology-synthetic.c b/ext/hwloc/hwloc/topology-synthetic.c
new file mode 100644
index 0000000..237729a
--- /dev/null
+++ b/ext/hwloc/hwloc/topology-synthetic.c
@@ -0,0 +1,1128 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2015 Inria.  All rights reserved.
+ * Copyright © 2009-2010 Université Bordeaux
+ * Copyright © 2009-2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#include <private/autogen/config.h>
+#include <hwloc.h>
+#include <private/private.h>
+#include <private/misc.h>
+#include <private/debug.h>
+
+#include <limits.h>
+#include <assert.h>
+#ifdef HAVE_STRINGS_H
+#include <strings.h>
+#endif
+
+/* Description of one level of a synthetic topology. */
+struct hwloc_synthetic_level_data_s {
+  unsigned arity; /* presumably the number of children per object of this level - TODO confirm */
+  unsigned long totalwidth; /* number of objects in this level; also the size of index_array */
+  hwloc_obj_type_t type;
+  unsigned depth; /* For caches/groups */
+  hwloc_obj_cache_type_t cachetype; /* For caches */
+  hwloc_uint64_t memorysize; /* For caches/memory */
+
+  /* the indexes= attribute before parsing */
+  const char *index_string; /* NULL when the level has no indexes attribute */
+  unsigned long index_string_length;
+  /* the array of explicit indexes after parsing */
+  unsigned *index_array;
+
+  /* used while filling the topology */
+  unsigned next_os_index; /* id of the next object for that level */
+};
+
+/* Private data of the synthetic backend: the description string and one
+ * entry per level, for at most HWLOC_SYNTHETIC_MAX_DEPTH levels.
+ */
+struct hwloc_synthetic_backend_data_s {
+  /* synthetic backend parameters */
+  char *string;
+#define HWLOC_SYNTHETIC_MAX_DEPTH 128
+  struct hwloc_synthetic_level_data_s level[HWLOC_SYNTHETIC_MAX_DEPTH];
+};
+
+/* One loop of an index interleaving specification.
+ * Object j of a level gets ((j / step) % nb) times the product of the nb of
+ * the previous loops added to its index.
+ */
+struct hwloc_synthetic_intlv_loop_s {
+  unsigned step; /* divisor applied to the object rank */
+  unsigned nb; /* number of different values produced by this loop */
+  unsigned level_depth; /* index in data->level of the type naming this loop (type1:type2 syntax only) */
+};
+
+/*
+ * Convert the indexes= attribute of level `curleveldepth` into an array of
+ * curlevel->totalwidth indexes stored in curlevel->index_array.
+ *
+ * Nothing is done when the level has no index string.  Three syntaxes are
+ * accepted:
+ *  - an explicit comma-separated list of totalwidth numbers;
+ *  - interleaving loops written as step*nb:step*nb:...;
+ *  - interleaving loops written as type1:type2:..., each type naming a
+ *    level above the current one.
+ * When the interleaving loops do not cover totalwidth, one last loop of
+ * step 1 is appended if the missing factor equals the smallest step.
+ *
+ * On any error index_array is left unset, and a message is printed on
+ * stderr if `verbose` is set.
+ */
+static void
+hwloc_synthetic_process_level_indexes(struct hwloc_synthetic_backend_data_s *data,
+				      unsigned curleveldepth,
+				      int verbose)
+{
+  struct hwloc_synthetic_level_data_s *curlevel = &data->level[curleveldepth];
+  unsigned long total = curlevel->totalwidth;
+  const char *attr = curlevel->index_string;
+  unsigned long length = curlevel->index_string_length;
+  unsigned *array = NULL;
+  struct hwloc_synthetic_intlv_loop_s * loops = NULL;
+  unsigned long i;
+
+  if (!attr)
+    return;
+
+  array = calloc(total, sizeof(*array));
+  if (!array) {
+    if (verbose)
+      fprintf(stderr, "Failed to allocate synthetic index array of size %lu\n", total);
+    goto out;
+  }
+
+  /* only digits and commas within the attribute means an explicit list */
+  i = strspn(attr, "0123456789,");
+  if (i == length) {
+    /* explicit array of indexes */
+
+    for(i=0; i<total; i++) {
+      const char *next;
+      unsigned idx = strtoul(attr, (char **) &next, 10);
+      if (next == attr) {
+	if (verbose)
+	  fprintf(stderr, "Failed to read synthetic index #%lu at '%s'\n", i, attr);
+	goto out_with_array;
+      }
+
+      array[i] = idx;
+      if (i != total-1) {
+	if (*next != ',') {
+	  if (verbose)
+	    fprintf(stderr, "Missing comma after synthetic index #%lu at '%s'\n", i, attr);
+	  goto out_with_array;
+	}
+	attr = next+1;
+      } else {
+	attr = next;
+      }
+    }
+    curlevel->index_array = array;
+
+  } else {
+    /* interleaving */
+    unsigned nr_loops = 1, cur_loop;
+    unsigned minstep = total;
+    unsigned long nbs = 1;
+    unsigned j, mul;
+    const char *tmp;
+
+    /* count the colon-separated fields inside the attribute */
+    tmp = attr;
+    while (tmp) {
+      tmp = strchr(tmp, ':');
+      if (!tmp || tmp >= attr+length)
+	break;
+      nr_loops++;
+      tmp++;
+    }
+    /* nr_loops colon-separated fields, but we may need one more at the end */
+    loops = malloc((nr_loops+1)*sizeof(*loops));
+    if (!loops) {
+      if (verbose)
+	fprintf(stderr, "Failed to allocate synthetic index interleave loop array of size %u\n", nr_loops);
+      goto out_with_array;
+    }
+
+    if (*attr >= '0' && *attr <= '9') {
+      /* interleaving as x*y:z*t:... */
+      unsigned step, nb;
+
+      tmp = attr;
+      cur_loop = 0;
+      while (tmp) {
+	char *tmp2, *tmp3;
+	step = (unsigned) strtol(tmp, &tmp2, 0);
+	if (tmp2 == tmp || *tmp2 != '*') {
+	  if (verbose)
+	    fprintf(stderr, "Failed to read synthetic index interleaving loop '%s' without number before '*'\n", tmp);
+	  goto out_with_loops;
+	}
+	if (!step) {
+	  if (verbose)
+	    fprintf(stderr, "Invalid interleaving loop with step 0 at '%s'\n", tmp);
+	  goto out_with_loops;
+	}
+	tmp2++;
+	nb = (unsigned) strtol(tmp2, &tmp3, 0);
+	if (tmp3 == tmp2 || (*tmp3 && *tmp3 != ':' && *tmp3 != ')' && *tmp3 != ' ')) {
+	  if (verbose)
+	    fprintf(stderr, "Failed to read synthetic index interleaving loop '%s' without number between '*' and ':'\n", tmp);
+	  goto out_with_loops;
+	}
+	if (!nb) {
+	  if (verbose)
+	    fprintf(stderr, "Invalid interleaving loop with number 0 at '%s'\n", tmp2);
+	  goto out_with_loops;
+	}
+	loops[cur_loop].step = step;
+	loops[cur_loop].nb = nb;
+	if (step < minstep)
+	  minstep = step;
+	nbs *= nb;
+	cur_loop++;
+	/* stop at the end of the attribute; the end of the string is accepted
+	 * as a terminator above, so it must stop the parsing too instead of
+	 * reading past the terminating NUL */
+	if (!*tmp3 || *tmp3 == ')' || *tmp3 == ' ')
+	  break;
+	tmp = (const char*) (tmp3+1);
+      }
+
+    } else {
+      /* interleaving as type1:type2:... */
+      hwloc_obj_type_t type;
+      hwloc_obj_cache_type_t cachetypeattr;
+      int depthattr;
+      int err;
+
+      /* find level depths for each interleaving loop */
+      tmp = attr;
+      cur_loop = 0;
+      while (tmp) {
+	err = hwloc_obj_type_sscanf(tmp, &type, &depthattr, &cachetypeattr, sizeof(cachetypeattr));
+	if (err < 0) {
+	  if (verbose)
+	    fprintf(stderr, "Failed to read synthetic index interleaving loop type '%s'\n", tmp);
+	  goto out_with_loops;
+	}
+	if (type == HWLOC_OBJ_MISC || type == HWLOC_OBJ_BRIDGE || type == HWLOC_OBJ_PCI_DEVICE || type == HWLOC_OBJ_OS_DEVICE) {
+	  if (verbose)
+	    fprintf(stderr, "Misc object type disallowed in synthetic index interleaving loop type '%s'\n", tmp);
+	  goto out_with_loops;
+	}
+	/* look for a level above the current one matching type (and depth/cache type when given) */
+	for(i=0; i<curleveldepth; i++) {
+	  if (type != data->level[i].type)
+	    continue;
+	  if ((type == HWLOC_OBJ_GROUP || type == HWLOC_OBJ_CACHE)
+	      && depthattr != -1
+	      && (unsigned) depthattr != data->level[i].depth)
+	    continue;
+	  if (type == HWLOC_OBJ_CACHE
+	      && cachetypeattr != (hwloc_obj_cache_type_t) -1
+	      && cachetypeattr != data->level[i].cachetype)
+	    continue;
+	  loops[cur_loop].level_depth = i;
+	  break;
+	}
+	if (i == curleveldepth) {
+	  if (verbose)
+	    fprintf(stderr, "Failed to find level for synthetic index interleaving loop type '%s' above '%s'\n",
+		    tmp, hwloc_obj_type_string(curlevel->type));
+	  goto out_with_loops;
+	}
+	tmp = strchr(tmp, ':');
+	if (!tmp || tmp > attr+length)
+	  break;
+	tmp++;
+	cur_loop++;
+      }
+
+      /* compute actual loop step/nb */
+      for(cur_loop=0; cur_loop<nr_loops; cur_loop++) {
+	unsigned mydepth = loops[cur_loop].level_depth;
+	unsigned prevdepth = 0;
+	unsigned step, nb;
+	/* reject duplicates and find the deepest loop level above this one */
+	for(i=0; i<nr_loops; i++) {
+	  if (loops[i].level_depth == mydepth && i != cur_loop) {
+	    if (verbose)
+	      fprintf(stderr, "Invalid duplicate interleaving loop type in synthetic index '%s'\n", attr);
+	    goto out_with_loops;
+	  }
+	  if (loops[i].level_depth < mydepth
+	      && loops[i].level_depth > prevdepth)
+	    prevdepth = loops[i].level_depth;
+	}
+	step = curlevel->totalwidth / data->level[mydepth].totalwidth; /* number of objects below us */
+	nb = data->level[mydepth].totalwidth / data->level[prevdepth].totalwidth; /* number of us within parent */
+
+	loops[cur_loop].step = step;
+	loops[cur_loop].nb = nb;
+	assert(nb);
+	assert(step);
+	if (step < minstep)
+	  minstep = step;
+	nbs *= nb;
+      }
+    }
+    assert(nbs);
+
+    if (nbs != total) {
+      /* one loop of total/nbs steps is missing, add it if it's just the smallest one */
+      if (minstep == total/nbs) {
+	loops[nr_loops].step = 1;
+	loops[nr_loops].nb = total/nbs;
+	nr_loops++;
+      } else {
+	if (verbose)
+	  fprintf(stderr, "Invalid index interleaving total width %lu instead of %lu\n", nbs, total);
+	goto out_with_loops;
+      }
+    }
+
+    /* generate the array of indexes */
+    mul = 1;
+    for(i=0; i<nr_loops; i++) {
+      unsigned step = loops[i].step;
+      unsigned nb = loops[i].nb;
+      for(j=0; j<total; j++)
+	array[j] += ((j / step) % nb) * mul;
+      mul *= nb;
+    }
+
+    /* check that we have the right values (cannot pass total, cannot give duplicate 0) */
+    for(j=0; j<total; j++) {
+      if (array[j] >= total) {
+	if (verbose)
+	  fprintf(stderr, "Invalid index interleaving generates out-of-range index %u\n", array[j]);
+	goto out_with_loops;
+      }
+      if (!array[j] && j) {
+	if (verbose)
+	  fprintf(stderr, "Invalid index interleaving generates duplicate index values\n");
+	goto out_with_loops;
+      }
+    }
+
+    free(loops);
+    curlevel->index_array = array;
+  }
+
+  return;
+
+ out_with_loops:
+  free(loops);
+ out_with_array:
+  free(array);
+ out:
+  return;
+}
+
+static hwloc_uint64_t
+hwloc_synthetic_parse_memory_attr(const char *attr, const char **endp)
+{
+  /* recognized two-letter unit suffixes and the matching power-of-two shift */
+  static const struct { const char *name; unsigned shift; } units[] = {
+    { "TB", 40 }, { "GB", 30 }, { "MB", 20 }, { "kB", 10 }
+  };
+  char *after = NULL;
+  hwloc_uint64_t value = strtoull(attr, &after, 0);
+  unsigned u;
+
+  for(u=0; u<sizeof(units)/sizeof(units[0]); u++) {
+    if (hwloc_strncasecmp(after, units[u].name, 2))
+      continue;
+    /* suffix matched: scale the value and skip the suffix */
+    value <<= units[u].shift;
+    after += 2;
+    break;
+  }
+  *endp = after; /* first character that was not consumed */
+  return value;
+}
+
+static int
+hwloc_synthetic_parse_level_attrs(const char *attrs, const char **next_posp,
+				  struct hwloc_synthetic_level_data_s *curlevel,
+				  int verbose)
+{
+  hwloc_obj_type_t type = curlevel->type;
+  const char *next_pos;
+  hwloc_uint64_t memorysize = 0;
+  const char *index_string = NULL; /* points into attrs, the text is not copied */
+  unsigned long index_string_length = 0;
+  /* Parse the attribute list that follows '(' (size=/memory= and indexes=) into curlevel. */
+  next_pos = (const char *) strchr(attrs, ')');
+  if (!next_pos) {
+    if (verbose)
+      fprintf(stderr, "Missing attribute closing bracket in synthetic string at '%s'\n", attrs);
+    errno = EINVAL;
+    return -1;
+  }
+  /* Attributes are separated by single spaces; any error returns -1 with errno set to EINVAL. */
+  while (')' != *attrs) {
+    if (HWLOC_OBJ_CACHE == type && !strncmp("size=", attrs, 5)) {
+      memorysize = hwloc_synthetic_parse_memory_attr(attrs+5, &attrs);
+
+    } else if (HWLOC_OBJ_CACHE != type && !strncmp("memory=", attrs, 7)) {
+      memorysize = hwloc_synthetic_parse_memory_attr(attrs+7, &attrs);
+
+    } else if (!strncmp("indexes=", attrs, 8)) {
+      index_string = attrs+8;
+      attrs += 8;
+      index_string_length = strcspn(attrs, " )"); /* the value ends at the next space or ')' */
+      attrs += index_string_length;
+
+    } else {
+      if (verbose)
+	fprintf(stderr, "Unknown attribute at '%s'\n", attrs);
+      errno = EINVAL;
+      return -1;
+    }
+
+    if (' ' == *attrs)
+      attrs++;
+    else if (')' != *attrs) {
+      if (verbose)
+	fprintf(stderr, "Missing parameter separator at '%s'\n", attrs);
+      errno = EINVAL;
+      return -1;
+    }
+  }
+
+  curlevel->memorysize = memorysize;
+  curlevel->index_string = index_string;
+  curlevel->index_string_length = index_string_length;
+  *next_posp = next_pos+1; /* resume parsing right after the closing ')' */
+  return 0;
+}
+
+/* Parse a synthetic description (optional root "(attrs)", then "[type:]arity[(attrs)]" items)
+   into data->level[]: guess missing types, insert a NUMA level when none is given and fill
+   default sizes.  On success, return zero; on error, return -1 with errno set to EINVAL.  */
+static int
+hwloc_backend_synthetic_init(struct hwloc_synthetic_backend_data_s *data,
+			     const char *description)
+{
+  const char *pos, *next_pos;
+  unsigned long item, count;
+  unsigned i;
+  int cache_depth = 0, group_depth = 0;
+  int nb_machine_levels = 0, nb_node_levels = 0;
+  int nb_pu_levels = 0;
+  int verbose = 0;
+  const char *env = getenv("HWLOC_SYNTHETIC_VERBOSE");
+  int err;
+  unsigned long totalarity = 1;
+
+  if (env)
+    verbose = atoi(env);
+
+  /* default values before we add root attributes */
+  data->level[0].totalwidth = 1;
+  data->level[0].type = HWLOC_OBJ_MACHINE;
+  data->level[0].index_string = NULL;
+  data->level[0].index_array = NULL;
+  data->level[0].memorysize = 0;
+  if (*description == '(') {
+    err = hwloc_synthetic_parse_level_attrs(description+1, &description, &data->level[0], verbose);
+    if (err < 0)
+      return err;
+  }
+
+  for (pos = description, count = 1; *pos; pos = next_pos) {
+#define HWLOC_OBJ_TYPE_UNKNOWN ((hwloc_obj_type_t) -1)
+    hwloc_obj_type_t type = HWLOC_OBJ_TYPE_UNKNOWN;
+    int typedepth = -1;
+    hwloc_obj_cache_type_t cachetype = (hwloc_obj_cache_type_t) -1;
+
+    /* initialize parent arity to 0 so that the levels are not infinite */
+    data->level[count-1].arity = 0;
+
+    while (*pos == ' ')
+      pos++;
+
+    if (!*pos)
+      break;
+
+    if (*pos < '0' || *pos > '9') {
+      if (hwloc_obj_type_sscanf(pos, &type, &typedepth, &cachetype, sizeof(cachetype)) < 0) {
+	if (verbose)
+	  fprintf(stderr, "Synthetic string with unknown object type at '%s'\n", pos);
+	errno = EINVAL;
+	goto error;
+      }
+      if (type == HWLOC_OBJ_MISC || type == HWLOC_OBJ_BRIDGE || type == HWLOC_OBJ_PCI_DEVICE || type == HWLOC_OBJ_OS_DEVICE) {
+	if (verbose)
+	  fprintf(stderr, "Synthetic string with disallowed object type at '%s'\n", pos);
+	errno = EINVAL;
+	goto error;
+      }
+
+      next_pos = strchr(pos, ':');
+      if (!next_pos) {
+	if (verbose)
+	  fprintf(stderr,"Synthetic string doesn't have a `:' after object type at '%s'\n", pos);
+	errno = EINVAL;
+	goto error;
+      }
+      pos = next_pos + 1;
+    }
+    data->level[count].type = type;
+    data->level[count].depth = (unsigned) typedepth;
+    data->level[count].cachetype = cachetype;
+
+    item = strtoul(pos, (char **)&next_pos, 0);
+    if (next_pos == pos) {
+      if (verbose)
+	fprintf(stderr,"Synthetic string doesn't have a number of objects at '%s'\n", pos);
+      errno = EINVAL;
+      goto error;
+    }
+    data->level[count-1].arity = (unsigned)item;
+
+    totalarity *= item;
+    data->level[count].totalwidth = totalarity;
+    data->level[count].index_string = NULL;
+    data->level[count].index_array = NULL;
+    data->level[count].memorysize = 0;
+    if (*next_pos == '(') {
+      err = hwloc_synthetic_parse_level_attrs(next_pos+1, &next_pos, &data->level[count], verbose);
+      if (err < 0)
+	goto error;
+    }
+
+    if (count + 1 >= HWLOC_SYNTHETIC_MAX_DEPTH) {
+      if (verbose)
+	fprintf(stderr,"Too many synthetic levels, max %d\n", HWLOC_SYNTHETIC_MAX_DEPTH);
+      errno = EINVAL;
+      goto error;
+    }
+    if (item > UINT_MAX) {
+      if (verbose)
+	fprintf(stderr,"Too big arity, max %u\n", UINT_MAX);
+      errno = EINVAL;
+      goto error;
+    }
+
+    count++;
+  }
+
+  if (count <= 0) {
+    if (verbose)
+      fprintf(stderr, "Synthetic string doesn't contain any object\n");
+    errno = EINVAL;
+    goto error;
+  }
+
+  for(i=count-1; i>0; i--) {
+    struct hwloc_synthetic_level_data_s *curlevel = &data->level[i];
+    hwloc_obj_type_t type;
+
+    type = curlevel->type;
+
+    if (type == HWLOC_OBJ_TYPE_UNKNOWN) {
+      if (i == count-1)
+	type = HWLOC_OBJ_PU;
+      else {
+	switch (data->level[i+1].type) {
+	case HWLOC_OBJ_PU: type = HWLOC_OBJ_CORE; break;
+	case HWLOC_OBJ_CORE: type = HWLOC_OBJ_CACHE; break;
+	case HWLOC_OBJ_CACHE: type = HWLOC_OBJ_PACKAGE; break;
+	case HWLOC_OBJ_PACKAGE: type = HWLOC_OBJ_NUMANODE; break;
+	case HWLOC_OBJ_NUMANODE:
+	case HWLOC_OBJ_MACHINE:
+	case HWLOC_OBJ_GROUP: type = HWLOC_OBJ_GROUP; break;
+	default:
+	  assert(0);
+	}
+      }
+      curlevel->type = type;
+    }
+    switch (type) {
+      case HWLOC_OBJ_PU:
+	nb_pu_levels++;
+	break;
+      case HWLOC_OBJ_CACHE:
+	cache_depth++;
+	break;
+      case HWLOC_OBJ_GROUP:
+	group_depth++;
+	break;
+      case HWLOC_OBJ_NUMANODE:
+	nb_node_levels++;
+	break;
+      case HWLOC_OBJ_MACHINE:
+	nb_machine_levels++;
+	break;
+      default:
+	break;
+    }
+  }
+
+  if (!nb_pu_levels) {
+    if (verbose)
+      fprintf(stderr, "Synthetic string missing ending number of PUs\n");
+    errno = EINVAL;
+    return -1;
+  }
+  if (nb_pu_levels > 1) {
+    if (verbose)
+      fprintf(stderr, "Synthetic string can not have several PU levels\n");
+    errno = EINVAL;
+    return -1;
+  }
+  if (nb_node_levels > 1) {
+    if (verbose)
+      fprintf(stderr, "Synthetic string can not have several NUMA node levels\n");
+    errno = EINVAL;
+    return -1;
+  }
+  if (nb_machine_levels > 1) {
+    if (verbose)
+      fprintf(stderr, "Synthetic string can not have several machine levels\n");
+    errno = EINVAL;
+    return -1;
+  }
+
+  if (nb_machine_levels)
+    data->level[0].type = HWLOC_OBJ_SYSTEM;
+  else {
+    data->level[0].type = HWLOC_OBJ_MACHINE;
+    nb_machine_levels++;
+  }
+
+  /* enforce a NUMA level */
+  if (!nb_node_levels) {
+    /* insert a NUMA level and the machine level */
+    if (data->level[1].type == HWLOC_OBJ_MACHINE)
+      /* there's an explicit machine level after the automatic system root, insert below both */
+      i = 2;
+    else
+      /* insert below the automatic machine root */
+      i = 1;
+    if (verbose)
+      fprintf(stderr, "Inserting a NUMA level with a single object at depth %u\n", i);
+    /* move the count-i existing levels [i, count) down by one slot (not count*i, which overruns) */
+    memmove(&data->level[i+1], &data->level[i], (count-i)*sizeof(struct hwloc_synthetic_level_data_s));
+    data->level[i].type = HWLOC_OBJ_NUMANODE;
+    data->level[i].index_string = NULL;
+    data->level[i].index_array = NULL;
+    data->level[i].memorysize = 0;
+    data->level[i].totalwidth = data->level[i-1].totalwidth;
+    /* update arity to insert a single NUMA node per parent */
+    data->level[i].arity = data->level[i-1].arity;
+    data->level[i-1].arity = 1;
+    count++;
+  }
+
+  if (cache_depth == 1)
+    /* if there is a single cache level, make it L2 */
+    cache_depth = 2;
+
+  for (i=0; i<count; i++) {
+    struct hwloc_synthetic_level_data_s *curlevel = &data->level[i];
+    hwloc_obj_type_t type = curlevel->type;
+
+    if (type == HWLOC_OBJ_GROUP) {
+      if (curlevel->depth == (unsigned)-1)
+	curlevel->depth = group_depth--;
+
+    } else if (type == HWLOC_OBJ_CACHE) {
+      if (curlevel->depth == (unsigned)-1)
+	curlevel->depth = cache_depth--;
+      if (curlevel->cachetype == (hwloc_obj_cache_type_t) -1)
+	curlevel->cachetype = curlevel->depth == 1 ? HWLOC_OBJ_CACHE_DATA : HWLOC_OBJ_CACHE_UNIFIED;
+      if (!curlevel->memorysize) {
+	if (1 == curlevel->depth)
+	  /* 32Kb in L1 */
+	  curlevel->memorysize = 32*1024;
+	else
+	  /* *4 at each level, starting from 1MB for L2, unified */
+	  curlevel->memorysize = 256*1024 << (2*curlevel->depth);
+      }
+
+    } else if (type == HWLOC_OBJ_NUMANODE && !curlevel->memorysize) {
+      /* 1GB in memory nodes. */
+      curlevel->memorysize = 1024*1024*1024;
+    }
+
+    hwloc_synthetic_process_level_indexes(data, i, verbose);
+  }
+
+  data->string = strdup(description);
+  data->level[count-1].arity = 0;
+  return 0;
+
+ error:
+  for(i=0; i<HWLOC_SYNTHETIC_MAX_DEPTH; i++) {
+    struct hwloc_synthetic_level_data_s *curlevel = &data->level[i];
+    free(curlevel->index_array);
+    if (!curlevel->arity)
+      break;
+  }
+  return -1;
+}
+
+/* Fill the type-specific attributes of obj (group depth, cache depth/linesize/type/size) from
+ * curlevel, then give non-cache objects of a level with a memory size their local memory
+ * described as a single 4096-byte page type. */
+static void
+hwloc_synthetic__post_look_hooks(struct hwloc_synthetic_level_data_s *curlevel,
+				 hwloc_obj_t obj)
+{
+  switch (obj->type) {
+  case HWLOC_OBJ_GROUP:
+    obj->attr->group.depth = curlevel->depth;
+    break;
+  case HWLOC_OBJ_CACHE:
+    obj->attr->cache.depth = curlevel->depth;
+    obj->attr->cache.linesize = 64;
+    obj->attr->cache.type = curlevel->cachetype;
+    obj->attr->cache.size = curlevel->memorysize;
+    break;
+  case HWLOC_OBJ_SYSTEM:
+  case HWLOC_OBJ_MACHINE:
+  case HWLOC_OBJ_NUMANODE:
+  case HWLOC_OBJ_PACKAGE:
+  case HWLOC_OBJ_CORE:
+  case HWLOC_OBJ_PU:
+    /* no type-specific attribute to set */
+    break;
+  case HWLOC_OBJ_BRIDGE:
+  case HWLOC_OBJ_PCI_DEVICE:
+  case HWLOC_OBJ_OS_DEVICE:
+  case HWLOC_OBJ_MISC:
+  case HWLOC_OBJ_TYPE_MAX:
+    /* Should never happen */
+    assert(0);
+    break;
+  }
+  if (curlevel->memorysize && HWLOC_OBJ_CACHE != obj->type) {
+    obj->memory.local_memory = curlevel->memorysize;
+    obj->memory.page_types = calloc(1, sizeof(*obj->memory.page_types));
+    obj->memory.page_types_len = obj->memory.page_types ? 1 : 0;
+    if (!obj->memory.page_types) /* out of memory: expose no page type instead of crashing */
+      return;
+    obj->memory.page_types[0].size = 4096;
+    obj->memory.page_types[0].count = curlevel->memorysize / 4096;
+  }
+}
+
+/*
+ * Recursively build one object of data->level[level] together with its subtree.
+ * - the OS index comes from the level's next_os_index counter, remapped
+ *   through index_array when the level has one.
+ * - a level with arity 0 is a leaf: it sets its own os_index in its cpuset;
+ *   otherwise the cpuset is the union of arity recursively-built children.
+ * - the resulting cpuset is OR'ed into parent_cpuset before the object is inserted.
+ */
+static void
+hwloc__look_synthetic(struct hwloc_topology *topology,
+		      struct hwloc_synthetic_backend_data_s *data,
+		      int level,
+		      hwloc_bitmap_t parent_cpuset)
+{
+  hwloc_obj_t obj;
+  unsigned i;
+  struct hwloc_synthetic_level_data_s *curlevel = &data->level[level];
+  hwloc_obj_type_t type = curlevel->type;
+  unsigned os_index;
+
+  /* pre-hooks */
+  switch (type) {
+    case HWLOC_OBJ_GROUP:
+      break;
+    case HWLOC_OBJ_MACHINE:
+      break;
+    case HWLOC_OBJ_NUMANODE:
+      break;
+    case HWLOC_OBJ_PACKAGE:
+      break;
+    case HWLOC_OBJ_CACHE:
+      break;
+    case HWLOC_OBJ_CORE:
+      break;
+    case HWLOC_OBJ_PU:
+      break;
+    case HWLOC_OBJ_SYSTEM:
+    case HWLOC_OBJ_BRIDGE:
+    case HWLOC_OBJ_PCI_DEVICE:
+    case HWLOC_OBJ_OS_DEVICE:
+    case HWLOC_OBJ_MISC:
+    case HWLOC_OBJ_TYPE_MAX:
+      /* Should never happen */
+      assert(0);
+      break;
+  }
+
+  os_index = curlevel->next_os_index++;
+  if (curlevel->index_array)
+    os_index = curlevel->index_array[os_index];
+  obj = hwloc_alloc_setup_object(type, os_index);
+  obj->cpuset = hwloc_bitmap_alloc();
+
+  if (!curlevel->arity) {
+    hwloc_bitmap_set(obj->cpuset, os_index);
+  } else {
+    for (i = 0; i < curlevel->arity; i++)
+      hwloc__look_synthetic(topology, data, level + 1, obj->cpuset);
+  }
+
+  if (type == HWLOC_OBJ_NUMANODE) {
+    obj->nodeset = hwloc_bitmap_alloc();
+    hwloc_bitmap_set(obj->nodeset, os_index);
+  }
+
+  hwloc_bitmap_or(parent_cpuset, parent_cpuset, obj->cpuset);
+
+  hwloc_synthetic__post_look_hooks(curlevel, obj);
+
+  hwloc_insert_object_by_cpuset(topology, obj);
+}
+
+static int
+hwloc_look_synthetic(struct hwloc_backend *backend)
+{
+  struct hwloc_synthetic_backend_data_s *data = backend->private_data;
+  struct hwloc_topology *topology = backend->topology;
+  hwloc_bitmap_t cpuset = hwloc_bitmap_alloc();
+  unsigned depth = 0, remaining;
+
+  assert(!topology->levels[0][0]->cpuset);
+
+  hwloc_alloc_obj_cpusets(topology->levels[0][0]);
+
+  topology->support.discovery->pu = 1;
+
+  /* reset the OS index counter of every level, the arity-0 last one included */
+  do {
+    data->level[depth].next_os_index = 0;
+  } while (data->level[depth++].arity > 0);
+
+  /* the root object already exists: only set its type and attributes */
+  topology->levels[0][0]->type = data->level[0].type;
+  hwloc_synthetic__post_look_hooks(&data->level[0], topology->levels[0][0]);
+
+  /* build one level-1 subtree per child of the root */
+  for (remaining = data->level[0].arity; remaining > 0; remaining--)
+    hwloc__look_synthetic(topology, data, 1, cpuset);
+
+  hwloc_bitmap_free(cpuset);
+
+  hwloc_obj_add_info(topology->levels[0][0], "Backend", "Synthetic");
+  hwloc_obj_add_info(topology->levels[0][0], "SyntheticDescription", data->string);
+  return 1;
+}
+
+static void
+hwloc_synthetic_backend_disable(struct hwloc_backend *backend)
+{
+  struct hwloc_synthetic_backend_data_s *data = backend->private_data;
+  unsigned depth = 0;
+  /* release the per-level index arrays, stopping after the first arity-0 level */
+  while (depth < HWLOC_SYNTHETIC_MAX_DEPTH) {
+    free(data->level[depth].index_array);
+    if (data->level[depth++].arity == 0)
+      break;
+  }
+  free(data->string);
+  free(data);
+}
+
+/*
+ * Instantiate the synthetic backend from the description string in _data1,
+ * or from the HWLOC_SYNTHETIC environment variable when _data1 is NULL.
+ * Return the new backend, or NULL on error.
+ */
+static struct hwloc_backend *
+hwloc_synthetic_component_instantiate(struct hwloc_disc_component *component,
+				      const void *_data1,
+				      const void *_data2 __hwloc_attribute_unused,
+				      const void *_data3 __hwloc_attribute_unused)
+{
+  const char *description = _data1;
+  struct hwloc_synthetic_backend_data_s *data;
+  struct hwloc_backend *backend;
+
+  if (description == NULL) {
+    /* 'synthetic' was given in HWLOC_COMPONENTS without a description:
+     * fall back to the environment */
+    description = getenv("HWLOC_SYNTHETIC");
+    if (description == NULL) {
+      errno = EINVAL;
+      return NULL;
+    }
+  }
+
+  backend = hwloc_backend_alloc(component);
+  if (backend == NULL)
+    return NULL;
+
+  data = malloc(sizeof(*data));
+  if (data == NULL) {
+    errno = ENOMEM;
+    free(backend);
+    return NULL;
+  }
+
+  if (hwloc_backend_synthetic_init(data, description) < 0) {
+    free(data);
+    free(backend);
+    return NULL;
+  }
+
+  backend->private_data = data;
+  backend->discover = hwloc_look_synthetic;
+  backend->disable = hwloc_synthetic_backend_disable;
+  backend->is_thissystem = 0;
+
+  return backend;
+}
+
+static struct hwloc_disc_component hwloc_synthetic_disc_component = {
+  HWLOC_DISC_COMPONENT_TYPE_GLOBAL,
+  "synthetic", /* presumably the component name matched against HWLOC_COMPONENTS -- confirm */
+  ~0, /* NOTE(review): all bits set; looks like a mask of excluded component types -- confirm against struct hwloc_disc_component */
+  hwloc_synthetic_component_instantiate, /* builds the backend from a description string */
+  30, /* NOTE(review): presumably the discovery priority -- confirm */
+  NULL
+};
+
+const struct hwloc_component hwloc_synthetic_component = {
+  HWLOC_COMPONENT_ABI,
+  NULL, NULL, /* NOTE(review): presumably the optional init/finalize callbacks, unused here -- confirm */
+  HWLOC_COMPONENT_TYPE_DISC,
+  0,
+  &hwloc_synthetic_disc_component /* the discovery component defined just above */
+};
+
+/* Write the OS indexes of obj's level as "step*nb:...)" interleaving loops when they follow such
+ * a pattern, else as a full "i,j,...)" list.  Return the untruncated length, or -1 on error. */
+static int hwloc_topology_export_synthetic_indexes(struct hwloc_topology * topology,
+						   hwloc_obj_t obj,
+						   char *buffer, size_t buflen)
+{
+  unsigned depth = obj->depth;
+  unsigned total = topology->level_nbobjects[depth];
+  unsigned step = 1;
+  unsigned nr_loops = 0;
+  struct hwloc_synthetic_intlv_loop_s *loops = NULL, *tmploops;
+  hwloc_obj_t cur;
+  unsigned i, j;
+  ssize_t tmplen = buflen;
+  char *tmp = buffer;
+  int res, ret = 0;
+
+  /* must start with 0 */
+  if (obj->os_index)
+    goto exportall;
+
+  while (step != total) {
+    /* must be a divider of the total */
+    if (total % step)
+      goto exportall;
+
+    /* look for os_index == step */
+    for(i=1; i<total; i++)
+      if (topology->levels[depth][i]->os_index == step)
+	break;
+    if (i == total)
+      goto exportall;
+    for(j=2; j<total/i; j++)
+      if (topology->levels[depth][i*j]->os_index != step*j)
+	break;
+    /* grow through a temporary so that the old array is still freed if realloc fails */
+    tmploops = realloc(loops, (nr_loops+1)*sizeof(*loops));
+    if (!tmploops)
+      goto exportall;
+    loops = tmploops;
+    nr_loops++;
+    loops[nr_loops-1].step = i;
+    loops[nr_loops-1].nb = j;
+    step *= j;
+  }
+
+  /* check this interleaving */
+  for(i=0; i<total; i++) {
+    unsigned ind = 0;
+    unsigned mul = 1;
+    for(j=0; j<nr_loops; j++) {
+      ind += (i / loops[j].step) % loops[j].nb * mul;
+      mul *= loops[j].nb;
+    }
+    if (topology->levels[depth][i]->os_index != ind)
+      goto exportall;
+  }
+
+  /* success, print it */
+  for(j=0; j<nr_loops; j++) {
+    res = hwloc_snprintf(tmp, tmplen, "%u*%u%s", loops[j].step, loops[j].nb,
+			 j == nr_loops-1 ? ")" : ":");
+    if (res < 0) {
+      free(loops);
+      return -1;
+    }
+    ret += res;
+    if (res >= tmplen)
+      res = tmplen>0 ? tmplen - 1 : 0;
+    tmp += res;
+    tmplen -= res;
+  }
+
+  free(loops);
+  return ret;
+
+ exportall:
+  free(loops);
+
+  /* dump all indexes */
+  cur = obj;
+  while (cur) {
+    res = hwloc_snprintf(tmp, tmplen, "%u%s", cur->os_index,
+			 cur->next_cousin ? "," : ")");
+    if (res < 0)
+      return -1;
+    ret += res;
+    if (res >= tmplen)
+      res = tmplen>0 ? tmplen - 1 : 0;
+    tmp += res;
+    tmplen -= res;
+    cur = cur->next_cousin;
+  }
+  return ret;
+}
+
+/* Append obj's synthetic attributes to buffer: "(size=..." for caches with a size,
+ * "memory=..." when local memory is set, and "indexes=..." for PU/NUMA-node levels
+ * where some os_index differs from its logical_index; write nothing otherwise.
+ * Return the untruncated length (0 when there is no attribute), or -1 on error. */
+static int hwloc_topology_export_synthetic_obj_attr(struct hwloc_topology * topology,
+						    hwloc_obj_t obj,
+						    char *buffer, size_t buflen)
+{
+  const char * separator = " ";
+  const char * prefix = "(";
+  char cachesize[64] = "";
+  char memsize[64] = "";
+  int needindexes = 0;
+
+  if (HWLOC_OBJ_CACHE == obj->type && obj->attr->cache.size) {
+    snprintf(cachesize, sizeof(cachesize), "%ssize=%llu",
+	     prefix, (unsigned long long) obj->attr->cache.size);
+    prefix = separator;
+  }
+  if (obj->memory.local_memory) {
+    snprintf(memsize, sizeof(memsize), "%smemory=%llu",
+	     prefix, (unsigned long long) obj->memory.local_memory);
+    prefix = separator;
+  }
+  if (obj->type == HWLOC_OBJ_PU || obj->type == HWLOC_OBJ_NUMANODE) {
+    hwloc_obj_t cur = obj;
+    while (cur) {
+      if (cur->os_index != cur->logical_index) {
+	needindexes = 1;
+	break;
+      }
+      cur = cur->next_cousin;
+    }
+  }
+  if (*cachesize || *memsize || needindexes) {
+    ssize_t tmplen = buflen;
+    char *tmp = buffer;
+    int res, ret = 0;
+
+    res = hwloc_snprintf(tmp, tmplen, "%s%s%s", cachesize, memsize, needindexes ? "" : ")");
+    if (res < 0)
+      return -1;
+    ret += res;
+    if (res >= tmplen)
+      res = tmplen>0 ? tmplen - 1 : 0;
+    tmp += res;
+    tmplen -= res;
+
+    if (needindexes) {
+      res = hwloc_snprintf(tmp, tmplen, "%sindexes=", prefix);
+      if (res < 0)
+	return -1;
+      ret += res;
+      if (res >= tmplen)
+	res = tmplen>0 ? tmplen - 1 : 0;
+      tmp += res;
+      tmplen -= res;
+
+      res = hwloc_topology_export_synthetic_indexes(topology, obj, tmp, tmplen);
+      if (res < 0)
+	return -1;
+      ret += res;
+    }
+    return ret;
+  } else {
+    return 0;
+  }
+}
+
+/* Export topology to buffer as a synthetic description, one "type:arity[(attrs)]" item per level.
+ * Return the untruncated length, or -1 (errno is EINVAL for unknown flags or a non-symmetric topology). */
+int
+hwloc_topology_export_synthetic(struct hwloc_topology * topology,
+				char *buffer, size_t buflen,
+				unsigned long flags)
+{
+  hwloc_obj_t obj = hwloc_get_root_obj(topology);
+  ssize_t tmplen = buflen;
+  char *tmp = buffer;
+  int res, ret = 0;
+  unsigned arity;
+  const char * separator = " ";
+  const char * prefix = "";
+
+  if (flags & ~(HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_EXTENDED_TYPES|HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_ATTRS)) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  /* TODO: add a flag to ignore symmetric_subtree and I/Os.
+   * just assume things are symmetric with the left branches of the tree.
+   * but the number of objects per level may be wrong, what to do with OS index array in this case?
+   * only allow ignoring symmetric_subtree if the level width remains OK?
+   */
+  /* TODO: add a root object by default, with a prefix such as tree=
+   * so that we can backward-compatibly recognize whether there's a root or not.
+   * and add a flag to disable it.
+   */
+  /* TODO: flag to force all indexes, not only for PU and NUMA? */
+
+  if (!obj->symmetric_subtree) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  if (!(flags & HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_ATTRS)) {
+    /* root attributes */
+    res = hwloc_topology_export_synthetic_obj_attr(topology, obj, tmp, tmplen);
+    if (res < 0)
+      return -1;
+    ret += res;
+    if (ret > 0)
+      prefix = separator;
+    if (res >= tmplen)
+      res = tmplen>0 ? tmplen - 1 : 0;
+    tmp += res;
+    tmplen -= res;
+  }
+
+  arity = obj->arity;
+  while (arity) {
+    /* for each level */
+    obj = obj->first_child;
+    if (flags & HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_EXTENDED_TYPES) {
+      res = hwloc_snprintf(tmp, tmplen, "%s%s:%u", prefix, hwloc_obj_type_string(obj->type), arity);
+    } else {
+      char types[64];
+      hwloc_obj_type_snprintf(types, sizeof(types), obj, 1);
+      res = hwloc_snprintf(tmp, tmplen, "%s%s:%u", prefix, types, arity);
+    }
+    if (res < 0)
+      return -1;
+    ret += res;
+    if (res >= tmplen)
+      res = tmplen>0 ? tmplen - 1 : 0;
+    tmp += res;
+    tmplen -= res;
+
+    if (!(flags & HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_ATTRS)) {
+      /* obj attributes */
+      res = hwloc_topology_export_synthetic_obj_attr(topology, obj, tmp, tmplen);
+      if (res < 0)
+	return -1;
+      ret += res;
+      if (res >= tmplen)
+	res = tmplen>0 ? tmplen - 1 : 0;
+      tmp += res;
+      tmplen -= res;
+    }
+
+    /* next level */
+    prefix = separator;
+    arity = obj->arity;
+  }
+
+  return ret;
+}
diff --git a/ext/hwloc/hwloc/topology-x86.c b/ext/hwloc/hwloc/topology-x86.c
new file mode 100644
index 0000000..1234ce4
--- /dev/null
+++ b/ext/hwloc/hwloc/topology-x86.c
@@ -0,0 +1,1386 @@
+/*
+ * Copyright © 2010-2015 Inria.  All rights reserved.
+ * Copyright © 2010-2013 Université Bordeaux
+ * Copyright © 2010-2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ *
+ *
+ * This backend is only used when the operating system does not export
+ * the necessary hardware topology information to user-space applications.
+ * Currently, only the FreeBSD backend relies on this x86 backend.
+ *
+ * Other backends such as Linux have their own way to retrieve various
+ * pieces of hardware topology information from the operating system
+ * on various architectures, without having to use this x86-specific code.
+ */
+
+#include <private/autogen/config.h>
+#include <hwloc.h>
+#include <private/private.h>
+#include <private/debug.h>
+#include <private/misc.h>
+
+#include <private/cpuid-x86.h>
+
+#include <sys/types.h>
+#include <dirent.h>
+
+struct hwloc_x86_backend_data_s {
+  unsigned nbprocs;
+  hwloc_bitmap_t apicid_set;
+  int apicid_unique;
+  char *src_cpuiddump_path; /* NOTE(review): presumably the directory holding the "pu<N>" cpuid dump files -- confirm where it is set */
+};
+
+/************************************
+ * Management of cpuid dump as input
+ */
+
+struct cpuiddump {
+  unsigned nr; /* number of valid entries; entries is only freed when this is non-zero */
+  struct cpuiddump_entry {
+    unsigned inmask; /* which of ine[abcd]x are set on input */
+    unsigned ineax; /* compared with the caller's eax when inmask has bit 0x1 */
+    unsigned inebx; /* compared with the caller's ebx when inmask has bit 0x2 */
+    unsigned inecx; /* compared with the caller's ecx when inmask has bit 0x4 */
+    unsigned inedx; /* compared with the caller's edx when inmask has bit 0x8 */
+    unsigned outeax; /* out*: register values returned when the inputs match */
+    unsigned outebx;
+    unsigned outecx;
+    unsigned outedx;
+  } *entries;
+};
+
+static void
+cpuiddump_free(struct cpuiddump *cpuiddump)
+{
+  if (cpuiddump->nr != 0)
+    free(cpuiddump->entries); /* entries is only released when nr records at least one entry */
+  free(cpuiddump);
+}
+
+static struct cpuiddump *
+cpuiddump_read(const char *dirpath, unsigned idx)
+{
+  struct cpuiddump *cpuiddump;
+  struct cpuiddump_entry *cur;
+  char *filename;
+  size_t filenamelen = strlen(dirpath) + 15;
+  FILE *file;
+  char line[128];
+  unsigned nr;
+  /* Load "<dirpath>/pu<idx>" into a new cpuiddump; on failure the result has nr == 0. */
+  cpuiddump = malloc(sizeof(*cpuiddump)); /* NOTE(review): still unchecked, as is filename below */
+  cpuiddump->nr = 0; /* return a cpuiddump that will raise errors because it matches nothing */
+
+  filename = malloc(filenamelen);
+  snprintf(filename, filenamelen, "%s/pu%u", dirpath, idx);
+  file = fopen(filename, "r");
+  if (!file) {
+    fprintf(stderr, "Could not read dumped cpuid file %s\n", filename);
+    free(filename);
+    return cpuiddump;
+  }
+  free(filename);
+
+  nr = 0; /* first pass: one entry per line is an upper bound */
+  while (fgets(line, sizeof(line), file))
+    nr++;
+  cur = cpuiddump->entries = malloc(nr * sizeof(struct cpuiddump_entry));
+  fseek(file, 0, SEEK_SET);
+  nr = 0;
+  while (cur && fgets(line, sizeof(line), file)) { /* skipped entirely if the allocation failed */
+    if (*line == '#')
+      continue;
+    if (sscanf(line, "%x %x %x %x %x => %x %x %x %x",
+	      &cur->inmask,
+	      &cur->ineax, &cur->inebx, &cur->inecx, &cur->inedx,
+	      &cur->outeax, &cur->outebx, &cur->outecx, &cur->outedx) == 9) {
+      cur++;
+      nr++;
+    }
+  }
+  if (!nr)
+    free(cpuiddump->entries); /* cpuiddump_free() only frees entries when nr is non-zero */
+  cpuiddump->nr = nr;
+  fclose(file);
+  return cpuiddump;
+}
+
+static void
+cpuiddump_find_by_input(unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx, struct cpuiddump *cpuiddump)
+{
+  unsigned idx = 0;
+
+  /* the first entry whose masked input registers all match provides the outputs */
+  while (idx < cpuiddump->nr) {
+    const struct cpuiddump_entry *cand = &cpuiddump->entries[idx++];
+    /* inmask bits 0x1/0x2/0x4/0x8 select which of eax/ebx/ecx/edx must match */
+    int matches = (!(cand->inmask & 0x1) || *eax == cand->ineax)
+      && (!(cand->inmask & 0x2) || *ebx == cand->inebx)
+      && (!(cand->inmask & 0x4) || *ecx == cand->inecx)
+      && (!(cand->inmask & 0x8) || *edx == cand->inedx);
+    if (!matches)
+      continue;
+    *eax = cand->outeax;
+    *ebx = cand->outebx;
+    *ecx = cand->outecx;
+    *edx = cand->outedx;
+    return;
+  }
+
+  fprintf(stderr, "Couldn't find %x,%x,%x,%x in dumped cpuid, returning 0s.\n",
+	  *eax, *ebx, *ecx, *edx);
+  *eax = 0;
+  *ebx = 0;
+  *ecx = 0;
+  *edx = 0;
+}
+
+static void cpuid_or_from_dump(unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx, struct cpuiddump *src_cpuiddump)
+{
+  if (!src_cpuiddump) { /* no dump given: run the real cpuid instruction */
+    hwloc_x86_cpuid(eax, ebx, ecx, edx);
+    return;
+  }
+  cpuiddump_find_by_input(eax, ebx, ecx, edx, src_cpuiddump);
+}
+
+/*******************************
+ * Core detection routines and structures
+ */
+
+#define has_topoext(features) ((features)[6] & (1 << 22)) /* bit 22 of features[6]; presumably the AMD topology extensions flag -- confirm how features[] is filled */
+#define has_x2apic(features) ((features)[4] & (1 << 21)) /* bit 21 of features[4]; presumably the x2APIC flag -- confirm how features[] is filled */
+
+struct cacheinfo {
+  unsigned type;
+  unsigned level; /* cache level: 1 for L1, 2 for L2, ... */
+  unsigned nbthreads_sharing; /* fill_amd_cache() uses 1 for L1/L2 and max_log_proc for L3 */
+
+  unsigned linesize;
+  unsigned linepart;
+  int ways; /* associativity; -1 means fully associative */
+  unsigned sets;
+  unsigned long size; /* in bytes (debug output prints size >> 10 as KB) */
+  char inclusiveness; /* non-zero when the cache is considered inclusive */
+
+};
+
+struct procinfo {
+  unsigned present; /* set to 1 by look_proc() */
+  unsigned apicid; /* look_proc() first takes it from cpuid 0x01 (ebx >> 24) */
+  unsigned max_log_proc; /* only used temporarily inside look_proc() and its callees */
+  unsigned max_nbcores; /* only used temporarily inside look_proc() and its callees */
+  unsigned max_nbthreads; /* only used temporarily inside look_proc() and its callees */
+  unsigned packageid; /* packageid, nodeid, unitid, coreid, threadid: the id, or -1 */
+  unsigned nodeid;
+  unsigned unitid;
+  unsigned logprocid; /* only used temporarily inside look_proc() and its callees */
+  unsigned threadid;
+  unsigned coreid;
+  unsigned *otherids; /* levels slots */
+  unsigned levels;
+  unsigned numcaches;
+  struct cacheinfo *cache; /* numcaches slots */
+  char cpuvendor[13]; /* 3 registers of 4 bytes from cpuid 0x00, plus the ending \0 */
+  char cpumodel[3*4*4+1]; /* cpuid 0x80000002-4: 3 leaves of 4 registers of 4 bytes, plus the ending \0 */
+  unsigned cpustepping; /* low 4 bits of eax from cpuid 0x01 */
+  unsigned cpumodelnumber;
+  unsigned cpufamilynumber;
+};
+
+enum cpuid_type { /* CPU vendor class; look_proc() uses it to decide how family/model numbers are extended */
+  intel,
+  amd,
+  unknown
+};
+
+/* Decode one legacy AMD cache descriptor register (cpuid) for cache level 1 to 3 and
+ * append it to infos->cache[]; descriptors that encode a zero size are skipped. */
+static void fill_amd_cache(struct procinfo *infos, unsigned level, int type, unsigned cpuid)
+{
+  static const unsigned ways_tab[] = { 0, 1, 2, 0, 4, 0, 8, 0, 16, 0, 32, 48, 64, 96, 128, -1 };
+  struct cacheinfo *cache, *tmp;
+  unsigned long size = 0;
+
+  if (level == 1)
+    size = ((cpuid >> 24)) << 10;
+  else if (level == 2)
+    size = ((cpuid >> 16)) << 10;
+  else if (level == 3)
+    size = ((cpuid >> 18)) << 19;
+  if (!size)
+    return;
+
+  /* grow through a temporary: on failure keep the caches found so far and drop this one */
+  tmp = realloc(infos->cache, (infos->numcaches+1)*sizeof(*infos->cache));
+  if (!tmp)
+    return;
+  infos->cache = tmp;
+  cache = &infos->cache[infos->numcaches++];
+
+  cache->type = type;
+  cache->level = level;
+  if (level <= 2)
+    cache->nbthreads_sharing = 1;
+  else
+    cache->nbthreads_sharing = infos->max_log_proc;
+  cache->linesize = cpuid & 0xff;
+  cache->linepart = 0;
+  if (level == 1) {
+    cache->inclusiveness = 0; /* L1 is supposed not to be inclusive on old AMD processors */
+    cache->ways = (cpuid >> 16) & 0xff;
+    if (cache->ways == 0xff)
+      cache->ways = -1; /* 0xff means fully associative */
+  } else {
+    cache->inclusiveness = 1; /* L2 and L3 are supposed to be inclusive on old AMD processors */
+    cache->ways = ways_tab[(cpuid >> 12) & 0xf]; /* 4-bit encoded associativity */
+  }
+  cache->size = size;
+  cache->sets = 0;
+
+  hwloc_debug("cache L%u t%u linesize %u ways %u size %luKB\n", cache->level, cache->nbthreads_sharing, cache->linesize, cache->ways, cache->size >> 10);
+}
+
+/* Fetch information from the processor itself thanks to cpuid and store it in
+ * infos, so that summarize() can later analyze all processors globally.
+ *
+ * backend: x86 backend; its private data records which APIC ids were seen.
+ * infos: slot describing the processor currently being examined.
+ * highest_cpuid / highest_ext_cpuid: highest basic / extended leaf to query.
+ * features: cpuid feature words, tested through has_topoext()/has_x2apic().
+ * cpuid_type: vendor detected by the caller (intel, amd, or neither).
+ * src_cpuiddump: forwarded untouched to cpuid_or_from_dump() for every query.
+ *
+ * Allocation failures are not fatal: the corresponding cache or level
+ * information is simply left out of infos.
+ */
+static void look_proc(struct hwloc_backend *backend, struct procinfo *infos, unsigned highest_cpuid, unsigned highest_ext_cpuid, unsigned *features, enum cpuid_type cpuid_type, struct cpuiddump *src_cpuiddump)
+{
+  struct hwloc_x86_backend_data_s *data = backend->private_data;
+  unsigned eax, ebx, ecx = 0, edx;
+  unsigned cachenum;
+  struct cacheinfo *cache;
+  unsigned regs[4];
+  unsigned _model, _extendedmodel, _family, _extendedfamily;
+
+  infos->present = 1;
+
+  /* on return from this function, the following fields must be set in infos:
+   * packageid, nodeid, unitid, coreid, threadid, or -1
+   * apicid
+   * levels and levels slots in otherids[]
+   * numcaches and numcaches slots in caches[]
+   *
+   * max_log_proc, max_nbthreads, max_nbcores, logprocid
+   * are only used temporarily inside this function and its callees.
+   */
+
+  /* Get apicid, max_log_proc, packageid, logprocid from cpuid 0x01 */
+  eax = 0x01;
+  cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+  infos->apicid = ebx >> 24;
+  if (edx & (1 << 28))
+    infos->max_log_proc = 1 << hwloc_flsl(((ebx >> 16) & 0xff) - 1);
+  else
+    infos->max_log_proc = 1;
+  hwloc_debug("APIC ID 0x%02x max_log_proc %u\n", infos->apicid, infos->max_log_proc);
+  infos->packageid = infos->apicid / infos->max_log_proc;
+  infos->logprocid = infos->apicid % infos->max_log_proc;
+  hwloc_debug("phys %u thread %u\n", infos->packageid, infos->logprocid);
+
+  /* Get cpu model/family/stepping numbers from same cpuid */
+  _model          = (eax>>4) & 0xf;
+  _extendedmodel  = (eax>>16) & 0xf;
+  _family         = (eax>>8) & 0xf;
+  _extendedfamily = (eax>>20) & 0xff;
+  if ((cpuid_type == intel || cpuid_type == amd) && _family == 0xf) {
+    infos->cpufamilynumber = _family + _extendedfamily;
+  } else {
+    infos->cpufamilynumber = _family;
+  }
+  if ((cpuid_type == intel && (_family == 0x6 || _family == 0xf))
+      || (cpuid_type == amd && _family == 0xf)) {
+    infos->cpumodelnumber = _model + (_extendedmodel << 4);
+  } else {
+    infos->cpumodelnumber = _model;
+  }
+  infos->cpustepping = eax & 0xf;
+
+  /* Get cpu vendor string from cpuid 0x00.
+   * ecx and edx are swapped on purpose so that regs[1..3] hold the
+   * characters in ebx, edx, ecx order, which is how the string reads.
+   */
+  memset(regs, 0, sizeof(regs));
+  regs[0] = 0;
+  cpuid_or_from_dump(&regs[0], &regs[1], &regs[3], &regs[2], src_cpuiddump);
+  memcpy(infos->cpuvendor, regs+1, 4*3);
+  /* infos was calloc'ed, already ends with \0 */
+
+  /* Get cpu model string from cpuid 0x80000002-4 */
+  if (highest_ext_cpuid >= 0x80000004) {
+    memset(regs, 0, sizeof(regs));
+    regs[0] = 0x80000002;
+    cpuid_or_from_dump(&regs[0], &regs[1], &regs[2], &regs[3], src_cpuiddump);
+    memcpy(infos->cpumodel, regs, 4*4);
+    regs[0] = 0x80000003;
+    cpuid_or_from_dump(&regs[0], &regs[1], &regs[2], &regs[3], src_cpuiddump);
+    memcpy(infos->cpumodel + 4*4, regs, 4*4);
+    regs[0] = 0x80000004;
+    cpuid_or_from_dump(&regs[0], &regs[1], &regs[2], &regs[3], src_cpuiddump);
+    memcpy(infos->cpumodel + 4*4*2, regs, 4*4);
+    /* infos was calloc'ed, already ends with \0 */
+  }
+
+  /* Get core/thread information from cpuid 0x80000008
+   * (not supported on Intel)
+   */
+  if (cpuid_type != intel && highest_ext_cpuid >= 0x80000008) {
+    unsigned coreidsize;
+    eax = 0x80000008;
+    cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+    coreidsize = (ecx >> 12) & 0xf;
+    hwloc_debug("core ID size: %u\n", coreidsize);
+    if (!coreidsize) {
+      infos->max_nbcores = (ecx & 0xff) + 1;
+    } else
+      infos->max_nbcores = 1 << coreidsize;
+    hwloc_debug("Thus max # of cores: %u\n", infos->max_nbcores);
+    /* Still no multithreaded AMD */
+    infos->max_nbthreads = 1 ;
+    hwloc_debug("and max # of threads: %u\n", infos->max_nbthreads);
+    /* The legacy max_log_proc is deprecated, it can be smaller than max_nbcores,
+     * which is the maximum number of cores that the processor could theoretically support
+     * (see "Multiple Core Calculation" in the AMD CPUID specification).
+     * Recompute packageid/logprocid/threadid/coreid accordingly.
+     */
+    infos->packageid = infos->apicid / infos->max_nbcores;
+    infos->logprocid = infos->apicid % infos->max_nbcores;
+    infos->threadid = infos->logprocid % infos->max_nbthreads;
+    infos->coreid = infos->logprocid / infos->max_nbthreads;
+    hwloc_debug("this is thread %u of core %u\n", infos->threadid, infos->coreid);
+  }
+
+  infos->numcaches = 0;
+  infos->cache = NULL;
+
+  /* Get apicid, nodeid, unitid from cpuid 0x8000001e
+   * and cache information from cpuid 0x8000001d
+   * (AMD topology extension)
+   */
+  if (cpuid_type != intel && has_topoext(features)) {
+    unsigned apic_id, node_id, nodes_per_proc, unit_id, cores_per_unit;
+
+    eax = 0x8000001e;
+    cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+    infos->apicid = apic_id = eax;
+    infos->nodeid = node_id = ecx & 0xff;
+    nodes_per_proc = ((ecx >> 8) & 7) + 1;
+    if (nodes_per_proc > 2) {
+      hwloc_debug("warning: undefined value %d, assuming it means %d\n", nodes_per_proc, nodes_per_proc);
+    }
+    infos->unitid = unit_id = ebx & 0xff;
+    cores_per_unit = ((ebx >> 8) & 3) + 1;
+    hwloc_debug("x2APIC %08x, %d nodes, node %d, %d cores in unit %d\n", apic_id, nodes_per_proc, node_id, cores_per_unit, unit_id);
+
+    /* First pass: count the caches so that the array can be sized */
+    for (cachenum = 0; ; cachenum++) {
+      unsigned type;
+      eax = 0x8000001d;
+      ecx = cachenum;
+      cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+      type = eax & 0x1f;
+      if (type == 0)
+        break;
+      infos->numcaches++;
+    }
+
+    cache = infos->cache = malloc(infos->numcaches * sizeof(*infos->cache));
+    if (!cache)
+      /* allocation failed (or there was nothing to allocate):
+       * make sure nobody walks a NULL array later */
+      infos->numcaches = 0;
+
+    /* Second pass: fill the array, never writing past what was counted */
+    for (cachenum = 0; cachenum < infos->numcaches; cachenum++) {
+      unsigned long linesize, linepart, ways, sets;
+      unsigned type;
+      eax = 0x8000001d;
+      ecx = cachenum;
+      cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+
+      type = eax & 0x1f;
+
+      if (type == 0)
+        break;
+
+      cache->type = type;
+      cache->level = (eax >> 5) & 0x7;
+      /* Note: actually number of cores */
+      cache->nbthreads_sharing = ((eax >> 14) &  0xfff) + 1;
+
+      cache->linesize = linesize = (ebx & 0xfff) + 1;
+      cache->linepart = linepart = ((ebx >> 12) & 0x3ff) + 1;
+      ways = ((ebx >> 22) & 0x3ff) + 1;
+
+      if (eax & (1 << 9))
+        /* Fully associative */
+        cache->ways = -1;
+      else
+        cache->ways = ways;
+      cache->sets = sets = ecx + 1;
+      cache->size = linesize * linepart * ways * sets;
+      cache->inclusiveness = edx & 0x2;
+
+      /* size is printed with %lu, like fill_amd_cache() does */
+      hwloc_debug("cache %u type %u L%u t%u c%u linesize %lu linepart %lu ways %lu sets %lu, size %luKB\n", cachenum, cache->type, cache->level, cache->nbthreads_sharing, infos->max_nbcores, linesize, linepart, ways, sets, cache->size >> 10);
+
+      cache++;
+    }
+  } else {
+    /* If there's no topoext,
+     * get cache information from cpuid 0x80000005 and 0x80000006
+     * (not supported on Intel)
+     */
+    if (cpuid_type != intel && highest_ext_cpuid >= 0x80000005) {
+      eax = 0x80000005;
+      cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+      fill_amd_cache(infos, 1, 1, ecx); /* L1d */
+      fill_amd_cache(infos, 1, 2, edx); /* L1i */
+    }
+    if (cpuid_type != intel && highest_ext_cpuid >= 0x80000006) {
+      eax = 0x80000006;
+      cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+      if (ecx & 0xf000)
+        /* This is actually supported on Intel but LinePerTag isn't returned in bits 8-11.
+         * Could be useful if some Intels (at least before Core micro-architecture)
+         * support this leaf without leaf 0x4.
+         */
+        fill_amd_cache(infos, 2, 3, ecx); /* L2u */
+      if (edx & 0xf000)
+        fill_amd_cache(infos, 3, 3, edx); /* L3u */
+      /* FIXME: AMD MagnyCours family 0x10 model 0x9 with 8 cores or more actually
+       * have the L3 split in two halves, and associativity is divided as well (48)
+       */
+    }
+  }
+
+  /* Get thread/core + cache information from cpuid 0x04
+   * (not supported on AMD)
+   */
+  if (cpuid_type != amd && highest_cpuid >= 0x04) {
+    /* When the vendor is neither intel nor amd, the AMD-style leaves above
+     * may already have stored caches: append to them instead of replacing
+     * the array while keeping the old count.
+     */
+    unsigned oldnumcaches = infos->numcaches;
+    unsigned newcaches;
+    struct cacheinfo *tmpcaches;
+
+    for (cachenum = 0; ; cachenum++) {
+      unsigned type;
+      eax = 0x04;
+      ecx = cachenum;
+      cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+
+      type = eax & 0x1f;
+
+      hwloc_debug("cache %u type %u\n", cachenum, type);
+
+      if (type == 0)
+        break;
+      infos->numcaches++;
+
+      if (!cachenum) {
+        /* by the way, get thread/core information from the first cache */
+        infos->max_nbcores = ((eax >> 26) & 0x3f) + 1;
+        infos->max_nbthreads = infos->max_log_proc / infos->max_nbcores;
+        if (!infos->max_nbthreads)
+          /* leaf 0x01 announced fewer logical processors than leaf 0x04
+           * announces cores: assume one thread per core rather than
+           * dividing by zero just below */
+          infos->max_nbthreads = 1;
+        hwloc_debug("thus %u threads\n", infos->max_nbthreads);
+        infos->threadid = infos->logprocid % infos->max_nbthreads;
+        infos->coreid = infos->logprocid / infos->max_nbthreads;
+        hwloc_debug("this is thread %u of core %u\n", infos->threadid, infos->coreid);
+      }
+    }
+
+    newcaches = infos->numcaches - oldnumcaches;
+    if (newcaches) {
+      tmpcaches = realloc(infos->cache, infos->numcaches * sizeof(*infos->cache));
+      if (!tmpcaches) {
+        /* keep whatever was stored before, forget the new caches */
+        infos->numcaches = oldnumcaches;
+        newcaches = 0;
+      } else {
+        infos->cache = tmpcaches;
+      }
+    }
+    cache = newcaches ? &infos->cache[oldnumcaches] : NULL;
+
+    for (cachenum = 0; cachenum < newcaches; cachenum++) {
+      unsigned long linesize, linepart, ways, sets;
+      unsigned type;
+      eax = 0x04;
+      ecx = cachenum;
+      cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+
+      type = eax & 0x1f;
+
+      if (type == 0)
+        break;
+
+      cache->type = type;
+      cache->level = (eax >> 5) & 0x7;
+      cache->nbthreads_sharing = ((eax >> 14) & 0xfff) + 1;
+
+      cache->linesize = linesize = (ebx & 0xfff) + 1;
+      cache->linepart = linepart = ((ebx >> 12) & 0x3ff) + 1;
+      ways = ((ebx >> 22) & 0x3ff) + 1;
+      if (eax & (1 << 9))
+        /* Fully associative */
+        cache->ways = -1;
+      else
+        cache->ways = ways;
+      cache->sets = sets = ecx + 1;
+      cache->size = linesize * linepart * ways * sets;
+      cache->inclusiveness = edx & 0x2;
+
+      /* size is printed with %lu, like fill_amd_cache() does */
+      hwloc_debug("cache %u type %u L%u t%u c%u linesize %lu linepart %lu ways %lu sets %lu, size %luKB\n", cachenum, cache->type, cache->level, cache->nbthreads_sharing, infos->max_nbcores, linesize, linepart, ways, sets, cache->size >> 10);
+
+      cache++;
+    }
+  }
+
+  /* Get package/core/thread information from cpuid 0x0b
+   * (Intel x2APIC)
+   */
+  if (cpuid_type == intel && has_x2apic(features)) {
+    unsigned level, apic_nextshift, apic_number, apic_type, apic_id = 0, apic_shift = 0, id;
+    /* First pass: count the levels */
+    for (level = 0; ; level++) {
+      ecx = level;
+      eax = 0x0b;
+      cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+      if (!eax && !ebx)
+        break;
+    }
+    if (level) {
+      infos->otherids = malloc(level * sizeof(*infos->otherids));
+      /* without the array, keep the ids computed from the legacy leaves */
+      if (infos->otherids) {
+        infos->levels = level;
+        /* Second pass: decode each level, bounded by the counted size */
+        for (level = 0; level < infos->levels; level++) {
+          ecx = level;
+          eax = 0x0b;
+          cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+          if (!eax && !ebx)
+            break;
+          apic_nextshift = eax & 0x1f;
+          apic_number = ebx & 0xffff;
+          apic_type = (ecx & 0xff00) >> 8;
+          apic_id = edx;
+          /* bits [apic_shift, apic_nextshift) of the x2APIC id identify
+           * this processor within the current level */
+          id = (apic_id >> apic_shift) & ((1 << (apic_nextshift - apic_shift)) - 1);
+          hwloc_debug("x2APIC %08x %d: nextshift %d num %2d type %d id %2d\n", apic_id, level, apic_nextshift, apic_number, apic_type, id);
+          infos->apicid = apic_id;
+          /* UINT_MAX marks a level whose type is known (thread or core) */
+          infos->otherids[level] = UINT_MAX;
+          switch (apic_type) {
+          case 1:
+            infos->threadid = id;
+            break;
+          case 2:
+            infos->coreid = id;
+            break;
+          default:
+            hwloc_debug("x2APIC %d: unknown type %d\n", level, apic_type);
+            infos->otherids[level] = apic_id >> apic_shift;
+            break;
+          }
+          apic_shift = apic_nextshift;
+        }
+        infos->apicid = apic_id;
+        infos->packageid = apic_id >> apic_shift;
+        hwloc_debug("x2APIC remainder: %d\n", infos->packageid);
+        hwloc_debug("this is thread %u of core %u\n", infos->threadid, infos->coreid);
+      }
+    }
+  }
+
+  /* Remember whether two processors reported the same APIC id */
+  if (hwloc_bitmap_isset(data->apicid_set, infos->apicid))
+    data->apicid_unique = 0;
+  else
+    hwloc_bitmap_set(data->apicid_set, infos->apicid);
+}
+
+/* Attach the CPU identification gathered in info to obj as info attributes:
+ * CPUVendor, CPUFamilyNumber, CPUModelNumber, CPUModel (only when a model
+ * string is available, with its leading spaces skipped) and CPUStepping.
+ * nodup is passed through to hwloc_obj_add_info_nodup().
+ */
+static void
+hwloc_x86_add_cpuinfos(hwloc_obj_t obj, struct procinfo *info, int nodup)
+{
+  char buf[8];
+  const char *model;
+
+  hwloc_obj_add_info_nodup(obj, "CPUVendor", info->cpuvendor, nodup);
+
+  snprintf(buf, sizeof(buf), "%u", info->cpufamilynumber);
+  hwloc_obj_add_info_nodup(obj, "CPUFamilyNumber", buf, nodup);
+
+  snprintf(buf, sizeof(buf), "%u", info->cpumodelnumber);
+  hwloc_obj_add_info_nodup(obj, "CPUModelNumber", buf, nodup);
+
+  if (info->cpumodel[0] != '\0') {
+    /* the model string may start with padding spaces */
+    for (model = info->cpumodel; *model == ' '; model++)
+      ;
+    hwloc_obj_add_info_nodup(obj, "CPUModel", model, nodup);
+  }
+
+  snprintf(buf, sizeof(buf), "%u", info->cpustepping);
+  hwloc_obj_add_info_nodup(obj, "CPUStepping", buf, nodup);
+}
+
+/* Analyse information stored in infos, and build/annotate topology levels accordingly.
+ *
+ * infos holds data->nbprocs entries filled by look_proc(); entries whose
+ * `present` flag is clear are left out of every cpuset built here.
+ * With fulldiscovery set, Package, NUMANode, Group (compute unit), Core and
+ * Cache objects are inserted in the topology. Without it, existing Package
+ * and Cache objects found above each PU are only annotated.
+ * The per-processor cache and otherids arrays are freed before returning,
+ * except on the early return taken when no processor is present.
+ */
+static void summarize(struct hwloc_backend *backend, struct procinfo *infos, int fulldiscovery)
+{
+  struct hwloc_topology *topology = backend->topology;
+  struct hwloc_x86_backend_data_s *data = backend->private_data;
+  unsigned nbprocs = data->nbprocs;
+  hwloc_bitmap_t complete_cpuset = hwloc_bitmap_alloc();
+  unsigned i, j, l, level, type;
+  unsigned nbpackages = 0;
+  int one = -1;
+  unsigned next_group_depth = topology->next_group_depth;
+
+  /* Collect the processors we have data for; `one` ends up being the last of them */
+  for (i = 0; i < nbprocs; i++)
+    if (infos[i].present) {
+      hwloc_bitmap_set(complete_cpuset, i);
+      one = i;
+    }
+
+  if (one == -1) {
+    hwloc_bitmap_free(complete_cpuset);
+    return;
+  }
+
+  /* Ideally, when fulldiscovery=0, we could add any object that doesn't exist yet.
+   * But what if the x86 and the native backends disagree because one is buggy? Which one to trust?
+   * Only annotate existing objects for now.
+   */
+
+  /* Annotate previously existing objects */
+  if (!fulldiscovery) {
+    hwloc_obj_t pu;
+    nbpackages = hwloc_get_nbobjs_by_type(topology, HWLOC_OBJ_PACKAGE);
+    for (pu = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_PU, NULL);
+         pu != NULL;
+         pu = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_PU, pu)) {
+      unsigned infoId = pu->os_index;
+      hwloc_obj_t object;
+
+      /* Skip PUs we hold no cpuid data for: an unknown os_index ((unsigned) -1),
+       * an index beyond the infos array, or a processor that was never looked at.
+       */
+      if (infoId >= nbprocs || !infos[infoId].present)
+        continue;
+
+      /* Walk from the PU up to the root and annotate what we recognize */
+      for (object = pu; object != NULL; object = object->parent) {
+        switch (object->type) {
+        case HWLOC_OBJ_CACHE:
+          {
+            /* Annotate previously-existing cache */
+            unsigned cachetype = 0;
+            int cacheId = -1;
+            unsigned c;
+
+            if (hwloc_obj_get_info_by_name(object, "inclusiveness"))
+              break;
+
+            /* translate to the numbering used in struct cacheinfo */
+            switch (object->attr->cache.type) {
+            case HWLOC_OBJ_CACHE_DATA:
+              cachetype = 1;
+              break;
+            case HWLOC_OBJ_CACHE_INSTRUCTION:
+              cachetype = 2;
+              break;
+            case HWLOC_OBJ_CACHE_UNIFIED:
+              cachetype = 3;
+              break;
+            default:
+              break;
+            }
+
+            /* The level must match exactly. Prefer a cache of the same type;
+             * otherwise fall back to the last cache seen at that level.
+             */
+            for (c = 0; c < infos[infoId].numcaches; c++)
+              if (infos[infoId].cache[c].level == object->attr->cache.depth) {
+                cacheId = (int) c;
+                if (infos[infoId].cache[c].type == cachetype)
+                  break;
+              }
+
+            /* cpuid may not describe that level at all: nothing to say then */
+            if (cacheId >= 0)
+              hwloc_obj_add_info(object, "inclusiveness", infos[infoId].cache[cacheId].inclusiveness?"true":"false");
+          }
+          break;
+        case HWLOC_OBJ_PACKAGE:
+          /* Annotate previously-existing package */
+          /* FIXME: ideally, we should check all bits in case x86 and the native backend disagree. */
+          /* The package is already known from the native backend; only check
+           * that the package id x86 found for this PU doesn't disagree.
+           */
+          if (infos[infoId].packageid == object->os_index || object->os_index == (unsigned) -1)
+            hwloc_x86_add_cpuinfos(object, &infos[infoId], 1);
+          break;
+        default:
+          break;
+        }
+      }
+    }
+  }
+
+
+  /* Look for packages */
+  if (fulldiscovery) {
+    hwloc_bitmap_t packages_cpuset = hwloc_bitmap_dup(complete_cpuset);
+    hwloc_bitmap_t package_cpuset;
+    hwloc_obj_t package;
+
+    while ((i = hwloc_bitmap_first(packages_cpuset)) != (unsigned) -1) {
+      unsigned packageid = infos[i].packageid;
+
+      package_cpuset = hwloc_bitmap_alloc();
+      for (j = i; j < nbprocs; j++) {
+        if (infos[j].packageid == packageid) {
+          hwloc_bitmap_set(package_cpuset, j);
+          hwloc_bitmap_clr(packages_cpuset, j);
+        }
+      }
+      package = hwloc_alloc_setup_object(HWLOC_OBJ_PACKAGE, packageid);
+      package->cpuset = package_cpuset;
+
+      hwloc_x86_add_cpuinfos(package, &infos[i], 0);
+
+      hwloc_debug_1arg_bitmap("os package %u has cpuset %s\n",
+          packageid, package_cpuset);
+      hwloc_insert_object_by_cpuset(topology, package);
+      nbpackages++;
+    }
+    hwloc_bitmap_free(packages_cpuset);
+
+  }
+
+  /* If there was no package, annotate the Machine instead */
+  if ((!nbpackages) && infos[0].cpumodel[0]) {
+    hwloc_x86_add_cpuinfos(hwloc_get_root_obj(topology), &infos[0], 1);
+  }
+
+  /* Look for Numa nodes inside packages */
+  if (fulldiscovery) {
+    hwloc_bitmap_t nodes_cpuset = hwloc_bitmap_dup(complete_cpuset);
+    hwloc_bitmap_t node_cpuset;
+    hwloc_obj_t node;
+
+    /* FIXME: if there's memory inside the root object, divide it into NUMA nodes? */
+
+    while ((i = hwloc_bitmap_first(nodes_cpuset)) != (unsigned) -1) {
+      unsigned packageid = infos[i].packageid;
+      unsigned nodeid = infos[i].nodeid;
+
+      if (nodeid == (unsigned)-1) {
+        hwloc_bitmap_clr(nodes_cpuset, i);
+        continue;
+      }
+
+      node_cpuset = hwloc_bitmap_alloc();
+      for (j = i; j < nbprocs; j++) {
+        if (infos[j].nodeid == (unsigned) -1) {
+          hwloc_bitmap_clr(nodes_cpuset, j);
+          continue;
+        }
+
+        if (infos[j].packageid == packageid && infos[j].nodeid == nodeid) {
+          hwloc_bitmap_set(node_cpuset, j);
+          hwloc_bitmap_clr(nodes_cpuset, j);
+        }
+      }
+      node = hwloc_alloc_setup_object(HWLOC_OBJ_NUMANODE, nodeid);
+      node->cpuset = node_cpuset;
+      node->nodeset = hwloc_bitmap_alloc();
+      hwloc_bitmap_set(node->nodeset, nodeid);
+      hwloc_debug_1arg_bitmap("os node %u has cpuset %s\n",
+          nodeid, node_cpuset);
+      hwloc_insert_object_by_cpuset(topology, node);
+    }
+    hwloc_bitmap_free(nodes_cpuset);
+  }
+
+  /* Look for Compute units inside packages */
+  if (fulldiscovery) {
+    hwloc_bitmap_t units_cpuset = hwloc_bitmap_dup(complete_cpuset);
+    hwloc_bitmap_t unit_cpuset;
+    hwloc_obj_t unit;
+
+    while ((i = hwloc_bitmap_first(units_cpuset)) != (unsigned) -1) {
+      unsigned packageid = infos[i].packageid;
+      unsigned unitid = infos[i].unitid;
+
+      if (unitid == (unsigned)-1) {
+        hwloc_bitmap_clr(units_cpuset, i);
+        continue;
+      }
+
+      unit_cpuset = hwloc_bitmap_alloc();
+      for (j = i; j < nbprocs; j++) {
+        if (infos[j].unitid == (unsigned) -1) {
+          hwloc_bitmap_clr(units_cpuset, j);
+          continue;
+        }
+
+        if (infos[j].packageid == packageid && infos[j].unitid == unitid) {
+          hwloc_bitmap_set(unit_cpuset, j);
+          hwloc_bitmap_clr(units_cpuset, j);
+        }
+      }
+      unit = hwloc_alloc_setup_object(HWLOC_OBJ_GROUP, unitid);
+      unit->cpuset = unit_cpuset;
+      hwloc_debug_1arg_bitmap("os unit %u has cpuset %s\n",
+          unitid, unit_cpuset);
+      hwloc_insert_object_by_cpuset(topology, unit);
+    }
+    hwloc_bitmap_free(units_cpuset);
+  }
+
+  /* Look for unknown objects */
+  /* NOTE(review): unlike the other insertions, this one is not restricted to
+   * fulldiscovery - confirm whether that is intended. */
+  if (infos[one].otherids) {
+    /* level is unsigned: decrementing past 0 wraps around and ends the loop */
+    for (level = infos[one].levels-1; level <= infos[one].levels-1; level--) {
+      if (infos[one].otherids[level] != UINT_MAX) {
+        hwloc_bitmap_t unknowns_cpuset = hwloc_bitmap_dup(complete_cpuset);
+        hwloc_bitmap_t unknown_cpuset;
+        hwloc_obj_t unknown_obj;
+
+        while ((i = hwloc_bitmap_first(unknowns_cpuset)) != (unsigned) -1) {
+          unsigned unknownid;
+
+          if (!infos[i].otherids || level >= infos[i].levels) {
+            /* this processor did not report that level, nothing to group */
+            hwloc_bitmap_clr(unknowns_cpuset, i);
+            continue;
+          }
+          unknownid = infos[i].otherids[level];
+
+          unknown_cpuset = hwloc_bitmap_alloc();
+          for (j = i; j < nbprocs; j++) {
+            /* processors that were never filled have no otherids array */
+            if (infos[j].otherids && level < infos[j].levels
+                && infos[j].otherids[level] == unknownid) {
+              hwloc_bitmap_set(unknown_cpuset, j);
+              hwloc_bitmap_clr(unknowns_cpuset, j);
+            }
+          }
+          unknown_obj = hwloc_alloc_setup_object(HWLOC_OBJ_GROUP, unknownid);
+          unknown_obj->cpuset = unknown_cpuset;
+          unknown_obj->attr->group.depth = topology->next_group_depth + level;
+          if (next_group_depth <= topology->next_group_depth + level)
+            next_group_depth = topology->next_group_depth + level + 1;
+          hwloc_debug_2args_bitmap("os unknown%d %u has cpuset %s\n",
+              level, unknownid, unknown_cpuset);
+          hwloc_insert_object_by_cpuset(topology, unknown_obj);
+        }
+        hwloc_bitmap_free(unknowns_cpuset);
+      }
+    }
+  }
+
+  /* Look for cores */
+  if (fulldiscovery) {
+    hwloc_bitmap_t cores_cpuset = hwloc_bitmap_dup(complete_cpuset);
+    hwloc_bitmap_t core_cpuset;
+    hwloc_obj_t core;
+
+    while ((i = hwloc_bitmap_first(cores_cpuset)) != (unsigned) -1) {
+      unsigned packageid = infos[i].packageid;
+      unsigned coreid = infos[i].coreid;
+
+      if (coreid == (unsigned) -1) {
+        hwloc_bitmap_clr(cores_cpuset, i);
+        continue;
+      }
+
+      core_cpuset = hwloc_bitmap_alloc();
+      for (j = i; j < nbprocs; j++) {
+        if (infos[j].coreid == (unsigned) -1) {
+          hwloc_bitmap_clr(cores_cpuset, j);
+          continue;
+        }
+
+        if (infos[j].packageid == packageid && infos[j].coreid == coreid) {
+          hwloc_bitmap_set(core_cpuset, j);
+          hwloc_bitmap_clr(cores_cpuset, j);
+        }
+      }
+      core = hwloc_alloc_setup_object(HWLOC_OBJ_CORE, coreid);
+      core->cpuset = core_cpuset;
+      hwloc_debug_1arg_bitmap("os core %u has cpuset %s\n",
+          coreid, core_cpuset);
+      hwloc_insert_object_by_cpuset(topology, core);
+    }
+    hwloc_bitmap_free(cores_cpuset);
+  }
+
+  /* Look for caches */
+  /* First find max level */
+  level = 0;
+  for (i = 0; i < nbprocs; i++)
+    for (j = 0; j < infos[i].numcaches; j++)
+      if (infos[i].cache[j].level > level)
+        level = infos[i].cache[j].level;
+
+  /* Look for known types */
+  if (fulldiscovery) while (level > 0) {
+    for (type = 1; type <= 3; type++) {
+      /* Look for caches of that type at level level */
+      {
+        hwloc_bitmap_t caches_cpuset = hwloc_bitmap_dup(complete_cpuset);
+        hwloc_bitmap_t cache_cpuset;
+        hwloc_obj_t cache;
+
+        while ((i = hwloc_bitmap_first(caches_cpuset)) != (unsigned) -1) {
+          unsigned packageid = infos[i].packageid;
+
+          for (l = 0; l < infos[i].numcaches; l++) {
+            if (infos[i].cache[l].level == level && infos[i].cache[l].type == type)
+              break;
+          }
+          if (l == infos[i].numcaches) {
+            /* no cache Llevel of that type in i */
+            hwloc_bitmap_clr(caches_cpuset, i);
+            continue;
+          }
+
+          /* Found a matching cache, now look for others sharing it */
+          {
+            /* processors sharing a cache have the same apicid / nbthreads_sharing */
+            unsigned cacheid = infos[i].apicid / infos[i].cache[l].nbthreads_sharing;
+
+            cache_cpuset = hwloc_bitmap_alloc();
+            for (j = i; j < nbprocs; j++) {
+              unsigned l2;
+              for (l2 = 0; l2 < infos[j].numcaches; l2++) {
+                if (infos[j].cache[l2].level == level && infos[j].cache[l2].type == type)
+                  break;
+              }
+              if (l2 == infos[j].numcaches) {
+                /* no cache Llevel of that type in j */
+                hwloc_bitmap_clr(caches_cpuset, j);
+                continue;
+              }
+              if (infos[j].packageid == packageid && infos[j].apicid / infos[j].cache[l2].nbthreads_sharing == cacheid) {
+                hwloc_bitmap_set(cache_cpuset, j);
+                hwloc_bitmap_clr(caches_cpuset, j);
+              }
+            }
+            cache = hwloc_alloc_setup_object(HWLOC_OBJ_CACHE, cacheid);
+            cache->attr->cache.depth = level;
+            cache->attr->cache.size = infos[i].cache[l].size;
+            cache->attr->cache.linesize = infos[i].cache[l].linesize;
+            cache->attr->cache.associativity = infos[i].cache[l].ways;
+            switch (infos[i].cache[l].type) {
+              case 1:
+                cache->attr->cache.type = HWLOC_OBJ_CACHE_DATA;
+                break;
+              case 2:
+                cache->attr->cache.type = HWLOC_OBJ_CACHE_INSTRUCTION;
+                break;
+              case 3:
+                cache->attr->cache.type = HWLOC_OBJ_CACHE_UNIFIED;
+                break;
+            }
+            hwloc_obj_add_info(cache,"inclusiveness",infos[i].cache[l].inclusiveness?"true":"false");
+            cache->cpuset = cache_cpuset;
+            hwloc_debug_2args_bitmap("os L%u cache %u has cpuset %s\n",
+                level, cacheid, cache_cpuset);
+            hwloc_insert_object_by_cpuset(topology, cache);
+          }
+        }
+        hwloc_bitmap_free(caches_cpuset);
+      }
+    }
+    level--;
+  }
+
+  /* Release what look_proc() allocated; free(NULL) is harmless */
+  for (i = 0; i < nbprocs; i++) {
+    free(infos[i].cache);
+    free(infos[i].otherids);
+  }
+
+  hwloc_bitmap_free(complete_cpuset);
+  topology->next_group_depth = next_group_depth;
+}
+
+/* Run look_proc() on every processor, then summarize() the results.
+ *
+ * Without a cpuid dump, the current binding is saved with get_cpubind(),
+ * the caller is bound to each processor in turn with set_cpubind() (a
+ * processor that cannot be bound to is skipped), and the saved binding is
+ * put back at the end. With a dump, each processor's data is read with
+ * cpuiddump_read() instead and no binding takes place.
+ *
+ * Returns -1 when the current binding cannot be retrieved; otherwise the
+ * effective fulldiscovery value, which is forced to 0 when the backend data
+ * says APIC ids are not unique.
+ */
+static int
+look_procs(struct hwloc_backend *backend, struct procinfo *infos, int fulldiscovery,
+           unsigned highest_cpuid, unsigned highest_ext_cpuid, unsigned *features, enum cpuid_type cpuid_type,
+           int (*get_cpubind)(hwloc_topology_t topology, hwloc_cpuset_t set, int flags),
+           int (*set_cpubind)(hwloc_topology_t topology, hwloc_const_cpuset_t set, int flags))
+{
+  struct hwloc_x86_backend_data_s *data = backend->private_data;
+  struct hwloc_topology *topology = backend->topology;
+  const unsigned nbprocs = data->nbprocs;
+  hwloc_bitmap_t saved_binding = NULL;
+  hwloc_bitmap_t one_cpu = NULL;
+  unsigned cpu;
+
+  if (!data->src_cpuiddump_path) {
+    saved_binding = hwloc_bitmap_alloc();
+    if (get_cpubind(topology, saved_binding, HWLOC_CPUBIND_STRICT) != 0) {
+      hwloc_bitmap_free(saved_binding);
+      return -1;
+    }
+    one_cpu = hwloc_bitmap_alloc();
+  }
+
+  for (cpu = 0; cpu < nbprocs; cpu++) {
+    struct cpuiddump *dump = NULL;
+
+    if (data->src_cpuiddump_path) {
+      dump = cpuiddump_read(data->src_cpuiddump_path, cpu);
+    } else {
+      hwloc_bitmap_only(one_cpu, cpu);
+      hwloc_debug("binding to CPU%d\n", cpu);
+      if (set_cpubind(topology, one_cpu, HWLOC_CPUBIND_STRICT) != 0) {
+        /* leave this processor marked as not present */
+        hwloc_debug("could not bind to CPU%d: %s\n", cpu, strerror(errno));
+        continue;
+      }
+    }
+
+    look_proc(backend, &infos[cpu], highest_cpuid, highest_ext_cpuid, features, cpuid_type, dump);
+
+    if (data->src_cpuiddump_path)
+      cpuiddump_free(dump);
+  }
+
+  if (!data->src_cpuiddump_path) {
+    /* go back to where we were; the result is not checked */
+    set_cpubind(topology, saved_binding, 0);
+    hwloc_bitmap_free(one_cpu);
+    hwloc_bitmap_free(saved_binding);
+  }
+
+  /* duplicate APIC ids make the ids unreliable: only annotate in that case */
+  if (!data->apicid_unique)
+    fulldiscovery = 0;
+
+  summarize(backend, infos, fulldiscovery);
+
+  /* success, but objects added only if fulldiscovery */
+  return fulldiscovery;
+}
+/* OS-specific state saved before and restored after looking at processors
+ * (hwloc_look_x86() calls save before look_procs() and restore afterwards).
+ * On FreeBSD with cpuset_setid() the state is a cpuset id; elsewhere both
+ * hooks are empty. Nothing is done when a cpuid dump is used.
+ */
+#if defined HWLOC_FREEBSD_SYS && defined HAVE_CPUSET_SETID
+#include <sys/param.h>
+#include <sys/cpuset.h>
+typedef cpusetid_t hwloc_x86_os_state_t;
+static void hwloc_x86_os_state_save(hwloc_x86_os_state_t *state, struct cpuiddump *src_cpuiddump)
+{
+  if (!src_cpuiddump) {
+    /* temporarily make all cpus available during discovery:
+     * remember the current cpuset id in *state, then switch to cpuset 0.
+     * NOTE(review): neither return value is checked; if cpuset_getid() fails
+     * *state is presumably left unset for the later restore - confirm. */
+    cpuset_getid(CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1, state);
+    cpuset_setid(CPU_WHICH_PID, -1, 0);
+  }
+}
+static void hwloc_x86_os_state_restore(hwloc_x86_os_state_t *state, struct cpuiddump *src_cpuiddump)
+{
+  if (!src_cpuiddump) {
+    /* restore initial cpuset, i.e. the id stored by hwloc_x86_os_state_save() */
+    cpuset_setid(CPU_WHICH_PID, -1, *state);
+  }
+}
+#else /* !defined HWLOC_FREEBSD_SYS || !defined HAVE_CPUSET_SETID */
+typedef void * hwloc_x86_os_state_t; /* placeholder type, never read or written by the stubs below */
+static void hwloc_x86_os_state_save(hwloc_x86_os_state_t *state __hwloc_attribute_unused, struct cpuiddump *src_cpuiddump __hwloc_attribute_unused) { }
+static void hwloc_x86_os_state_restore(hwloc_x86_os_state_t *state __hwloc_attribute_unused, struct cpuiddump *src_cpuiddump __hwloc_attribute_unused) { }
+#endif /* !defined HWLOC_FREEBSD_SYS || !defined HAVE_CPUSET_SETID */
+
+/* The vendor string "GenuineIntel" packed four characters per register,
+ * first character in the lowest byte, split over EBX, EDX, ECX in that order.
+ * hwloc_look_x86() compares these against the registers of cpuid leaf 0x00.
+ */
+#define INTEL_EBX ('G' | ('e'<<8) | ('n'<<16) | ('u'<<24))
+#define INTEL_EDX ('i' | ('n'<<8) | ('e'<<16) | ('I'<<24))
+#define INTEL_ECX ('n' | ('t'<<8) | ('e'<<16) | ('l'<<24))
+/* Same packing for the vendor string "AuthenticAMD". */
+#define AMD_EBX ('A' | ('u'<<8) | ('t'<<16) | ('h'<<24))
+#define AMD_EDX ('e' | ('n'<<8) | ('t'<<16) | ('i'<<24))
+#define AMD_ECX ('c' | ('A'<<8) | ('M'<<16) | ('D'<<24))
+
+/* fake cpubind for when nbprocs=1 and no binding support:
+ * stands in for a get_cpubind hook, ignores all its arguments, leaves set
+ * untouched and always reports success (0).
+ */
+static int fake_get_cpubind(hwloc_topology_t topology __hwloc_attribute_unused,
+			    hwloc_cpuset_t set __hwloc_attribute_unused,
+			    int flags __hwloc_attribute_unused)
+{
+  return 0;
+}
+static int fake_set_cpubind(hwloc_topology_t topology __hwloc_attribute_unused,
+			    hwloc_const_cpuset_t set __hwloc_attribute_unused,
+			    int flags __hwloc_attribute_unused)
+{
+  return 0; /* stand-in set_cpubind hook: ignores every argument, binds nothing, always reports success */
+}
+/* Query cpuid (or a cpuid dump) for every processor and let summarize()
+ * build or annotate the topology.
+ *
+ * backend: x86 backend whose private data gives nbprocs and the optional
+ *   cpuid dump path.
+ * fulldiscovery: non-zero to insert objects, zero to only annotate.
+ *
+ * Returns -1 when nothing could be looked at (no binding support with several
+ * processors, no usable cpuid, allocation failure, cpuid leaf 0x01 missing, or
+ * look_procs() failing with more than one processor). Otherwise returns what
+ * look_procs() returned, or fulldiscovery on the single-processor fallback.
+ */
+static
+int hwloc_look_x86(struct hwloc_backend *backend, int fulldiscovery)
+{
+  struct hwloc_x86_backend_data_s *data = backend->private_data;
+  unsigned nbprocs = data->nbprocs;
+  unsigned eax, ebx, ecx = 0, edx;
+  unsigned i;
+  unsigned highest_cpuid;
+  unsigned highest_ext_cpuid;
+  /* This stores cpuid features with the same indexing as Linux */
+  unsigned features[10] = { 0 };
+  struct procinfo *infos = NULL;
+  enum cpuid_type cpuid_type = unknown;
+  hwloc_x86_os_state_t os_state;
+  struct hwloc_binding_hooks hooks;
+  struct hwloc_topology_support support;
+  struct hwloc_topology_membind_support memsupport __hwloc_attribute_unused;
+  int (*get_cpubind)(hwloc_topology_t topology, hwloc_cpuset_t set, int flags) = NULL;
+  int (*set_cpubind)(hwloc_topology_t topology, hwloc_const_cpuset_t set, int flags) = NULL;
+  struct cpuiddump *src_cpuiddump = NULL;
+  int ret = -1;
+  /* pick the cpuid source: a dump, or the processors themselves through binding */
+  if (data->src_cpuiddump_path) {
+    /* just read cpuid from the dump */
+    src_cpuiddump = cpuiddump_read(data->src_cpuiddump_path, 0);
+  } else {
+    /* otherwise check if binding works */
+    memset(&hooks, 0, sizeof(hooks));
+    support.membind = &memsupport;
+    hwloc_set_native_binding_hooks(&hooks, &support);
+    if (hooks.get_thisproc_cpubind && hooks.set_thisproc_cpubind) {
+      get_cpubind = hooks.get_thisproc_cpubind;
+      set_cpubind = hooks.set_thisproc_cpubind;
+    } else if (hooks.get_thisthread_cpubind && hooks.set_thisthread_cpubind) {
+      get_cpubind = hooks.get_thisthread_cpubind;
+      set_cpubind = hooks.set_thisthread_cpubind;
+    } else {
+      /* we need binding support if there are multiple PUs */
+      if (nbprocs > 1)
+	goto out;
+      get_cpubind = fake_get_cpubind;
+      set_cpubind = fake_set_cpubind;
+    }
+  }
+  /* give up when there is neither a dump nor, per hwloc_have_x86_cpuid(), a usable cpuid */
+  if (!src_cpuiddump && !hwloc_have_x86_cpuid())
+    goto out;
+  /* one zeroed procinfo per processor; every id starts as "unknown" (-1) */
+  infos = calloc(nbprocs, sizeof(struct procinfo));
+  if (NULL == infos)
+    goto out;
+  for (i = 0; i < nbprocs; i++) {
+    infos[i].nodeid = (unsigned) -1;
+    infos[i].packageid = (unsigned) -1;
+    infos[i].unitid = (unsigned) -1;
+    infos[i].coreid = (unsigned) -1;
+    infos[i].threadid = (unsigned) -1;
+  }
+  /* leaf 0x00: eax is the highest basic leaf, ebx/ecx/edx identify the vendor */
+  eax = 0x00;
+  cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+  highest_cpuid = eax;
+  if (ebx == INTEL_EBX && ecx == INTEL_ECX && edx == INTEL_EDX)
+    cpuid_type = intel;
+  if (ebx == AMD_EBX && ecx == AMD_ECX && edx == AMD_EDX)
+    cpuid_type = amd;
+  /* any other vendor keeps cpuid_type == unknown */
+  hwloc_debug("highest cpuid %x, cpuid type %u\n", highest_cpuid, cpuid_type);
+  if (highest_cpuid < 0x01) {
+      goto out_with_infos;
+  }
+  /* leaf 0x01: edx and ecx become feature words 0 and 4 */
+  eax = 0x01;
+  cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+  features[0] = edx;
+  features[4] = ecx;
+  /* leaf 0x80000000: eax is the highest extended leaf */
+  eax = 0x80000000;
+  cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+  highest_ext_cpuid = eax;
+
+  hwloc_debug("highest extended cpuid %x\n", highest_ext_cpuid);
+  /* leaf 0x07 (when available): ebx becomes feature word 9 */
+  if (highest_cpuid >= 0x7) {
+    eax = 0x7;
+    cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+    features[9] = ebx;
+  }
+  /* leaf 0x80000001, skipped on Intel: edx and ecx become feature words 1 and 6 */
+  if (cpuid_type != intel && highest_ext_cpuid >= 0x80000001) {
+    eax = 0x80000001;
+    cpuid_or_from_dump(&eax, &ebx, &ecx, &edx, src_cpuiddump);
+    features[1] = edx;
+    features[6] = ecx;
+  }
+  /* from here on, every exit must go through out_with_os_state */
+  hwloc_x86_os_state_save(&os_state, src_cpuiddump);
+  /* normal path: look at every processor, then summarize */
+  ret = look_procs(backend, infos, fulldiscovery,
+		   highest_cpuid, highest_ext_cpuid, features, cpuid_type,
+		   get_cpubind, set_cpubind);
+  if (ret >= 0)
+    /* success, we're done */
+    goto out_with_os_state;
+  /* look_procs() failed before summarizing anything: fall back when possible */
+  if (nbprocs == 1) {
+    /* only one processor, no need to bind */
+    look_proc(backend, &infos[0], highest_cpuid, highest_ext_cpuid, features, cpuid_type, src_cpuiddump);
+    summarize(backend, infos, fulldiscovery);
+    ret = fulldiscovery;
+  }
+
+out_with_os_state:
+  hwloc_x86_os_state_restore(&os_state, src_cpuiddump);
+
+out_with_infos:
+  if (NULL != infos) {
+      free(infos);
+  }
+
+out:
+  if (src_cpuiddump)
+    cpuiddump_free(src_cpuiddump);
+  return ret;
+}
+
+static int
+hwloc_x86_discover(struct hwloc_backend *backend)
+{
+  struct hwloc_x86_backend_data_s *data = backend->private_data;
+  struct hwloc_topology *topology = backend->topology;
+  int alreadypus = 0; /* set when PUs already exist, so the PU level is not created again below */
+  int ret;
+  /* Returns 1 after taking the full-discovery path, 0 when nothing or only an annotation was added. */
+  if (!data->src_cpuiddump_path) {
+    data->nbprocs = hwloc_fallback_nbprocessors(topology); /* no cpuid dump: use the fallback PU count */
+
+    if (!topology->is_thissystem) {
+      hwloc_debug("%s", "\nno x86 detection (not thissystem)\n");
+      return 0;
+    }
+  }
+
+  if (topology->levels[0][0]->cpuset) {
+    /* somebody else discovered things */
+    if (topology->nb_levels == 2 && topology->level_nbobjects[1] == data->nbprocs) {
+      /* only PUs were discovered, as much as we would, complete the topology with everything else */
+      alreadypus = 1;
+      goto fulldiscovery;
+    }
+
+    /* several object types were added, we can't easily complete, just annotate a bit */
+    ret = hwloc_look_x86(backend, 0);
+    if (ret) /* NOTE(review): any non-zero value, including a negative one, adds the info - confirm intended */
+      hwloc_obj_add_info(topology->levels[0][0], "Backend", "x86");
+    return 0;
+  } else {
+    /* topology is empty, initialize it */
+    hwloc_alloc_obj_cpusets(topology->levels[0][0]);
+  }
+
+fulldiscovery:
+  hwloc_look_x86(backend, 1); /* return value intentionally not checked, see below */
+  /* if failed, just continue and create PUs */
+
+  if (!alreadypus)
+    hwloc_setup_pu_level(topology, data->nbprocs);
+
+  hwloc_obj_add_info(topology->levels[0][0], "Backend", "x86");
+
+  if (!data->src_cpuiddump_path) { /* CPUID dump works for both x86 and x86_64 */
+#ifdef HAVE_UNAME
+    hwloc_add_uname_info(topology, NULL); /* we already know is_thissystem() is true */
+#else
+    /* uname isn't available, manually setup the "Architecture" info */
+#ifdef HWLOC_X86_64_ARCH
+    hwloc_obj_add_info(topology->levels[0][0], "Architecture", "x86_64");
+#else
+    hwloc_obj_add_info(topology->levels[0][0], "Architecture", "x86");
+#endif
+#endif
+  }
+
+  return 1;
+}
+
+static int
+hwloc_x86_check_cpuiddump_input(const char *src_cpuiddump_path, hwloc_bitmap_t set)
+{ /* returns 0 and fills set with the pu<N> indexes found, or -1 when the directory cannot be used */
+  struct dirent *dirent;
+  DIR *dir;
+  char *path;
+  FILE *file;
+  char line [32];
+
+  dir = opendir(src_cpuiddump_path);
+  if (!dir)
+    return -1;
+
+  path = malloc(strlen(src_cpuiddump_path) + strlen("/hwloc-cpuid-info") + 1);
+  if (!path)
+    goto out_with_dir;
+
+  sprintf(path, "%s/hwloc-cpuid-info", src_cpuiddump_path); /* buffer was sized for exactly this string */
+  file = fopen(path, "r");
+  if (!file) {
+    fprintf(stderr, "Couldn't open dumped cpuid summary %s\n", path);
+    free(path);
+    goto out_with_dir;
+  }
+  if (!fgets(line, sizeof(line), file)) {
+    fprintf(stderr, "Failed to read dumped cpuid summary in %s\n", path);
+    fclose(file);
+    free(path);
+    goto out_with_dir;
+  }
+  fclose(file);
+  if (strcmp(line, "Architecture: x86\n")) { /* the first line of the summary must match exactly */
+    fprintf(stderr, "Found non-x86 dumped cpuid summary in %s: %s\n", path, line);
+    free(path);
+    goto out_with_dir;
+  }
+  free(path);
+
+  while ((dirent = readdir(dir)) != NULL) {
+    if (!strncmp(dirent->d_name, "pu", 2)) {
+      char *end;
+      unsigned long idx = strtoul(dirent->d_name+2, &end, 10);
+      if (dirent->d_name[2] >= '0' && dirent->d_name[2] <= '9' && !*end) /* digits only: rejects bare "pu", signs, blanks */
+	hwloc_bitmap_set(set, idx);
+      else
+	fprintf(stderr, "Ignoring invalid dirent `%s' in dumped cpuid directory `%s'\n",
+		dirent->d_name, src_cpuiddump_path);
+    }
+  }
+  closedir(dir);
+
+  if (hwloc_bitmap_iszero(set)) {
+    fprintf(stderr, "Did not find any valid pu%%u entry in dumped cpuid directory `%s'\n",
+	    src_cpuiddump_path);
+    return -1;
+  } else if (hwloc_bitmap_last(set) != hwloc_bitmap_weight(set) - 1) {
+    /* The x86 backend enforces a contiguous set of PUs starting at 0 so far */
+    fprintf(stderr, "Found non-contigous pu%%u range in dumped cpuid directory `%s'\n",
+	    src_cpuiddump_path);
+    return -1;
+  }
+
+  return 0;
+
+out_with_dir: /* error exit for every failure that happens while dir is still open */
+  closedir(dir);
+  return -1;
+}
+
+static void
+hwloc_x86_backend_disable(struct hwloc_backend *backend)
+{
+  struct hwloc_x86_backend_data_s *priv = backend->private_data;
+  /* release what the private data owns, then the private data itself */
+  hwloc_bitmap_free(priv->apicid_set);
+  free(priv->src_cpuiddump_path); /* may be NULL; free(NULL) is a no-op */
+  free(priv);
+}
+
+static struct hwloc_backend *
+hwloc_x86_component_instantiate(struct hwloc_disc_component *component,
+				const void *_data1 __hwloc_attribute_unused,
+				const void *_data2 __hwloc_attribute_unused,
+				const void *_data3 __hwloc_attribute_unused)
+{ /* returns the new backend, or NULL when the backend or its private data cannot be allocated */
+  struct hwloc_backend *backend;
+  struct hwloc_x86_backend_data_s *data;
+  const char *src_cpuiddump_path;
+
+  backend = hwloc_backend_alloc(component);
+  if (!backend)
+    goto out;
+
+  data = malloc(sizeof(*data));
+  if (!data) {
+    errno = ENOMEM;
+    goto out_with_backend;
+  }
+
+  backend->private_data = data;
+  backend->flags = HWLOC_BACKEND_FLAG_NEED_LEVELS;
+  backend->discover = hwloc_x86_discover;
+  backend->disable = hwloc_x86_backend_disable;
+
+  /* default values */
+  data->apicid_set = hwloc_bitmap_alloc();
+  data->apicid_unique = 1;
+  data->src_cpuiddump_path = NULL; /* data->nbprocs is left unset here; hwloc_x86_discover() fills it in that case */
+
+  src_cpuiddump_path = getenv("HWLOC_CPUID_PATH");
+  if (src_cpuiddump_path) {
+    hwloc_bitmap_t set = hwloc_bitmap_alloc(); /* temporary set of pu indexes found in the dump directory */
+    if (!hwloc_x86_check_cpuiddump_input(src_cpuiddump_path, set)) {
+      backend->is_thissystem = 0;
+      data->src_cpuiddump_path = strdup(src_cpuiddump_path); /* NOTE(review): strdup() result is not checked */
+      data->nbprocs = hwloc_bitmap_weight(set);
+    } else {
+      fprintf(stderr, "Ignoring dumped cpuid directory.\n");
+    }
+    hwloc_bitmap_free(set);
+  }
+
+  return backend;
+
+ out_with_backend:
+  free(backend);
+ out:
+  return NULL;
+}
+
+static struct hwloc_disc_component hwloc_x86_disc_component = {
+  HWLOC_DISC_COMPONENT_TYPE_CPU,
+  "x86", /* same string as the "Backend" info value added by hwloc_x86_discover() */
+  HWLOC_DISC_COMPONENT_TYPE_GLOBAL,
+  hwloc_x86_component_instantiate, /* builds the backend whose callbacks are defined above */
+  45, /* between native and no_os */
+  NULL
+};
+
+const struct hwloc_component hwloc_x86_component = {
+  HWLOC_COMPONENT_ABI,
+  NULL, NULL,
+  HWLOC_COMPONENT_TYPE_DISC,
+  0,
+  &hwloc_x86_disc_component /* the x86 discovery component defined just above */
+};
diff --git a/ext/hwloc/hwloc/topology.c b/ext/hwloc/hwloc/topology.c
new file mode 100644
index 0000000..a67d036
--- /dev/null
+++ b/ext/hwloc/hwloc/topology.c
@@ -0,0 +1,3436 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2015 Inria.  All rights reserved.
+ * Copyright © 2009-2012 Université Bordeaux
+ * Copyright © 2009-2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#include <private/autogen/config.h>
+
+#define _ATFILE_SOURCE
+#include <assert.h>
+#include <sys/types.h>
+#ifdef HAVE_DIRENT_H
+#include <dirent.h>
+#endif
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#include <string.h>
+#include <errno.h>
+#include <stdio.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <float.h>
+
+#include <hwloc.h>
+#include <private/private.h>
+#include <private/debug.h>
+#include <private/misc.h>
+
+#ifdef HAVE_MACH_MACH_INIT_H
+#include <mach/mach_init.h>
+#endif
+#ifdef HAVE_MACH_MACH_HOST_H
+#include <mach/mach_host.h>
+#endif
+
+#ifdef HAVE_SYS_PARAM_H
+#include <sys/param.h>
+#endif
+
+#ifdef HAVE_SYS_SYSCTL_H
+#include <sys/sysctl.h>
+#endif
+
+#ifdef HWLOC_WIN_SYS
+#include <windows.h>
+#endif
+
+unsigned hwloc_get_api_version(void)
+{
+  return HWLOC_API_VERSION; /* value of the macro when this file was compiled */
+}
+
+int hwloc_hide_errors(void)
+{
+  static int hide = 0; /* cached answer; stays 0 when HWLOC_HIDE_ERRORS is not set */
+  static int checked = 0; /* non-zero once the environment has been consulted */
+  if (!checked) {
+    const char *envvar = getenv("HWLOC_HIDE_ERRORS");
+    if (envvar)
+      hide = atoi(envvar);
+    checked = 1;
+  }
+  return hide;
+}
+
+void hwloc_report_os_error(const char *msg, int line)
+{
+    static int reported = 0; /* the banner is printed at most once per process */
+
+    if (!reported && !hwloc_hide_errors()) { /* hwloc_hide_errors() silences the banner entirely */
+        fprintf(stderr, "****************************************************************************\n");
+        fprintf(stderr, "* hwloc %s has encountered what looks like an error from the operating system.\n", HWLOC_VERSION);
+        fprintf(stderr, "*\n");
+        fprintf(stderr, "* %s\n", msg);
+        fprintf(stderr, "* Error occurred in topology.c line %d\n", line);
+        fprintf(stderr, "*\n");
+        fprintf(stderr, "* The following FAQ entry in the hwloc documentation may help:\n");
+        fprintf(stderr, "*   What should I do when hwloc reports \"operating system\" warnings?\n");
+        fprintf(stderr, "* Otherwise please report this error message to the hwloc user's mailing list,\n");
+#ifdef HWLOC_LINUX_SYS
+        fprintf(stderr, "* along with the output+tarball generated by the hwloc-gather-topology script.\n");
+#else
+	fprintf(stderr, "* along with any relevant topology information from your platform.\n");
+#endif
+        fprintf(stderr, "****************************************************************************\n");
+        reported = 1;
+    }
+}
+
+#if defined(HAVE_SYSCTLBYNAME)
+int hwloc_get_sysctlbyname(const char *name, int64_t *ret)
+{ /* stores the value in *ret and returns 0, or returns -1 on failure or unexpected size */
+  union {
+    int32_t i32;
+    int64_t i64;
+  } n;
+  size_t size = sizeof(n); /* in: room available; out: size actually written by sysctlbyname() */
+  if (sysctlbyname(name, &n, &size, NULL, 0))
+    return -1;
+  switch (size) { /* pick the union member that matches the size reported back */
+    case sizeof(n.i32):
+      *ret = n.i32;
+      break;
+    case sizeof(n.i64):
+      *ret = n.i64;
+      break;
+    default:
+      return -1;
+  }
+  return 0;
+}
+#endif
+
+#if defined(HAVE_SYSCTL)
+int hwloc_get_sysctl(int name[], unsigned namelen, int *ret)
+{ /* stores the int value in *ret and returns 0, or returns -1 on failure */
+  int n;
+  size_t size = sizeof(n);
+  if (sysctl(name, namelen, &n, &size, NULL, 0))
+    return -1;
+  if (size != sizeof(n)) /* only int-sized values are accepted */
+    return -1;
+  *ret = n;
+  return 0;
+}
+#endif
+
+/* Return the OS-provided number of processors.  Unlike other methods such as
+   reading sysfs on Linux, this method is not virtualizable; thus it's only
+   used as a fall-back method, allowing virtual backends (FSROOT, etc) to
+   have the desired effect.  */
+unsigned
+hwloc_fallback_nbprocessors(struct hwloc_topology *topology) {
+  int n; /* every branch below leaves a value < 1 here when the count is unknown */
+#if HAVE_DECL__SC_NPROCESSORS_ONLN
+  n = sysconf(_SC_NPROCESSORS_ONLN);
+#elif HAVE_DECL__SC_NPROC_ONLN
+  n = sysconf(_SC_NPROC_ONLN);
+#elif HAVE_DECL__SC_NPROCESSORS_CONF
+  n = sysconf(_SC_NPROCESSORS_CONF);
+#elif HAVE_DECL__SC_NPROC_CONF
+  n = sysconf(_SC_NPROC_CONF);
+#elif defined(HAVE_HOST_INFO) && HAVE_HOST_INFO
+  struct host_basic_info info;
+  mach_msg_type_number_t count = HOST_BASIC_INFO_COUNT;
+  host_info(mach_host_self(), HOST_BASIC_INFO, (integer_t*) &info, &count); /* NOTE(review): return value not checked */
+  n = info.avail_cpus;
+#elif defined(HAVE_SYSCTLBYNAME)
+  int64_t nn;
+  if (hwloc_get_sysctlbyname("hw.ncpu", &nn))
+    nn = -1;
+  n = nn;
+#elif defined(HAVE_SYSCTL) && HAVE_DECL_CTL_HW && HAVE_DECL_HW_NCPU
+  static int name[2] = {CTL_HW, HW_NCPU};
+  if (hwloc_get_sysctl(name, sizeof(name)/sizeof(*name), &n))
+    n = -1;
+#elif defined(HWLOC_WIN_SYS)
+  SYSTEM_INFO sysinfo;
+  GetSystemInfo(&sysinfo);
+  n = sysinfo.dwNumberOfProcessors;
+#else
+#ifdef __GNUC__
+#warning No known way to discover number of available processors on this system
+#warning hwloc_fallback_nbprocessors will default to 1
+#endif
+  n = -1;
+#endif
+  if (n >= 1)
+    topology->support.discovery->pu = 1; /* a usable count was obtained: flag PU discovery as supported */
+  else
+    n = 1; /* unknown or failed: report a single processor */
+  return n;
+}
+
+/*
+ * Create nb_pus PU objects, with OS indexes 0 to nb_pus-1 and one bit
+ * each in their cpuset, and insert them in the topology by cpuset.
+ */
+void
+hwloc_setup_pu_level(struct hwloc_topology *topology,
+		     unsigned nb_pus)
+{
+  unsigned idx;
+
+  hwloc_debug("%s", "\n\n * CPU cpusets *\n\n");
+
+  /* logical and OS numbering advance together, a single counter is enough */
+  for (idx = 0; idx < nb_pus; idx++) {
+    struct hwloc_obj *pu = hwloc_alloc_setup_object(HWLOC_OBJ_PU, idx);
+
+    pu->cpuset = hwloc_bitmap_alloc();
+    hwloc_bitmap_only(pu->cpuset, idx);
+
+    hwloc_debug_2args_bitmap("cpu %u (os %u) has cpuset %s\n",
+			     idx, idx, pu->cpuset);
+    hwloc_insert_object_by_cpuset(topology, pu);
+  }
+}
+
+#ifdef HWLOC_DEBUG
+/* Just for debugging.  */
+static void
+hwloc_debug_print_object(int indent __hwloc_attribute_unused, hwloc_obj_t obj)
+{ /* prints one debug line for obj: type, index, attributes, name, sets and arity */
+  char type[64], idx[12], attr[1024], *cpuset = NULL; /* idx: '#' + 10 digits of a 32-bit unsigned + NUL */
+  hwloc_debug("%*s", 2*indent, "");
+  hwloc_obj_type_snprintf(type, sizeof(type), obj, 1);
+  if (obj->os_index != (unsigned) -1)
+    snprintf(idx, sizeof(idx), "#%u", obj->os_index);
+  else
+    *idx = '\0'; /* (unsigned) -1 means no OS index to show */
+  hwloc_obj_attr_snprintf(attr, sizeof(attr), obj, " ", 1);
+  hwloc_debug("%s%s%s%s%s", type, idx, *attr ? "(" : "", attr, *attr ? ")" : "");
+  if (obj->name)
+    hwloc_debug(" name %s", obj->name);
+  if (obj->cpuset) { /* each set below is formatted into a temporary string that is freed right away */
+    hwloc_bitmap_asprintf(&cpuset, obj->cpuset);
+    hwloc_debug(" cpuset %s", cpuset);
+    free(cpuset);
+  }
+  if (obj->complete_cpuset) {
+    hwloc_bitmap_asprintf(&cpuset, obj->complete_cpuset);
+    hwloc_debug(" complete %s", cpuset);
+    free(cpuset);
+  }
+  if (obj->allowed_cpuset) {
+    hwloc_bitmap_asprintf(&cpuset, obj->allowed_cpuset);
+    hwloc_debug(" allowed %s", cpuset);
+    free(cpuset);
+  }
+  if (obj->nodeset) {
+    hwloc_bitmap_asprintf(&cpuset, obj->nodeset);
+    hwloc_debug(" nodeset %s", cpuset);
+    free(cpuset);
+  }
+  if (obj->complete_nodeset) {
+    hwloc_bitmap_asprintf(&cpuset, obj->complete_nodeset);
+    hwloc_debug(" completeN %s", cpuset);
+    free(cpuset);
+  }
+  if (obj->allowed_nodeset) {
+    hwloc_bitmap_asprintf(&cpuset, obj->allowed_nodeset);
+    hwloc_debug(" allowedN %s", cpuset);
+    free(cpuset);
+  }
+  if (obj->arity)
+    hwloc_debug(" arity %u", obj->arity);
+  hwloc_debug("%s", "\n");
+}
+
+static void
+hwloc_debug_print_objects(int indent __hwloc_attribute_unused, hwloc_obj_t obj)
+{ /* prints obj, then recursively its normal, I/O and Misc children one level deeper */
+  hwloc_obj_t child;
+  hwloc_debug_print_object(indent, obj);
+  for (child = obj->first_child; child; child = child->next_sibling)
+    hwloc_debug_print_objects(indent + 1, child);
+  for (child = obj->io_first_child; child; child = child->next_sibling)
+    hwloc_debug_print_objects(indent + 1, child);
+  for (child = obj->misc_first_child; child; child = child->next_sibling)
+    hwloc_debug_print_objects(indent + 1, child);
+}
+#else /* !HWLOC_DEBUG */
+#define hwloc_debug_print_object(indent, obj) do { /* nothing */ } while (0)
+#define hwloc_debug_print_objects(indent, obj) do { /* nothing */ } while (0)
+#endif /* !HWLOC_DEBUG */
+
+void hwloc__free_infos(struct hwloc_obj_info_s *infos, unsigned count)
+{
+  /* release both strings of every pair (last pair first), then the array */
+  while (count--) {
+    free(infos[count].name);
+    free(infos[count].value);
+  }
+  free(infos);
+}
+
+void hwloc__add_info(struct hwloc_obj_info_s **infosp, unsigned *countp, const char *name, const char *value)
+{ /* appends a copy of the name/value pair to the array at *infosp and increments *countp */
+  unsigned count = *countp;
+  struct hwloc_obj_info_s *infos = *infosp;
+#define OBJECT_INFO_ALLOC 8
+  /* nothing allocated initially, (re-)allocate by multiple of 8 */
+  unsigned alloccount = (count + 1 + (OBJECT_INFO_ALLOC-1)) & ~(OBJECT_INFO_ALLOC-1);
+  if (count != alloccount) /* alloccount is at least count+1, so realloc() runs on every call */
+    infos = realloc(infos, alloccount*sizeof(*infos)); /* NOTE(review): result is not checked before use */
+  infos[count].name = strdup(name);
+  infos[count].value = value ? strdup(value) : NULL; /* a NULL value is stored as NULL, not copied */
+  *infosp = infos;
+  *countp = count+1;
+}
+
+char ** hwloc__find_info_slot(struct hwloc_obj_info_s **infosp, unsigned *countp, const char *name)
+{ /* returns the address of the value pointer for name, appending a new pair when none exists */
+  unsigned i;
+  for(i=0; i<*countp; i++) {
+    if (!strcmp((*infosp)[i].name, name))
+      return &(*infosp)[i].value; /* first pair with this name wins */
+  }
+  hwloc__add_info(infosp, countp, name, NULL); /* not found: append the name with a NULL value */
+  return &(*infosp)[*countp-1].value; /* the pair just appended is the last one */
+}
+
+void hwloc__move_infos(struct hwloc_obj_info_s **dst_infosp, unsigned *dst_countp,
+		       struct hwloc_obj_info_s **src_infosp, unsigned *src_countp)
+{ /* appends all source pairs to the destination array and leaves the source empty */
+  unsigned dst_count = *dst_countp;
+  struct hwloc_obj_info_s *dst_infos = *dst_infosp;
+  unsigned src_count = *src_countp;
+  struct hwloc_obj_info_s *src_infos = *src_infosp;
+  unsigned i;
+#define OBJECT_INFO_ALLOC 8
+  /* nothing allocated initially, (re-)allocate by multiple of 8 */
+  unsigned alloccount = (dst_count + src_count + (OBJECT_INFO_ALLOC-1)) & ~(OBJECT_INFO_ALLOC-1);
+  if (dst_count != alloccount)
+    dst_infos = realloc(dst_infos, alloccount*sizeof(*dst_infos)); /* NOTE(review): result is not checked before use */
+  for(i=0; i<src_count; i++, dst_count++) {
+    dst_infos[dst_count].name = src_infos[i].name; /* string pointers are moved, not duplicated */
+    dst_infos[dst_count].value = src_infos[i].value;
+  }
+  *dst_infosp = dst_infos;
+  *dst_countp = dst_count;
+  free(src_infos); /* only the source array is freed, its strings now belong to the destination */
+  *src_infosp = NULL;
+  *src_countp = 0;
+}
+
+void hwloc_obj_add_info(hwloc_obj_t obj, const char *name, const char *value)
+{
+  hwloc__add_info(&obj->infos, &obj->infos_count, name, value); /* always appends, even if name already exists */
+}
+
+void hwloc_obj_add_info_nodup(hwloc_obj_t obj, const char *name, const char *value, int nodup)
+{
+  /* append unless the caller asked to skip names that are already present */
+  if (!nodup || !hwloc_obj_get_info_by_name(obj, name))
+    hwloc_obj_add_info(obj, name, value);
+}
+
+static int hwloc_obj_type_is_special (hwloc_obj_type_t type)
+{ /* non-zero for Misc, Bridge, PCI device and OS device types */
+  HWLOC_BUILD_ASSERT(HWLOC_OBJ_MISC + 1 == HWLOC_OBJ_BRIDGE);
+  HWLOC_BUILD_ASSERT(HWLOC_OBJ_BRIDGE + 1 == HWLOC_OBJ_PCI_DEVICE);
+  HWLOC_BUILD_ASSERT(HWLOC_OBJ_PCI_DEVICE + 1 == HWLOC_OBJ_OS_DEVICE);
+  return type >= HWLOC_OBJ_MISC && type <= HWLOC_OBJ_OS_DEVICE; /* range test relies on the contiguity asserted above */
+}
+static int hwloc_obj_type_is_io (hwloc_obj_type_t type)
+{ /* non-zero for Bridge, PCI device and OS device types */
+  HWLOC_BUILD_ASSERT(HWLOC_OBJ_BRIDGE + 1 == HWLOC_OBJ_PCI_DEVICE);
+  HWLOC_BUILD_ASSERT(HWLOC_OBJ_PCI_DEVICE + 1 == HWLOC_OBJ_OS_DEVICE);
+  return type >= HWLOC_OBJ_BRIDGE && type <= HWLOC_OBJ_OS_DEVICE; /* range test relies on the contiguity asserted above */
+}
+
+/* Traverse the children of a parent safely: pchild only advances when the
+ * current child is still linked, so the loop body may unlink/free that child.  */
+#define for_each_child_safe(child, parent, pchild) \
+  for (pchild = &(parent)->first_child, child = *pchild; \
+       child; \
+       /* Check whether the current child was not dropped.  */ \
+       (*pchild == child ? pchild = &(child->next_sibling) : NULL), \
+       /* Get pointer to next child.  */ \
+        child = *pchild)
+#define for_each_io_child_safe(child, parent, pchild) \
+  for (pchild = &(parent)->io_first_child, child = *pchild; \
+       child; \
+       /* Check whether the current child was not dropped.  */ \
+       (*pchild == child ? pchild = &(child->next_sibling) : NULL), \
+       /* Get pointer to next child.  */ \
+        child = *pchild)
+#define for_each_misc_child_safe(child, parent, pchild) \
+  for (pchild = &(parent)->misc_first_child, child = *pchild; \
+       child; \
+       /* Check whether the current child was not dropped.  */ \
+       (*pchild == child ? pchild = &(child->next_sibling) : NULL), \
+       /* Get pointer to next child.  */ \
+        child = *pchild)
+
+/* Release an already-unlinked object together with everything it owns.  */
+void
+hwloc_free_unlinked_object(hwloc_obj_t obj)
+{
+  /* no object type needs type-specific cleanup */
+  /* info pairs and distances go through their own helpers */
+  hwloc__free_infos(obj->infos, obj->infos_count);
+  hwloc_clear_object_distances(obj);
+  /* plain heap members */
+  free(obj->memory.page_types);
+  free(obj->attr);
+  free(obj->children);
+  free(obj->name);
+  /* cpu sets and node sets */
+  hwloc_bitmap_free(obj->cpuset);
+  hwloc_bitmap_free(obj->complete_cpuset);
+  hwloc_bitmap_free(obj->allowed_cpuset);
+  hwloc_bitmap_free(obj->nodeset);
+  hwloc_bitmap_free(obj->complete_nodeset);
+  hwloc_bitmap_free(obj->allowed_nodeset);
+  free(obj);
+}
+
+/* insert the (non-empty) list of siblings starting at firstnew as new children of newparent,
+ * and return the address of the next_sibling pointer of the last inserted sibling
+ */
+static hwloc_obj_t *
+insert_siblings_list(hwloc_obj_t *firstp, hwloc_obj_t firstnew, hwloc_obj_t newparent)
+{
+  hwloc_obj_t tmp;
+  assert(firstnew);
+  *firstp = tmp = firstnew; /* whatever *firstp pointed to is overwritten; callers relink it afterwards */
+  tmp->parent = newparent;
+  while (tmp->next_sibling) {
+    tmp = tmp->next_sibling;
+    tmp->parent = newparent;
+  }
+  return &tmp->next_sibling;
+}
+
+static void
+append_siblings_list(hwloc_obj_t *firstp, hwloc_obj_t firstnew, hwloc_obj_t newparent)
+{ /* links the list starting at firstnew after the last element of the list at *firstp */
+  hwloc_obj_t *tmpp, tmp;
+  /* find the end of the list */
+  for(tmpp = firstp ; *tmpp; tmpp = &((*tmpp)->next_sibling));
+  *tmpp = firstnew;
+  /* update parent pointers */
+  for(tmp = firstnew; tmp; tmp = tmp->next_sibling)
+    tmp->parent = newparent;
+}
+
+/* Remove an object from its parent and free it.
+ * Only updates next_sibling/first_child pointers,
+ * so may only be used during early discovery.
+ *
+ * Children are inserted in the parent.
+ * If children should be inserted somewhere else (e.g. when merging with a child),
+ * the caller should move them before calling this function.
+ */
+static void
+unlink_and_free_single_object(hwloc_obj_t *pparent)
+{
+  hwloc_obj_t old = *pparent; /* pparent is the link (in the parent or previous sibling) that points to the object */
+  hwloc_obj_t *lastp;
+
+  if (old->type == HWLOC_OBJ_MISC) {
+    /* Misc object */
+
+    /* no normal children */
+    assert(!old->first_child);
+
+    /* no I/O children */
+    assert(!old->io_first_child);
+
+    if (old->misc_first_child)
+      /* insert old misc object children as new siblings below parent instead of old */
+      lastp = insert_siblings_list(pparent, old->misc_first_child, old->parent);
+    else
+      lastp = pparent;
+    /* append old siblings back */
+    *lastp = old->next_sibling;
+
+  } else if (hwloc_obj_type_is_io(old->type)) {
+    /* I/O object */
+
+    /* no normal children */
+    assert(!old->first_child);
+
+    if (old->io_first_child)
+      /* insert old I/O object children as new siblings below parent instead of old */
+      lastp = insert_siblings_list(pparent, old->io_first_child, old->parent);
+    else
+      lastp = pparent;
+    /* append old siblings back */
+    *lastp = old->next_sibling;
+
+    /* append old Misc children to parent */
+    if (old->misc_first_child)
+      append_siblings_list(&old->parent->misc_first_child, old->misc_first_child, old->parent);
+
+  } else {
+    /* Normal object */
+
+    if (old->first_child)
+      /* insert old object children as new siblings below parent instead of old */
+      lastp = insert_siblings_list(pparent, old->first_child, old->parent);
+    else
+      lastp = pparent;
+    /* append old siblings back */
+    *lastp = old->next_sibling;
+
+    /* append old I/O and Misc children to parent
+     * old->parent cannot be NULL (removing root), misc children should have been moved by the caller earlier.
+     */
+    if (old->io_first_child)
+      append_siblings_list(&old->parent->io_first_child, old->io_first_child, old->parent);
+    if (old->misc_first_child)
+      append_siblings_list(&old->parent->misc_first_child, old->misc_first_child, old->parent);
+  }
+
+  hwloc_free_unlinked_object(old); /* old is no longer reachable from the tree at this point */
+}
+
+/* Remove an object and its children from its parent and free them.
+ * Only updates next_sibling/first_child pointers,
+ * so may only be used during early discovery.
+ */
+static void
+unlink_and_free_object_and_children(hwloc_obj_t *pobj)
+{
+  hwloc_obj_t obj = *pobj, child, *pchild;
+  /* each recursive call unlinks the child through pchild, which the _safe iterators tolerate */
+  for_each_child_safe(child, obj, pchild)
+    unlink_and_free_object_and_children(pchild);
+  for_each_io_child_safe(child, obj, pchild)
+    unlink_and_free_object_and_children(pchild);
+  for_each_misc_child_safe(child, obj, pchild)
+    unlink_and_free_object_and_children(pchild);
+
+  *pobj = obj->next_sibling; /* bypass obj in its sibling list before freeing it */
+  hwloc_free_unlinked_object(obj);
+}
+
+static void
+hwloc__duplicate_object(struct hwloc_obj *newobj,
+			struct hwloc_obj *src)
+{ /* copies the contents of src into the already-allocated newobj; tree links are not touched here */
+  size_t len;
+  unsigned i;
+
+  newobj->type = src->type;
+  newobj->os_index = src->os_index;
+
+  if (src->name)
+    newobj->name = strdup(src->name);
+  newobj->userdata = src->userdata; /* pointer copied as-is, the user data itself is not duplicated */
+
+  memcpy(&newobj->memory, &src->memory, sizeof(struct hwloc_obj_memory_s));
+  if (src->memory.page_types_len) { /* the memcpy above copied the page_types pointer, replace it with a private copy */
+    len = src->memory.page_types_len * sizeof(struct hwloc_obj_memory_page_type_s);
+    newobj->memory.page_types = malloc(len); /* NOTE(review): result is not checked before the memcpy */
+    memcpy(newobj->memory.page_types, src->memory.page_types, len);
+  }
+
+  memcpy(newobj->attr, src->attr, sizeof(*newobj->attr)); /* both attr pointers must be non-NULL here */
+
+  newobj->cpuset = hwloc_bitmap_dup(src->cpuset);
+  newobj->complete_cpuset = hwloc_bitmap_dup(src->complete_cpuset);
+  newobj->allowed_cpuset = hwloc_bitmap_dup(src->allowed_cpuset);
+  newobj->nodeset = hwloc_bitmap_dup(src->nodeset);
+  newobj->complete_nodeset = hwloc_bitmap_dup(src->complete_nodeset);
+  newobj->allowed_nodeset = hwloc_bitmap_dup(src->allowed_nodeset);
+
+  /* don't duplicate distances, they'll be recreated at the end of the topology build */
+
+  for(i=0; i<src->infos_count; i++)
+    hwloc__add_info(&newobj->infos, &newobj->infos_count, src->infos[i].name, src->infos[i].value);
+}
+
+void
+hwloc__duplicate_objects(struct hwloc_topology *newtopology,
+			 struct hwloc_obj *newparent,
+			 struct hwloc_obj *src)
+{ /* duplicates src and its whole subtree, then inserts the copy below newparent */
+  hwloc_obj_t newobj;
+  hwloc_obj_t child;
+
+  newobj = hwloc_alloc_setup_object(src->type, src->os_index);
+  hwloc__duplicate_object(newobj, src);
+
+  for(child = src->first_child; child; child = child->next_sibling)
+    hwloc__duplicate_objects(newtopology, newobj, child);
+  for(child = src->io_first_child; child; child = child->next_sibling)
+    hwloc__duplicate_objects(newtopology, newobj, child);
+  for(child = src->misc_first_child; child; child = child->next_sibling)
+    hwloc__duplicate_objects(newtopology, newobj, child);
+
+  /* no need to check the children order here, the source topology
+   * is supposed to be OK already, and we have debug asserts.
+   */
+  hwloc_insert_object_by_parent(newtopology, newparent, newobj); /* done last, once the copied subtree is complete */
+}
+
+int
+hwloc_topology_dup(hwloc_topology_t *newp,
+		   hwloc_topology_t old)
+{
+  hwloc_topology_t new;
+  hwloc_obj_t newroot;
+  hwloc_obj_t oldroot = hwloc_get_root_obj(old);
+  hwloc_obj_t child;
+  /* Stores a copy of the loaded topology old in *newp and returns 0; returns -1 on failure. */
+  if (!old->is_loaded) {
+    errno = EINVAL; /* errno codes are positive */
+    return -1;
+  }
+
+  hwloc_topology_init(&new); /* NOTE(review): return value is not checked */
+
+  new->flags = old->flags;
+  memcpy(new->ignored_types, old->ignored_types, sizeof(old->ignored_types));
+  new->is_thissystem = old->is_thissystem;
+  new->is_loaded = 1;
+  new->pid = old->pid;
+
+  memcpy(&new->binding_hooks, &old->binding_hooks, sizeof(old->binding_hooks));
+
+  memcpy(new->support.discovery, old->support.discovery, sizeof(*old->support.discovery));
+  memcpy(new->support.cpubind, old->support.cpubind, sizeof(*old->support.cpubind));
+  memcpy(new->support.membind, old->support.membind, sizeof(*old->support.membind));
+
+  new->userdata_export_cb = old->userdata_export_cb;
+  new->userdata_import_cb = old->userdata_import_cb;
+
+  newroot = hwloc_get_root_obj(new);
+  hwloc__duplicate_object(newroot, oldroot); /* the root already exists in new, only its contents are copied */
+
+  for(child = oldroot->first_child; child; child = child->next_sibling)
+    hwloc__duplicate_objects(new, newroot, child);
+  for(child = oldroot->io_first_child; child; child = child->next_sibling)
+    hwloc__duplicate_objects(new, newroot, child);
+  for(child = oldroot->misc_first_child; child; child = child->next_sibling)
+    hwloc__duplicate_objects(new, newroot, child);
+
+  if (old->first_osdist) {
+    struct hwloc_os_distances_s *olddist = old->first_osdist;
+    while (olddist) { /* NOTE(review): none of the malloc() results in this loop are checked */
+      struct hwloc_os_distances_s *newdist = malloc(sizeof(*newdist));
+      newdist->type = olddist->type;
+      newdist->nbobjs = olddist->nbobjs;
+      newdist->indexes = malloc(newdist->nbobjs * sizeof(*newdist->indexes));
+      memcpy(newdist->indexes, olddist->indexes, newdist->nbobjs * sizeof(*newdist->indexes));
+      newdist->objs = NULL; /* will be recomputed when needed */
+      newdist->distances = malloc(newdist->nbobjs * newdist->nbobjs * sizeof(*newdist->distances));
+      memcpy(newdist->distances, olddist->distances, newdist->nbobjs * newdist->nbobjs * sizeof(*newdist->distances));
+
+      newdist->forced = olddist->forced;
+      if (new->first_osdist) {
+	new->last_osdist->next = newdist;
+	newdist->prev = new->last_osdist;
+      } else {
+	new->first_osdist = newdist;
+	newdist->prev = NULL;
+      }
+      new->last_osdist = newdist;
+      newdist->next = NULL;
+
+      olddist = olddist->next;
+    }
+  } else
+    new->first_osdist = new->last_osdist = NULL; /* reset the copy only, the source topology is never modified */
+
+  /* no need to duplicate backends, topology is already loaded */
+  new->backends = NULL;
+
+  hwloc_connect_children(new->levels[0][0]);
+  if (hwloc_connect_levels(new) < 0)
+    goto out;
+  new->modified = 0;
+
+  hwloc_distances_finalize_os(new);
+  hwloc_distances_finalize_logical(new);
+
+#ifndef HWLOC_DEBUG
+  if (getenv("HWLOC_DEBUG_CHECK"))
+#endif
+    hwloc_topology_check(new);
+
+  *newp = new;
+  return 0;
+
+ out: /* NOTE(review): new is reset but neither freed nor returned on this path - looks like a leak, confirm */
+  hwloc_topology_clear(new);
+  hwloc_distances_destroy(new);
+  hwloc_topology_setup_defaults(new);
+  return -1;
+}
+
+/*
+ * How to compare objects based on types.
+ *
+ * Note that HIGHER/DEEPER is only a (consistent) heuristic, used to sort
+ * objects with same cpuset consistently.
+ * Only EQUAL / not EQUAL can be relied upon.
+ */
+
+enum hwloc_type_cmp_e {
+  HWLOC_TYPE_HIGHER, /* first object sorts above the second one */
+  HWLOC_TYPE_DEEPER, /* first object sorts below the second one */
+  HWLOC_TYPE_EQUAL /* same type and depth, or the types cannot be ordered */
+};
+
+/* WARNING: The indexes of this array MUST match the ordering of
+   the obj_order_type[] array, below.  Specifically, the values must
+   be laid out such that:
+
+       obj_order_type[obj_type_order[N]] = N
+
+   for all HWLOC_OBJ_* values of N.  Put differently:
+
+       obj_type_order[A] = B
+
+   where the A values are in order of the hwloc_obj_type_t enum, and
+   the B values are the corresponding indexes of obj_order_type.
+
+   We can't use C99 syntax to initialize this in a little safer manner
+   -- bummer.  :-(
+
+   *************************************************************
+   *** DO NOT CHANGE THE ORDERING OF THIS ARRAY WITHOUT TRIPLE
+   *** CHECKING ITS CORRECTNESS!
+   *************************************************************
+   */
+static const unsigned obj_type_order[] = {
+    /* first entry is HWLOC_OBJ_SYSTEM */  0,
+    /* next entry is HWLOC_OBJ_MACHINE */  1,
+    /* next entry is HWLOC_OBJ_NUMANODE */ 3,
+    /* next entry is HWLOC_OBJ_PACKAGE */  4,
+    /* next entry is HWLOC_OBJ_CACHE */    5,
+    /* next entry is HWLOC_OBJ_CORE */     6,
+    /* next entry is HWLOC_OBJ_PU */       10,
+    /* next entry is HWLOC_OBJ_GROUP */    2,
+    /* next entry is HWLOC_OBJ_MISC */     11,
+    /* next entry is HWLOC_OBJ_BRIDGE */   7,
+    /* next entry is HWLOC_OBJ_PCI_DEVICE */  8,
+    /* next entry is HWLOC_OBJ_OS_DEVICE */   9
+};
+
+static const hwloc_obj_type_t obj_order_type[] = { /* inverse of obj_type_order[]: entry N is the type whose order is N */
+  HWLOC_OBJ_SYSTEM,
+  HWLOC_OBJ_MACHINE,
+  HWLOC_OBJ_GROUP,
+  HWLOC_OBJ_NUMANODE,
+  HWLOC_OBJ_PACKAGE,
+  HWLOC_OBJ_CACHE,
+  HWLOC_OBJ_CORE,
+  HWLOC_OBJ_BRIDGE,
+  HWLOC_OBJ_PCI_DEVICE,
+  HWLOC_OBJ_OS_DEVICE,
+  HWLOC_OBJ_PU,
+  HWLOC_OBJ_MISC,
+};
+
+/* priority to be used when merging identical parent/children object
+ * (in merge_useless_child), keep the highest priority one.
+ *
+ * Always keep NUMANode/PU/PCIDev/OSDev (100)
+ * then Machine (90), then System (80)
+ * then Core (60)
+ * then Package (40)
+ * then Cache (20)
+ * then always drop Group/Misc/Bridge (0).
+ *
+ * Some type won't actually ever be involved in such merging.
+ */
+static const int obj_type_priority[] = {
+  /* first entry is HWLOC_OBJ_SYSTEM */     80,
+  /* next entry is HWLOC_OBJ_MACHINE */     90,
+  /* next entry is HWLOC_OBJ_NUMANODE */    100,
+  /* next entry is HWLOC_OBJ_PACKAGE */     40,
+  /* next entry is HWLOC_OBJ_CACHE */       20,
+  /* next entry is HWLOC_OBJ_CORE */        60,
+  /* next entry is HWLOC_OBJ_PU */          100,
+  /* next entry is HWLOC_OBJ_GROUP */       0,
+  /* next entry is HWLOC_OBJ_MISC */        0,
+  /* next entry is HWLOC_OBJ_BRIDGE */      0,
+  /* next entry is HWLOC_OBJ_PCI_DEVICE */  100,
+  /* next entry is HWLOC_OBJ_OS_DEVICE */   100
+};
+
+static unsigned __hwloc_attribute_const
+hwloc_get_type_order(hwloc_obj_type_t type)
+{
+  return obj_type_order[type]; /* plain table lookup, type is not range-checked */
+}
+
+#if !defined(NDEBUG)
+static hwloc_obj_type_t hwloc_get_order_type(int order)
+{
+  return obj_order_type[order]; /* inverse lookup of hwloc_get_type_order(); order is not range-checked */
+}
+#endif
+
+int hwloc_compare_types (hwloc_obj_type_t type1, hwloc_obj_type_t type2)
+{ /* returns HWLOC_TYPE_UNORDERED, or the difference between the two type orders */
+  unsigned order1 = hwloc_get_type_order(type1);
+  unsigned order2 = hwloc_get_type_order(type2);
+
+  /* I/O are only comparable with each others and with machine and system */
+  if (hwloc_obj_type_is_io(type1)
+      && !hwloc_obj_type_is_io(type2) && type2 != HWLOC_OBJ_SYSTEM && type2 != HWLOC_OBJ_MACHINE)
+    return HWLOC_TYPE_UNORDERED;
+  if (hwloc_obj_type_is_io(type2)
+      && !hwloc_obj_type_is_io(type1) && type1 != HWLOC_OBJ_SYSTEM && type1 != HWLOC_OBJ_MACHINE)
+    return HWLOC_TYPE_UNORDERED;
+
+  return order1 - order2; /* unsigned subtraction, converted to int by the return */
+}
+
+static enum hwloc_type_cmp_e
+hwloc_type_cmp(hwloc_obj_t obj1, hwloc_obj_t obj2)
+{ /* classifies obj1 against obj2 by type, then by cache/group/bridge depth for identical types */
+  hwloc_obj_type_t type1 = obj1->type;
+  hwloc_obj_type_t type2 = obj2->type;
+  int compare;
+
+  compare = hwloc_compare_types(type1, type2);
+  if (compare == HWLOC_TYPE_UNORDERED)
+    return HWLOC_TYPE_EQUAL; /* we cannot do better */
+  if (compare > 0)
+    return HWLOC_TYPE_DEEPER;
+  if (compare < 0)
+    return HWLOC_TYPE_HIGHER;
+
+  /* Caches have the same types but can have different depths.  */
+  if (type1 == HWLOC_OBJ_CACHE) {
+    if (obj1->attr->cache.depth < obj2->attr->cache.depth) /* a smaller cache depth value sorts deeper */
+      return HWLOC_TYPE_DEEPER;
+    else if (obj1->attr->cache.depth > obj2->attr->cache.depth)
+      return HWLOC_TYPE_HIGHER;
+    else if (obj1->attr->cache.type > obj2->attr->cache.type)
+      /* consider icache deeper than dcache and dcache deeper than unified */
+      return HWLOC_TYPE_DEEPER;
+    else if (obj1->attr->cache.type < obj2->attr->cache.type)
+      /* consider icache deeper than dcache and dcache deeper than unified */
+      return HWLOC_TYPE_HIGHER;
+  }
+
+  /* Group objects have the same types but can have different depths.  */
+  if (type1 == HWLOC_OBJ_GROUP) {
+    if (obj1->attr->group.depth == (unsigned) -1
+	|| obj2->attr->group.depth == (unsigned) -1)
+      return HWLOC_TYPE_EQUAL; /* a group whose depth is (unsigned) -1 compares equal to any other group */
+    if (obj1->attr->group.depth < obj2->attr->group.depth)
+      return HWLOC_TYPE_DEEPER;
+    else if (obj1->attr->group.depth > obj2->attr->group.depth)
+      return HWLOC_TYPE_HIGHER;
+  }
+
+  /* Bridges objects have the same types but can have different depths.  */
+  if (type1 == HWLOC_OBJ_BRIDGE) {
+    if (obj1->attr->bridge.depth < obj2->attr->bridge.depth)
+      return HWLOC_TYPE_DEEPER;
+    else if (obj1->attr->bridge.depth > obj2->attr->bridge.depth)
+      return HWLOC_TYPE_HIGHER;
+  }
+
+  return HWLOC_TYPE_EQUAL;
+}
+
+/*
+ * How to compare objects based on cpusets.
+ */
+
+/* Relation between the sets of two objects.
+ * Each value aliases the corresponding HWLOC_BITMAP_* constant, which is why
+ * the code below compares hwloc_bitmap_compare_inclusion() results directly
+ * against these enumerators.
+ */
+enum hwloc_obj_cmp_e {
+  HWLOC_OBJ_EQUAL = HWLOC_BITMAP_EQUAL,			/**< \brief Equal */
+  HWLOC_OBJ_INCLUDED = HWLOC_BITMAP_INCLUDED,		/**< \brief Strictly included into */
+  HWLOC_OBJ_CONTAINS = HWLOC_BITMAP_CONTAINS,		/**< \brief Strictly contains */
+  HWLOC_OBJ_INTERSECTS = HWLOC_BITMAP_INTERSECTS,	/**< \brief Intersects, but no inclusion! */
+  HWLOC_OBJ_DIFFERENT = HWLOC_BITMAP_DIFFERENT		/**< \brief No intersection */
+};
+
+/* Compare the cpusets, then the nodesets, of OBJ1 and OBJ2 and combine both
+ * results into a single HWLOC_OBJ_* value.
+ *
+ * For each kind of set, the complete_* sets are used when both objects have
+ * one, otherwise the main sets are used. A pair is skipped entirely when one
+ * of its sets is missing or empty.
+ *
+ * Returns HWLOC_OBJ_INTERSECTS as soon as one pair intersects without
+ * inclusion, or when cpusets and nodesets disagree on the direction of the
+ * inclusion. Returns the initial HWLOC_OBJ_DIFFERENT when no pair could be
+ * compared. Neither object may be of a "special" type (asserted).
+ */
+static int
+hwloc_obj_cmp_sets(hwloc_obj_t obj1, hwloc_obj_t obj2)
+{
+  hwloc_bitmap_t set1, set2;
+  int res = HWLOC_OBJ_DIFFERENT;
+
+  assert(!hwloc_obj_type_is_special(obj1->type));
+  assert(!hwloc_obj_type_is_special(obj2->type));
+
+  /* compare cpusets first */
+  if (obj1->complete_cpuset && obj2->complete_cpuset) {
+    set1 = obj1->complete_cpuset;
+    set2 = obj2->complete_cpuset;
+  } else {
+    set1 = obj1->cpuset;
+    set2 = obj2->cpuset;
+  }
+  if (set1 && set2 && !hwloc_bitmap_iszero(set1) && !hwloc_bitmap_iszero(set2)) {
+    res = hwloc_bitmap_compare_inclusion(set1, set2);
+    /* no need to look at nodesets, the result cannot get any better */
+    if (res == HWLOC_OBJ_INTERSECTS)
+      return HWLOC_OBJ_INTERSECTS;
+  }
+
+  /* then compare nodesets, and combine the results */
+  if (obj1->complete_nodeset && obj2->complete_nodeset) {
+    set1 = obj1->complete_nodeset;
+    set2 = obj2->complete_nodeset;
+  } else {
+    set1 = obj1->nodeset;
+    set2 = obj2->nodeset;
+  }
+  if (set1 && set2 && !hwloc_bitmap_iszero(set1) && !hwloc_bitmap_iszero(set2)) {
+    int noderes = hwloc_bitmap_compare_inclusion(set1, set2);
+    /* deal with conflicting cpusets/nodesets inclusions */
+    if (noderes == HWLOC_OBJ_INCLUDED) {
+      if (res == HWLOC_OBJ_CONTAINS)
+	/* contradicting order for cpusets and nodesets */
+	return HWLOC_OBJ_INTERSECTS;
+      /* nodeset inclusion overrides an EQUAL or DIFFERENT cpuset result */
+      res = HWLOC_OBJ_INCLUDED;
+
+    } else if (noderes == HWLOC_OBJ_CONTAINS) {
+      if (res == HWLOC_OBJ_INCLUDED)
+	/* contradicting order for cpusets and nodesets */
+	return HWLOC_OBJ_INTERSECTS;
+      /* nodeset inclusion overrides an EQUAL or DIFFERENT cpuset result */
+      res = HWLOC_OBJ_CONTAINS;
+
+    } else if (noderes == HWLOC_OBJ_INTERSECTS) {
+      return HWLOC_OBJ_INTERSECTS;
+
+    } else {
+      /* nodesets are equal or different, keep the cpuset order */
+      /* FIXME: with upcoming multiple levels of NUMA, we may have to report INCLUDED or CONTAINED here */
+
+    }
+  }
+
+  return res;
+}
+
+/* Order two objects whose sets are equal by looking at their types:
+ * a deeper type is reported as INCLUDED, a higher type as CONTAINS,
+ * anything else as EQUAL.
+ */
+static int
+hwloc_obj_cmp_types(hwloc_obj_t obj1, hwloc_obj_t obj2)
+{
+  /* Same sets, subsort by type to have a consistent ordering.  */
+  switch (hwloc_type_cmp(obj1, obj2)) {
+  case HWLOC_TYPE_DEEPER:
+    return HWLOC_OBJ_INCLUDED;
+  case HWLOC_TYPE_HIGHER:
+    return HWLOC_OBJ_CONTAINS;
+  default:
+    /* Same sets and types!  Let's hope it's coherent.  */
+    return HWLOC_OBJ_EQUAL;
+  }
+}
+
+/* Order two objects of a same horizontal level with
+ * hwloc_bitmap_compare_first().
+ *
+ * The complete_cpusets are compared when both objects have one (always
+ * correctly ordered); otherwise the main cpusets are compared (only correctly
+ * ordered during early insert, before disallowed bits are cleared).
+ */
+int
+hwloc__object_cpusets_compare_first(hwloc_obj_t obj1, hwloc_obj_t obj2)
+{
+  const int use_complete = obj1->complete_cpuset && obj2->complete_cpuset;
+  hwloc_bitmap_t left = use_complete ? obj1->complete_cpuset : obj1->cpuset;
+  hwloc_bitmap_t right = use_complete ? obj2->complete_cpuset : obj2->cpuset;
+
+  return hwloc_bitmap_compare_first(left, right);
+}
+
+/* format the obj info to print in error messages */
+static void
+hwloc__report_error_format_obj(char *buf, size_t buflen, hwloc_obj_t obj)
+{
+	char typestr[64];
+	char *cpusetstr;
+	hwloc_obj_type_snprintf(typestr, sizeof(typestr), obj, 0);
+	hwloc_bitmap_asprintf(&cpusetstr, obj->cpuset);
+	if (obj->os_index != (unsigned) -1)
+	  snprintf(buf, buflen, "%s (P#%u cpuset %s)",
+		   typestr, obj->os_index, cpusetstr);
+	else
+	  snprintf(buf, buflen, "%s (cpuset %s)",
+		   typestr, cpusetstr);
+	free(cpusetstr);
+}
+
+/*
+ * How to insert objects into the topology.
+ *
+ * Note: during detection, only the first_child and next_sibling pointers are
+ * kept up to date.  Others are computed only once topology detection is
+ * complete.
+ */
+
+#define merge_index(new, old, field, type) \
+  if ((old)->field == (type) -1) \
+    (old)->field = (new)->field;
+#define merge_sizes(new, old, field) \
+  if (!(old)->field) \
+    (old)->field = (new)->field;
+#ifdef HWLOC_DEBUG
+#define check_sizes(new, old, field) \
+  if ((new)->field) \
+    assert((old)->field == (new)->field)
+#else
+#define check_sizes(new, old, field)
+#endif
+
+/* Merge the attributes of NEW into OLD when both describe the same object.
+ *
+ * OLD takes NEW's os_index if it had none, NEW's distances (appended after
+ * its own), NEW's infos, NEW's name (replacing its own), and type-specific
+ * attributes (memory sizes and page_types for NUMA nodes, sizes for caches).
+ * Every pointer whose ownership moves to OLD, or that is freed here, is
+ * reset in NEW so that NEW can be released by the caller afterwards.
+ */
+static void
+merge_insert_equal(hwloc_obj_t new, hwloc_obj_t old)
+{
+  merge_index(new, old, os_index, unsigned);
+
+  if (new->distances_count) {
+    if (old->distances_count) {
+      /* Grow through a temporary so that OLD's array survives a failure. */
+      void *grown = realloc(old->distances,
+			    (old->distances_count + new->distances_count) * sizeof(*old->distances));
+      if (grown) {
+	old->distances = grown;
+	/* append NEW's entries after the existing ones of OLD */
+	memcpy(old->distances + old->distances_count, new->distances,
+	       new->distances_count * sizeof(*old->distances));
+	old->distances_count += new->distances_count;
+	free(new->distances);
+	new->distances_count = 0;
+	new->distances = NULL;
+      }
+      /* else: out of memory, OLD is left untouched and NEW keeps its
+       * distances; presumably they get released together with NEW.
+       */
+    } else {
+      old->distances_count = new->distances_count;
+      old->distances = new->distances;
+      new->distances_count = 0;
+      new->distances = NULL;
+    }
+  }
+
+  if (new->infos_count) {
+    hwloc__move_infos(&old->infos, &old->infos_count,
+		      &new->infos, &new->infos_count);
+  }
+
+  if (new->name) {
+    if (old->name)
+      free(old->name);
+    old->name = new->name;
+    new->name = NULL;
+  }
+
+  /* Ignore userdata. It will be NULL before load().
+   * It may be non-NULL if alloc+insert_group() after load().
+   */
+
+  switch(new->type) {
+  case HWLOC_OBJ_NUMANODE:
+    /* Do not check these, it may change between calls */
+    merge_sizes(new, old, memory.local_memory);
+    merge_sizes(new, old, memory.total_memory);
+    /* if both objects have a page_types array, just keep the biggest one for now */
+    if (new->memory.page_types_len && old->memory.page_types_len)
+      hwloc_debug("%s", "merging page_types by keeping the biggest one only\n");
+    if (new->memory.page_types_len < old->memory.page_types_len) {
+      free(new->memory.page_types);
+      /* do not leave a dangling pointer behind in NEW */
+      new->memory.page_types = NULL;
+      new->memory.page_types_len = 0;
+    } else {
+      free(old->memory.page_types);
+      old->memory.page_types_len = new->memory.page_types_len;
+      old->memory.page_types = new->memory.page_types;
+      new->memory.page_types = NULL;
+      new->memory.page_types_len = 0;
+    }
+    break;
+  case HWLOC_OBJ_CACHE:
+    merge_sizes(new, old, attr->cache.size);
+    check_sizes(new, old, attr->cache.size);
+    merge_sizes(new, old, attr->cache.linesize);
+    check_sizes(new, old, attr->cache.linesize);
+    break;
+  default:
+    break;
+  }
+}
+
+/* Try to insert OBJ in CUR, recurse if needed.
+ * Returns the object if it was inserted,
+ * the remaining object if it was merged,
+ * NULL if failed to insert.
+ */
+static struct hwloc_obj *
+hwloc___insert_object_by_cpuset(struct hwloc_topology *topology, hwloc_obj_t cur, hwloc_obj_t obj,
+			        hwloc_report_error_t report_error)
+{
+  hwloc_obj_t child, next_child = NULL;
+  /* These will always point to the pointer to their next last child. */
+  hwloc_obj_t *cur_children = &cur->first_child;
+  hwloc_obj_t *obj_children = &obj->first_child;
+  /* Pointer where OBJ should be put */
+  hwloc_obj_t *putp = NULL; /* OBJ position isn't found yet */
+
+  /* Make sure we haven't gone too deep.  */
+  if (!hwloc_bitmap_isincluded(obj->cpuset, cur->cpuset)) {
+    fprintf(stderr,"recursion has gone too deep?!\n");
+    return NULL;
+  }
+
+  /* Iteration with prefetching to be completely safe against CHILD removal.
+   * The list is already sorted by cpuset, and there's no intersection between siblings.
+   */
+  for (child = cur->first_child, child ? next_child = child->next_sibling : NULL;
+       child;
+       child = next_child, child ? next_child = child->next_sibling : NULL) {
+
+    int res = hwloc_obj_cmp_sets(obj, child);
+
+    if (res == HWLOC_OBJ_EQUAL) {
+      if (obj->type == HWLOC_OBJ_GROUP) {
+	/* Groups are ignored with keep_structure here: always-ignored Groups are handled earlier, and non-ignored Groups are not possible. */
+	assert(topology->ignored_types[HWLOC_OBJ_GROUP] == HWLOC_IGNORE_TYPE_KEEP_STRUCTURE);
+        /* Remove the Group now. The normal ignore code path wouldn't tell us whether the Group was removed or not,
+	 * while some callers need to know (at least hwloc_topology_insert_group()).
+	 *
+	 * Keep EQUAL so that the Group gets merged.
+	 */
+      } else {
+	/* otherwise compare actual types to decide of the inclusion */
+	res = hwloc_obj_cmp_types(obj, child);
+      }
+    }
+
+    switch (res) {
+      case HWLOC_OBJ_EQUAL:
+	/* Can be two objects with same type. Or one Group and anything else. */
+	if (obj->type == child->type
+	    && (obj->type == HWLOC_OBJ_PU || obj->type == HWLOC_OBJ_NUMANODE)
+	    && obj->os_index != child->os_index) {
+	  static int reported = 0;
+	  if (!reported && !hwloc_hide_errors()) {
+	    fprintf(stderr, "Cannot merge similar %s objects with different OS indexes %u and %u\n",
+		    hwloc_obj_type_string(obj->type), child->os_index, obj->os_index);
+	    reported = 1;
+	  }
+          return NULL;
+	}
+	merge_insert_equal(obj, child);
+	/* Already present, no need to insert.  */
+	return child;
+
+      case HWLOC_OBJ_INCLUDED:
+	/* OBJ is strictly contained in some child of CUR, go deeper.  */
+	return hwloc___insert_object_by_cpuset(topology, child, obj, report_error);
+
+      case HWLOC_OBJ_INTERSECTS:
+        if (report_error) {
+	  char childstr[512];
+	  char objstr[512];
+	  char msg[1024];
+	  hwloc__report_error_format_obj(objstr, sizeof(objstr), obj);
+	  hwloc__report_error_format_obj(childstr, sizeof(childstr), child);
+	  snprintf(msg, sizeof(msg), "%s intersects with %s without inclusion!", objstr, childstr);
+	  report_error(msg, __LINE__);
+	}
+	goto putback;
+
+      case HWLOC_OBJ_DIFFERENT:
+        /* OBJ should be a child of CUR before CHILD, mark its position if not found yet. */
+	if (!putp && hwloc__object_cpusets_compare_first(obj, child) < 0)
+	  /* Don't insert yet, there could be intersect errors later */
+	  putp = cur_children;
+	/* Advance cur_children.  */
+	cur_children = &child->next_sibling;
+	break;
+
+      case HWLOC_OBJ_CONTAINS:
+	/* OBJ contains CHILD, remove CHILD from CUR */
+	*cur_children = child->next_sibling;
+	child->next_sibling = NULL;
+	/* Put CHILD in OBJ */
+	*obj_children = child;
+	obj_children = &child->next_sibling;
+	child->parent = obj;
+	break;
+    }
+  }
+  /* cur/obj_children points to last CUR/OBJ child next_sibling pointer, which must be NULL. */
+  assert(!*obj_children);
+  assert(!*cur_children);
+
+  /* Put OBJ where it belongs, or in last in CUR's children.  */
+  if (!putp)
+    putp = cur_children;
+  obj->next_sibling = *putp;
+  *putp = obj;
+  obj->parent = cur;
+
+  topology->modified = 1;
+  return obj;
+
+ putback:
+  /* Put-back OBJ children in CUR and return an error. */
+  if (putp)
+    cur_children = putp; /* No need to try to insert before where OBJ was supposed to go */
+  else
+    cur_children = &cur->first_child; /* Start from the beginning */
+  /* We can insert in order, but there can be holes in the middle. */
+  while ((child = obj->first_child) != NULL) {
+    /* Remove from OBJ */
+    obj->first_child = child->next_sibling;
+    /* CHILD was re-parented to OBJ in the CONTAINS case above; give it back
+     * to CUR, since OBJ is not inserted and must not stay referenced.
+     */
+    child->parent = cur;
+    /* Find child position in CUR, and insert. */
+    while (*cur_children && hwloc__object_cpusets_compare_first(*cur_children, child) < 0)
+      cur_children = &(*cur_children)->next_sibling;
+    child->next_sibling = *cur_children;
+    *cur_children = child;
+  }
+  return NULL;
+}
+
+/* Insert OBJ below the root of TOPOLOGY according to its cpuset, reporting
+ * problems through REPORT_ERROR (which may be NULL to stay silent).
+ *
+ * Returns OBJ when it was inserted, the already-present object when OBJ was
+ * merged into it, or NULL on failure. In the last two cases OBJ has been
+ * freed and must not be used anymore.
+ */
+struct hwloc_obj *
+hwloc__insert_object_by_cpuset(struct hwloc_topology *topology, hwloc_obj_t obj,
+			       hwloc_report_error_t report_error)
+{
+  hwloc_obj_t root;
+  struct hwloc_obj *outcome;
+
+  assert(!hwloc_obj_type_is_special(obj->type));
+
+  /* Start at the top.  */
+  root = topology->levels[0][0];
+  outcome = hwloc___insert_object_by_cpuset(topology, root, obj, report_error);
+
+  if (outcome != obj) {
+    /* either failed to insert, or got merged, free the original object */
+    hwloc_free_unlinked_object(obj);
+    return outcome;
+  }
+
+  /* Really inserted: make sure the root's complete sets cover the new object */
+  hwloc_bitmap_or(root->complete_cpuset, root->complete_cpuset, obj->cpuset);
+  if (obj->nodeset)
+    hwloc_bitmap_or(root->complete_nodeset, root->complete_nodeset, obj->nodeset);
+
+  return outcome;
+}
+
+/* Default insertion routine, used by most backends: same as
+ * hwloc__insert_object_by_cpuset() with hwloc_report_os_error as the
+ * error-reporting callback.
+ */
+struct hwloc_obj *
+hwloc_insert_object_by_cpuset(struct hwloc_topology *topology, hwloc_obj_t obj)
+{
+  struct hwloc_obj *outcome;
+
+  outcome = hwloc__insert_object_by_cpuset(topology, obj, hwloc_report_os_error);
+  return outcome;
+}
+
+/* Append OBJ at the end of one of PARENT's children lists and mark the
+ * topology as modified.
+ *
+ * Misc objects go to the Misc list, I/O objects to the I/O list, anything
+ * else to the normal children list. For normal children, no cpuset ordering
+ * is enforced here:
+ * the caller takes care of inserting children in the right cpuset order,
+ * without intersection between them.
+ * Duplicating doesn't need to check the order since the source topology is
+ * supposed to be OK already.
+ * XML reorders if needed, and fails on intersecting siblings.
+ * Other callers just insert random objects such as I/O or Misc, no cpuset
+ * issue there.
+ */
+void
+hwloc_insert_object_by_parent(struct hwloc_topology *topology, hwloc_obj_t parent, hwloc_obj_t obj)
+{
+  hwloc_obj_t *tail;
+
+  /* pick the list this kind of object belongs to */
+  if (obj->type == HWLOC_OBJ_MISC)
+    tail = &parent->misc_first_child;
+  else if (hwloc_obj_type_is_io(obj->type))
+    tail = &parent->io_first_child;
+  else
+    tail = &parent->first_child;
+
+  /* walk to the terminating NULL link */
+  while (*tail)
+    tail = &(*tail)->next_sibling;
+
+  *tail = obj;
+  obj->parent = parent;
+  obj->next_sibling = NULL;
+  topology->modified = 1;
+}
+
+/* Allocate a Group object with os_index -1 and group depth -1.
+ * TOPOLOGY is unused. Returns NULL if the allocation failed.
+ */
+hwloc_obj_t
+hwloc_topology_alloc_group_object(struct hwloc_topology *topology __hwloc_attribute_unused)
+{
+  hwloc_obj_t group = hwloc_alloc_setup_object(HWLOC_OBJ_GROUP, -1);
+
+  if (group)
+    group->attr->group.depth = -1;
+  return group;
+}
+
+/* Insert the Group OBJ into an already loaded TOPOLOGY.
+ *
+ * OBJ is freed and NULL is returned with errno set to EINVAL when the
+ * topology is not loaded, when Groups are always ignored, or when all of
+ * OBJ's cpusets and nodesets are missing or empty.
+ * Otherwise returns NULL if the insertion failed, the existing object if OBJ
+ * was merged into it, or OBJ itself once inserted and reconnected.
+ */
+hwloc_obj_t
+hwloc_topology_insert_group_object(struct hwloc_topology *topology, hwloc_obj_t obj)
+{
+  hwloc_obj_t res;
+
+  /* Rejection cases, tested in this order:
+   * - not loaded (this could actually work, we would just need to disable
+   *   connect_children/levels below),
+   * - Groups always ignored,
+   * - no usable set at all in OBJ.
+   */
+  if (!topology->is_loaded
+      || topology->ignored_types[HWLOC_OBJ_GROUP] == HWLOC_IGNORE_TYPE_ALWAYS
+      || ((!obj->cpuset || hwloc_bitmap_iszero(obj->cpuset))
+	  && (!obj->complete_cpuset || hwloc_bitmap_iszero(obj->complete_cpuset))
+	  && (!obj->nodeset || hwloc_bitmap_iszero(obj->nodeset))
+	  && (!obj->complete_nodeset || hwloc_bitmap_iszero(obj->complete_nodeset)))) {
+    hwloc_free_unlinked_object(obj);
+    errno = EINVAL;
+    return NULL;
+  }
+
+  res = hwloc__insert_object_by_cpuset(topology, obj, NULL /* do not show errors on stdout */);
+  if (res != obj)
+    /* either NULL (failed) or the object OBJ got merged into */
+    return res;
+
+  /* properly inserted */
+  hwloc_obj_add_children_sets(obj);
+  hwloc_connect_children(topology->levels[0][0]);
+  if (hwloc_connect_levels(topology) < 0)
+    return NULL;
+  topology->modified = 0;
+  return obj;
+}
+
+static void hwloc_connect_misc_level(hwloc_topology_t topology);
+
+/* Create a Misc object named NAME (may be NULL) and append it to the Misc
+ * children of PARENT in an already loaded TOPOLOGY.
+ *
+ * Returns the new object, or NULL when Misc objects are always ignored or
+ * the topology is not loaded (errno set to EINVAL), or when the object
+ * could not be allocated.
+ */
+hwloc_obj_t
+hwloc_topology_insert_misc_object(struct hwloc_topology *topology, hwloc_obj_t parent, const char *name)
+{
+  hwloc_obj_t obj;
+
+  if (topology->ignored_types[HWLOC_OBJ_MISC] == HWLOC_IGNORE_TYPE_ALWAYS) {
+    errno = EINVAL;
+    return NULL;
+  }
+
+  if (!topology->is_loaded) {
+    errno = EINVAL;
+    return NULL;
+  }
+
+  obj = hwloc_alloc_setup_object(HWLOC_OBJ_MISC, -1);
+  if (!obj)
+    /* allocation failed, same handling as hwloc_topology_alloc_group_object() */
+    return NULL;
+  if (name)
+    obj->name = strdup(name);
+
+  hwloc_insert_object_by_parent(topology, parent, obj);
+
+  hwloc_connect_children(parent); /* FIXME: only connect misc children */
+  hwloc_connect_misc_level(topology);
+  topology->modified = 0;
+
+  return obj;
+}
+
+/* qsort() comparator for page_type entries: ascending size, except that
+ * 0-size entries are considered larger than anything so that they end up
+ * at the end of the array.
+ *
+ * The ordering is consistent in both directions (cmp(a,b) == -cmp(b,a)),
+ * as qsort() requires: two 0-size entries are equal, and a 0-size entry
+ * sorts after a non-zero one whichever argument it is passed as.
+ */
+static int hwloc_memory_page_type_compare(const void *_a, const void *_b)
+{
+  const struct hwloc_obj_memory_page_type_s *a = _a;
+  const struct hwloc_obj_memory_page_type_s *b = _b;
+  /* don't cast a-b in int since those are ullongs */
+  if (a->size == b->size)
+    return 0;
+  /* consider 0 as larger so that 0-size page_type go to the end */
+  if (!a->size)
+    return 1;
+  if (!b->size)
+    return -1;
+  return a->size < b->size ? -1 : 1;
+}
+
+/* Propagate memory counts: recompute OBJ's total_memory as its local_memory
+ * plus the total_memory of all its normal children (recursively), then sort
+ * OBJ's page_types array and drop its trailing 0-size entries.
+ */
+static void
+propagate_total_memory(hwloc_obj_t obj)
+{
+  hwloc_obj_t *temp, child;
+  unsigned i;
+
+  /* reset total before counting local and children memory */
+  obj->memory.total_memory = 0;
+
+  /* Propagate memory up. */
+  for_each_child_safe(child, obj, temp) {
+    propagate_total_memory(child);
+    obj->memory.total_memory += child->memory.total_memory;
+  }
+  /* No memory under I/O or Misc */
+
+  obj->memory.total_memory += obj->memory.local_memory;
+
+  /* By the way, sort the page_type array.
+   * Cannot do it on insert since some backends (e.g. XML) add page_types after inserting the object.
+   * Skip empty arrays: qsort() must be given a valid pointer even for 0 elements,
+   * and page_types may not be allocated in that case.
+   */
+  if (obj->memory.page_types_len)
+    qsort(obj->memory.page_types, obj->memory.page_types_len, sizeof(*obj->memory.page_types), hwloc_memory_page_type_compare);
+  /* Ignore 0-size page_types, they are at the end */
+  for(i=obj->memory.page_types_len; i>=1; i--)
+    if (obj->memory.page_types[i-1].size)
+      break;
+  obj->memory.page_types_len = i;
+}
+
+/* Collect the cpuset of all the PU objects below OBJ into SYS->cpuset.
+ *
+ * SYS is NULL until the first object that has a cpuset is met on the way
+ * down; that object becomes SYS, its cpuset is cleared, and the cpusets of
+ * the PUs found below it are OR'ed into it.
+ */
+static void
+collect_proc_cpuset(hwloc_obj_t obj, hwloc_obj_t sys)
+{
+  hwloc_obj_t kid, *iter;
+
+  if (!sys) {
+    if (obj->cpuset) {
+      /* This object is the root of a machine */
+      sys = obj;
+      /* Assume no PU for now */
+      hwloc_bitmap_zero(obj->cpuset);
+    }
+  } else if (obj->type == HWLOC_OBJ_PU) {
+    /* We were given a system object, account this PU in it */
+    hwloc_bitmap_or(sys->cpuset, sys->cpuset, obj->cpuset);
+  }
+
+  /* No PU under I/O or Misc, only walk normal children */
+  for_each_child_safe(kid, obj, iter)
+    collect_proc_cpuset(kid, sys);
+}
+
+/* While traversing down and up, propagate the disallowed cpus by
+ * and'ing them to and from the first object that has a cpuset.
+ *
+ * SYS is NULL until the first object with a cpuset is met; that object
+ * becomes SYS and gets complete_cpuset/allowed_cpuset created if missing.
+ * Every object with a cpuset below it then has its cpuset, complete_cpuset
+ * and allowed_cpuset restricted by those of SYS (or created from them), and
+ * SYS->allowed_cpuset loses the cpus that such an object covers but does
+ * not allow. Objects without a cpuset are only traversed.
+ */
+static void
+propagate_unused_cpuset(hwloc_obj_t obj, hwloc_obj_t sys)
+{
+  hwloc_obj_t child, *temp;
+
+  if (obj->cpuset) {
+    if (sys) {
+      /* We are already given a pointer to an system object, update it and update ourselves */
+      hwloc_bitmap_t mask = hwloc_bitmap_alloc();
+
+      /* Apply the topology cpuset */
+      hwloc_bitmap_and(obj->cpuset, obj->cpuset, sys->cpuset);
+
+      /* Update complete cpuset down */
+      if (obj->complete_cpuset) {
+	hwloc_bitmap_and(obj->complete_cpuset, obj->complete_cpuset, sys->complete_cpuset);
+      } else {
+	/* no complete_cpuset yet: SYS's one restricted to our own cpuset */
+	obj->complete_cpuset = hwloc_bitmap_dup(sys->complete_cpuset);
+	hwloc_bitmap_and(obj->complete_cpuset, obj->complete_cpuset, obj->cpuset);
+      }
+
+      /* Update allowed cpusets */
+      if (obj->allowed_cpuset) {
+	/* Update ours */
+	hwloc_bitmap_and(obj->allowed_cpuset, obj->allowed_cpuset, sys->allowed_cpuset);
+
+	/* Update the given cpuset, but only what we know */
+	/* mask = ~cpuset | allowed_cpuset: only clears cpus that are ours and not allowed */
+	hwloc_bitmap_copy(mask, obj->cpuset);
+	hwloc_bitmap_not(mask, mask);
+	hwloc_bitmap_or(mask, mask, obj->allowed_cpuset);
+	hwloc_bitmap_and(sys->allowed_cpuset, sys->allowed_cpuset, mask);
+      } else {
+	/* Just take it as such */
+	obj->allowed_cpuset = hwloc_bitmap_dup(sys->allowed_cpuset);
+	hwloc_bitmap_and(obj->allowed_cpuset, obj->allowed_cpuset, obj->cpuset);
+      }
+
+      hwloc_bitmap_free(mask);
+    } else {
+      /* This object is the root of a machine */
+      sys = obj;
+      /* Apply complete_cpuset to cpuset and allowed_cpuset, it
+       * will automatically be applied below */
+      if (obj->complete_cpuset)
+        hwloc_bitmap_and(obj->cpuset, obj->cpuset, obj->complete_cpuset);
+      else
+        obj->complete_cpuset = hwloc_bitmap_dup(obj->cpuset);
+      if (obj->allowed_cpuset)
+        hwloc_bitmap_and(obj->allowed_cpuset, obj->allowed_cpuset, obj->complete_cpuset);
+      else
+        obj->allowed_cpuset = hwloc_bitmap_dup(obj->complete_cpuset);
+    }
+  }
+
+  for_each_child_safe(child, obj, temp)
+    propagate_unused_cpuset(child, sys);
+  /* No PU under I/O or Misc */
+}
+
+/* OR each set of SRC (cpuset, complete_cpuset, allowed_cpuset, nodeset,
+ * complete_nodeset, allowed_nodeset) into the matching set of DST.
+ * A set missing in SRC is skipped; a set missing in DST is allocated first.
+ * Always returns 0.
+ */
+HWLOC_DECLSPEC int
+hwloc_obj_add_other_obj_sets(hwloc_obj_t dst, hwloc_obj_t src)
+{
+/* NOTE(review): this helper macro is not #undef'ed after the function */
+#define ADD_OTHER_OBJ_SET(_dst, _src, _set)			\
+  if ((_src)->_set) {						\
+    if (!(_dst)->_set)						\
+      (_dst)->_set = hwloc_bitmap_alloc();			\
+    hwloc_bitmap_or((_dst)->_set, (_dst)->_set, (_src)->_set);	\
+  }
+  ADD_OTHER_OBJ_SET(dst, src, cpuset);
+  ADD_OTHER_OBJ_SET(dst, src, complete_cpuset);
+  ADD_OTHER_OBJ_SET(dst, src, allowed_cpuset);
+  ADD_OTHER_OBJ_SET(dst, src, nodeset);
+  ADD_OTHER_OBJ_SET(dst, src, complete_nodeset);
+  ADD_OTHER_OBJ_SET(dst, src, allowed_nodeset);
+  return 0;
+}
+
+/* Setup OBJ's cpusets/nodesets by OR'ing those of its normal children.
+ * OBJ and every child must already have a cpuset (asserted).
+ * Always returns 0.
+ */
+HWLOC_DECLSPEC int
+hwloc_obj_add_children_sets(hwloc_obj_t obj)
+{
+  hwloc_obj_t kid;
+
+  assert(obj->cpuset != NULL);
+
+  /* No need to look at Misc children, they contain no PU. */
+  for (kid = obj->first_child; kid; kid = kid->next_sibling) {
+    assert(kid->cpuset != NULL);
+    hwloc_obj_add_other_obj_sets(obj, kid);
+  }
+
+  return 0;
+}
+
+/* Propagate nodesets up and down.
+ *
+ * SYS is NULL until the first object that has a nodeset is met; that object
+ * becomes SYS and gets complete_nodeset/allowed_nodeset duplicated if
+ * missing. Below SYS, every object gets a nodeset: a singleton nodeset is
+ * pushed down to the children (overriding a different child nodeset), and
+ * the children nodesets are OR'ed back into their parent.
+ */
+static void
+propagate_nodeset(hwloc_obj_t obj, hwloc_obj_t sys)
+{
+  hwloc_obj_t child, *temp;
+  hwloc_bitmap_t parent_nodeset = NULL;
+  int parent_weight = 0;
+
+  if (!sys && obj->nodeset) {
+    sys = obj;
+    if (!obj->complete_nodeset)
+      obj->complete_nodeset = hwloc_bitmap_dup(obj->nodeset);
+    if (!obj->allowed_nodeset)
+      obj->allowed_nodeset = hwloc_bitmap_dup(obj->complete_nodeset);
+  }
+
+  if (sys) {
+    if (obj->nodeset) {
+      /* Some existing nodeset coming from above, to possibly propagate down */
+      parent_nodeset = obj->nodeset;
+      parent_weight = hwloc_bitmap_weight(parent_nodeset);
+    } else
+      /* start empty, children nodesets are OR'ed into it below */
+      obj->nodeset = hwloc_bitmap_alloc();
+  }
+
+  for_each_child_safe(child, obj, temp) {
+    /* Propagate singleton nodesets down */
+    /* parent_weight stays 0 unless OBJ had a nodeset before this call */
+    if (parent_weight == 1) {
+      if (!child->nodeset)
+        child->nodeset = hwloc_bitmap_dup(obj->nodeset);
+      else if (!hwloc_bitmap_isequal(child->nodeset, parent_nodeset)) {
+        hwloc_debug_bitmap("Oops, parent nodeset %s", parent_nodeset);
+        hwloc_debug_bitmap(" is different from child nodeset %s, ignoring the child one\n", child->nodeset);
+        hwloc_bitmap_copy(child->nodeset, parent_nodeset);
+      }
+    }
+
+    /* Recurse */
+    propagate_nodeset(child, sys);
+
+    /* Propagate children nodesets up */
+    if (sys && child->nodeset)
+      hwloc_bitmap_or(obj->nodeset, obj->nodeset, child->nodeset);
+  }
+  /* No nodeset under I/O or Misc */
+}
+
+/* Propagate allowed and complete nodesets.
+ *
+ * For each normal child of an OBJ that has a nodeset: the child's
+ * complete_nodeset and allowed_nodeset are restricted by OBJ's ones (or
+ * created from them when the child has a nodeset); after recursing, the
+ * nodes that the child covers but does not allow are removed from OBJ's
+ * allowed_nodeset. Finally OBJ's own nodeset and allowed_nodeset are
+ * restricted to its complete_nodeset (created if missing).
+ */
+static void
+propagate_nodesets(hwloc_obj_t obj)
+{
+  /* scratch bitmap, reused for every child of this call */
+  hwloc_bitmap_t mask = hwloc_bitmap_alloc();
+  hwloc_obj_t child, *temp;
+
+  for_each_child_safe(child, obj, temp) {
+    if (obj->nodeset) {
+      /* Update complete nodesets down */
+      if (child->complete_nodeset) {
+        hwloc_bitmap_and(child->complete_nodeset, child->complete_nodeset, obj->complete_nodeset);
+      } else if (child->nodeset) {
+        child->complete_nodeset = hwloc_bitmap_dup(obj->complete_nodeset);
+        hwloc_bitmap_and(child->complete_nodeset, child->complete_nodeset, child->nodeset);
+      } /* else the child doesn't have nodeset information, we can not provide a complete nodeset */
+
+      /* Update allowed nodesets down */
+      if (child->allowed_nodeset) {
+        hwloc_bitmap_and(child->allowed_nodeset, child->allowed_nodeset, obj->allowed_nodeset);
+      } else if (child->nodeset) {
+        child->allowed_nodeset = hwloc_bitmap_dup(obj->allowed_nodeset);
+        hwloc_bitmap_and(child->allowed_nodeset, child->allowed_nodeset, child->nodeset);
+      }
+    }
+
+    propagate_nodesets(child);
+
+    if (obj->nodeset) {
+      /* Update allowed nodesets up */
+      if (child->nodeset && child->allowed_nodeset) {
+        /* mask = nodes of the child that are not allowed in the child */
+        hwloc_bitmap_copy(mask, child->nodeset);
+        hwloc_bitmap_andnot(mask, mask, child->allowed_nodeset);
+        hwloc_bitmap_andnot(obj->allowed_nodeset, obj->allowed_nodeset, mask);
+      }
+    }
+  }
+  hwloc_bitmap_free(mask);
+  /* No nodeset under I/O or Misc */
+
+  if (obj->nodeset) {
+    /* Apply complete nodeset to nodeset and allowed_nodeset */
+    if (obj->complete_nodeset)
+      hwloc_bitmap_and(obj->nodeset, obj->nodeset, obj->complete_nodeset);
+    else
+      obj->complete_nodeset = hwloc_bitmap_dup(obj->nodeset);
+    if (obj->allowed_nodeset)
+      hwloc_bitmap_and(obj->allowed_nodeset, obj->allowed_nodeset, obj->complete_nodeset);
+    else
+      obj->allowed_nodeset = hwloc_bitmap_dup(obj->complete_nodeset);
+  }
+}
+
+/* Restrict the cpuset and nodeset of OBJ and of all its normal descendants
+ * to their allowed_cpuset/allowed_nodeset, and clear the memory counters of
+ * NUMA nodes whose os_index is not in their allowed_nodeset.
+ */
+static void
+remove_unused_sets(hwloc_obj_t obj)
+{
+  hwloc_obj_t kid, *iter;
+  int disallowed_node;
+
+  if (obj->cpuset)
+    hwloc_bitmap_and(obj->cpuset, obj->cpuset, obj->allowed_cpuset);
+  if (obj->nodeset)
+    hwloc_bitmap_and(obj->nodeset, obj->nodeset, obj->allowed_nodeset);
+
+  disallowed_node = obj->type == HWLOC_OBJ_NUMANODE
+    && obj->os_index != (unsigned) -1
+    && !hwloc_bitmap_isset(obj->allowed_nodeset, obj->os_index);
+  if (disallowed_node) {
+    unsigned idx = 0;
+
+    hwloc_debug("Dropping memory from disallowed node %u\n", obj->os_index);
+    obj->memory.local_memory = 0;
+    obj->memory.total_memory = 0;
+    /* keep the page_types entries but report no page at all */
+    while (idx < obj->memory.page_types_len) {
+      obj->memory.page_types[idx].count = 0;
+      idx++;
+    }
+  }
+
+  /* No cpuset under I/O or Misc, only walk normal children */
+  for_each_child_safe(kid, obj, iter)
+    remove_unused_sets(kid);
+}
+
+/* Re-sort the normal children of PARENT with
+ * hwloc__object_cpusets_compare_first(), using an insertion sort on the
+ * sibling list. Misc children are left alone, no ordering is enforced there.
+ */
+void
+hwloc__reorder_children(hwloc_obj_t parent)
+{
+  /* detach the whole list, then re-insert its elements one by one */
+  hwloc_obj_t pending = parent->first_child;
+
+  parent->first_child = NULL;
+  while (pending) {
+    hwloc_obj_t moving = pending;
+    hwloc_obj_t *slot;
+
+    pending = moving->next_sibling;
+
+    /* skip the already sorted children that must stay before MOVING */
+    for (slot = &parent->first_child;
+	 *slot && hwloc__object_cpusets_compare_first(moving, *slot) > 0;
+	 slot = &(*slot)->next_sibling)
+      ;
+
+    moving->next_sibling = *slot;
+    *slot = moving;
+  }
+}
+
+/* Remove objects that are ignored in any case.
+ * Returns 1 if *pparent were replaced, which means the caller need to reorder its children.
+ * Returns 0 otherwise.
+ *
+ * An object is dropped when its type is marked HWLOC_IGNORE_TYPE_ALWAYS
+ * (the root object is never dropped this way), or when it is an instruction
+ * cache and HWLOC_TOPOLOGY_FLAG_ICACHES is not set in the topology flags.
+ */
+static int
+ignore_type_always(hwloc_topology_t topology, hwloc_obj_t *pparent)
+{
+  hwloc_obj_t parent = *pparent, child, *pchild;
+  int dropped_children = 0;
+  int dropped = 0;
+
+  /* children are processed first, before deciding about PARENT itself */
+  /* account dropped normal children only, others don't required reordering */
+  for_each_child_safe(child, parent, pchild)
+    dropped_children += ignore_type_always(topology, pchild);
+  for_each_io_child_safe(child, parent, pchild) /* There can be Misc under I/O */
+    ignore_type_always(topology, pchild);
+  for_each_misc_child_safe(child, parent, pchild)
+    ignore_type_always(topology, pchild);
+
+  if ((parent != topology->levels[0][0] &&
+       topology->ignored_types[parent->type] == HWLOC_IGNORE_TYPE_ALWAYS)
+      || (parent->type == HWLOC_OBJ_CACHE && parent->attr->cache.type == HWLOC_OBJ_CACHE_INSTRUCTION
+	  && !(topology->flags & HWLOC_TOPOLOGY_FLAG_ICACHES))) {
+    hwloc_debug("%s", "\nDropping ignored object ");
+    hwloc_debug_print_object(0, parent);
+    unlink_and_free_single_object(pparent);
+    topology->modified = 1;
+    dropped = 1;
+
+  } else if (dropped_children) {
+    /* we keep this object but its children changed, reorder them by complete_cpuset */
+    hwloc__reorder_children(parent);
+  }
+
+  return dropped;
+}
+
+/* Remove all objects whose cpuset is empty, bottom-up.
+ * NUMA nodes are kept since we want to keep memory information, and so are
+ * objects that still have normal children or I/O children attached.
+ */
+static void
+remove_empty(hwloc_topology_t topology, hwloc_obj_t *pobj)
+{
+  hwloc_obj_t obj = *pobj, kid, *pkid;
+
+  /* No cpuset under I/O or Misc, only walk normal children */
+  for_each_child_safe(kid, obj, pkid)
+    remove_empty(topology, pkid);
+
+  if (obj->type == HWLOC_OBJ_NUMANODE)
+    return;
+  /* only remove if all children were removed above, so that we don't
+   * remove parents of NUMAnode, and if no I/O is attached there
+   */
+  if (obj->first_child || obj->io_first_child)
+    return;
+  if (!hwloc_bitmap_iszero(obj->cpuset))
+    return;
+
+  /* Remove empty children (even if it has Misc children) */
+  hwloc_debug("%s", "\nRemoving empty object ");
+  hwloc_debug_print_object(0, obj);
+  unlink_and_free_single_object(pobj);
+  topology->modified = 1;
+}
+
+/* Remove objects that are ignored with keep structure flag.
+ * Returns 1 if *pparent were replaced, which means the caller need to reorder its children.
+ * Returns 0 otherwise.
+ *
+ * Only a parent with exactly one normal child is considered: whichever of
+ * the two has a type marked HWLOC_IGNORE_TYPE_KEEP_STRUCTURE is dropped, and
+ * obj_type_priority[] decides when both are.
+ */
+static int
+ignore_type_keep_structure(hwloc_topology_t topology, hwloc_obj_t *pparent)
+{
+  hwloc_obj_t parent = *pparent, child, *pchild;
+  int replacechild = 0, replaceparent = 0, droppedchildren = 0;
+
+  if (!parent->first_child) /* can't use arity yet */
+    /* There are no children, nothing to merge. */
+    return 0;
+
+  /* account dropped normal children only, others don't required reordering */
+  for_each_child_safe(child, parent, pchild)
+    droppedchildren += ignore_type_keep_structure(topology, pchild);
+  for_each_io_child_safe(child, parent, pchild)
+    ignore_type_keep_structure(topology, pchild);
+  for_each_misc_child_safe(child, parent, pchild)
+    ignore_type_keep_structure(topology, pchild);
+
+  if (droppedchildren)
+    hwloc__reorder_children(parent);
+
+  child = parent->first_child;
+  /* we don't merge if there are multiple "important" children. */
+  if (child->next_sibling) /* can't use arity yet */
+    return 0;
+
+  /* Check whether parent and/or child can be replaced */
+  if (topology->ignored_types[parent->type] == HWLOC_IGNORE_TYPE_KEEP_STRUCTURE) {
+    /* Parent can be ignored in favor of the child.  */
+    replaceparent = 1;
+  }
+  if (topology->ignored_types[child->type] == HWLOC_IGNORE_TYPE_KEEP_STRUCTURE) {
+    /* Child can be ignored in favor of the parent.  */
+    replacechild = 1;
+  }
+
+  /* Decide which one to actually replace */
+  if (replaceparent && replacechild) {
+    /* If both may be replaced, look at obj_type_priority */
+    /* the parent is only kept on a strictly higher priority, a tie keeps the child */
+    if (obj_type_priority[parent->type] > obj_type_priority[child->type])
+      replaceparent = 0;
+    else
+      replacechild = 0;
+  }
+
+  if (replaceparent) {
+    /* Replace parent with child */
+    hwloc_debug("%s", "\nIgnoring parent ");
+    hwloc_debug_print_object(0, parent);
+    /* move children to child, so that unlink_and_free_single_object() doesn't move them to the grandparent */
+    if (parent->io_first_child) {
+      append_siblings_list(&child->io_first_child, parent->io_first_child, child);
+      parent->io_first_child = NULL;
+    }
+    if (parent->misc_first_child) {
+      append_siblings_list(&child->misc_first_child, parent->misc_first_child, child);
+      parent->misc_first_child = NULL;
+    }
+    unlink_and_free_single_object(pparent);
+    topology->modified = 1;
+
+  } else if (replacechild) {
+    /* Replace child with parent */
+    hwloc_debug("%s", "\nIgnoring child ");
+    hwloc_debug_print_object(0, child);
+    unlink_and_free_single_object(&parent->first_child);
+    topology->modified = 1;
+  }
+
+  return replaceparent ? 1 : 0;
+}
+
+/* Recursively unlink and free every I/O child found below root.
+ * topology->modified is raised each time something gets removed.
+ */
+static void
+hwloc_drop_all_io(hwloc_topology_t topology, hwloc_obj_t root)
+{
+  hwloc_obj_t child, *pchild;
+
+  /* visit the normal subtrees first */
+  for_each_child_safe(child, root, pchild)
+    hwloc_drop_all_io(topology, child);
+
+  /* then drop whatever I/O hangs directly off this object */
+  for_each_io_child_safe(child, root, pchild) {
+    topology->modified = 1;
+    unlink_and_free_object_and_children(pchild);
+  }
+
+  /* Misc children are not visited: no I/O under Misc */
+}
+
+/*
+ * Filter the I/O objects below root according to the topology flags.
+ *
+ * Unless WHOLE_IO is set, PCI devices whose class is not one of
+ * display, network, storage, processor, InfiniBand or processing
+ * accelerator are dropped with their children, and bridges that end up
+ * without any I/O child are dropped.
+ * Unless IO_BRIDGES or WHOLE_IO is set, bridges that still have I/O
+ * children are dropped as well, except hostbridges.
+ *
+ * topology->modified is raised each time something gets removed.
+ */
+static void
+hwloc_drop_useless_io(hwloc_topology_t topology, hwloc_obj_t root)
+{
+  const int whole_io = (topology->flags & HWLOC_TOPOLOGY_FLAG_WHOLE_IO) != 0;
+  const int all_bridges = (topology->flags & (HWLOC_TOPOLOGY_FLAG_IO_BRIDGES|HWLOC_TOPOLOGY_FLAG_WHOLE_IO)) != 0;
+  hwloc_obj_t child, *pchild;
+
+  /* normal children: nothing to filter here, just go deeper */
+  for_each_child_safe(child, root, pchild)
+    hwloc_drop_useless_io(topology, child);
+
+  /* I/O children: filter, then go deeper */
+  for_each_io_child_safe(child, root, pchild) {
+    int drop_bridge = 0;
+
+    if (!whole_io && child->type == HWLOC_OBJ_PCI_DEVICE) {
+      unsigned classid = child->attr->pcidev.class_id;
+      unsigned baseclass = classid >> 8;
+      int interesting;
+
+      switch (baseclass) {
+      case 0x01: /* PCI_BASE_CLASS_STORAGE */
+      case 0x02: /* PCI_BASE_CLASS_NETWORK */
+      case 0x03: /* PCI_BASE_CLASS_DISPLAY */
+      case 0x0b: /* PCI_BASE_CLASS_PROCESSOR */
+      case 0x12: /* Processing Accelerators */
+        interesting = 1;
+        break;
+      default:
+        interesting = (classid == 0x0c06 /* PCI_CLASS_SERIAL_INFINIBAND */);
+        break;
+      }
+
+      if (!interesting) {
+        /* the whole subtree goes away, nothing left to recurse into */
+        unlink_and_free_object_and_children(pchild);
+        topology->modified = 1;
+        continue;
+      }
+    }
+
+    /* filter grand-children etc. before looking at bridges,
+     * so that bridges emptied by the recursion are noticed below
+     */
+    hwloc_drop_useless_io(topology, child);
+
+    if (child->type == HWLOC_OBJ_BRIDGE) {
+      if (!child->io_first_child)
+        /* bridges with no children are removed if WHOLE_IO isn't given */
+        drop_bridge = !whole_io;
+      else if (child->attr->bridge.upstream_type != HWLOC_OBJ_BRIDGE_HOST)
+        /* only hostbridges are kept if neither WHOLE_IO nor IO_BRIDGES is given */
+        drop_bridge = !all_bridges;
+    }
+
+    if (drop_bridge) {
+      unlink_and_free_single_object(pchild);
+      topology->modified = 1;
+    }
+  }
+
+  /* Misc children are not visited: no I/O under Misc */
+}
+
+/* Fill attr->bridge.depth for every bridge below root.
+ * depth is the number of bridges crossed so far on the way down;
+ * it restarts from 0 below any object that is not an I/O object.
+ */
+static void
+hwloc_propagate_bridge_depth(hwloc_topology_t topology, hwloc_obj_t root, unsigned depth)
+{
+  hwloc_obj_t cur;
+
+  /* normal children */
+  cur = root->first_child;
+  while (cur) {
+    /* no normal children under I/O */
+    assert(!depth);
+    hwloc_propagate_bridge_depth(topology, cur, 0);
+    cur = cur->next_sibling;
+  }
+
+  /* I/O children */
+  for (cur = root->io_first_child; cur; cur = cur->next_sibling) {
+    if (cur->type == HWLOC_OBJ_BRIDGE) {
+      cur->attr->bridge.depth = depth;
+      hwloc_propagate_bridge_depth(topology, cur, depth+1);
+      continue;
+    }
+    if (!hwloc_obj_type_is_io(cur->type))
+      hwloc_propagate_bridge_depth(topology, cur, 0);
+  }
+
+  /* Misc children are not visited: no I/O under Misc */
+}
+
+/* Compute the symmetric_subtree field of root and of every normal object
+ * below it. Only normal children are considered, I/O and Misc are ignored.
+ *
+ * An object without normal children is symmetric. Otherwise it is symmetric
+ * when all its children are symmetric and when walking down the first-child
+ * chain of each child finds the same depth and arity at every step.
+ *
+ * If the temporary array cannot be allocated, root is conservatively left
+ * marked as not symmetric.
+ */
+static void
+hwloc_propagate_symmetric_subtree(hwloc_topology_t topology, hwloc_obj_t root)
+{
+  hwloc_obj_t child, *array;
+  int ok;
+
+  /* assume we're not symmetric by default */
+  root->symmetric_subtree = 0;
+
+  /* if no child, we are symmetric */
+  if (!root->arity) {
+    root->symmetric_subtree = 1;
+    return;
+  }
+
+  /* look at normal children only, I/O and Misc are ignored.
+   * always recurse into every child (so that each subtree gets its own
+   * value), and give up afterwards if any child is not symmetric.
+   */
+  ok = 1;
+  for(child = root->first_child; child; child = child->next_sibling) {
+    hwloc_propagate_symmetric_subtree(topology, child);
+    if (!child->symmetric_subtree)
+      ok = 0;
+  }
+  if (!ok)
+    return;
+  /* Misc and I/O children do not care about symmetric_subtree */
+
+  /* now check that children subtrees are identical.
+   * just walk down the first child in each tree and compare their depth and arities
+   */
+  array = malloc(root->arity * sizeof(*array));
+  if (!array)
+    /* cannot compare the subtrees, keep the "not symmetric" default */
+    return;
+  memcpy(array, root->children, root->arity * sizeof(*array));
+  while (1) {
+    unsigned i;
+    /* check current level arities and depth */
+    for(i=1; i<root->arity; i++) {
+      if (array[i]->depth != array[0]->depth
+          || array[i]->arity != array[0]->arity) {
+        free(array);
+        return;
+      }
+    }
+    if (!array[0]->arity)
+      /* no more children level, we're ok */
+      break;
+    /* look at first child of each element now */
+    for(i=0; i<root->arity; i++)
+      array[i] = array[i]->first_child;
+  }
+  free(array);
+
+  /* everything went fine, we're symmetric */
+  root->symmetric_subtree = 1;
+}
+
+/*
+ * Set up the per-object convenience pointers in the subtree below parent.
+ * On entry only the first_child/next_sibling lists (normal, I/O and Misc)
+ * are relied upon. On return sibling_rank, prev_sibling, last_child, the
+ * arities and the children[] array are up to date, and I/O and Misc
+ * children have their parent pointer set.
+ * The remaining fields (levels, cousins, logical_index, depth, ...) will
+ * be set up later in hwloc_connect_levels().
+ *
+ * May be called several times, so the existing children[] array is reused
+ * when it is still correct or large enough.
+ */
+void
+hwloc_connect_children(hwloc_obj_t parent)
+{
+  unsigned count, idx, oldn = parent->arity;
+  hwloc_obj_t child, prev;
+  int uptodate = 1;
+
+  /* Normal children list */
+
+  prev = NULL;
+  count = 0;
+  for (child = parent->first_child; child; child = child->next_sibling) {
+    child->sibling_rank = count;
+    child->prev_sibling = prev;
+    /* does the old array already hold this child at this rank? */
+    if (count >= oldn || parent->children[count] != child)
+      uptodate = 0;
+    /* recurse */
+    hwloc_connect_children(child);
+    prev = child;
+    count++;
+  }
+  parent->last_child = prev;
+  parent->arity = count;
+
+  if (!count) {
+    /* no need for an array anymore */
+    free(parent->children);
+    parent->children = NULL;
+  } else if (!uptodate) {
+    /* the array is only reallocated when it is too small */
+    if (oldn < count) {
+      free(parent->children);
+      parent->children = malloc(count * sizeof(*parent->children));
+    }
+    /* refill */
+    for (idx = 0, child = parent->first_child; child; idx++, child = child->next_sibling)
+      parent->children[idx] = child;
+  }
+  /* otherwise the array is already OK (even if too large) */
+
+  /* I/O children list */
+
+  prev = NULL;
+  count = 0;
+  for (child = parent->io_first_child; child; child = child->next_sibling) {
+    child->parent = parent;
+    child->sibling_rank = count;
+    child->prev_sibling = prev;
+    hwloc_connect_children(child);
+    prev = child;
+    count++;
+  }
+  parent->io_arity = count;
+
+  /* Misc children list */
+
+  prev = NULL;
+  count = 0;
+  for (child = parent->misc_first_child; child; child = child->next_sibling) {
+    child->parent = parent;
+    child->sibling_rank = count;
+    child->prev_sibling = prev;
+    hwloc_connect_children(child);
+    prev = child;
+    count++;
+  }
+  parent->misc_arity = count;
+}
+
+/*
+ * Tell whether ROOT, or any object reachable from it through normal
+ * children, compares as HWLOC_TYPE_EQUAL with OBJ.
+ * Returns 1 if so, 0 otherwise.
+ */
+static int
+find_same_type(hwloc_obj_t root, hwloc_obj_t obj)
+{
+  hwloc_obj_t child;
+  int found = (hwloc_type_cmp(root, obj) == HWLOC_TYPE_EQUAL);
+
+  /* depth-first search, stop at the first match */
+  for (child = root->first_child; !found && child; child = child->next_sibling)
+    found = find_same_type(child, obj);
+
+  return found;
+}
+
+/* Split current_objs into the objects that build the next level and the
+ * objects that stay pending.
+ * An object comparing equal to top_obj goes into taken_objs and its normal
+ * children go into remaining_objs; any other object goes into
+ * remaining_objs unchanged. Relative order is preserved.
+ * Returns the number of entries written to remaining_objs.
+ * n_taken_objs and n_remaining_objs are only used for HWLOC_DEBUG checks.
+ */
+static int
+hwloc_level_take_objects(hwloc_obj_t top_obj,
+                         hwloc_obj_t *current_objs, unsigned n_current_objs,
+                         hwloc_obj_t *taken_objs, unsigned n_taken_objs __hwloc_attribute_unused,
+                         hwloc_obj_t *remaining_objs, unsigned n_remaining_objs __hwloc_attribute_unused)
+{
+  unsigned n_taken = 0;
+  unsigned n_left = 0;
+  unsigned i;
+
+  for (i = 0; i < n_current_objs; i++) {
+    hwloc_obj_t cur = current_objs[i];
+    unsigned c;
+
+    if (hwloc_type_cmp(top_obj, cur) != HWLOC_TYPE_EQUAL) {
+      /* not for this level, keep it pending */
+      remaining_objs[n_left++] = cur;
+      continue;
+    }
+
+    /* take it, and queue its normal children instead */
+    taken_objs[n_taken++] = cur;
+    for (c = 0; c < cur->arity; c++)
+      remaining_objs[n_left++] = cur->children[c];
+  }
+
+#ifdef HWLOC_DEBUG
+  /* the caller counted beforehand, make sure we agree */
+  assert(n_taken == n_taken_objs);
+  assert(n_left == n_current_objs - n_taken_objs + n_remaining_objs);
+#endif
+
+  return n_left;
+}
+
+/* Build a level array out of the next_cousin list starting at first.
+ * The array is allocated with malloc() and stored in *levelp, and each
+ * object gets its position in the list as logical_index.
+ * Returns the number of objects stored in the array.
+ * If the allocation fails, *levelp is NULL, no logical_index is assigned
+ * and 0 is returned, so that callers see an empty level instead of
+ * dereferencing a NULL array.
+ */
+static unsigned
+hwloc_build_level_from_list(struct hwloc_obj *first, struct hwloc_obj ***levelp)
+{
+  unsigned i, nb;
+  struct hwloc_obj * obj;
+
+  /* count */
+  nb = 0;
+  for (obj = first; obj; obj = obj->next_cousin)
+    nb++;
+
+  /* allocate */
+  *levelp = malloc(nb * sizeof(**levelp));
+  if (!*levelp)
+    /* allocation failure (or empty list with a malloc(0) returning NULL) */
+    return 0;
+
+  /* fill level */
+  for (i = 0, obj = first; obj; i++, obj = obj->next_cousin) {
+    obj->logical_index = i;
+    (*levelp)[i] = obj;
+  }
+
+  return nb;
+}
+
+/* Walk the tree below obj and append each bridge, PCI device and OS device
+ * to the tail of the matching topology-wide cousin list, setting its
+ * special depth on the way. Cousin pointers of every I/O object are reset
+ * before it is appended.
+ */
+static void
+hwloc_list_io_objects(hwloc_topology_t topology, hwloc_obj_t obj)
+{
+  hwloc_obj_t child, *temp;
+
+  if (hwloc_obj_type_is_io(obj->type)) {
+    /* head and tail of the list this object belongs to, if any */
+    hwloc_obj_t *firstp = NULL, *lastp = NULL;
+
+    /* make sure we don't have remaining stale pointers from a previous load */
+    obj->next_cousin = NULL;
+    obj->prev_cousin = NULL;
+
+    switch (obj->type) {
+    case HWLOC_OBJ_BRIDGE:
+      obj->depth = HWLOC_TYPE_DEPTH_BRIDGE;
+      firstp = &topology->first_bridge;
+      lastp = &topology->last_bridge;
+      break;
+    case HWLOC_OBJ_PCI_DEVICE:
+      obj->depth = HWLOC_TYPE_DEPTH_PCI_DEVICE;
+      firstp = &topology->first_pcidev;
+      lastp = &topology->last_pcidev;
+      break;
+    case HWLOC_OBJ_OS_DEVICE:
+      obj->depth = HWLOC_TYPE_DEPTH_OS_DEVICE;
+      firstp = &topology->first_osdev;
+      lastp = &topology->last_osdev;
+      break;
+    default:
+      /* no dedicated list for this type */
+      break;
+    }
+
+    if (firstp) {
+      if (*firstp) {
+        /* append after the current tail */
+        obj->prev_cousin = *lastp;
+        obj->prev_cousin->next_cousin = obj;
+        *lastp = obj;
+      } else {
+        /* first object of its kind */
+        *firstp = *lastp = obj;
+      }
+    }
+  }
+
+  for_each_child_safe(child, obj, temp)
+    hwloc_list_io_objects(topology, child);
+  for_each_io_child_safe(child, obj, temp)
+    hwloc_list_io_objects(topology, child);
+  /* Misc children are not visited: no I/O under Misc */
+}
+
+/* Rebuild the three I/O levels (bridges, PCI devices, OS devices):
+ * drop the previous arrays and lists, collect the I/O objects from the
+ * tree again, then turn each list into a level array.
+ */
+static void
+hwloc_connect_io_levels(hwloc_topology_t topology)
+{
+  /* release what a previous call may have built */
+  free(topology->bridge_level);
+  free(topology->pcidev_level);
+  free(topology->osdev_level);
+
+  /* bridges */
+  topology->first_bridge = topology->last_bridge = NULL;
+  topology->bridge_level = NULL;
+  topology->bridge_nbobjects = 0;
+  topology->type_depth[HWLOC_OBJ_BRIDGE] = HWLOC_TYPE_DEPTH_BRIDGE;
+
+  /* PCI devices */
+  topology->first_pcidev = topology->last_pcidev = NULL;
+  topology->pcidev_level = NULL;
+  topology->pcidev_nbobjects = 0;
+  topology->type_depth[HWLOC_OBJ_PCI_DEVICE] = HWLOC_TYPE_DEPTH_PCI_DEVICE;
+
+  /* OS devices */
+  topology->first_osdev = topology->last_osdev = NULL;
+  topology->osdev_level = NULL;
+  topology->osdev_nbobjects = 0;
+  topology->type_depth[HWLOC_OBJ_OS_DEVICE] = HWLOC_TYPE_DEPTH_OS_DEVICE;
+
+  /* fill the three lists in a single tree walk from the root */
+  hwloc_list_io_objects(topology, topology->levels[0][0]);
+
+  /* and convert them into arrays */
+  topology->bridge_nbobjects = hwloc_build_level_from_list(topology->first_bridge, &topology->bridge_level);
+  topology->pcidev_nbobjects = hwloc_build_level_from_list(topology->first_pcidev, &topology->pcidev_level);
+  topology->osdev_nbobjects = hwloc_build_level_from_list(topology->first_osdev, &topology->osdev_level);
+}
+
+/* Walk the tree below obj (normal, I/O and Misc children) and append each
+ * Misc object to the tail of the topology-wide Misc cousin list, setting
+ * its special depth on the way.
+ * Cousin pointers are reset before appending, as done for I/O objects,
+ * because this may run again on objects that were already listed by a
+ * previous connect.
+ */
+static void
+hwloc_list_misc_objects(hwloc_topology_t topology, hwloc_obj_t obj)
+{
+  hwloc_obj_t child, *temp;
+
+  if (obj->type == HWLOC_OBJ_MISC) {
+    /* make sure we don't have remaining stale pointers from a previous load */
+    obj->next_cousin = NULL;
+    obj->prev_cousin = NULL;
+
+    obj->depth = HWLOC_TYPE_DEPTH_MISC;
+    /* Insert in the main Misc list */
+    if (topology->first_misc) {
+      obj->prev_cousin = topology->last_misc;
+      obj->prev_cousin->next_cousin = obj;
+      topology->last_misc = obj;
+    } else {
+      topology->first_misc = topology->last_misc = obj;
+    }
+  }
+
+  for_each_child_safe(child, obj, temp)
+    hwloc_list_misc_objects(topology, child);
+  for_each_io_child_safe(child, obj, temp)
+    hwloc_list_misc_objects(topology, child);
+  for_each_misc_child_safe(child, obj, temp)
+    hwloc_list_misc_objects(topology, child);
+}
+
+/* Rebuild the Misc level: drop the previous array and list, collect the
+ * Misc objects from the tree again, then turn the list into a level array.
+ */
+static void
+hwloc_connect_misc_level(hwloc_topology_t topology)
+{
+  /* release what a previous call may have built */
+  free(topology->misc_level);
+
+  topology->first_misc = topology->last_misc = NULL;
+  topology->misc_level = NULL;
+  topology->misc_nbobjects = 0;
+  topology->type_depth[HWLOC_OBJ_MISC] = HWLOC_TYPE_DEPTH_MISC;
+
+  /* fill the list with a tree walk from the root, then convert it */
+  hwloc_list_misc_objects(topology, topology->levels[0][0]);
+  topology->misc_nbobjects = hwloc_build_level_from_list(topology->first_misc, &topology->misc_level);
+}
+
+/*
+ * Do the remaining work that hwloc_connect_children() did not do earlier:
+ * rebuild the normal levels below the root (depth, logical_index, cousin
+ * pointers, type_depth), then the I/O and Misc levels, and finally
+ * recompute symmetric_subtree.
+ *
+ * Returns 0 on success.
+ * Returns -1 with errno set to ENOMEM if a temporary or level array cannot
+ * be allocated; levels built before the failure stay in the topology.
+ */
+int
+hwloc_connect_levels(hwloc_topology_t topology)
+{
+  unsigned l, i=0;
+  hwloc_obj_t *objs, *taken_objs, *new_objs, top_obj, root;
+  unsigned n_objs, n_taken_objs, n_new_objs;
+
+  /* reset non-root levels (root was initialized during init and will not change here) */
+  for(l=1; l<HWLOC_DEPTH_MAX; l++)
+    free(topology->levels[l]);
+  memset(topology->levels+1, 0, (HWLOC_DEPTH_MAX-1)*sizeof(*topology->levels));
+  memset(topology->level_nbobjects+1, 0,  (HWLOC_DEPTH_MAX-1)*sizeof(*topology->level_nbobjects));
+  topology->nb_levels = 1;
+  /* don't touch next_group_depth, the Group objects are still here */
+
+  /* initialize all depth to unknown */
+  for (l = HWLOC_OBJ_SYSTEM; l < HWLOC_OBJ_TYPE_MAX; l++)
+    topology->type_depth[l] = HWLOC_TYPE_DEPTH_UNKNOWN;
+
+  /* initialize root type depth */
+  root = topology->levels[0][0];
+  root->depth = 0;
+  topology->type_depth[root->type] = 0;
+  /* root level */
+  root->logical_index = 0;
+  root->prev_cousin = NULL;
+  root->next_cousin = NULL;
+  /* root as a child of nothing */
+  root->parent = NULL;
+  root->sibling_rank = 0;
+  root->prev_sibling = NULL;
+  root->next_sibling = NULL;
+
+  /* Start with children of the whole system.
+   * Nothing is allocated when the root has no normal child, so that a
+   * malloc(0) returning NULL is not mistaken for an allocation failure.
+   */
+  n_objs = root->arity;
+  objs = NULL;
+  if (n_objs) {
+    objs = malloc(n_objs * sizeof(objs[0]));
+    if (!objs) {
+      errno = ENOMEM;
+      return -1;
+    }
+    memcpy(objs, root->children, n_objs*sizeof(objs[0]));
+  }
+
+  /* Keep building levels while there are objects left in OBJS.  */
+  while (n_objs) {
+    /* At this point, the objs array contains only objects that may go into levels */
+
+    /* First find which type of object is the topmost.
+     * Don't use PU if there are other types since we want to keep PU at the bottom.
+     */
+
+    /* Look for the first non-PU object, and use the first PU if we really find nothing else */
+    for (i = 0; i < n_objs; i++)
+      if (objs[i]->type != HWLOC_OBJ_PU)
+        break;
+    top_obj = i == n_objs ? objs[0] : objs[i];
+
+    /* See if this is actually the topmost object */
+    for (i = 0; i < n_objs; i++) {
+      if (hwloc_type_cmp(top_obj, objs[i]) != HWLOC_TYPE_EQUAL) {
+        if (find_same_type(objs[i], top_obj)) {
+          /* OBJS[i] is strictly above an object of the same type as TOP_OBJ, so it
+           * is above TOP_OBJ.  */
+          top_obj = objs[i];
+        }
+      }
+    }
+
+    /* Now pick all objects of the same type, build a level with that and
+     * replace them with their children.  */
+
+    /* First count them.  */
+    n_taken_objs = 0;
+    n_new_objs = 0;
+    for (i = 0; i < n_objs; i++)
+      if (hwloc_type_cmp(top_obj, objs[i]) == HWLOC_TYPE_EQUAL) {
+        n_taken_objs++;
+        n_new_objs += objs[i]->arity;
+      }
+
+    /* New level (one extra slot for the NULL terminator).  */
+    taken_objs = malloc((n_taken_objs + 1) * sizeof(taken_objs[0]));
+    if (!taken_objs) {
+      free(objs);
+      errno = ENOMEM;
+      return -1;
+    }
+    /* New list of pending objects.  */
+    if (n_objs - n_taken_objs + n_new_objs) {
+      new_objs = malloc((n_objs - n_taken_objs + n_new_objs) * sizeof(new_objs[0]));
+      if (!new_objs) {
+        free(taken_objs);
+        free(objs);
+        errno = ENOMEM;
+        return -1;
+      }
+    } else {
+#ifdef HWLOC_DEBUG
+      assert(!n_new_objs);
+      assert(n_objs == n_taken_objs);
+#endif
+      new_objs = NULL;
+    }
+
+    n_new_objs = hwloc_level_take_objects(top_obj,
+                                          objs, n_objs,
+                                          taken_objs, n_taken_objs,
+                                          new_objs, n_new_objs);
+
+    /* Ok, put numbers in the level and link cousins.  */
+    for (i = 0; i < n_taken_objs; i++) {
+      taken_objs[i]->depth = topology->nb_levels;
+      taken_objs[i]->logical_index = i;
+      if (i) {
+        taken_objs[i]->prev_cousin = taken_objs[i-1];
+        taken_objs[i-1]->next_cousin = taken_objs[i];
+      }
+    }
+    taken_objs[0]->prev_cousin = NULL;
+    taken_objs[n_taken_objs-1]->next_cousin = NULL;
+
+    /* One more level!  */
+    if (top_obj->type == HWLOC_OBJ_CACHE)
+      hwloc_debug("--- Cache level depth %u", top_obj->attr->cache.depth);
+    else
+      hwloc_debug("--- %s level", hwloc_obj_type_string(top_obj->type));
+    hwloc_debug(" has number %u\n\n", topology->nb_levels);
+
+    if (topology->type_depth[top_obj->type] == HWLOC_TYPE_DEPTH_UNKNOWN)
+      topology->type_depth[top_obj->type] = topology->nb_levels;
+    else
+      /* this type already has a level, its depth is not unique anymore */
+      topology->type_depth[top_obj->type] = HWLOC_TYPE_DEPTH_MULTIPLE;
+
+    taken_objs[n_taken_objs] = NULL;
+
+    topology->level_nbobjects[topology->nb_levels] = n_taken_objs;
+    topology->levels[topology->nb_levels] = taken_objs;
+
+    topology->nb_levels++;
+
+    free(objs);
+
+    /* Switch to new_objs */
+    objs = new_objs;
+    n_objs = n_new_objs;
+  }
+
+  /* It's empty now.  */
+  free(objs);
+
+  hwloc_connect_io_levels(topology);
+  hwloc_connect_misc_level(topology);
+
+  hwloc_propagate_symmetric_subtree(topology, topology->levels[0][0]);
+
+  return 0;
+}
+
+/* Allocate whichever of the six cpusets/nodesets obj is still missing.
+ * Sets that already exist are left untouched.
+ */
+void hwloc_alloc_obj_cpusets(hwloc_obj_t obj)
+{
+  /* these three are allocated full */
+  if (obj->cpuset == NULL) {
+    obj->cpuset = hwloc_bitmap_alloc_full();
+  }
+  if (obj->allowed_cpuset == NULL) {
+    obj->allowed_cpuset = hwloc_bitmap_alloc_full();
+  }
+  if (obj->allowed_nodeset == NULL) {
+    obj->allowed_nodeset = hwloc_bitmap_alloc_full();
+  }
+
+  /* these three are allocated empty */
+  if (obj->complete_cpuset == NULL) {
+    obj->complete_cpuset = hwloc_bitmap_alloc();
+  }
+  if (obj->nodeset == NULL) {
+    obj->nodeset = hwloc_bitmap_alloc();
+  }
+  if (obj->complete_nodeset == NULL) {
+    obj->complete_nodeset = hwloc_bitmap_alloc();
+  }
+}
+
+/* Main discovery loop.
+ *
+ * Runs the discover() callback of the CPU and GLOBAL backends first, fixes
+ * up cpusets and nodesets, groups objects by distances, then runs the
+ * remaining backends, filters I/O, ignored and empty objects, reconnects
+ * the tree, and finalizes distances, identification info attributes and
+ * binding hooks.
+ *
+ * Returns 0 on success.
+ * Returns -1 if no CPU/GLOBAL discovery succeeded (errno is set to EINVAL)
+ * or if hwloc_connect_levels() failed.
+ * Calls abort() if removing empty objects leaves no root object.
+ */
+static int
+hwloc_discover(struct hwloc_topology *topology)
+{
+  struct hwloc_backend *backend;
+  int gotsomeio = 0; /* sum of the non-negative discover() results of GLOBAL and non-CPU backends */
+  unsigned discoveries = 0; /* number of CPU/GLOBAL discover() calls that did not fail */
+
+  topology->modified = 0; /* no need to reconnect yet */
+
+  /* discover() callbacks should use hwloc_insert to add objects initialized
+   * through hwloc_alloc_setup_object.
+   * For node levels, nodeset and memory must be initialized.
+   * For cache levels, memory and type/depth must be initialized.
+   * For group levels, depth must be initialized.
+   */
+
+  /* There must be at least a PU object for each logical processor, at worst
+   * produced by hwloc_setup_pu_level()
+   */
+
+  /* To be able to just use hwloc_insert_object_by_cpuset to insert the object
+   * in the topology according to the cpuset, the cpuset field must be
+   * initialized.
+   */
+
+  /* A priori, All processors are visible in the topology, and allowed
+   * for the application.
+   *
+   * - If some processors exist but topology information is unknown for them
+   *   (and thus the backend couldn't create objects for them), they should be
+   *   added to the complete_cpuset field of the lowest object where the object
+   *   could reside.
+   *
+   * - If some processors are not allowed for the application (e.g. for
+   *   administration reasons), they should be dropped from the allowed_cpuset
+   *   field.
+   *
+   * The same applies to the node sets complete_nodeset and allowed_nodeset.
+   *
+   * If such field doesn't exist yet, it can be allocated, and initialized to
+   * zero (for complete), or to full (for allowed). The values are
+   * automatically propagated to the whole tree after detection.
+   */
+
+  /*
+   * Discover CPUs first
+   */
+  backend = topology->backends;
+  while (NULL != backend) {
+    int err;
+    if (backend->component->type != HWLOC_DISC_COMPONENT_TYPE_CPU
+	&& backend->component->type != HWLOC_DISC_COMPONENT_TYPE_GLOBAL)
+      /* not yet */
+      goto next_cpubackend;
+    if (!backend->discover)
+      goto next_cpubackend;
+
+    if (topology->modified && (backend->flags & HWLOC_BACKEND_FLAG_NEED_LEVELS)) {
+      hwloc_debug("Backend %s forcing a reconnect of levels\n", backend->component->name);
+      hwloc_connect_children(topology->levels[0][0]);
+      if (hwloc_connect_levels(topology) < 0)
+	return -1;
+      topology->modified = 0;
+    }
+
+    err = backend->discover(backend);
+    if (err >= 0) {
+      if (backend->component->type == HWLOC_DISC_COMPONENT_TYPE_GLOBAL)
+        gotsomeio += err;
+      discoveries++;
+    }
+    hwloc_debug_print_objects(0, topology->levels[0][0]);
+
+next_cpubackend:
+    backend = backend->next;
+  }
+
+  if (!discoveries) {
+    hwloc_debug("%s", "No CPU backend enabled or no discovery succeeded\n");
+    errno = EINVAL;
+    return -1;
+  }
+
+  /* Update objects cpusets and nodesets now that the CPU/GLOBAL backend populated PUs and nodes */
+
+  hwloc_debug("%s", "\nRestrict topology cpusets to existing PU and NODE objects\n");
+  collect_proc_cpuset(topology->levels[0][0], NULL);
+
+  hwloc_debug("%s", "\nPropagate disallowed cpus down and up\n");
+  propagate_unused_cpuset(topology->levels[0][0], NULL);
+
+  /* Backends must allocate root->*nodeset.
+   *
+   * Most of them call hwloc_alloc_obj_cpusets() on the root to do so.
+   * root->complete_nodeset is empty by default, and filled by the core
+   * when NUMA nodes are added with insert_by_cpuset().
+   * root->allowed_nodeset is everything by default, unless reduced by backends.
+   *
+   * The XML backend takes care of everything to properly support old XML input
+   * with missing nodesets and/or NUMA nodes. It checks nodesets and fixes them if needed.
+   */
+  assert(topology->levels[0][0]->nodeset);
+  assert(topology->levels[0][0]->complete_nodeset);
+  assert(topology->levels[0][0]->allowed_nodeset);
+  /* If there's no NUMA node, add one with all the memory */
+  if (hwloc_bitmap_iszero(topology->levels[0][0]->complete_nodeset)) {
+    hwloc_obj_t node = hwloc_alloc_setup_object(HWLOC_OBJ_NUMANODE, 0);
+    node->cpuset = hwloc_bitmap_dup(topology->levels[0][0]->cpuset); /* requires root cpuset to be initialized above */
+    node->complete_cpuset = hwloc_bitmap_dup(topology->levels[0][0]->complete_cpuset); /* requires root cpuset to be initialized above */
+    node->allowed_cpuset = hwloc_bitmap_dup(topology->levels[0][0]->allowed_cpuset); /* requires root cpuset to be initialized above */
+    node->nodeset = hwloc_bitmap_alloc();
+    /* other nodesets will be filled below */
+    hwloc_bitmap_set(node->nodeset, 0);
+    memcpy(&node->memory, &topology->levels[0][0]->memory, sizeof(node->memory)); /* the new node takes over the root memory... */
+    memset(&topology->levels[0][0]->memory, 0, sizeof(node->memory)); /* ...which is then cleared in the root */
+    hwloc_insert_object_by_cpuset(topology, node);
+  }
+  hwloc_debug("%s", "\nPropagate nodesets\n");
+  propagate_nodeset(topology->levels[0][0], NULL);
+  propagate_nodesets(topology->levels[0][0]);
+
+  hwloc_debug_print_objects(0, topology->levels[0][0]);
+
+  if (!(topology->flags & HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM)) {
+    hwloc_debug("%s", "\nRemoving unauthorized sets from all sets\n");
+    remove_unused_sets(topology->levels[0][0]);
+    hwloc_debug_print_objects(0, topology->levels[0][0]);
+  }
+
+  /*
+   * All object cpusets and nodesets are properly set now.
+   */
+
+  /*
+   * Group levels by distances
+   */
+  hwloc_distances_finalize_os(topology);
+  hwloc_group_by_distances(topology);
+
+  /* Now connect handy pointers to make remaining discovery easier. */
+  hwloc_debug("%s", "\nOk, finished tweaking, now connect\n");
+  if (topology->modified) {
+    hwloc_connect_children(topology->levels[0][0]);
+    if (hwloc_connect_levels(topology) < 0)
+      return -1;
+    topology->modified = 0;
+  }
+  hwloc_debug_print_objects(0, topology->levels[0][0]);
+
+  /*
+   * Additional discovery with other backends
+   */
+
+  backend = topology->backends;
+  while (NULL != backend) {
+    int err;
+    if (backend->component->type == HWLOC_DISC_COMPONENT_TYPE_CPU
+	|| backend->component->type == HWLOC_DISC_COMPONENT_TYPE_GLOBAL)
+      /* already done above */
+      goto next_noncpubackend;
+    if (!backend->discover)
+      goto next_noncpubackend;
+
+    if (topology->modified && (backend->flags & HWLOC_BACKEND_FLAG_NEED_LEVELS)) {
+      hwloc_debug("Backend %s forcing a reconnect of levels\n", backend->component->name);
+      hwloc_connect_children(topology->levels[0][0]);
+      if (hwloc_connect_levels(topology) < 0)
+	return -1;
+      topology->modified = 0;
+    }
+
+    err = backend->discover(backend);
+    if (err >= 0) {
+      gotsomeio += err;
+    }
+    hwloc_debug_print_objects(0, topology->levels[0][0]);
+
+next_noncpubackend:
+    backend = backend->next;
+  }
+
+  /* if we got anything, filter interesting objects and update the tree */
+  if (gotsomeio) {
+    if (!(topology->flags & (HWLOC_TOPOLOGY_FLAG_IO_DEVICES|HWLOC_TOPOLOGY_FLAG_WHOLE_IO)))
+      /* drop all I/O children */
+      hwloc_drop_all_io(topology, topology->levels[0][0]);
+    else
+      hwloc_drop_useless_io(topology, topology->levels[0][0]);
+    hwloc_debug("%s", "\nNow reconnecting\n");
+    hwloc_debug_print_objects(0, topology->levels[0][0]);
+    hwloc_propagate_bridge_depth(topology, topology->levels[0][0], 0);
+  }
+
+  /* Remove some stuff */
+
+  hwloc_debug("%s", "\nRemoving ignored objects\n");
+  ignore_type_always(topology, &topology->levels[0][0]);
+  hwloc_debug_print_objects(0, topology->levels[0][0]);
+
+  hwloc_debug("%s", "\nRemoving empty objects except numa nodes and PCI devices\n");
+  remove_empty(topology, &topology->levels[0][0]);
+    if (!topology->levels[0][0]) {
+    fprintf(stderr, "Topology became empty, aborting!\n");
+    abort();
+  }
+  hwloc_debug_print_objects(0, topology->levels[0][0]);
+
+  hwloc_debug("%s", "\nRemoving objects whose type has HWLOC_IGNORE_TYPE_KEEP_STRUCTURE and have only one child or are the only child\n");
+  ignore_type_keep_structure(topology, &topology->levels[0][0]);
+  hwloc_debug_print_objects(0, topology->levels[0][0]);
+
+  /* Reconnect things after all these changes */
+  if (topology->modified) {
+    /* Often raised because of Groups inserted for I/Os */
+    hwloc_connect_children(topology->levels[0][0]);
+    if (hwloc_connect_levels(topology) < 0)
+      return -1;
+    topology->modified = 0;
+  }
+
+  /* accumulate children memory in total_memory fields (only once parent is set) */
+  hwloc_debug("%s", "\nPropagate total memory up\n");
+  propagate_total_memory(topology->levels[0][0]);
+
+  /*
+   * Now that objects are numbered, take distance matrices from backends and put them in the main topology.
+   *
+   * Some objects may have disappeared (in removed_empty or removed_ignored) since we setup os distances
+   * (hwloc_distances_finalize_os()) above. Reset them so as to not point to disappeared objects anymore.
+   */
+  hwloc_distances_restrict_os(topology);
+  hwloc_distances_finalize_os(topology);
+  hwloc_distances_finalize_logical(topology);
+
+  /* add some identification attributes if not loading from XML
+   * (only the first backend of the list is looked at)
+   */
+  if (topology->backends
+      && strcmp(topology->backends->component->name, "xml")) {
+    char *value;
+    /* add a hwlocVersion */
+    hwloc_obj_add_info(topology->levels[0][0], "hwlocVersion", VERSION);
+    /* add a ProcessName */
+    value = hwloc_progname(topology);
+    if (value) {
+      hwloc_obj_add_info(topology->levels[0][0], "ProcessName", value);
+      free(value);
+    }
+  }
+
+  /*
+   * Now set binding hooks according to topology->is_thissystem
+   * what the native OS backend offers.
+   */
+  hwloc_set_binding_hooks(topology);
+
+  return 0;
+}
+
+/* To be called before discovery is actually launched.
+ * Puts the topology back into its minimal state (a single root level
+ * holding one Machine object, no other level, no I/O or Misc list),
+ * in case a previous load initialized some stuff.
+ */
+void
+hwloc_topology_setup_defaults(struct hwloc_topology *topology)
+{
+  /* reset support */
+  memset(&topology->binding_hooks, 0, sizeof(topology->binding_hooks));
+  memset(topology->support.discovery, 0, sizeof(*topology->support.discovery));
+  memset(topology->support.cpubind, 0, sizeof(*topology->support.cpubind));
+  memset(topology->support.membind, 0, sizeof(*topology->support.membind));
+
+  /* a single level for now, made of the root object only */
+  topology->nb_levels = 1;
+  topology->next_group_depth = 0;
+  topology->levels[0] = malloc(sizeof(*topology->levels[0]));
+  topology->level_nbobjects[0] = 1;
+  /* NULLify other levels so that we can detect and free old ones in hwloc_connect_levels() if needed */
+  memset(topology->levels+1, 0, (HWLOC_DEPTH_MAX-1)*sizeof(*topology->levels));
+
+  /* no I/O level or list yet */
+  topology->bridge_level = NULL;
+  topology->first_bridge = topology->last_bridge = NULL;
+  topology->pcidev_level = NULL;
+  topology->first_pcidev = topology->last_pcidev = NULL;
+  topology->osdev_level = NULL;
+  topology->first_osdev = topology->last_osdev = NULL;
+
+  /* no Misc level or list yet */
+  topology->misc_level = NULL;
+  topology->first_misc = topology->last_misc = NULL;
+
+  /* Create the actual machine object, but don't touch its attributes yet
+   * since the OS backend may still change the object into something else
+   * (for instance System)
+   */
+  topology->levels[0][0] = hwloc_alloc_setup_object(HWLOC_OBJ_MACHINE, 0);
+}
+
+/* Allocate a topology context and give it its default, not yet loaded state.
+ * On success the new topology is stored in *topologyp and 0 is returned.
+ * Returns -1 if the topology structure itself cannot be allocated.
+ */
+int
+hwloc_topology_init (struct hwloc_topology **topologyp)
+{
+  struct hwloc_topology *topology = malloc(sizeof(*topology));
+  int type;
+
+  if (!topology)
+    return -1;
+
+  hwloc_components_init(topology);
+
+  /* Setup topology context */
+  topology->userdata = NULL;
+  topology->pid = 0;
+  topology->is_thissystem = 1;
+  topology->flags = 0;
+  topology->is_loaded = 0;
+
+  topology->support.discovery = malloc(sizeof(*topology->support.discovery));
+  topology->support.cpubind = malloc(sizeof(*topology->support.cpubind));
+  topology->support.membind = malloc(sizeof(*topology->support.membind));
+
+  /* Only ignore useless cruft by default:
+   * nothing is ignored, except Groups that bring no structure
+   */
+  type = HWLOC_OBJ_SYSTEM;
+  while (type < HWLOC_OBJ_TYPE_MAX) {
+    topology->ignored_types[type] = HWLOC_IGNORE_TYPE_NEVER;
+    type++;
+  }
+  topology->ignored_types[HWLOC_OBJ_GROUP] = HWLOC_IGNORE_TYPE_KEEP_STRUCTURE;
+
+  hwloc_distances_init(topology);
+
+  /* no userdata import/export callbacks by default */
+  topology->userdata_import_cb = NULL;
+  topology->userdata_export_cb = NULL;
+
+  /* Make the topology look like something coherent but empty */
+  hwloc_topology_setup_defaults(topology);
+
+  *topologyp = topology;
+  return 0;
+}
+
+int
+hwloc_topology_set_pid(struct hwloc_topology *topology __hwloc_attribute_unused,
+                       hwloc_pid_t pid __hwloc_attribute_unused)
+{
+  /* Stores pid in the topology when built with HWLOC_LINUX_SYS, else fails with ENOSYS; this does *not* change the backend */
+#ifdef HWLOC_LINUX_SYS
+  topology->pid = pid;
+  return 0;
+#else /* HWLOC_LINUX_SYS */
+  errno = ENOSYS;
+  return -1;
+#endif /* HWLOC_LINUX_SYS */
+}
+
+int
+hwloc_topology_set_synthetic(struct hwloc_topology *topology, const char *description)
+{
+  /* Force-enable the "synthetic" discovery component with this description string.
+   * The 0 flags the request as coming from the API rather than from the environment. */
+  int err = hwloc_disc_component_force_enable(topology, 0, -1, "synthetic", description, NULL, NULL);
+  return err;
+}
+
+int
+hwloc_topology_set_xml(struct hwloc_topology *topology,
+		       const char *xmlpath)
+{
+  /* Force-enable the "xml" discovery component, passing it xmlpath.
+   * The 0 flags the request as coming from the API rather than from the environment. */
+  int err = hwloc_disc_component_force_enable(topology, 0, -1, "xml", xmlpath, NULL, NULL);
+  return err;
+}
+
+int
+hwloc_topology_set_xmlbuffer(struct hwloc_topology *topology,
+                             const char *xmlbuffer,
+                             int size)
+{
+  /* Force-enable the "xml" discovery component on an in-memory buffer (API request, hence the 0).
+   * The size is carried in the last, pointer-typed argument. */
+  void *size_as_ptr = (void*) (uintptr_t) size;
+  return hwloc_disc_component_force_enable(topology, 0, -1, "xml", NULL, xmlbuffer, size_as_ptr);
+}
+
+int
+hwloc_topology_set_flags (struct hwloc_topology *topology, unsigned long flags)
+{
+  /* Flags can only be replaced while the topology is not loaded; afterwards fail with EBUSY */
+  if (!topology->is_loaded) {
+    topology->flags = flags;
+    return 0;
+  }
+  errno = EBUSY; /* actually harmless */
+  return -1;
+}
+
+unsigned long
+hwloc_topology_get_flags (struct hwloc_topology *topology)
+{ /* returns the current value of the flags field (the one hwloc_topology_set_flags() writes) */
+  return topology->flags;
+}
+
+int
+hwloc_topology_ignore_type(struct hwloc_topology *topology, hwloc_obj_type_t type)
+{
+  /* Mark type as always ignored. Returns 0, or -1 with errno EINVAL when type is invalid or may not be ignored.
+   * The range check is done as unsigned so that a negative type (e.g. (hwloc_obj_type_t) -1) cannot index ignored_types[]. */
+  if ((unsigned) type >= HWLOC_OBJ_TYPE_MAX) {
+    errno = EINVAL;
+    return -1;
+  }
+  if (type == HWLOC_OBJ_PU || type == HWLOC_OBJ_NUMANODE) {
+    /* we need the PU and NUMA levels */
+    errno = EINVAL;
+    return -1;
+  } else if (hwloc_obj_type_is_io(type)) {
+    /* I/O devices aren't in any level, use topology flags to ignore them */
+    errno = EINVAL;
+    return -1;
+  }
+  topology->ignored_types[type] = HWLOC_IGNORE_TYPE_ALWAYS;
+  return 0;
+}
+
+int
+hwloc_topology_ignore_type_keep_structure(struct hwloc_topology *topology, hwloc_obj_type_t type)
+{
+  /* Ignore type unless it brings structure. Returns 0, or -1 with errno EINVAL when type is invalid or not allowed here.
+   * The range check is done as unsigned so that a negative type (e.g. (hwloc_obj_type_t) -1) cannot index ignored_types[]. */
+  if ((unsigned) type >= HWLOC_OBJ_TYPE_MAX) {
+    errno = EINVAL;
+    return -1;
+  }
+  if (type == HWLOC_OBJ_PU || type == HWLOC_OBJ_NUMANODE || type == HWLOC_OBJ_MISC) {
+    /* We need the PU and NUMA levels.
+     * Misc are outside of the main topology structure, makes no sense.
+     */
+    errno = EINVAL;
+    return -1;
+  } else if (hwloc_obj_type_is_io(type)) {
+    /* I/O devices aren't in any level, use topology flags to ignore them */
+    errno = EINVAL;
+    return -1;
+  }
+  topology->ignored_types[type] = HWLOC_IGNORE_TYPE_KEEP_STRUCTURE;
+  return 0;
+}
+
+int
+hwloc_topology_ignore_all_keep_structure(struct hwloc_topology *topology)
+{
+  unsigned t; /* walks every object type value */
+  for(t = HWLOC_OBJ_SYSTEM; t < HWLOC_OBJ_TYPE_MAX; t++) { /* PU, NUMA node and I/O types keep their setting */
+    if (t == HWLOC_OBJ_PU || t == HWLOC_OBJ_NUMANODE || hwloc_obj_type_is_io((hwloc_obj_type_t) t)) continue;
+    topology->ignored_types[t] = HWLOC_IGNORE_TYPE_KEEP_STRUCTURE;
+  }
+  return 0; /* cannot fail */
+}
+
+/* Recursively free root and everything below it (normal, I/O and Misc children).
+ * Only the first-child/next_sibling links are followed so that it works before load()
+ * and may be used when switching between backends.
+ */
+static void
+hwloc_topology_clear_tree (struct hwloc_topology *topology, struct hwloc_obj *root)
+{
+  hwloc_obj_t cur, next;
+
+  /* next_sibling is always read before recursing, since the recursive call
+   * ends by handing cur to hwloc_free_unlinked_object() */
+  for (cur = root->first_child; cur; cur = next) {
+    next = cur->next_sibling;
+    hwloc_topology_clear_tree (topology, cur);
+  }
+  /* same walk over the I/O children list */
+  for (cur = root->io_first_child; cur; cur = next) {
+    next = cur->next_sibling;
+    hwloc_topology_clear_tree (topology, cur);
+  }
+  /* same walk over the Misc children list */
+  for (cur = root->misc_first_child; cur; cur = next) {
+    next = cur->next_sibling;
+    hwloc_topology_clear_tree (topology, cur);
+  }
+
+  hwloc_free_unlinked_object (root);
+}
+
+void
+hwloc_topology_clear (struct hwloc_topology *topology)
+{
+  unsigned depth = 0; /* index of the level array being released */
+  hwloc_topology_clear_tree (topology, topology->levels[0][0]); /* objects first, reached from the root */
+  while (depth < topology->nb_levels) {
+    free(topology->levels[depth]);
+    topology->levels[depth++] = NULL;
+  }
+  free(topology->bridge_level);
+  free(topology->pcidev_level);
+  free(topology->osdev_level);
+  free(topology->misc_level);
+}
+
+void
+hwloc_topology_destroy (struct hwloc_topology *topology)
+{
+  hwloc_backends_disable_all(topology);
+  hwloc_components_destroy_all(topology);
+  /* free the object tree and level arrays, then the distances */
+  hwloc_topology_clear(topology);
+  hwloc_distances_destroy(topology);
+  /* finally release the support structures allocated by hwloc_topology_init() and the topology itself */
+  free(topology->support.discovery);
+  free(topology->support.cpubind);
+  free(topology->support.membind);
+  free(topology);
+}
+
+int
+hwloc_topology_load (struct hwloc_topology *topology)
+{
+  int err;
+  /* Returns 0 on success; -1 with errno EBUSY if already loaded; -1 after resetting the topology if discovery fails */
+  if (topology->is_loaded) {
+    errno = EBUSY;
+    return -1;
+  }
+
+  /* Only apply variables if we have not changed the backend yet.
+   * NOTE(review): each check below is skipped once topology->backends is set, so presumably the first variable that enables a backend wins -- confirm.
+   * Check for XML last (that's the one that may be set system-wide by administrators)
+   * so that it's only used if other variables are not set,
+   * to allow users to override easily.
+   */
+  if (!topology->backends) {
+    const char *synthetic_env = getenv("HWLOC_SYNTHETIC");
+    if (synthetic_env)
+      hwloc_disc_component_force_enable(topology,
+					1 /* env force */,
+					-1, "synthetic",
+					synthetic_env, NULL, NULL);
+  }
+  if (!topology->backends) {
+    const char *fsroot_path_env = getenv("HWLOC_FSROOT");
+    if (fsroot_path_env)
+      hwloc_disc_component_force_enable(topology,
+					1 /* env force */,
+					HWLOC_DISC_COMPONENT_TYPE_CPU, "linux",
+					fsroot_path_env, NULL, NULL);
+  }
+  if (!topology->backends) {
+    const char *xmlpath_env = getenv("HWLOC_XMLFILE");
+    if (xmlpath_env)
+      hwloc_disc_component_force_enable(topology,
+					1 /* env force */,
+					-1, "xml",
+					xmlpath_env, NULL, NULL);
+  }
+
+  /* instantiate all possible other backends now */
+  hwloc_disc_components_enable_others(topology);
+  /* now that backends are enabled, update the thissystem flag */
+  hwloc_backends_is_thissystem(topology);
+
+  /* get distance matrices from the environment and store them (as indexes) in the topology.
+   * indexes will be converted into objects later once the tree will be filled
+   */
+  hwloc_distances_set_from_env(topology);
+
+  /* actual topology discovery */
+  err = hwloc_discover(topology);
+  if (err < 0)
+    goto out;
+  /* the consistency check always runs in HWLOC_DEBUG builds, otherwise only when HWLOC_DEBUG_CHECK is set in the environment */
+#ifndef HWLOC_DEBUG
+  if (getenv("HWLOC_DEBUG_CHECK"))
+#endif
+    hwloc_topology_check(topology);
+
+  topology->is_loaded = 1;
+  return 0;
+
+ out: /* discovery failed: drop what was built, restore the empty defaults and disable the backends */
+  hwloc_topology_clear(topology);
+  hwloc_distances_destroy(topology);
+  hwloc_topology_setup_defaults(topology);
+  hwloc_backends_disable_all(topology);
+  return -1;
+}
+
+/* Adjust the cpusets of *pobj according to the given droppedcpuset and recurse into
+ * its normal children; drop objects whose cpuset becomes empty (or whose parent is
+ * being dropped), and mark dropped NUMA nodes in droppednodeset.
+ */
+static void
+restrict_object(hwloc_topology_t topology, unsigned long flags, hwloc_obj_t *pobj, hwloc_const_cpuset_t droppedcpuset, hwloc_nodeset_t droppednodeset, int droppingparent)
+{
+  hwloc_obj_t obj = *pobj, child, *pchild;
+  int dropping;
+  int modified = hwloc_bitmap_intersects(obj->complete_cpuset, droppedcpuset); /* computed before the sets are trimmed below */
+
+  hwloc_clear_object_distances(obj);
+  /* remove the dropped CPUs from all three cpusets of this object */
+  hwloc_bitmap_andnot(obj->cpuset, obj->cpuset, droppedcpuset);
+  hwloc_bitmap_andnot(obj->complete_cpuset, obj->complete_cpuset, droppedcpuset);
+  hwloc_bitmap_andnot(obj->allowed_cpuset, obj->allowed_cpuset, droppedcpuset);
+
+  dropping = droppingparent || hwloc_bitmap_iszero(obj->cpuset);
+  /* children only need a visit when this object lost some CPUs */
+  if (modified) {
+    for_each_child_safe(child, obj, pchild)
+      restrict_object(topology, flags, pchild, droppedcpuset, droppednodeset, dropping);
+    /* Nothing to restrict under I/O or Misc */
+  }
+
+  if (dropping) {
+    hwloc_debug("%s", "\nRemoving object during restrict");
+    hwloc_debug_print_object(0, obj);
+    if (obj->type == HWLOC_OBJ_NUMANODE)
+      hwloc_bitmap_set(droppednodeset, obj->os_index);
+    if (obj->io_first_child && !(flags & HWLOC_RESTRICT_FLAG_ADAPT_IO)) /* I/O children are freed unless ADAPT_IO is set */
+      unlink_and_free_object_and_children(&obj->io_first_child);
+    if (obj->misc_first_child && !(flags & HWLOC_RESTRICT_FLAG_ADAPT_MISC)) /* likewise for Misc children and ADAPT_MISC */
+      unlink_and_free_object_and_children(&obj->misc_first_child);
+    unlink_and_free_single_object(pobj);
+    topology->modified = 1;
+    /* do not remove children. if they were to be removed, they would have been already */
+  }
+}
+
+/* Remove the nodes of droppednodeset from the nodesets of *pobj and, recursively, of its normal children.
+ */
+static void
+restrict_object_nodeset(hwloc_topology_t topology, hwloc_obj_t *pobj, hwloc_nodeset_t droppednodeset)
+{
+  hwloc_obj_t cur = *pobj, kid, *pkid;
+
+  /* an object whose complete_nodeset does not intersect the dropped nodes is left alone,
+   * and its children are not visited either */
+  if (hwloc_bitmap_intersects(cur->complete_nodeset, droppednodeset)) {
+    hwloc_bitmap_andnot(cur->nodeset, cur->nodeset, droppednodeset);
+    hwloc_bitmap_andnot(cur->complete_nodeset, cur->complete_nodeset, droppednodeset);
+    hwloc_bitmap_andnot(cur->allowed_nodeset, cur->allowed_nodeset, droppednodeset);
+
+    for_each_child_safe(kid, cur, pkid)
+      restrict_object_nodeset(topology, pkid, droppednodeset);
+    /* Nothing to restrict under I/O and Misc */
+  }
+}
+
+int
+hwloc_topology_restrict(struct hwloc_topology *topology, hwloc_const_cpuset_t cpuset, unsigned long flags)
+{
+  hwloc_bitmap_t droppedcpuset, droppednodeset;
+  /* Returns 0 on success; -1 with EINVAL (topology untouched) if cpuset misses the root; -1 after a reset if reconnecting levels fails */
+  /* make sure we'll keep something in the topology */
+  if (!hwloc_bitmap_intersects(cpuset, topology->levels[0][0]->cpuset)) {
+    errno = EINVAL; /* easy failure, just don't touch the topology */
+    return -1;
+  }
+  /* NOTE(review): the results of these two allocations are not checked before use */
+  droppedcpuset = hwloc_bitmap_alloc();
+  droppednodeset = hwloc_bitmap_alloc();
+
+  /* drop object based on the reverse of cpuset, and fill the 'dropped' nodeset */
+  hwloc_bitmap_not(droppedcpuset, cpuset);
+  restrict_object(topology, flags, &topology->levels[0][0], droppedcpuset, droppednodeset, 0 /* root cannot be removed */);
+  /* update nodesets according to dropped nodeset */
+  restrict_object_nodeset(topology, &topology->levels[0][0], droppednodeset);
+
+  hwloc_bitmap_free(droppedcpuset);
+  hwloc_bitmap_free(droppednodeset);
+  /* rebuild the children and level links now that objects were removed */
+  hwloc_connect_children(topology->levels[0][0]);
+  if (hwloc_connect_levels(topology) < 0)
+    goto out;
+  topology->modified = 0;
+  /* refresh the data derived from the tree: memory totals and distances */
+  propagate_total_memory(topology->levels[0][0]);
+  hwloc_distances_restrict(topology, flags);
+  hwloc_distances_finalize_os(topology);
+  hwloc_distances_finalize_logical(topology);
+  return 0;
+
+ out:
+  /* unrecoverable failure, re-init the topology */
+   hwloc_topology_clear(topology);
+   hwloc_distances_destroy(topology);
+   hwloc_topology_setup_defaults(topology);
+   return -1;
+}
+
+int
+hwloc_topology_is_thissystem(struct hwloc_topology *topology)
+{ /* returns the is_thissystem field: 1 after init, refreshed from the backends during hwloc_topology_load() */
+  return topology->is_thissystem;
+}
+
+unsigned
+hwloc_topology_get_depth(struct hwloc_topology *topology)
+{ /* number of normal levels; the bridge/PCI/OS-device/Misc levels are stored apart and not counted here */
+  return topology->nb_levels;
+}
+
+const struct hwloc_topology_support *
+hwloc_topology_get_support(struct hwloc_topology * topology)
+{ /* returns the address of the support member embedded in the topology, not a copy */
+  return &topology->support;
+}
+
+void hwloc_topology_set_userdata(struct hwloc_topology * topology, const void *userdata)
+{ /* stores the caller's pointer as is; the const qualifier is dropped because the field is a plain void * */
+  topology->userdata = (void *) userdata;
+}
+
+void * hwloc_topology_get_userdata(struct hwloc_topology * topology)
+{ /* returns the userdata field: NULL after init, or whatever hwloc_topology_set_userdata() stored */
+  return topology->userdata;
+}
+
+/****************
+ * Debug Checks *
+ ****************/
+
+static void /* forward declaration: the children checkers below call it before its definition */
+hwloc__check_object(hwloc_topology_t topology, hwloc_obj_t obj);
+
+/* check the normal children of a parent object: links, ranks, depths, and how their sets combine into the parent's */
+static void
+hwloc__check_children(hwloc_topology_t topology, hwloc_obj_t parent)
+{
+  unsigned j;
+
+  if (!parent->arity) {
+    /* check whether that parent has no children for real */
+    assert(!parent->children);
+    assert(!parent->first_child);
+    assert(!parent->last_child);
+    return;
+  }
+  /* check whether that parent has children for real */
+  assert(parent->children);
+  assert(parent->first_child);
+  assert(parent->last_child);
+
+  /* sibling checks */
+  for(j=0; j<parent->arity; j++) {
+    hwloc_obj_t child = parent->children[j];
+    assert(child->parent == parent);
+    assert(child->sibling_rank == j);
+    if (j)
+      assert(child->prev_sibling == parent->children[j-1]);
+    else
+      assert(!child->prev_sibling);
+    if (j == parent->arity-1)
+      assert(!child->next_sibling);
+    else
+      assert(child->next_sibling == parent->children[j+1]);
+    if (!hwloc_obj_type_is_io(child->type))
+      assert(child->depth > parent->depth);
+    /* recurse */
+    hwloc__check_object(topology, child);
+  }
+  assert(parent->first_child == parent->children[0]);
+  assert(parent->last_child == parent->children[parent->arity-1]);
+
+  /* we already checked in the caller that objects have either all sets or none */
+
+  {
+    /* check that parent->cpuset == exclusive OR of children
+     * (can be wrong for complete_cpuset since disallowed/offline/unknown PUs can be removed)
+     */
+    hwloc_bitmap_t remaining_parent_cpuset = hwloc_bitmap_dup(parent->cpuset);
+    hwloc_bitmap_t remaining_parent_nodeset = hwloc_bitmap_dup(parent->nodeset);
+    for(j=0; j<parent->arity; j++) {
+      if (!parent->children[j]->cpuset)
+	continue;
+      /* check that child cpuset is included in the remainder of the parent */
+      assert(hwloc_bitmap_isincluded(parent->children[j]->cpuset, remaining_parent_cpuset));
+      hwloc_bitmap_andnot(remaining_parent_cpuset, remaining_parent_cpuset, parent->children[j]->cpuset);
+      /* check that child nodeset is included in the parent (multiple children may have the same nodeset when we're below a NUMA node) */
+      assert(hwloc_bitmap_isincluded(parent->children[j]->nodeset, parent->nodeset));
+      hwloc_bitmap_andnot(remaining_parent_nodeset, remaining_parent_nodeset, parent->children[j]->nodeset);
+    }
+
+    if (parent->type == HWLOC_OBJ_PU) {
+      /* if parent is a PU (with Misc children for instance),
+       * its os_index bit may remain in cpuset. */
+      assert(hwloc_bitmap_weight(remaining_parent_cpuset) == 1);
+      assert(hwloc_bitmap_first(remaining_parent_cpuset) == (int)parent->os_index);
+    } else {
+      /* nothing remains */
+      assert(hwloc_bitmap_iszero(remaining_parent_cpuset));
+    }
+    hwloc_bitmap_free(remaining_parent_cpuset);
+
+    if (parent->type == HWLOC_OBJ_NUMANODE)
+      /* if parent is a NUMA node, its os_index bit may remain.
+       * or it could already have been removed by a child. */
+      hwloc_bitmap_clr(remaining_parent_nodeset, parent->os_index);
+    if (parent->type == HWLOC_OBJ_PU) {
+      /* if parent is a PU (with Misc children for instance),
+       * one bit may remain in nodeset. */
+      assert(hwloc_bitmap_weight(remaining_parent_nodeset) == 1);
+    } else {
+      /* nothing remains */
+      assert(hwloc_bitmap_iszero(remaining_parent_nodeset));
+    }
+    hwloc_bitmap_free(remaining_parent_nodeset);
+  }
+
+  /* check that children complete_cpuset are properly ordered, empty ones may be anywhere
+   * (can be wrong for main cpuset since removed PUs can break the ordering).
+   */
+  {
+    int firstchild;
+    int prev_firstchild = -1; /* -1 works fine with first comparisons below */
+    for(j=0; j<parent->arity; j++) {
+      if (!parent->children[j]->complete_cpuset
+	  || hwloc_bitmap_iszero(parent->children[j]->complete_cpuset))
+	continue;
+
+      firstchild = hwloc_bitmap_first(parent->children[j]->complete_cpuset);
+      assert(prev_firstchild < firstchild);
+      prev_firstchild = firstchild;
+    }
+  }
+}
+
+static void
+hwloc__check_io_children(hwloc_topology_t topology, hwloc_obj_t parent)
+{
+  unsigned j;
+  hwloc_obj_t child, prev;
+  /* assert that the I/O children list of parent matches io_arity and is correctly linked */
+  if (!parent->io_arity) {
+    /* check whether that parent has no children for real */
+    assert(!parent->io_first_child);
+    return;
+  }
+  /* check whether that parent has children for real */
+  assert(parent->io_first_child);
+  /* walk the list through next_sibling, counting the children in j */
+  for(prev = NULL, child = parent->io_first_child, j = 0;
+      child;
+      prev = child, child = child->next_sibling, j++) {
+    /* all children must be I/O */
+    assert(hwloc_obj_type_is_io(child->type));
+
+    /* check siblings */
+    assert(child->parent == parent);
+    assert(child->sibling_rank == j);
+    if (prev)
+      assert(prev->next_sibling == child);
+    assert(child->prev_sibling == prev);
+    if (j == parent->io_arity-1)
+      assert(child->next_sibling == NULL);
+
+    /* only I/O and Misc children, recurse */
+    assert(!child->first_child);
+    hwloc__check_object(topology, child);
+  }
+  /* check arity */
+  assert(j == parent->io_arity);
+}
+
+static void
+hwloc__check_misc_children(hwloc_topology_t topology, hwloc_obj_t parent)
+{
+  unsigned j;
+  hwloc_obj_t child, prev;
+  /* assert that the Misc children list of parent matches misc_arity and is correctly linked */
+  if (!parent->misc_arity) {
+    /* check whether that parent has no children for real */
+    assert(!parent->misc_first_child);
+    return;
+  }
+  /* check whether that parent has children for real */
+  assert(parent->misc_first_child);
+  /* walk the list through next_sibling, counting the children in j */
+  for(prev = NULL, child = parent->misc_first_child, j = 0;
+      child;
+      prev = child, child = child->next_sibling, j++) {
+    /* all children must be Misc */
+    assert(child->type == HWLOC_OBJ_MISC);
+
+    /* check siblings */
+    assert(child->parent == parent);
+    assert(child->sibling_rank == j);
+    if (prev)
+      assert(prev->next_sibling == child);
+    assert(child->prev_sibling == prev);
+    if (j == parent->misc_arity-1)
+      assert(child->next_sibling == NULL);
+
+    /* only Misc children, recurse */
+    assert(!child->first_child);
+    assert(!child->io_first_child);
+    hwloc__check_object(topology, child);
+  }
+  /* check arity */
+  assert(j == parent->misc_arity);
+}
+
+static void
+hwloc__check_object(hwloc_topology_t topology, hwloc_obj_t obj)
+{
+  /* check sets and depth: special objects have no cpuset and a fixed special depth, others have a cpuset */
+  if (hwloc_obj_type_is_special(obj->type)) {
+    assert(!obj->cpuset);
+    if (obj->type == HWLOC_OBJ_BRIDGE)
+      assert(obj->depth == (unsigned) HWLOC_TYPE_DEPTH_BRIDGE);
+    else if (obj->type == HWLOC_OBJ_PCI_DEVICE)
+      assert(obj->depth == (unsigned) HWLOC_TYPE_DEPTH_PCI_DEVICE);
+    else if (obj->type == HWLOC_OBJ_OS_DEVICE)
+      assert(obj->depth == (unsigned) HWLOC_TYPE_DEPTH_OS_DEVICE);
+    else if (obj->type == HWLOC_OBJ_MISC)
+      assert(obj->depth == (unsigned) HWLOC_TYPE_DEPTH_MISC);
+  } else {
+    assert(obj->cpuset);
+    assert((int) obj->depth >= 0);
+  }
+
+  /* there's other cpusets and nodesets if and only if there's a main cpuset */
+  assert(!!obj->cpuset == !!obj->complete_cpuset);
+  assert(!!obj->cpuset == !!obj->allowed_cpuset);
+  assert(!!obj->cpuset == !!obj->nodeset);
+  assert(!!obj->nodeset == !!obj->complete_nodeset);
+  assert(!!obj->nodeset == !!obj->allowed_nodeset);
+
+  /* check that main sets are included in the complete sets, and that allowed sets are included in (or, without WHOLE_SYSTEM, equal to) the main sets */
+  if (obj->cpuset) {
+    assert(hwloc_bitmap_isincluded(obj->cpuset, obj->complete_cpuset));
+    assert(hwloc_bitmap_isincluded(obj->nodeset, obj->complete_nodeset));
+    if (topology->flags & HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM) {
+      assert(hwloc_bitmap_isincluded(obj->allowed_cpuset, obj->cpuset));
+      assert(hwloc_bitmap_isincluded(obj->allowed_nodeset, obj->nodeset));
+    } else {
+      assert(hwloc_bitmap_isequal(obj->allowed_cpuset, obj->cpuset));
+      assert(hwloc_bitmap_isequal(obj->allowed_nodeset, obj->nodeset));
+    }
+  }
+
+  /* check children */
+  hwloc__check_children(topology, obj);
+  hwloc__check_io_children(topology, obj);
+  hwloc__check_misc_children(topology, obj);
+}
+
+static void
+hwloc__check_level(struct hwloc_topology *topology, unsigned depth)
+{
+  unsigned width = hwloc_get_nbobjs_by_depth(topology, depth);
+  struct hwloc_obj *prev = NULL;
+  hwloc_obj_t obj;
+  unsigned j;
+  /* depth is looked up through the hwloc_get_*_by_depth() accessors, which also accept the special HWLOC_TYPE_DEPTH_* values */
+  /* check each object of the level */
+  for(j=0; j<width; j++) {
+    obj = hwloc_get_obj_by_depth(topology, depth, j);
+    /* check that the object is correctly placed horizontally and vertically */
+    assert(obj);
+    assert(obj->depth == depth);
+    assert(obj->logical_index == j);
+    /* check that all objects in the level have the same type */
+    if (prev) {
+      assert(hwloc_type_cmp(obj, prev) == HWLOC_TYPE_EQUAL);
+      assert(prev->next_cousin == obj);
+    }
+    assert(obj->prev_cousin == prev);
+
+    /* check that PUs and NUMA nodes have correct cpuset/nodeset */
+    if (obj->type == HWLOC_OBJ_PU) {
+      assert(hwloc_bitmap_weight(obj->complete_cpuset) == 1);
+      assert(hwloc_bitmap_first(obj->complete_cpuset) == (int) obj->os_index);
+    }
+    if (obj->type == HWLOC_OBJ_NUMANODE) {
+      assert(hwloc_bitmap_weight(obj->complete_nodeset) == 1);
+      assert(hwloc_bitmap_first(obj->complete_nodeset) == (int) obj->os_index);
+    }
+    prev = obj;
+  }
+  if (prev)
+    assert(prev->next_cousin == NULL);
+
+  if (width) {
+    /* check first object of the level */
+    obj = hwloc_get_obj_by_depth(topology, depth, 0);
+    assert(obj);
+    assert(!obj->prev_cousin);
+    /* check type */
+    assert(hwloc_get_depth_type(topology, depth) == obj->type);
+    assert(depth == (unsigned) hwloc_get_type_depth(topology, obj->type)
+	   || HWLOC_TYPE_DEPTH_MULTIPLE == hwloc_get_type_depth(topology, obj->type));
+    /* check last object of the level */
+    obj = hwloc_get_obj_by_depth(topology, depth, width-1);
+    assert(obj);
+    assert(!obj->next_cousin);
+  }
+
+  /* check last+1 object of the level */
+  obj = hwloc_get_obj_by_depth(topology, depth, width);
+  assert(!obj);
+}
+
+/* check a whole topology structure (asserts on any inconsistency), including the special I/O and Misc levels */
+void
+hwloc_topology_check(struct hwloc_topology *topology)
+{
+  struct hwloc_obj *obj;
+  hwloc_obj_type_t type;
+  unsigned i, j, depth;
+
+  depth = hwloc_topology_get_depth(topology);
+
+  assert(!topology->modified);
+
+  /* check type orders */
+  for (type = HWLOC_OBJ_SYSTEM; type < HWLOC_OBJ_TYPE_MAX; type++) {
+    assert(hwloc_get_order_type(hwloc_get_type_order(type)) == type);
+  }
+  for (i = hwloc_get_type_order(HWLOC_OBJ_SYSTEM);
+       i <= hwloc_get_type_order(HWLOC_OBJ_CORE); i++) {
+    assert(i == hwloc_get_type_order(hwloc_get_order_type(i)));
+  }
+
+  /* check that last level is PU */
+  assert(hwloc_get_depth_type(topology, depth-1) == HWLOC_OBJ_PU);
+  assert(hwloc_get_nbobjs_by_depth(topology, depth-1) > 0);
+  for(j=0; j<hwloc_get_nbobjs_by_depth(topology, depth-1); j++) {
+    obj = hwloc_get_obj_by_depth(topology, depth-1, j);
+    assert(obj);
+    assert(obj->type == HWLOC_OBJ_PU);
+  }
+  /* check that other levels are not PU */
+  for(i=1; i<depth-1; i++)
+    assert(hwloc_get_depth_type(topology, i) != HWLOC_OBJ_PU);
+
+  /* check that we have a NUMA level */
+  j = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
+  assert(j < hwloc_topology_get_depth(topology));
+  assert(hwloc_get_depth_type(topology, j) == HWLOC_OBJ_NUMANODE);
+  /* check that other levels are not NUMA */
+  for(i=0; i<depth-1; i++)
+    if (i != j)
+      assert(hwloc_get_depth_type(topology, i) != HWLOC_OBJ_NUMANODE);
+
+  /* top-level specific checks */
+  assert(hwloc_get_nbobjs_by_depth(topology, 0) == 1);
+  obj = hwloc_get_root_obj(topology);
+  assert(obj);
+  assert(!obj->parent);
+  assert(obj->cpuset);
+  assert(!obj->depth);
+  /* check each level: normal levels by index, then the special levels, which are addressed by their
+   * HWLOC_TYPE_DEPTH_* depth (an object type value is not a depth and would select the wrong level or none) */
+  for(i=0; i<depth; i++)
+    hwloc__check_level(topology, i);
+  hwloc__check_level(topology, (unsigned) HWLOC_TYPE_DEPTH_BRIDGE);
+  hwloc__check_level(topology, (unsigned) HWLOC_TYPE_DEPTH_PCI_DEVICE);
+  hwloc__check_level(topology, (unsigned) HWLOC_TYPE_DEPTH_OS_DEVICE);
+  hwloc__check_level(topology, (unsigned) HWLOC_TYPE_DEPTH_MISC);
+
+  /* recurse and check the tree of children, and type-specific checks */
+  hwloc__check_object(topology, obj);
+}
diff --git a/ext/hwloc/hwloc/traversal.c b/ext/hwloc/hwloc/traversal.c
new file mode 100644
index 0000000..f1e9ba7
--- /dev/null
+++ b/ext/hwloc/hwloc/traversal.c
@@ -0,0 +1,701 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2015 Inria.  All rights reserved.
+ * Copyright © 2009-2010 Université Bordeaux
+ * Copyright © 2009-2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#include <private/autogen/config.h>
+#include <hwloc.h>
+#include <private/private.h>
+#include <private/misc.h>
+#include <private/debug.h>
+#ifdef HAVE_STRINGS_H
+#include <strings.h>
+#endif /* HAVE_STRINGS_H */
+
+int
+hwloc_get_type_depth (struct hwloc_topology *topology, hwloc_obj_type_t type)
+{ /* returns the depth recorded in type_depth[] for type, or HWLOC_TYPE_DEPTH_UNKNOWN for an out-of-range (e.g. negative) type */
+  return (unsigned) type < HWLOC_OBJ_TYPE_MAX ? topology->type_depth[type] : HWLOC_TYPE_DEPTH_UNKNOWN;
+}
+
+hwloc_obj_type_t
+hwloc_get_depth_type (hwloc_topology_t topology, unsigned depth)
+{
+  /* a normal level reports the type of its first object; other depths may be one of the special ones */
+  if (depth < topology->nb_levels)
+    return topology->levels[depth][0]->type;
+  switch (depth) {
+  case HWLOC_TYPE_DEPTH_BRIDGE:
+    return HWLOC_OBJ_BRIDGE;
+  case HWLOC_TYPE_DEPTH_PCI_DEVICE:
+    return HWLOC_OBJ_PCI_DEVICE;
+  case HWLOC_TYPE_DEPTH_OS_DEVICE:
+    return HWLOC_OBJ_OS_DEVICE;
+  case HWLOC_TYPE_DEPTH_MISC:
+    return HWLOC_OBJ_MISC;
+  default: return (hwloc_obj_type_t) -1; /* no such level */
+  }
+}
+
+unsigned
+hwloc_get_nbobjs_by_depth (struct hwloc_topology *topology, unsigned depth)
+{
+  /* normal levels are counted in level_nbobjects[]; each special level has its own counter */
+  if (depth < topology->nb_levels)
+    return topology->level_nbobjects[depth];
+  switch (depth) {
+  case HWLOC_TYPE_DEPTH_BRIDGE:
+    return topology->bridge_nbobjects;
+  case HWLOC_TYPE_DEPTH_PCI_DEVICE:
+    return topology->pcidev_nbobjects;
+  case HWLOC_TYPE_DEPTH_OS_DEVICE:
+    return topology->osdev_nbobjects;
+  case HWLOC_TYPE_DEPTH_MISC:
+    return topology->misc_nbobjects;
+  default: return 0; /* neither a normal nor a special depth */
+  }
+}
+
+struct hwloc_obj *
+hwloc_get_obj_by_depth (struct hwloc_topology *topology, unsigned depth, unsigned idx)
+{
+  /* normal level: bounds-check idx against the level width */
+  if (depth < topology->nb_levels)
+    return idx < topology->level_nbobjects[depth] ? topology->levels[depth][idx] : NULL;
+  /* special levels live in their own arrays with their own widths */
+  switch (depth) {
+  case HWLOC_TYPE_DEPTH_BRIDGE:
+    return idx < topology->bridge_nbobjects ? topology->bridge_level[idx] : NULL;
+  case HWLOC_TYPE_DEPTH_PCI_DEVICE:
+    return idx < topology->pcidev_nbobjects ? topology->pcidev_level[idx] : NULL;
+  case HWLOC_TYPE_DEPTH_OS_DEVICE:
+    return idx < topology->osdev_nbobjects ? topology->osdev_level[idx] : NULL;
+  case HWLOC_TYPE_DEPTH_MISC:
+    return idx < topology->misc_nbobjects ? topology->misc_level[idx] : NULL;
+  default:
+    return NULL;
+  }
+}
+
+unsigned hwloc_get_closest_objs (struct hwloc_topology *topology, struct hwloc_obj *src, struct hwloc_obj **objs, unsigned max)
+{
+  struct hwloc_obj *parent, *nextparent, **src_objs;
+  int i,src_nbobjects;
+  unsigned stored = 0;
+  /* Fills objs[] with up to max objects of src's level and returns how many were stored; objects without a cpuset yield 0 */
+  if (!src->cpuset)
+    return 0;
+  /* candidates are all the objects at the same depth as src */
+  src_nbobjects = topology->level_nbobjects[src->depth];
+  src_objs = topology->levels[src->depth];
+
+  parent = src;
+  while (stored < max) {
+    while (1) { /* climb until an ancestor with a different cpuset is found, or stop at the root */
+      nextparent = parent->parent;
+      if (!nextparent)
+	goto out;
+      if (!hwloc_bitmap_isequal(parent->cpuset, nextparent->cpuset))
+	break;
+      parent = nextparent;
+    }
+
+    /* traverse src's objects and find those that are in nextparent and were not in parent */
+    for(i=0; i<src_nbobjects; i++) {
+      if (hwloc_bitmap_isincluded(src_objs[i]->cpuset, nextparent->cpuset)
+	  && !hwloc_bitmap_isincluded(src_objs[i]->cpuset, parent->cpuset)) {
+	objs[stored++] = src_objs[i];
+	if (stored == max)
+	  goto out;
+      }
+    }
+    parent = nextparent;
+  }
+
+ out:
+  return stored;
+}
+
+static int
+hwloc__get_largest_objs_inside_cpuset (struct hwloc_obj *current, hwloc_const_bitmap_t set,
+				       struct hwloc_obj ***res, int *max)
+{
+  /* Recursive worker: stores current if its cpuset equals set, otherwise recurses into each child
+   * with the part of set that intersects that child's cpuset. Each stored object advances *res
+   * and decrements *max. Returns how many objects were stored. */
+  unsigned idx;
+  int found = 0;
+
+  /* the caller must ensure this */
+  if (*max <= 0)
+    return 0;
+
+  /* exact match: take current itself and stop descending */
+  if (hwloc_bitmap_isequal(current->cpuset, set)) {
+    *(*res)++ = current;
+    --*max;
+    return 1;
+  }
+
+  for (idx = 0; idx < current->arity; idx++) {
+    struct hwloc_obj *kid = current->children[idx];
+    /* split out the cpuset part corresponding to this child and see if there's anything to do */
+    hwloc_bitmap_t part = hwloc_bitmap_dup(set);
+
+    hwloc_bitmap_and(part, part, kid->cpuset);
+    if (!hwloc_bitmap_iszero(part))
+      found += hwloc__get_largest_objs_inside_cpuset (kid, part, res, max);
+    hwloc_bitmap_free(part);
+
+    /* if no more room to store remaining objects, return what we got so far
+     * (*max is unchanged and still positive when the child was skipped, so testing it here is harmless) */
+    if (!*max)
+      break;
+  }
+
+  /* total stored by this call and its recursive calls */
+  return found;
+}
+
+int
+hwloc_get_largest_objs_inside_cpuset (struct hwloc_topology *topology, hwloc_const_bitmap_t set,
+				      struct hwloc_obj **objs, int max)
+{
+  struct hwloc_obj *root = topology->levels[0][0];
+
+  /* a set that is not included in the root cpuset is an error, whatever max is */
+  if (!hwloc_bitmap_isincluded(set, root->cpuset))
+    return -1;
+
+  /* nothing can be stored without room in objs[]; otherwise let the recursive
+   * worker fill it from the root and report how many objects it stored */
+  return max > 0 ? hwloc__get_largest_objs_inside_cpuset (root, set, &objs, &max) : 0;
+}
+
+const char *
+hwloc_obj_type_string (hwloc_obj_type_t obj)
+{
+  /* Map an object type to its constant display name.
+   * Any value that is not one of the types listed below yields "Unknown". */
+
+  if (obj == HWLOC_OBJ_SYSTEM) return "System";
+  if (obj == HWLOC_OBJ_MACHINE) return "Machine";
+  if (obj == HWLOC_OBJ_MISC) return "Misc";
+  if (obj == HWLOC_OBJ_GROUP) return "Group";
+  if (obj == HWLOC_OBJ_NUMANODE) return "NUMANode";
+  if (obj == HWLOC_OBJ_PACKAGE) return "Package";
+  if (obj == HWLOC_OBJ_CACHE) return "Cache";
+  if (obj == HWLOC_OBJ_CORE) return "Core";
+  if (obj == HWLOC_OBJ_BRIDGE) return "Bridge";
+  if (obj == HWLOC_OBJ_PCI_DEVICE) return "PCIDev";
+  if (obj == HWLOC_OBJ_OS_DEVICE) return "OSDev";
+  if (obj == HWLOC_OBJ_PU) return "PU";
+  return "Unknown";
+}
+
+hwloc_obj_type_t
+hwloc_obj_type_of_string (const char * string)
+{
+  /* Case-insensitive lookup of a type name; "Node" and "Socket" (backward compat with v1.10) are aliases. */
+  static const struct { const char *name; hwloc_obj_type_t type; } known[] = {
+    { "System", HWLOC_OBJ_SYSTEM }, { "Machine", HWLOC_OBJ_MACHINE }, { "Misc", HWLOC_OBJ_MISC },
+    { "Group", HWLOC_OBJ_GROUP }, { "NUMANode", HWLOC_OBJ_NUMANODE }, { "Node", HWLOC_OBJ_NUMANODE },
+    { "Package", HWLOC_OBJ_PACKAGE }, { "Socket", HWLOC_OBJ_PACKAGE }, { "Cache", HWLOC_OBJ_CACHE },
+    { "Core", HWLOC_OBJ_CORE }, { "PU", HWLOC_OBJ_PU }, { "Bridge", HWLOC_OBJ_BRIDGE },
+    { "PCIDev", HWLOC_OBJ_PCI_DEVICE }, { "OSDev", HWLOC_OBJ_OS_DEVICE }
+  };
+  unsigned i;
+  for (i = 0; i < sizeof(known)/sizeof(known[0]); i++)
+    if (!strcasecmp(string, known[i].name))
+      return known[i].type;
+  return (hwloc_obj_type_t) -1; /* unknown name */
+}
+
+int
+hwloc_obj_type_sscanf(const char *string, hwloc_obj_type_t *typep, int *depthattrp, void *typeattrp, size_t typeattrsize)
+{
+  hwloc_obj_type_t type = (hwloc_obj_type_t) -1;
+  int depthattr = -1; /* stays -1 when the string carries no depth */
+  hwloc_obj_cache_type_t cachetypeattr = (hwloc_obj_cache_type_t) -1; /* unspecified */
+  char *end;
+
+  /* types without depthattr */
+  if (!hwloc_strncasecmp(string, "system", 2)) {
+    type = HWLOC_OBJ_SYSTEM;
+  } else if (!hwloc_strncasecmp(string, "machine", 2)) {
+    type = HWLOC_OBJ_MACHINE;
+  } else if (!hwloc_strncasecmp(string, "node", 1)
+	     || !hwloc_strncasecmp(string, "numa", 1)) { /* matches node and numanode */
+    type = HWLOC_OBJ_NUMANODE;
+  } else if (!hwloc_strncasecmp(string, "package", 2)
+	     || !hwloc_strncasecmp(string, "socket", 2)) { /* backward compat with v1.10 */
+    type = HWLOC_OBJ_PACKAGE;
+  } else if (!hwloc_strncasecmp(string, "core", 2)) {
+    type = HWLOC_OBJ_CORE;
+  } else if (!hwloc_strncasecmp(string, "pu", 2)) {
+    type = HWLOC_OBJ_PU;
+  } else if (!hwloc_strncasecmp(string, "misc", 2)) {
+    type = HWLOC_OBJ_MISC;
+  } else if (!hwloc_strncasecmp(string, "bridge", 2)) {
+    type = HWLOC_OBJ_BRIDGE;
+  } else if (!hwloc_strncasecmp(string, "pci", 2)) {
+    type = HWLOC_OBJ_PCI_DEVICE;
+  } else if (!hwloc_strncasecmp(string, "os", 2)) {
+    type = HWLOC_OBJ_OS_DEVICE;
+
+  /* types with depthattr */
+  } else if (!hwloc_strncasecmp(string, "cache", 2)) {
+    type = HWLOC_OBJ_CACHE;
+  /* "L<digits>" is a cache of that depth; a 'd', 'i' or 'u' right after the digits selects the cache type */
+  } else if ((string[0] == 'l' || string[0] == 'L') && string[1] >= '0' && string[1] <= '9') {
+    type = HWLOC_OBJ_CACHE;
+    depthattr = strtol(string+1, &end, 10);
+    if (*end == 'd') {
+      cachetypeattr = HWLOC_OBJ_CACHE_DATA;
+    } else if (*end == 'i') {
+      cachetypeattr = HWLOC_OBJ_CACHE_INSTRUCTION;
+    } else if (*end == 'u') {
+      cachetypeattr = HWLOC_OBJ_CACHE_UNIFIED;
+    }
+
+  } else if (!hwloc_strncasecmp(string, "group", 2)) {
+    int length;
+    type = HWLOC_OBJ_GROUP;
+    length = strcspn(string, "0123456789"); /* length of the part before the first digit */
+    if (length <= 5 && !hwloc_strncasecmp(string, "group", length)
+	&& string[length] >= '0' && string[length] <= '9') {
+      depthattr = strtol(string+length, &end, 10);
+    }
+  } else
+    return -1;
+  /* report the results; depthattrp and typeattrp are optional */
+  *typep = type;
+  if (depthattrp)
+    *depthattrp = depthattr;
+  if (typeattrp) { /* only written for caches, and only if the caller's buffer is large enough */
+    if (type == HWLOC_OBJ_CACHE && sizeof(hwloc_obj_cache_type_t) <= typeattrsize)
+      memcpy(typeattrp, &cachetypeattr, sizeof(hwloc_obj_cache_type_t));
+  }
+
+  return 0;
+}
+
+/*
+ * Return a short label for a 16-bit PCI class code.
+ *
+ * The full code is looked up first; if it is not listed, the label of its
+ * base class (upper byte) is used; "PCI" is the last-resort answer.
+ */
+static const char *
+hwloc_pci_class_string(unsigned short class_id)
+{
+  /* labels for fully specified class codes */
+  static const struct {
+    unsigned short id;
+    const char *label;
+  } exact[] = {
+    { 0x0001, "VGA" },
+    { 0x0100, "SCSI" }, { 0x0101, "IDE" }, { 0x0102, "Flop" },
+    { 0x0103, "IPI" }, { 0x0104, "RAID" }, { 0x0105, "ATA" },
+    { 0x0106, "SATA" }, { 0x0107, "SAS" }, { 0x0108, "NVMExp" },
+    { 0x0200, "Ether" }, { 0x0201, "TokRn" }, { 0x0202, "FDDI" },
+    { 0x0203, "ATM" }, { 0x0204, "ISDN" }, { 0x0205, "WrdFip" },
+    { 0x0206, "PICMG" }, { 0x0207, "IB" },
+    { 0x0300, "VGA" }, { 0x0301, "XGA" }, { 0x0302, "3D" },
+    { 0x0400, "Video" }, { 0x0401, "Audio" }, { 0x0402, "Phone" },
+    { 0x0403, "Auddv" },
+    { 0x0500, "RAM" }, { 0x0501, "Flash" },
+    { 0x0600, "Host" }, { 0x0601, "ISA" }, { 0x0602, "EISA" },
+    { 0x0603, "MC" }, { 0x0604, "PCI_B" }, { 0x0605, "PCMCIA" },
+    { 0x0606, "Nubus" }, { 0x0607, "CardBus" }, { 0x0608, "RACEway" },
+    { 0x0609, "PCI_SB" }, { 0x060a, "IB_B" },
+    { 0x0700, "Ser" }, { 0x0701, "Para" }, { 0x0702, "MSer" },
+    { 0x0703, "Modm" }, { 0x0704, "GPIB" }, { 0x0705, "SmrtCrd" },
+    { 0x0800, "PIC" }, { 0x0801, "DMA" }, { 0x0802, "Time" },
+    { 0x0803, "RTC" }, { 0x0804, "HtPl" }, { 0x0805, "SD-HtPl" },
+    { 0x0806, "IOMMU" },
+    { 0x0900, "Kbd" }, { 0x0901, "Pen" }, { 0x0902, "Mouse" },
+    { 0x0903, "Scan" }, { 0x0904, "Game" },
+    { 0x0b00, "386" }, { 0x0b01, "486" }, { 0x0b02, "Pent" },
+    { 0x0b10, "Alpha" }, { 0x0b20, "PPC" }, { 0x0b30, "MIPS" },
+    { 0x0b40, "CoProc" },
+    { 0x0c00, "Firw" }, { 0x0c01, "ACCES" }, { 0x0c02, "SSA" },
+    { 0x0c03, "USB" }, { 0x0c04, "Fiber" }, { 0x0c05, "SMBus" },
+    { 0x0c06, "IB" }, { 0x0c07, "IPMI" }, { 0x0c08, "SERCOS" },
+    { 0x0c09, "CANBUS" },
+    { 0x0d00, "IRDA" }, { 0x0d01, "IR" }, { 0x0d10, "RF" },
+    { 0x0d11, "Blueth" }, { 0x0d12, "BroadB" }, { 0x0d20, "802.1a" },
+    { 0x0d21, "802.1b" },
+    { 0x0e00, "I2O" },
+    { 0x0f00, "S-TV" }, { 0x0f01, "S-Aud" }, { 0x0f02, "S-Voice" },
+    { 0x0f03, "S-Data" }
+  };
+  /* fallback labels, keyed by the upper byte of the class code */
+  static const struct {
+    unsigned char base;
+    const char *label;
+  } generic[] = {
+    { 0x00, "PCI" },   { 0x01, "Stor" },  { 0x02, "Net" },   { 0x03, "Disp" },
+    { 0x04, "MM" },    { 0x05, "Mem" },   { 0x06, "Bridg" }, { 0x07, "Comm" },
+    { 0x08, "Syst" },  { 0x09, "In" },    { 0x0a, "Dock" },  { 0x0b, "Proc" },
+    { 0x0c, "Ser" },   { 0x0d, "Wifi" },  { 0x0e, "Intll" }, { 0x0f, "Satel" },
+    { 0x10, "Crypt" }, { 0x11, "Signl" }, { 0x12, "Accel" }, { 0x13, "Instr" },
+    { 0xff, "Oth" }
+  };
+  const int base_class = (class_id & 0xff00) >> 8;
+  unsigned idx;
+
+  for (idx = 0; idx < sizeof(exact) / sizeof(exact[0]); idx++)
+    if (exact[idx].id == class_id)
+      return exact[idx].label;
+
+  for (idx = 0; idx < sizeof(generic) / sizeof(generic[0]); idx++)
+    if (generic[idx].base == base_class)
+      return generic[idx].label;
+
+  return "PCI";
+}
+
+/* Suffix naming a cache type: empty for unified, "d" for data, "i" for
+ * instruction, and "unknown" for any other value. */
+static const char* hwloc_obj_cache_type_letter(hwloc_obj_cache_type_t type)
+{
+  if (type == HWLOC_OBJ_CACHE_UNIFIED)
+    return "";
+  if (type == HWLOC_OBJ_CACHE_DATA)
+    return "d";
+  if (type == HWLOC_OBJ_CACHE_INSTRUCTION)
+    return "i";
+  return "unknown";
+}
+
+/*
+ * Write a short name for the type of obj into string (at most size bytes).
+ *
+ * verbose selects the longer spelling where the code has one (cache, bridge,
+ * network and co-processor OS devices).
+ * Returns whatever the underlying snprintf-style call returns; for types or
+ * OS-device kinds that are not handled, an empty string is stored (only when
+ * size is non-zero) and 0 is returned.
+ */
+int
+hwloc_obj_type_snprintf(char * __hwloc_restrict string, size_t size, hwloc_obj_t obj, int verbose)
+{
+  hwloc_obj_type_t type = obj->type;
+  switch (type) {
+  case HWLOC_OBJ_MISC:
+  case HWLOC_OBJ_SYSTEM:
+  case HWLOC_OBJ_MACHINE:
+  case HWLOC_OBJ_NUMANODE:
+  case HWLOC_OBJ_PACKAGE:
+  case HWLOC_OBJ_CORE:
+  case HWLOC_OBJ_PU:
+    return hwloc_snprintf(string, size, "%s", hwloc_obj_type_string(type));
+  case HWLOC_OBJ_CACHE:
+    return hwloc_snprintf(string, size, "L%u%s%s", obj->attr->cache.depth,
+                          hwloc_obj_cache_type_letter(obj->attr->cache.type),
+                          verbose ? hwloc_obj_type_string(type): "");
+  case HWLOC_OBJ_GROUP:
+    /* TODO: more pretty presentation? */
+    /* a depth of (unsigned) -1 is not printed */
+    if (obj->attr->group.depth != (unsigned) -1)
+      return hwloc_snprintf(string, size, "%s%u", hwloc_obj_type_string(type), obj->attr->group.depth);
+    else
+      return hwloc_snprintf(string, size, "%s", hwloc_obj_type_string(type));
+  case HWLOC_OBJ_BRIDGE:
+    if (verbose)
+      return snprintf(string, size, "Bridge %s->%s",
+                      obj->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_PCI ? "PCI" : "Host",
+                      "PCI");
+    else
+      /* always go through a literal format string */
+      return snprintf(string, size, "%s",
+                      obj->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_PCI ? "PCIBridge" : "HostBridge");
+  case HWLOC_OBJ_PCI_DEVICE:
+    return snprintf(string, size, "PCI %04x:%04x",
+                    obj->attr->pcidev.vendor_id, obj->attr->pcidev.device_id);
+  case HWLOC_OBJ_OS_DEVICE:
+    switch (obj->attr->osdev.type) {
+    case HWLOC_OBJ_OSDEV_BLOCK: return hwloc_snprintf(string, size, "Block");
+    case HWLOC_OBJ_OSDEV_NETWORK: return hwloc_snprintf(string, size, "%s", verbose ? "Network" : "Net");
+    case HWLOC_OBJ_OSDEV_OPENFABRICS: return hwloc_snprintf(string, size, "OpenFabrics");
+    case HWLOC_OBJ_OSDEV_DMA: return hwloc_snprintf(string, size, "DMA");
+    case HWLOC_OBJ_OSDEV_GPU: return hwloc_snprintf(string, size, "GPU");
+    case HWLOC_OBJ_OSDEV_COPROC: return hwloc_snprintf(string, size, "%s", verbose ? "Co-Processor" : "CoProc");
+    default:
+      /* never touch the buffer when the caller passed size == 0 */
+      if (size > 0)
+        *string = '\0';
+      return 0;
+    }
+  default:
+    if (size > 0)
+      *string = '\0';
+    return 0;
+  }
+}
+
+/*
+ * Format the attributes of obj into string (at most size bytes), placing
+ * separator between successive attributes.
+ *
+ * Three groups are written in order: memory sizes, type-specific attributes
+ * (cache, and in verbose mode bridge and PCI device), and, in verbose mode
+ * only, the name=value info pairs (the "lstopoCollapse" info is skipped and
+ * values containing a space are quoted).
+ *
+ * Returns the sum of the lengths reported by the formatting calls, or -1 as
+ * soon as one of them reports an error.  When size is non-zero the buffer
+ * always holds at least an empty string.
+ */
+int
+hwloc_obj_attr_snprintf(char * __hwloc_restrict string, size_t size, hwloc_obj_t obj, const char * separator, int verbose)
+{
+  const char *prefix = "";
+  char *tmp = string;
+  ssize_t tmplen = size;
+  int ret = 0;
+  int res;
+
+  /* make sure we output at least an empty string */
+  if (size)
+    *string = '\0';
+
+  /* print memory attributes */
+  res = 0;
+  if (verbose) {
+    if (obj->memory.local_memory)
+      /* each value is printed with the unit derived from that same value */
+      res = hwloc_snprintf(tmp, tmplen, "%slocal=%lu%s%stotal=%lu%s",
+                           prefix,
+                           (unsigned long) hwloc_memory_size_printf_value(obj->memory.local_memory, verbose),
+                           hwloc_memory_size_printf_unit(obj->memory.local_memory, verbose),
+                           separator,
+                           (unsigned long) hwloc_memory_size_printf_value(obj->memory.total_memory, verbose),
+                           hwloc_memory_size_printf_unit(obj->memory.total_memory, verbose));
+    else if (obj->memory.total_memory)
+      res = hwloc_snprintf(tmp, tmplen, "%stotal=%lu%s",
+                           prefix,
+                           (unsigned long) hwloc_memory_size_printf_value(obj->memory.total_memory, verbose),
+                           hwloc_memory_size_printf_unit(obj->memory.total_memory, verbose));
+  } else {
+    if (obj->memory.local_memory)
+      res = hwloc_snprintf(tmp, tmplen, "%s%lu%s",
+                           prefix,
+                           (unsigned long) hwloc_memory_size_printf_value(obj->memory.local_memory, verbose),
+                           hwloc_memory_size_printf_unit(obj->memory.local_memory, verbose));
+  }
+  if (res < 0)
+    return -1;
+  ret += res;
+  if (ret > 0)
+    prefix = separator;
+  /* on truncation, advance only to the terminating NUL that was written */
+  if (res >= tmplen)
+    res = tmplen>0 ? tmplen - 1 : 0;
+  tmp += res;
+  tmplen -= res;
+
+  /* printf type-specific attributes */
+  res = 0;
+  switch (obj->type) {
+  case HWLOC_OBJ_CACHE:
+    if (verbose) {
+      char assoc[32];
+      if (obj->attr->cache.associativity == -1)
+        snprintf(assoc, sizeof(assoc), "%sfully-associative", separator);
+      else if (obj->attr->cache.associativity == 0)
+        *assoc = '\0';
+      else
+        snprintf(assoc, sizeof(assoc), "%sways=%d", separator, obj->attr->cache.associativity);
+      res = hwloc_snprintf(tmp, tmplen, "%ssize=%lu%s%slinesize=%u%s",
+                           prefix,
+                           (unsigned long) hwloc_memory_size_printf_value(obj->attr->cache.size, verbose),
+                           hwloc_memory_size_printf_unit(obj->attr->cache.size, verbose),
+                           separator, obj->attr->cache.linesize,
+                           assoc);
+    } else
+      res = hwloc_snprintf(tmp, tmplen, "%s%lu%s",
+                           prefix,
+                           (unsigned long) hwloc_memory_size_printf_value(obj->attr->cache.size, verbose),
+                           hwloc_memory_size_printf_unit(obj->attr->cache.size, verbose));
+    break;
+  case HWLOC_OBJ_BRIDGE:
+    if (verbose) {
+      char up[128], down[64];
+      /* upstream is PCI or HOST */
+      if (obj->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_PCI) {
+        char linkspeed[64]= "";
+        if (obj->attr->pcidev.linkspeed)
+          snprintf(linkspeed, sizeof(linkspeed), "%slink=%.2fGB/s", separator, obj->attr->pcidev.linkspeed);
+        snprintf(up, sizeof(up), "busid=%04x:%02x:%02x.%01x%sid=%04x:%04x%sclass=%04x(%s)%s",
+                 obj->attr->pcidev.domain, obj->attr->pcidev.bus, obj->attr->pcidev.dev, obj->attr->pcidev.func, separator,
+                 obj->attr->pcidev.vendor_id, obj->attr->pcidev.device_id, separator,
+                 obj->attr->pcidev.class_id, hwloc_pci_class_string(obj->attr->pcidev.class_id), linkspeed);
+      } else
+        *up = '\0';
+      /* downstream is_PCI */
+      snprintf(down, sizeof(down), "buses=%04x:[%02x-%02x]",
+               obj->attr->bridge.downstream.pci.domain, obj->attr->bridge.downstream.pci.secondary_bus, obj->attr->bridge.downstream.pci.subordinate_bus);
+      /* append at the current position so earlier output is not overwritten */
+      if (*up)
+        res = snprintf(tmp, tmplen, "%s%s%s%s", prefix, up, separator, down);
+      else
+        res = snprintf(tmp, tmplen, "%s%s", prefix, down);
+    }
+    break;
+  case HWLOC_OBJ_PCI_DEVICE:
+    if (verbose) {
+      char linkspeed[64]= "";
+      char busid[16] = "[collapsed]";
+      if (obj->attr->pcidev.linkspeed)
+        snprintf(linkspeed, sizeof(linkspeed), "%slink=%.2fGB/s", separator, obj->attr->pcidev.linkspeed);
+      /* the bus id is only shown when the object has no "lstopoCollapse" info */
+      if (!hwloc_obj_get_info_by_name(obj, "lstopoCollapse"))
+        snprintf(busid, sizeof(busid), "%04x:%02x:%02x.%01x",
+                 obj->attr->pcidev.domain, obj->attr->pcidev.bus, obj->attr->pcidev.dev, obj->attr->pcidev.func);
+      /* append at the current position so earlier output is not overwritten */
+      res = snprintf(tmp, tmplen, "%sbusid=%s%sclass=%04x(%s)%s",
+                     prefix, busid, separator,
+                     obj->attr->pcidev.class_id, hwloc_pci_class_string(obj->attr->pcidev.class_id), linkspeed);
+    }
+    break;
+  default:
+    break;
+  }
+  if (res < 0)
+    return -1;
+  ret += res;
+  if (ret > 0)
+    prefix = separator;
+  if (res >= tmplen)
+    res = tmplen>0 ? tmplen - 1 : 0;
+  tmp += res;
+  tmplen -= res;
+
+  /* printf infos */
+  if (verbose) {
+    unsigned i;
+    for(i=0; i<obj->infos_count; i++) {
+      if (!strcmp(obj->infos[i].name, "lstopoCollapse"))
+        continue;
+      /* values containing a space are wrapped in double quotes */
+      if (strchr(obj->infos[i].value, ' '))
+        res = hwloc_snprintf(tmp, tmplen, "%s%s=\"%s\"",
+                             prefix,
+                             obj->infos[i].name, obj->infos[i].value);
+      else
+        res = hwloc_snprintf(tmp, tmplen, "%s%s=%s",
+                             prefix,
+                             obj->infos[i].name, obj->infos[i].value);
+      if (res < 0)
+        return -1;
+      ret += res;
+      if (res >= tmplen)
+        res = tmplen>0 ? tmplen - 1 : 0;
+      tmp += res;
+      tmplen -= res;
+      if (ret > 0)
+        prefix = separator;
+    }
+  }
+
+  return ret;
+}
+
+
+/*
+ * Print "<type><indexprefix><os_index>" for object l into string, followed by
+ * "(<attributes>)" when hwloc_obj_attr_snprintf() produced any output.
+ * The index part is omitted when os_index is (unsigned) -1, and "#" is used
+ * when _indexprefix is NULL.  Returns the result of the final
+ * hwloc_snprintf() call.
+ */
+int
+hwloc_obj_snprintf(char *string, size_t size,
+    struct hwloc_topology *topology __hwloc_attribute_unused, struct hwloc_obj *l, const char *_indexprefix, int verbose)
+{
+  char typestr[64];
+  char attrstr[128];
+  char idxstr[12] = "";
+  const char *idxprefix = "#";
+
+  if (_indexprefix)
+    idxprefix = _indexprefix;
+
+  if (l->os_index != (unsigned) -1)
+    hwloc_snprintf(idxstr, sizeof(idxstr), "%s%u", idxprefix, l->os_index);
+
+  hwloc_obj_type_snprintf(typestr, sizeof(typestr), l, verbose);
+
+  /* attributes are joined with a single space inside the parentheses */
+  if (hwloc_obj_attr_snprintf(attrstr, sizeof(attrstr), l, " ", verbose) > 0)
+    return hwloc_snprintf(string, size, "%s%s(%s)", typestr, idxstr, attrstr);
+
+  return hwloc_snprintf(string, size, "%s%s", typestr, idxstr);
+}
+
+/*
+ * Print into str (at most size bytes) the union of the cpusets of the nobj
+ * objects in objs; objects whose cpuset is NULL are ignored.
+ * Returns the result of hwloc_bitmap_snprintf(), or -1 when the temporary
+ * bitmap cannot be allocated.
+ */
+int hwloc_obj_cpuset_snprintf(char *str, size_t size, size_t nobj, struct hwloc_obj * const *objs)
+{
+  hwloc_bitmap_t set = hwloc_bitmap_alloc();
+  int res;
+  size_t i; /* same width as nobj so the loop bound can always be reached */
+
+  if (!set)
+    return -1;
+
+  hwloc_bitmap_zero(set);
+  for(i=0; i<nobj; i++)
+    if (objs[i]->cpuset)
+      hwloc_bitmap_or(set, set, objs[i]->cpuset);
+
+  res = hwloc_bitmap_snprintf(str, size, set);
+  hwloc_bitmap_free(set);
+  return res;
+}
diff --git a/ext/hwloc/include/hwloc.h b/ext/hwloc/include/hwloc.h
new file mode 100644
index 0000000..6c8d203
--- /dev/null
+++ b/ext/hwloc/include/hwloc.h
@@ -0,0 +1,2206 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2015 Inria.  All rights reserved.
+ * Copyright © 2009-2012 Université Bordeaux
+ * Copyright © 2009-2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/*=====================================================================
+ *                 PLEASE GO READ THE DOCUMENTATION!
+ *         ------------------------------------------------
+ *               $tarball_directory/doc/doxygen-doc/
+ *                                or
+ *           http://www.open-mpi.org/projects/hwloc/doc/
+ *=====================================================================
+ *
+ * FAIR WARNING: Do NOT expect to be able to figure out all the
+ * subtleties of hwloc by simply reading function prototypes and
+ * constant descriptions here in this file.
+ *
+ * Hwloc has wonderful documentation in both PDF and HTML formats for
+ * your reading pleasure.  The formal documentation explains a LOT of
+ * hwloc-specific concepts, provides definitions, and discusses the
+ * "big picture" for many of the things that you'll find here in this
+ * header file.
+ *
+ * The PDF/HTML documentation was generated via Doxygen; much of what
+ * you'll see in there is also here in this file.  BUT THERE IS A LOT
+ * THAT IS IN THE PDF/HTML THAT IS ***NOT*** IN hwloc.h!
+ *
+ * There are entire paragraph-length descriptions, discussions, and
+ * pretty pictures to explain subtle corner cases, provide concrete
+ * examples, etc.
+ *
+ * Please, go read the documentation.  :-)
+ *
+ * Moreover there are several examples of hwloc use under doc/examples
+ * in the source tree.
+ *
+ *=====================================================================*/
+
+/** \file
+ * \brief The hwloc API.
+ *
+ * See hwloc/bitmap.h for bitmap specific macros.
+ * See hwloc/helper.h for high-level topology traversal helpers.
+ * See hwloc/inlines.h for the actual inline code of some functions below.
+ */
+
+#ifndef HWLOC_H
+#define HWLOC_H
+
+#include <hwloc/autogen/config.h>
+#include <sys/types.h>
+#include <stdio.h>
+#include <string.h>
+#include <limits.h>
+
+/*
+ * Symbol transforms
+ */
+#include <hwloc/rename.h>
+
+/*
+ * Bitmap definitions
+ */
+
+#include <hwloc/bitmap.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/** \defgroup hwlocality_api_version API version
+ * @{
+ */
+
+/** \brief Indicate at build time which hwloc API version is being used. */
+#define HWLOC_API_VERSION 0x00020000
+
+/** \brief Indicate at runtime which hwloc API version was used at build time. */
+HWLOC_DECLSPEC unsigned hwloc_get_api_version(void);
+
+/** \brief Current component and plugin ABI version (see hwloc/plugins.h) */
+#define HWLOC_COMPONENT_ABI 5
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_object_sets Object Sets (hwloc_cpuset_t and hwloc_nodeset_t)
+ *
+ * Hwloc uses bitmaps to represent two distinct kinds of object sets:
+ * CPU sets (::hwloc_cpuset_t) and NUMA node sets (::hwloc_nodeset_t).
+ * These types are both typedefs to a common back end type
+ * (::hwloc_bitmap_t), and therefore all the hwloc bitmap functions
+ * are applicable to both ::hwloc_cpuset_t and ::hwloc_nodeset_t (see
+ * \ref hwlocality_bitmap).
+ *
+ * The rationale for having two different types is that even though
+ * the actions one wants to perform on these types are the same (e.g.,
+ * enable and disable individual items in the set/mask), they're used
+ * in very different contexts: one for specifying which processors to
+ * use and one for specifying which NUMA nodes to use.  Hence, the
+ * name difference is really just to reflect the intent of where the
+ * type is used.
+ *
+ * @{
+ */
+
+/** \brief A CPU set is a bitmap whose bits are set according to CPU
+ * physical OS indexes.
+ *
+ * It may be consulted and modified with the bitmap API as any
+ * ::hwloc_bitmap_t (see hwloc/bitmap.h).
+ *
+ * Each bit may be converted into a PU object using
+ * hwloc_get_pu_obj_by_os_index().
+ */
+typedef hwloc_bitmap_t hwloc_cpuset_t;
+/** \brief A non-modifiable ::hwloc_cpuset_t. */
+typedef hwloc_const_bitmap_t hwloc_const_cpuset_t;
+
+/** \brief A node set is a bitmap whose bits are set according to NUMA
+ * memory node physical OS indexes.
+ *
+ * It may be consulted and modified with the bitmap API as any
+ * ::hwloc_bitmap_t (see hwloc/bitmap.h).
+ * Each bit may be converted into a NUMA node object using
+ * hwloc_get_numanode_obj_by_os_index().
+ *
+ * When binding memory on a system without any NUMA node,
+ * the single main memory bank is considered as NUMA node #0.
+ *
+ * See also \ref hwlocality_helper_nodeset_convert.
+ */
+typedef hwloc_bitmap_t hwloc_nodeset_t;
+/** \brief A non-modifiable ::hwloc_nodeset_t.
+ */
+typedef hwloc_const_bitmap_t hwloc_const_nodeset_t;
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_object_types Object Types
+ * @{
+ */
+
+/** \brief Type of topology object.
+ *
+ * \note Do not rely on the ordering or completeness of the values as new ones
+ * may be defined in the future!  If you need to compare types, use
+ * hwloc_compare_types() instead.
+ */
+typedef enum {
+    /* ***************************************************************
+       WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING
+
+       If new enum values are added here, you MUST also go update the
+       obj_type_order[] and obj_order_type[] arrays in src/topology.c.
+
+       WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING
+       *************************************************************** */
+
+  HWLOC_OBJ_SYSTEM,	/**< \brief Whole system (may be a cluster of machines).
+  			  * The whole system that is accessible to hwloc.
+			  * That may comprise several machines in SSI systems.
+			  */
+  HWLOC_OBJ_MACHINE,	/**< \brief Machine.
+			  * The typical root object type.
+			  * A set of processors and memory with cache
+			  * coherency.
+			  */
+  HWLOC_OBJ_NUMANODE,	/**< \brief NUMA node.
+			  * A set of processors around memory which the
+			  * processors can directly access.
+			  *
+			  * There is always at least one such object in the topology
+			  * even if the machine is not NUMA.
+			  */
+  HWLOC_OBJ_PACKAGE,	/**< \brief Physical package, what goes into a socket.
+			  * In the physical meaning, i.e. that you can add
+			  * or remove physically.
+			  */
+  HWLOC_OBJ_CACHE,	/**< \brief Cache.
+			  * Can be L1i, L1d, L2, L3, ...
+			  */
+  HWLOC_OBJ_CORE,	/**< \brief Core.
+			  * A computation unit (may be shared by several
+			  * logical processors).
+			  */
+  HWLOC_OBJ_PU,		/**< \brief Processing Unit, or (Logical) Processor.
+			  * An execution unit (may share a core with some
+			  * other logical processors, e.g. in the case of
+			  * an SMT core).
+			  *
+			  * Objects of this kind are always reported and can
+			  * thus be used as fallback when others are not.
+			  */
+
+  HWLOC_OBJ_GROUP,	/**< \brief Group objects.
+			  * Objects which do not fit in the above but are
+			  * detected by hwloc and are useful to take into
+			  * account for affinity. For instance, some operating systems
+			  * expose their arbitrary processors aggregation this
+			  * way.  And hwloc may insert such objects to group
+			  * NUMA nodes according to their distances.
+			  *
+			  * These objects are ignored when they do not bring
+			  * any structure.
+			  */
+
+  HWLOC_OBJ_MISC,	/**< \brief Miscellaneous objects.
+			  * Objects without particular meaning, that can e.g. be
+			  * added by the application for its own use, or by hwloc
+			  * for miscellaneous objects such as MemoryDevice.
+			  * These objects are not listed in the main children list,
+			  * but rather in the dedicated misc children list.
+			  * Misc objects may only have Misc objects as children,
+			  * and those are in the dedicated misc children list as well.
+			  * Misc objects have NULL CPU and node sets.
+			  */
+
+  HWLOC_OBJ_BRIDGE,	/**< \brief Bridge.
+			  * Any bridge that connects the host or an I/O bus,
+			  * to another I/O bus.
+			  * They are not added to the topology unless I/O discovery
+			  * is enabled with hwloc_topology_set_flags().
+			  * I/O objects are not listed in the main children list,
+			  * but rather in the dedicated io children list.
+			  * I/O objects have NULL CPU and node sets.
+			  */
+  HWLOC_OBJ_PCI_DEVICE,	/**< \brief PCI device.
+			  * They are not added to the topology unless I/O discovery
+			  * is enabled with hwloc_topology_set_flags().
+			  * I/O objects are not listed in the main children list,
+			  * but rather in the dedicated io children list.
+			  * I/O objects have NULL CPU and node sets.
+			  */
+  HWLOC_OBJ_OS_DEVICE,	/**< \brief Operating system device.
+			  * They are not added to the topology unless I/O discovery
+			  * is enabled with hwloc_topology_set_flags().
+			  * I/O objects are not listed in the main children list,
+			  * but rather in the dedicated io children list.
+			  * I/O objects have NULL CPU and node sets.
+			  */
+
+  HWLOC_OBJ_TYPE_MAX    /**< \private Sentinel value */
+
+    /* ***************************************************************
+       WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING
+
+       If new enum values are added here, you MUST also go update the
+       obj_type_order[] and obj_order_type[] arrays in src/topology.c.
+
+       WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING
+       *************************************************************** */
+} hwloc_obj_type_t;
+
+/** \brief Cache type. */
+typedef enum hwloc_obj_cache_type_e {
+  HWLOC_OBJ_CACHE_UNIFIED,      /**< \brief Unified cache. */
+  HWLOC_OBJ_CACHE_DATA,         /**< \brief Data cache. */
+  HWLOC_OBJ_CACHE_INSTRUCTION   /**< \brief Instruction cache.
+				  * Only used when the HWLOC_TOPOLOGY_FLAG_ICACHES topology flag is set. */
+} hwloc_obj_cache_type_t;
+
+/** \brief Type of one side (upstream or downstream) of an I/O bridge. */
+typedef enum hwloc_obj_bridge_type_e {
+  HWLOC_OBJ_BRIDGE_HOST,	/**< \brief Host-side of a bridge, only possible upstream. */
+  HWLOC_OBJ_BRIDGE_PCI		/**< \brief PCI-side of a bridge. */
+} hwloc_obj_bridge_type_t;
+
+/** \brief Type of an OS device. */
+typedef enum hwloc_obj_osdev_type_e {
+  HWLOC_OBJ_OSDEV_BLOCK,	/**< \brief Operating system block device.
+				  * For instance "sda" on Linux. */
+  HWLOC_OBJ_OSDEV_GPU,		/**< \brief Operating system GPU device.
+				  * For instance ":0.0" for a GL display,
+				  * "card0" for a Linux DRM device. */
+  HWLOC_OBJ_OSDEV_NETWORK,	/**< \brief Operating system network device.
+				  * For instance the "eth0" interface on Linux. */
+  HWLOC_OBJ_OSDEV_OPENFABRICS,	/**< \brief Operating system openfabrics device.
+				  * For instance the "mlx4_0" InfiniBand HCA device on Linux. */
+  HWLOC_OBJ_OSDEV_DMA,		/**< \brief Operating system dma engine device.
+				  * For instance the "dma0chan0" DMA channel on Linux. */
+  HWLOC_OBJ_OSDEV_COPROC	/**< \brief Operating system co-processor device.
+				  * For instance "mic0" for a Xeon Phi (MIC) on Linux,
+				  * "opencl0d0" for an OpenCL device,
+				  * "cuda0" for a CUDA device. */
+} hwloc_obj_osdev_type_t;
+
+/** \brief Compare the depth of two object types
+ *
+ * Types shouldn't be compared as they are, since newer ones may be added in
+ * the future.  This function returns less than, equal to, or greater than zero
+ * respectively if \p type1 objects usually include \p type2 objects, are the
+ * same as \p type2 objects, or are included in \p type2 objects. If the types
+ * can not be compared (because neither is usually contained in the other),
+ * HWLOC_TYPE_UNORDERED is returned.  Object types containing CPUs can always
+ * be compared (usually, a system contains machines which contain nodes which
+ * contain packages which contain caches, which contain cores, which contain
+ * processors).
+ *
+ * \note HWLOC_OBJ_PU will always be the deepest.
+ * \note This does not mean that the actual topology will respect that order:
+ * e.g. as of today cores may also contain caches, and packages may also contain
+ * nodes. This is thus just to be seen as a fallback comparison method.
+ */
+HWLOC_DECLSPEC int hwloc_compare_types (hwloc_obj_type_t type1, hwloc_obj_type_t type2) __hwloc_attribute_const;
+
+enum hwloc_compare_types_e {
+    HWLOC_TYPE_UNORDERED = INT_MAX	/**< \brief Value returned by hwloc_compare_types when types can not be compared. \hideinitializer */
+};
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_objects Object Structure and Attributes
+ * @{
+ */
+
+union hwloc_obj_attr_u;
+
+/** \brief Object memory */
+struct hwloc_obj_memory_s {
+  hwloc_uint64_t total_memory; /**< \brief Total memory (in bytes) in this object and its children */
+  hwloc_uint64_t local_memory; /**< \brief Local memory (in bytes) */
+
+  /** \brief Size of array \p page_types */
+  unsigned page_types_len;
+  /** \brief Array of local memory page types, \c NULL if no local memory and \p page_types_len is 0.
+   *
+   * The array is sorted by increasing \p size fields.
+   * It contains \p page_types_len slots.
+   */
+  struct hwloc_obj_memory_page_type_s {
+    hwloc_uint64_t size;	/**< \brief Size of pages */
+    hwloc_uint64_t count;	/**< \brief Number of pages of this size */
+  } * page_types;
+};
+
+/** \brief Structure of a topology object
+ *
+ * Applications must not modify any field except hwloc_obj.userdata.
+ */
+struct hwloc_obj {
+  /* physical information */
+  hwloc_obj_type_t type;		/**< \brief Type of object */
+  unsigned os_index;			/**< \brief OS-provided physical index number.
+					 * It is not guaranteed unique across the entire machine,
+					 * except for PUs and NUMA nodes.
+					 */
+  char *name;				/**< \brief Object description if any */
+
+  struct hwloc_obj_memory_s memory;	/**< \brief Memory attributes */
+
+  union hwloc_obj_attr_u *attr;		/**< \brief Object type-specific Attributes,
+					 * may be \c NULL if no attribute value was found */
+
+  /* global position */
+  unsigned depth;			/**< \brief Vertical index in the hierarchy.
+					 * If the topology is symmetric, this is equal to the
+					 * parent depth plus one, and also equal to the number
+					 * of parent/child links from the root object to here.
+					 */
+  unsigned logical_index;		/**< \brief Horizontal index in the whole list of similar objects,
+					 * hence guaranteed unique across the entire machine.
+					 * Could be a "cousin_rank" since it's the rank within the "cousin" list below
+					 */
+
+  /* cousins are all objects of the same type (and depth) across the entire topology */
+  struct hwloc_obj *next_cousin;	/**< \brief Next object of same type and depth */
+  struct hwloc_obj *prev_cousin;	/**< \brief Previous object of same type and depth */
+
+  /* children of the same parent are siblings, even if they may have different type and depth */
+  struct hwloc_obj *parent;		/**< \brief Parent, \c NULL if root (system object) */
+  unsigned sibling_rank;		/**< \brief Index in parent's \c children[] array. Or the index in parent's I/O or Misc children list. */
+  struct hwloc_obj *next_sibling;	/**< \brief Next object below the same parent */
+  struct hwloc_obj *prev_sibling;	/**< \brief Previous object below the same parent */
+
+  /* children array below this object (except I/O and Misc children) */
+  unsigned arity;			/**< \brief Number of children */
+  struct hwloc_obj **children;		/**< \brief Children, \c children[0 .. arity -1] */
+  struct hwloc_obj *first_child;	/**< \brief First child */
+  struct hwloc_obj *last_child;		/**< \brief Last child */
+
+  int symmetric_subtree;		/**< \brief Set if the subtree of normal objects below this object is symmetric,
+					  * which means all children and their children have identical subtrees.
+					  * I/O and Misc children are ignored.
+					  *
+					  * If set in the topology root object, lstopo may export the topology
+					  * as a synthetic string.
+					  */
+
+  /* specific list of I/O children */
+  unsigned io_arity;			/**< \brief Number of I/O children */
+  struct hwloc_obj *io_first_child;	/**< \brief First I/O child */
+
+  /* specific list of Misc children */
+  unsigned misc_arity;			/**< \brief Number of Misc children */
+  struct hwloc_obj *misc_first_child;	/**< \brief First Misc child */
+
+  /* cpusets and nodesets */
+  hwloc_cpuset_t cpuset;		/**< \brief CPUs covered by this object
+                                          *
+                                          * This is the set of CPUs for which there are PU objects in the topology
+                                          * under this object, i.e. which are known to be physically contained in this
+                                          * object and known how (the children path between this object and the PU
+                                          * objects).
+                                          *
+                                          * If the HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM configuration flag is set,
+                                          * some of these CPUs may not be allowed for binding, see allowed_cpuset.
+                                          *
+					  * \note All objects have non-NULL CPU and node sets except Misc and I/O objects.
+					  *
+                                          * \note Its value must not be changed, hwloc_bitmap_dup must be used instead.
+                                          */
+  hwloc_cpuset_t complete_cpuset;       /**< \brief The complete CPU set of logical processors of this object,
+                                          *
+                                          * This may include not only the same as the cpuset field, but also the CPUs for
+                                          * which topology information is unknown or incomplete, the offline CPUs, and
+                                          * the CPUs that are ignored when the HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM flag
+                                          * is not set.
+                                          * Thus no corresponding PU object may be found in the topology, because the
+                                          * precise position is undefined. It is however known that it would be somewhere
+                                          * under this object.
+                                          *
+                                          * \note Its value must not be changed, hwloc_bitmap_dup must be used instead.
+                                          */
+  hwloc_cpuset_t allowed_cpuset;        /**< \brief The CPU set of allowed logical processors
+                                          *
+                                          * This includes the CPUs contained in this object which are allowed for
+                                          * binding, i.e. passing them to the hwloc binding functions should not return
+                                          * permission errors.  This is usually restricted by administration rules.
+                                          *
+                                          * If the HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM configuration flag is set,
+                                          * allowed_cpuset may be smaller than cpuset. Otherwise they are identical.
+                                          *
+                                          * \note Its value must not be changed, hwloc_bitmap_dup must be used instead.
+                                          */
+
+  hwloc_nodeset_t nodeset;              /**< \brief NUMA nodes covered by this object or containing this object
+                                          *
+                                          * This is the set of NUMA nodes for which there are NODE objects in the
+                                          * topology under or above this object, i.e. which are known to be physically
+                                          * contained in this object or containing it and known how (the children path
+                                          * between this object and the NODE objects).
+                                          *
+                                          * In the end, these nodes are those that are close to the current object.
+                                          *
+                                          * If the HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM configuration flag is set,
+                                          * some of these nodes may not be allowed for allocation, see allowed_nodeset.
+                                          *
+                                          * If there are no NUMA nodes in the machine, all the memory is close to this
+                                          * object, so only the first bit may be set in \p nodeset.
+                                          *
+					  * \note All objects have non-NULL CPU and node sets except Misc and I/O objects.
+					  *
+                                          * \note Its value must not be changed, hwloc_bitmap_dup must be used instead.
+                                          */
+  hwloc_nodeset_t complete_nodeset;     /**< \brief The complete NUMA node set of this object,
+                                          *
+                                          * This may include not only the same as the nodeset field, but also the NUMA
+                                          * nodes for which topology information is unknown or incomplete, the offline
+                                          * nodes, and the nodes that are ignored when the HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM
+                                          * flag is not set.
+                                          * Thus no corresponding NODE object may be found in the topology, because the
+                                          * precise position is undefined. It is however known that it would be
+                                          * somewhere under this object.
+                                          *
+                                          * If there are no NUMA nodes in the machine, all the memory is close to this
+                                          * object, so only the first bit is set in \p complete_nodeset.
+                                          *
+                                          * \note Its value must not be changed, hwloc_bitmap_dup must be used instead.
+                                          */
+  hwloc_nodeset_t allowed_nodeset;      /**< \brief The set of allowed NUMA memory nodes
+                                          *
+                                          * This includes the NUMA memory nodes contained in this object which are
+                                          * allowed for memory allocation, i.e. passing them to NUMA node-directed
+                                          * memory allocation should not return permission errors. This is usually
+                                          * restricted by administration rules.
+                                          *
+                                          * If the HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM configuration flag is set,
+                                          * allowed_nodeset may be smaller than nodeset. Otherwise they are identical.
+                                          *
+                                          * If there are no NUMA nodes in the machine, all the memory is close to this
+                                          * object, so only the first bit may be set in \p allowed_nodeset.
+                                          *
+                                          * \note Its value must not be changed, hwloc_bitmap_dup must be used instead.
+                                          */
+
+  struct hwloc_distances_s **distances;	/**< \brief Distances between all objects at same depth below this object */
+  unsigned distances_count;
+
+  struct hwloc_obj_info_s *infos;	/**< \brief Array of stringified info type=name. */
+  unsigned infos_count;			/**< \brief Size of infos array. */
+
+  /* misc */
+  void *userdata;			/**< \brief Application-given private data pointer,
+					 * initialized to \c NULL, use it as you wish.
+					 * See hwloc_topology_set_userdata_export_callback() in hwloc/export.h
+					 * if you wish to export this field to XML. */
+};
+/**
+ * \brief Convenience typedef; a pointer to a struct hwloc_obj.
+ */
+typedef struct hwloc_obj * hwloc_obj_t;
+
+/** \brief Object type-specific Attributes */
+union hwloc_obj_attr_u {
+  /** \brief Cache-specific Object Attributes */
+  struct hwloc_cache_attr_s {
+    hwloc_uint64_t size;		  /**< \brief Size of cache in bytes */
+    unsigned depth;			  /**< \brief Depth of cache (e.g., L1, L2, ...etc.) */
+    unsigned linesize;			  /**< \brief Cache-line size in bytes. 0 if unknown */
+    int associativity;			  /**< \brief Ways of associativity,
+    					    *  -1 if fully associative, 0 if unknown */
+    hwloc_obj_cache_type_t type;          /**< \brief Cache type */
+  } cache;
+  /** \brief Group-specific Object Attributes */
+  struct hwloc_group_attr_s {
+    unsigned depth;			  /**< \brief Depth of group object */
+  } group;
+  /** \brief PCI Device specific Object Attributes */
+  struct hwloc_pcidev_attr_s {
+    unsigned short domain; /**< \brief PCI domain number */
+    unsigned char bus, dev, func; /**< \brief PCI bus, device and function numbers */
+    unsigned short class_id; /**< \brief PCI class identifier */
+    unsigned short vendor_id, device_id, subvendor_id, subdevice_id; /**< \brief PCI vendor, device, subvendor and subdevice identifiers */
+    unsigned char revision; /**< \brief PCI revision */
+    float linkspeed; /**< \brief Link speed in GB/s */
+  } pcidev;
+  /** \brief Bridge specific Object Attributes */
+  struct hwloc_bridge_attr_s {
+    union {
+      struct hwloc_pcidev_attr_s pci; /**< \brief PCI attributes of the upstream side */
+    } upstream;
+    hwloc_obj_bridge_type_t upstream_type; /**< \brief Type of the upstream side of the bridge */
+    union {
+      struct {
+	unsigned short domain; /**< \brief PCI domain number */
+	unsigned char secondary_bus, subordinate_bus; /**< \brief Secondary and subordinate PCI bus numbers */
+      } pci;
+    } downstream;
+    hwloc_obj_bridge_type_t downstream_type; /**< \brief Type of the downstream side of the bridge */
+    unsigned depth;
+  } bridge;
+  /** \brief OS Device specific Object Attributes */
+  struct hwloc_osdev_attr_s {
+    hwloc_obj_osdev_type_t type; /**< \brief OS device type */
+  } osdev;
+};
+
+/** \brief Distances between objects
+ *
+ * One object may contain a distance structure describing distances
+ * between all its descendants at a given relative depth. If the
+ * containing object is the root object of the topology, then the
+ * distances are available for all objects in the machine.
+ *
+ * If the \p latency pointer is not \c NULL, the pointed array contains
+ * memory latencies (non-zero values), see below.
+ *
+ * In the future, some other types of distances may be considered.
+ * In these cases, \p latency may be \c NULL.
+ */
+struct hwloc_distances_s {
+  unsigned relative_depth;	/**< \brief Relative depth of the considered objects
+				 * below the object containing this distance information. */
+  unsigned nbobjs;		/**< \brief Number of objects considered in the matrix.
+				 * It is the number of descendant objects at \p relative_depth
+				 * below the containing object.
+				 * It corresponds to the result of hwloc_get_nbobjs_inside_cpuset_by_depth(). */
+
+  float *latency;		/**< \brief Matrix of latencies between objects, stored as a one-dimension array.
+				 * May be \c NULL if the distances considered here are not latencies.
+				 *
+				 * Unless defined by the user, this currently contains latencies
+				 * between NUMA nodes (as reported in the System Locality Distance Information Table
+				 * (SLIT) in the ACPI specification), which may or may not be accurate.
+				 * It corresponds to the latency for accessing the memory of one node
+				 * from a core in another node.
+				 *
+				 * Values are normalized to get 1.0 as the minimal value in the matrix.
+				 * Latency from i-th to j-th object is stored in slot i*nbobjs+j.
+				 */
+  float latency_max;		/**< \brief The maximal value in the latency matrix. */
+  float latency_base;		/**< \brief The multiplier that should be applied to latency matrix
+				 * to retrieve the original OS-provided latencies.
+				 * Usually 10 on Linux since ACPI SLIT uses 10 for local latency.
+				 */
+};
+
+/** \brief Object info */
+struct hwloc_obj_info_s {
+  char *name;	/**< \brief Info name */
+  char *value;	/**< \brief Info value */
+};
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_creation Topology Creation and Destruction
+ * @{
+ */
+
+struct hwloc_topology;
+/** \brief Topology context
+ *
+ * To be initialized with hwloc_topology_init() and built with hwloc_topology_load().
+ */
+typedef struct hwloc_topology * hwloc_topology_t;
+
+/** \brief Allocate a topology context.
+ *
+ * \param[out] topologyp is assigned a pointer to the new allocated context.
+ *
+ * \return 0 on success, -1 on error.
+ */
+HWLOC_DECLSPEC int hwloc_topology_init (hwloc_topology_t *topologyp);
+
+/** \brief Build the actual topology
+ *
+ * Build the actual topology once initialized with hwloc_topology_init() and
+ * tuned with \ref hwlocality_configuration and \ref hwlocality_setsource routines.
+ * No other routine may be called earlier using this topology context.
+ *
+ * \param topology is the topology to be loaded with objects.
+ *
+ * \return 0 on success, -1 on error.
+ *
+ * \note On failure, the topology is reinitialized. It should be either
+ * destroyed with hwloc_topology_destroy() or configured and loaded again.
+ *
+ * \note This function may be called only once per topology.
+ *
+ * \sa hwlocality_configuration and hwlocality_setsource
+ */
+HWLOC_DECLSPEC int hwloc_topology_load(hwloc_topology_t topology);
+
+/** \brief Terminate and free a topology context
+ *
+ * \param topology is the topology to be freed
+ */
+HWLOC_DECLSPEC void hwloc_topology_destroy (hwloc_topology_t topology);
+
+/** \brief Duplicate a topology.
+ *
+ * The entire topology structure as well as its objects
+ * are duplicated into a new one.
+ *
+ * This is useful for keeping a backup while modifying a topology.
+ */
+HWLOC_DECLSPEC int hwloc_topology_dup(hwloc_topology_t *newtopology, hwloc_topology_t oldtopology);
+
+/** \brief Run internal checks on a topology structure
+ *
+ * The program aborts if an inconsistency is detected in the given topology.
+ *
+ * \param topology is the topology to be checked
+ *
+ * \note This routine is only useful to developers.
+ *
+ * \note The input topology should have been previously loaded with
+ * hwloc_topology_load().
+ */
+HWLOC_DECLSPEC void hwloc_topology_check(hwloc_topology_t topology);
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_levels Object levels, depths and types
+ * @{
+ *
+ * Be sure to see the figure in \ref termsanddefs that shows a
+ * complete topology tree, including depths, child/sibling/cousin
+ * relationships, and an example of an asymmetric topology where one
+ * package has fewer caches than its peers.
+ */
+
+/** \brief Get the depth of the hierarchical tree of objects.
+ *
+ * This is the depth of HWLOC_OBJ_PU objects plus one.
+ */
+HWLOC_DECLSPEC unsigned hwloc_topology_get_depth(hwloc_topology_t __hwloc_restrict topology) __hwloc_attribute_pure;
+
+/** \brief Returns the depth of objects of type \p type.
+ *
+ * If no object of this type is present on the underlying architecture, or if
+ * the OS doesn't provide this kind of information, the function returns
+ * HWLOC_TYPE_DEPTH_UNKNOWN.
+ *
+ * If type is absent but a similar type is acceptable, see also
+ * hwloc_get_type_or_below_depth() and hwloc_get_type_or_above_depth().
+ *
+ * If some objects of the given type exist in different levels,
+ * for instance L1 and L2 caches, or L1i and L1d caches,
+ * the function returns HWLOC_TYPE_DEPTH_MULTIPLE.
+ * See hwloc_get_cache_type_depth() in hwloc/helper.h to better handle this
+ * case.
+ *
+ * If an I/O object type is given, the function returns a virtual value
+ * because I/O objects are stored in special levels that are not CPU-related.
+ * This virtual depth may be passed to other hwloc functions such as
+ * hwloc_get_obj_by_depth() but it should not be considered as an actual
+ * depth by the application. In particular, it should not be compared with
+ * any other object depth or with the entire topology depth.
+ */
+HWLOC_DECLSPEC int hwloc_get_type_depth (hwloc_topology_t topology, hwloc_obj_type_t type);
+
+/** \brief Special (negative) depth values that hwloc_get_type_depth() may return
+ * instead of an actual depth in the topology tree.
+ */
+enum hwloc_get_type_depth_e {
+    HWLOC_TYPE_DEPTH_UNKNOWN = -1,    /**< \brief No object of given type exists in the topology. \hideinitializer */
+    HWLOC_TYPE_DEPTH_MULTIPLE = -2,   /**< \brief Objects of given type exist at different depths in the topology. \hideinitializer */
+    HWLOC_TYPE_DEPTH_BRIDGE = -3,     /**< \brief Virtual depth for bridge object level. \hideinitializer */
+    HWLOC_TYPE_DEPTH_PCI_DEVICE = -4, /**< \brief Virtual depth for PCI device object level. \hideinitializer */
+    HWLOC_TYPE_DEPTH_OS_DEVICE = -5,  /**< \brief Virtual depth for software device object level. \hideinitializer */
+    HWLOC_TYPE_DEPTH_MISC = -6        /**< \brief Virtual depth for Misc object. \hideinitializer */
+};
+
+/** \brief Returns the depth of objects of type \p type or below
+ *
+ * If no object of this type is present on the underlying architecture, the
+ * function returns the depth of the first "present" object typically found
+ * inside \p type.
+ *
+ * If some objects of the given type exist in different levels, for instance
+ * L1 and L2 caches, the function returns HWLOC_TYPE_DEPTH_MULTIPLE.
+ */
+static __hwloc_inline int
+hwloc_get_type_or_below_depth (hwloc_topology_t topology, hwloc_obj_type_t type) __hwloc_attribute_pure;
+
+/** \brief Returns the depth of objects of type \p type or above
+ *
+ * If no object of this type is present on the underlying architecture, the
+ * function returns the depth of the first "present" object typically
+ * containing \p type.
+ *
+ * If some objects of the given type exist in different levels, for instance
+ * L1 and L2 caches, the function returns HWLOC_TYPE_DEPTH_MULTIPLE.
+ */
+static __hwloc_inline int
+hwloc_get_type_or_above_depth (hwloc_topology_t topology, hwloc_obj_type_t type) __hwloc_attribute_pure;
+
+/** \brief Returns the type of objects at depth \p depth.
+ *
+ * \return -1 if depth \p depth does not exist.
+ */
+HWLOC_DECLSPEC hwloc_obj_type_t hwloc_get_depth_type (hwloc_topology_t topology, unsigned depth) __hwloc_attribute_pure;
+
+/** \brief Returns the width of level at depth \p depth.
+ */
+HWLOC_DECLSPEC unsigned hwloc_get_nbobjs_by_depth (hwloc_topology_t topology, unsigned depth) __hwloc_attribute_pure;
+
+/** \brief Returns the width of level type \p type
+ *
+ * If no object for that type exists, 0 is returned.
+ * If there are several levels with objects of that type, -1 is returned.
+ */
+static __hwloc_inline int
+hwloc_get_nbobjs_by_type (hwloc_topology_t topology, hwloc_obj_type_t type) __hwloc_attribute_pure;
+
+/** \brief Returns the top-object of the topology-tree.
+ *
+ * Its type is typically ::HWLOC_OBJ_MACHINE but it could be different
+ * for complex topologies.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_root_obj (hwloc_topology_t topology) __hwloc_attribute_pure;
+
+/** \brief Returns the topology object at logical index \p idx from depth \p depth */
+HWLOC_DECLSPEC hwloc_obj_t hwloc_get_obj_by_depth (hwloc_topology_t topology, unsigned depth, unsigned idx) __hwloc_attribute_pure;
+
+/** \brief Returns the topology object at logical index \p idx with type \p type
+ *
+ * If no object for that type exists, \c NULL is returned.
+ * If there are several levels with objects of that type, \c NULL is returned
+ * and the caller may fall back to hwloc_get_obj_by_depth().
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_obj_by_type (hwloc_topology_t topology, hwloc_obj_type_t type, unsigned idx) __hwloc_attribute_pure;
+
+/** \brief Returns the next object at depth \p depth.
+ *
+ * If \p prev is \c NULL, return the first object at depth \p depth.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_next_obj_by_depth (hwloc_topology_t topology, unsigned depth, hwloc_obj_t prev);
+
+/** \brief Returns the next object of type \p type.
+ *
+ * If \p prev is \c NULL, return the first object of type \p type.  If
+ * there are multiple depths or no depth for the given type, return \c NULL
+ * and let the caller fall back to hwloc_get_next_obj_by_depth().
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_next_obj_by_type (hwloc_topology_t topology, hwloc_obj_type_t type,
+			    hwloc_obj_t prev);
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_object_strings Manipulating Object Type, Sets and Attributes as Strings
+ * @{
+ */
+
+/** \brief Return a stringified topology object type */
+HWLOC_DECLSPEC const char * hwloc_obj_type_string (hwloc_obj_type_t type) __hwloc_attribute_const;
+
+/** \brief Return an object type and attributes from a type string.
+ *
+ * Convert strings such as "Package" or "Cache" into the corresponding types.
+ * Matching is case-insensitive, and only the first letters are actually
+ * required to match.
+ *
+ * Types that have specific attributes, for instance caches and groups,
+ * may be returned in \p depthattrp and \p typeattrp. They are ignored
+ * when these pointers are \c NULL.
+ *
+ * For instance "L2i" or "L2iCache" would return
+ * type HWLOC_OBJ_CACHE in \p typep, 2 in \p depthattrp,
+ * and HWLOC_OBJ_CACHE_TYPE_INSTRUCTION in \p typeattrp
+ * (this last pointer should point to a hwloc_obj_cache_type_t).
+ * "Group3" would return type HWLOC_OBJ_GROUP type and 3 in \p depthattrp.
+ * Attributes that are not specified in the string (for instance "Group"
+ * without a depth, or "L2Cache" without a cache type) are set to -1.
+ *
+ * \p typeattrp is only filled if the size specified in \p typeattrsize
+ * is large enough. It is currently only used for caches, and the required
+ * size is at least the size of hwloc_obj_cache_type_t.
+ *
+ * \return 0 if a type was correctly identified, otherwise -1.
+ *
+ * \note This is an extended version of the now deprecated hwloc_obj_type_of_string()
+ */
+HWLOC_DECLSPEC int hwloc_obj_type_sscanf(const char *string,
+					 hwloc_obj_type_t *typep,
+					 int *depthattrp,
+					 void *typeattrp, size_t typeattrsize);
+
+/** \brief Stringify the type of a given topology object into a human-readable form.
+ *
+ * It differs from hwloc_obj_type_string() because it prints type attributes such
+ * as cache depth and type.
+ *
+ * If \p size is 0, \p string may safely be \c NULL.
+ *
+ * \return the number of characters that were actually written if not truncating,
+ * or that would have been written (not including the ending \\0).
+ */
+HWLOC_DECLSPEC int hwloc_obj_type_snprintf(char * __hwloc_restrict string, size_t size, hwloc_obj_t obj,
+				   int verbose);
+
+/** \brief Stringify the attributes of a given topology object into a human-readable form.
+ *
+ * Attribute values are separated by \p separator.
+ *
+ * Only the major attributes are printed in non-verbose mode.
+ *
+ * If \p size is 0, \p string may safely be \c NULL.
+ *
+ * \return the number of characters that were actually written if not truncating,
+ * or that would have been written (not including the ending \\0).
+ */
+HWLOC_DECLSPEC int hwloc_obj_attr_snprintf(char * __hwloc_restrict string, size_t size, hwloc_obj_t obj, const char * __hwloc_restrict separator,
+				   int verbose);
+
+/** \brief Stringify the cpuset containing a set of objects.
+ *
+ * If \p size is 0, \p string may safely be \c NULL.
+ *
+ * \return the number of characters that were actually written if not truncating,
+ * or that would have been written (not including the ending \\0).
+ */
+HWLOC_DECLSPEC int hwloc_obj_cpuset_snprintf(char * __hwloc_restrict str, size_t size, size_t nobj, const hwloc_obj_t * __hwloc_restrict objs);
+
+/** \brief Search the given key name in object infos and return the corresponding value.
+ *
+ * If multiple keys match the given name, only the first one is returned.
+ *
+ * \return \c NULL if no such key exists.
+ */
+static __hwloc_inline const char *
+hwloc_obj_get_info_by_name(hwloc_obj_t obj, const char *name) __hwloc_attribute_pure;
+
+/** \brief Add the given info name and value pair to the given object.
+ *
+ * The info is appended to the existing info array even if another key
+ * with the same name already exists.
+ *
+ * The input strings are copied before being added in the object infos.
+ *
+ * \note This function may be used to enforce object colors in the lstopo
+ * graphical output by using "lstopoStyle" as a name and "Background=#rrggbb"
+ * as a value. See CUSTOM COLORS in the lstopo(1) manpage for details.
+ *
+ * \note If \p value contains some non-printable characters, they will
+ * be dropped when exporting to XML, see hwloc_topology_export_xml() in hwloc/export.h.
+ */
+HWLOC_DECLSPEC void hwloc_obj_add_info(hwloc_obj_t obj, const char *name, const char *value);
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_cpubinding CPU binding
+ *
+ * It is often useful to call hwloc_bitmap_singlify() first so that a single CPU
+ * remains in the set. This way, the process will not even migrate between
+ * different CPUs inside the given set.
+ * Some operating systems also only support that kind of binding.
+ *
+ * Some operating systems do not provide all hwloc-supported
+ * mechanisms to bind processes, threads, etc.
+ * hwloc_topology_get_support() may be used to query about the actual CPU
+ * binding support in the currently used operating system.
+ *
+ * When the requested binding operation is not available and the
+ * ::HWLOC_CPUBIND_STRICT flag was passed, the function returns -1.
+ * \p errno is set to \c ENOSYS when it is not possible to bind the requested kind of object
+ * processes/threads. errno is set to \c EXDEV when the requested cpuset
+ * can not be enforced (e.g. some systems only allow one CPU, and some
+ * other systems only allow one NUMA node).
+ *
+ * If ::HWLOC_CPUBIND_STRICT was not passed, the function may fail as well,
+ * or the operating system may use a slightly different operation
+ * (with side-effects, smaller binding set, etc.)
+ * when the requested operation is not exactly supported.
+ *
+ * The most portable version that should be preferred over the others,
+ * whenever possible, is the following one which just binds the current program,
+ * assuming it is single-threaded:
+ *
+ * \code
+ * hwloc_set_cpubind(topology, set, 0),
+ * \endcode
+ *
+ * If the program may be multithreaded, the following one should be preferred
+ * to only bind the current thread:
+ *
+ * \code
+ * hwloc_set_cpubind(topology, set, HWLOC_CPUBIND_THREAD),
+ * \endcode
+ *
+ * \sa Some example codes are available under doc/examples/ in the source tree.
+ *
+ * \note To unbind, just call the binding function with either a full cpuset or
+ * a cpuset equal to the system cpuset.
+ *
+ * \note On some operating systems, CPU binding may have effects on memory binding, see
+ * ::HWLOC_CPUBIND_NOMEMBIND
+ *
+ * \note Running lstopo --top or hwloc-ps can be a very convenient tool to check
+ * how binding actually happened.
+ * @{
+ */
+
+/** \brief Process/Thread binding flags.
+ *
+ * These bit flags can be used to refine the binding policy.
+ *
+ * The default (0) is to bind the current process, assumed to be
+ * single-threaded, in a non-strict way.  This is the most portable
+ * way to bind as all operating systems usually provide it.
+ *
+ * \note Not all systems support all kinds of binding.  See the
+ * "Detailed Description" section of \ref hwlocality_cpubinding for a
+ * description of errors that can occur.
+ */
+typedef enum {
+  /** \brief Bind all threads of the current (possibly) multithreaded process.
+   * \hideinitializer */
+  HWLOC_CPUBIND_PROCESS = (1<<0),
+
+  /** \brief Bind current thread of current process.
+   * \hideinitializer */
+  HWLOC_CPUBIND_THREAD = (1<<1),
+
+  /** \brief Request for strict binding from the OS.
+   *
+   * By default, when the designated CPUs are all busy while other
+   * CPUs are idle, operating systems may execute the thread/process
+   * on those other CPUs instead of the designated CPUs, to let them
+   * progress anyway.  Strict binding means that the thread/process
+   * will _never_ execute on other cpus than the designated CPUs, even
+   * when those are busy with other tasks and other CPUs are idle.
+   *
+   * \note Depending on the operating system, strict binding may not
+   * be possible (e.g., the OS does not implement it) or not allowed
+   * (e.g., for administrative reasons), and the function will fail
+   * in that case.
+   *
+   * When retrieving the binding of a process, this flag checks
+   * whether all its threads actually have the same binding. If the
+   * flag is not given, the binding of each thread will be
+   * accumulated.
+   *
+   * \note This flag is meaningless when retrieving the binding of a
+   * thread.
+   * \hideinitializer
+   */
+  HWLOC_CPUBIND_STRICT = (1<<2),
+
+  /** \brief Avoid any effect on memory binding
+   *
+   * On some operating systems, some CPU binding function would also
+   * bind the memory on the corresponding NUMA node.  It is often not
+   * a problem for the application, but if it is, setting this flag
+   * will make hwloc avoid using OS functions that would also bind
+   * memory.  This will however reduce the support of CPU bindings,
+   * i.e. potentially return -1 with errno set to ENOSYS in some
+   * cases.
+   *
+   * This flag is only meaningful when used with functions that set
+   * the CPU binding.  It is ignored when used with functions that get
+   * CPU binding information.
+   * \hideinitializer
+   */
+  HWLOC_CPUBIND_NOMEMBIND = (1<<3)
+} hwloc_cpubind_flags_t;
+
+/** \brief Bind current process or thread on cpus given in physical bitmap \p set.
+ *
+ * \return -1 with errno set to ENOSYS if the action is not supported
+ * \return -1 with errno set to EXDEV if the binding cannot be enforced
+ */
+HWLOC_DECLSPEC int hwloc_set_cpubind(hwloc_topology_t topology, hwloc_const_cpuset_t set, int flags);
+
+/** \brief Get current process or thread binding.
+ *
+ * Writes into \p set the physical cpuset which the process or thread (according to \e
+ * flags) was last bound to.
+ */
+HWLOC_DECLSPEC int hwloc_get_cpubind(hwloc_topology_t topology, hwloc_cpuset_t set, int flags);
+
+/** \brief Bind a process \p pid on cpus given in physical bitmap \p set.
+ *
+ * \note \p hwloc_pid_t is \p pid_t on Unix platforms,
+ * and \p HANDLE on native Windows platforms.
+ *
+ * \note As a special case on Linux, if a tid (thread ID) is supplied
+ * instead of a pid (process ID) and HWLOC_CPUBIND_THREAD is passed in flags,
+ * the binding is applied to that specific thread.
+ *
+ * \note On non-Linux systems, HWLOC_CPUBIND_THREAD can not be used in \p flags.
+ */
+HWLOC_DECLSPEC int hwloc_set_proc_cpubind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_const_cpuset_t set, int flags);
+
+/** \brief Get the current physical binding of process \p pid.
+ *
+ * \note \p hwloc_pid_t is \p pid_t on Unix platforms,
+ * and \p HANDLE on native Windows platforms.
+ *
+ * \note As a special case on Linux, if a tid (thread ID) is supplied
+ * instead of a pid (process ID) and HWLOC_CPUBIND_THREAD is passed in flags,
+ * the binding for that specific thread is returned.
+ *
+ * \note On non-Linux systems, HWLOC_CPUBIND_THREAD can not be used in \p flags.
+ */
+HWLOC_DECLSPEC int hwloc_get_proc_cpubind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_cpuset_t set, int flags);
+
+#ifdef hwloc_thread_t
+/** \brief Bind a thread \p thread on cpus given in physical bitmap \p set.
+ *
+ * \note \p hwloc_thread_t is \p pthread_t on Unix platforms,
+ * and \p HANDLE on native Windows platforms.
+ *
+ * \note HWLOC_CPUBIND_PROCESS can not be used in \p flags.
+ */
+HWLOC_DECLSPEC int hwloc_set_thread_cpubind(hwloc_topology_t topology, hwloc_thread_t thread, hwloc_const_cpuset_t set, int flags);
+#endif
+
+#ifdef hwloc_thread_t
+/** \brief Get the current physical binding of thread \p thread.
+ *
+ * \note \p hwloc_thread_t is \p pthread_t on Unix platforms,
+ * and \p HANDLE on native Windows platforms.
+ *
+ * \note HWLOC_CPUBIND_PROCESS can not be used in \p flags.
+ */
+HWLOC_DECLSPEC int hwloc_get_thread_cpubind(hwloc_topology_t topology, hwloc_thread_t thread, hwloc_cpuset_t set, int flags);
+#endif
+
+/** \brief Get the last physical CPU where the current process or thread ran.
+ *
+ * The operating system may move some tasks from one processor
+ * to another at any time according to their binding,
+ * so this function may return something that is already
+ * outdated.
+ *
+ * \p flags can include either HWLOC_CPUBIND_PROCESS or HWLOC_CPUBIND_THREAD to
+ * specify whether the query should be for the whole process (union of all CPUs
+ * on which all threads are running), or only the current thread. If the
+ * process is single-threaded, flags can be set to zero to let hwloc use
+ * whichever method is available on the underlying OS.
+ */
+HWLOC_DECLSPEC int hwloc_get_last_cpu_location(hwloc_topology_t topology, hwloc_cpuset_t set, int flags);
+
+/** \brief Get the last physical CPU where a process ran.
+ *
+ * The operating system may move some tasks from one processor
+ * to another at any time according to their binding,
+ * so this function may return something that is already
+ * outdated.
+ *
+ * \note \p hwloc_pid_t is \p pid_t on Unix platforms,
+ * and \p HANDLE on native Windows platforms.
+ *
+ * \note As a special case on Linux, if a tid (thread ID) is supplied
+ * instead of a pid (process ID) and HWLOC_CPUBIND_THREAD is passed in flags,
+ * the last CPU location of that specific thread is returned.
+ *
+ * \note On non-Linux systems, HWLOC_CPUBIND_THREAD can not be used in \p flags.
+ */
+HWLOC_DECLSPEC int hwloc_get_proc_last_cpu_location(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_cpuset_t set, int flags);
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_membinding Memory binding
+ *
+ * Memory binding can be done three ways:
+ *
+ * - explicit memory allocation thanks to hwloc_alloc_membind() and friends:
+ *   the binding will have effect on the memory allocated by these functions.
+ * - implicit memory binding through binding policy: hwloc_set_membind() and
+ *   friends only define the current policy of the process, which will be
+ *   applied to the subsequent calls to malloc() and friends.
+ * - migration of existing memory ranges, thanks to hwloc_set_area_membind()
+ *   and friends, which move already-allocated data.
+ *
+ * Not all operating systems support all three ways.
+ * hwloc_topology_get_support() may be used to query about the actual memory
+ * binding support in the currently used operating system.
+ *
+ * When the requested binding operation is not available and the
+ * ::HWLOC_MEMBIND_STRICT flag was passed, the function returns -1.
+ * \p errno will be set to \c ENOSYS when the system does not support
+ * the specified action or policy
+ * (e.g., some systems only allow binding memory on a per-thread
+ * basis, whereas other systems only allow binding memory for all
+ * threads in a process).
+ * \p errno will be set to EXDEV when the requested cpuset can not be enforced
+ * (e.g., some systems only allow binding memory to a single NUMA node).
+ *
+ * If ::HWLOC_MEMBIND_STRICT was not passed, the function may fail as well,
+ * or the operating system may use a slightly different operation
+ * (with side-effects, smaller binding set, etc.)
+ * when the requested operation is not exactly supported.
+ *
+ * The most portable form that should be preferred over the others
+ * whenever possible is as follows.
+ * It allocates some memory hopefully bound to the specified set.
+ * To do so, hwloc will possibly have to change the current memory
+ * binding policy in order to actually get the memory bound, if the OS
+ * does not provide any other way to simply allocate bound memory
+ * without changing the policy for all allocations. That is the
+ * difference with hwloc_alloc_membind(), which will never change the
+ * current memory binding policy.
+ *
+ * \code
+ * hwloc_alloc_membind_policy(topology, size, set,
+ *                            HWLOC_MEMBIND_BIND, 0);
+ * \endcode
+ *
+ * Each hwloc memory binding function is available in two forms: one
+ * that takes a CPU set argument and another that takes a NUMA memory
+ * node set argument (see \ref hwlocality_object_sets and \ref
+ * hwlocality_bitmap for a discussion of CPU sets and NUMA memory node
+ * sets).  The names of the latter form end with _nodeset.  It is also
+ * possible to convert between CPU set and node set using
+ * hwloc_cpuset_to_nodeset() or hwloc_cpuset_from_nodeset().
+ *
+ * \sa Some example codes are available under doc/examples/ in the source tree.
+ *
+ * \note On some operating systems, memory binding affects the CPU
+ * binding; see ::HWLOC_MEMBIND_NOCPUBIND
+ * @{
+ */
+
+/** \brief Memory binding policy.
+ *
+ * These constants can be used to choose the binding policy.  Only one policy can
+ * be used at a time (i.e., the values cannot be OR'ed together).
+ *
+ * Not all systems support all kinds of binding.
+ * hwloc_topology_get_support() may be used to query about the actual memory
+ * binding policy support in the currently used operating system.
+ * See the "Detailed Description" section of \ref hwlocality_membinding
+ * for a description of errors that can occur.
+ */
+typedef enum {
+  /** \brief Reset the memory allocation policy to the system default.
+   * Depending on the operating system, this may correspond to
+   * HWLOC_MEMBIND_FIRSTTOUCH (Linux),
+   * or HWLOC_MEMBIND_BIND (AIX, HP-UX, OSF, Solaris, Windows).
+   * \hideinitializer */
+  HWLOC_MEMBIND_DEFAULT =	0,
+
+  /** \brief Allocate memory
+   * but do not immediately bind it to a specific locality. Instead,
+   * each page in the allocation is bound only when it is first
+   * touched. Pages are individually bound to the local NUMA node of
+   * the first thread that touches them. If there is not enough memory
+   * on the node, allocation may be done in the specified cpuset
+   * before allocating on other nodes.
+   * \hideinitializer */
+  HWLOC_MEMBIND_FIRSTTOUCH =	1,
+
+  /** \brief Allocate memory on the specified nodes.
+   * \hideinitializer */
+  HWLOC_MEMBIND_BIND =		2,
+
+  /** \brief Allocate memory on the given nodes in an interleaved
+   * / round-robin manner.  The precise layout of the memory across
+   * multiple NUMA nodes is OS/system specific. Interleaving can be
+   * useful when threads distributed across the specified NUMA nodes
+   * will all be accessing the whole memory range concurrently, since
+   * the interleave will then balance the memory references.
+   * \hideinitializer */
+  HWLOC_MEMBIND_INTERLEAVE =	3,
+
+  /** \brief Replicate memory on the given nodes; reads from this
+   * memory will attempt to be serviced from the NUMA node local to
+   * the reading thread. Replicating can be useful when multiple
+   * threads from the specified NUMA nodes will be sharing the same
+   * read-only data.
+   *
+   * This policy can only be used with existing memory allocations
+   * (i.e., the hwloc_set_*membind*() functions); it cannot be used
+   * with functions that allocate new memory (i.e., the hwloc_alloc*()
+   * functions).
+   * \hideinitializer */
+  HWLOC_MEMBIND_REPLICATE =	4,
+
+  /** \brief For each page bound with this policy, the next time
+   * it is touched (and next time only), it is moved from its current
+   * location to the local NUMA node of the thread where the memory
+   * reference occurred (if it needs to be moved at all).
+   * \hideinitializer */
+  HWLOC_MEMBIND_NEXTTOUCH =	5,
+
+  /** \brief Returned by get_membind() functions when multiple
+   * threads or parts of a memory area have differing memory binding
+   * policies.
+   * \hideinitializer */
+  HWLOC_MEMBIND_MIXED = -1
+} hwloc_membind_policy_t;
+
+/** \brief Memory binding flags.
+ *
+ * These flags can be used to refine the binding policy.
+ * All flags can be logically OR'ed together with the exception of
+ * ::HWLOC_MEMBIND_PROCESS and ::HWLOC_MEMBIND_THREAD;
+ * these two flags are mutually exclusive.
+ *
+ * Not all systems support all kinds of binding.
+ * hwloc_topology_get_support() may be used to query about the actual memory
+ * binding support in the currently used operating system.
+ * See the "Detailed Description" section of \ref hwlocality_membinding
+ * for a description of errors that can occur.
+ */
+typedef enum {
+  /** \brief Set policy for all threads of the specified (possibly
+   * multithreaded) process.  This flag is mutually exclusive with
+   * ::HWLOC_MEMBIND_THREAD.
+   * \hideinitializer */
+  HWLOC_MEMBIND_PROCESS =       (1<<0),
+
+ /** \brief Set policy for a specific thread of the current process.
+  * This flag is mutually exclusive with ::HWLOC_MEMBIND_PROCESS.
+  * \hideinitializer */
+  HWLOC_MEMBIND_THREAD =        (1<<1),
+
+ /** \brief Request strict binding from the OS.  The function will fail if
+  * the binding can not be guaranteed / completely enforced.
+  *
+  * This flag has slightly different meanings depending on which
+  * function it is used with.
+  * \hideinitializer  */
+  HWLOC_MEMBIND_STRICT =        (1<<2),
+
+ /** \brief Migrate existing allocated memory.  If the memory cannot
+  * be migrated and the ::HWLOC_MEMBIND_STRICT flag is passed, an error
+  * will be returned.
+  * \hideinitializer  */
+  HWLOC_MEMBIND_MIGRATE =       (1<<3),
+
+  /** \brief Avoid any effect on CPU binding.
+   *
+   * On some operating systems, some underlying memory binding
+   * functions also bind the application to the corresponding CPU(s).
+   * Using this flag will cause hwloc to avoid using OS functions that
+   * could potentially affect CPU bindings.  Note, however, that using
+   * NOCPUBIND may reduce hwloc's overall memory binding
+   * support. Specifically: some of hwloc's memory binding functions
+   * may fail with errno set to ENOSYS when used with NOCPUBIND.
+   * \hideinitializer
+   */
+  HWLOC_MEMBIND_NOCPUBIND =     (1<<4)
+} hwloc_membind_flags_t;
+
+/** \brief Set the default memory binding policy of the current
+ * process or thread to prefer the NUMA node(s) specified by physical \p nodeset
+ *
+ * If neither ::HWLOC_MEMBIND_PROCESS nor ::HWLOC_MEMBIND_THREAD is
+ * specified, the current process is assumed to be single-threaded.
+ * This is the most portable form as it permits hwloc to use either
+ * process-based OS functions or thread-based OS functions, depending
+ * on which are available.
+ *
+ * \return -1 with errno set to ENOSYS if the action is not supported
+ * \return -1 with errno set to EXDEV if the binding cannot be enforced
+ */
+HWLOC_DECLSPEC int hwloc_set_membind_nodeset(hwloc_topology_t topology, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags);
+
+/** \brief Set the default memory binding policy of the current
+ * process or thread to prefer the NUMA node(s) near the specified physical \p
+ * cpuset
+ *
+ * If neither ::HWLOC_MEMBIND_PROCESS nor ::HWLOC_MEMBIND_THREAD is
+ * specified, the current process is assumed to be single-threaded.
+ * This is the most portable form as it permits hwloc to use either
+ * process-based OS functions or thread-based OS functions, depending
+ * on which are available.
+ *
+ * \return -1 with errno set to ENOSYS if the action is not supported
+ * \return -1 with errno set to EXDEV if the binding cannot be enforced
+ */
+HWLOC_DECLSPEC int hwloc_set_membind(hwloc_topology_t topology, hwloc_const_cpuset_t cpuset, hwloc_membind_policy_t policy, int flags);
+
+/** \brief Query the default memory binding policy and physical locality of the
+ * current process or thread.
+ *
+ * This function has two output parameters: \p nodeset and \p policy.
+ * The values returned in these parameters depend on both the \p flags
+ * passed in and the current memory binding policies and nodesets in
+ * the queried target.
+ *
+ * Passing the ::HWLOC_MEMBIND_PROCESS flag specifies that the query
+ * target is the current policies and nodesets for all the threads in
+ * the current process.  Passing ::HWLOC_MEMBIND_THREAD specifies that
+ * the query target is the current policy and nodeset for only the
+ * thread invoking this function.
+ *
+ * If neither of these flags are passed (which is the most portable
+ * method), the process is assumed to be single threaded.  This allows
+ * hwloc to use either process-based OS functions or thread-based OS
+ * functions, depending on which are available.
+ *
+ * ::HWLOC_MEMBIND_STRICT is only meaningful when ::HWLOC_MEMBIND_PROCESS
+ * is also specified.  In this case, hwloc will check the default
+ * memory policies and nodesets for all threads in the process.  If
+ * they are not identical, -1 is returned and errno is set to EXDEV.
+ * If they are identical, the values are returned in \p nodeset and \p
+ * policy.
+ *
+ * Otherwise, if ::HWLOC_MEMBIND_PROCESS is specified (and
+ * ::HWLOC_MEMBIND_STRICT is \em not specified), \p nodeset is set to
+ * the logical OR of all threads' default nodeset.  If all threads'
+ * default policies are the same, \p policy is set to that policy.  If
+ * they are different, \p policy is set to ::HWLOC_MEMBIND_MIXED.
+ *
+ * In the ::HWLOC_MEMBIND_THREAD case (or when neither
+ * ::HWLOC_MEMBIND_PROCESS nor ::HWLOC_MEMBIND_THREAD is specified), there
+ * is only one nodeset and policy; they are returned in \p nodeset and
+ * \p policy, respectively.
+ *
+ * If any other flags are specified, -1 is returned and errno is set
+ * to EINVAL.
+ */
+HWLOC_DECLSPEC int hwloc_get_membind_nodeset(hwloc_topology_t topology, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags);
+
+/** \brief Query the default memory binding policy and physical locality of the
+ * current process or thread (the locality is returned in \p cpuset as
+ * CPUs near the locality's actual NUMA node(s)).
+ *
+ * This function has two output parameters: \p cpuset and \p policy.
+ * The values returned in these parameters depend on both the \p flags
+ * passed in and the current memory binding policies and nodesets in
+ * the queried target.
+ *
+ * Passing the ::HWLOC_MEMBIND_PROCESS flag specifies that the query
+ * target is the current policies and nodesets for all the threads in
+ * the current process.  Passing ::HWLOC_MEMBIND_THREAD specifies that
+ * the query target is the current policy and nodeset for only the
+ * thread invoking this function.
+ *
+ * If neither of these flags are passed (which is the most portable
+ * method), the process is assumed to be single threaded.  This allows
+ * hwloc to use either process-based OS functions or thread-based OS
+ * functions, depending on which are available.
+ *
+ * ::HWLOC_MEMBIND_STRICT is only meaningful when ::HWLOC_MEMBIND_PROCESS
+ * is also specified.  In this case, hwloc will check the default
+ * memory policies and nodesets for all threads in the process.  If
+ * they are not identical, -1 is returned and errno is set to EXDEV.
+ * If they are identical, the policy is returned in \p policy.  \p
+ * cpuset is set to the union of CPUs near the NUMA node(s) in the
+ * nodeset.
+ *
+ * Otherwise, if ::HWLOC_MEMBIND_PROCESS is specified (and
+ * ::HWLOC_MEMBIND_STRICT is \em not specified), the default nodeset
+ * from each thread is logically OR'ed together.  \p cpuset is set to
+ * the union of CPUs near the NUMA node(s) in the resulting nodeset.
+ * If all threads' default policies are the same, \p policy is set to
+ * that policy.  If they are different, \p policy is set to
+ * ::HWLOC_MEMBIND_MIXED.
+ *
+ * In the ::HWLOC_MEMBIND_THREAD case (or when neither
+ * ::HWLOC_MEMBIND_PROCESS nor ::HWLOC_MEMBIND_THREAD is specified), there
+ * is only one nodeset and policy.  The policy is returned in \p
+ * policy; \p cpuset is set to the union of CPUs near the NUMA node(s)
+ * in the \p nodeset.
+ *
+ * If any other flags are specified, -1 is returned and errno is set
+ * to EINVAL.
+ */
+HWLOC_DECLSPEC int hwloc_get_membind(hwloc_topology_t topology, hwloc_cpuset_t cpuset, hwloc_membind_policy_t * policy, int flags);
+
+/** \brief Set the default memory binding policy of the specified
+ * process to prefer the NUMA node(s) specified by physical \p nodeset
+ *
+ * \return -1 with errno set to ENOSYS if the action is not supported
+ * \return -1 with errno set to EXDEV if the binding cannot be enforced
+ *
+ * \note \p hwloc_pid_t is \p pid_t on Unix platforms,
+ * and \p HANDLE on native Windows platforms.
+ */
+HWLOC_DECLSPEC int hwloc_set_proc_membind_nodeset(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags);
+
+/** \brief Set the default memory binding policy of the specified
+ * process to prefer the NUMA node(s) near the specified physical \p cpuset
+ *
+ * \return -1 with errno set to ENOSYS if the action is not supported
+ * \return -1 with errno set to EXDEV if the binding cannot be enforced
+ *
+ * \note \p hwloc_pid_t is \p pid_t on Unix platforms,
+ * and \p HANDLE on native Windows platforms.
+ */
+HWLOC_DECLSPEC int hwloc_set_proc_membind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_const_cpuset_t cpuset, hwloc_membind_policy_t policy, int flags);
+
+/** \brief Query the default memory binding policy and physical locality of the
+ * specified process.
+ *
+ * This function has two output parameters: \p nodeset and \p policy.
+ * The values returned in these parameters depend on both the \p flags
+ * passed in and the current memory binding policies and nodesets in
+ * the queried target.
+ *
+ * Passing the ::HWLOC_MEMBIND_PROCESS flag specifies that the query
+ * target is the current policies and nodesets for all the threads in
+ * the specified process.  If ::HWLOC_MEMBIND_PROCESS is not specified
+ * (which is the most portable method), the process is assumed to be
+ * single threaded.  This allows hwloc to use either process-based OS
+ * functions or thread-based OS functions, depending on which are
+ * available.
+ *
+ * Note that it does not make sense to pass ::HWLOC_MEMBIND_THREAD to
+ * this function.
+ *
+ * If ::HWLOC_MEMBIND_STRICT is specified, hwloc will check the default
+ * memory policies and nodesets for all threads in the specified
+ * process.  If they are not identical, -1 is returned and errno is
+ * set to EXDEV.  If they are identical, the values are returned in \p
+ * nodeset and \p policy.
+ *
+ * Otherwise, \p nodeset is set to the logical OR of all threads'
+ * default nodeset.  If all threads' default policies are the same, \p
+ * policy is set to that policy.  If they are different, \p policy is
+ * set to ::HWLOC_MEMBIND_MIXED.
+ *
+ * If any other flags are specified, -1 is returned and errno is set
+ * to EINVAL.
+ *
+ * \note \p hwloc_pid_t is \p pid_t on Unix platforms,
+ * and \p HANDLE on native Windows platforms.
+ */
+HWLOC_DECLSPEC int hwloc_get_proc_membind_nodeset(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags);
+
+/** \brief Query the default memory binding policy and physical locality of the
+ * specified process (the locality is returned in \p cpuset as CPUs
+ * near the locality's actual NUMA node(s)).
+ *
+ * This function has two output parameters: \p cpuset and \p policy.
+ * The values returned in these parameters depend on both the \p flags
+ * passed in and the current memory binding policies and nodesets in
+ * the queried target.
+ *
+ * Passing the ::HWLOC_MEMBIND_PROCESS flag specifies that the query
+ * target is the current policies and nodesets for all the threads in
+ * the specified process.  If ::HWLOC_MEMBIND_PROCESS is not specified
+ * (which is the most portable method), the process is assumed to be
+ * single threaded.  This allows hwloc to use either process-based OS
+ * functions or thread-based OS functions, depending on which are
+ * available.
+ *
+ * Note that it does not make sense to pass ::HWLOC_MEMBIND_THREAD to
+ * this function.
+ *
+ * If ::HWLOC_MEMBIND_STRICT is specified, hwloc will check the default
+ * memory policies and nodesets for all threads in the specified
+ * process.  If they are not identical, -1 is returned and errno is
+ * set to EXDEV.  If they are identical, the policy is returned in \p
+ * policy.  \p cpuset is set to the union of CPUs near the NUMA
+ * node(s) in the nodeset.
+ *
+ * Otherwise, the default nodeset from each thread is logically OR'ed
+ * together.  \p cpuset is set to the union of CPUs near the NUMA
+ * node(s) in the resulting nodeset.  If all threads' default policies
+ * are the same, \p policy is set to that policy.  If they are
+ * different, \p policy is set to ::HWLOC_MEMBIND_MIXED.
+ *
+ * If any other flags are specified, -1 is returned and errno is set
+ * to EINVAL.
+ *
+ * \note \p hwloc_pid_t is \p pid_t on Unix platforms,
+ * and \p HANDLE on native Windows platforms.
+ */
+HWLOC_DECLSPEC int hwloc_get_proc_membind(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_cpuset_t cpuset, hwloc_membind_policy_t * policy, int flags);
+
+/** \brief Bind the already-allocated memory identified by (addr, len)
+ * to the NUMA node(s) in physical \p nodeset.
+ *
+ * \return -1 with errno set to ENOSYS if the action is not supported
+ * \return -1 with errno set to EXDEV if the binding cannot be enforced
+ */
+HWLOC_DECLSPEC int hwloc_set_area_membind_nodeset(hwloc_topology_t topology, const void *addr, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags);
+
+/** \brief Bind the already-allocated memory identified by (addr, len)
+ * to the NUMA node(s) near physical \p cpuset.
+ *
+ * \return -1 with errno set to ENOSYS if the action is not supported
+ * \return -1 with errno set to EXDEV if the binding cannot be enforced
+ */
+HWLOC_DECLSPEC int hwloc_set_area_membind(hwloc_topology_t topology, const void *addr, size_t len, hwloc_const_cpuset_t cpuset, hwloc_membind_policy_t policy, int flags);
+
+/** \brief Query the physical NUMA node(s) and binding policy of the memory
+ * identified by (\p addr, \p len ).
+ *
+ * This function has two output parameters: \p nodeset and \p policy.
+ * The values returned in these parameters depend on both the \p flags
+ * passed in and the memory binding policies and nodesets of the pages
+ * in the address range.
+ *
+ * If ::HWLOC_MEMBIND_STRICT is specified, the target pages are first
+ * checked to see if they all have the same memory binding policy and
+ * nodeset.  If they do not, -1 is returned and errno is set to EXDEV.
+ * If they are identical across all pages, the nodeset and policy are
+ * returned in \p nodeset and \p policy, respectively.
+ *
+ * If ::HWLOC_MEMBIND_STRICT is not specified, \p nodeset is set to the
+ * union of all NUMA node(s) containing pages in the address range.
+ * If all pages in the target have the same policy, it is returned in
+ * \p policy.  Otherwise, \p policy is set to ::HWLOC_MEMBIND_MIXED.
+ *
+ * If any other flags are specified, -1 is returned and errno is set
+ * to EINVAL.
+ */
+HWLOC_DECLSPEC int hwloc_get_area_membind_nodeset(hwloc_topology_t topology, const void *addr, size_t len, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags);
+
+/** \brief Query the CPUs near the physical NUMA node(s) and binding policy of
+ * the memory identified by (\p addr, \p len ).
+ *
+ * This function has two output parameters: \p cpuset and \p policy.
+ * The values returned in these parameters depend on both the \p flags
+ * passed in and the memory binding policies and nodesets of the pages
+ * in the address range.
+ *
+ * If ::HWLOC_MEMBIND_STRICT is specified, the target pages are first
+ * checked to see if they all have the same memory binding policy and
+ * nodeset.  If they do not, -1 is returned and errno is set to EXDEV.
+ * If they are identical across all pages, the policy is returned in
+ * \p policy.  \p cpuset is set to the union of CPUs near the NUMA
+ * node(s) in the nodeset.
+ *
+ * If ::HWLOC_MEMBIND_STRICT is not specified, the union of all NUMA
+ * node(s) containing pages in the address range is calculated.  \p
+ * cpuset is then set to the CPUs near the NUMA node(s) in this union.
+ * If all pages in the target have the same policy, it is returned in
+ * \p policy.  Otherwise, \p policy is set to ::HWLOC_MEMBIND_MIXED.
+ *
+ * If any other flags are specified, -1 is returned and errno is set
+ * to EINVAL.
+ */
+HWLOC_DECLSPEC int hwloc_get_area_membind(hwloc_topology_t topology, const void *addr, size_t len, hwloc_cpuset_t cpuset, hwloc_membind_policy_t * policy, int flags);
+
+/** \brief Allocate some memory
+ *
+ * This is equivalent to malloc(), except that it tries to allocate
+ * page-aligned memory from the OS.
+ *
+ * \note The allocated memory should be freed with hwloc_free().
+ */
+HWLOC_DECLSPEC void *hwloc_alloc(hwloc_topology_t topology, size_t len);
+
+/** \brief Allocate some memory on the given physical nodeset \p nodeset
+ *
+ * \return NULL with errno set to ENOSYS if the action is not supported
+ * and ::HWLOC_MEMBIND_STRICT is given
+ * \return NULL with errno set to EXDEV if the binding cannot be enforced
+ * and ::HWLOC_MEMBIND_STRICT is given
+ * \return NULL with errno set to ENOMEM if the memory allocation failed
+ * even before trying to bind.
+ *
+ * \note The allocated memory should be freed with hwloc_free().
+ */
+HWLOC_DECLSPEC void *hwloc_alloc_membind_nodeset(hwloc_topology_t topology, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags) __hwloc_attribute_malloc;
+
+/** \brief Allocate some memory on memory nodes near the given physical cpuset \p cpuset
+ *
+ * \return NULL with errno set to ENOSYS if the action is not supported
+ * and ::HWLOC_MEMBIND_STRICT is given
+ * \return NULL with errno set to EXDEV if the binding cannot be enforced
+ * and ::HWLOC_MEMBIND_STRICT is given
+ * \return NULL with errno set to ENOMEM if the memory allocation failed
+ * even before trying to bind.
+ *
+ * \note The allocated memory should be freed with hwloc_free().
+ */
+HWLOC_DECLSPEC void *hwloc_alloc_membind(hwloc_topology_t topology, size_t len, hwloc_const_cpuset_t cpuset, hwloc_membind_policy_t policy, int flags) __hwloc_attribute_malloc;
+
+/** \brief Allocate some memory on the given nodeset \p nodeset
+ *
+ * This is similar to hwloc_alloc_membind except that it is allowed to change
+ * the current memory binding policy, thus providing more binding support, at
+ * the expense of changing the current state.
+ */
+static __hwloc_inline void *
+hwloc_alloc_membind_policy_nodeset(hwloc_topology_t topology, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags) __hwloc_attribute_malloc;
+
+/** \brief Allocate some memory on the memory nodes near the given cpuset \p set
+ *
+ * This is similar to hwloc_alloc_membind_policy_nodeset, but for a given cpuset.
+ */
+static __hwloc_inline void *
+hwloc_alloc_membind_policy(hwloc_topology_t topology, size_t len, hwloc_const_cpuset_t set, hwloc_membind_policy_t policy, int flags) __hwloc_attribute_malloc;
+
+/** \brief Free memory that was previously allocated by hwloc_alloc()
+ * or hwloc_alloc_membind().
+ */
+HWLOC_DECLSPEC int hwloc_free(hwloc_topology_t topology, void *addr, size_t len);
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_setsource Changing the Source of Topology Discovery
+ *
+ * If none of the functions below is called, the default is to detect all the objects
+ * of the machine that the caller is allowed to access.
+ *
+ * This default behavior may also be modified through environment variables
+ * if the application did not modify it already.
+ * Setting HWLOC_XMLFILE in the environment enforces the discovery from an XML
+ * file as if hwloc_topology_set_xml() had been called.
+ * Setting HWLOC_SYNTHETIC enforces a synthetic topology as if
+ * hwloc_topology_set_synthetic() had been called.
+ * Setting HWLOC_FSROOT switches to reading the topology from the specified Linux
+ * filesystem root.
+ *
+ * Finally, HWLOC_THISSYSTEM enforces the return value of
+ * hwloc_topology_is_thissystem().
+ *
+ * @{
+ */
+
+/** \brief Change which pid the topology is viewed from
+ *
+ * On some systems, processes may have different views of the machine, for
+ * instance the set of allowed CPUs. By default, hwloc exposes the view from
+ * the current process. Calling hwloc_topology_set_pid() makes it
+ * expose the topology of the machine from the point of view of another
+ * process.
+ *
+ * \note \p hwloc_pid_t is \p pid_t on Unix platforms,
+ * and \p HANDLE on native Windows platforms.
+ *
+ * \note -1 is returned and errno is set to ENOSYS on platforms that do not
+ * support this feature.
+ */
+HWLOC_DECLSPEC int hwloc_topology_set_pid(hwloc_topology_t __hwloc_restrict topology, hwloc_pid_t pid);
+
+/** \brief Enable synthetic topology.
+ *
+ * Gather topology information from the given \p description,
+ * a space-separated string of numbers describing
+ * the arity of each level.
+ * Each number may be prefixed with a type and a colon to enforce the type
+ * of a level.  If only some level types are enforced, hwloc will try to
+ * choose the other types according to usual topologies, but it may fail
+ * and you may have to specify more level types manually.
+ * See also the \ref synthetic.
+ *
+ * Setting the environment variable HWLOC_SYNTHETIC
+ * may also result in this behavior.
+ *
+ * If \p description was properly parsed and describes a valid topology
+ * configuration, this function returns 0.
+ * Otherwise -1 is returned and errno is set to EINVAL.
+ *
+ * Note that this function does not actually load topology
+ * information; it just tells hwloc where to load it from.  You'll
+ * still need to invoke hwloc_topology_load() to actually load the
+ * topology information.
+ *
+ * \note For convenience, this backend provides empty binding hooks which just
+ * return success.
+ *
+ * \note On success, the synthetic component replaces the previously enabled
+ * component (if any), but the topology is not actually modified until
+ * hwloc_topology_load().
+ */
+HWLOC_DECLSPEC int hwloc_topology_set_synthetic(hwloc_topology_t __hwloc_restrict topology, const char * __hwloc_restrict description);
+
+/** \brief Enable XML-file based topology.
+ *
+ * Gather topology information from the XML file given at \p xmlpath.
+ * Setting the environment variable HWLOC_XMLFILE may also result in this behavior.
+ * This file may have been generated earlier with hwloc_topology_export_xml() in hwloc/export.h,
+ * or lstopo file.xml.
+ *
+ * Note that this function does not actually load topology
+ * information; it just tells hwloc where to load it from.  You'll
+ * still need to invoke hwloc_topology_load() to actually load the
+ * topology information.
+ *
+ * \return -1 with errno set to EINVAL on failure to read the XML file.
+ *
+ * \note See also hwloc_topology_set_userdata_import_callback()
+ * for importing application-specific object userdata.
+ *
+ * \note For convenience, this backend provides empty binding hooks which just
+ * return success.  To have hwloc still actually call OS-specific hooks, the
+ * HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM has to be set to assert that the loaded
+ * file is really the underlying system.
+ *
+ * \note On success, the XML component replaces the previously enabled
+ * component (if any), but the topology is not actually modified until
+ * hwloc_topology_load().
+ */
+HWLOC_DECLSPEC int hwloc_topology_set_xml(hwloc_topology_t __hwloc_restrict topology, const char * __hwloc_restrict xmlpath);
+
+/** \brief Enable XML based topology using a memory buffer (instead of
+ * a file, as with hwloc_topology_set_xml()).
+ *
+ * Gather topology information from the XML memory buffer given at \p
+ * buffer and of length \p size.  This buffer may have been filled
+ * earlier with hwloc_topology_export_xmlbuffer() in hwloc/export.h.
+ *
+ * Note that this function does not actually load topology
+ * information; it just tells hwloc where to load it from.  You'll
+ * still need to invoke hwloc_topology_load() to actually load the
+ * topology information.
+ *
+ * \return -1 with errno set to EINVAL on failure to read the XML buffer.
+ *
+ * \note See also hwloc_topology_set_userdata_import_callback()
+ * for importing application-specific object userdata.
+ *
+ * \note For convenience, this backend provides empty binding hooks which just
+ * return success.  To have hwloc still actually call OS-specific hooks, the
+ * HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM has to be set to assert that the loaded
+ * file is really the underlying system.
+ *
+ * \note On success, the XML component replaces the previously enabled
+ * component (if any), but the topology is not actually modified until
+ * hwloc_topology_load().
+ */
+HWLOC_DECLSPEC int hwloc_topology_set_xmlbuffer(hwloc_topology_t __hwloc_restrict topology, const char * __hwloc_restrict buffer, int size);
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_configuration Topology Detection Configuration and Query
+ *
+ * Several functions can optionally be called between hwloc_topology_init() and
+ * hwloc_topology_load() to configure how the detection should be performed,
+ * e.g. to ignore some object types, define a synthetic topology, etc.
+ *
+ * @{
+ */
+
+/** \brief Flags to be set onto a topology context before load.
+ *
+ * Flags should be given to hwloc_topology_set_flags().
+ * They may also be returned by hwloc_topology_get_flags().
+ */
+enum hwloc_topology_flags_e {
+ /** \brief Detect the whole system, ignore reservations.
+   *
+   * Gather all resources, even if some were disabled by the administrator.
+   * For instance, ignore Linux Cgroup/Cpusets and gather all processors and memory nodes.
+   *
+   * When this flag is set, each object has allowed_cpuset <= cpuset <= complete_cpuset.
+   * Otherwise allowed_cpuset = cpuset <= complete_cpuset.
+   * The same applies to nodesets.
+   * \hideinitializer
+   */
+  HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM = (1UL<<0),
+
+ /** \brief Assume that the selected backend provides the topology for the
+   * system on which we are running.
+   *
+   * This forces hwloc_topology_is_thissystem to return 1, i.e. makes hwloc assume that
+   * the selected backend provides the topology for the system on which we are running,
+   * even if it is not the OS-specific backend but the XML backend for instance.
+   * This means making the binding functions actually call the OS-specific
+   * system calls and really do binding, while the XML backend would otherwise
+   * provide empty hooks just returning success.
+   *
+   * Setting the environment variable HWLOC_THISSYSTEM may also result in the
+   * same behavior.
+   *
+   * This can be used for efficiency reasons to first detect the topology once,
+   * save it to an XML file, and quickly reload it later through the XML
+   * backend, but still having binding functions actually do bind.
+   * \hideinitializer
+   */
+  HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM = (1UL<<1),
+
+  /** \brief Detect PCI devices.
+   *
+   * By default, I/O devices are ignored. This flag enables I/O device
+   * detection using the pci backend. Only the common PCI devices (GPUs,
+   * NICs, block devices, ...) and host bridges (objects that connect the host
+   * objects to an I/O subsystem) will be added to the topology.
+   * Additionally it also enables MemoryDevice misc objects.
+   * Uncommon devices and other bridges (such as PCI-to-PCI bridges) will be
+   * ignored.
+   * \hideinitializer
+   */
+  HWLOC_TOPOLOGY_FLAG_IO_DEVICES = (1UL<<2),
+
+  /** \brief Detect PCI bridges.
+   *
+   * This flag should be combined with HWLOC_TOPOLOGY_FLAG_IO_DEVICES to enable
+   * the detection of both common devices and of all useful bridges (bridges that
+   * have at least one device behind them).
+   * \hideinitializer
+   */
+  HWLOC_TOPOLOGY_FLAG_IO_BRIDGES = (1UL<<3),
+
+  /** \brief Detect the whole PCI hierarchy.
+   *
+   * This flag enables detection of all I/O devices (even the uncommon ones)
+   * and bridges (even those that have no device behind them) using the pci
+   * backend.
+   * This implies HWLOC_TOPOLOGY_FLAG_IO_DEVICES.
+   * \hideinitializer
+   */
+  HWLOC_TOPOLOGY_FLAG_WHOLE_IO = (1UL<<4),
+
+  /** \brief Detect instruction caches.
+   *
+   * This flag enables detection of Instruction caches,
+   * instead of only Data and Unified caches.
+   * \hideinitializer
+   */
+  HWLOC_TOPOLOGY_FLAG_ICACHES = (1UL<<5)
+};
+
+/** \brief Set OR'ed flags on a not-yet-loaded topology.
+ *
+ * Set an OR'ed set of ::hwloc_topology_flags_e onto a topology that was not yet loaded.
+ *
+ * If this function is called multiple times, the last invocation will erase
+ * and replace the set of flags that was previously set.
+ *
+ * The flags set in a topology may be retrieved with hwloc_topology_get_flags().
+ */
+HWLOC_DECLSPEC int hwloc_topology_set_flags (hwloc_topology_t topology, unsigned long flags);
+
+/** \brief Get OR'ed flags of a topology.
+ *
+ * Get the OR'ed set of ::hwloc_topology_flags_e of a topology.
+ *
+ * \return the flags previously set with hwloc_topology_set_flags().
+ */
+HWLOC_DECLSPEC unsigned long hwloc_topology_get_flags (hwloc_topology_t topology);
+
+/** \brief Does the topology context come from this system?
+ *
+ * \return 1 if this topology context was built using the system
+ * running this program.
+ * \return 0 otherwise (for instance if using another file-system root,
+ * an XML topology file, or a synthetic topology).
+ */
+HWLOC_DECLSPEC int hwloc_topology_is_thissystem(hwloc_topology_t  __hwloc_restrict topology) __hwloc_attribute_pure;
+
+/** \brief Flags describing actual discovery support for this topology. */
+struct hwloc_topology_discovery_support {
+  /** \brief Detecting the number of PU objects is supported. */
+  unsigned char pu;
+};
+
+/** \brief Flags describing actual PU binding support for this topology. */
+struct hwloc_topology_cpubind_support {
+  /** Binding the whole current process is supported.  */
+  unsigned char set_thisproc_cpubind;
+  /** Getting the binding of the whole current process is supported.  */
+  unsigned char get_thisproc_cpubind;
+  /** Binding a whole given process is supported.  */
+  unsigned char set_proc_cpubind;
+  /** Getting the binding of a whole given process is supported.  */
+  unsigned char get_proc_cpubind;
+  /** Binding the current thread only is supported.  */
+  unsigned char set_thisthread_cpubind;
+  /** Getting the binding of the current thread only is supported.  */
+  unsigned char get_thisthread_cpubind;
+  /** Binding a given thread only is supported.  */
+  unsigned char set_thread_cpubind;
+  /** Getting the binding of a given thread only is supported.  */
+  unsigned char get_thread_cpubind;
+  /** Getting the last processors where the whole current process ran is supported */
+  unsigned char get_thisproc_last_cpu_location;
+  /** Getting the last processors where a whole process ran is supported */
+  unsigned char get_proc_last_cpu_location;
+  /** Getting the last processors where the current thread ran is supported */
+  unsigned char get_thisthread_last_cpu_location;
+};
+
+/** \brief Flags describing actual memory binding support for this topology. */
+struct hwloc_topology_membind_support {
+  /** Binding the whole current process is supported.  */
+  unsigned char set_thisproc_membind;
+  /** Getting the binding of the whole current process is supported.  */
+  unsigned char get_thisproc_membind;
+  /** Binding a whole given process is supported.  */
+  unsigned char set_proc_membind;
+  /** Getting the binding of a whole given process is supported.  */
+  unsigned char get_proc_membind;
+  /** Binding the current thread only is supported.  */
+  unsigned char set_thisthread_membind;
+  /** Getting the binding of the current thread only is supported.  */
+  unsigned char get_thisthread_membind;
+  /** Binding a given memory area is supported. */
+  unsigned char set_area_membind;
+  /** Getting the binding of a given memory area is supported.  */
+  unsigned char get_area_membind;
+  /** Allocating a bound memory area is supported. */
+  unsigned char alloc_membind;
+  /** First-touch policy is supported. */
+  unsigned char firsttouch_membind;
+  /** Bind policy is supported. */
+  unsigned char bind_membind;
+  /** Interleave policy is supported. */
+  unsigned char interleave_membind;
+  /** Replication policy is supported. */
+  unsigned char replicate_membind;
+  /** Next-touch migration policy is supported. */
+  unsigned char nexttouch_membind;
+
+  /** Migration flag is supported. */
+  unsigned char migrate_membind;
+};
+
+/** \brief Set of flags describing actual support for this topology.
+ *
+ * This is retrieved with hwloc_topology_get_support() and will be valid until
+ * the topology object is destroyed.  Note: the values are correct only after
+ * discovery.
+ */
+struct hwloc_topology_support {
+  struct hwloc_topology_discovery_support *discovery;
+  struct hwloc_topology_cpubind_support *cpubind;
+  struct hwloc_topology_membind_support *membind;
+};
+
+/** \brief Retrieve the topology support. */
+HWLOC_DECLSPEC const struct hwloc_topology_support *hwloc_topology_get_support(hwloc_topology_t __hwloc_restrict topology);
+
+/** \brief Ignore an object type.
+ *
+ * Ignore all objects from the given type.
+ * The bottom-level type HWLOC_OBJ_PU and the HWLOC_OBJ_NUMANODE level may not be ignored.
+ * The top-level object of the hierarchy will never be ignored, even if this function
+ * succeeds.
+ * I/O objects may not be ignored, topology flags should be used to configure
+ * their discovery instead.
+ */
+HWLOC_DECLSPEC int hwloc_topology_ignore_type(hwloc_topology_t topology, hwloc_obj_type_t type);
+
+/** \brief Ignore an object type if it does not bring any structure.
+ *
+ * Ignore all objects from the given type as long as they do not bring any structure:
+ * Each ignored object should have a single child or be the only child of its parent.
+ * The bottom-level type HWLOC_OBJ_PU and the HWLOC_OBJ_NUMANODE level may not be ignored.
+ * I/O objects may not be ignored, topology flags should be used to configure
+ * their discovery instead.
+ * Group objects are always ignored if they do not bring any structure
+ * since they are designed to add structure to the topology.
+ * Misc objects cannot be ignored based on the structure since they are only annotations
+ * outside of the main topology structure.
+ */
+HWLOC_DECLSPEC int hwloc_topology_ignore_type_keep_structure(hwloc_topology_t topology, hwloc_obj_type_t type);
+
+/** \brief Ignore all objects that do not bring any structure.
+ *
+ * Ignore all objects that do not bring any structure:
+ * Each ignored object should have a single child or be the only child of its parent.
+ * I/O objects may not be ignored, topology flags should be used to configure
+ * their discovery instead.
+ */
+HWLOC_DECLSPEC int hwloc_topology_ignore_all_keep_structure(hwloc_topology_t topology);
+
+/** \brief Provide a distance matrix.
+ *
+ * Provide the matrix of distances between a set of objects of the given type.
+ * The set may or may not contain all the existing objects of this type.
+ * The objects are specified by their OS/physical index in the \p os_index
+ * array. The \p distances matrix follows the same order.
+ * The distance from object i to object j is stored at index i*nbobjs+j.
+ *
+ * A single latency matrix may be defined for each type.
+ * If another distance matrix already exists for the given type,
+ * either because the user specified it or because the OS offers it,
+ * it will be replaced by the given one.
+ * If \p nbobjs is \c 0, \p os_index is \c NULL and \p distances is \c NULL,
+ * the existing distance matrix for the given type is removed.
+ *
+ * \note Distance matrices are ignored in multi-node topologies.
+ */
+HWLOC_DECLSPEC int hwloc_topology_set_distance_matrix(hwloc_topology_t __hwloc_restrict topology,
+						      hwloc_obj_type_t type, unsigned nbobjs,
+						      unsigned *os_index, float *distances);
+
+/** \brief Set the topology-specific userdata pointer.
+ *
+ * Each topology may store one application-given private data pointer.
+ * It is initialized to \c NULL.
+ * hwloc will never modify it.
+ *
+ * Use it as you wish, after hwloc_topology_init() and until hwloc_topology_destroy().
+ *
+ * This pointer is not exported to XML.
+ */
+HWLOC_DECLSPEC void hwloc_topology_set_userdata(hwloc_topology_t topology, const void *userdata);
+
+/** \brief Retrieve the topology-specific userdata pointer.
+ *
+ * Retrieve the application-given private data pointer that was
+ * previously set with hwloc_topology_set_userdata().
+ */
+HWLOC_DECLSPEC void * hwloc_topology_get_userdata(hwloc_topology_t topology);
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_tinker Modifying a loaded Topology
+ * @{
+ */
+
+/** \brief Flags to be given to hwloc_topology_restrict(). */
+enum hwloc_restrict_flags_e {
+  /** \brief Adapt distance matrices according to objects being removed during restriction.
+   * If this flag is not set, distance matrices are removed.
+   * \hideinitializer
+   */
+  HWLOC_RESTRICT_FLAG_ADAPT_DISTANCES = (1<<0),
+
+  /** \brief Move Misc objects to ancestors if their parents are removed during restriction.
+   * If this flag is not set, Misc objects are removed when their parents are removed.
+   * \hideinitializer
+   */
+  HWLOC_RESTRICT_FLAG_ADAPT_MISC = (1<<1),
+
+  /** \brief Move I/O objects to ancestors if their parents are removed during restriction.
+   * If this flag is not set, I/O devices and bridges are removed when their parents are removed.
+   * \hideinitializer
+   */
+  HWLOC_RESTRICT_FLAG_ADAPT_IO = (1<<2)
+};
+
+/** \brief Restrict the topology to the given CPU set.
+ *
+ * Topology \p topology is modified so as to remove all objects that
+ * are not included (or partially included) in the CPU set \p cpuset.
+ * All objects CPU and node sets are restricted accordingly.
+ *
+ * \p flags is an OR'ed set of ::hwloc_restrict_flags_e.
+ *
+ * \note This call may not be reverted by restricting back to a larger
+ * cpuset. Once dropped during restriction, objects may not be brought
+ * back, except by loading another topology with hwloc_topology_load().
+ *
+ * \return 0 on success.
+ *
+ * \return -1 with errno set to EINVAL if the input cpuset is invalid.
+ * The topology is not modified in this case.
+ *
+ * \return -1 with errno set to ENOMEM on failure to allocate internal data.
+ * The topology is reinitialized in this case. It should be either
+ * destroyed with hwloc_topology_destroy() or configured and loaded again.
+ */
+HWLOC_DECLSPEC int hwloc_topology_restrict(hwloc_topology_t __hwloc_restrict topology, hwloc_const_cpuset_t cpuset, unsigned long flags);
+
+/** \brief Add a MISC object as a leaf of the topology
+ *
+ * A new MISC object will be created and inserted into the topology at the
+ * position given by parent. It is appended to the list of existing Misc children,
+ * without ever adding any intermediate hierarchy level. This is useful for
+ * annotating the topology without actually changing the hierarchy.
+ *
+ * \p name will be copied to set up the new object attributes.
+ * However, the new leaf object will not have any \p cpuset.
+ *
+ * \return the newly-created object
+ *
+ * \note If \p name contains some non-printable characters, they will
+ * be dropped when exporting to XML, see hwloc_topology_export_xml() in hwloc/export.h.
+ */
+HWLOC_DECLSPEC hwloc_obj_t hwloc_topology_insert_misc_object(hwloc_topology_t topology, hwloc_obj_t parent, const char *name);
+
+/** \brief Allocate a Group object to insert later with hwloc_topology_insert_group_object().
+ *
+ * This function returns a new Group object.
+ * The caller should (at least) initialize its sets before inserting the object.
+ * See hwloc_topology_insert_group_object().
+ *
+ * Custom name/value info pairs may be added with hwloc_obj_add_info() after
+ * insertion. For instance the Type info key allows to display something else
+ * than "Group" as the type name for this object in lstopo.
+ *
+ * It is recommended not to set any other object attribute before insertion,
+ * since the Group may get discarded during insertion.
+ *
+ * The object will be destroyed if passed to hwloc_topology_insert_group_object()
+ * without any set defined.
+ */
+HWLOC_DECLSPEC hwloc_obj_t hwloc_topology_alloc_group_object(hwloc_topology_t topology);
+
+/** \brief Add more structure to the topology by adding an intermediate Group
+ *
+ * The caller should first allocate a new Group object with hwloc_topology_alloc_group_object().
+ * Then it must initialize some of its sets to specify the final location of the Group
+ * in the topology.
+ * Then the object can be passed to this function for actual insertion in the topology.
+ *
+ * Either the cpuset or nodeset field (or both, if compatible) may be used to do so.
+ * If inserting with respect to the complete topology (including disallowed, offline
+ * or unknown objects), complete_cpuset and/or complete_nodeset may be used instead.
+ * If grouping several objects, hwloc_obj_add_other_obj_sets() is an easy way to
+ * build the Group sets iteratively.
+ *
+ * \return The inserted object if it was properly inserted.
+ *
+ * \return An existing object if the Group was discarded because the topology already
+ * contained an object at the same location (the Group did not add any locality information).
+ * Any name/info key pair set before inserting is appended to the existing object.
+ *
+ * \return \c NULL if the insertion failed because of conflicting sets in topology tree.
+ *
+ * \return \c NULL if Group objects are always ignored in the topology.
+ *
+ * \return \c NULL if the object was discarded because no set was initialized in the Group
+ * before insert, or all of them were empty.
+ */
+HWLOC_DECLSPEC hwloc_obj_t hwloc_topology_insert_group_object(hwloc_topology_t topology, hwloc_obj_t group);
+
+/** \brief Setup object cpusets/nodesets by OR'ing another object's sets.
+ *
+ * For each defined cpuset or nodeset in \p src, allocate the corresponding set
+ * in \p dst and add \p src to it by OR'ing sets.
+ *
+ * This function is convenient between hwloc_topology_alloc_group_object()
+ * and hwloc_topology_insert_group_object(). It builds the sets of the new Group
+ * that will be inserted as a new intermediate parent of several objects.
+ */
+HWLOC_DECLSPEC int hwloc_obj_add_other_obj_sets(hwloc_obj_t dst, hwloc_obj_t src);
+
+/** @} */
+
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+/* high-level helpers */
+#include <hwloc/helper.h>
+
+/* inline code of some functions above */
+#include <hwloc/inlines.h>
+
+/* exporting to XML or synthetic */
+#include <hwloc/export.h>
+
+/* topology diffs */
+#include <hwloc/diff.h>
+
+/* deprecated headers */
+#include <hwloc/deprecated.h>
+
+#endif /* HWLOC_H */
diff --git a/ext/hwloc/include/hwloc/autogen/config.h b/ext/hwloc/include/hwloc/autogen/config.h
new file mode 100644
index 0000000..f18e952
--- /dev/null
+++ b/ext/hwloc/include/hwloc/autogen/config.h
@@ -0,0 +1,202 @@
+/* include/hwloc/autogen/config.h.  Generated from config.h.in by configure.  */
+/* -*- c -*-
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2014 Inria.  All rights reserved.
+ * Copyright © 2009-2012 Université Bordeaux
+ * Copyright © 2009-2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/* The configuration file */
+
+#ifndef HWLOC_CONFIG_H
+#define HWLOC_CONFIG_H
+
+#if (__GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 95))
+# define __hwloc_restrict __restrict
+#else
+# if __STDC_VERSION__ >= 199901L
+#  define __hwloc_restrict restrict
+# else
+#  define __hwloc_restrict
+# endif
+#endif
+
+/* Note that if we're compiling C++, then just use the "inline"
+   keyword, since it's part of C++ */
+#if defined(c_plusplus) || defined(__cplusplus)
+#  define __hwloc_inline inline
+#elif defined(_MSC_VER) || defined(__HP_cc)
+#  define __hwloc_inline __inline
+#else
+#  define __hwloc_inline __inline__
+#endif
+
+/*
+ * Note: this is public.  We can not assume anything from the compiler used
+ * by the application and thus the HWLOC_HAVE_* macros below are not
+ * fetched from the autoconf result here. We only automatically use a few
+ * well-known easy cases.
+ */
+
+/* Some handy constants to make the logic below a little more readable */
+#if defined(__cplusplus) && \
+    (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
+#define GXX_ABOVE_3_4 1 /* C++ compilation with a GNU compiler >= 3.4 */
+#else
+#define GXX_ABOVE_3_4 0 /* not C++, or GNU compiler older than 3.4 / not GNU */
+#endif /* GXX_ABOVE_3_4 */
+
+#if !defined(__cplusplus) && \
+    (__GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 95))
+#define GCC_ABOVE_2_95 1
+#else
+#define GCC_ABOVE_2_95 0
+#endif
+
+#if !defined(__cplusplus) && \
+    (__GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 96))
+#define GCC_ABOVE_2_96 1
+#else
+#define GCC_ABOVE_2_96 0
+#endif
+
+#if !defined(__cplusplus) && \
+    (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 3))
+#define GCC_ABOVE_3_3 1
+#else
+#define GCC_ABOVE_3_3 0
+#endif
+
+/* Maybe before gcc 2.95 too */
+#ifdef HWLOC_HAVE_ATTRIBUTE_UNUSED
+#define __HWLOC_HAVE_ATTRIBUTE_UNUSED HWLOC_HAVE_ATTRIBUTE_UNUSED 
+#elif defined(__GNUC__)
+# define __HWLOC_HAVE_ATTRIBUTE_UNUSED (GXX_ABOVE_3_4 || GCC_ABOVE_2_95)
+#else
+# define __HWLOC_HAVE_ATTRIBUTE_UNUSED 0
+#endif
+#if __HWLOC_HAVE_ATTRIBUTE_UNUSED
+# define __hwloc_attribute_unused __attribute__((__unused__))
+#else
+# define __hwloc_attribute_unused
+#endif
+
+#ifdef HWLOC_HAVE_ATTRIBUTE_MALLOC
+#define __HWLOC_HAVE_ATTRIBUTE_MALLOC HWLOC_HAVE_ATTRIBUTE_MALLOC 
+#elif defined(__GNUC__)
+# define __HWLOC_HAVE_ATTRIBUTE_MALLOC (GXX_ABOVE_3_4 || GCC_ABOVE_2_96)
+#else
+# define __HWLOC_HAVE_ATTRIBUTE_MALLOC 0
+#endif
+#if __HWLOC_HAVE_ATTRIBUTE_MALLOC
+# define __hwloc_attribute_malloc __attribute__((__malloc__))
+#else
+# define __hwloc_attribute_malloc
+#endif
+
+#ifdef HWLOC_HAVE_ATTRIBUTE_CONST
+#define __HWLOC_HAVE_ATTRIBUTE_CONST HWLOC_HAVE_ATTRIBUTE_CONST 
+#elif defined(__GNUC__)
+# define __HWLOC_HAVE_ATTRIBUTE_CONST (GXX_ABOVE_3_4 || GCC_ABOVE_2_95)
+#else
+# define __HWLOC_HAVE_ATTRIBUTE_CONST 0
+#endif
+#if __HWLOC_HAVE_ATTRIBUTE_CONST
+# define __hwloc_attribute_const __attribute__((__const__))
+#else
+# define __hwloc_attribute_const
+#endif
+
+#ifdef HWLOC_HAVE_ATTRIBUTE_PURE
+#define __HWLOC_HAVE_ATTRIBUTE_PURE HWLOC_HAVE_ATTRIBUTE_PURE 
+#elif defined(__GNUC__)
+# define __HWLOC_HAVE_ATTRIBUTE_PURE (GXX_ABOVE_3_4 || GCC_ABOVE_2_96)
+#else
+# define __HWLOC_HAVE_ATTRIBUTE_PURE 0
+#endif
+#if __HWLOC_HAVE_ATTRIBUTE_PURE
+# define __hwloc_attribute_pure __attribute__((__pure__))
+#else
+# define __hwloc_attribute_pure
+#endif
+
+#ifdef HWLOC_HAVE_ATTRIBUTE_DEPRECATED
+#define __HWLOC_HAVE_ATTRIBUTE_DEPRECATED HWLOC_HAVE_ATTRIBUTE_DEPRECATED 
+#elif defined(__GNUC__)
+# define __HWLOC_HAVE_ATTRIBUTE_DEPRECATED (GXX_ABOVE_3_4 || GCC_ABOVE_3_3)
+#else
+# define __HWLOC_HAVE_ATTRIBUTE_DEPRECATED 0
+#endif
+#if __HWLOC_HAVE_ATTRIBUTE_DEPRECATED
+# define __hwloc_attribute_deprecated __attribute__((__deprecated__))
+#else
+# define __hwloc_attribute_deprecated
+#endif
+
+#ifdef HWLOC_HAVE_ATTRIBUTE_MAY_ALIAS
+#define __HWLOC_HAVE_ATTRIBUTE_MAY_ALIAS HWLOC_HAVE_ATTRIBUTE_MAY_ALIAS
+#elif defined(__GNUC__)
+# define __HWLOC_HAVE_ATTRIBUTE_MAY_ALIAS (GXX_ABOVE_3_4 || GCC_ABOVE_3_3)
+#else
+# define __HWLOC_HAVE_ATTRIBUTE_MAY_ALIAS 0
+#endif
+#if __HWLOC_HAVE_ATTRIBUTE_MAY_ALIAS
+# define __hwloc_attribute_may_alias __attribute__((__may_alias__))
+#else
+# define __hwloc_attribute_may_alias
+#endif
+
+#ifdef HWLOC_C_HAVE_VISIBILITY
+# if HWLOC_C_HAVE_VISIBILITY
+#  define HWLOC_DECLSPEC __attribute__((__visibility__("default")))
+# else
+#  define HWLOC_DECLSPEC
+# endif
+#else
+# define HWLOC_DECLSPEC
+#endif
+
+/* Defined to 1 on Linux */
+#define HWLOC_LINUX_SYS 1
+
+/* Defined to 1 if the CPU_SET macro works */
+#define HWLOC_HAVE_CPU_SET 1
+
+/* Defined to 1 if you have the `windows.h' header. */
+/* #undef HWLOC_HAVE_WINDOWS_H */
+#define hwloc_pid_t pid_t
+#define hwloc_thread_t pthread_t
+
+#ifdef HWLOC_HAVE_WINDOWS_H
+
+#  include <windows.h>
+typedef DWORDLONG hwloc_uint64_t;
+
+#else /* HWLOC_HAVE_WINDOWS_H */
+
+#  ifdef hwloc_thread_t
+#    include <pthread.h>
+#  endif /* hwloc_thread_t */
+
+/* Defined to 1 if you have the <stdint.h> header file. */
+#  define HWLOC_HAVE_STDINT_H 1
+
+#  include <unistd.h>
+#  ifdef HWLOC_HAVE_STDINT_H
+#    include <stdint.h>
+#  endif
+typedef uint64_t hwloc_uint64_t;
+
+#endif /* HWLOC_HAVE_WINDOWS_H */
+
+/* Whether we need to re-define all the hwloc public symbols or not */
+#define HWLOC_SYM_TRANSFORM 0
+
+/* The hwloc symbol prefix */
+#define HWLOC_SYM_PREFIX hwloc_
+
+/* The hwloc symbol prefix in all caps */
+#define HWLOC_SYM_PREFIX_CAPS HWLOC_
+
+#endif /* HWLOC_CONFIG_H */
diff --git a/ext/hwloc/include/hwloc/autogen/config.h.in b/ext/hwloc/include/hwloc/autogen/config.h.in
new file mode 100644
index 0000000..e101b0a
--- /dev/null
+++ b/ext/hwloc/include/hwloc/autogen/config.h.in
@@ -0,0 +1,201 @@
+/* -*- c -*-
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2014 Inria.  All rights reserved.
+ * Copyright © 2009-2012 Université Bordeaux
+ * Copyright © 2009-2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/* The configuration file */
+
+#ifndef HWLOC_CONFIG_H
+#define HWLOC_CONFIG_H
+
+#if (__GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 95))
+# define __hwloc_restrict __restrict
+#else
+# if __STDC_VERSION__ >= 199901L
+#  define __hwloc_restrict restrict
+# else
+#  define __hwloc_restrict
+# endif
+#endif
+
+/* Note that if we're compiling C++, then just use the "inline"
+   keyword, since it's part of C++ */
+#if defined(c_plusplus) || defined(__cplusplus)
+#  define __hwloc_inline inline
+#elif defined(_MSC_VER) || defined(__HP_cc)
+#  define __hwloc_inline __inline
+#else
+#  define __hwloc_inline __inline__
+#endif
+
+/*
+ * Note: this is public.  We can not assume anything from the compiler used
+ * by the application and thus the HWLOC_HAVE_* macros below are not
+ * fetched from the autoconf result here. We only automatically use a few
+ * well-known easy cases.
+ */
+
+/* Some handy constants to make the logic below a little more readable */
+#if defined(__cplusplus) && \
+    (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 4))
+#define GXX_ABOVE_3_4 1 /* C++ compilation with a GNU compiler >= 3.4 */
+#else
+#define GXX_ABOVE_3_4 0 /* not C++, or GNU compiler older than 3.4 / not GNU */
+#endif /* GXX_ABOVE_3_4 */
+
+#if !defined(__cplusplus) && \
+    (__GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 95))
+#define GCC_ABOVE_2_95 1
+#else
+#define GCC_ABOVE_2_95 0
+#endif
+
+#if !defined(__cplusplus) && \
+    (__GNUC__ > 2 || (__GNUC__ == 2 && __GNUC_MINOR__ >= 96))
+#define GCC_ABOVE_2_96 1
+#else
+#define GCC_ABOVE_2_96 0
+#endif
+
+#if !defined(__cplusplus) && \
+    (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 3))
+#define GCC_ABOVE_3_3 1
+#else
+#define GCC_ABOVE_3_3 0
+#endif
+
+/* Maybe before gcc 2.95 too */
+#ifdef HWLOC_HAVE_ATTRIBUTE_UNUSED
+#define __HWLOC_HAVE_ATTRIBUTE_UNUSED HWLOC_HAVE_ATTRIBUTE_UNUSED 
+#elif defined(__GNUC__)
+# define __HWLOC_HAVE_ATTRIBUTE_UNUSED (GXX_ABOVE_3_4 || GCC_ABOVE_2_95)
+#else
+# define __HWLOC_HAVE_ATTRIBUTE_UNUSED 0
+#endif
+#if __HWLOC_HAVE_ATTRIBUTE_UNUSED
+# define __hwloc_attribute_unused __attribute__((__unused__))
+#else
+# define __hwloc_attribute_unused
+#endif
+
+#ifdef HWLOC_HAVE_ATTRIBUTE_MALLOC
+#define __HWLOC_HAVE_ATTRIBUTE_MALLOC HWLOC_HAVE_ATTRIBUTE_MALLOC 
+#elif defined(__GNUC__)
+# define __HWLOC_HAVE_ATTRIBUTE_MALLOC (GXX_ABOVE_3_4 || GCC_ABOVE_2_96)
+#else
+# define __HWLOC_HAVE_ATTRIBUTE_MALLOC 0
+#endif
+#if __HWLOC_HAVE_ATTRIBUTE_MALLOC
+# define __hwloc_attribute_malloc __attribute__((__malloc__))
+#else
+# define __hwloc_attribute_malloc
+#endif
+
+#ifdef HWLOC_HAVE_ATTRIBUTE_CONST
+#define __HWLOC_HAVE_ATTRIBUTE_CONST HWLOC_HAVE_ATTRIBUTE_CONST 
+#elif defined(__GNUC__)
+# define __HWLOC_HAVE_ATTRIBUTE_CONST (GXX_ABOVE_3_4 || GCC_ABOVE_2_95)
+#else
+# define __HWLOC_HAVE_ATTRIBUTE_CONST 0
+#endif
+#if __HWLOC_HAVE_ATTRIBUTE_CONST
+# define __hwloc_attribute_const __attribute__((__const__))
+#else
+# define __hwloc_attribute_const
+#endif
+
+#ifdef HWLOC_HAVE_ATTRIBUTE_PURE
+#define __HWLOC_HAVE_ATTRIBUTE_PURE HWLOC_HAVE_ATTRIBUTE_PURE 
+#elif defined(__GNUC__)
+# define __HWLOC_HAVE_ATTRIBUTE_PURE (GXX_ABOVE_3_4 || GCC_ABOVE_2_96)
+#else
+# define __HWLOC_HAVE_ATTRIBUTE_PURE 0
+#endif
+#if __HWLOC_HAVE_ATTRIBUTE_PURE
+# define __hwloc_attribute_pure __attribute__((__pure__))
+#else
+# define __hwloc_attribute_pure
+#endif
+
+#ifdef HWLOC_HAVE_ATTRIBUTE_DEPRECATED
+#define __HWLOC_HAVE_ATTRIBUTE_DEPRECATED HWLOC_HAVE_ATTRIBUTE_DEPRECATED 
+#elif defined(__GNUC__)
+# define __HWLOC_HAVE_ATTRIBUTE_DEPRECATED (GXX_ABOVE_3_4 || GCC_ABOVE_3_3)
+#else
+# define __HWLOC_HAVE_ATTRIBUTE_DEPRECATED 0
+#endif
+#if __HWLOC_HAVE_ATTRIBUTE_DEPRECATED
+# define __hwloc_attribute_deprecated __attribute__((__deprecated__))
+#else
+# define __hwloc_attribute_deprecated
+#endif
+
+#ifdef HWLOC_HAVE_ATTRIBUTE_MAY_ALIAS
+#define __HWLOC_HAVE_ATTRIBUTE_MAY_ALIAS HWLOC_HAVE_ATTRIBUTE_MAY_ALIAS
+#elif defined(__GNUC__)
+# define __HWLOC_HAVE_ATTRIBUTE_MAY_ALIAS (GXX_ABOVE_3_4 || GCC_ABOVE_3_3)
+#else
+# define __HWLOC_HAVE_ATTRIBUTE_MAY_ALIAS 0
+#endif
+#if __HWLOC_HAVE_ATTRIBUTE_MAY_ALIAS
+# define __hwloc_attribute_may_alias __attribute__((__may_alias__))
+#else
+# define __hwloc_attribute_may_alias
+#endif
+
+#ifdef HWLOC_C_HAVE_VISIBILITY
+# if HWLOC_C_HAVE_VISIBILITY
+#  define HWLOC_DECLSPEC __attribute__((__visibility__("default")))
+# else
+#  define HWLOC_DECLSPEC
+# endif
+#else
+# define HWLOC_DECLSPEC
+#endif
+
+/* Defined to 1 on Linux */
+#undef HWLOC_LINUX_SYS
+
+/* Defined to 1 if the CPU_SET macro works */
+#undef HWLOC_HAVE_CPU_SET
+
+/* Defined to 1 if you have the `windows.h' header. */
+#undef HWLOC_HAVE_WINDOWS_H
+#undef hwloc_pid_t
+#undef hwloc_thread_t
+
+#ifdef HWLOC_HAVE_WINDOWS_H
+
+#  include <windows.h>
+typedef DWORDLONG hwloc_uint64_t;
+
+#else /* HWLOC_HAVE_WINDOWS_H */
+
+#  ifdef hwloc_thread_t
+#    include <pthread.h>
+#  endif /* hwloc_thread_t */
+
+/* Defined to 1 if you have the <stdint.h> header file. */
+#  undef HWLOC_HAVE_STDINT_H
+
+#  include <unistd.h>
+#  ifdef HWLOC_HAVE_STDINT_H
+#    include <stdint.h>
+#  endif
+typedef uint64_t hwloc_uint64_t;
+
+#endif /* HWLOC_HAVE_WINDOWS_H */
+
+/* Whether we need to re-define all the hwloc public symbols or not */
+#undef HWLOC_SYM_TRANSFORM
+
+/* The hwloc symbol prefix */
+#undef HWLOC_SYM_PREFIX
+
+/* The hwloc symbol prefix in all caps */
+#undef HWLOC_SYM_PREFIX_CAPS
+
+#endif /* HWLOC_CONFIG_H */
diff --git a/ext/hwloc/include/hwloc/autogen/stamp-h2 b/ext/hwloc/include/hwloc/autogen/stamp-h2
new file mode 100644
index 0000000..804e0ac
--- /dev/null
+++ b/ext/hwloc/include/hwloc/autogen/stamp-h2
@@ -0,0 +1 @@
+timestamp for include/hwloc/autogen/config.h
diff --git a/ext/hwloc/include/hwloc/bitmap.h b/ext/hwloc/include/hwloc/bitmap.h
new file mode 100644
index 0000000..bb18f65
--- /dev/null
+++ b/ext/hwloc/include/hwloc/bitmap.h
@@ -0,0 +1,359 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2015 Inria.  All rights reserved.
+ * Copyright © 2009-2012 Université Bordeaux
+ * Copyright © 2009-2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief The bitmap API, for use in hwloc itself.
+ */
+
+#ifndef HWLOC_BITMAP_H
+#define HWLOC_BITMAP_H
+
+#include <hwloc/autogen/config.h>
+#include <assert.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/** \defgroup hwlocality_bitmap The bitmap API
+ *
+ * The ::hwloc_bitmap_t type represents a set of objects, typically OS
+ * processors -- which may actually be hardware threads (represented
+ * by ::hwloc_cpuset_t, which is a typedef for ::hwloc_bitmap_t) -- or
+ * memory nodes (represented by ::hwloc_nodeset_t, which is also a
+ * typedef for ::hwloc_bitmap_t).
+ *
+ * <em>Both CPU and node sets are always indexed by OS physical number.</em>
+ *
+ * \note CPU sets and nodesets are described in \ref hwlocality_object_sets.
+ *
+ * A bitmap may be of infinite size.
+ *
+ * \note Several examples of using the bitmap API are available under the
+ * doc/examples/ directory in the source tree.
+ * Regression tests such as tests/hwloc_bitmap*.c also make intensive use
+ * of this API.
+ * @{
+ */
+
+
+/** \brief
+ * Set of bits represented as an opaque pointer to an internal bitmap.
+ */
+typedef struct hwloc_bitmap_s * hwloc_bitmap_t;
+/** \brief a non-modifiable ::hwloc_bitmap_t */
+typedef const struct hwloc_bitmap_s * hwloc_const_bitmap_t;
+
+
+/*
+ * Bitmap allocation, freeing and copying.
+ */
+
+/** \brief Allocate a new empty bitmap.
+ *
+ * \returns A valid bitmap or \c NULL.
+ *
+ * The bitmap should be freed by a corresponding call to
+ * hwloc_bitmap_free().
+ */
+HWLOC_DECLSPEC hwloc_bitmap_t hwloc_bitmap_alloc(void) __hwloc_attribute_malloc;
+
+/** \brief Allocate a new full bitmap. */
+HWLOC_DECLSPEC hwloc_bitmap_t hwloc_bitmap_alloc_full(void) __hwloc_attribute_malloc;
+
+/** \brief Free bitmap \p bitmap.
+ *
+ * If \p bitmap is \c NULL, no operation is performed.
+ */
+HWLOC_DECLSPEC void hwloc_bitmap_free(hwloc_bitmap_t bitmap);
+
+/** \brief Duplicate bitmap \p bitmap by allocating a new bitmap and copying \p bitmap contents.
+ *
+ * If \p bitmap is \c NULL, \c NULL is returned.
+ */
+HWLOC_DECLSPEC hwloc_bitmap_t hwloc_bitmap_dup(hwloc_const_bitmap_t bitmap) __hwloc_attribute_malloc;
+
+/** \brief Copy the contents of bitmap \p src into the already allocated bitmap \p dst */
+HWLOC_DECLSPEC void hwloc_bitmap_copy(hwloc_bitmap_t dst, hwloc_const_bitmap_t src);
+
+
+/*
+ * Bitmap/String Conversion
+ */
+
+/** \brief Stringify a bitmap.
+ *
+ * Up to \p buflen characters may be written in buffer \p buf.
+ *
+ * If \p buflen is 0, \p buf may safely be \c NULL.
+ *
+ * \return the number of characters that were actually written if not truncating,
+ * or that would have been written (not including the ending \\0).
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_snprintf(char * __hwloc_restrict buf, size_t buflen, hwloc_const_bitmap_t bitmap);
+
+/** \brief Stringify a bitmap into a newly allocated string.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_asprintf(char ** strp, hwloc_const_bitmap_t bitmap);
+
+/** \brief Parse a bitmap string and stores it in bitmap \p bitmap.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_sscanf(hwloc_bitmap_t bitmap, const char * __hwloc_restrict string);
+
+/** \brief Stringify a bitmap in the list format.
+ *
+ * Lists are comma-separated indexes or ranges.
+ * Ranges are dash separated indexes.
+ * The last range may not have an ending index if the bitmap is infinite.
+ *
+ * Up to \p buflen characters may be written in buffer \p buf.
+ *
+ * If \p buflen is 0, \p buf may safely be \c NULL.
+ *
+ * \return the number of characters that were actually written if not truncating,
+ * or that would have been written (not including the ending \\0).
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_list_snprintf(char * __hwloc_restrict buf, size_t buflen, hwloc_const_bitmap_t bitmap);
+
+/** \brief Stringify a bitmap into a newly allocated list string.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_list_asprintf(char ** strp, hwloc_const_bitmap_t bitmap);
+
+/** \brief Parse a list string and stores it in bitmap \p bitmap.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_list_sscanf(hwloc_bitmap_t bitmap, const char * __hwloc_restrict string);
+
+/** \brief Stringify a bitmap in the taskset-specific format.
+ *
+ * The taskset command manipulates bitmap strings that contain a single
+ * (possibly very long) hexadecimal number starting with 0x.
+ *
+ * Up to \p buflen characters may be written in buffer \p buf.
+ *
+ * If \p buflen is 0, \p buf may safely be \c NULL.
+ *
+ * \return the number of characters that were actually written if not truncating,
+ * or that would have been written (not including the ending \\0).
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_taskset_snprintf(char * __hwloc_restrict buf, size_t buflen, hwloc_const_bitmap_t bitmap);
+
+/** \brief Stringify a bitmap into a newly allocated taskset-specific string.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_taskset_asprintf(char ** strp, hwloc_const_bitmap_t bitmap);
+
+/** \brief Parse a taskset-specific bitmap string and stores it in bitmap \p bitmap.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_taskset_sscanf(hwloc_bitmap_t bitmap, const char * __hwloc_restrict string);
+
+
+/*
+ * Building bitmaps.
+ */
+
+/** \brief Empty the bitmap \p bitmap */
+HWLOC_DECLSPEC void hwloc_bitmap_zero(hwloc_bitmap_t bitmap);
+
+/** \brief Fill bitmap \p bitmap with all possible indexes (even if those objects don't exist or are otherwise unavailable) */
+HWLOC_DECLSPEC void hwloc_bitmap_fill(hwloc_bitmap_t bitmap);
+
+/** \brief Empty the bitmap \p bitmap and add bit \p id */
+HWLOC_DECLSPEC void hwloc_bitmap_only(hwloc_bitmap_t bitmap, unsigned id);
+
+/** \brief Fill the bitmap \p bitmap and clear the index \p id */
+HWLOC_DECLSPEC void hwloc_bitmap_allbut(hwloc_bitmap_t bitmap, unsigned id);
+
+/** \brief Setup bitmap \p bitmap from unsigned long \p mask */
+HWLOC_DECLSPEC void hwloc_bitmap_from_ulong(hwloc_bitmap_t bitmap, unsigned long mask);
+
+/** \brief Setup bitmap \p bitmap from unsigned long \p mask used as \p i -th subset */
+HWLOC_DECLSPEC void hwloc_bitmap_from_ith_ulong(hwloc_bitmap_t bitmap, unsigned i, unsigned long mask);
+
+
+/*
+ * Modifying bitmaps.
+ */
+
+/** \brief Add index \p id in bitmap \p bitmap */
+HWLOC_DECLSPEC void hwloc_bitmap_set(hwloc_bitmap_t bitmap, unsigned id);
+
+/** \brief Add indexes from \p begin to \p end in bitmap \p bitmap.
+ *
+ * If \p end is \c -1, the range is infinite.
+ */
+HWLOC_DECLSPEC void hwloc_bitmap_set_range(hwloc_bitmap_t bitmap, unsigned begin, int end);
+
+/** \brief Replace \p i -th subset of bitmap \p bitmap with unsigned long \p mask */
+HWLOC_DECLSPEC void hwloc_bitmap_set_ith_ulong(hwloc_bitmap_t bitmap, unsigned i, unsigned long mask);
+
+/** \brief Remove index \p id from bitmap \p bitmap */
+HWLOC_DECLSPEC void hwloc_bitmap_clr(hwloc_bitmap_t bitmap, unsigned id);
+
+/** \brief Remove indexes from \p begin to \p end in bitmap \p bitmap.
+ *
+ * If \p end is \c -1, the range is infinite.
+ */
+HWLOC_DECLSPEC void hwloc_bitmap_clr_range(hwloc_bitmap_t bitmap, unsigned begin, int end);
+
+/** \brief Keep a single index among those set in bitmap \p bitmap
+ *
+ * May be useful before binding so that the process does not
+ * have a chance of migrating between multiple logical CPUs
+ * in the original mask.
+ */
+HWLOC_DECLSPEC void hwloc_bitmap_singlify(hwloc_bitmap_t bitmap);
+
+
+/*
+ * Consulting bitmaps.
+ */
+
+/** \brief Convert the beginning part of bitmap \p bitmap into unsigned long \p mask */
+HWLOC_DECLSPEC unsigned long hwloc_bitmap_to_ulong(hwloc_const_bitmap_t bitmap) __hwloc_attribute_pure;
+
+/** \brief Convert the \p i -th subset of bitmap \p bitmap into unsigned long mask */
+HWLOC_DECLSPEC unsigned long hwloc_bitmap_to_ith_ulong(hwloc_const_bitmap_t bitmap, unsigned i) __hwloc_attribute_pure;
+
+/** \brief Test whether index \p id is part of bitmap \p bitmap */
+HWLOC_DECLSPEC int hwloc_bitmap_isset(hwloc_const_bitmap_t bitmap, unsigned id) __hwloc_attribute_pure;
+
+/** \brief Test whether bitmap \p bitmap is empty */
+HWLOC_DECLSPEC int hwloc_bitmap_iszero(hwloc_const_bitmap_t bitmap) __hwloc_attribute_pure;
+
+/** \brief Test whether bitmap \p bitmap is completely full */
+HWLOC_DECLSPEC int hwloc_bitmap_isfull(hwloc_const_bitmap_t bitmap) __hwloc_attribute_pure;
+
+/** \brief Compute the first index (least significant bit) in bitmap \p bitmap
+ *
+ * \return -1 if no index is set.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_first(hwloc_const_bitmap_t bitmap) __hwloc_attribute_pure;
+
+/** \brief Compute the next index in bitmap \p bitmap which is after index \p prev
+ *
+ * If \p prev is -1, the first index is returned.
+ *
+ * \return -1 if no index higher than \p prev is set in the bitmap.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_next(hwloc_const_bitmap_t bitmap, int prev) __hwloc_attribute_pure;
+
+/** \brief Compute the last index (most significant bit) in bitmap \p bitmap
+ *
+ * \return -1 if no index is set in the bitmap, or if the bitmap is infinite.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_last(hwloc_const_bitmap_t bitmap) __hwloc_attribute_pure;
+
+/** \brief Compute the "weight" of bitmap \p bitmap (i.e., number of
+ * indexes that are in the bitmap).
+ *
+ * \return the number of indexes that are in the bitmap.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_weight(hwloc_const_bitmap_t bitmap) __hwloc_attribute_pure;
+
+/** \brief Loop macro iterating on bitmap \p bitmap
+ * \hideinitializer
+ *
+ * \p id is the loop variable; it should be an unsigned int.  The
+ * first iteration will set \p id to the lowest index in the bitmap.
+ * Successive iterations will iterate through, in order, all remaining
+ * indexes that are in the bitmap.  To be specific: each iteration will return a
+ * value for \p id such that hwloc_bitmap_isset(bitmap, id) is true.
+ *
+ * The assert prevents the loop from being infinite if the bitmap is infinite.
+ */
+#define hwloc_bitmap_foreach_begin(id, bitmap) \
+do { \
+        assert(hwloc_bitmap_weight(bitmap) != -1); \
+        for (id = hwloc_bitmap_first(bitmap); \
+             (unsigned) id != (unsigned) -1; \
+             id = hwloc_bitmap_next(bitmap, id)) { \
+/** \brief End of loop. Needs a terminating ';'.
+ * \hideinitializer
+ *
+ * \sa hwloc_bitmap_foreach_begin */
+#define hwloc_bitmap_foreach_end() \
+        } \
+} while (0)
+
+
+/*
+ * Combining bitmaps.
+ */
+
+/** \brief Or bitmaps \p bitmap1 and \p bitmap2 and store the result in bitmap \p res
+ *
+ * \p res can be the same as \p bitmap1 or \p bitmap2
+ */
+HWLOC_DECLSPEC void hwloc_bitmap_or (hwloc_bitmap_t res, hwloc_const_bitmap_t bitmap1, hwloc_const_bitmap_t bitmap2);
+
+/** \brief And bitmaps \p bitmap1 and \p bitmap2 and store the result in bitmap \p res
+ *
+ * \p res can be the same as \p bitmap1 or \p bitmap2
+ */
+HWLOC_DECLSPEC void hwloc_bitmap_and (hwloc_bitmap_t res, hwloc_const_bitmap_t bitmap1, hwloc_const_bitmap_t bitmap2);
+
+/** \brief And bitmap \p bitmap1 and the negation of \p bitmap2 and store the result in bitmap \p res
+ *
+ * \p res can be the same as \p bitmap1 or \p bitmap2
+ */
+HWLOC_DECLSPEC void hwloc_bitmap_andnot (hwloc_bitmap_t res, hwloc_const_bitmap_t bitmap1, hwloc_const_bitmap_t bitmap2);
+
+/** \brief Xor bitmaps \p bitmap1 and \p bitmap2 and store the result in bitmap \p res
+ *
+ * \p res can be the same as \p bitmap1 or \p bitmap2
+ */
+HWLOC_DECLSPEC void hwloc_bitmap_xor (hwloc_bitmap_t res, hwloc_const_bitmap_t bitmap1, hwloc_const_bitmap_t bitmap2);
+
+/** \brief Negate bitmap \p bitmap and store the result in bitmap \p res
+ *
+ * \p res can be the same as \p bitmap
+ */
+HWLOC_DECLSPEC void hwloc_bitmap_not (hwloc_bitmap_t res, hwloc_const_bitmap_t bitmap);
+
+
+/*
+ * Comparing bitmaps.
+ */
+
+/** \brief Test whether bitmaps \p bitmap1 and \p bitmap2 intersect */
+HWLOC_DECLSPEC int hwloc_bitmap_intersects (hwloc_const_bitmap_t bitmap1, hwloc_const_bitmap_t bitmap2) __hwloc_attribute_pure;
+
+/** \brief Test whether bitmap \p sub_bitmap is part of bitmap \p super_bitmap */
+HWLOC_DECLSPEC int hwloc_bitmap_isincluded (hwloc_const_bitmap_t sub_bitmap, hwloc_const_bitmap_t super_bitmap) __hwloc_attribute_pure;
+
+/** \brief Test whether bitmap \p bitmap1 is equal to bitmap \p bitmap2 */
+HWLOC_DECLSPEC int hwloc_bitmap_isequal (hwloc_const_bitmap_t bitmap1, hwloc_const_bitmap_t bitmap2) __hwloc_attribute_pure;
+
+/** \brief Compare bitmaps \p bitmap1 and \p bitmap2 using their lowest index.
+ *
+ * Smaller least significant bit is smaller.
+ * The empty bitmap is considered higher than anything.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_compare_first(hwloc_const_bitmap_t bitmap1, hwloc_const_bitmap_t bitmap2) __hwloc_attribute_pure;
+
+/** \brief Compare bitmaps \p bitmap1 and \p bitmap2 in lexicographic order.
+ *
+ * Lexicographic comparison of bitmaps, starting from their highest indexes.
+ * Compare last indexes first, then second, etc.
+ * The empty bitmap is considered lower than anything.
+ *
+ * \note This is different from the non-existing hwloc_bitmap_compare_last()
+ * which would only compare the highest index of each bitmap.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_compare(hwloc_const_bitmap_t bitmap1, hwloc_const_bitmap_t bitmap2) __hwloc_attribute_pure;
+
+/** @} */
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_BITMAP_H */
diff --git a/ext/hwloc/include/hwloc/cuda.h b/ext/hwloc/include/hwloc/cuda.h
new file mode 100644
index 0000000..a02d677
--- /dev/null
+++ b/ext/hwloc/include/hwloc/cuda.h
@@ -0,0 +1,224 @@
+/*
+ * Copyright © 2010-2015 Inria.  All rights reserved.
+ * Copyright © 2010-2011 Université Bordeaux
+ * Copyright © 2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief Macros to help interaction between hwloc and the CUDA Driver API.
+ *
+ * Applications that use both hwloc and the CUDA Driver API may want to
+ * include this file so as to get topology information for CUDA devices.
+ *
+ */
+
+#ifndef HWLOC_CUDA_H
+#define HWLOC_CUDA_H
+
+#include <hwloc.h>
+#include <hwloc/autogen/config.h>
+#include <hwloc/helper.h>
+#ifdef HWLOC_LINUX_SYS
+#include <hwloc/linux.h>
+#endif
+
+#include <cuda.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/** \defgroup hwlocality_cuda Interoperability with the CUDA Driver API
+ *
+ * This interface offers ways to retrieve topology information about
+ * CUDA devices when using the CUDA Driver API.
+ *
+ * @{
+ */
+
+/** \brief Return the domain, bus and device IDs of the CUDA device \p cudevice.
+ *
+ * Device \p cudevice must match the local machine.
+ */
+static __hwloc_inline int
+hwloc_cuda_get_device_pci_ids(hwloc_topology_t topology __hwloc_attribute_unused,
+			      CUdevice cudevice, int *domain, int *bus, int *dev)
+{
+  /* Every attribute query must succeed; the first failure aborts with ENOSYS. */
+#if CUDA_VERSION >= 4000
+  if (cuDeviceGetAttribute(domain, CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, cudevice) != CUDA_SUCCESS)
+    goto unsupported;
+#else
+  /* The domain attribute is not queried below CUDA_VERSION 4000: report domain 0. */
+  *domain = 0;
+#endif
+
+  /* PCI bus ID */
+  if (cuDeviceGetAttribute(bus, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, cudevice) != CUDA_SUCCESS)
+    goto unsupported;
+
+  /* PCI device ID */
+  if (cuDeviceGetAttribute(dev, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, cudevice) != CUDA_SUCCESS)
+    goto unsupported;
+
+  return 0;
+
+unsupported:
+  /* Shared failure exit for all the queries above. */
+  errno = ENOSYS;
+  return -1;
+}
+
+/** \brief Get the CPU set of logical processors that are physically
+ * close to device \p cudevice.
+ *
+ * Return the CPU set describing the locality of the CUDA device \p cudevice.
+ *
+ * Topology \p topology and device \p cudevice must match the local machine.
+ * I/O devices detection and the CUDA component are not needed in the topology.
+ *
+ * The function only returns the locality of the device.
+ * If more information about the device is needed, OS objects should
+ * be used instead, see hwloc_cuda_get_device_osdev()
+ * and hwloc_cuda_get_device_osdev_by_index().
+ *
+ * This function is currently only implemented in a meaningful way for
+ * Linux; other systems will simply get a full cpuset.
+ */
+static __hwloc_inline int
+hwloc_cuda_get_device_cpuset(hwloc_topology_t topology __hwloc_attribute_unused,
+			     CUdevice cudevice, hwloc_cpuset_t set)
+{
+#ifdef HWLOC_LINUX_SYS
+  /* If we're on Linux, use the sysfs mechanism to get the local cpus */
+#define HWLOC_CUDA_DEVICE_SYSFS_PATH_MAX 128
+  char path[HWLOC_CUDA_DEVICE_SYSFS_PATH_MAX];
+  FILE *sysfile = NULL;
+  int domainid, busid, deviceid;
+
+  if (hwloc_cuda_get_device_pci_ids(topology, cudevice, &domainid, &busid, &deviceid))
+    return -1;
+
+  if (!hwloc_topology_is_thissystem(topology)) {
+    errno = EINVAL;
+    return -1;
+  }
+  /* Bounded formatting of the sysfs path; %x conversions take unsigned values. */
+  snprintf(path, sizeof(path), "/sys/bus/pci/devices/%04x:%02x:%02x.0/local_cpus",
+	   (unsigned) domainid, (unsigned) busid, (unsigned) deviceid);
+  sysfile = fopen(path, "r");
+  if (!sysfile)
+    return -1;
+  /* An empty local_cpus mask is replaced by the complete cpuset of the topology. */
+  hwloc_linux_parse_cpumap_file(sysfile, set);
+  if (hwloc_bitmap_iszero(set))
+    hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
+  fclose(sysfile);
+#else
+  /* Non-Linux systems simply get a full cpuset */
+  hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
+#endif
+  return 0;
+}
+
+/** \brief Get the hwloc PCI device object corresponding to the
+ * CUDA device \p cudevice.
+ *
+ * Return the PCI device object describing the CUDA device \p cudevice.
+ * Return NULL if there is none.
+ *
+ * Topology \p topology and device \p cudevice must match the local machine.
+ * I/O devices detection must be enabled in topology \p topology.
+ * The CUDA component is not needed in the topology.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_cuda_get_device_pcidev(hwloc_topology_t topology, CUdevice cudevice)
+{
+  int pci_domain, pci_bus, pci_dev;
+  /* Without the PCI IDs of the device there is nothing to look up. */
+  if (hwloc_cuda_get_device_pci_ids(topology, cudevice, &pci_domain, &pci_bus, &pci_dev) != 0)
+    return NULL;
+  /* Look the object up by bus ID (the trailing 0 is presumably the PCI function). */
+  return hwloc_get_pcidev_by_busid(topology, pci_domain, pci_bus, pci_dev, 0);
+}
+
+/** \brief Get the hwloc OS device object corresponding to CUDA device \p cudevice.
+ *
+ * Return the hwloc OS device object that describes the given
+ * CUDA device \p cudevice. Return NULL if there is none.
+ *
+ * Topology \p topology and device \p cudevice must match the local machine.
+ * I/O devices detection and the CUDA component must be enabled in the topology.
+ * If not, the locality of the object may still be found using
+ * hwloc_cuda_get_device_cpuset().
+ *
+ * \note The corresponding hwloc PCI device may be found by looking
+ * at the result parent pointer.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_cuda_get_device_osdev(hwloc_topology_t topology, CUdevice cudevice)
+{
+	hwloc_obj_t osdev = NULL;
+	int domain, bus, dev;
+
+	if (hwloc_cuda_get_device_pci_ids(topology, cudevice, &domain, &bus, &dev))
+		return NULL;
+
+	/* Walk all OS devices; unnamed ones cannot match "cuda" and are skipped. */
+	while ((osdev = hwloc_get_next_osdev(topology, osdev)) != NULL) {
+		hwloc_obj_t pcidev = osdev->parent;
+		if (!osdev->name || strncmp(osdev->name, "cuda", 4))
+			continue;
+		if (pcidev
+		    && pcidev->type == HWLOC_OBJ_PCI_DEVICE
+		    && (int) pcidev->attr->pcidev.domain == domain
+		    && (int) pcidev->attr->pcidev.bus == bus
+		    && (int) pcidev->attr->pcidev.dev == dev
+		    && pcidev->attr->pcidev.func == 0)
+			return osdev;
+	}
+
+	return NULL;
+}
+
+/** \brief Get the hwloc OS device object corresponding to the
+ * CUDA device whose index is \p idx.
+ *
+ * Return the OS device object describing the CUDA device whose
+ * index is \p idx. Return NULL if there is none.
+ *
+ * The topology \p topology does not necessarily have to match the current
+ * machine. For instance the topology may be an XML import of a remote host.
+ * I/O devices detection and the CUDA component must be enabled in the topology.
+ *
+ * \note The corresponding PCI device object can be obtained by looking
+ * at the OS device parent object.
+ *
+ * \note This function is identical to hwloc_cudart_get_device_osdev_by_index().
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_cuda_get_device_osdev_by_index(hwloc_topology_t topology, unsigned idx)
+{
+	hwloc_obj_t cur;
+	/* Scan the OS devices for a named co-processor called "cuda" + idx. */
+	for (cur = hwloc_get_next_osdev(topology, NULL); cur != NULL; cur = hwloc_get_next_osdev(topology, cur)) {
+		if (cur->attr->osdev.type != HWLOC_OBJ_OSDEV_COPROC || !cur->name)
+			continue;
+		if (!strncmp("cuda", cur->name, 4) && atoi(cur->name + 4) == (int) idx)
+			return cur;
+	}
+	return NULL;
+}
+
+/** @} */
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_CUDA_H */
diff --git a/ext/hwloc/include/hwloc/cudart.h b/ext/hwloc/include/hwloc/cudart.h
new file mode 100644
index 0000000..759c3cf
--- /dev/null
+++ b/ext/hwloc/include/hwloc/cudart.h
@@ -0,0 +1,184 @@
+/*
+ * Copyright © 2010-2015 Inria.  All rights reserved.
+ * Copyright © 2010-2011 Université Bordeaux
+ * Copyright © 2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief Macros to help interaction between hwloc and the CUDA Runtime API.
+ *
+ * Applications that use both hwloc and the CUDA Runtime API may want to
+ * include this file so as to get topology information for CUDA devices.
+ *
+ */
+
+#ifndef HWLOC_CUDART_H
+#define HWLOC_CUDART_H
+
+#include <hwloc.h>
+#include <hwloc/autogen/config.h>
+#include <hwloc/helper.h>
+#ifdef HWLOC_LINUX_SYS
+#include <hwloc/linux.h>
+#endif
+
+#include <cuda.h> /* for CUDA_VERSION */
+#include <cuda_runtime_api.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/** \defgroup hwlocality_cudart Interoperability with the CUDA Runtime API
+ *
+ * This interface offers ways to retrieve topology information about
+ * CUDA devices when using the CUDA Runtime API.
+ *
+ * @{
+ */
+
+/** \brief Return the domain, bus and device IDs of the CUDA device whose index is \p idx.
+ *
+ * Device index \p idx must match the local machine.
+ */
+static __hwloc_inline int
+hwloc_cudart_get_device_pci_ids(hwloc_topology_t topology __hwloc_attribute_unused,
+				int idx, int *domain, int *bus, int *dev)
+{
+  struct cudaDeviceProp props;
+
+  /* Any failure to query the device properties is reported as ENOSYS. */
+  if (cudaGetDeviceProperties(&props, idx)) {
+    errno = ENOSYS;
+    return -1;
+  }
+
+  /* The domain ID is only read from the properties when CUDA_VERSION >= 4000. */
+#if CUDA_VERSION >= 4000
+  *domain = props.pciDomainID;
+#else
+  *domain = 0;
+#endif
+
+  /* Bus and device IDs are copied unconditionally. */
+  *bus = props.pciBusID;
+  *dev = props.pciDeviceID;
+  return 0;
+}
+
+/** \brief Get the CPU set of logical processors that are physically
+ * close to device \p idx.
+ *
+ * Return the CPU set describing the locality of the CUDA device
+ * whose index is \p idx.
+ *
+ * Topology \p topology and device \p idx must match the local machine.
+ * I/O devices detection and the CUDA component are not needed in the topology.
+ *
+ * The function only returns the locality of the device.
+ * If more information about the device is needed, OS objects should
+ * be used instead, see hwloc_cudart_get_device_osdev_by_index().
+ *
+ * This function is currently only implemented in a meaningful way for
+ * Linux; other systems will simply get a full cpuset.
+ */
+static __hwloc_inline int
+hwloc_cudart_get_device_cpuset(hwloc_topology_t topology __hwloc_attribute_unused,
+			       int idx, hwloc_cpuset_t set)
+{
+#ifdef HWLOC_LINUX_SYS
+  /* If we're on Linux, use the sysfs mechanism to get the local cpus */
+#define HWLOC_CUDART_DEVICE_SYSFS_PATH_MAX 128
+  char path[HWLOC_CUDART_DEVICE_SYSFS_PATH_MAX];
+  FILE *sysfile = NULL;
+  int domain, bus, dev;
+
+  if (hwloc_cudart_get_device_pci_ids(topology, idx, &domain, &bus, &dev))
+    return -1;
+
+  if (!hwloc_topology_is_thissystem(topology)) {
+    errno = EINVAL;
+    return -1;
+  }
+  /* Bounded formatting of the sysfs path; %x conversions take unsigned values. */
+  snprintf(path, sizeof(path), "/sys/bus/pci/devices/%04x:%02x:%02x.0/local_cpus",
+	   (unsigned) domain, (unsigned) bus, (unsigned) dev);
+  sysfile = fopen(path, "r");
+  if (!sysfile)
+    return -1;
+  /* An empty local_cpus mask is replaced by the complete cpuset of the topology. */
+  hwloc_linux_parse_cpumap_file(sysfile, set);
+  if (hwloc_bitmap_iszero(set))
+    hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
+  fclose(sysfile);
+#else
+  /* Non-Linux systems simply get a full cpuset */
+  hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
+#endif
+  return 0;
+}
+
+/** \brief Get the hwloc PCI device object corresponding to the
+ * CUDA device whose index is \p idx.
+ *
+ * Return the PCI device object describing the CUDA device whose
+ * index is \p idx. Return NULL if there is none.
+ *
+ * Topology \p topology and device \p idx must match the local machine.
+ * I/O devices detection must be enabled in topology \p topology.
+ * The CUDA component is not needed in the topology.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_cudart_get_device_pcidev(hwloc_topology_t topology, int idx)
+{
+  int pci_domain, pci_bus, pci_dev;
+  /* Without the PCI IDs of the device there is nothing to look up. */
+  if (hwloc_cudart_get_device_pci_ids(topology, idx, &pci_domain, &pci_bus, &pci_dev) != 0)
+    return NULL;
+  /* Look the object up by bus ID (the trailing 0 is presumably the PCI function). */
+  return hwloc_get_pcidev_by_busid(topology, pci_domain, pci_bus, pci_dev, 0);
+}
+
+/** \brief Get the hwloc OS device object corresponding to the
+ * CUDA device whose index is \p idx.
+ *
+ * Return the OS device object describing the CUDA device whose
+ * index is \p idx. Return NULL if there is none.
+ *
+ * The topology \p topology does not necessarily have to match the current
+ * machine. For instance the topology may be an XML import of a remote host.
+ * I/O devices detection and the CUDA component must be enabled in the topology.
+ * If not, the locality of the object may still be found using
+ * hwloc_cudart_get_device_cpuset().
+ *
+ * \note The corresponding PCI device object can be obtained by looking
+ * at the OS device parent object.
+ *
+ * \note This function is identical to hwloc_cuda_get_device_osdev_by_index().
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_cudart_get_device_osdev_by_index(hwloc_topology_t topology, unsigned idx)
+{
+	hwloc_obj_t candidate = hwloc_get_next_osdev(topology, NULL);
+	/* Keep the first named co-processor OS device called "cuda" + idx. */
+	for (; candidate != NULL; candidate = hwloc_get_next_osdev(topology, candidate)) {
+		if (candidate->attr->osdev.type != HWLOC_OBJ_OSDEV_COPROC || !candidate->name)
+			continue;
+		if (!strncmp("cuda", candidate->name, 4) && atoi(candidate->name + 4) == (int) idx)
+			return candidate;
+	}
+	return NULL;
+}
+
+/** @} */
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_CUDART_H */
diff --git a/ext/hwloc/include/hwloc/deprecated.h b/ext/hwloc/include/hwloc/deprecated.h
new file mode 100644
index 0000000..c4370b6
--- /dev/null
+++ b/ext/hwloc/include/hwloc/deprecated.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2015 Inria.  All rights reserved.
+ * Copyright © 2009-2012 Université Bordeaux
+ * Copyright © 2009-2010 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/**
+ * This file contains the deprecated parts of the hwloc API, kept for backward compatibility.
+ */
+
+#ifndef HWLOC_DEPRECATED_H
+#define HWLOC_DEPRECATED_H
+
+#ifndef HWLOC_H
+#error Please include the main hwloc.h instead
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* backward compat with v1.10 before Socket->Package renaming */
+#define HWLOC_OBJ_SOCKET HWLOC_OBJ_PACKAGE
+/* backward compat with v1.10 before Node->NUMANode clarification */
+#define HWLOC_OBJ_NODE HWLOC_OBJ_NUMANODE
+
+/** \brief Return an object type from the string
+ *
+ * \return -1 if unrecognized.
+ */
+HWLOC_DECLSPEC hwloc_obj_type_t hwloc_obj_type_of_string (const char * string) __hwloc_attribute_pure __hwloc_attribute_deprecated;
+
+/** \brief Stringify a given topology object into a human-readable form.
+ *
+ * \note This function is deprecated in favor of hwloc_obj_type_snprintf()
+ * and hwloc_obj_attr_snprintf() since it is not very flexible and
+ * only prints physical/OS indexes.
+ *
+ * Fill string \p string up to \p size characters with the description
+ * of topology object \p obj in topology \p topology.
+ *
+ * If \p verbose is set, a longer description is used. Otherwise a
+ * short description is used.
+ *
+ * \p indexprefix is used to prefix the \p os_index attribute number of
+ * the object in the description. If \c NULL, the \c # character is used.
+ *
+ * If \p size is 0, \p string may safely be \c NULL.
+ *
+ * \return the number of characters that were actually written if not truncating,
+ * or that would have been written (not including the ending \\0).
+ */
+HWLOC_DECLSPEC int hwloc_obj_snprintf(char * __hwloc_restrict string, size_t size,
+				      hwloc_topology_t topology, hwloc_obj_t obj,
+				      const char * __hwloc_restrict indexprefix, int verbose) __hwloc_attribute_deprecated;
+
+/** \brief Distribute \p n items over the topology under \p root
+ *
+ * Array \p set will be filled with \p n cpusets recursively distributed
+ * linearly over the topology under \p root, down to depth \p until (which can
+ * be INT_MAX to distribute down to the finest level).
+ *
+ * This is typically useful when an application wants to distribute \p n
+ * threads over a machine, giving each of them as much private cache as
+ * possible and keeping them locally in number order.
+ *
+ * The caller may typically want to also call hwloc_bitmap_singlify()
+ * before binding a thread so that it does not move at all.
+ *
+ * \note This function requires the \p root object to have a CPU set.
+ */
+static __hwloc_inline void /* prototype: carries the deprecated attribute */
+hwloc_distribute(hwloc_topology_t topology, hwloc_obj_t root, hwloc_cpuset_t *set, unsigned n, unsigned until) __hwloc_attribute_deprecated;
+static __hwloc_inline void
+hwloc_distribute(hwloc_topology_t topology, hwloc_obj_t root, hwloc_cpuset_t *set, unsigned n, unsigned until)
+{
+  hwloc_distrib(topology, &root, 1, set, n, until, 0); /* forwards \p root as a one-element roots array; last argument is 0 */
+}
+
+/** \brief Distribute \p n items over the topology under \p roots
+ *
+ * This is the same as hwloc_distribute, but takes an array of roots instead of
+ * just one root.
+ *
+ * \note This function requires the \p roots objects to have a CPU set.
+ */
+static __hwloc_inline void /* prototype: carries the deprecated attribute */
+hwloc_distributev(hwloc_topology_t topology, hwloc_obj_t *roots, unsigned n_roots, hwloc_cpuset_t *set, unsigned n, unsigned until) __hwloc_attribute_deprecated;
+static __hwloc_inline void
+hwloc_distributev(hwloc_topology_t topology, hwloc_obj_t *roots, unsigned n_roots, hwloc_cpuset_t *set, unsigned n, unsigned until)
+{
+  hwloc_distrib(topology, roots, n_roots, set, n, until, 0); /* plain forward to hwloc_distrib(); last argument is 0 */
+}
+
+/** \brief Insert a misc object by parent.
+ *
+ * Identical to hwloc_topology_insert_misc_object().
+ */
+static __hwloc_inline hwloc_obj_t /* prototype: carries the deprecated attribute */
+hwloc_topology_insert_misc_object_by_parent(hwloc_topology_t topology, hwloc_obj_t parent, const char *name) __hwloc_attribute_deprecated;
+static __hwloc_inline hwloc_obj_t
+hwloc_topology_insert_misc_object_by_parent(hwloc_topology_t topology, hwloc_obj_t parent, const char *name)
+{
+  return hwloc_topology_insert_misc_object(topology, parent, name); /* same arguments, same result: pure alias */
+}
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_DEPRECATED_H */
diff --git a/ext/hwloc/include/hwloc/diff.h b/ext/hwloc/include/hwloc/diff.h
new file mode 100644
index 0000000..3f1beb1
--- /dev/null
+++ b/ext/hwloc/include/hwloc/diff.h
@@ -0,0 +1,299 @@
+/*
+ * Copyright © 2013-2014 Inria.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief Topology differences.
+ */
+
+#ifndef HWLOC_DIFF_H
+#define HWLOC_DIFF_H
+
+#ifndef HWLOC_H
+#error Please include the main hwloc.h instead
+#endif
+
+
+#ifdef __cplusplus
+extern "C" {
+#elif 0
+}
+#endif
+
+
+/** \defgroup hwlocality_diff Topology differences
+ *
+ * Applications that manipulate many similar topologies, for instance
+ * one for each node of a homogeneous cluster, may want to compress
+ * topologies to reduce the memory footprint.
+ *
+ * This file offers a way to manipulate the difference between topologies
+ * and export/import it to/from XML.
+ * Compression may therefore be achieved by storing one topology
+ * entirely while the others are only described by their differences
+ * with the former.
+ * The actual topology can be reconstructed when actually needed by
+ * applying the precomputed difference to the reference topology.
+ *
+ * This interface targets very similar nodes.
+ * Only very simple differences between topologies are actually
+ * supported, for instance a change in the memory size, the name
+ * of the object, or some info attribute.
+ * More complex differences such as adding or removing objects cannot
+ * be represented in the difference structures and therefore return
+ * errors.
+ *
+ * It means that there is no need to apply the difference when
+ * looking at the tree organization (how many levels, how many
+ * objects per level, what kind of objects, CPU and node sets, etc)
+ * and when binding to objects.
+ * However the difference must be applied when looking at object
+ * attributes such as the name, the memory size or info attributes.
+ *
+ * @{
+ */
+
+
+/** \brief Type of one object attribute difference.
+ */
+typedef enum hwloc_topology_diff_obj_attr_type_e {
+  /** \brief The object local memory is modified.
+   * The union is a hwloc_topology_diff_obj_attr_uint64_s
+   * (and the index field is ignored).
+   */
+  HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_SIZE,
+
+  /** \brief The object name is modified.
+   * The union is a hwloc_topology_diff_obj_attr_string_s
+   * (and the name field is ignored).
+   */
+  HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_NAME,
+
+  /** \brief The value of an info attribute is modified.
+   * The union is a hwloc_topology_diff_obj_attr_string_s.
+   */
+  HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_INFO
+} hwloc_topology_diff_obj_attr_type_t;
+
+/** \brief One object attribute difference; the leading \c type field tells which member is valid.
+ */
+union hwloc_topology_diff_obj_attr_u {
+  struct hwloc_topology_diff_obj_attr_generic_s {
+    /* common prefix: every member of the union must start with this field */
+    hwloc_topology_diff_obj_attr_type_t type;
+  } generic;
+
+  /** \brief Integer attribute modification with an optional index. */
+  struct hwloc_topology_diff_obj_attr_uint64_s {
+    /* used for storing integer attributes */
+    hwloc_topology_diff_obj_attr_type_t type;
+    hwloc_uint64_t index; /* ignored for HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_SIZE */
+    hwloc_uint64_t oldvalue;
+    hwloc_uint64_t newvalue;
+  } uint64;
+
+  /** \brief String attribute modification with an optional name */
+  struct hwloc_topology_diff_obj_attr_string_s {
+    /* used for HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_NAME and HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_INFO */
+    hwloc_topology_diff_obj_attr_type_t type;
+    char *name; /* ignored for HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_NAME */
+    char *oldvalue;
+    char *newvalue;
+  } string;
+};
+
+
+/** \brief Type of one element of a difference list.
+ */
+typedef enum hwloc_topology_diff_type_e {
+  /** \brief An object attribute was changed.
+   * The union is a hwloc_topology_diff_obj_attr_s.
+   */
+  HWLOC_TOPOLOGY_DIFF_OBJ_ATTR,
+
+  /** \brief The difference is too complex,
+   * it cannot be represented. The difference below
+   * this object has not been checked.
+   * hwloc_topology_diff_build() will return 1.
+   *
+   * The union is a hwloc_topology_diff_too_complex_s.
+   */
+  HWLOC_TOPOLOGY_DIFF_TOO_COMPLEX
+} hwloc_topology_diff_type_t;
+
+/** \brief One element of a difference list between two topologies.
+ * Note that hwloc_topology_diff_t is a pointer to this union, not the union itself. */
+typedef union hwloc_topology_diff_u {
+  struct hwloc_topology_diff_generic_s {
+    /* common prefix: every member of the union must start with these two fields */
+    hwloc_topology_diff_type_t type;
+    union hwloc_topology_diff_u * next; /* pointer to the next element of the list, or NULL */
+  } generic;
+
+  /** \brief A difference in an object attribute. */
+  struct hwloc_topology_diff_obj_attr_s {
+    hwloc_topology_diff_type_t type; /* must be HWLOC_TOPOLOGY_DIFF_OBJ_ATTR */
+    union hwloc_topology_diff_u * next;
+    /* One attribute difference (diff) for the single object given by obj_depth/obj_index */
+    unsigned obj_depth;
+    unsigned obj_index;
+    union hwloc_topology_diff_obj_attr_u diff;
+  } obj_attr;
+
+  /** \brief A difference that is too complex. */
+  struct hwloc_topology_diff_too_complex_s {
+    hwloc_topology_diff_type_t type; /* must be HWLOC_TOPOLOGY_DIFF_TOO_COMPLEX */
+    union hwloc_topology_diff_u * next;
+    /* Where we had to stop computing the diff in the first topology */
+    unsigned obj_depth;
+    unsigned obj_index;
+  } too_complex;
+} * hwloc_topology_diff_t;
+
+
+/** \brief Compute the difference between 2 topologies.
+ *
+ * The difference is stored as a list of hwloc_topology_diff_t entries
+ * starting at \p diff.
+ * It is computed by doing a depth-first traversal of both topology trees
+ * simultaneously.
+ *
+ * If the difference between 2 objects is too complex to be represented
+ * (for instance if some objects have different types, or different numbers
+ * of children), a special diff entry of type HWLOC_TOPOLOGY_DIFF_TOO_COMPLEX
+ * is queued.
+ * The computation of the diff does not continue below these objects.
+ * So each such diff entry means that the difference between two subtrees
+ * could not be computed.
+ *
+ * \return 0 if the difference can be represented properly.
+ *
+ * \return 0 with \p diff pointing to NULL if there is no difference
+ * between the topologies.
+ *
+ * \return 1 if the difference is too complex (see above). Some entries in
+ * the list will be of type HWLOC_TOPOLOGY_DIFF_TOO_COMPLEX.
+ *
+ * \return -1 on any other error.
+ *
+ * \note \p flags is currently not used. It should be 0.
+ *
+ * \note The output diff has to be freed with hwloc_topology_diff_destroy().
+ *
+ * \note The output diff can only be exported to XML or passed to
+ * hwloc_topology_diff_apply() if 0 was returned, i.e. if no entry of type
+ * HWLOC_TOPOLOGY_DIFF_TOO_COMPLEX is listed.
+ *
+ * \note The output diff may be modified by removing some entries from
+ * the list. The removed entries should be freed by passing them
+ * to hwloc_topology_diff_destroy() (possibly as another list).
+*/
+HWLOC_DECLSPEC int hwloc_topology_diff_build(hwloc_topology_t topology, hwloc_topology_t newtopology, unsigned long flags, hwloc_topology_diff_t *diff);
+
+/** \brief Flags to be given as an OR'ed set to hwloc_topology_diff_apply().
+ */
+enum hwloc_topology_diff_apply_flags_e {
+  /** \brief Apply topology diff in reverse direction.
+   * \hideinitializer
+   */
+  HWLOC_TOPOLOGY_DIFF_APPLY_REVERSE = (1UL<<0)
+};
+
+/** \brief Apply a topology diff to an existing topology.
+ *
+ * \p flags is an OR'ed set of hwloc_topology_diff_apply_flags_e.
+ *
+ * The new topology is modified in place. hwloc_topology_dup()
+ * may be used to duplicate it before patching.
+ *
+ * If the difference cannot be applied entirely, all previous applied
+ * elements are unapplied before returning.
+ *
+ * \return 0 on success.
+ *
+ * \return -N if applying the difference failed while trying
+ * to apply the N-th part of the difference. For instance -1
+ * is returned if the very first difference element could not
+ * be applied.
+ */
+HWLOC_DECLSPEC int hwloc_topology_diff_apply(hwloc_topology_t topology, hwloc_topology_diff_t diff, unsigned long flags);
+
+/** \brief Destroy a list of topology differences.
+ *
+ * \note The \p topology parameter must be a valid topology
+ * but it is not required that it is related to \p diff.
+ */
+HWLOC_DECLSPEC int hwloc_topology_diff_destroy(hwloc_topology_t topology, hwloc_topology_diff_t diff);
+
+/** \brief Load a list of topology differences from a XML file.
+ *
+ * If not \c NULL, \p refname will be filled with the identifier
+ * string of the reference topology for the difference file,
+ * if any was specified in the XML file.
+ * This identifier is usually the name of the other XML file
+ * that contains the reference topology.
+ *
+ * \note The \p topology parameter must be a valid topology
+ * but it is not required that it is related to \p diff.
+ *
+ * \note the pointer returned in refname should later be freed
+ * by the caller.
+ */
+HWLOC_DECLSPEC int hwloc_topology_diff_load_xml(hwloc_topology_t topology, const char *xmlpath, hwloc_topology_diff_t *diff, char **refname);
+
+/** \brief Export a list of topology differences to a XML file.
+ *
+ * If not \c NULL, \p refname defines an identifier string
+ * for the reference topology which was used as a base when
+ * computing this difference.
+ * This identifier is usually the name of the other XML file
+ * that contains the reference topology.
+ * This attribute is given back when reading the diff from XML.
+ *
+ * \note The \p topology parameter must be a valid topology
+ * but it is not required that it is related to \p diff.
+ */
+HWLOC_DECLSPEC int hwloc_topology_diff_export_xml(hwloc_topology_t topology, hwloc_topology_diff_t diff, const char *refname, const char *xmlpath);
+
+/** \brief Load a list of topology differences from a XML buffer.
+ *
+ * If not \c NULL, \p refname will be filled with the identifier
+ * string of the reference topology for the difference file,
+ * if any was specified in the XML file.
+ * This identifier is usually the name of the other XML file
+ * that contains the reference topology.
+ *
+ * \note The \p topology parameter must be a valid topology
+ * but it is not required that it is related to \p diff.
+ *
+ * \note the pointer returned in refname should later be freed
+ * by the caller.
+  */
+HWLOC_DECLSPEC int hwloc_topology_diff_load_xmlbuffer(hwloc_topology_t topology, const char *xmlbuffer, int buflen, hwloc_topology_diff_t *diff, char **refname);
+
+/** \brief Export a list of topology differences to a XML buffer.
+ *
+ * If not \c NULL, \p refname defines an identifier string
+ * for the reference topology which was used as a base when
+ * computing this difference.
+ * This identifier is usually the name of the other XML file
+ * that contains the reference topology.
+ * This attribute is given back when reading the diff from XML.
+ *
+ * \note The XML buffer should later be freed with hwloc_free_xmlbuffer().
+ *
+ * \note The \p topology parameter must be a valid topology
+ * but it is not required that it is related to \p diff.
+ */
+HWLOC_DECLSPEC int hwloc_topology_diff_export_xmlbuffer(hwloc_topology_t topology, hwloc_topology_diff_t diff, const char *refname, char **xmlbuffer, int *buflen);
+
+/** @} */
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_DIFF_H */
diff --git a/ext/hwloc/include/hwloc/export.h b/ext/hwloc/include/hwloc/export.h
new file mode 100644
index 0000000..194ee6c
--- /dev/null
+++ b/ext/hwloc/include/hwloc/export.h
@@ -0,0 +1,221 @@
+/*
+ * Copyright © 2009-2015 Inria.  All rights reserved.
+ * Copyright © 2009-2012 Université Bordeaux
+ * Copyright © 2009-2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief Exporting Topologies to XML or to Synthetic strings.
+ */
+
+#ifndef HWLOC_EXPORT_H
+#define HWLOC_EXPORT_H
+
+#ifndef HWLOC_H
+#error Please include the main hwloc.h instead
+#endif
+
+
+#ifdef __cplusplus
+extern "C" {
+#elif 0
+}
+#endif
+
+
+/** \defgroup hwlocality_xmlexport Exporting Topologies to XML
+ * @{
+ */
+
+/** \brief Export the topology into an XML file.
+ *
+ * This file may be loaded later through hwloc_topology_set_xml().
+ *
+ * \return -1 if a failure occurred.
+ *
+ * \note See also hwloc_topology_set_userdata_export_callback()
+ * for exporting application-specific object userdata.
+ *
+ * \note The topology-specific userdata pointer is ignored when exporting to XML.
+ *
+ * \note Only printable characters may be exported to XML string attributes.
+ * Any other character, especially any non-ASCII character, will be silently
+ * dropped.
+ *
+ * \note If \p xmlpath is "-", the XML output is sent to the standard output.
+ */
+HWLOC_DECLSPEC int hwloc_topology_export_xml(hwloc_topology_t topology, const char *xmlpath);
+
+/** \brief Export the topology into a newly-allocated XML memory buffer.
+ *
+ * \p xmlbuffer is allocated by the callee and should be freed with
+ * hwloc_free_xmlbuffer() later in the caller.
+ *
+ * This memory buffer may be loaded later through hwloc_topology_set_xmlbuffer().
+ *
+ * \return -1 if a failure occurred.
+ *
+ * \note See also hwloc_topology_set_userdata_export_callback()
+ * for exporting application-specific object userdata.
+ *
+ * \note The topology-specific userdata pointer is ignored when exporting to XML.
+ *
+ * \note Only printable characters may be exported to XML string attributes.
+ * Any other character, especially any non-ASCII character, will be silently
+ * dropped.
+ */
+HWLOC_DECLSPEC int hwloc_topology_export_xmlbuffer(hwloc_topology_t topology, char **xmlbuffer, int *buflen);
+
+/** \brief Free a buffer allocated by hwloc_topology_export_xmlbuffer() */
+HWLOC_DECLSPEC void hwloc_free_xmlbuffer(hwloc_topology_t topology, char *xmlbuffer);
+
+/** \brief Set the application-specific callback for exporting object userdata
+ *
+ * The object userdata pointer is not exported to XML by default because hwloc
+ * does not know what it contains.
+ *
+ * This function lets applications set \p export_cb to a callback function
+ * that converts this opaque userdata into an exportable string.
+ *
+ * \p export_cb is invoked during XML export for each object whose
+ * \p userdata pointer is not \c NULL.
+ * The callback should use hwloc_export_obj_userdata() or
+ * hwloc_export_obj_userdata_base64() to actually export
+ * something to XML (possibly multiple times per object).
+ *
+ * \p export_cb may be set to \c NULL if userdata should not be exported to XML.
+ *
+ * \note The topology-specific userdata pointer is ignored when exporting to XML.
+ */
+HWLOC_DECLSPEC void hwloc_topology_set_userdata_export_callback(hwloc_topology_t topology,
+								void (*export_cb)(void *reserved, hwloc_topology_t topology, hwloc_obj_t obj));
+
+/** \brief Export some object userdata to XML
+ *
+ * This function may only be called from within the export() callback passed
+ * to hwloc_topology_set_userdata_export_callback().
+ * It may be invoked one or multiple times to export some userdata to XML.
+ * The \p buffer content of length \p length is stored with optional name
+ * \p name.
+ *
+ * When importing this XML file, the import() callback (if set) will be
+ * called exactly as many times as hwloc_export_obj_userdata() was called
+ * during export(). It will receive the corresponding \p name, \p buffer
+ * and \p length arguments.
+ *
+ * \p reserved, \p topology and \p obj must be the first three parameters
+ * that were given to the export callback.
+ *
+ * Only printable characters may be exported to XML string attributes.
+ * If a non-printable character is passed in \p name or \p buffer,
+ * the function returns -1 with errno set to EINVAL.
+ *
+ * If exporting binary data, the application should first encode into
+ * printable characters only (or use hwloc_export_obj_userdata_base64()).
+ * It should also take care of portability issues if the export may
+ * be reimported on a different architecture.
+ */
+HWLOC_DECLSPEC int hwloc_export_obj_userdata(void *reserved, hwloc_topology_t topology, hwloc_obj_t obj, const char *name, const void *buffer, size_t length);
+
+/** \brief Encode and export some object userdata to XML
+ *
+ * This function is similar to hwloc_export_obj_userdata() but it encodes
+ * the input buffer into printable characters before exporting.
+ * On import, decoding is automatically performed before the data is given
+ * to the import() callback if any.
+ *
+ * This function may only be called from within the export() callback passed
+ * to hwloc_topology_set_userdata_export_callback().
+ *
+ * The function does not take care of portability issues if the export
+ * may be reimported on a different architecture.
+ */
+HWLOC_DECLSPEC int hwloc_export_obj_userdata_base64(void *reserved, hwloc_topology_t topology, hwloc_obj_t obj, const char *name, const void *buffer, size_t length);
+
+/** \brief Set the application-specific callback for importing userdata
+ *
+ * On XML import, userdata is ignored by default because hwloc does not know
+ * how to store it in memory.
+ *
+ * This function lets applications set \p import_cb to a callback function
+ * that will get the XML-stored userdata and store it in the object as expected
+ * by the application.
+ *
+ * \p import_cb is called during hwloc_topology_load() as many times as
+ * hwloc_export_obj_userdata() was called during export. The topology
+ * is not entirely setup yet. Object attributes are ready to consult,
+ * but links between objects are not.
+ *
+ * \p import_cb may be \c NULL if userdata should be ignored during import.
+ *
+ * \note \p buffer contains \p length characters followed by a null byte ('\0').
+ *
+ * \note This function should be called before hwloc_topology_load().
+ *
+ * \note The topology-specific userdata pointer is ignored when importing from XML.
+ */
+HWLOC_DECLSPEC void hwloc_topology_set_userdata_import_callback(hwloc_topology_t topology,
+								void (*import_cb)(hwloc_topology_t topology, hwloc_obj_t obj, const char *name, const void *buffer, size_t length));
+
+/** @} */
+
+
+/** \defgroup hwlocality_syntheticexport Exporting Topologies to Synthetic
+ * @{
+ */
+
+/** \brief Flags for exporting synthetic topologies.
+ *
+ * Flags to be given as an OR'ed set to hwloc_topology_export_synthetic().
+ */
+enum hwloc_topology_export_synthetic_flags_e {
+ /** \brief Export extended types such as L2dcache as basic types such as Cache.
+  *
+  * This is required if loading the synthetic description with hwloc < 1.9.
+  * \hideinitializer
+  */
+ HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_EXTENDED_TYPES = (1UL<<0),
+
+ /** \brief Do not export level attributes.
+  *
+  * Ignore level attributes such as memory/cache sizes or PU indexes.
+  * This is required if loading the synthetic description with hwloc < 1.10.
+  * \hideinitializer
+  */
+ HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_ATTRS = (1UL<<1)
+};
+
+/** \brief Export the topology as a synthetic string.
+ *
+ * At most \p buflen characters will be written in \p buffer,
+ * including the terminating \0.
+ *
+ * This exported string may be given back to hwloc_topology_set_synthetic().
+ *
+ * \p flags is an OR'ed set of hwloc_topology_export_synthetic_flags_e.
+ *
+ * \return The number of characters that were written,
+ * not including the terminating \0.
+ *
+ * \return -1 if the topology could not be exported,
+ * for instance if it is not symmetric.
+ *
+ * \note I/O and Misc children are ignored, the synthetic string only
+ * describes normal children.
+ *
+ * \note A 1024-byte buffer should be large enough for exporting
+ * topologies in the vast majority of cases.
+ */
+  HWLOC_DECLSPEC int hwloc_topology_export_synthetic(hwloc_topology_t topology, char *buffer, size_t buflen, unsigned long flags);
+
+/** @} */
+
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_EXPORT_H */
diff --git a/ext/hwloc/include/hwloc/gl.h b/ext/hwloc/include/hwloc/gl.h
new file mode 100644
index 0000000..4b8b3f2
--- /dev/null
+++ b/ext/hwloc/include/hwloc/gl.h
@@ -0,0 +1,135 @@
+/*
+ * Copyright © 2012 Blue Brain Project, EPFL. All rights reserved.
+ * Copyright © 2012-2013 Inria.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief Macros to help interaction between hwloc and OpenGL displays.
+ *
+ * Applications that use both hwloc and OpenGL may want to include
+ * this file so as to get topology information for OpenGL displays.
+ */
+
+#ifndef HWLOC_GL_H
+#define HWLOC_GL_H
+
+#include <hwloc.h>
+
+#include <stdio.h>
+#include <string.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/** \defgroup hwlocality_gl Interoperability with OpenGL displays
+ *
+ * This interface offers ways to retrieve topology information about
+ * OpenGL displays.
+ *
+ * Only the NVIDIA display locality information is currently available,
+ * using the NV-CONTROL X11 extension and the NVCtrl library.
+ *
+ * @{
+ */
+
+/** \brief Get the hwloc OS device object corresponding to the
+ * OpenGL display given by port and device index.
+ *
+ * Return the OS device object describing the OpenGL display
+ * whose port (server) is \p port and device (screen) is \p device.
+ * Return NULL if there is none.
+ *
+ * The topology \p topology does not necessarily have to match the current
+ * machine. For instance the topology may be an XML import of a remote host.
+ * I/O devices detection and the GL component must be enabled in the topology.
+ *
+ * \note The corresponding PCI device object can be obtained by looking
+ * at the OS device parent object.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_gl_get_display_osdev_by_port_device(hwloc_topology_t topology,
+					  unsigned port, unsigned device)
+{
+        hwloc_obj_t dev;
+        /* walk every OS device; only named GPU devices can describe a display */
+        for (dev = hwloc_get_next_osdev(topology, NULL); dev; dev = hwloc_get_next_osdev(topology, dev)) {
+                unsigned p = (unsigned) -1, d = (unsigned) -1;
+                if (HWLOC_OBJ_OSDEV_GPU != dev->attr->osdev.type || !dev->name)
+                        continue;
+                if (sscanf(dev->name, ":%u.%u", &p, &d) == 2 && port == p && device == d)
+                        return dev;
+        }
+        errno = EINVAL; /* no matching display */
+        return NULL;
+}
+
+/** \brief Get the hwloc OS device object corresponding to the
+ * OpenGL display given by name.
+ *
+ * Return the OS device object describing the OpenGL display
+ * whose name is \p name, built as ":port.device" such as ":0.0" .
+ * Return NULL if there is none.
+ *
+ * The topology \p topology does not necessarily have to match the current
+ * machine. For instance the topology may be an XML import of a remote host.
+ * I/O devices detection and the GL component must be enabled in the topology.
+ *
+ * \note The corresponding PCI device object can be obtained by looking
+ * at the OS device parent object.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_gl_get_display_osdev_by_name(hwloc_topology_t topology,
+				   const char *name)
+{
+        hwloc_obj_t dev;
+        for (dev = hwloc_get_next_osdev(topology, NULL); dev; dev = hwloc_get_next_osdev(topology, dev)) {
+                if (HWLOC_OBJ_OSDEV_GPU != dev->attr->osdev.type || !dev->name)
+                        continue; /* only named GPU OS devices are candidates */
+                if (strcmp(name, dev->name) == 0)
+                        return dev;
+        }
+        errno = EINVAL; /* nothing matched */
+        return NULL;
+}
+
+/** \brief Get the OpenGL display port and device corresponding
+ * to the given hwloc OS object.
+ *
+ * Return the OpenGL display port (server) in \p port and device (screen)
+ * in \p device that correspond to the given hwloc OS device object.
+ * Return \c -1 with errno set to EINVAL if there is none or if \p osdev has no name.
+ *
+ * The topology \p topology does not necessarily have to match the current
+ * machine. For instance the topology may be an XML import of a remote host.
+ * I/O devices detection and the GL component must be enabled in the topology.
+ */
+static __hwloc_inline int
+hwloc_gl_get_display_by_osdev(hwloc_topology_t topology __hwloc_attribute_unused,
+			      hwloc_obj_t osdev,
+			      unsigned *port, unsigned *device)
+{
+	unsigned x = (unsigned) -1, y = (unsigned) -1;
+	if (HWLOC_OBJ_OSDEV_GPU == osdev->attr->osdev.type && osdev->name
+	    && sscanf(osdev->name, ":%u.%u", &x, &y) == 2) {
+		*port = x;
+		*device = y;
+		return 0;
+	}
+	errno = EINVAL;
+	return -1;
+}
+
+/** @} */
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_GL_H */
+
diff --git a/ext/hwloc/include/hwloc/glibc-sched.h b/ext/hwloc/include/hwloc/glibc-sched.h
new file mode 100644
index 0000000..1f9ba7c
--- /dev/null
+++ b/ext/hwloc/include/hwloc/glibc-sched.h
@@ -0,0 +1,125 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2013 inria.  All rights reserved.
+ * Copyright © 2009-2011 Université Bordeaux
+ * Copyright © 2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief Macros to help interaction between hwloc and glibc scheduling routines.
+ *
+ * Applications that use both hwloc and glibc scheduling routines such as
+ * sched_getaffinity() or pthread_attr_setaffinity_np() may want to include
+ * this file so as to ease conversion between their respective types.
+ */
+
+#ifndef HWLOC_GLIBC_SCHED_H
+#define HWLOC_GLIBC_SCHED_H
+
+#include <hwloc.h>
+#include <hwloc/helper.h>
+#include <assert.h>
+
+#if !defined _GNU_SOURCE || !defined _SCHED_H || (!defined CPU_SETSIZE && !defined sched_priority)
+#error Please make sure to include sched.h before including glibc-sched.h, and define _GNU_SOURCE before any inclusion of sched.h
+#endif
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#ifdef HWLOC_HAVE_CPU_SET
+
+
+/** \defgroup hwlocality_glibc_sched Interoperability with glibc sched affinity
+ *
+ * This interface offers ways to convert between hwloc cpusets and glibc cpusets
+ * such as those manipulated by sched_getaffinity() or pthread_attr_setaffinity_np().
+ *
+ * \note Topology \p topology must match the current machine.
+ *
+ * @{
+ */
+
+
+/** \brief Convert hwloc CPU set \p hwlocset into glibc sched affinity CPU set \p schedset
+ *
+ * This function may be used before calling sched_setaffinity or any other function
+ * that takes a cpu_set_t as input parameter.
+ *
+ * \p schedsetsize should be sizeof(cpu_set_t) unless \p schedset was dynamically allocated with CPU_ALLOC
+ */
+static __hwloc_inline int
+hwloc_cpuset_to_glibc_sched_affinity(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_const_cpuset_t hwlocset,
+				    cpu_set_t *schedset, size_t schedsetsize)
+{
+#ifdef CPU_ZERO_S
+  unsigned cpu;
+  CPU_ZERO_S(schedsetsize, schedset); /* start from an empty set */
+  hwloc_bitmap_foreach_begin(cpu, hwlocset)
+    CPU_SET_S(cpu, schedsetsize, schedset);
+  hwloc_bitmap_foreach_end();
+#else /* !CPU_ZERO_S */
+  unsigned cpu;
+  CPU_ZERO(schedset); /* start from an empty set */
+  assert(schedsetsize == sizeof(cpu_set_t)); /* without the _S macros only a plain cpu_set_t is accepted */
+  hwloc_bitmap_foreach_begin(cpu, hwlocset)
+    CPU_SET(cpu, schedset);
+  hwloc_bitmap_foreach_end();
+#endif /* !CPU_ZERO_S */
+  return 0; /* no failure path */
+}
+
+/** \brief Convert glibc sched affinity CPU set \p schedset into hwloc CPU set
+ *
+ * This function may be used after calling sched_getaffinity or any other function
+ * that fills a cpu_set_t as output parameter.
+ *
+ * \p schedsetsize should be sizeof(cpu_set_t) unless \p schedset was dynamically allocated with CPU_ALLOC
+ */
+static __hwloc_inline int
+hwloc_cpuset_from_glibc_sched_affinity(hwloc_topology_t topology __hwloc_attribute_unused, hwloc_cpuset_t hwlocset,
+                                       const cpu_set_t *schedset, size_t schedsetsize)
+{
+  int cpu;
+#ifdef CPU_ZERO_S
+  int count;
+#endif
+  hwloc_bitmap_zero(hwlocset); /* any previous content of hwlocset is discarded */
+#ifdef CPU_ZERO_S
+  count = CPU_COUNT_S(schedsetsize, schedset); /* number of set CPUs still to be copied */
+  cpu = 0;
+  while (count) { /* stop as soon as every set CPU has been copied */
+    if (CPU_ISSET_S(cpu, schedsetsize, schedset)) {
+      hwloc_bitmap_set(hwlocset, cpu);
+      count--;
+    }
+    cpu++;
+  }
+#else /* !CPU_ZERO_S */
+  /* sched.h does not support dynamic cpu_set_t (introduced in glibc 2.7),
+   * assume we have a very old interface without CPU_COUNT (added in 2.6)
+   */
+  assert(schedsetsize == sizeof(cpu_set_t));
+  for(cpu=0; cpu<CPU_SETSIZE; cpu++)
+    if (CPU_ISSET(cpu, schedset))
+      hwloc_bitmap_set(hwlocset, cpu);
+#endif /* !CPU_ZERO_S */
+  return 0; /* no failure path */
+}
+
+/** @} */
+
+
+#endif /* HWLOC_HAVE_CPU_SET */
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_GLIBC_SCHED_H */
diff --git a/ext/hwloc/include/hwloc/helper.h b/ext/hwloc/include/hwloc/helper.h
new file mode 100644
index 0000000..883b87d
--- /dev/null
+++ b/ext/hwloc/include/hwloc/helper.h
@@ -0,0 +1,1249 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2015 Inria.  All rights reserved.
+ * Copyright © 2009-2012 Université Bordeaux
+ * Copyright © 2009-2010 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief High-level hwloc traversal helpers.
+ */
+
+#ifndef HWLOC_HELPER_H
+#define HWLOC_HELPER_H
+
+#ifndef HWLOC_H
+#error Please include the main hwloc.h instead
+#endif
+
+#include <stdlib.h>
+#include <errno.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/** \defgroup hwlocality_helper_find_inside Finding Objects inside a CPU set
+ * @{
+ */
+
+/** \brief Get the first largest object included in the given cpuset \p set.
+ *
+ * \return the first object that is included in \p set and whose parent is not, or \c NULL if \p set does not intersect the root object's cpuset.
+ *
+ * This is convenient for iterating over all largest objects within a CPU set
+ * by doing a loop getting the first largest object and clearing its CPU set
+ * from the remaining CPU set.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_first_largest_obj_inside_cpuset(hwloc_topology_t topology, hwloc_const_cpuset_t set)
+{
+  hwloc_obj_t obj = hwloc_get_root_obj(topology); /* the descent starts at the root */
+  if (!hwloc_bitmap_intersects(obj->cpuset, set))
+    return NULL; /* set shares nothing with the root object's cpuset */
+  while (!hwloc_bitmap_isincluded(obj->cpuset, set)) {
+    /* while the object intersects without being included, look at its children */
+    hwloc_obj_t child = obj->first_child;
+    while (child) { /* take the first child that shares something with set */
+      if (hwloc_bitmap_intersects(child->cpuset, set))
+	break;
+      child = child->next_sibling;
+    }
+    if (!child)
+      /* no child intersects, return their father */
+      return obj;
+    /* found one intersecting child, look at its children */
+    obj = child;
+  }
+  /* obj is included, return it */
+  return obj;
+}
+
+/** \brief Get the set of largest objects covering exactly a given cpuset \p set
+ *
+ * \return the number of objects returned in \p objs.
+ */
+HWLOC_DECLSPEC int hwloc_get_largest_objs_inside_cpuset (hwloc_topology_t topology, hwloc_const_cpuset_t set,
+						 hwloc_obj_t * __hwloc_restrict objs, int max);
+
+/** \brief Return the next object at depth \p depth included in CPU set \p set.
+ *
+ * If \p prev is \c NULL, return the first object at depth \p depth
+ * included in \p set.  The next invocation should pass the previous
+ * return value in \p prev so as to obtain the next object in \p set.
+ *
+ * \note This function cannot work if objects at the given depth do
+ * not have CPU sets (I/O objects).
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_next_obj_inside_cpuset_by_depth (hwloc_topology_t topology, hwloc_const_cpuset_t set,
+					   unsigned depth, hwloc_obj_t prev)
+{
+  hwloc_obj_t next = hwloc_get_next_obj_by_depth(topology, depth, prev);
+  if (!next)
+    return NULL; /* hwloc_get_next_obj_by_depth() gave no candidate */
+  while (next && !hwloc_bitmap_isincluded(next->cpuset, set))
+    next = next->next_cousin; /* skip objects that are not entirely inside set */
+  return next; /* NULL when the cousin list ran out without a match */
+}
+
+/** \brief Return the next object of type \p type included in CPU set \p set.
+ *
+ * If there are multiple or no depth for given type, return \c NULL
+ * and let the caller fall back to
+ * hwloc_get_next_obj_inside_cpuset_by_depth().
+ *
+ * \note This function cannot work if objects of the given type do
+ * not have CPU sets (I/O objects).
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_next_obj_inside_cpuset_by_type (hwloc_topology_t topology, hwloc_const_cpuset_t set,
+					  hwloc_obj_type_t type, hwloc_obj_t prev)
+{
+  int depth = hwloc_get_type_depth(topology, type);
+  if (depth == HWLOC_TYPE_DEPTH_UNKNOWN || depth == HWLOC_TYPE_DEPTH_MULTIPLE)
+    return NULL; /* the type does not map to exactly one depth */
+  return hwloc_get_next_obj_inside_cpuset_by_depth(topology, set, depth, prev);
+}
+
+/** \brief Return the (logically) \p idx -th object at depth \p depth included in CPU set \p set, or \c NULL if there are not enough such objects.
+ *
+ * \note This function cannot work if objects at the given depth do
+ * not have CPU sets (I/O objects).
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_obj_inside_cpuset_by_depth (hwloc_topology_t topology, hwloc_const_cpuset_t set,
+				      unsigned depth, unsigned idx) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_obj_inside_cpuset_by_depth (hwloc_topology_t topology, hwloc_const_cpuset_t set,
+				      unsigned depth, unsigned idx)
+{
+  hwloc_obj_t obj = hwloc_get_obj_by_depth (topology, depth, 0);
+  unsigned count = 0; /* objects inside set that were seen before obj */
+  if (!obj)
+    return NULL;
+  while (obj) {
+    if (hwloc_bitmap_isincluded(obj->cpuset, set)) {
+      if (count == idx)
+	return obj;
+      count++;
+    }
+    obj = obj->next_cousin;
+  }
+  return NULL; /* fewer than idx+1 objects of this depth are inside set */
+}
+
+/** \brief Return the \p idx -th object of type \p type included in CPU set \p set.
+ *
+ * If there are multiple or no depth for given type, return \c NULL
+ * and let the caller fall back to
+ * hwloc_get_obj_inside_cpuset_by_depth().
+ *
+ * \note This function cannot work if objects of the given type do
+ * not have CPU sets (I/O objects).
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_obj_inside_cpuset_by_type (hwloc_topology_t topology, hwloc_const_cpuset_t set,
+				     hwloc_obj_type_t type, unsigned idx) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_obj_inside_cpuset_by_type (hwloc_topology_t topology, hwloc_const_cpuset_t set,
+				     hwloc_obj_type_t type, unsigned idx)
+{
+  int depth = hwloc_get_type_depth(topology, type);
+  if (depth == HWLOC_TYPE_DEPTH_UNKNOWN || depth == HWLOC_TYPE_DEPTH_MULTIPLE)
+    return NULL; /* the type does not map to exactly one depth */
+  return hwloc_get_obj_inside_cpuset_by_depth(topology, set, depth, idx);
+}
+
+/** \brief Return the number of objects at depth \p depth included in CPU set \p set (0 if there is no object at that depth).
+ *
+ * \note This function cannot work if objects at the given depth do
+ * not have CPU sets (I/O objects).
+ */
+static __hwloc_inline unsigned
+hwloc_get_nbobjs_inside_cpuset_by_depth (hwloc_topology_t topology, hwloc_const_cpuset_t set,
+					 unsigned depth) __hwloc_attribute_pure;
+static __hwloc_inline unsigned
+hwloc_get_nbobjs_inside_cpuset_by_depth (hwloc_topology_t topology, hwloc_const_cpuset_t set,
+					 unsigned depth)
+{
+  hwloc_obj_t obj = hwloc_get_obj_by_depth (topology, depth, 0);
+  unsigned count = 0;
+  if (!obj)
+    return 0;
+  while (obj) { /* walk the whole level through the cousin links */
+    if (hwloc_bitmap_isincluded(obj->cpuset, set))
+      count++;
+    obj = obj->next_cousin;
+  }
+  return count;
+}
+
+/** \brief Return the number of objects of type \p type included in CPU set \p set.
+ *
+ * If no object for that type exists inside CPU set \p set, 0 is
+ * returned.  If there are several levels with objects of that type
+ * inside CPU set \p set, -1 is returned.
+ *
+ * \note This function cannot work if objects of the given type do
+ * not have CPU sets (I/O objects).
+ */
+static __hwloc_inline int
+hwloc_get_nbobjs_inside_cpuset_by_type (hwloc_topology_t topology, hwloc_const_cpuset_t set,
+					hwloc_obj_type_t type) __hwloc_attribute_pure;
+static __hwloc_inline int
+hwloc_get_nbobjs_inside_cpuset_by_type (hwloc_topology_t topology, hwloc_const_cpuset_t set,
+					hwloc_obj_type_t type)
+{
+  int depth = hwloc_get_type_depth(topology, type);
+  if (depth == HWLOC_TYPE_DEPTH_UNKNOWN)
+    return 0; /* no level holds objects of this type */
+  if (depth == HWLOC_TYPE_DEPTH_MULTIPLE)
+    return -1; /* FIXME: aggregate nbobjs from different levels? */
+  return hwloc_get_nbobjs_inside_cpuset_by_depth(topology, set, depth);
+}
+
+/** \brief Return the logical index among the objects included in CPU set \p set.
+ *
+ * Consult all objects in the same level as \p obj and inside CPU set \p set
+ * in the logical order, and return the index of \p obj within them.
+ * If \p set covers the entire topology, this is the logical index of \p obj.
+ * Otherwise, this is similar to a logical index within the part of the topology
+ * defined by CPU set \p set. If \p obj itself is not included in \p set, -1 is returned.
+ *
+ * \note This function cannot work if obj does not have CPU sets (I/O objects).
+ */
+static __hwloc_inline int
+hwloc_get_obj_index_inside_cpuset (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_const_cpuset_t set,
+				   hwloc_obj_t obj) __hwloc_attribute_pure;
+static __hwloc_inline int
+hwloc_get_obj_index_inside_cpuset (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_const_cpuset_t set,
+				   hwloc_obj_t obj)
+{
+  int idx = 0;
+  if (!hwloc_bitmap_isincluded(obj->cpuset, set))
+    return -1; /* obj is not inside set, so it has no index there */
+  /* count how many objects are inside the cpuset on the way from us to the beginning of the level */
+  while ((obj = obj->prev_cousin) != NULL)
+    if (hwloc_bitmap_isincluded(obj->cpuset, set))
+      idx++;
+  return idx;
+}
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_helper_find_covering Finding Objects covering at least CPU set
+ * @{
+ */
+
+/** \brief Get the first child of \p parent covering at least CPU set \p set.
+ *
+ * \return \c NULL if no child matches or if \p set is empty.
+ *
+ * \note This function cannot work if parent does not have a CPU set (I/O objects).
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_child_covering_cpuset (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_const_cpuset_t set,
+				hwloc_obj_t parent) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_child_covering_cpuset (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_const_cpuset_t set,
+				hwloc_obj_t parent)
+{
+  hwloc_obj_t child;
+  if (hwloc_bitmap_iszero(set))
+    return NULL; /* an empty set is never reported as covered */
+  child = parent->first_child;
+  while (child) {
+    if (child->cpuset && hwloc_bitmap_isincluded(set, child->cpuset)) /* children without a cpuset are skipped */
+      return child;
+    child = child->next_sibling;
+  }
+  return NULL;
+}
+
+/** \brief Get the lowest object covering at least CPU set \p set
+ *
+ * \return \c NULL if no object matches or if \p set is empty.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_obj_covering_cpuset (hwloc_topology_t topology, hwloc_const_cpuset_t set) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_obj_covering_cpuset (hwloc_topology_t topology, hwloc_const_cpuset_t set)
+{
+  struct hwloc_obj *current = hwloc_get_root_obj(topology);
+  if (hwloc_bitmap_iszero(set) || !hwloc_bitmap_isincluded(set, current->cpuset))
+    return NULL; /* empty set, or set not contained in the root object's cpuset */
+  while (1) { /* go down as long as one child still covers set */
+    hwloc_obj_t child = hwloc_get_child_covering_cpuset(topology, set, current);
+    if (!child)
+      return current; /* no child covers set, current is the lowest covering object */
+    current = child;
+  }
+}
+
+/** \brief Iterate through same-depth objects covering at least CPU set \p set
+ *
+ * If object \p prev is \c NULL, return the first object at depth \p
+ * depth covering at least part of CPU set \p set.  The next
+ * invocation should pass the previous return value in \p prev so as
+ * to obtain the next object covering at least another part of \p set.
+ *
+ * \note This function cannot work if objects at the given depth do
+ * not have CPU sets (I/O objects).
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_next_obj_covering_cpuset_by_depth(hwloc_topology_t topology, hwloc_const_cpuset_t set,
+					    unsigned depth, hwloc_obj_t prev)
+{
+  hwloc_obj_t next = hwloc_get_next_obj_by_depth(topology, depth, prev);
+  if (!next)
+    return NULL; /* hwloc_get_next_obj_by_depth() gave no candidate */
+  while (next && !hwloc_bitmap_intersects(set, next->cpuset))
+    next = next->next_cousin; /* skip objects sharing nothing with set */
+  return next; /* NULL when the cousin list ran out without a match */
+}
+
+/** \brief Iterate through same-type objects covering at least CPU set \p set
+ *
+ * If object \p prev is \c NULL, return the first object of type \p
+ * type covering at least part of CPU set \p set.  The next invocation
+ * should pass the previous return value in \p prev so as to obtain
+ * the next object of type \p type covering at least another part of
+ * \p set.
+ *
+ * If there are no or multiple depths for type \p type, \c NULL is returned.
+ * The caller may fall back to hwloc_get_next_obj_covering_cpuset_by_depth()
+ * for each depth.
+ *
+ * \note This function cannot work if objects of the given type do
+ * not have CPU sets (I/O objects).
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_next_obj_covering_cpuset_by_type(hwloc_topology_t topology, hwloc_const_cpuset_t set,
+					   hwloc_obj_type_t type, hwloc_obj_t prev)
+{
+  int depth = hwloc_get_type_depth(topology, type);
+  if (depth == HWLOC_TYPE_DEPTH_UNKNOWN || depth == HWLOC_TYPE_DEPTH_MULTIPLE)
+    return NULL; /* the type does not map to exactly one depth */
+  return hwloc_get_next_obj_covering_cpuset_by_depth(topology, set, depth, prev);
+}
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_helper_ancestors Looking at Ancestor and Child Objects
+ * @{
+ *
+ * Be sure to see the figure in \ref termsanddefs that shows a
+ * complete topology tree, including depths, child/sibling/cousin
+ * relationships, and an example of an asymmetric topology where one
+ * package has fewer caches than its peers.
+ */
+
+/** \brief Returns the ancestor object of \p obj at depth \p depth, or \c NULL if \p obj lies above that depth. */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_ancestor_obj_by_depth (hwloc_topology_t topology __hwloc_attribute_unused, unsigned depth, hwloc_obj_t obj) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_ancestor_obj_by_depth (hwloc_topology_t topology __hwloc_attribute_unused, unsigned depth, hwloc_obj_t obj)
+{
+  hwloc_obj_t ancestor = obj;
+  if (obj->depth < depth)
+    return NULL; /* obj->depth is smaller than the requested depth */
+  while (ancestor && ancestor->depth > depth)
+    ancestor = ancestor->parent; /* climb until the depth is no longer greater than requested */
+  return ancestor; /* obj itself when obj->depth already equals depth */
+}
+
+/** \brief Returns the closest ancestor object of \p obj with type \p type, or \c NULL if there is none. */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_ancestor_obj_by_type (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_obj_type_t type, hwloc_obj_t obj) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_ancestor_obj_by_type (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_obj_type_t type, hwloc_obj_t obj)
+{
+  hwloc_obj_t ancestor = obj->parent; /* the search starts at the parent, obj itself is never returned */
+  while (ancestor && ancestor->type != type)
+    ancestor = ancestor->parent;
+  return ancestor; /* NULL when the parent chain ended without a match */
+}
+
+/** \brief Returns the common ancestor object of objects \p obj1 and \p obj2 */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_common_ancestor_obj (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_obj_t obj1, hwloc_obj_t obj2) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_common_ancestor_obj (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_obj_t obj1, hwloc_obj_t obj2)
+{
+  /* the loop isn't so easy since intermediate ancestors may have
+   * different depth, causing us to alternate between using obj1->parent
+   * and obj2->parent. Also, even if at some point we find ancestors
+   * of the same depth, their ancestors may have different depth again.
+   */
+  while (obj1 != obj2) {
+    while (obj1->depth > obj2->depth)
+      obj1 = obj1->parent; /* lift the deeper side first */
+    while (obj2->depth > obj1->depth)
+      obj2 = obj2->parent;
+    if (obj1 != obj2 && obj1->depth == obj2->depth) { /* same depth but still distinct: lift both */
+      obj1 = obj1->parent;
+      obj2 = obj2->parent;
+    }
+  }
+  return obj1; /* obj1 == obj2 at this point */
+}
+
+/** \brief Returns true if \p obj is inside the subtree beginning with ancestor object \p subtree_root.
+ *
+ * \note This function cannot work if \p obj and \p subtree_root objects do
+ * not have CPU sets (I/O objects); it returns false in that case.
+ */
+static __hwloc_inline int
+hwloc_obj_is_in_subtree (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_obj_t obj, hwloc_obj_t subtree_root) __hwloc_attribute_pure;
+static __hwloc_inline int
+hwloc_obj_is_in_subtree (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_obj_t obj, hwloc_obj_t subtree_root)
+{
+  return obj->cpuset && subtree_root->cpuset && hwloc_bitmap_isincluded(obj->cpuset, subtree_root->cpuset); /* decided on cpuset inclusion only, parent links are not followed */
+}
+
+/** \brief Return the next child.
+ *
+ * Return the next child among the normal children list, then among the I/O
+ * children list, then among the Misc children list.
+ *
+ * If \p prev is \c NULL, return the first child.
+ *
+ * Return \c NULL when there is no next child.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_next_child (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_obj_t parent, hwloc_obj_t prev)
+{
+  hwloc_obj_t obj;
+  int state = 0; /* list being walked: 0 = normal children, 1 = I/O children, 2 = Misc children */
+  if (prev) {
+    if (prev->type == HWLOC_OBJ_MISC)
+      state = 2;
+    else if (prev->type == HWLOC_OBJ_BRIDGE || prev->type == HWLOC_OBJ_PCI_DEVICE || prev->type == HWLOC_OBJ_OS_DEVICE)
+      state = 1;
+    obj = prev->next_sibling; /* stay in the list prev belongs to */
+  } else {
+    obj = parent->first_child;
+  }
+  if (!obj && state == 0) { /* normal children exhausted, move on to the I/O list */
+    obj = parent->io_first_child;
+    state = 1;
+  }
+  if (!obj && state == 1) { /* I/O children exhausted, move on to the Misc list */
+    obj = parent->misc_first_child;
+    state = 2;
+  }
+  return obj;
+}
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_helper_find_cache Looking at Cache Objects
+ * @{
+ */
+
+/** \brief Find the depth of cache objects matching cache level and type.
+ *
+ * Return the depth of the topology level that contains cache objects
+ * whose attributes match \p cachelevel and \p cachetype. This function
+ * intends to disambiguate the case where hwloc_get_type_depth() returns
+ * \p HWLOC_TYPE_DEPTH_MULTIPLE.
+ *
+ * If no cache level matches, \p HWLOC_TYPE_DEPTH_UNKNOWN is returned.
+ *
+ * If \p cachetype is \p HWLOC_OBJ_CACHE_UNIFIED, the depth of the
+ * unique matching unified cache level is returned.
+ *
+ * If \p cachetype is \p HWLOC_OBJ_CACHE_DATA or \p HWLOC_OBJ_CACHE_INSTRUCTION,
+ * either a matching cache, or a unified cache is returned.
+ *
+ * If \p cachetype is \c -1, it is ignored and multiple levels may
+ * match. The function returns either the depth of a uniquely matching
+ * level or \p HWLOC_TYPE_DEPTH_MULTIPLE.
+ */
+static __hwloc_inline int
+hwloc_get_cache_type_depth (hwloc_topology_t topology,
+			    unsigned cachelevel, hwloc_obj_cache_type_t cachetype)
+{
+  int depth;
+  int found = HWLOC_TYPE_DEPTH_UNKNOWN;
+  for (depth=0; ; depth++) {
+    hwloc_obj_t obj = hwloc_get_obj_by_depth(topology, depth, 0); /* only the first object of each level is inspected */
+    if (!obj)
+      break; /* no object at this depth, the scan is over */
+    if (obj->type != HWLOC_OBJ_CACHE || obj->attr->cache.depth != cachelevel)
+      /* doesn't match, try next depth */
+      continue;
+    if (cachetype == (hwloc_obj_cache_type_t) -1) {
+      if (found != HWLOC_TYPE_DEPTH_UNKNOWN) {
+	/* second match, return MULTIPLE */
+        return HWLOC_TYPE_DEPTH_MULTIPLE;
+      }
+      /* first match, mark it as found */
+      found = depth;
+      continue;
+    }
+    if (obj->attr->cache.type == cachetype || obj->attr->cache.type == HWLOC_OBJ_CACHE_UNIFIED)
+      /* exact match (either unified is alone, or we match instruction or data), return immediately */
+      return depth;
+  }
+  /* went to the bottom, return what we found */
+  return found;
+}
+
+/** \brief Get the first cache covering a cpuset \p set
+ *
+ * \return \c NULL if no cache matches.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_cache_covering_cpuset (hwloc_topology_t topology, hwloc_const_cpuset_t set) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_cache_covering_cpuset (hwloc_topology_t topology, hwloc_const_cpuset_t set)
+{
+  hwloc_obj_t current = hwloc_get_obj_covering_cpuset(topology, set); /* lowest object covering set, may be NULL */
+  while (current) { /* climb from that object until a cache is met */
+    if (current->type == HWLOC_OBJ_CACHE)
+      return current;
+    current = current->parent;
+  }
+  return NULL;
+}
+
+/** \brief Get the first cache shared between an object and somebody else.
+ *
+ * \return \c NULL if no cache matches or if an invalid object is given.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_shared_cache_covering_obj (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_obj_t obj) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_shared_cache_covering_obj (hwloc_topology_t topology __hwloc_attribute_unused, hwloc_obj_t obj)
+{
+  hwloc_obj_t current = obj->parent;
+  if (!obj->cpuset)
+    return NULL; /* an object without cpuset is the invalid case */
+  while (current) {
+    if (!hwloc_bitmap_isequal(current->cpuset, obj->cpuset) /* a different cpuset means the ancestor is not private to obj */
+        && current->type == HWLOC_OBJ_CACHE)
+      return current;
+    current = current->parent;
+  }
+  return NULL;
+}
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_helper_find_misc Finding objects, miscellaneous helpers
+ * @{
+ *
+ * Be sure to see the figure in \ref termsanddefs that shows a
+ * complete topology tree, including depths, child/sibling/cousin
+ * relationships, and an example of an asymmetric topology where one
+ * package has fewer caches than its peers.
+ */
+
+/** \brief Returns the object of type ::HWLOC_OBJ_PU with \p os_index, or \c NULL if there is none.
+ *
+ * This function is useful for converting a CPU set into the PU
+ * objects it contains.
+ * When retrieving the current binding (e.g. with hwloc_get_cpubind()),
+ * one may iterate over the bits of the resulting CPU set with
+ * hwloc_bitmap_foreach_begin(), and find the corresponding PUs
+ * with this function.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_pu_obj_by_os_index(hwloc_topology_t topology, unsigned os_index) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_pu_obj_by_os_index(hwloc_topology_t topology, unsigned os_index)
+{
+  hwloc_obj_t obj = NULL;
+  while ((obj = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_PU, obj)) != NULL) /* linear scan over the PU objects */
+    if (obj->os_index == os_index)
+      return obj;
+  return NULL;
+}
+
+/** \brief Returns the object of type ::HWLOC_OBJ_NUMANODE with \p os_index, or \c NULL if there is none.
+ *
+ * This function is useful for converting a nodeset into the NUMA node
+ * objects it contains.
+ * When retrieving the current binding (e.g. with hwloc_get_membind_nodeset()),
+ * one may iterate over the bits of the resulting nodeset with
+ * hwloc_bitmap_foreach_begin(), and find the corresponding NUMA nodes
+ * with this function.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_numanode_obj_by_os_index(hwloc_topology_t topology, unsigned os_index) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_numanode_obj_by_os_index(hwloc_topology_t topology, unsigned os_index)
+{
+  hwloc_obj_t obj = NULL;
+  while ((obj = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_NUMANODE, obj)) != NULL) /* linear scan over the NUMA node objects */
+    if (obj->os_index == os_index)
+      return obj;
+  return NULL;
+}
+
+/** \brief Do a depth-first traversal of the topology to find and sort
+ *
+ * all objects that are at the same depth as \p src.
+ * Report in \p objs up to \p max physically closest ones to \p src.
+ *
+ * \return the number of objects returned in \p objs.
+ *
+ * \return 0 if \p src is an I/O object.
+ *
+ * \note This function requires the \p src object to have a CPU set.
+ */
+/* TODO: rather provide an iterator? Provide a way to know how much should be allocated? By returning the total number of objects instead? */
+HWLOC_DECLSPEC unsigned hwloc_get_closest_objs (hwloc_topology_t topology, hwloc_obj_t src, hwloc_obj_t * __hwloc_restrict objs, unsigned max);
+
+/** \brief Find an object below another object, both specified by types and indexes.
+ *
+ * Start from the top system object and find object of type \p type1
+ * and logical index \p idx1.  Then look below this object and find another
+ * object of type \p type2 and logical index \p idx2.  Indexes are specified
+ * within the parent, not within the entire system.
+ *
+ * For instance, if type1 is PACKAGE, idx1 is 2, type2 is CORE and idx2
+ * is 3, return the fourth core object below the third package.
+ *
+ * \note This function requires these objects to have a CPU set.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_obj_below_by_type (hwloc_topology_t topology,
+			     hwloc_obj_type_t type1, unsigned idx1,
+			     hwloc_obj_type_t type2, unsigned idx2) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_obj_below_by_type (hwloc_topology_t topology,
+			     hwloc_obj_type_t type1, unsigned idx1,
+			     hwloc_obj_type_t type2, unsigned idx2)
+{
+  hwloc_obj_t obj;
+  obj = hwloc_get_obj_by_type (topology, type1, idx1);
+  if (!obj)
+    return NULL; /* the first (type1, idx1) lookup failed */
+  return hwloc_get_obj_inside_cpuset_by_type(topology, obj->cpuset, type2, idx2); /* "below" means inside obj's cpuset */
+}
+
+/** \brief Find an object below a chain of objects specified by types and indexes.
+ *
+ * This is a generalized version of hwloc_get_obj_below_by_type().
+ *
+ * Arrays \p typev and \p idxv must contain \p nr types and indexes.
+ *
+ * Start from the top system object and walk the arrays \p typev and \p idxv.
+ * For each type and logical index couple in the arrays, look under the previously found
+ * object to find the index-th object of the given type.
+ * Indexes are specified within the parent, not within the entire system.
+ *
+ * For instance, if nr is 3, typev contains NODE, PACKAGE and CORE,
+ * and idxv contains 0, 1 and 2, return the third core object below
+ * the second package below the first NUMA node.
+ *
+ * \note This function requires all these objects and the root object
+ * to have a CPU set.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_obj_below_array_by_type (hwloc_topology_t topology, int nr, hwloc_obj_type_t *typev, unsigned *idxv) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_obj_t
+hwloc_get_obj_below_array_by_type (hwloc_topology_t topology, int nr, hwloc_obj_type_t *typev, unsigned *idxv)
+{
+  hwloc_obj_t obj = hwloc_get_root_obj(topology);
+  int i;
+  for(i=0; i<nr; i++) {
+    if (!obj)
+      return NULL; /* the previous step found nothing, the chain is broken */
+    obj = hwloc_get_obj_inside_cpuset_by_type(topology, obj->cpuset, typev[i], idxv[i]);
+  }
+  return obj; /* the root object itself when nr is not positive */
+}
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_helper_distribute Distributing items over a topology
+ * @{
+ */
+
+/** \brief Flags to be given to hwloc_distrib().
+ */
+enum hwloc_distrib_flags_e {
+  /** \brief Distribute in reverse order, starting from the last objects.
+   * \hideinitializer
+   */
+  HWLOC_DISTRIB_FLAG_REVERSE = (1UL<<0)
+};
+
+/** \brief Distribute \p n items over the topology under \p roots
+ *
+ * Array \p set will be filled with \p n cpusets recursively distributed
+ * linearly over the topology under objects \p roots, down to depth \p until
+ * (which can be INT_MAX to distribute down to the finest level).
+ *
+ * \p n_roots is usually 1 and \p roots only contains the topology root object
+ * so as to distribute over the entire topology.
+ *
+ * This is typically useful when an application wants to distribute \p n
+ * threads over a machine, giving each of them as much private cache as
+ * possible and keeping them locally in number order.
+ *
+ * The caller may typically want to also call hwloc_bitmap_singlify()
+ * before binding a thread so that it does not move at all.
+ *
+ * \p flags should be 0 or an OR'ed set of ::hwloc_distrib_flags_e; any other bit makes the call return -1 with errno set to EINVAL.
+ *
+ * \note This function requires the \p roots objects to have a CPU set.
+ *
+ * \note This function replaces the now deprecated hwloc_distribute()
+ * and hwloc_distributev() functions.
+ */
+static __hwloc_inline int
+hwloc_distrib(hwloc_topology_t topology,
+	      hwloc_obj_t *roots, unsigned n_roots,
+	      hwloc_cpuset_t *set,
+	      unsigned n,
+	      unsigned until, unsigned long flags)
+{
+  unsigned i;
+  unsigned tot_weight; /* sum of the cpuset weights of all roots */
+  unsigned given, givenweight; /* items and weight handed out so far */
+  hwloc_cpuset_t *cpusetp = set; /* next free slot of the output array */
+
+  if (flags & ~HWLOC_DISTRIB_FLAG_REVERSE) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  tot_weight = 0;
+  for (i = 0; i < n_roots; i++)
+    tot_weight += hwloc_bitmap_weight(roots[i]->cpuset);
+
+  for (i = 0, given = 0, givenweight = 0; i < n_roots; i++) {
+    unsigned chunk, weight;
+    hwloc_obj_t root = roots[flags & HWLOC_DISTRIB_FLAG_REVERSE ? n_roots-1-i : i];
+    hwloc_cpuset_t cpuset = root->cpuset;
+    weight = hwloc_bitmap_weight(cpuset);
+    if (!weight)
+      continue; /* a root with an empty cpuset receives nothing */
+    /* Give to root a chunk proportional to its weight.
+     * If previous chunks got rounded-up, we may get a bit less. */
+    chunk = (( (givenweight+weight) * n  + tot_weight-1) / tot_weight)
+          - ((  givenweight         * n  + tot_weight-1) / tot_weight);
+    if (!root->arity || chunk <= 1 || root->depth >= until) {
+      /* We can't split any more, put everything there.  */
+      if (chunk) {
+	/* Fill cpusets with ours */
+	unsigned j;
+	for (j=0; j < chunk; j++)
+	  cpusetp[j] = hwloc_bitmap_dup(cpuset); /* NOTE(review): the result of hwloc_bitmap_dup() is stored unchecked */
+      } else {
+	/* We got no chunk, just merge our cpuset to a previous one
+	 * (the first chunk cannot be empty)
+	 * so that this root doesn't get ignored.
+	 */
+	assert(given);
+	hwloc_bitmap_or(cpusetp[-1], cpusetp[-1], cpuset);
+      }
+    } else {
+      /* Still more to distribute, recurse into children */
+      hwloc_distrib(topology, root->children, root->arity, cpusetp, chunk, until, flags); /* NOTE(review): return value ignored */
+    }
+    cpusetp += chunk;
+    given += chunk;
+    givenweight += weight;
+  }
+
+  return 0;
+}
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_helper_topology_sets CPU and node sets of entire topologies
+ * @{
+ */
+/** \brief Get complete CPU set
+ *
+ * \return the complete CPU set of logical processors of the system.
+ *
+ * \note The returned cpuset is not newly allocated and should thus not be
+ * changed or freed; hwloc_bitmap_dup() must be used to obtain a local copy.
+ */
+static __hwloc_inline hwloc_const_cpuset_t
+hwloc_topology_get_complete_cpuset(hwloc_topology_t topology) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_const_cpuset_t
+hwloc_topology_get_complete_cpuset(hwloc_topology_t topology)
+{
+  return hwloc_get_root_obj(topology)->complete_cpuset; /* the root object's own field, no copy is made */
+}
+
+/** \brief Get topology CPU set
+ *
+ * \return the CPU set of logical processors of the system for which hwloc
+ * provides topology information. This is equivalent to the cpuset of the
+ * system object.
+ *
+ * \note The returned cpuset is not newly allocated and should thus not be
+ * changed or freed; hwloc_bitmap_dup() must be used to obtain a local copy.
+ */
+static __hwloc_inline hwloc_const_cpuset_t
+hwloc_topology_get_topology_cpuset(hwloc_topology_t topology) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_const_cpuset_t
+hwloc_topology_get_topology_cpuset(hwloc_topology_t topology)
+{
+  return hwloc_get_root_obj(topology)->cpuset; /* the root object's own field, no copy is made */
+}
+
+/** \brief Get allowed CPU set
+ *
+ * \return the CPU set of allowed logical processors of the system.
+ *
+ * \note The returned cpuset is not newly allocated and should thus not be
+ * changed or freed; hwloc_bitmap_dup() must be used to obtain a local copy.
+ */
+static __hwloc_inline hwloc_const_cpuset_t
+hwloc_topology_get_allowed_cpuset(hwloc_topology_t topology) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_const_cpuset_t
+hwloc_topology_get_allowed_cpuset(hwloc_topology_t topology)
+{
+  return hwloc_get_root_obj(topology)->allowed_cpuset; /* the root object's own field, no copy is made */
+}
+
+/** \brief Get complete node set
+ *
+ * \return the complete node set of memory of the system.
+ *
+ * \note The returned nodeset is not newly allocated and should thus not be
+ * changed or freed; hwloc_bitmap_dup() must be used to obtain a local copy.
+ */
+static __hwloc_inline hwloc_const_nodeset_t
+hwloc_topology_get_complete_nodeset(hwloc_topology_t topology) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_const_nodeset_t
+hwloc_topology_get_complete_nodeset(hwloc_topology_t topology)
+{
+  return hwloc_get_root_obj(topology)->complete_nodeset; /* the root object's own field, no copy is made */
+}
+
+/** \brief Get topology node set
+ *
+ * \return the node set of memory of the system for which hwloc
+ * provides topology information. This is equivalent to the nodeset of the
+ * system object.
+ *
+ * \note The returned nodeset is not newly allocated and should thus not be
+ * changed or freed; hwloc_bitmap_dup() must be used to obtain a local copy.
+ */
+static __hwloc_inline hwloc_const_nodeset_t
+hwloc_topology_get_topology_nodeset(hwloc_topology_t topology) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_const_nodeset_t
+hwloc_topology_get_topology_nodeset(hwloc_topology_t topology)
+{
+  return hwloc_get_root_obj(topology)->nodeset; /* the root object's own field, no copy is made */
+}
+
+/** \brief Get allowed node set
+ *
+ * \return the node set of allowed memory of the system.
+ *
+ * \note The returned nodeset is not newly allocated and should thus not be
+ * changed or freed; hwloc_bitmap_dup() must be used to obtain a local copy.
+ */
+static __hwloc_inline hwloc_const_nodeset_t
+hwloc_topology_get_allowed_nodeset(hwloc_topology_t topology) __hwloc_attribute_pure;
+static __hwloc_inline hwloc_const_nodeset_t
+hwloc_topology_get_allowed_nodeset(hwloc_topology_t topology)
+{
+  return hwloc_get_root_obj(topology)->allowed_nodeset; /* the root object's own field, no copy is made */
+}
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_helper_nodeset_convert Converting between CPU sets and node sets
+ *
+ * There are two semantics for converting cpusets to nodesets depending on how
+ * non-NUMA machines are handled.
+ *
+ * When manipulating nodesets for memory binding, non-NUMA machines should be
+ * considered as having a single NUMA node. The standard conversion routines
+ * below should be used so that marking the first bit of the nodeset means
+ * that memory should be bound to a non-NUMA whole machine.
+ *
+ * When manipulating nodesets as an actual list of NUMA nodes without any
+ * need to handle memory binding on non-NUMA machines, the strict conversion
+ * routines may be used instead.
+ * @{
+ */
+
+/** \brief Convert a CPU set into a NUMA node set and handle non-NUMA cases
+ *
+ * \p nodeset is emptied, then the OS index of each NUMA node object reported
+ * by hwloc_get_next_obj_covering_cpuset_by_depth() for \p _cpuset is set.
+ * Per the original notes, NUMA nodes that have no CPUs at all never get
+ * their index set, even if a full CPU set is given in input.
+ *
+ * NOTE(review): the body asserts that a NUMA node level exists; it does not
+ * contain the "fill \p nodeset on non-NUMA machines" fallback -- confirm.
+ */
+static __hwloc_inline void
+hwloc_cpuset_to_nodeset(hwloc_topology_t topology, hwloc_const_cpuset_t _cpuset, hwloc_nodeset_t nodeset)
+{
+	const int numa_depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
+	hwloc_obj_t node = NULL;
+	assert(numa_depth != HWLOC_TYPE_DEPTH_UNKNOWN);
+	hwloc_bitmap_zero(nodeset);
+	while ((node = hwloc_get_next_obj_covering_cpuset_by_depth(topology, _cpuset, numa_depth, node)) != NULL)
+		hwloc_bitmap_set(nodeset, node->os_index);
+}
+
+/** \brief Convert a CPU set into a NUMA node set without handling non-NUMA cases
+ *
+ * Strict variant of ::hwloc_cpuset_to_nodeset: \p nodeset is emptied, then
+ * filled with the OS indexes of the NUMA node objects covering \p _cpuset.
+ * NOTE(review): a missing NUMA level trips the assertion below instead of
+ * yielding the empty nodeset mentioned by earlier documentation -- confirm.
+ */
+static __hwloc_inline void
+hwloc_cpuset_to_nodeset_strict(struct hwloc_topology *topology, hwloc_const_cpuset_t _cpuset, hwloc_nodeset_t nodeset)
+{
+	const int numa_depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
+	hwloc_obj_t node = NULL;
+	assert(numa_depth != HWLOC_TYPE_DEPTH_UNKNOWN);
+	hwloc_bitmap_zero(nodeset);
+	while ((node = hwloc_get_next_obj_covering_cpuset_by_depth(topology, _cpuset, numa_depth, node)) != NULL)
+		hwloc_bitmap_set(nodeset, node->os_index);
+}
+
+/** \brief Convert a NUMA node set into a CPU set and handle non-NUMA cases
+ *
+ * \p _cpuset is emptied, then receives the union of the cpusets of all NUMA
+ * node objects whose OS index is set in \p nodeset.
+ * NOTE(review): this body asserts that a NUMA node level exists; the non-NUMA
+ * fallback described by earlier documentation is not implemented here.
+ */
+static __hwloc_inline void
+hwloc_cpuset_from_nodeset(hwloc_topology_t topology, hwloc_cpuset_t _cpuset, hwloc_const_nodeset_t nodeset)
+{
+	const int numa_depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
+	hwloc_obj_t node = NULL;
+	assert(numa_depth != HWLOC_TYPE_DEPTH_UNKNOWN);
+	hwloc_bitmap_zero(_cpuset);
+	while ((node = hwloc_get_next_obj_by_depth(topology, numa_depth, node)) != NULL) {
+		if (!hwloc_bitmap_isset(nodeset, node->os_index))
+			continue;
+		/* objects in levels always have a cpuset, so no NULL check is needed */
+		hwloc_bitmap_or(_cpuset, _cpuset, node->cpuset);
+	}
+}
+
+/** \brief Convert a NUMA node set into a CPU set without handling non-NUMA cases
+ *
+ * Strict variant of ::hwloc_cpuset_from_nodeset: \p _cpuset is emptied, then
+ * receives the cpusets of the NUMA node objects selected by \p nodeset.
+ * NOTE(review): a missing NUMA level trips the assertion below instead of
+ * yielding the empty cpuset mentioned by earlier documentation -- confirm.
+ */
+static __hwloc_inline void
+hwloc_cpuset_from_nodeset_strict(struct hwloc_topology *topology, hwloc_cpuset_t _cpuset, hwloc_const_nodeset_t nodeset)
+{
+	const int numa_depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
+	hwloc_obj_t node;
+	assert(numa_depth != HWLOC_TYPE_DEPTH_UNKNOWN);
+	hwloc_bitmap_zero(_cpuset);
+	for (node = hwloc_get_next_obj_by_depth(topology, numa_depth, NULL); node != NULL;
+	     node = hwloc_get_next_obj_by_depth(topology, numa_depth, node))
+		if (hwloc_bitmap_isset(nodeset, node->os_index))
+			hwloc_bitmap_or(_cpuset, _cpuset, node->cpuset); /* level objects always have a cpuset */
+}
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_distances Manipulating Distances
+ * @{
+ */
+
+/** \brief Get the distances between all objects at the given depth.
+ *
+ * \return a distances structure containing a matrix with all distances
+ * between all objects at the given depth, taken from the root object.
+ *
+ * Slot i+nbobjs*j contains the distance from the object of logical index i
+ * to the object of logical index j. NOTE(review): hwloc_get_latency() reads
+ * slot i*nbobjs+j for the i-to-j value -- confirm which indexing is right.
+ *
+ * \note This function only returns matrices covering the whole topology,
+ * without any unknown distance value. Those matrices are available in the
+ * top-level object of the hierarchy. Matrices of lower objects are not
+ * reported here since they cover only part of the machine.
+ *
+ * The returned structure belongs to the hwloc library; do not modify or free it.
+ *
+ * \return \c NULL if no such distance matrix exists.
+ */
+
+static __hwloc_inline const struct hwloc_distances_s *
+hwloc_get_whole_distance_matrix_by_depth(hwloc_topology_t topology, unsigned depth)
+{
+  const hwloc_obj_t top = hwloc_get_root_obj(topology);
+  unsigned idx = 0;
+  /* linear scan of the root's matrices for the one attached to this depth */
+  while (idx < top->distances_count && top->distances[idx]->relative_depth != depth)
+    idx++;
+  return (idx < top->distances_count) ? top->distances[idx] : NULL;
+}
+
+/** \brief Get the distances between all objects of a given type.
+ *
+ * \return a distances structure containing a matrix with all distances
+ * between all objects of the given type.
+ *
+ * Slot i+nbobjs*j contains the distance from the object of logical index i
+ * to the object of logical index j. NOTE(review): hwloc_get_latency() reads
+ * slot i*nbobjs+j for the i-to-j value -- confirm which indexing is right.
+ *
+ * \note This function only returns matrices covering the whole topology,
+ * without any unknown distance value. Those matrices are available in the
+ * top-level object of the hierarchy. Matrices of lower objects are not
+ * reported here since they cover only part of the machine.
+ *
+ * The returned structure belongs to the hwloc library; do not modify or free it.
+ *
+ * \return \c NULL if no such distance matrix exists or the type has a negative depth.
+ */
+
+static __hwloc_inline const struct hwloc_distances_s *
+hwloc_get_whole_distance_matrix_by_type(hwloc_topology_t topology, hwloc_obj_type_t type)
+{
+  const int level = hwloc_get_type_depth(topology, type);
+  if (level >= 0)
+    return hwloc_get_whole_distance_matrix_by_depth(topology, (unsigned) level);
+  return NULL; /* negative depth: no usable level for this type */
+}
+
+/** \brief Get distances for the given depth and covering some objects
+ *
+ * Return a distance matrix that describes depth \p depth and covers at
+ * least object \p obj and all its children.
+ *
+ * When looking for the distance between some objects, a common ancestor should
+ * be passed in \p obj.
+ *
+ * \p firstp is set to the logical index of the first object described by the matrix.
+ *
+ * The returned structure belongs to the hwloc library; do not modify or free it.
+ * \return \c NULL if \p obj has no cpuset or no ancestor holds a matching matrix.
+ */
+static __hwloc_inline const struct hwloc_distances_s *
+hwloc_get_distance_matrix_covering_obj_by_depth(hwloc_topology_t topology,
+						hwloc_obj_t obj, unsigned depth,
+						unsigned *firstp)
+{
+  if (!obj->cpuset) /* objects without a cpuset get no matrix */
+    return NULL;
+  while (obj) {
+    unsigned i;
+    for(i=0; i<obj->distances_count; i++)
+      if (obj->distances[i]->relative_depth == depth - obj->depth) { /* depth is compared relative to the matrix owner */
+	if (!obj->distances[i]->nbobjs)
+	  continue; /* empty matrix: try the next one */
+	*firstp = hwloc_get_next_obj_inside_cpuset_by_depth(topology, obj->cpuset, depth, NULL)->logical_index;
+	return obj->distances[i];
+      }
+    obj = obj->parent; /* no match at this level: climb to the parent */
+  }
+  return NULL;
+}
+
+/** \brief Get the latency in both directions between two objects.
+ *
+ * Look at ancestor objects from the bottom to the top until one of them
+ * contains a distance matrix that matches the objects exactly.
+ *
+ * \p latency gets the value from \p obj1 to \p obj2, while \p reverse_latency
+ * gets the reverse-direction value, which may differ on some architectures.
+ *
+ * \return 0 on success; -1 with errno set to EINVAL if the objects are not at the
+ * same depth, or to ENOSYS if no ancestor contains a matching latency matrix.
+ */
+static __hwloc_inline int
+hwloc_get_latency(hwloc_topology_t topology,
+		   hwloc_obj_t obj1, hwloc_obj_t obj2,
+		   float *latency, float *reverse_latency)
+{
+  const struct hwloc_distances_s *matrix;
+  unsigned base; /* logical index of the first object in the matrix */
+  unsigned row, col, width;
+
+  /* latencies are only looked up between objects of one level */
+  if (obj1->depth != obj2->depth) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  matrix = hwloc_get_distance_matrix_covering_obj_by_depth(topology,
+             hwloc_get_common_ancestor_obj(topology, obj1, obj2), obj1->depth, &base);
+  if (!matrix || !matrix->latency) {
+    errno = ENOSYS;
+    return -1;
+  }
+
+  width = matrix->nbobjs;
+  row = obj1->logical_index - base;
+  col = obj2->logical_index - base;
+  *latency = matrix->latency[row*width + col];
+  *reverse_latency = matrix->latency[col*width + row];
+  return 0;
+}
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_advanced_io Finding I/O objects
+ * @{
+ */
+
+/** \brief Get the first non-I/O ancestor object.
+ *
+ * Starting at the I/O object \p ioobj, walk up the parents and return the
+ * first object that has a cpuset (\c NULL if there is none). This regular
+ * object may then be used for binding since its locality is that of \p ioobj.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_non_io_ancestor_obj(hwloc_topology_t topology __hwloc_attribute_unused,
+			      hwloc_obj_t ioobj)
+{
+  hwloc_obj_t cur;
+  /* objects without a cpuset are skipped; stop at the first one that has one */
+  for (cur = ioobj; cur != NULL && cur->cpuset == NULL; cur = cur->parent)
+    ;
+  return cur;
+}
+
+/** \brief Get the PCI device that follows \p prev in the system.
+ * \return the first PCI device if \p prev is \c NULL.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_next_pcidev(hwloc_topology_t topology, hwloc_obj_t prev)
+{
+  const hwloc_obj_type_t wanted = HWLOC_OBJ_PCI_DEVICE;
+  return hwloc_get_next_obj_by_type(topology, wanted, prev);
+}
+
+/** \brief Find the PCI device object matching a PCI bus id
+ * given as separate domain, bus, device and function numbers.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_pcidev_by_busid(hwloc_topology_t topology,
+			  unsigned domain, unsigned bus, unsigned dev, unsigned func)
+{
+  hwloc_obj_t pci;
+  for (pci = hwloc_get_next_pcidev(topology, NULL); pci != NULL; pci = hwloc_get_next_pcidev(topology, pci)) {
+    if (pci->attr->pcidev.domain != domain || pci->attr->pcidev.bus != bus)
+      continue;
+    if (pci->attr->pcidev.dev != dev || pci->attr->pcidev.func != func)
+      continue;
+    return pci;
+  }
+  return NULL;
+}
+
+/** \brief Find the PCI device object matching the PCI bus id given as a
+ * string xxxx:yy:zz.t or yy:zz.t; a malformed string yields NULL and EINVAL.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_pcidev_by_busidstring(hwloc_topology_t topology, const char *busid)
+{
+  unsigned domain = 0; /* the short form carries no domain */
+  unsigned bus, dev, func;
+  int parsed;
+  /* try yy:zz.t first, then fall back to xxxx:yy:zz.t */
+  parsed = sscanf(busid, "%x:%x.%x", &bus, &dev, &func) == 3
+      || sscanf(busid, "%x:%x:%x.%x", &domain, &bus, &dev, &func) == 4;
+  if (parsed)
+    return hwloc_get_pcidev_by_busid(topology, domain, bus, dev, func);
+  errno = EINVAL;
+  return NULL;
+}
+
+/** \brief Get the OS device that follows \p prev in the system.
+ * \return the first OS device if \p prev is \c NULL.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_next_osdev(hwloc_topology_t topology, hwloc_obj_t prev)
+{
+  const hwloc_obj_type_t wanted = HWLOC_OBJ_OS_DEVICE;
+  return hwloc_get_next_obj_by_type(topology, wanted, prev);
+}
+
+/** \brief Get the bridge that follows \p prev in the system.
+ * \return the first bridge if \p prev is \c NULL.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_next_bridge(hwloc_topology_t topology, hwloc_obj_t prev)
+{
+  const hwloc_obj_type_t wanted = HWLOC_OBJ_BRIDGE;
+  return hwloc_get_next_obj_by_type(topology, wanted, prev);
+}
+
+/** \brief Check whether a given bridge covers a given PCI bus.
+ */
+static __hwloc_inline int
+hwloc_bridge_covers_pcibus(hwloc_obj_t bridge, unsigned domain, unsigned bus)
+{
+  if (bridge->type != HWLOC_OBJ_BRIDGE || bridge->attr->bridge.downstream_type != HWLOC_OBJ_BRIDGE_PCI)
+    return 0;
+  if (bridge->attr->bridge.downstream.pci.domain != domain)
+    return 0;
+  return bridge->attr->bridge.downstream.pci.secondary_bus <= bus
+    && bus <= bridge->attr->bridge.downstream.pci.subordinate_bus;
+}
+
+/** \brief Find the hostbridge that covers the given PCI bus.
+ *
+ * Useful for finding the locality of a bus, which is the cpuset of the
+ * hostbridge's parent. Returns \c NULL when no bridge covers the bus.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_get_hostbridge_by_pcibus(hwloc_topology_t topology,
+			       unsigned domain, unsigned bus)
+{
+  hwloc_obj_t br = NULL;
+  do {
+    br = hwloc_get_next_bridge(topology, br);
+  } while (br != NULL && !hwloc_bridge_covers_pcibus(br, domain, bus));
+  if (br == NULL)
+    return NULL;
+  /* the first covering bridge is expected to be a hostbridge: check it */
+  assert(br->attr->bridge.upstream_type == HWLOC_OBJ_BRIDGE_HOST);
+  assert(br->parent->type != HWLOC_OBJ_BRIDGE);
+  assert(br->parent->cpuset);
+  return br;
+}
+
+/** @} */
+
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_HELPER_H */
diff --git a/ext/hwloc/include/hwloc/inlines.h b/ext/hwloc/include/hwloc/inlines.h
new file mode 100644
index 0000000..7281750
--- /dev/null
+++ b/ext/hwloc/include/hwloc/inlines.h
@@ -0,0 +1,154 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2013 Inria.  All rights reserved.
+ * Copyright © 2009-2012 Université Bordeaux
+ * Copyright © 2009-2010 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/**
+ * This file contains the inline code of functions declared in hwloc.h
+ */
+
+#ifndef HWLOC_INLINES_H
+#define HWLOC_INLINES_H
+
+#ifndef HWLOC_H
+#error Please include the main hwloc.h instead
+#endif
+
+#include <stdlib.h>
+#include <errno.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static __hwloc_inline int
+hwloc_get_type_or_below_depth (hwloc_topology_t topology, hwloc_obj_type_t type)
+{
+  int level = hwloc_get_type_depth(topology, type);
+  if (level == HWLOC_TYPE_DEPTH_UNKNOWN) {
+    /* Climb from the PU level while the level's type order is >= type; the
+     * level right below the first lower-ordered one is the answer. There is no
+     * explicit bound: the original note says there is always a SYSTEM level
+     * with lower order and known depth. */
+    for (level = hwloc_get_type_depth(topology, HWLOC_OBJ_PU);
+         hwloc_compare_types(hwloc_get_depth_type(topology, level), type) >= 0; level--)
+      ;
+    level++;
+  }
+  return level;
+}
+
+static __hwloc_inline int
+hwloc_get_type_or_above_depth (hwloc_topology_t topology, hwloc_obj_type_t type)
+{
+  int level = hwloc_get_type_depth(topology, type);
+  if (level == HWLOC_TYPE_DEPTH_UNKNOWN) {
+    /* Descend from depth 0 while the level's type order is <= type; the
+     * level right above the first higher-ordered one is the answer. There is
+     * no explicit bound: the original note says there is always a PU level
+     * with higher order and known depth. */
+    for (level = 0;
+         hwloc_compare_types(hwloc_get_depth_type(topology, level), type) <= 0; level++)
+      ;
+    level--;
+  }
+  return level;
+}
+
+static __hwloc_inline int
+hwloc_get_nbobjs_by_type (hwloc_topology_t topology, hwloc_obj_type_t type)
+{
+  const int level = hwloc_get_type_depth(topology, type); /* real depth or a special code */
+  if (level == HWLOC_TYPE_DEPTH_UNKNOWN)
+    return 0;
+  return (level == HWLOC_TYPE_DEPTH_MULTIPLE)
+    ? -1 /* FIXME: aggregate nbobjs from different levels? */
+    : (int) hwloc_get_nbobjs_by_depth(topology, level);
+}
+
+static __hwloc_inline hwloc_obj_t
+hwloc_get_obj_by_type (hwloc_topology_t topology, hwloc_obj_type_t type, unsigned idx)
+{
+  const int level = hwloc_get_type_depth(topology, type);
+  /* both special depths mean there is no single level to index into */
+  if (level == HWLOC_TYPE_DEPTH_UNKNOWN || level == HWLOC_TYPE_DEPTH_MULTIPLE)
+    return NULL;
+  /* idx is forwarded unchanged to the per-depth lookup */
+  return hwloc_get_obj_by_depth(topology, level, idx);
+}
+
+static __hwloc_inline hwloc_obj_t
+hwloc_get_next_obj_by_depth (hwloc_topology_t topology, unsigned depth, hwloc_obj_t prev)
+{
+  /* Iterate over one level: a NULL prev starts at its first object. */
+  if (prev == NULL)
+    return hwloc_get_obj_by_depth (topology, depth, 0);
+  /* prev must sit on the requested level; otherwise the walk ends */
+  return (prev->depth == depth) ? prev->next_cousin : NULL;
+}
+
+static __hwloc_inline hwloc_obj_t
+hwloc_get_next_obj_by_type (hwloc_topology_t topology, hwloc_obj_type_t type,
+			    hwloc_obj_t prev)
+{
+  const int level = hwloc_get_type_depth(topology, type);
+  const int usable = (level != HWLOC_TYPE_DEPTH_UNKNOWN) && (level != HWLOC_TYPE_DEPTH_MULTIPLE);
+  /* without a single level for this type there is nothing to iterate */
+  return usable ? hwloc_get_next_obj_by_depth (topology, level, prev) : NULL;
+}
+
+static __hwloc_inline hwloc_obj_t
+hwloc_get_root_obj (hwloc_topology_t topology)
+{
+  return hwloc_get_obj_by_depth (topology, 0, 0); /* the root is object 0 at depth 0 */
+}
+
+static __hwloc_inline const char *
+hwloc_obj_get_info_by_name(hwloc_obj_t obj, const char *name)
+{
+  unsigned k = 0; /* index into obj->infos */
+  while (k < obj->infos_count && strcmp(obj->infos[k].name, name) != 0)
+    k++;
+  /* the first entry with a matching name wins; NULL when none matches */
+  return (k < obj->infos_count) ? obj->infos[k].value : NULL;
+}
+
+static __hwloc_inline void *
+hwloc_alloc_membind_policy_nodeset(hwloc_topology_t topology, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags)
+{
+  void *buf = hwloc_alloc_membind_nodeset(topology, len, nodeset, policy, flags);
+  if (buf == NULL) {
+    /* fallback: set the binding through hwloc_set_membind_nodeset(), then allocate normally */
+    hwloc_set_membind_nodeset(topology, nodeset, policy, flags);
+    buf = hwloc_alloc(topology, len);
+    if (buf != NULL && policy != HWLOC_MEMBIND_FIRSTTOUCH)
+      memset(buf, 0, len); /* touch the data to enforce the binding */
+  }
+  return buf;
+}
+
+static __hwloc_inline void *
+hwloc_alloc_membind_policy(hwloc_topology_t topology, size_t len, hwloc_const_cpuset_t set, hwloc_membind_policy_t policy, int flags)
+{
+  void *buf = hwloc_alloc_membind(topology, len, set, policy, flags);
+  if (buf == NULL) {
+    /* fallback: set the binding through hwloc_set_membind(), then allocate normally */
+    hwloc_set_membind(topology, set, policy, flags);
+    buf = hwloc_alloc(topology, len);
+    if (buf != NULL && policy != HWLOC_MEMBIND_FIRSTTOUCH)
+      memset(buf, 0, len); /* touch the data to enforce the binding */
+  }
+  return buf;
+}
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_INLINES_H */
diff --git a/ext/hwloc/include/hwloc/intel-mic.h b/ext/hwloc/include/hwloc/intel-mic.h
new file mode 100644
index 0000000..d58237b
--- /dev/null
+++ b/ext/hwloc/include/hwloc/intel-mic.h
@@ -0,0 +1,143 @@
+/*
+ * Copyright © 2013 Inria.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief Macros to help interaction between hwloc and Intel Xeon Phi (MIC).
+ *
+ * Applications that use both hwloc and Intel Xeon Phi (MIC) may want to
+ * include this file so as to get topology information for MIC devices.
+ */
+
+#ifndef HWLOC_INTEL_MIC_H
+#define HWLOC_INTEL_MIC_H
+
+#include <hwloc.h>
+#include <hwloc/autogen/config.h>
+#include <hwloc/helper.h>
+#ifdef HWLOC_LINUX_SYS
+#include <hwloc/linux.h>
+#include <dirent.h>
+#include <string.h>
+#endif
+
+#include <stdio.h>
+#include <stdlib.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/** \defgroup hwlocality_intel_mic Interoperability with Intel Xeon Phi (MIC)
+ *
+ * This interface offers ways to retrieve topology information about
+ * Intel Xeon Phi (MIC) devices.
+ *
+ * @{
+ */
+
+/** \brief Get the CPU set of logical processors that are physically
+ * close to the MIC device whose index is \p idx.
+ *
+ * Store in \p set the CPU set describing the locality of the MIC device whose index is \p idx.
+ *
+ * Topology \p topology and device index \p idx must match the local machine.
+ * I/O devices detection is not needed in the topology.
+ *
+ * The function only returns the locality of the device. If more information
+ * is needed, use OS objects, see hwloc_intel_mic_get_device_osdev_by_index().
+ *
+ * This function is currently only implemented in a meaningful way for
+ * Linux; other systems will simply get a full cpuset.
+ * \return 0 on success, -1 if the topology is not this system or sysfs cannot be opened.
+ */
+static __hwloc_inline int
+hwloc_intel_mic_get_device_cpuset(hwloc_topology_t topology __hwloc_attribute_unused,
+				  int idx __hwloc_attribute_unused,
+				  hwloc_cpuset_t set)
+{
+#ifdef HWLOC_LINUX_SYS
+	/* If we're on Linux, use the sysfs mechanism to get the local cpus */
+#define HWLOC_INTEL_MIC_DEVICE_SYSFS_PATH_MAX 128
+	char path[HWLOC_INTEL_MIC_DEVICE_SYSFS_PATH_MAX];
+	DIR *sysdir = NULL;
+	FILE *sysfile = NULL;
+	struct dirent *dirent;
+	unsigned pcibus, pcidev, pcifunc;
+
+	if (!hwloc_topology_is_thissystem(topology)) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	sprintf(path, "/sys/class/mic/mic%d", idx);
+	sysdir = opendir(path);
+	if (!sysdir)
+		return -1;
+
+	while ((dirent = readdir(sysdir)) != NULL) {
+		if (sscanf(dirent->d_name, "pci_%02x:%02x.%02x", &pcibus, &pcidev, &pcifunc) == 3) {
+			sprintf(path, "/sys/class/mic/mic%d/pci_%02x:%02x.%02x/local_cpus", idx, pcibus, pcidev, pcifunc);
+			sysfile = fopen(path, "r");
+			if (!sysfile) {
+				closedir(sysdir); /* release the directory before bailing out */
+				return -1;
+			}
+
+			hwloc_linux_parse_cpumap_file(sysfile, set);
+			if (hwloc_bitmap_iszero(set)) /* an empty map falls back to the complete cpuset */
+				hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
+
+			fclose(sysfile);
+			break; /* only the first pci_* entry is used */
+		}
+	}
+
+	closedir(sysdir); /* NOTE(review): if no pci_* entry matched, set is untouched and 0 is still returned */
+#else
+	/* Non-Linux systems simply get a full cpuset */
+	hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
+#endif
+	return 0;
+}
+
+/** \brief Get the hwloc OS device object corresponding to the
+ * MIC device for the given index.
+ *
+ * \return the co-processor OS device whose name is "mic" followed by \p idx,
+ * or \c NULL if there is none.
+ *
+ * The topology \p topology does not necessarily have to match the current
+ * machine; it may for instance be an XML import of a remote host.
+ * I/O devices detection must be enabled in the topology.
+ *
+ * \note The corresponding PCI device object can be obtained by looking
+ * at the OS device parent object.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_intel_mic_get_device_osdev_by_index(hwloc_topology_t topology,
+					  unsigned idx)
+{
+	hwloc_obj_t dev = NULL;
+	/* scan every OS device for a named co-processor matching "mic<idx>" */
+	while ((dev = hwloc_get_next_osdev(topology, dev)) != NULL) {
+		if (dev->attr->osdev.type != HWLOC_OBJ_OSDEV_COPROC || !dev->name)
+			continue;
+		if (strncmp("mic", dev->name, 3) == 0 && atoi(dev->name + 3) == (int) idx)
+			return dev;
+	}
+	return NULL;
+}
+
+/** @} */
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_INTEL_MIC_H */
diff --git a/ext/hwloc/include/hwloc/linux-libnuma.h b/ext/hwloc/include/hwloc/linux-libnuma.h
new file mode 100644
index 0000000..0ce2591
--- /dev/null
+++ b/ext/hwloc/include/hwloc/linux-libnuma.h
@@ -0,0 +1,273 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2014 Inria.  All rights reserved.
+ * Copyright © 2009-2010, 2012 Université Bordeaux
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief Macros to help interaction between hwloc and Linux libnuma.
+ *
+ * Applications that use both Linux libnuma and hwloc may want to
+ * include this file so as to ease conversion between their respective types.
+*/
+
+#ifndef HWLOC_LINUX_LIBNUMA_H
+#define HWLOC_LINUX_LIBNUMA_H
+
+#include <hwloc.h>
+#include <numa.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/** \defgroup hwlocality_linux_libnuma_ulongs Interoperability with Linux libnuma unsigned long masks
+ *
+ * This interface helps converting between Linux libnuma unsigned long masks
+ * and hwloc cpusets and nodesets.
+ *
+ * \note Topology \p topology must match the current machine.
+ *
+ * \note The behavior of libnuma is undefined if the kernel is not NUMA-aware
+ * (i.e. when CONFIG_NUMA is not set in the kernel configuration).
+ * This helper and libnuma may thus not be strictly compatible in this case,
+ * which may be detected by checking whether numa_available() returns -1.
+ *
+ * @{
+ */
+
+
+/** \brief Convert hwloc CPU set \p cpuset into the array of unsigned long \p mask
+ *
+ * \p mask is the array of unsigned long to fill. On input, \p maxnode is the
+ * maximal node number that may be stored in \p mask; on output, it is set to
+ * the maximal node number that was found, plus one. Always returns 0.
+ *
+ * Typically used before set_mempolicy, mbind, migrate_pages or any other
+ * function taking an array of unsigned long and a maximal node number.
+ */
+static __hwloc_inline int
+hwloc_cpuset_to_linux_libnuma_ulongs(hwloc_topology_t topology, hwloc_const_cpuset_t cpuset,
+				    unsigned long *mask, unsigned long *maxnode)
+{
+  const size_t ulong_bits = 8*sizeof(*mask);
+  const int numa_depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
+  unsigned long highest = (unsigned long) -1; /* "no node stored yet" marker */
+  hwloc_obj_t node = NULL;
+
+  /* round the capacity up to whole ulongs and clear them all */
+  *maxnode = (*maxnode + ulong_bits - 1) & ~(ulong_bits - 1);
+  memset(mask, 0, *maxnode/8);
+
+  while ((node = hwloc_get_next_obj_covering_cpuset_by_depth(topology, cpuset, numa_depth, node)) != NULL) {
+    const unsigned long os = node->os_index;
+    if (os >= *maxnode)
+      continue; /* does not fit in the caller's mask */
+    mask[os / ulong_bits] |= 1UL << (os % ulong_bits);
+    if (highest == (unsigned long) -1 || highest < os)
+      highest = os;
+  }
+  *maxnode = highest+1; /* wraps to 0 when nothing was stored */
+  return 0;
+}
+
+/** \brief Convert hwloc NUMA node set \p nodeset into the array of unsigned long \p mask
+ *
+ * \p mask is the array of unsigned long to fill. On input, \p maxnode is the
+ * maximal node number that may be stored in \p mask; on output, it is set to
+ * the maximal node number that was found, plus one. Always returns 0.
+ *
+ * Typically used before set_mempolicy, mbind, migrate_pages or any other
+ * function taking an array of unsigned long and a maximal node number.
+ */
+static __hwloc_inline int
+hwloc_nodeset_to_linux_libnuma_ulongs(hwloc_topology_t topology, hwloc_const_nodeset_t nodeset,
+				      unsigned long *mask, unsigned long *maxnode)
+{
+  const size_t ulong_bits = 8*sizeof(*mask);
+  const int numa_depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
+  unsigned long highest = (unsigned long) -1; /* "no node stored yet" marker */
+  hwloc_obj_t node = NULL;
+
+  /* round the capacity up to whole ulongs and clear them all */
+  *maxnode = (*maxnode + ulong_bits - 1) & ~(ulong_bits - 1);
+  memset(mask, 0, *maxnode/8);
+
+  while ((node = hwloc_get_next_obj_by_depth(topology, numa_depth, node)) != NULL) {
+    const unsigned long os = node->os_index;
+    if (os >= *maxnode)
+      continue; /* does not fit in the caller's mask */
+    if (!hwloc_bitmap_isset(nodeset, node->os_index))
+      continue; /* node not selected by the caller */
+    mask[os / ulong_bits] |= 1UL << (os % ulong_bits);
+    if (highest == (unsigned long) -1 || highest < os)
+      highest = os;
+  }
+  *maxnode = highest+1; /* wraps to 0 when nothing was stored */
+  return 0;
+}
+
+/** \brief Convert the array of unsigned long \p mask into hwloc CPU set
+ *
+ * \p mask is an array of unsigned long that will be read; \p maxnode is the
+ * maximal node number that may be read in it. Always returns 0.
+ *
+ * May be used after get_mempolicy or any other function that outputs an array
+ * of unsigned long (and possibly takes a maximal node number as input).
+ */
+static __hwloc_inline int
+hwloc_cpuset_from_linux_libnuma_ulongs(hwloc_topology_t topology, hwloc_cpuset_t cpuset,
+				      const unsigned long *mask, unsigned long maxnode)
+{
+  const size_t ulong_bits = 8*sizeof(*mask);
+  const int numa_depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
+  hwloc_obj_t node = NULL;
+  hwloc_bitmap_zero(cpuset);
+  while ((node = hwloc_get_next_obj_by_depth(topology, numa_depth, node)) != NULL)
+    if (node->os_index < maxnode
+	&& ((mask[node->os_index / ulong_bits] >> (node->os_index % ulong_bits)) & 1UL))
+      hwloc_bitmap_or(cpuset, cpuset, node->cpuset);
+  return 0;
+}
+
+/** \brief Convert the array of unsigned long \p mask into hwloc NUMA node set
+ *
+ * \p mask is an array of unsigned long that will be read; \p maxnode is the
+ * maximal node number that may be read in it. Always returns 0.
+ *
+ * May be used after get_mempolicy or any other function that outputs an array
+ * of unsigned long (and possibly takes a maximal node number as input).
+ */
+static __hwloc_inline int
+hwloc_nodeset_from_linux_libnuma_ulongs(hwloc_topology_t topology, hwloc_nodeset_t nodeset,
+					const unsigned long *mask, unsigned long maxnode)
+{
+  const size_t ulong_bits = 8*sizeof(*mask);
+  const int numa_depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
+  hwloc_obj_t node = NULL;
+  hwloc_bitmap_zero(nodeset);
+  while ((node = hwloc_get_next_obj_by_depth(topology, numa_depth, node)) != NULL)
+    if (node->os_index < maxnode
+	&& ((mask[node->os_index / ulong_bits] >> (node->os_index % ulong_bits)) & 1UL))
+      hwloc_bitmap_set(nodeset, node->os_index);
+  return 0;
+}
+
+/** @} */
+
+
+
+/** \defgroup hwlocality_linux_libnuma_bitmask Interoperability with Linux libnuma bitmask
+ *
+ * This interface helps converting between Linux libnuma bitmasks
+ * and hwloc cpusets and nodesets.
+ *
+ * \note Topology \p topology must match the current machine.
+ *
+ * \note The behavior of libnuma is undefined if the kernel is not NUMA-aware
+ * (i.e. when CONFIG_NUMA is not set in the kernel configuration).
+ * This helper and libnuma may thus not be strictly compatible in this case,
+ * which may be detected by checking whether numa_available() returns -1.
+ *
+ * @{
+ */
+
+
+/** \brief Convert hwloc CPU set \p cpuset into the returned libnuma bitmask
+ *
+ * The result holds NUMA node indexes, so it is allocated with
+ * numa_allocate_nodemask() (sized for every possible node, not for CPUs).
+ * Only nodes with local memory are set. Free it with numa_bitmask_free().
+ * It may then be passed to numa_ functions taking a struct bitmask as input.
+ *
+ * \return newly allocated struct bitmask, or \c NULL if allocation failed.
+ */
+static __hwloc_inline struct bitmask *
+hwloc_cpuset_to_linux_libnuma_bitmask(hwloc_topology_t topology, hwloc_const_cpuset_t cpuset) __hwloc_attribute_malloc;
+static __hwloc_inline struct bitmask *
+hwloc_cpuset_to_linux_libnuma_bitmask(hwloc_topology_t topology, hwloc_const_cpuset_t cpuset)
+{
+  int depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
+  hwloc_obj_t node = NULL;
+  struct bitmask *bitmask = numa_allocate_nodemask(); /* node-sized: a cpumask may be too small for high node indexes */
+  if (!bitmask)
+    return NULL;
+  while ((node = hwloc_get_next_obj_covering_cpuset_by_depth(topology, cpuset, depth, node)) != NULL)
+    if (node->memory.local_memory)
+      numa_bitmask_setbit(bitmask, node->os_index);
+  return bitmask;
+}
+
+/** \brief Convert hwloc NUMA node set \p nodeset into the returned libnuma bitmask
+ *
+ * The result holds NUMA node indexes, so it is allocated with
+ * numa_allocate_nodemask() (sized for every possible node, not for CPUs).
+ * Only nodes with local memory are set. Free it with numa_bitmask_free().
+ * It may then be passed to numa_ functions taking a struct bitmask as input.
+ *
+ * \return newly allocated struct bitmask, or \c NULL if allocation failed.
+ */
+static __hwloc_inline struct bitmask *
+hwloc_nodeset_to_linux_libnuma_bitmask(hwloc_topology_t topology, hwloc_const_nodeset_t nodeset) __hwloc_attribute_malloc;
+static __hwloc_inline struct bitmask *
+hwloc_nodeset_to_linux_libnuma_bitmask(hwloc_topology_t topology, hwloc_const_nodeset_t nodeset)
+{
+  int depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
+  hwloc_obj_t node = NULL;
+  struct bitmask *bitmask = numa_allocate_nodemask(); /* node-sized: a cpumask may be too small for high node indexes */
+  if (!bitmask)
+    return NULL;
+  while ((node = hwloc_get_next_obj_by_depth(topology, depth, node)) != NULL)
+    if (hwloc_bitmap_isset(nodeset, node->os_index) && node->memory.local_memory)
+      numa_bitmask_setbit(bitmask, node->os_index);
+  return bitmask;
+}
+
+/** \brief Convert libnuma bitmask \p bitmask into hwloc CPU set \p cpuset
+ *
+ * May be used after numa_ functions that output a struct bitmask. Returns 0.
+ */
+static __hwloc_inline int
+hwloc_cpuset_from_linux_libnuma_bitmask(hwloc_topology_t topology, hwloc_cpuset_t cpuset,
+					const struct bitmask *bitmask)
+{
+  const int numa_depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
+  hwloc_obj_t node;
+  hwloc_bitmap_zero(cpuset);
+  for (node = hwloc_get_next_obj_by_depth(topology, numa_depth, NULL); node != NULL;
+       node = hwloc_get_next_obj_by_depth(topology, numa_depth, node))
+    if (numa_bitmask_isbitset(bitmask, node->os_index))
+      hwloc_bitmap_or(cpuset, cpuset, node->cpuset);
+  return 0;
+}
+
+/** \brief Convert libnuma bitmask \p bitmask into hwloc NUMA node set \p nodeset
+ *
+ * May be used after numa_ functions that output a struct bitmask. Returns 0.
+ */
+static __hwloc_inline int
+hwloc_nodeset_from_linux_libnuma_bitmask(hwloc_topology_t topology, hwloc_nodeset_t nodeset,
+					 const struct bitmask *bitmask)
+{
+  const int numa_depth = hwloc_get_type_depth(topology, HWLOC_OBJ_NUMANODE);
+  hwloc_obj_t node;
+  hwloc_bitmap_zero(nodeset);
+  for (node = hwloc_get_next_obj_by_depth(topology, numa_depth, NULL); node != NULL;
+       node = hwloc_get_next_obj_by_depth(topology, numa_depth, node))
+    if (numa_bitmask_isbitset(bitmask, node->os_index))
+      hwloc_bitmap_set(nodeset, node->os_index);
+  return 0;
+}
+
+/** @} */
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_LINUX_LIBNUMA_H */
diff --git a/ext/hwloc/include/hwloc/linux.h b/ext/hwloc/include/hwloc/linux.h
new file mode 100644
index 0000000..4ddc900
--- /dev/null
+++ b/ext/hwloc/include/hwloc/linux.h
@@ -0,0 +1,77 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2013 Inria.  All rights reserved.
+ * Copyright © 2009-2011 Université Bordeaux
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief Macros to help interaction between hwloc and Linux.
+ *
+ * Applications that use hwloc on Linux may want to include this file
+ * if using some low-level Linux features.
+ */
+
+#ifndef HWLOC_LINUX_H
+#define HWLOC_LINUX_H
+
+#include <hwloc.h>
+#include <stdio.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/** \defgroup hwlocality_linux Linux-specific helpers
+ *
+ * This includes helpers for manipulating Linux kernel cpumap files, and hwloc
+ * equivalents of the Linux sched_setaffinity and sched_getaffinity system calls.
+ *
+ * @{
+ */
+
+/** \brief Convert a linux kernel cpumap file \p file into hwloc CPU set.
+ *
+ * Might be used when reading CPU set from sysfs attributes such as topology
+ * and caches for processors, or local_cpus for devices.
+ */
+HWLOC_DECLSPEC int hwloc_linux_parse_cpumap_file(FILE *file, hwloc_cpuset_t set);
+
+/** \brief Bind a thread \p tid on cpus given in cpuset \p set
+ *
+ * The behavior is exactly the same as the Linux sched_setaffinity system call,
+ * but uses a hwloc cpuset.
+ *
+ * \note This is equivalent to calling hwloc_set_proc_cpubind() with
+ * HWLOC_CPUBIND_THREAD as flags.
+ */
+HWLOC_DECLSPEC int hwloc_linux_set_tid_cpubind(hwloc_topology_t topology, pid_t tid, hwloc_const_cpuset_t set);
+
+/** \brief Get the current binding of thread \p tid
+ *
+ * The behavior is exactly the same as the Linux sched_getaffinity system call,
+ * but uses a hwloc cpuset.
+ *
+ * \note This is equivalent to calling hwloc_get_proc_cpubind() with
+ * HWLOC_CPUBIND_THREAD as flags.
+ */
+HWLOC_DECLSPEC int hwloc_linux_get_tid_cpubind(hwloc_topology_t topology, pid_t tid, hwloc_cpuset_t set);
+
+/** \brief Get the last physical CPU where thread \p tid ran.
+ *
+ * \note This is equivalent to calling hwloc_get_proc_last_cpu_location() with
+ * HWLOC_CPUBIND_THREAD as flags.
+ */
+HWLOC_DECLSPEC int hwloc_linux_get_tid_last_cpu_location(hwloc_topology_t topology, pid_t tid, hwloc_bitmap_t set);
+
+/** @} */
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_LINUX_H */
diff --git a/ext/hwloc/include/hwloc/myriexpress.h b/ext/hwloc/include/hwloc/myriexpress.h
new file mode 100644
index 0000000..68ff88f
--- /dev/null
+++ b/ext/hwloc/include/hwloc/myriexpress.h
@@ -0,0 +1,127 @@
+/*
+ * Copyright © 2010-2014 Inria.  All rights reserved.
+ * Copyright © 2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief Macros to help interaction between hwloc and Myrinet Express.
+ *
+ * Applications that use both hwloc and Myrinet Express verbs may want to
+ * include this file so as to get topology information for Myrinet hardware.
+ *
+ */
+
+#ifndef HWLOC_MYRIEXPRESS_H
+#define HWLOC_MYRIEXPRESS_H
+
+#include <hwloc.h>
+#include <hwloc/autogen/config.h>
+
+#include <myriexpress.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/** \defgroup hwlocality_myriexpress Interoperability with Myrinet Express
+ *
+ * This interface offers ways to retrieve topology information about
+ * Myrinet Express hardware.
+ *
+ * @{
+ */
+
+/** \brief Get the CPU set of logical processors that are physically
+ * close to the MX board \p id.
+ *
+ * Return the CPU set describing the locality of the Myrinet Express
+ * board whose index is \p id.
+ *
+ * Topology \p topology and device \p id must match the local machine.
+ * I/O devices detection is not needed in the topology.
+ *
+ * The function only returns the locality of the device.
+ * No additional information about the device is available.
+ */
+static __hwloc_inline int
+hwloc_mx_board_get_device_cpuset(hwloc_topology_t topology,
+				 unsigned id, hwloc_cpuset_t set)
+{
+  uint32_t board = id;
+  uint32_t numa_os_index;
+  hwloc_obj_t numa = NULL;
+
+  if (!hwloc_topology_is_thissystem(topology)) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  if (mx_get_info(NULL, MX_NUMA_NODE, &board, sizeof(board),
+		  &numa_os_index, sizeof(numa_os_index)) != MX_SUCCESS) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  /* a reported value of (uint32_t) -1 skips the lookup and goes straight to the fallback */
+  while (numa_os_index != (uint32_t) -1
+	 && (numa = hwloc_get_next_obj_by_type(topology, HWLOC_OBJ_NUMANODE, numa)) != NULL) {
+    if (numa->os_index == numa_os_index) {
+      hwloc_bitmap_copy(set, numa->cpuset);
+      return 0;
+    }
+  }
+  /* no NUMA node with that OS index: use the full topology cpuset */
+  hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
+  return 0;
+}
+
+/** \brief Get the CPU set of logical processors that are physically
+ * close to the MX endpoint \p endpoint.
+ *
+ * Return the CPU set describing the locality of the Myrinet Express
+ * board that runs the MX endpoint \p endpoint.
+ *
+ * Topology \p topology and endpoint \p endpoint must match the local machine.
+ * I/O devices detection is not needed in the topology.
+ *
+ * The function only returns the locality of the endpoint.
+ * No additional information about the endpoint or device is available.
+ */
+static __hwloc_inline int
+hwloc_mx_endpoint_get_device_cpuset(hwloc_topology_t topology,
+				    mx_endpoint_t endpoint, hwloc_cpuset_t set)
+{
+  mx_endpoint_addr_t addr;
+  uint64_t nic_id;
+  uint32_t endpoint_id;
+  uint32_t board;
+
+  /*
+   * Resolve endpoint -> address -> NIC id -> board number.
+   * The chain stops at the first MX call that does not succeed,
+   * and every such failure is reported as EINVAL.
+   */
+  if (mx_get_endpoint_addr(endpoint, &addr) != MX_SUCCESS
+      || mx_decompose_endpoint_addr(addr, &nic_id, &endpoint_id) != MX_SUCCESS
+      || mx_nic_id_to_board_number(nic_id, &board) != MX_SUCCESS) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  /* endpoint_id is not used further: the locality returned is the one
+   * computed for the board number obtained above */
+  return hwloc_mx_board_get_device_cpuset(topology, board, set);
+}
+
+/** @} */
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_MYRIEXPRESS_H */
diff --git a/ext/hwloc/include/hwloc/nvml.h b/ext/hwloc/include/hwloc/nvml.h
new file mode 100644
index 0000000..462b332
--- /dev/null
+++ b/ext/hwloc/include/hwloc/nvml.h
@@ -0,0 +1,176 @@
+/*
+ * Copyright © 2012-2013 Inria.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief Macros to help interaction between hwloc and the NVIDIA Management Library.
+ *
+ * Applications that use both hwloc and the NVIDIA Management Library may want to
+ * include this file so as to get topology information for NVML devices.
+ */
+
+#ifndef HWLOC_NVML_H
+#define HWLOC_NVML_H
+
+#include <hwloc.h>
+#include <hwloc/autogen/config.h>
+#include <hwloc/helper.h>
+#ifdef HWLOC_LINUX_SYS
+#include <hwloc/linux.h>
+#endif
+
+#include <nvml.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/** \defgroup hwlocality_nvml Interoperability with the NVIDIA Management Library
+ *
+ * This interface offers ways to retrieve topology information about
+ * devices managed by the NVIDIA Management Library (NVML).
+ *
+ * @{
+ */
+
+/** \brief Get the CPU set of logical processors that are physically
+ * close to NVML device \p device.
+ *
+ * Return the CPU set describing the locality of the NVML device \p device.
+ *
+ * Topology \p topology and device \p device must match the local machine.
+ * I/O devices detection and the NVML component are not needed in the topology.
+ *
+ * The function only returns the locality of the device.
+ * If more information about the device is needed, OS objects should
+ * be used instead, see hwloc_nvml_get_device_osdev()
+ * and hwloc_nvml_get_device_osdev_by_index().
+ *
+ * This function is currently only implemented in a meaningful way for
+ * Linux; other systems will simply get a full cpuset.
+ */
+static __hwloc_inline int
+hwloc_nvml_get_device_cpuset(hwloc_topology_t topology __hwloc_attribute_unused,
+			     nvmlDevice_t device, hwloc_cpuset_t set)
+{
+#ifdef HWLOC_LINUX_SYS
+  /* If we're on Linux, use the sysfs mechanism to get the local cpus */
+#define HWLOC_NVML_DEVICE_SYSFS_PATH_MAX 128
+  char path[HWLOC_NVML_DEVICE_SYSFS_PATH_MAX];
+  FILE *sysfile = NULL;
+  nvmlPciInfo_t pci;
+
+  if (!hwloc_topology_is_thissystem(topology)) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  if (NVML_SUCCESS != nvmlDeviceGetPciInfo(device, &pci)) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  /* the sysfs path always names PCI function 0 of the device */
+  sprintf(path, "/sys/bus/pci/devices/%04x:%02x:%02x.0/local_cpus", pci.domain, pci.bus, pci.device);
+  sysfile = fopen(path, "r");
+  if (!sysfile)
+    return -1;
+
+  /* a failed parse (negative return) or an empty mask gives no locality: report every CPU */
+  if (hwloc_linux_parse_cpumap_file(sysfile, set) < 0
+      || hwloc_bitmap_iszero(set))
+    hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
+
+  fclose(sysfile);
+#else
+  /* Non-Linux systems simply get a full cpuset */
+  hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
+#endif
+  return 0;
+}
+
+/** \brief Get the hwloc OS device object corresponding to the
+ * NVML device whose index is \p idx.
+ *
+ * Return the OS device object describing the NVML device whose
+ * index is \p idx. Returns NULL if there is none.
+ *
+ * The topology \p topology does not necessarily have to match the current
+ * machine. For instance the topology may be an XML import of a remote host.
+ * I/O devices detection and the NVML component must be enabled in the topology.
+ *
+ * \note The corresponding PCI device object can be obtained by looking
+ * at the OS device parent object.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_nvml_get_device_osdev_by_index(hwloc_topology_t topology, unsigned idx)
+{
+	hwloc_obj_t dev = NULL;
+	while ((dev = hwloc_get_next_osdev(topology, dev)) != NULL) {
+		if (HWLOC_OBJ_OSDEV_GPU != dev->attr->osdev.type
+		    || !dev->name || strncmp("nvml", dev->name, 4))
+			continue; /* not a GPU OS device whose name starts with "nvml" */
+		if (atoi(dev->name + 4) == (int) idx)
+			return dev;
+	}
+	return NULL;
+}
+
+/** \brief Get the hwloc OS device object corresponding to NVML device \p device.
+ *
+ * Return the hwloc OS device object that describes the given
+ * NVML device \p device. Return NULL if there is none.
+ *
+ * Topology \p topology and device \p device must match the local machine.
+ * I/O devices detection and the NVML component must be enabled in the topology.
+ * If not, the locality of the object may still be found using
+ * hwloc_nvml_get_device_cpuset().
+ *
+ * \note The corresponding hwloc PCI device may be found by looking
+ * at the result parent pointer.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_nvml_get_device_osdev(hwloc_topology_t topology, nvmlDevice_t device)
+{
+	hwloc_obj_t osdev = NULL;
+	nvmlPciInfo_t pci;
+
+	if (!hwloc_topology_is_thissystem(topology)) {
+		errno = EINVAL;
+		return NULL;
+	}
+
+	/* pci.domain, pci.bus and pci.device are matched against the topology below */
+	if (NVML_SUCCESS != nvmlDeviceGetPciInfo(device, &pci))
+		return NULL;
+
+	/* find the "nvml*" OS device whose parent is function 0 of that PCI device */
+	while ((osdev = hwloc_get_next_osdev(topology, osdev)) != NULL) {
+		hwloc_obj_t pcidev = osdev->parent;
+		/* skip unnamed OS devices instead of passing NULL to strncmp() */
+		if (!osdev->name || strncmp(osdev->name, "nvml", 4))
+			continue;
+		if (pcidev
+		    && pcidev->type == HWLOC_OBJ_PCI_DEVICE
+		    && pcidev->attr->pcidev.domain == pci.domain
+		    && pcidev->attr->pcidev.bus == pci.bus
+		    && pcidev->attr->pcidev.dev == pci.device
+		    && pcidev->attr->pcidev.func == 0)
+			return osdev;
+	}
+
+	return NULL;
+}
+
+/** @} */
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_NVML_H */
diff --git a/ext/hwloc/include/hwloc/opencl.h b/ext/hwloc/include/hwloc/opencl.h
new file mode 100644
index 0000000..0301ad9
--- /dev/null
+++ b/ext/hwloc/include/hwloc/opencl.h
@@ -0,0 +1,199 @@
+/*
+ * Copyright © 2012-2013 Inria.  All rights reserved.
+ * Copyright © 2013 Université Bordeaux.  All right reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief Macros to help interaction between hwloc and the OpenCL interface.
+ *
+ * Applications that use both hwloc and OpenCL may want to
+ * include this file so as to get topology information for OpenCL devices.
+ */
+
+#ifndef HWLOC_OPENCL_H
+#define HWLOC_OPENCL_H
+
+#include <hwloc.h>
+#include <hwloc/autogen/config.h>
+#include <hwloc/helper.h>
+#ifdef HWLOC_LINUX_SYS
+#include <hwloc/linux.h>
+#endif
+
+#include <CL/cl.h>
+#include <CL/cl_ext.h>
+
+#include <stdio.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/** \defgroup hwlocality_opencl Interoperability with OpenCL
+ *
+ * This interface offers ways to retrieve topology information about
+ * OpenCL devices.
+ *
+ * Only the AMD OpenCL interface currently offers useful locality information
+ * about its devices.
+ *
+ * @{
+ */
+
+/** \brief Get the CPU set of logical processors that are physically
+ * close to OpenCL device \p device.
+ *
+ * Return the CPU set describing the locality of the OpenCL device \p device.
+ *
+ * Topology \p topology and device \p device must match the local machine.
+ * I/O devices detection and the OpenCL component are not needed in the topology.
+ *
+ * The function only returns the locality of the device.
+ * If more information about the device is needed, OS objects should
+ * be used instead, see hwloc_opencl_get_device_osdev()
+ * and hwloc_opencl_get_device_osdev_by_index().
+ *
+ * This function is currently only implemented in a meaningful way for
+ * Linux with the AMD OpenCL implementation; other systems will simply
+ * get a full cpuset.
+ */
+static __hwloc_inline int
+hwloc_opencl_get_device_cpuset(hwloc_topology_t topology __hwloc_attribute_unused,
+			       cl_device_id device __hwloc_attribute_unused,
+			       hwloc_cpuset_t set)
+{
+#if (defined HWLOC_LINUX_SYS) && (defined CL_DEVICE_TOPOLOGY_AMD)
+	/* If we're on Linux + AMD OpenCL, use the AMD extension + the sysfs mechanism to get the local cpus */
+#define HWLOC_OPENCL_DEVICE_SYSFS_PATH_MAX 128
+	char path[HWLOC_OPENCL_DEVICE_SYSFS_PATH_MAX];
+	FILE *sysfile = NULL;
+	cl_device_topology_amd amdtopo;
+	cl_int clret;
+
+	if (!hwloc_topology_is_thissystem(topology)) {
+		errno = EINVAL;
+		return -1;
+	}
+
+	/* if the AMD topology query fails, or the device is not described as PCIe,
+	 * there is no PCI address to look up: report the full cpuset with success */
+	clret = clGetDeviceInfo(device, CL_DEVICE_TOPOLOGY_AMD, sizeof(amdtopo), &amdtopo, NULL);
+	if (CL_SUCCESS != clret || CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD != amdtopo.raw.type) {
+		hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
+		return 0;
+	}
+
+	/* the PCI domain is hard-coded to 0000; bus/device/function come from amdtopo */
+	sprintf(path, "/sys/bus/pci/devices/0000:%02x:%02x.%01x/local_cpus", amdtopo.pcie.bus, amdtopo.pcie.device, amdtopo.pcie.function);
+	sysfile = fopen(path, "r");
+	if (!sysfile)
+		return -1;
+
+	/* a failed parse (negative return) or an empty mask gives no locality: report every CPU */
+	if (hwloc_linux_parse_cpumap_file(sysfile, set) < 0
+	    || hwloc_bitmap_iszero(set))
+		hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
+
+	fclose(sysfile);
+#else
+	/* Non-Linux + AMD OpenCL systems simply get a full cpuset */
+	hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
+#endif
+  return 0;
+}
+
+/** \brief Get the hwloc OS device object corresponding to the
+ * OpenCL device for the given indexes.
+ *
+ * Return the OS device object describing the OpenCL device
+ * whose platform index is \p platform_index,
+ * and whose device index within this platform is \p device_index.
+ * Return NULL if there is none.
+ *
+ * The topology \p topology does not necessarily have to match the current
+ * machine. For instance the topology may be an XML import of a remote host.
+ * I/O devices detection and the OpenCL component must be enabled in the topology.
+ *
+ * \note The corresponding PCI device object can be obtained by looking
+ * at the OS device parent object.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_opencl_get_device_osdev_by_index(hwloc_topology_t topology,
+				       unsigned platform_index, unsigned device_index)
+{
+	unsigned plat = (unsigned) -1, devnum = (unsigned) -1;
+	hwloc_obj_t dev = NULL;
+	while ((dev = hwloc_get_next_osdev(topology, dev)) != NULL) {
+		if (HWLOC_OBJ_OSDEV_COPROC != dev->attr->osdev.type || !dev->name)
+			continue; /* not a named co-processor OS device */
+		if (sscanf(dev->name, "opencl%ud%u", &plat, &devnum) == 2
+		    && platform_index == plat && device_index == devnum)
+			return dev;
+	}
+	return NULL;
+}
+
+/** \brief Get the hwloc OS device object corresponding to OpenCL device \p device.
+ *
+ * Return the hwloc OS device object that describes the given
+ * OpenCL device \p device. Return NULL if there is none.
+ *
+ * Topology \p topology and device \p device must match the local machine.
+ * I/O devices detection and the OpenCL component must be enabled in the topology.
+ * If not, the locality of the object may still be found using
+ * hwloc_opencl_get_device_cpuset().
+ *
+ * \note The corresponding hwloc PCI device may be found by looking
+ * at the result parent pointer.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_opencl_get_device_osdev(hwloc_topology_t topology __hwloc_attribute_unused,
+			      cl_device_id device __hwloc_attribute_unused)
+{
+#ifdef CL_DEVICE_TOPOLOGY_AMD
+	hwloc_obj_t osdev = NULL;
+	cl_device_topology_amd amdtopo;
+	cl_int clret;
+
+	/* the PCI address used for matching comes from the AMD topology extension */
+	clret = clGetDeviceInfo(device, CL_DEVICE_TOPOLOGY_AMD, sizeof(amdtopo), &amdtopo, NULL);
+	if (CL_SUCCESS != clret || CL_DEVICE_TOPOLOGY_TYPE_PCIE_AMD != amdtopo.raw.type) {
+		errno = EINVAL;
+		return NULL;
+	}
+
+	/* find the "opencl*" OS device whose parent PCI device has the same address
+	 * (the domain is compared against 0; bus/device/function come from amdtopo) */
+	while ((osdev = hwloc_get_next_osdev(topology, osdev)) != NULL) {
+		hwloc_obj_t pcidev = osdev->parent;
+		/* skip unnamed OS devices instead of passing NULL to strncmp() */
+		if (!osdev->name || strncmp(osdev->name, "opencl", 6))
+			continue;
+		if (pcidev
+		    && pcidev->type == HWLOC_OBJ_PCI_DEVICE
+		    && pcidev->attr->pcidev.domain == 0
+		    && pcidev->attr->pcidev.bus == amdtopo.pcie.bus
+		    && pcidev->attr->pcidev.dev == amdtopo.pcie.device
+		    && pcidev->attr->pcidev.func == amdtopo.pcie.function)
+			return osdev;
+	}
+
+	return NULL;
+#else
+	/* built without CL_DEVICE_TOPOLOGY_AMD: no lookup is attempted */
+	return NULL;
+#endif
+}
+
+/** @} */
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_OPENCL_H */
diff --git a/ext/hwloc/include/hwloc/openfabrics-verbs.h b/ext/hwloc/include/hwloc/openfabrics-verbs.h
new file mode 100644
index 0000000..c6b8533
--- /dev/null
+++ b/ext/hwloc/include/hwloc/openfabrics-verbs.h
@@ -0,0 +1,155 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2013 Inria.  All rights reserved.
+ * Copyright © 2009-2010 Université Bordeaux
+ * Copyright © 2009-2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/** \file
+ * \brief Macros to help interaction between hwloc and OpenFabrics
+ * verbs.
+ *
+ * Applications that use both hwloc and OpenFabrics verbs may want to
+ * include this file so as to get topology information for OpenFabrics
+ * hardware.
+ *
+ */
+
+#ifndef HWLOC_OPENFABRICS_VERBS_H
+#define HWLOC_OPENFABRICS_VERBS_H
+
+#include <hwloc.h>
+#include <hwloc/autogen/config.h>
+#ifdef HWLOC_LINUX_SYS
+#include <hwloc/linux.h>
+#endif
+
+#include <infiniband/verbs.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/** \defgroup hwlocality_openfabrics Interoperability with OpenFabrics
+ *
+ * This interface offers ways to retrieve topology information about
+ * OpenFabrics devices.
+ *
+ * @{
+ */
+
+/** \brief Get the CPU set of logical processors that are physically
+ * close to device \p ibdev.
+ *
+ * Return the CPU set describing the locality of the OpenFabrics
+ * device \p ibdev.
+ *
+ * Topology \p topology and device \p ibdev must match the local machine.
+ * I/O devices detection is not needed in the topology.
+ *
+ * The function only returns the locality of the device.
+ * If more information about the device is needed, OS objects should
+ * be used instead, see hwloc_ibv_get_device_osdev()
+ * and hwloc_ibv_get_device_osdev_by_name().
+ *
+ * This function is currently only implemented in a meaningful way for
+ * Linux; other systems will simply get a full cpuset.
+ */
+static __hwloc_inline int
+hwloc_ibv_get_device_cpuset(hwloc_topology_t topology __hwloc_attribute_unused,
+			    struct ibv_device *ibdev, hwloc_cpuset_t set)
+{
+#ifdef HWLOC_LINUX_SYS
+  /* If we're on Linux, use the verbs-provided sysfs mechanism to get the local cpus */
+#define HWLOC_OPENFABRICS_VERBS_SYSFS_PATH_MAX 128
+  char path[HWLOC_OPENFABRICS_VERBS_SYSFS_PATH_MAX];
+  FILE *sysfile = NULL;
+
+  if (!hwloc_topology_is_thissystem(topology)) {
+    errno = EINVAL;
+    return -1;
+  }
+
+  sprintf(path, "/sys/class/infiniband/%s/device/local_cpus",
+	  ibv_get_device_name(ibdev));
+  sysfile = fopen(path, "r");
+  if (!sysfile)
+    return -1;
+
+  /* a failed parse (negative return) or an empty mask gives no locality: report every CPU */
+  if (hwloc_linux_parse_cpumap_file(sysfile, set) < 0
+      || hwloc_bitmap_iszero(set))
+    hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
+
+  fclose(sysfile);
+#else
+  /* Non-Linux systems simply get a full cpuset */
+  hwloc_bitmap_copy(set, hwloc_topology_get_complete_cpuset(topology));
+#endif
+  return 0;
+}
+
+/** \brief Get the hwloc OS device object corresponding to the OpenFabrics
+ * device named \p ibname.
+ *
+ * Return the OS device object describing the OpenFabrics device whose
+ * name is \p ibname. Returns NULL if there is none.
+ * The name \p ibname is usually obtained from ibv_get_device_name().
+ *
+ * The topology \p topology does not necessarily have to match the current
+ * machine. For instance the topology may be an XML import of a remote host.
+ * I/O devices detection must be enabled in the topology.
+ *
+ * \note The corresponding PCI device object can be obtained by looking
+ * at the OS device parent object.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_ibv_get_device_osdev_by_name(hwloc_topology_t topology,
+				   const char *ibname)
+{
+	hwloc_obj_t dev = hwloc_get_next_osdev(topology, NULL);
+	/* walk all OS devices until an OpenFabrics one carries the requested name */
+	for (; dev != NULL; dev = hwloc_get_next_osdev(topology, dev))
+		if (HWLOC_OBJ_OSDEV_OPENFABRICS == dev->attr->osdev.type
+		    && dev->name && strcmp(ibname, dev->name) == 0)
+			return dev;
+	return NULL;
+}
+
+/** \brief Get the hwloc OS device object corresponding to the OpenFabrics
+ * device \p ibdev.
+ *
+ * Return the OS device object describing the OpenFabrics device \p ibdev.
+ * Returns NULL if there is none.
+ *
+ * Topology \p topology and device \p ibdev must match the local machine.
+ * I/O devices detection must be enabled in the topology.
+ * If not, the locality of the object may still be found using
+ * hwloc_ibv_get_device_cpuset().
+ *
+ * \note The corresponding PCI device object can be obtained by looking
+ * at the OS device parent object.
+ */
+static __hwloc_inline hwloc_obj_t
+hwloc_ibv_get_device_osdev(hwloc_topology_t topology,
+			   struct ibv_device *ibdev)
+{
+	/* the verbs device name is the key used to find the matching OS device */
+	if (hwloc_topology_is_thissystem(topology))
+		return hwloc_ibv_get_device_osdev_by_name(topology, ibv_get_device_name(ibdev));
+	errno = EINVAL;
+	return NULL;
+}
+
+/** @} */
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_OPENFABRICS_VERBS_H */
diff --git a/ext/hwloc/include/hwloc/plugins.h b/ext/hwloc/include/hwloc/plugins.h
new file mode 100644
index 0000000..7fc794d
--- /dev/null
+++ b/ext/hwloc/include/hwloc/plugins.h
@@ -0,0 +1,433 @@
+/*
+ * Copyright © 2013-2015 Inria.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#ifndef HWLOC_PLUGINS_H
+#define HWLOC_PLUGINS_H
+
+/** \file
+ * \brief Public interface for building hwloc plugins.
+ */
+
+struct hwloc_backend;
+
+#include <hwloc.h>
+#ifdef HWLOC_INSIDE_PLUGIN
+/* needed for hwloc_plugin_check_namespace() */
+#include <ltdl.h>
+#endif
+
+
+
+/** \defgroup hwlocality_disc_components Components and Plugins: Discovery components
+ * @{
+ */
+
+/** \brief Discovery component type (one bit per type, so that types may be OR'ed) */
+typedef enum hwloc_disc_component_type_e {
+  /** \brief CPU-only discovery through the OS, or generic no-OS support.
+   * \hideinitializer */
+  HWLOC_DISC_COMPONENT_TYPE_CPU = (1<<0),
+
+  /** \brief xml or synthetic,
+   * platform-specific components such as bgq.
+   * Anything that discovers CPUs and everything else.
+   * No misc backend is expected to complement a global component.
+   * \hideinitializer */
+  HWLOC_DISC_COMPONENT_TYPE_GLOBAL = (1<<1),
+
+  /** \brief OpenCL, Cuda, etc.
+   * \hideinitializer */
+  HWLOC_DISC_COMPONENT_TYPE_MISC = (1<<2)
+} hwloc_disc_component_type_t;
+
+/** \brief Discovery component structure
+ *
+ * This is the major kind of components, taking care of the discovery.
+ * They are registered by generic components, either statically-built or as plugins.
+ */
+struct hwloc_disc_component {
+  /** \brief Discovery component type */
+  hwloc_disc_component_type_t type;
+
+  /** \brief Name.
+   * If this component is built as a plugin, this name does not have to match the plugin filename.
+   */
+  const char *name;
+
+  /** \brief Component types to exclude, as an OR'ed set of HWLOC_DISC_COMPONENT_TYPE_*.
+   *
+   * For a GLOBAL component, this usually includes all other types (~0, i.e. every bit set).
+   *
+   * Other components only exclude types that may bring conflicting
+   * topology information. MISC components should likely not be excluded
+   * since they usually bring non-primary additional information.
+   */
+  unsigned excludes;
+
+  /** \brief Instantiate callback to create a backend from the component.
+   * Parameters data1, data2, data3 are NULL except for components
+   * that have special enabling routines such as hwloc_topology_set_xml(). */
+  struct hwloc_backend * (*instantiate)(struct hwloc_disc_component *component, const void *data1, const void *data2, const void *data3);
+
+  /** \brief Component priority.
+   * Used to sort topology->components, higher priority first.
+   * Also used to decide between two components with the same name.
+   *
+   * Usual values are
+   * 50 for native OS (or platform) components,
+   * 45 for x86,
+   * 40 for no-OS fallback,
+   * 30 for global components (xml, synthetic),
+   * 20 for pci,
+   * 10 for other misc components (opencl etc.).
+   */
+  unsigned priority;
+
+  /** \private Used internally to list components by priority on topology->components
+   * (the component structure is usually read-only,
+   *  so the core copies it before using this field for queueing)
+   */
+  struct hwloc_disc_component * next;
+};
+
+/** @} */
+
+
+
+
+/** \defgroup hwlocality_disc_backends Components and Plugins: Discovery backends
+ * @{
+ */
+
+/** \brief Discovery backend structure
+ *
+ * A backend is the instantiation of a discovery component.
+ * When a component gets enabled for a topology,
+ * its instantiate() callback creates a backend.
+ *
+ * hwloc_backend_alloc() initializes all fields to default values
+ * that the component may change (except "component" and "next")
+ * before enabling the backend with hwloc_backend_enable().
+ */
+struct hwloc_backend {
+  /** \private Reserved for the core, set by hwloc_backend_alloc() */
+  struct hwloc_disc_component * component;
+  /** \private Reserved for the core, set by hwloc_backend_enable() */
+  struct hwloc_topology * topology;
+  /** \private Reserved for the core. Set to 1 if forced through envvar, 0 otherwise. */
+  int envvar_forced;
+  /** \private Reserved for the core. Used internally to chain backends on the topology->backends list. */
+  struct hwloc_backend * next;
+
+  /** \brief Backend flags, as an OR'ed set of HWLOC_BACKEND_FLAG_* */
+  unsigned long flags;
+
+  /** \brief Backend-specific 'is_thissystem' property.
+   * Set to 0 or 1 if the backend should enforce the thissystem flag when it gets enabled.
+   * Set to -1 if the backend doesn't care (default). */
+  int is_thissystem;
+
+  /** \brief Backend private data, or NULL if none. */
+  void * private_data;
+  /** \brief Callback for freeing the private_data.
+   * May be NULL.
+   */
+  void (*disable)(struct hwloc_backend *backend);
+
+  /** \brief Main discovery callback.
+   * Returns > 0 if it modified the topology tree, -1 on error, 0 otherwise.
+   * May be NULL if type is HWLOC_DISC_COMPONENT_TYPE_MISC. */
+  int (*discover)(struct hwloc_backend *backend);
+
+  /** \brief Callback used by the PCI backend to retrieve the locality of a PCI object from the OS/cpu backend.
+   * May be NULL. */
+  int (*get_obj_cpuset)(struct hwloc_backend *backend, struct hwloc_backend *caller, struct hwloc_obj *obj, hwloc_bitmap_t cpuset);
+
+  /** \brief Callback called by backends to notify this backend that a new object was added.
+   * Returns > 0 if it modified the topology tree, 0 otherwise.
+   * May be NULL. */
+  int (*notify_new_object)(struct hwloc_backend *backend, struct hwloc_backend *caller, struct hwloc_obj *obj);
+};
+
+/** \brief Backend flags, to be OR'ed into the flags field of struct hwloc_backend */
+enum hwloc_backend_flag_e {
+  /** \brief Levels should be reconnected before this backend's discover() is used.
+   * \hideinitializer */
+  HWLOC_BACKEND_FLAG_NEED_LEVELS = (1UL<<0)
+};
+
+/** \brief Allocate a backend structure, set good default values, initialize backend->component and topology, etc.
+ * The caller will then modify whatever needed, and call hwloc_backend_enable().
+ */
+HWLOC_DECLSPEC struct hwloc_backend * hwloc_backend_alloc(struct hwloc_disc_component *component);
+
+/** \brief Enable a previously allocated and setup backend. */
+HWLOC_DECLSPEC int hwloc_backend_enable(struct hwloc_topology *topology, struct hwloc_backend *backend);
+
+/** \brief Used by backends discovery callbacks to request locality information from others.
+ *
+ * Traverse the list of enabled backends until one has a
+ * get_obj_cpuset() method, and call it.
+ */
+HWLOC_DECLSPEC int hwloc_backends_get_obj_cpuset(struct hwloc_backend *caller, struct hwloc_obj *obj, hwloc_bitmap_t cpuset);
+
+/** \brief Used by backends discovery callbacks to notify other
+ * backends of new objects.
+ *
+ * Traverse the list of enabled backends (all but caller) and invoke
+ * their notify_new_object() method to notify them that a new object
+ * just got added to the topology.
+ *
+ * Currently only used for notifying of new PCI device objects.
+ */
+HWLOC_DECLSPEC int hwloc_backends_notify_new_object(struct hwloc_backend *caller, struct hwloc_obj *obj);
+
+/** @} */
+
+
+
+
+/** \defgroup hwlocality_generic_components Components and Plugins: Generic components
+ * @{
+ */
+
+/** \brief Generic component type, stored in the type field of struct hwloc_component */
+typedef enum hwloc_component_type_e {
+  /** \brief The data field must point to a struct hwloc_disc_component. */
+  HWLOC_COMPONENT_TYPE_DISC,
+
+  /** \brief The data field must point to a struct hwloc_xml_component. */
+  HWLOC_COMPONENT_TYPE_XML
+} hwloc_component_type_t;
+
+/** \brief Generic component structure
+ *
+ * Generic components structure, either statically listed by configure in static-components.h
+ * or dynamically loaded as a plugin.
+ */
+struct hwloc_component {
+  /** \brief Component ABI version, set to HWLOC_COMPONENT_ABI */
+  unsigned abi;
+
+  /** \brief Process-wide component initialization callback.
+   *
+   * This optional callback is called when the component is registered
+   * to the hwloc core (after loading the plugin).
+   *
+   * When the component is built as a plugin, this callback
+   * should call hwloc_plugin_check_namespace()
+   * and return a negative error code on error.
+   *
+   * \p flags is always 0 for now.
+   *
+   * \return 0 on success, or a negative code on error.
+   *
+   * \note If the component uses ltdl for loading its own plugins,
+   * it should load/unload them only in init() and finalize(),
+   * to avoid race conditions with hwloc's use of ltdl.
+   */
+  int (*init)(unsigned long flags);
+
+  /** \brief Process-wide component termination callback.
+   *
+   * This optional callback is called after unregistering the component
+   * from the hwloc core (before unloading the plugin).
+   *
+   * \p flags is always 0 for now.
+   *
+   * \note If the component uses ltdl for loading its own plugins,
+   * it should load/unload them only in init() and finalize(),
+   * to avoid race conditions with hwloc's use of ltdl.
+   */
+  void (*finalize)(unsigned long flags);
+
+  /** \brief Component type */
+  hwloc_component_type_t type;
+
+  /** \brief Component flags, unused for now */
+  unsigned long flags;
+
+  /** \brief Component data, pointing to a struct hwloc_disc_component or struct hwloc_xml_component. */
+  void * data;
+};
+
+/** @} */
+
+
+
+
+/** \defgroup hwlocality_components_core_funcs Components and Plugins: Core functions to be used by components
+ * @{
+ */
+
+/** \brief Add an object to the topology.
+ *
+ * It is sorted along the tree of other objects according to the inclusion of
+ * cpusets, to eventually be added as a child of the smallest object including
+ * this object.
+ *
+ * If the cpuset is empty, the type of the object (and maybe some attributes)
+ * must be enough to find where to insert the object. This is especially true
+ * for NUMA nodes with memory and no CPUs.
+ *
+ * The given object should not have children.
+ *
+ * This shall only be called before levels are built.
+ *
+ * In case of error, hwloc_report_os_error() is called.
+ *
+ * Returns the object on success.
+ * Returns NULL and frees obj on error.
+ * Returns another object and frees obj if it was merged with an identical pre-existing object.
+ */
+HWLOC_DECLSPEC struct hwloc_obj *hwloc_insert_object_by_cpuset(struct hwloc_topology *topology, hwloc_obj_t obj);
+
+/** \brief Type of error callbacks during object insertion */
+typedef void (*hwloc_report_error_t)(const char * msg, int line);
+/** \brief Report an insertion error from a backend */
+HWLOC_DECLSPEC void hwloc_report_os_error(const char * msg, int line);
+/** \brief Check whether insertion errors are hidden */
+HWLOC_DECLSPEC int hwloc_hide_errors(void);
+
+/** \brief Add an object to the topology and specify which error callback to use.
+ *
+ * Aside from the error callback selection, this function is identical to hwloc_insert_object_by_cpuset()
+ */
+HWLOC_DECLSPEC struct hwloc_obj *hwloc__insert_object_by_cpuset(struct hwloc_topology *topology, hwloc_obj_t obj, hwloc_report_error_t report_error);
+
+/** \brief Insert an object somewhere in the topology.
+ *
+ * It is added as the last child of the given parent.
+ * The cpuset is completely ignored, so strange objects such as I/O devices should
+ * preferably be inserted with this.
+ *
+ * When used for "normal" children with cpusets (when importing from XML
+ * or when duplicating a topology), the caller should make sure that:
+ * - children are inserted in order,
+ * - children cpusets do not intersect.
+ *
+ * The given object may have normal, I/O or Misc children, as long as they are in order as well.
+ * These children must have valid parent and next_sibling pointers.
+ */
+HWLOC_DECLSPEC void hwloc_insert_object_by_parent(struct hwloc_topology *topology, hwloc_obj_t parent, hwloc_obj_t obj);
+
+/** \brief Allocate and initialize an object of the given type and physical index
+ *
+ * The object and its attribute union are zero-initialized, then the type
+ * and OS index are filled in. The cpuset is not allocated here.
+ *
+ * \param type Type stored in the new object.
+ * \param os_index OS/physical index stored in the new object.
+ *
+ * \return The new object on success.
+ * \return NULL if memory allocation failed; nothing is leaked in that case.
+ */
+static __hwloc_inline struct hwloc_obj *
+hwloc_alloc_setup_object(hwloc_obj_type_t type, signed os_index)
+{
+  /* calloc() gives zeroed memory, replacing the former malloc()+memset() pair */
+  struct hwloc_obj *obj = calloc(1, sizeof(*obj));
+  if (!obj)
+    return NULL;
+  obj->attr = calloc(1, sizeof(*obj->attr));
+  if (!obj->attr) {
+    /* do not leak the partially built object */
+    free(obj);
+    return NULL;
+  }
+  obj->type = type;
+  obj->os_index = os_index;
+  /* do not allocate the cpuset here, let the caller do it */
+  return obj;
+}
+
+/** \brief Setup object cpusets/nodesets by OR'ing its children.
+ *
+ * Used when adding an object late in the topology.
+ * Will update the new object by OR'ing all its new children sets.
+ *
+ * Used when PCI backend adds a hostbridge parent, when distances
+ * add a new Group, etc.
+ */
+HWLOC_DECLSPEC int hwloc_obj_add_children_sets(hwloc_obj_t obj);
+
+/** \brief Make sure that plugins can lookup core symbols.
+ *
+ * This is a sanity check to avoid lazy-lookup failures when libhwloc
+ * is loaded within a plugin, and later tries to load its own plugins.
+ * This may fail (and abort the program) if libhwloc symbols are in a
+ * private namespace.
+ *
+ * \param pluginname Name of the calling plugin, only used in the verbose message.
+ * \param symbol Name of the core symbol to look up in the running program.
+ *
+ * \return 0 on success, when the check cannot be performed, or when
+ * HWLOC_INSIDE_PLUGIN is not defined (the check is then compiled out).
+ * \return -1 if the symbol cannot be found. The caller
+ * plugin init() callback should return a negative error code as well.
+ *
+ * Plugins should call this function in their init() callback to avoid
+ * later crashes if lazy symbol resolution is used by the upper layer that
+ * loaded hwloc (e.g. OpenCL implementations using dlopen with RTLD_LAZY).
+ *
+ * A message is printed to stderr on failure only if the HWLOC_PLUGINS_VERBOSE
+ * environment variable is set to a non-zero integer.
+ *
+ * \note The build system must define HWLOC_INSIDE_PLUGIN if and only if
+ * building the caller as a plugin.
+ *
+ * \note This function should remain inline so plugins can call it even
+ * when they cannot find libhwloc symbols.
+ */
+static __hwloc_inline int
+hwloc_plugin_check_namespace(const char *pluginname __hwloc_attribute_unused, const char *symbol __hwloc_attribute_unused)
+{
+#ifdef HWLOC_INSIDE_PLUGIN
+  lt_dlhandle handle;
+  void *sym;
+  handle = lt_dlopen(NULL);
+  if (!handle)
+    /* cannot check, assume things will work */
+    return 0;
+  sym = lt_dlsym(handle, symbol);
+  lt_dlclose(handle);
+  if (!sym) {
+    /* the environment is only read once, the result is cached in these statics */
+    static int verboseenv_checked = 0;
+    static int verboseenv_value = 0;
+    if (!verboseenv_checked) {
+      const char *verboseenv = getenv("HWLOC_PLUGINS_VERBOSE");
+      /* getenv() returns NULL when the variable is unset, and atoi(NULL)
+       * is undefined behavior: treat an unset variable as "not verbose" */
+      verboseenv_value = verboseenv ? atoi(verboseenv) : 0;
+      verboseenv_checked = 1;
+    }
+    if (verboseenv_value)
+      fprintf(stderr, "Plugin `%s' disabling itself because it cannot find the `%s' core symbol.\n",
+	      pluginname, symbol);
+    return -1;
+  }
+#endif /* HWLOC_INSIDE_PLUGIN */
+  return 0;
+}
+
+/** @} */
+
+
+
+
+/** \defgroup hwlocality_components_pci_funcs Components and Plugins: PCI functions to be used by components
+ * @{
+ */
+
+/** \brief Insert a list of PCI devices and bridges in the backend topology.
+ *
+ * Insert a list of objects (either PCI device or bridges) starting at first_obj
+ * (linked by next_sibling in the topology, and ending with NULL).
+ * Objects are placed under the right bridges, and the remaining upstream bridges
+ * are then inserted in the topology by calling the get_obj_cpuset() callback to
+ * find their locality.
+ */
+HWLOC_DECLSPEC int hwloc_insert_pci_device_list(struct hwloc_backend *backend, struct hwloc_obj *first_obj);
+
+/** \brief Return the offset of the given capability in the PCI config space buffer
+ *
+ * This function requires a 256-bytes config space. Unknown/unavailable bytes should be set to 0xff.
+ */
+HWLOC_DECLSPEC unsigned hwloc_pci_find_cap(const unsigned char *config, unsigned cap);
+
+/** \brief Fill linkspeed by reading the PCI config space where PCI_CAP_ID_EXP is at position offset.
+ *
+ * Needs 20 bytes of EXP capability block starting at offset in the config space
+ * for registers up to link status.
+ */
+HWLOC_DECLSPEC int hwloc_pci_find_linkspeed(const unsigned char *config, unsigned offset, float *linkspeed);
+
+/** \brief Modify the PCI device object into a bridge and fill its attribute if a bridge is found in the PCI config space.
+ *
+ * This function requires 64 bytes of common configuration header at the beginning of config.
+ */
+HWLOC_DECLSPEC int hwloc_pci_prepare_bridge(hwloc_obj_t obj, const unsigned char *config);
+
+/** @} */
+
+
+
+
+#endif /* HWLOC_PLUGINS_H */
diff --git a/ext/hwloc/include/hwloc/rename.h b/ext/hwloc/include/hwloc/rename.h
new file mode 100644
index 0000000..2684e71
--- /dev/null
+++ b/ext/hwloc/include/hwloc/rename.h
@@ -0,0 +1,651 @@
+/*
+ * Copyright © 2009-2011 Cisco Systems, Inc.  All rights reserved.
+ * Copyright © 2010-2015 Inria.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#ifndef HWLOC_RENAME_H
+#define HWLOC_RENAME_H
+
+#include <hwloc/autogen/config.h>
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/* Only enact these defines if we're actually renaming the symbols
+   (i.e., avoid trying to have no-op defines if we're *not*
+   renaming). */
+
+#if HWLOC_SYM_TRANSFORM
+
+/* Use a preprocessor two-step in order to get the prefixing right.
+   Make 2 macros: HWLOC_NAME and HWLOC_NAME_CAPS for renaming
+   things. */
+
+#define HWLOC_MUNGE_NAME(a, b) HWLOC_MUNGE_NAME2(a, b)
+#define HWLOC_MUNGE_NAME2(a, b) a ## b
+#define HWLOC_NAME(name) HWLOC_MUNGE_NAME(HWLOC_SYM_PREFIX, hwloc_ ## name)
+#define HWLOC_NAME_CAPS(name) HWLOC_MUNGE_NAME(HWLOC_SYM_PREFIX_CAPS, hwloc_ ## name)
+
+/* Now define all the "real" names to be the prefixed names.  This
+   allows us to use the real names throughout the code base (i.e.,
+   "hwloc_<foo>"); the preprocessor will adjust to have the prefixed
+   name under the covers. */
+
+/* Names from hwloc.h */
+
+#define hwloc_get_api_version HWLOC_NAME(get_api_version)
+
+#define hwloc_topology HWLOC_NAME(topology)
+#define hwloc_topology_t HWLOC_NAME(topology_t)
+
+#define hwloc_cpuset_t HWLOC_NAME(cpuset_t)
+#define hwloc_const_cpuset_t HWLOC_NAME(const_cpuset_t)
+#define hwloc_nodeset_t HWLOC_NAME(nodeset_t)
+#define hwloc_const_nodeset_t HWLOC_NAME(const_nodeset_t)
+
+#define HWLOC_OBJ_SYSTEM HWLOC_NAME_CAPS(OBJ_SYSTEM)
+#define HWLOC_OBJ_MACHINE HWLOC_NAME_CAPS(OBJ_MACHINE)
+#define HWLOC_OBJ_NUMANODE HWLOC_NAME_CAPS(OBJ_NUMANODE)
+#define HWLOC_OBJ_PACKAGE HWLOC_NAME_CAPS(OBJ_PACKAGE)
+#define HWLOC_OBJ_CACHE HWLOC_NAME_CAPS(OBJ_CACHE)
+#define HWLOC_OBJ_CORE HWLOC_NAME_CAPS(OBJ_CORE)
+#define HWLOC_OBJ_PU HWLOC_NAME_CAPS(OBJ_PU)
+#define HWLOC_OBJ_MISC HWLOC_NAME_CAPS(OBJ_MISC)
+#define HWLOC_OBJ_GROUP HWLOC_NAME_CAPS(OBJ_GROUP)
+#define HWLOC_OBJ_BRIDGE HWLOC_NAME_CAPS(OBJ_BRIDGE)
+#define HWLOC_OBJ_PCI_DEVICE HWLOC_NAME_CAPS(OBJ_PCI_DEVICE)
+#define HWLOC_OBJ_OS_DEVICE HWLOC_NAME_CAPS(OBJ_OS_DEVICE)
+#define HWLOC_OBJ_TYPE_MAX HWLOC_NAME_CAPS(OBJ_TYPE_MAX)
+#define hwloc_obj_type_t HWLOC_NAME(obj_type_t)
+
+#define hwloc_obj_cache_type_e HWLOC_NAME(obj_cache_type_e)
+#define hwloc_obj_cache_type_t HWLOC_NAME(obj_cache_type_t)
+#define HWLOC_OBJ_CACHE_UNIFIED HWLOC_NAME_CAPS(OBJ_CACHE_UNIFIED)
+#define HWLOC_OBJ_CACHE_DATA HWLOC_NAME_CAPS(OBJ_CACHE_DATA)
+#define HWLOC_OBJ_CACHE_INSTRUCTION HWLOC_NAME_CAPS(OBJ_CACHE_INSTRUCTION)
+
+#define hwloc_obj_bridge_type_e HWLOC_NAME(obj_bridge_type_e)
+#define hwloc_obj_bridge_type_t HWLOC_NAME(obj_bridge_type_t)
+#define HWLOC_OBJ_BRIDGE_HOST HWLOC_NAME_CAPS(OBJ_BRIDGE_HOST)
+#define HWLOC_OBJ_BRIDGE_PCI HWLOC_NAME_CAPS(OBJ_BRIDGE_PCI)
+
+#define hwloc_obj_osdev_type_e HWLOC_NAME(obj_osdev_type_e)
+#define hwloc_obj_osdev_type_t HWLOC_NAME(obj_osdev_type_t)
+#define HWLOC_OBJ_OSDEV_BLOCK HWLOC_NAME_CAPS(OBJ_OSDEV_BLOCK)
+#define HWLOC_OBJ_OSDEV_GPU HWLOC_NAME_CAPS(OBJ_OSDEV_GPU)
+#define HWLOC_OBJ_OSDEV_NETWORK HWLOC_NAME_CAPS(OBJ_OSDEV_NETWORK)
+#define HWLOC_OBJ_OSDEV_OPENFABRICS HWLOC_NAME_CAPS(OBJ_OSDEV_OPENFABRICS)
+#define HWLOC_OBJ_OSDEV_DMA HWLOC_NAME_CAPS(OBJ_OSDEV_DMA)
+#define HWLOC_OBJ_OSDEV_COPROC HWLOC_NAME_CAPS(OBJ_OSDEV_COPROC)
+
+#define hwloc_compare_types HWLOC_NAME(compare_types)
+
+#define hwloc_compare_types_e HWLOC_NAME(compare_types_e)
+#define HWLOC_TYPE_UNORDERED HWLOC_NAME_CAPS(TYPE_UNORDERED)
+
+#define hwloc_obj_memory_s HWLOC_NAME(obj_memory_s)
+#define hwloc_obj_memory_page_type_s HWLOC_NAME(obj_memory_page_type_s)
+
+#define hwloc_obj HWLOC_NAME(obj)
+#define hwloc_obj_t HWLOC_NAME(obj_t)
+
+#define hwloc_distances_s HWLOC_NAME(distances_s)
+#define hwloc_obj_info_s HWLOC_NAME(obj_info_s)
+
+#define hwloc_obj_attr_u HWLOC_NAME(obj_attr_u)
+#define hwloc_cache_attr_s HWLOC_NAME(cache_attr_s)
+#define hwloc_group_attr_s HWLOC_NAME(group_attr_s)
+#define hwloc_pcidev_attr_s HWLOC_NAME(pcidev_attr_s)
+#define hwloc_bridge_attr_s HWLOC_NAME(bridge_attr_s)
+#define hwloc_osdev_attr_s HWLOC_NAME(osdev_attr_s)
+
+#define hwloc_topology_init HWLOC_NAME(topology_init)
+#define hwloc_topology_load HWLOC_NAME(topology_load)
+#define hwloc_topology_destroy HWLOC_NAME(topology_destroy)
+#define hwloc_topology_dup HWLOC_NAME(topology_dup)
+#define hwloc_topology_check HWLOC_NAME(topology_check)
+
+#define hwloc_topology_flags_e HWLOC_NAME(topology_flags_e)
+
+#define HWLOC_TOPOLOGY_FLAG_WHOLE_SYSTEM HWLOC_NAME_CAPS(TOPOLOGY_FLAG_WHOLE_SYSTEM)
+#define HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM HWLOC_NAME_CAPS(TOPOLOGY_FLAG_IS_THISSYSTEM)
+#define HWLOC_TOPOLOGY_FLAG_IO_DEVICES HWLOC_NAME_CAPS(TOPOLOGY_FLAG_IO_DEVICES)
+#define HWLOC_TOPOLOGY_FLAG_IO_BRIDGES HWLOC_NAME_CAPS(TOPOLOGY_FLAG_IO_BRIDGES)
+#define HWLOC_TOPOLOGY_FLAG_WHOLE_IO HWLOC_NAME_CAPS(TOPOLOGY_FLAG_WHOLE_IO)
+#define HWLOC_TOPOLOGY_FLAG_ICACHES HWLOC_NAME_CAPS(TOPOLOGY_FLAG_ICACHES)
+
+#define hwloc_topology_set_pid HWLOC_NAME(topology_set_pid)
+#define hwloc_topology_set_synthetic HWLOC_NAME(topology_set_synthetic)
+#define hwloc_topology_set_xml HWLOC_NAME(topology_set_xml)
+#define hwloc_topology_set_xmlbuffer HWLOC_NAME(topology_set_xmlbuffer)
+
+#define hwloc_topology_set_flags HWLOC_NAME(topology_set_flags)
+#define hwloc_topology_is_thissystem HWLOC_NAME(topology_is_thissystem)
+#define hwloc_topology_get_flags HWLOC_NAME(topology_get_flags)
+#define hwloc_topology_discovery_support HWLOC_NAME(topology_discovery_support)
+#define hwloc_topology_cpubind_support HWLOC_NAME(topology_cpubind_support)
+#define hwloc_topology_membind_support HWLOC_NAME(topology_membind_support)
+#define hwloc_topology_support HWLOC_NAME(topology_support)
+#define hwloc_topology_get_support HWLOC_NAME(topology_get_support)
+#define hwloc_topology_ignore_type HWLOC_NAME(topology_ignore_type)
+#define hwloc_topology_ignore_type_keep_structure HWLOC_NAME(topology_ignore_type_keep_structure)
+#define hwloc_topology_ignore_all_keep_structure HWLOC_NAME(topology_ignore_all_keep_structure)
+#define hwloc_topology_set_distance_matrix HWLOC_NAME(topology_set_distance_matrix)
+#define hwloc_topology_set_userdata HWLOC_NAME(topology_set_userdata)
+#define hwloc_topology_get_userdata HWLOC_NAME(topology_get_userdata)
+
+#define hwloc_restrict_flags_e HWLOC_NAME(restrict_flags_e)
+#define HWLOC_RESTRICT_FLAG_ADAPT_DISTANCES HWLOC_NAME_CAPS(RESTRICT_FLAG_ADAPT_DISTANCES)
+#define HWLOC_RESTRICT_FLAG_ADAPT_MISC HWLOC_NAME_CAPS(RESTRICT_FLAG_ADAPT_MISC)
+#define HWLOC_RESTRICT_FLAG_ADAPT_IO HWLOC_NAME_CAPS(RESTRICT_FLAG_ADAPT_IO)
+#define hwloc_topology_restrict HWLOC_NAME(topology_restrict)
+
+#define hwloc_topology_insert_misc_object HWLOC_NAME(topology_insert_misc_object)
+#define hwloc_topology_alloc_group_object HWLOC_NAME(topology_alloc_group_object)
+#define hwloc_topology_insert_group_object HWLOC_NAME(topology_insert_group_object)
+#define hwloc_obj_add_other_obj_sets HWLOC_NAME(obj_add_other_obj_sets)
+
+#define hwloc_topology_get_depth HWLOC_NAME(topology_get_depth)
+#define hwloc_get_type_depth HWLOC_NAME(get_type_depth)
+
+#define hwloc_get_type_depth_e HWLOC_NAME(get_type_depth_e)
+#define HWLOC_TYPE_DEPTH_UNKNOWN HWLOC_NAME_CAPS(TYPE_DEPTH_UNKNOWN)
+#define HWLOC_TYPE_DEPTH_MULTIPLE HWLOC_NAME_CAPS(TYPE_DEPTH_MULTIPLE)
+#define HWLOC_TYPE_DEPTH_BRIDGE HWLOC_NAME_CAPS(TYPE_DEPTH_BRIDGE)
+#define HWLOC_TYPE_DEPTH_PCI_DEVICE HWLOC_NAME_CAPS(TYPE_DEPTH_PCI_DEVICE)
+#define HWLOC_TYPE_DEPTH_OS_DEVICE HWLOC_NAME_CAPS(TYPE_DEPTH_OS_DEVICE)
+#define HWLOC_TYPE_DEPTH_MISC HWLOC_NAME_CAPS(TYPE_DEPTH_MISC)
+
+#define hwloc_get_depth_type HWLOC_NAME(get_depth_type)
+#define hwloc_get_nbobjs_by_depth HWLOC_NAME(get_nbobjs_by_depth)
+#define hwloc_get_nbobjs_by_type HWLOC_NAME(get_nbobjs_by_type)
+
+#define hwloc_get_obj_by_depth HWLOC_NAME(get_obj_by_depth )
+#define hwloc_get_obj_by_type HWLOC_NAME(get_obj_by_type )
+
+#define hwloc_obj_type_string HWLOC_NAME(obj_type_string )
+#define hwloc_obj_type_sscanf HWLOC_NAME(obj_type_sscanf)
+#define hwloc_obj_type_snprintf HWLOC_NAME(obj_type_snprintf )
+#define hwloc_obj_attr_snprintf HWLOC_NAME(obj_attr_snprintf )
+#define hwloc_obj_cpuset_snprintf HWLOC_NAME(obj_cpuset_snprintf)
+#define hwloc_obj_get_info_by_name HWLOC_NAME(obj_get_info_by_name)
+#define hwloc_obj_add_info HWLOC_NAME(obj_add_info)
+
+#define HWLOC_CPUBIND_PROCESS HWLOC_NAME_CAPS(CPUBIND_PROCESS)
+#define HWLOC_CPUBIND_THREAD HWLOC_NAME_CAPS(CPUBIND_THREAD)
+#define HWLOC_CPUBIND_STRICT HWLOC_NAME_CAPS(CPUBIND_STRICT)
+#define HWLOC_CPUBIND_NOMEMBIND HWLOC_NAME_CAPS(CPUBIND_NOMEMBIND)
+
+#define hwloc_cpubind_flags_t HWLOC_NAME(cpubind_flags_t)
+
+#define hwloc_set_cpubind HWLOC_NAME(set_cpubind)
+#define hwloc_get_cpubind HWLOC_NAME(get_cpubind)
+#define hwloc_set_proc_cpubind HWLOC_NAME(set_proc_cpubind)
+#define hwloc_get_proc_cpubind HWLOC_NAME(get_proc_cpubind)
+#define hwloc_set_thread_cpubind HWLOC_NAME(set_thread_cpubind)
+#define hwloc_get_thread_cpubind HWLOC_NAME(get_thread_cpubind)
+
+#define hwloc_get_last_cpu_location HWLOC_NAME(get_last_cpu_location)
+#define hwloc_get_proc_last_cpu_location HWLOC_NAME(get_proc_last_cpu_location)
+
+#define HWLOC_MEMBIND_DEFAULT HWLOC_NAME_CAPS(MEMBIND_DEFAULT)
+#define HWLOC_MEMBIND_FIRSTTOUCH HWLOC_NAME_CAPS(MEMBIND_FIRSTTOUCH)
+#define HWLOC_MEMBIND_BIND HWLOC_NAME_CAPS(MEMBIND_BIND)
+#define HWLOC_MEMBIND_INTERLEAVE HWLOC_NAME_CAPS(MEMBIND_INTERLEAVE)
+#define HWLOC_MEMBIND_REPLICATE HWLOC_NAME_CAPS(MEMBIND_REPLICATE)
+#define HWLOC_MEMBIND_NEXTTOUCH HWLOC_NAME_CAPS(MEMBIND_NEXTTOUCH)
+#define HWLOC_MEMBIND_MIXED HWLOC_NAME_CAPS(MEMBIND_MIXED)
+
+#define hwloc_membind_policy_t HWLOC_NAME(membind_policy_t)
+
+#define HWLOC_MEMBIND_PROCESS HWLOC_NAME_CAPS(MEMBIND_PROCESS)
+#define HWLOC_MEMBIND_THREAD HWLOC_NAME_CAPS(MEMBIND_THREAD)
+#define HWLOC_MEMBIND_STRICT HWLOC_NAME_CAPS(MEMBIND_STRICT)
+#define HWLOC_MEMBIND_MIGRATE HWLOC_NAME_CAPS(MEMBIND_MIGRATE)
+#define HWLOC_MEMBIND_NOCPUBIND HWLOC_NAME_CAPS(MEMBIND_NOCPUBIND)
+
+#define hwloc_membind_flags_t HWLOC_NAME(membind_flags_t)
+
+#define hwloc_set_membind_nodeset HWLOC_NAME(set_membind_nodeset)
+#define hwloc_set_membind HWLOC_NAME(set_membind)
+#define hwloc_get_membind_nodeset HWLOC_NAME(get_membind_nodeset)
+#define hwloc_get_membind HWLOC_NAME(get_membind)
+#define hwloc_set_proc_membind_nodeset HWLOC_NAME(set_proc_membind_nodeset)
+#define hwloc_set_proc_membind HWLOC_NAME(set_proc_membind)
+#define hwloc_get_proc_membind_nodeset HWLOC_NAME(get_proc_membind_nodeset)
+#define hwloc_get_proc_membind HWLOC_NAME(get_proc_membind)
+#define hwloc_set_area_membind_nodeset HWLOC_NAME(set_area_membind_nodeset)
+#define hwloc_set_area_membind HWLOC_NAME(set_area_membind)
+#define hwloc_get_area_membind_nodeset HWLOC_NAME(get_area_membind_nodeset)
+#define hwloc_get_area_membind HWLOC_NAME(get_area_membind)
+#define hwloc_alloc_membind_nodeset HWLOC_NAME(alloc_membind_nodeset)
+#define hwloc_alloc_membind HWLOC_NAME(alloc_membind)
+#define hwloc_alloc HWLOC_NAME(alloc)
+#define hwloc_free HWLOC_NAME(free)
+
+#define hwloc_get_non_io_ancestor_obj HWLOC_NAME(get_non_io_ancestor_obj)
+#define hwloc_get_next_pcidev HWLOC_NAME(get_next_pcidev)
+#define hwloc_get_pcidev_by_busid HWLOC_NAME(get_pcidev_by_busid)
+#define hwloc_get_pcidev_by_busidstring HWLOC_NAME(get_pcidev_by_busidstring)
+#define hwloc_get_next_osdev HWLOC_NAME(get_next_osdev)
+#define hwloc_get_next_bridge HWLOC_NAME(get_next_bridge)
+#define hwloc_bridge_covers_pcibus HWLOC_NAME(bridge_covers_pcibus)
+#define hwloc_get_hostbridge_by_pcibus HWLOC_NAME(get_hostbridge_by_pcibus)
+
+/* hwloc/bitmap.h */
+
+#define hwloc_bitmap_s HWLOC_NAME(bitmap_s)
+#define hwloc_bitmap_t HWLOC_NAME(bitmap_t)
+#define hwloc_const_bitmap_t HWLOC_NAME(const_bitmap_t)
+
+#define hwloc_bitmap_alloc HWLOC_NAME(bitmap_alloc)
+#define hwloc_bitmap_alloc_full HWLOC_NAME(bitmap_alloc_full)
+#define hwloc_bitmap_free HWLOC_NAME(bitmap_free)
+#define hwloc_bitmap_dup HWLOC_NAME(bitmap_dup)
+#define hwloc_bitmap_copy HWLOC_NAME(bitmap_copy)
+#define hwloc_bitmap_snprintf HWLOC_NAME(bitmap_snprintf)
+#define hwloc_bitmap_asprintf HWLOC_NAME(bitmap_asprintf)
+#define hwloc_bitmap_sscanf HWLOC_NAME(bitmap_sscanf)
+#define hwloc_bitmap_list_snprintf HWLOC_NAME(bitmap_list_snprintf)
+#define hwloc_bitmap_list_asprintf HWLOC_NAME(bitmap_list_asprintf)
+#define hwloc_bitmap_list_sscanf HWLOC_NAME(bitmap_list_sscanf)
+#define hwloc_bitmap_taskset_snprintf HWLOC_NAME(bitmap_taskset_snprintf)
+#define hwloc_bitmap_taskset_asprintf HWLOC_NAME(bitmap_taskset_asprintf)
+#define hwloc_bitmap_taskset_sscanf HWLOC_NAME(bitmap_taskset_sscanf)
+#define hwloc_bitmap_zero HWLOC_NAME(bitmap_zero)
+#define hwloc_bitmap_fill HWLOC_NAME(bitmap_fill)
+#define hwloc_bitmap_from_ulong HWLOC_NAME(bitmap_from_ulong)
+
+#define hwloc_bitmap_from_ith_ulong HWLOC_NAME(bitmap_from_ith_ulong)
+#define hwloc_bitmap_to_ulong HWLOC_NAME(bitmap_to_ulong)
+#define hwloc_bitmap_to_ith_ulong HWLOC_NAME(bitmap_to_ith_ulong)
+#define hwloc_bitmap_only HWLOC_NAME(bitmap_only)
+#define hwloc_bitmap_allbut HWLOC_NAME(bitmap_allbut)
+#define hwloc_bitmap_set HWLOC_NAME(bitmap_set)
+#define hwloc_bitmap_set_range HWLOC_NAME(bitmap_set_range)
+#define hwloc_bitmap_set_ith_ulong HWLOC_NAME(bitmap_set_ith_ulong)
+#define hwloc_bitmap_clr HWLOC_NAME(bitmap_clr)
+#define hwloc_bitmap_clr_range HWLOC_NAME(bitmap_clr_range)
+#define hwloc_bitmap_isset HWLOC_NAME(bitmap_isset)
+#define hwloc_bitmap_iszero HWLOC_NAME(bitmap_iszero)
+#define hwloc_bitmap_isfull HWLOC_NAME(bitmap_isfull)
+#define hwloc_bitmap_isequal HWLOC_NAME(bitmap_isequal)
+#define hwloc_bitmap_intersects HWLOC_NAME(bitmap_intersects)
+#define hwloc_bitmap_isincluded HWLOC_NAME(bitmap_isincluded)
+#define hwloc_bitmap_or HWLOC_NAME(bitmap_or)
+#define hwloc_bitmap_and HWLOC_NAME(bitmap_and)
+#define hwloc_bitmap_andnot HWLOC_NAME(bitmap_andnot)
+#define hwloc_bitmap_xor HWLOC_NAME(bitmap_xor)
+#define hwloc_bitmap_not HWLOC_NAME(bitmap_not)
+#define hwloc_bitmap_first HWLOC_NAME(bitmap_first)
+#define hwloc_bitmap_last HWLOC_NAME(bitmap_last)
+#define hwloc_bitmap_next HWLOC_NAME(bitmap_next)
+#define hwloc_bitmap_singlify HWLOC_NAME(bitmap_singlify)
+#define hwloc_bitmap_compare_first HWLOC_NAME(bitmap_compare_first)
+#define hwloc_bitmap_compare HWLOC_NAME(bitmap_compare)
+#define hwloc_bitmap_weight HWLOC_NAME(bitmap_weight)
+
+/* hwloc/helper.h */
+
+#define hwloc_get_type_or_below_depth HWLOC_NAME(get_type_or_below_depth)
+#define hwloc_get_type_or_above_depth HWLOC_NAME(get_type_or_above_depth)
+#define hwloc_get_root_obj HWLOC_NAME(get_root_obj)
+#define hwloc_get_ancestor_obj_by_depth HWLOC_NAME(get_ancestor_obj_by_depth)
+#define hwloc_get_ancestor_obj_by_type HWLOC_NAME(get_ancestor_obj_by_type)
+#define hwloc_get_next_obj_by_depth HWLOC_NAME(get_next_obj_by_depth)
+#define hwloc_get_next_obj_by_type HWLOC_NAME(get_next_obj_by_type)
+#define hwloc_get_pu_obj_by_os_index HWLOC_NAME(get_pu_obj_by_os_index)
+#define hwloc_get_numanode_obj_by_os_index HWLOC_NAME(get_numanode_obj_by_os_index)
+#define hwloc_get_next_child HWLOC_NAME(get_next_child)
+#define hwloc_get_common_ancestor_obj HWLOC_NAME(get_common_ancestor_obj)
+#define hwloc_obj_is_in_subtree HWLOC_NAME(obj_is_in_subtree)
+#define hwloc_get_first_largest_obj_inside_cpuset HWLOC_NAME(get_first_largest_obj_inside_cpuset)
+#define hwloc_get_largest_objs_inside_cpuset HWLOC_NAME(get_largest_objs_inside_cpuset)
+#define hwloc_get_next_obj_inside_cpuset_by_depth HWLOC_NAME(get_next_obj_inside_cpuset_by_depth)
+#define hwloc_get_next_obj_inside_cpuset_by_type HWLOC_NAME(get_next_obj_inside_cpuset_by_type)
+#define hwloc_get_obj_inside_cpuset_by_depth HWLOC_NAME(get_obj_inside_cpuset_by_depth)
+#define hwloc_get_obj_inside_cpuset_by_type HWLOC_NAME(get_obj_inside_cpuset_by_type)
+#define hwloc_get_nbobjs_inside_cpuset_by_depth HWLOC_NAME(get_nbobjs_inside_cpuset_by_depth)
+#define hwloc_get_nbobjs_inside_cpuset_by_type HWLOC_NAME(get_nbobjs_inside_cpuset_by_type)
+#define hwloc_get_obj_index_inside_cpuset HWLOC_NAME(get_obj_index_inside_cpuset)
+#define hwloc_get_child_covering_cpuset HWLOC_NAME(get_child_covering_cpuset)
+#define hwloc_get_obj_covering_cpuset HWLOC_NAME(get_obj_covering_cpuset)
+#define hwloc_get_next_obj_covering_cpuset_by_depth HWLOC_NAME(get_next_obj_covering_cpuset_by_depth)
+#define hwloc_get_next_obj_covering_cpuset_by_type HWLOC_NAME(get_next_obj_covering_cpuset_by_type)
+#define hwloc_get_cache_type_depth HWLOC_NAME(get_cache_type_depth)
+#define hwloc_get_cache_covering_cpuset HWLOC_NAME(get_cache_covering_cpuset)
+#define hwloc_get_shared_cache_covering_obj HWLOC_NAME(get_shared_cache_covering_obj)
+#define hwloc_get_closest_objs HWLOC_NAME(get_closest_objs)
+#define hwloc_get_obj_below_by_type HWLOC_NAME(get_obj_below_by_type)
+#define hwloc_get_obj_below_array_by_type HWLOC_NAME(get_obj_below_array_by_type)
+#define hwloc_distrib_flags_e HWLOC_NAME(distrib_flags_e)
+#define HWLOC_DISTRIB_FLAG_REVERSE HWLOC_NAME_CAPS(DISTRIB_FLAG_REVERSE)
+#define hwloc_distrib HWLOC_NAME(distrib)
+#define hwloc_alloc_membind_policy HWLOC_NAME(alloc_membind_policy)
+#define hwloc_alloc_membind_policy_nodeset HWLOC_NAME(alloc_membind_policy_nodeset)
+#define hwloc_topology_get_complete_cpuset HWLOC_NAME(topology_get_complete_cpuset)
+#define hwloc_topology_get_topology_cpuset HWLOC_NAME(topology_get_topology_cpuset)
+#define hwloc_topology_get_allowed_cpuset HWLOC_NAME(topology_get_allowed_cpuset)
+#define hwloc_topology_get_complete_nodeset HWLOC_NAME(topology_get_complete_nodeset)
+#define hwloc_topology_get_topology_nodeset HWLOC_NAME(topology_get_topology_nodeset)
+#define hwloc_topology_get_allowed_nodeset HWLOC_NAME(topology_get_allowed_nodeset)
+#define hwloc_cpuset_to_nodeset HWLOC_NAME(cpuset_to_nodeset)
+#define hwloc_cpuset_to_nodeset_strict HWLOC_NAME(cpuset_to_nodeset_strict)
+#define hwloc_cpuset_from_nodeset HWLOC_NAME(cpuset_from_nodeset)
+#define hwloc_cpuset_from_nodeset_strict HWLOC_NAME(cpuset_from_nodeset_strict)
+#define hwloc_get_whole_distance_matrix_by_depth HWLOC_NAME(get_whole_distance_matrix_by_depth)
+#define hwloc_get_whole_distance_matrix_by_type HWLOC_NAME(get_whole_distance_matrix_by_type)
+#define hwloc_get_distance_matrix_covering_obj_by_depth HWLOC_NAME(get_distance_matrix_covering_obj_by_depth)
+#define hwloc_get_latency HWLOC_NAME(get_latency)
+
+/* export.h */
+
+#define hwloc_topology_export_xml HWLOC_NAME(topology_export_xml)
+#define hwloc_topology_export_xmlbuffer HWLOC_NAME(topology_export_xmlbuffer)
+#define hwloc_free_xmlbuffer HWLOC_NAME(free_xmlbuffer)
+#define hwloc_topology_set_userdata_export_callback HWLOC_NAME(topology_set_userdata_export_callback)
+#define hwloc_export_obj_userdata HWLOC_NAME(export_obj_userdata)
+#define hwloc_export_obj_userdata_base64 HWLOC_NAME(export_obj_userdata_base64)
+#define hwloc_topology_set_userdata_import_callback HWLOC_NAME(topology_set_userdata_import_callback)
+
+#define hwloc_topology_export_synthetic_flags_e HWLOC_NAME(topology_export_synthetic_flags_e)
+#define HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_EXTENDED_TYPES HWLOC_NAME_CAPS(TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_EXTENDED_TYPES)
+#define HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_ATTRS HWLOC_NAME_CAPS(TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_ATTRS)
+#define hwloc_topology_export_synthetic HWLOC_NAME(topology_export_synthetic)
+
+/* diff.h */
+
+#define hwloc_topology_diff_obj_attr_type_e HWLOC_NAME(topology_diff_obj_attr_type_e)
+#define hwloc_topology_diff_obj_attr_type_t HWLOC_NAME(topology_diff_obj_attr_type_t)
+#define HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_SIZE HWLOC_NAME_CAPS(TOPOLOGY_DIFF_OBJ_ATTR_SIZE)
+#define HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_NAME HWLOC_NAME_CAPS(TOPOLOGY_DIFF_OBJ_ATTR_NAME)
+#define HWLOC_TOPOLOGY_DIFF_OBJ_ATTR_INFO HWLOC_NAME_CAPS(TOPOLOGY_DIFF_OBJ_ATTR_INFO)
+#define hwloc_topology_diff_obj_attr_u HWLOC_NAME(topology_diff_obj_attr_u)
+#define hwloc_topology_diff_obj_attr_generic_s HWLOC_NAME(topology_diff_obj_attr_generic_s)
+#define hwloc_topology_diff_obj_attr_uint64_s HWLOC_NAME(topology_diff_obj_attr_uint64_s)
+#define hwloc_topology_diff_obj_attr_string_s HWLOC_NAME(topology_diff_obj_attr_string_s)
+#define hwloc_topology_diff_type_e HWLOC_NAME(topology_diff_type_e)
+#define hwloc_topology_diff_type_t HWLOC_NAME(topology_diff_type_t)
+#define HWLOC_TOPOLOGY_DIFF_OBJ_ATTR HWLOC_NAME_CAPS(TOPOLOGY_DIFF_OBJ_ATTR)
+#define HWLOC_TOPOLOGY_DIFF_TOO_COMPLEX HWLOC_NAME_CAPS(TOPOLOGY_DIFF_TOO_COMPLEX)
+#define hwloc_topology_diff_u HWLOC_NAME(topology_diff_u)
+#define hwloc_topology_diff_t HWLOC_NAME(topology_diff_t)
+#define hwloc_topology_diff_generic_s HWLOC_NAME(topology_diff_generic_s)
+#define hwloc_topology_diff_obj_attr_s HWLOC_NAME(topology_diff_obj_attr_s)
+#define hwloc_topology_diff_too_complex_s HWLOC_NAME(topology_diff_too_complex_s)
+#define hwloc_topology_diff_build HWLOC_NAME(topology_diff_build)
+#define hwloc_topology_diff_apply_flags_e HWLOC_NAME(topology_diff_apply_flags_e)
+#define HWLOC_TOPOLOGY_DIFF_APPLY_REVERSE HWLOC_NAME_CAPS(TOPOLOGY_DIFF_APPLY_REVERSE)
+#define hwloc_topology_diff_apply HWLOC_NAME(topology_diff_apply)
+#define hwloc_topology_diff_destroy HWLOC_NAME(topology_diff_destroy)
+#define hwloc_topology_diff_load_xml HWLOC_NAME(topology_diff_load_xml)
+#define hwloc_topology_diff_export_xml HWLOC_NAME(topology_diff_export_xml)
+#define hwloc_topology_diff_load_xmlbuffer HWLOC_NAME(topology_diff_load_xmlbuffer)
+#define hwloc_topology_diff_export_xmlbuffer HWLOC_NAME(topology_diff_export_xmlbuffer)
+
+/* glibc-sched.h */
+
+#define hwloc_cpuset_to_glibc_sched_affinity HWLOC_NAME(cpuset_to_glibc_sched_affinity)
+#define hwloc_cpuset_from_glibc_sched_affinity HWLOC_NAME(cpuset_from_glibc_sched_affinity)
+
+/* linux-libnuma.h */
+
+#define hwloc_cpuset_to_linux_libnuma_ulongs HWLOC_NAME(cpuset_to_linux_libnuma_ulongs)
+#define hwloc_nodeset_to_linux_libnuma_ulongs HWLOC_NAME(nodeset_to_linux_libnuma_ulongs)
+#define hwloc_cpuset_from_linux_libnuma_ulongs HWLOC_NAME(cpuset_from_linux_libnuma_ulongs)
+#define hwloc_nodeset_from_linux_libnuma_ulongs HWLOC_NAME(nodeset_from_linux_libnuma_ulongs)
+#define hwloc_cpuset_to_linux_libnuma_bitmask HWLOC_NAME(cpuset_to_linux_libnuma_bitmask)
+#define hwloc_nodeset_to_linux_libnuma_bitmask HWLOC_NAME(nodeset_to_linux_libnuma_bitmask)
+#define hwloc_cpuset_from_linux_libnuma_bitmask HWLOC_NAME(cpuset_from_linux_libnuma_bitmask)
+#define hwloc_nodeset_from_linux_libnuma_bitmask HWLOC_NAME(nodeset_from_linux_libnuma_bitmask)
+
+/* linux.h */
+
+#define hwloc_linux_parse_cpumap_file HWLOC_NAME(linux_parse_cpumap_file)
+#define hwloc_linux_set_tid_cpubind HWLOC_NAME(linux_set_tid_cpubind)
+#define hwloc_linux_get_tid_cpubind HWLOC_NAME(linux_get_tid_cpubind)
+#define hwloc_linux_get_tid_last_cpu_location HWLOC_NAME(linux_get_tid_last_cpu_location)
+
+/* openfabrics-verbs.h */
+
+#define hwloc_ibv_get_device_cpuset HWLOC_NAME(ibv_get_device_cpuset)
+#define hwloc_ibv_get_device_osdev HWLOC_NAME(ibv_get_device_osdev)
+#define hwloc_ibv_get_device_osdev_by_name HWLOC_NAME(ibv_get_device_osdev_by_name)
+
+/* myriexpress.h */
+
+#define hwloc_mx_board_get_device_cpuset HWLOC_NAME(mx_board_get_device_cpuset)
+#define hwloc_mx_endpoint_get_device_cpuset HWLOC_NAME(mx_endpoint_get_device_cpuset)
+
+/* intel-mic.h */
+
+#define hwloc_intel_mic_get_device_cpuset HWLOC_NAME(intel_mic_get_device_cpuset)
+#define hwloc_intel_mic_get_device_osdev_by_index HWLOC_NAME(intel_mic_get_device_osdev_by_index)
+
+/* opencl.h */
+
+#define hwloc_opencl_get_device_cpuset HWLOC_NAME(opencl_get_device_cpuset)
+#define hwloc_opencl_get_device_osdev HWLOC_NAME(opencl_get_device_osdev)
+#define hwloc_opencl_get_device_osdev_by_index HWLOC_NAME(opencl_get_device_osdev_by_index)
+
+/* cuda.h */
+
+#define hwloc_cuda_get_device_pci_ids HWLOC_NAME(cuda_get_device_pci_ids)
+#define hwloc_cuda_get_device_cpuset HWLOC_NAME(cuda_get_device_cpuset)
+#define hwloc_cuda_get_device_pcidev HWLOC_NAME(cuda_get_device_pcidev)
+#define hwloc_cuda_get_device_osdev HWLOC_NAME(cuda_get_device_osdev)
+#define hwloc_cuda_get_device_osdev_by_index HWLOC_NAME(cuda_get_device_osdev_by_index)
+
+/* cudart.h */
+
+#define hwloc_cudart_get_device_pci_ids HWLOC_NAME(cudart_get_device_pci_ids)
+#define hwloc_cudart_get_device_cpuset HWLOC_NAME(cudart_get_device_cpuset)
+#define hwloc_cudart_get_device_pcidev HWLOC_NAME(cudart_get_device_pcidev)
+#define hwloc_cudart_get_device_osdev_by_index HWLOC_NAME(cudart_get_device_osdev_by_index)
+
+/* nvml.h */
+
+#define hwloc_nvml_get_device_cpuset HWLOC_NAME(nvml_get_device_cpuset)
+#define hwloc_nvml_get_device_osdev HWLOC_NAME(nvml_get_device_osdev)
+#define hwloc_nvml_get_device_osdev_by_index HWLOC_NAME(nvml_get_device_osdev_by_index)
+
+/* gl.h */
+
+#define hwloc_gl_get_display_osdev_by_port_device HWLOC_NAME(gl_get_display_osdev_by_port_device)
+#define hwloc_gl_get_display_osdev_by_name HWLOC_NAME(gl_get_display_osdev_by_name)
+#define hwloc_gl_get_display_by_osdev HWLOC_NAME(gl_get_display_by_osdev)
+
+/* hwloc/plugins.h */
+
+#define hwloc_disc_component_type_e HWLOC_NAME(disc_component_type_e)
+#define HWLOC_DISC_COMPONENT_TYPE_CPU HWLOC_NAME_CAPS(DISC_COMPONENT_TYPE_CPU)
+#define HWLOC_DISC_COMPONENT_TYPE_GLOBAL HWLOC_NAME_CAPS(DISC_COMPONENT_TYPE_GLOBAL)
+#define HWLOC_DISC_COMPONENT_TYPE_MISC HWLOC_NAME_CAPS(DISC_COMPONENT_TYPE_MISC)
+#define hwloc_disc_component_type_t HWLOC_NAME(disc_component_type_t)
+#define hwloc_disc_component HWLOC_NAME(disc_component)
+
+#define hwloc_backend HWLOC_NAME(backend)
+#define hwloc_backend_flag_e HWLOC_NAME(backend_flag_e)
+#define HWLOC_BACKEND_FLAG_NEED_LEVELS HWLOC_NAME_CAPS(BACKEND_FLAG_NEED_LEVELS)
+
+#define hwloc_backend_alloc HWLOC_NAME(backend_alloc)
+#define hwloc_backend_enable HWLOC_NAME(backend_enable)
+#define hwloc_backends_get_obj_cpuset HWLOC_NAME(backends_get_obj_cpuset)
+#define hwloc_backends_notify_new_object HWLOC_NAME(backends_notify_new_object)
+
+#define hwloc_component_type_e HWLOC_NAME(component_type_e)
+#define HWLOC_COMPONENT_TYPE_DISC HWLOC_NAME_CAPS(COMPONENT_TYPE_DISC)
+#define HWLOC_COMPONENT_TYPE_XML HWLOC_NAME_CAPS(COMPONENT_TYPE_XML)
+#define hwloc_component_type_t HWLOC_NAME(component_type_t)
+#define hwloc_component HWLOC_NAME(component)
+
+#define hwloc_plugin_check_namespace HWLOC_NAME(plugin_check_namespace)
+
+#define hwloc_insert_object_by_cpuset HWLOC_NAME(insert_object_by_cpuset)
+#define hwloc_report_error_t HWLOC_NAME(report_error_t)
+#define hwloc_report_os_error HWLOC_NAME(report_os_error)
+#define hwloc_hide_errors HWLOC_NAME(hide_errors)
+#define hwloc__insert_object_by_cpuset HWLOC_NAME(_insert_object_by_cpuset)
+#define hwloc_insert_object_by_parent HWLOC_NAME(insert_object_by_parent)
+#define hwloc_alloc_setup_object HWLOC_NAME(alloc_setup_object)
+#define hwloc_obj_add_children_sets HWLOC_NAME(add_children_sets)
+
+#define hwloc_insert_pci_device_list HWLOC_NAME(insert_pci_device_list)
+#define hwloc_pci_find_cap HWLOC_NAME(pci_find_cap)
+#define hwloc_pci_find_linkspeed HWLOC_NAME(pci_find_linkspeed)
+#define hwloc_pci_prepare_bridge HWLOC_NAME(pci_prepare_bridge)
+
+/* hwloc/deprecated.h */
+
+#define hwloc_obj_type_of_string HWLOC_NAME(obj_type_of_string)
+#define hwloc_obj_snprintf HWLOC_NAME(obj_snprintf)
+#define hwloc_distributev HWLOC_NAME(distributev)
+#define hwloc_distribute HWLOC_NAME(distribute)
+#define hwloc_topology_insert_misc_object_by_parent HWLOC_NAME(topology_insert_misc_object_by_parent)
+
+/* private/debug.h */
+
+#define hwloc_debug HWLOC_NAME(debug)
+
+/* private/misc.h */
+
+#define hwloc_snprintf HWLOC_NAME(snprintf)
+#define hwloc_namecoloncmp HWLOC_NAME(namecoloncmp)
+#define hwloc_ffsl_manual HWLOC_NAME(ffsl_manual)
+#define hwloc_ffs32 HWLOC_NAME(ffs32)
+#define hwloc_ffsl_from_ffs32 HWLOC_NAME(ffsl_from_ffs32)
+#define hwloc_flsl_manual HWLOC_NAME(flsl_manual)
+#define hwloc_fls32 HWLOC_NAME(fls32)
+#define hwloc_flsl_from_fls32 HWLOC_NAME(flsl_from_fls32)
+#define hwloc_weight_long HWLOC_NAME(weight_long)
+#define hwloc_strncasecmp HWLOC_NAME(strncasecmp)
+
+/* private/cpuid-x86.h */
+
+#define hwloc_have_x86_cpuid HWLOC_NAME(have_x86_cpuid)
+#define hwloc_x86_cpuid HWLOC_NAME(x86_cpuid)
+
+/* private/xml.h */
+
+#define hwloc__xml_verbose HWLOC_NAME(_xml_verbose)
+
+#define hwloc__xml_import_state_s HWLOC_NAME(_xml_import_state_s)
+#define hwloc__xml_import_state_t HWLOC_NAME(_xml_import_state_t)
+#define hwloc__xml_import_diff HWLOC_NAME(_xml_import_diff)
+#define hwloc_xml_backend_data_s HWLOC_NAME(xml_backend_data_s)
+#define hwloc__xml_export_state_s HWLOC_NAME(_xml_export_state_s)
+#define hwloc__xml_export_state_t HWLOC_NAME(_xml_export_state_t)
+#define hwloc__xml_export_object HWLOC_NAME(_xml_export_object)
+#define hwloc__xml_export_diff HWLOC_NAME(_xml_export_diff)
+
+#define hwloc_xml_callbacks HWLOC_NAME(xml_callbacks)
+#define hwloc_xml_component HWLOC_NAME(xml_component)
+#define hwloc_xml_callbacks_register HWLOC_NAME(xml_callbacks_register)
+#define hwloc_xml_callbacks_reset HWLOC_NAME(xml_callbacks_reset)
+
+/* private/components.h */
+
+#define hwloc_disc_component_force_enable HWLOC_NAME(disc_component_force_enable)
+#define hwloc_disc_components_enable_others HWLOC_NAME(disc_components_instantiate_others)
+
+#define hwloc_backends_disable_all HWLOC_NAME(backends_disable_all)
+#define hwloc_backends_is_thissystem HWLOC_NAME(backends_is_thissystem)
+
+#define hwloc_components_init HWLOC_NAME(components_init)
+#define hwloc_components_destroy_all HWLOC_NAME(components_destroy_all)
+
+/* private/private.h */
+
+#define hwloc_ignore_type_e HWLOC_NAME(ignore_type_e)
+
+#define HWLOC_IGNORE_TYPE_NEVER HWLOC_NAME_CAPS(IGNORE_TYPE_NEVER)
+#define HWLOC_IGNORE_TYPE_KEEP_STRUCTURE HWLOC_NAME_CAPS(IGNORE_TYPE_KEEP_STRUCTURE)
+#define HWLOC_IGNORE_TYPE_ALWAYS HWLOC_NAME_CAPS(IGNORE_TYPE_ALWAYS)
+
+#define hwloc_os_distances_s HWLOC_NAME(os_distances_s)
+
+#define hwloc_xml_imported_distances_s HWLOC_NAME(xml_imported_distances_s)
+
+#define hwloc_alloc_obj_cpusets HWLOC_NAME(alloc_obj_cpusets)
+#define hwloc_setup_pu_level HWLOC_NAME(setup_pu_level)
+#define hwloc_get_sysctlbyname HWLOC_NAME(get_sysctlbyname)
+#define hwloc_get_sysctl HWLOC_NAME(get_sysctl)
+#define hwloc_fallback_nbprocessors HWLOC_NAME(fallback_nbprocessors)
+#define hwloc_connect_children HWLOC_NAME(connect_children)
+#define hwloc_connect_levels HWLOC_NAME(connect_levels)
+
+#define hwloc__object_cpusets_compare_first HWLOC_NAME(_object_cpusets_compare_first)
+#define hwloc__reorder_children HWLOC_NAME(_reorder_children)
+
+#define hwloc_topology_setup_defaults HWLOC_NAME(topology_setup_defaults)
+#define hwloc_topology_clear HWLOC_NAME(topology_clear)
+
+#define hwloc__add_info HWLOC_NAME(_add_info)
+#define hwloc__find_info_slot HWLOC_NAME(_find_info_slot)
+#define hwloc__move_infos HWLOC_NAME(_move_infos)
+#define hwloc__free_infos HWLOC_NAME(_free_infos)
+
+#define hwloc_binding_hooks HWLOC_NAME(binding_hooks)
+#define hwloc_set_native_binding_hooks HWLOC_NAME(set_native_binding_hooks)
+#define hwloc_set_binding_hooks HWLOC_NAME(set_binding_hooks)
+
+#define hwloc_set_linuxfs_hooks HWLOC_NAME(set_linuxfs_hooks)
+#define hwloc_set_bgq_hooks HWLOC_NAME(set_bgq_hooks)
+#define hwloc_set_solaris_hooks HWLOC_NAME(set_solaris_hooks)
+#define hwloc_set_aix_hooks HWLOC_NAME(set_aix_hooks)
+#define hwloc_set_osf_hooks HWLOC_NAME(set_osf_hooks)
+#define hwloc_set_windows_hooks HWLOC_NAME(set_windows_hooks)
+#define hwloc_set_darwin_hooks HWLOC_NAME(set_darwin_hooks)
+#define hwloc_set_freebsd_hooks HWLOC_NAME(set_freebsd_hooks)
+#define hwloc_set_netbsd_hooks HWLOC_NAME(set_netbsd_hooks)
+#define hwloc_set_hpux_hooks HWLOC_NAME(set_hpux_hooks)
+
+#define hwloc_add_uname_info HWLOC_NAME(add_uname_info)
+#define hwloc_free_unlinked_object HWLOC_NAME(free_unlinked_object)
+#define hwloc__duplicate_objects HWLOC_NAME(_duplicate_objects)
+
+#define hwloc_alloc_heap HWLOC_NAME(alloc_heap)
+#define hwloc_alloc_mmap HWLOC_NAME(alloc_mmap)
+#define hwloc_free_heap HWLOC_NAME(free_heap)
+#define hwloc_free_mmap HWLOC_NAME(free_mmap)
+#define hwloc_alloc_or_fail HWLOC_NAME(alloc_or_fail)
+
+#define hwloc_distances_init HWLOC_NAME(distances_init)
+#define hwloc_distances_destroy HWLOC_NAME(distances_destroy)
+#define hwloc_distances_set HWLOC_NAME(distances_set)
+#define hwloc_distances_set_from_env HWLOC_NAME(distances_set_from_env)
+#define hwloc_distances_restrict_os HWLOC_NAME(distances_restrict_os)
+#define hwloc_distances_restrict HWLOC_NAME(distances_restrict)
+#define hwloc_distances_finalize_os HWLOC_NAME(distances_finalize_os)
+#define hwloc_distances_finalize_logical HWLOC_NAME(distances_finalize_logical)
+#define hwloc_clear_object_distances HWLOC_NAME(clear_object_distances)
+#define hwloc_clear_object_distances_one HWLOC_NAME(clear_object_distances_one)
+#define hwloc_group_by_distances HWLOC_NAME(group_by_distances)
+
+#define hwloc_encode_to_base64 HWLOC_NAME(encode_to_base64)
+#define hwloc_decode_from_base64 HWLOC_NAME(decode_from_base64)
+
+#define hwloc_obj_add_info_nodup HWLOC_NAME(obj_add_info_nodup)
+
+#define hwloc_progname HWLOC_NAME(progname)
+
+#define hwloc_bitmap_compare_inclusion HWLOC_NAME(bitmap_compare_inclusion)
+
+/* private/solaris-chiptype.h */
+
+#define hwloc_solaris_get_chip_type HWLOC_NAME(solaris_get_chip_type)
+#define hwloc_solaris_get_chip_model HWLOC_NAME(solaris_get_chip_model)
+
+#endif /* HWLOC_SYM_TRANSFORM */
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+
+#endif /* HWLOC_RENAME_H */
diff --git a/ext/hwloc/include/numa.h b/ext/hwloc/include/numa.h
new file mode 100644
index 0000000..1dbc137
--- /dev/null
+++ b/ext/hwloc/include/numa.h
@@ -0,0 +1,468 @@
+/* Copyright (C) 2003,2004 Andi Kleen, SuSE Labs.
+
+   libnuma is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; version
+   2.1.
+
+   libnuma is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should find a copy of v2.1 of the GNU Lesser General Public License
+   somewhere on your Linux system; if not, write to the Free Software
+   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
+
+#ifndef _NUMA_H
+#define _NUMA_H 1
+
+/* allow an application to test for the current programming interface: */
+#define LIBNUMA_API_VERSION 2
+
+/* Simple NUMA policy library */
+
+#include <stddef.h>
+#include <string.h>
+#include <sys/types.h>
+#include <stdlib.h>
+
+#if defined(__x86_64__) || defined(__i386__)
+#define NUMA_NUM_NODES  128
+#else
+#define NUMA_NUM_NODES  2048
+#endif
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct {
+        unsigned long n[NUMA_NUM_NODES/(sizeof(unsigned long)*8)]; /* one bit per node: NUMA_NUM_NODES bits packed into unsigned long words */
+} nodemask_t;
+
+struct bitmask {
+	unsigned long size; /* number of bits in the map */
+	unsigned long *maskp; /* word array holding the bits; the inline wrappers in this header point it at caller-provided storage */
+};
+
+/* operations on struct bitmask */
+int numa_bitmask_isbitset(const struct bitmask *, unsigned int);
+struct bitmask *numa_bitmask_setall(struct bitmask *);
+struct bitmask *numa_bitmask_clearall(struct bitmask *);
+struct bitmask *numa_bitmask_setbit(struct bitmask *, unsigned int);
+struct bitmask *numa_bitmask_clearbit(struct bitmask *, unsigned int);
+unsigned int numa_bitmask_nbytes(struct bitmask *);
+struct bitmask *numa_bitmask_alloc(unsigned int);
+void numa_bitmask_free(struct bitmask *);
+int numa_bitmask_equal(const struct bitmask *, const struct bitmask *);
+void copy_nodemask_to_bitmask(nodemask_t *, struct bitmask *);
+void copy_bitmask_to_nodemask(struct bitmask *, nodemask_t *);
+void copy_bitmask_to_bitmask(struct bitmask *, struct bitmask *);
+
+/* compatibility for codes that used them: */
+
+static inline void nodemask_zero(nodemask_t *mask)
+{
+	/* Describe *mask as a struct bitmask spanning all of its bits and pass
+	   that description to numa_bitmask_clearall(). */
+	struct bitmask view = { sizeof(nodemask_t) * 8, (unsigned long *)mask };
+
+	numa_bitmask_clearall(&view);
+}
+
+static inline void nodemask_zero_compat(nodemask_t *mask)
+{
+	/* Same operation as nodemask_zero(): wrap the nodemask storage in a
+	   temporary struct bitmask and call numa_bitmask_clearall() on it. */
+	struct bitmask view = { sizeof(nodemask_t) * 8, (unsigned long *)mask };
+
+	numa_bitmask_clearall(&view);
+}
+
+static inline void nodemask_set_compat(nodemask_t *mask, int node)
+{
+	if ((unsigned)node < NUMA_NUM_NODES) /* ignore out-of-range/negative nodes: no write outside n[] */
+		mask->n[node / (8*sizeof(unsigned long))] |= 1UL << (node % (8*sizeof(unsigned long)));
+}
+
+static inline void nodemask_clr_compat(nodemask_t *mask, int node)
+{
+	if ((unsigned)node < NUMA_NUM_NODES) /* ignore out-of-range/negative nodes: no write outside n[] */
+		mask->n[node / (8*sizeof(unsigned long))] &= ~(1UL << (node % (8*sizeof(unsigned long))));
+}
+
+static inline int nodemask_isset_compat(const nodemask_t *mask, int node)
+{
+	/* Bits per array word; nodes outside [0, NUMA_NUM_NODES) read as unset. */
+	const size_t word_bits = 8 * sizeof(unsigned long);
+
+	if ((unsigned)node >= NUMA_NUM_NODES)
+		return 0;
+	return ((mask->n[node / word_bits] >> (node % word_bits)) & 1UL) ? 1 : 0;
+}
+
+static inline int nodemask_equal(const nodemask_t *a, const nodemask_t *b)
+{
+	/*
+	 * Present both nodemasks as struct bitmask views of identical size and
+	 * return whatever numa_bitmask_equal() reports for them.  The casts drop
+	 * const only because struct bitmask has a non-const maskp member.
+	 */
+	struct bitmask lhs = { sizeof(nodemask_t) * 8, (unsigned long *)a };
+	struct bitmask rhs = { sizeof(nodemask_t) * 8, (unsigned long *)b };
+
+	return numa_bitmask_equal(&lhs, &rhs);
+}
+
+static inline int nodemask_equal_compat(const nodemask_t *a, const nodemask_t *b)
+{
+	/*
+	 * Same comparison as nodemask_equal(): each argument is wrapped in a
+	 * temporary struct bitmask covering sizeof(nodemask_t) * 8 bits and the
+	 * pair is compared with numa_bitmask_equal().
+	 */
+	struct bitmask first = { sizeof(nodemask_t) * 8, (unsigned long *)a };
+	struct bitmask second = { sizeof(nodemask_t) * 8, (unsigned long *)b };
+
+	return numa_bitmask_equal(&first, &second);
+}
+
+/* NUMA support available. If this returns a negative value, all other functions
+   in this library are undefined. */
+int numa_available(void);
+
+/* Basic NUMA state */
+
+/* Get max available node */
+int numa_max_node(void);
+int numa_max_possible_node(void);
+/* Return preferred node */
+int numa_preferred(void);
+
+/* Return node size and free memory */
+long long numa_node_size64(int node, long long *freep);
+long numa_node_size(int node, long *freep);
+
+int numa_pagesize(void);
+
+/* Set with all nodes from which the calling process may allocate memory.
+   Only valid after numa_available. */
+extern struct bitmask *numa_all_nodes_ptr;
+
+/* Set with all nodes the kernel has exposed to userspace */
+extern struct bitmask *numa_nodes_ptr;
+
+/* For source compatibility */
+extern nodemask_t numa_all_nodes;
+
+/* Set with all cpus. */
+extern struct bitmask *numa_all_cpus_ptr;
+
+/* Set with no nodes */
+extern struct bitmask *numa_no_nodes_ptr;
+
+/* Source compatibility */
+extern nodemask_t numa_no_nodes;
+
+/* Only run and allocate memory from a specific set of nodes. */
+void numa_bind(struct bitmask *nodes);
+
+/* Set the NUMA node interleaving mask. 0 to turn off interleaving */
+void numa_set_interleave_mask(struct bitmask *nodemask);
+
+/* Return the current interleaving mask */
+struct bitmask *numa_get_interleave_mask(void);
+
+/* allocate a bitmask big enough for all nodes */
+struct bitmask *numa_allocate_nodemask(void);
+
+static inline void numa_free_nodemask(struct bitmask *b)
+{
+	numa_bitmask_free(b); /* plain forwarder: b is passed unchanged to numa_bitmask_free() */
+}
+
+/* Some node to preferably allocate memory from for task. */
+void numa_set_preferred(int node);
+
+/* Set local memory allocation policy for task */
+void numa_set_localalloc(void);
+
+/* Only allocate memory from the nodes set in mask. 0 to turn off */
+void numa_set_membind(struct bitmask *nodemask);
+
+/* Return current membind */
+struct bitmask *numa_get_membind(void);
+
+/* Return allowed memories [nodes] */
+struct bitmask *numa_get_mems_allowed(void);
+
+int numa_get_interleave_node(void);
+
+/* NUMA memory allocation. These functions always round to page size
+   and are relatively slow. */
+
+/* Alloc memory page interleaved on nodes in mask */
+void *numa_alloc_interleaved_subset(size_t size, struct bitmask *nodemask);
+/* Alloc memory page interleaved on all nodes. */
+void *numa_alloc_interleaved(size_t size);
+/* Alloc memory located on node */
+void *numa_alloc_onnode(size_t size, int node);
+/* Alloc memory on local node */
+void *numa_alloc_local(size_t size);
+/* Allocation with current policy */
+void *numa_alloc(size_t size);
+/* Change the size of a memory area preserving the memory policy */
+void *numa_realloc(void *old_addr, size_t old_size, size_t new_size);
+/* Free memory allocated by the functions above */
+void numa_free(void *mem, size_t size);
+
+/* Low level functions, primarily for shared memory. All memory
+   processed by these must not be touched yet */
+
+/* Interleave a memory area. */
+void numa_interleave_memory(void *mem, size_t size, struct bitmask *mask);
+
+/* Allocate a memory area on a specific node. */
+void numa_tonode_memory(void *start, size_t size, int node);
+
+/* Allocate memory on a mask of nodes. */
+void numa_tonodemask_memory(void *mem, size_t size, struct bitmask *mask);
+
+/* Allocate a memory area on the current node. */
+void numa_setlocal_memory(void *start, size_t size);
+
+/* Allocate memory area with current memory policy */
+void numa_police_memory(void *start, size_t size);
+
+/* Run current task only on nodes in mask */
+int numa_run_on_node_mask(struct bitmask *mask);
+/* Run current task only on node */
+int numa_run_on_node(int node);
+/* Return current mask of nodes the task can run on */
+struct bitmask * numa_get_run_node_mask(void);
+
+/* When strict, fail the allocation if memory cannot be allocated on the target node(s). */
+void numa_set_bind_policy(int strict);
+
+/* Fail when existing memory has incompatible policy */
+void numa_set_strict(int flag);
+
+/* maximum nodes (size of kernel nodemask_t) */
+int numa_num_possible_nodes(void);
+
+/* maximum cpus (size of kernel cpumask_t) */
+int numa_num_possible_cpus(void);
+
+/* nodes in the system */
+int numa_num_configured_nodes(void);
+
+/* maximum cpus */
+int numa_num_configured_cpus(void);
+
+/* maximum cpus allowed to current task */
+int numa_num_task_cpus(void);
+int numa_num_thread_cpus(void); /* backward compatibility */
+
+/* maximum nodes allowed to current task */
+int numa_num_task_nodes(void);
+int numa_num_thread_nodes(void); /* backward compatibility */
+
+/* allocate a bitmask the size of the kernel cpumask_t */
+struct bitmask *numa_allocate_cpumask(void);
+
+static inline void numa_free_cpumask(struct bitmask *b)
+{
+	numa_bitmask_free(b); /* plain forwarder: b is passed unchanged to numa_bitmask_free() */
+}
+
+/* Convert node to CPU mask. -1/errno on failure, otherwise 0. */
+int numa_node_to_cpus(int, struct bitmask *);
+
+/* report the node of the specified cpu. -1/errno on invalid cpu. */
+int numa_node_of_cpu(int cpu);
+
+/* Report distance of node1 from node2. 0 on error. */
+int numa_distance(int node1, int node2);
+
+/* Error handling. */
+/* This is an internal function in libnuma that can be overridden by a user
+   program. Default is to print an error to stderr and exit if numa_exit_on_error
+   is true. */
+void numa_error(char *where);
+
+/* When true exit the program when a NUMA system call (except numa_available)
+   fails */
+extern int numa_exit_on_error;
+/* Warning function. Can also be overridden. Default is to print on stderr
+   once. */
+void numa_warn(int num, char *fmt, ...);
+
+/* When true exit the program on a numa_warn() call */
+extern int numa_exit_on_warn;
+
+int numa_migrate_pages(int pid, struct bitmask *from, struct bitmask *to);
+
+int numa_move_pages(int pid, unsigned long count, void **pages,
+		const int *nodes, int *status, int flags);
+
+int numa_sched_getaffinity(pid_t, struct bitmask *);
+int numa_sched_setaffinity(pid_t, struct bitmask *);
+
+/* Convert an ascii list of nodes to a bitmask */
+struct bitmask *numa_parse_nodestring(char *);
+
+/* Convert an ascii list of cpus to a bitmask */
+struct bitmask *numa_parse_cpustring(char *);
+
+/*
+ * The following functions are for source code compatibility
+ * with releases prior to version 2.
+ * Such codes should be compiled with NUMA_VERSION1_COMPATIBILITY defined.
+ */
+
+static inline void numa_set_interleave_mask_compat(nodemask_t *nodemask)
+{
+	/* Hand the nodemask to numa_set_interleave_mask() through a temporary
+	   struct bitmask that aliases the caller's storage. */
+	struct bitmask view = { sizeof(nodemask_t) * 8, (unsigned long *)nodemask };
+
+	numa_set_interleave_mask(&view);
+}
+
+static inline nodemask_t numa_get_interleave_mask_compat(void)
+{
+	/* Fetch the interleave bitmask, copy it into a by-value nodemask_t, then free the bitmask. */
+	struct bitmask *tp = numa_get_interleave_mask();
+	nodemask_t mask;
+
+	copy_bitmask_to_nodemask(tp, &mask);
+	numa_bitmask_free(tp);
+	return mask;
+}
+
+static inline void numa_bind_compat(nodemask_t *mask)
+{
+	/* Copy *mask into a bitmask from numa_allocate_nodemask(), bind with it, then free it. */
+	struct bitmask *converted = numa_allocate_nodemask();
+
+	copy_nodemask_to_bitmask(mask, converted);
+	numa_bind(converted);
+	numa_bitmask_free(converted);
+}
+
+static inline void numa_set_membind_compat(nodemask_t *mask)
+{
+	/* Forward to numa_set_membind() using a temporary struct bitmask whose
+	   maskp points straight at the caller's nodemask. */
+	struct bitmask view = { sizeof(nodemask_t) * 8, (unsigned long *)mask };
+
+	numa_set_membind(&view);
+}
+
+static inline nodemask_t numa_get_membind_compat(void)
+{
+	/* Fetch the membind bitmask, copy it into a by-value nodemask_t, then free the bitmask. */
+	struct bitmask *tp = numa_get_membind();
+	nodemask_t mask;
+
+	copy_bitmask_to_nodemask(tp, &mask);
+	numa_bitmask_free(tp);
+	return mask;
+}
+
+static inline void *numa_alloc_interleaved_subset_compat(size_t size,
+					const nodemask_t *mask)
+{
+	/* Returns numa_alloc_interleaved_subset()'s result for a bitmask view of
+	   *mask; const is cast away only because struct bitmask.maskp is non-const. */
+	struct bitmask view = { sizeof(nodemask_t) * 8, (unsigned long *)mask };
+
+	return numa_alloc_interleaved_subset(size, &view);
+}
+
+static inline int numa_run_on_node_mask_compat(const nodemask_t *mask)
+{
+	/* Returns numa_run_on_node_mask()'s result for a bitmask view of *mask;
+	   the cast drops const because struct bitmask.maskp is non-const. */
+	struct bitmask view = { sizeof(nodemask_t) * 8, (unsigned long *)mask };
+
+	return numa_run_on_node_mask(&view);
+}
+
+static inline nodemask_t numa_get_run_node_mask_compat(void)
+{
+	/* Fetch the run-node bitmask, copy it into a by-value nodemask_t, then free the bitmask. */
+	struct bitmask *tp = numa_get_run_node_mask();
+	nodemask_t mask;
+
+	copy_bitmask_to_nodemask(tp, &mask);
+	numa_bitmask_free(tp);
+	return mask;
+}
+
+static inline void numa_interleave_memory_compat(void *mem, size_t size,
+						const nodemask_t *mask)
+{
+	/* Pass mem/size through unchanged; the nodemask is presented to
+	   numa_interleave_memory() as a temporary struct bitmask view. */
+	struct bitmask view = { sizeof(nodemask_t) * 8, (unsigned long *)mask };
+
+	numa_interleave_memory(mem, size, &view);
+}
+
+static inline void numa_tonodemask_memory_compat(void *mem, size_t size,
+						const nodemask_t *mask)
+{
+	/* Pass mem/size through unchanged; the nodemask is presented to
+	   numa_tonodemask_memory() as a temporary struct bitmask view. */
+	struct bitmask view = { sizeof(nodemask_t) * 8, (unsigned long *)mask };
+
+	numa_tonodemask_memory(mem, size, &view);
+}
+
+static inline int numa_sched_getaffinity_compat(pid_t pid, unsigned len,
+						unsigned long *mask)
+{
+	/* The bitmask handed to numa_sched_getaffinity() is len * 8 bits wide
+	   and uses the caller's buffer directly as its storage. */
+	struct bitmask view = { len * 8, mask };
+
+	return numa_sched_getaffinity(pid, &view);
+}
+
+static inline int numa_sched_setaffinity_compat(pid_t pid, unsigned len,
+						unsigned long *mask)
+{
+	/* The bitmask handed to numa_sched_setaffinity() is len * 8 bits wide
+	   and uses the caller's buffer directly as its storage. */
+	struct bitmask view = { len * 8, mask };
+
+	return numa_sched_setaffinity(pid, &view);
+}
+
+static inline int numa_node_to_cpus_compat(int node, unsigned long *buffer,
+							int buffer_len)
+{
+	/* The bitmask handed to numa_node_to_cpus() is buffer_len * 8 bits wide
+	   and writes go straight into the caller's buffer. */
+	struct bitmask view = { (unsigned long)(buffer_len * 8), buffer };
+
+	return numa_node_to_cpus(node, &view);
+}
+
+/* end of version 1 compatibility functions */
+
+/*
+ * To compile an application that uses libnuma version 1:
+ *   add -DNUMA_VERSION1_COMPATIBILITY to your Makefile's CFLAGS
+ */
+#ifdef NUMA_VERSION1_COMPATIBILITY
+#include <numacompat1.h>
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/ext/hwloc/include/pci/config.h b/ext/hwloc/include/pci/config.h
new file mode 100644
index 0000000..beecb1d
--- /dev/null
+++ b/ext/hwloc/include/pci/config.h
@@ -0,0 +1,16 @@
+#define PCI_CONFIG_H /* defined unconditionally: this file has no #ifndef include guard */
+#define PCI_ARCH_X86_64 /* NOTE(review): architecture is hard-coded here; confirm for non-x86_64 builds */
+#define PCI_OS_LINUX /* NOTE(review): OS is hard-coded here as well */
+#define PCI_HAVE_PM_LINUX_SYSFS
+#define PCI_HAVE_PM_LINUX_PROC
+#define PCI_HAVE_LINUX_BYTEORDER_H
+#define PCI_PATH_PROC_BUS_PCI "/proc/bus/pci"
+#define PCI_PATH_SYS_BUS_PCI "/sys/bus/pci"
+#define PCI_HAVE_PM_INTEL_CONF
+#define PCI_HAVE_64BIT_ADDRESS
+#define PCI_HAVE_PM_DUMP
+#define PCI_COMPRESSED_IDS
+#define PCI_IDS "pci.ids.gz"
+#define PCI_PATH_IDS_DIR "/usr/share/misc" /* NOTE(review): fixed absolute path; confirm it suits the target distribution */
+#define PCI_USE_DNS
+#define PCI_ID_DOMAIN "pci.id.ucw.cz"
diff --git a/ext/hwloc/include/pci/header.h b/ext/hwloc/include/pci/header.h
new file mode 100644
index 0000000..d481f27
--- /dev/null
+++ b/ext/hwloc/include/pci/header.h
@@ -0,0 +1,1195 @@
+/*
+ *	The PCI Library -- PCI Header Structure (based on <linux/pci.h>)
+ *
+ *	Copyright (c) 1997--2010 Martin Mares <mj at ucw.cz>
+ *
+ *	Can be freely distributed and used under the terms of the GNU GPL.
+ */
+
+/*
+ * Under PCI, each device has 256 bytes of configuration address space,
+ * of which the first 64 bytes are standardized as follows:
+ */
+#define PCI_VENDOR_ID		0x00	/* 16 bits */
+#define PCI_DEVICE_ID		0x02	/* 16 bits */
+#define PCI_COMMAND		0x04	/* 16 bits */
+#define  PCI_COMMAND_IO		0x1	/* Enable response in I/O space */
+#define  PCI_COMMAND_MEMORY	0x2	/* Enable response in Memory space */
+#define  PCI_COMMAND_MASTER	0x4	/* Enable bus mastering */
+#define  PCI_COMMAND_SPECIAL	0x8	/* Enable response to special cycles */
+#define  PCI_COMMAND_INVALIDATE	0x10	/* Use memory write and invalidate */
+#define  PCI_COMMAND_VGA_PALETTE 0x20	/* Enable palette snooping */
+#define  PCI_COMMAND_PARITY	0x40	/* Enable parity checking */
+#define  PCI_COMMAND_WAIT 	0x80	/* Enable address/data stepping */
+#define  PCI_COMMAND_SERR	0x100	/* Enable SERR */
+#define  PCI_COMMAND_FAST_BACK	0x200	/* Enable back-to-back writes */
+#define  PCI_COMMAND_DISABLE_INTx	0x400	/* PCIE: Disable INTx interrupts */
+
+#define PCI_STATUS		0x06	/* 16 bits */
+#define  PCI_STATUS_INTx	0x08	/* PCIE: INTx interrupt pending */
+#define  PCI_STATUS_CAP_LIST	0x10	/* Support Capability List */
+#define  PCI_STATUS_66MHZ	0x20	/* Support 66 MHz PCI 2.1 bus */
+#define  PCI_STATUS_UDF		0x40	/* Support User Definable Features [obsolete] */
+#define  PCI_STATUS_FAST_BACK	0x80	/* Accept fast-back to back */
+#define  PCI_STATUS_PARITY	0x100	/* Detected parity error */
+#define  PCI_STATUS_DEVSEL_MASK	0x600	/* DEVSEL timing */
+#define  PCI_STATUS_DEVSEL_FAST	0x000
+#define  PCI_STATUS_DEVSEL_MEDIUM 0x200
+#define  PCI_STATUS_DEVSEL_SLOW 0x400
+#define  PCI_STATUS_SIG_TARGET_ABORT 0x800 /* Set on target abort */
+#define  PCI_STATUS_REC_TARGET_ABORT 0x1000 /* Master ack of " */
+#define  PCI_STATUS_REC_MASTER_ABORT 0x2000 /* Set on master abort */
+#define  PCI_STATUS_SIG_SYSTEM_ERROR 0x4000 /* Set when we drive SERR */
+#define  PCI_STATUS_DETECTED_PARITY 0x8000 /* Set on parity error */
+
+#define PCI_CLASS_REVISION	0x08	/* High 24 bits are class, low 8
+					   revision */
+#define PCI_REVISION_ID         0x08    /* Revision ID */
+#define PCI_CLASS_PROG          0x09    /* Reg. Level Programming Interface */
+#define PCI_CLASS_DEVICE        0x0a    /* Device class */
+
+#define PCI_CACHE_LINE_SIZE	0x0c	/* 8 bits */
+#define PCI_LATENCY_TIMER	0x0d	/* 8 bits */
+#define PCI_HEADER_TYPE		0x0e	/* 8 bits */
+#define  PCI_HEADER_TYPE_NORMAL	0
+#define  PCI_HEADER_TYPE_BRIDGE 1
+#define  PCI_HEADER_TYPE_CARDBUS 2
+
+#define PCI_BIST		0x0f	/* 8 bits */
+#define PCI_BIST_CODE_MASK	0x0f	/* Return result */
+#define PCI_BIST_START		0x40	/* 1 to start BIST, 2 secs or less */
+#define PCI_BIST_CAPABLE	0x80	/* 1 if BIST capable */
+
+/*
+ * Base addresses specify locations in memory or I/O space.
+ * Decoded size can be determined by writing a value of
+ * 0xffffffff to the register, and reading it back.  Only
+ * 1 bits are decoded.
+ */
+#define PCI_BASE_ADDRESS_0	0x10	/* 32 bits */
+#define PCI_BASE_ADDRESS_1	0x14	/* 32 bits [htype 0,1 only] */
+#define PCI_BASE_ADDRESS_2	0x18	/* 32 bits [htype 0 only] */
+#define PCI_BASE_ADDRESS_3	0x1c	/* 32 bits */
+#define PCI_BASE_ADDRESS_4	0x20	/* 32 bits */
+#define PCI_BASE_ADDRESS_5	0x24	/* 32 bits */
+#define  PCI_BASE_ADDRESS_SPACE	0x01	/* 0 = memory, 1 = I/O */
+#define  PCI_BASE_ADDRESS_SPACE_IO 0x01
+#define  PCI_BASE_ADDRESS_SPACE_MEMORY 0x00
+#define  PCI_BASE_ADDRESS_MEM_TYPE_MASK 0x06
+#define  PCI_BASE_ADDRESS_MEM_TYPE_32	0x00	/* 32 bit address */
+#define  PCI_BASE_ADDRESS_MEM_TYPE_1M	0x02	/* Below 1M [obsolete] */
+#define  PCI_BASE_ADDRESS_MEM_TYPE_64	0x04	/* 64 bit address */
+#define  PCI_BASE_ADDRESS_MEM_PREFETCH	0x08	/* prefetchable? */
+#define  PCI_BASE_ADDRESS_MEM_MASK	(~(pciaddr_t)0x0f)
+#define  PCI_BASE_ADDRESS_IO_MASK	(~(pciaddr_t)0x03)
+/* bit 1 is reserved if address_space = 1 */
+
+/* Header type 0 (normal devices) */
+#define PCI_CARDBUS_CIS		0x28
+#define PCI_SUBSYSTEM_VENDOR_ID	0x2c
+#define PCI_SUBSYSTEM_ID	0x2e
+#define PCI_ROM_ADDRESS		0x30	/* Bits 31..11 are address, 10..1 reserved */
+#define  PCI_ROM_ADDRESS_ENABLE	0x01
+#define PCI_ROM_ADDRESS_MASK	(~(pciaddr_t)0x7ff)
+
+#define PCI_CAPABILITY_LIST	0x34	/* Offset of first capability list entry */
+
+/* 0x35-0x3b are reserved */
+#define PCI_INTERRUPT_LINE	0x3c	/* 8 bits */
+#define PCI_INTERRUPT_PIN	0x3d	/* 8 bits */
+#define PCI_MIN_GNT		0x3e	/* 8 bits */
+#define PCI_MAX_LAT		0x3f	/* 8 bits */
+
+/* Header type 1 (PCI-to-PCI bridges) */
+#define PCI_PRIMARY_BUS		0x18	/* Primary bus number */
+#define PCI_SECONDARY_BUS	0x19	/* Secondary bus number */
+#define PCI_SUBORDINATE_BUS	0x1a	/* Highest bus number behind the bridge */
+#define PCI_SEC_LATENCY_TIMER	0x1b	/* Latency timer for secondary interface */
+#define PCI_IO_BASE		0x1c	/* I/O range behind the bridge */
+#define PCI_IO_LIMIT		0x1d
+#define  PCI_IO_RANGE_TYPE_MASK	0x0f	/* I/O bridging type */
+#define  PCI_IO_RANGE_TYPE_16	0x00
+#define  PCI_IO_RANGE_TYPE_32	0x01
+#define  PCI_IO_RANGE_MASK	(~0x0f)	/* complement of PCI_IO_RANGE_TYPE_MASK */
+#define PCI_SEC_STATUS		0x1e	/* Secondary status register */
+#define PCI_MEMORY_BASE		0x20	/* Memory range behind */
+#define PCI_MEMORY_LIMIT	0x22
+#define  PCI_MEMORY_RANGE_TYPE_MASK 0x0f
+#define  PCI_MEMORY_RANGE_MASK	(~0x0f)	/* complement of PCI_MEMORY_RANGE_TYPE_MASK */
+#define PCI_PREF_MEMORY_BASE	0x24	/* Prefetchable memory range behind */
+#define PCI_PREF_MEMORY_LIMIT	0x26
+#define  PCI_PREF_RANGE_TYPE_MASK 0x0f
+#define  PCI_PREF_RANGE_TYPE_32	0x00
+#define  PCI_PREF_RANGE_TYPE_64	0x01
+#define  PCI_PREF_RANGE_MASK	(~0x0f)	/* complement of PCI_PREF_RANGE_TYPE_MASK */
+#define PCI_PREF_BASE_UPPER32	0x28	/* Upper half of prefetchable memory range */
+#define PCI_PREF_LIMIT_UPPER32	0x2c
+#define PCI_IO_BASE_UPPER16	0x30	/* Upper half of I/O addresses */
+#define PCI_IO_LIMIT_UPPER16	0x32
+/* 0x34 same as for htype 0 */
+/* 0x35-0x3b is reserved */
+#define PCI_ROM_ADDRESS1	0x38	/* Same as PCI_ROM_ADDRESS, but for htype 1 */
+/* 0x3c-0x3d are same as for htype 0 */
+#define PCI_BRIDGE_CONTROL	0x3e
+#define  PCI_BRIDGE_CTL_PARITY	0x01	/* Enable parity detection on secondary interface */
+#define  PCI_BRIDGE_CTL_SERR	0x02	/* The same for SERR forwarding */
+#define  PCI_BRIDGE_CTL_NO_ISA	0x04	/* Disable bridging of ISA ports */
+#define  PCI_BRIDGE_CTL_VGA	0x08	/* Forward VGA addresses */
+#define  PCI_BRIDGE_CTL_MASTER_ABORT 0x20  /* Report master aborts */
+#define  PCI_BRIDGE_CTL_BUS_RESET 0x40	/* Secondary bus reset */
+#define  PCI_BRIDGE_CTL_FAST_BACK 0x80	/* Fast Back2Back enabled on secondary interface */
+#define  PCI_BRIDGE_CTL_PRI_DISCARD_TIMER 0x100		/* PCI-X? */
+#define  PCI_BRIDGE_CTL_SEC_DISCARD_TIMER 0x200		/* PCI-X? */
+#define  PCI_BRIDGE_CTL_DISCARD_TIMER_STATUS 0x400	/* PCI-X? */
+#define  PCI_BRIDGE_CTL_DISCARD_TIMER_SERR_EN 0x800	/* PCI-X? */
+
+/* Header type 2 (CardBus bridges) */
+/* 0x14-0x15 reserved */
+#define PCI_CB_SEC_STATUS	0x16	/* Secondary status */
+#define PCI_CB_PRIMARY_BUS	0x18	/* PCI bus number */
+#define PCI_CB_CARD_BUS		0x19	/* CardBus bus number */
+#define PCI_CB_SUBORDINATE_BUS	0x1a	/* Subordinate bus number */
+#define PCI_CB_LATENCY_TIMER	0x1b	/* CardBus latency timer */
+#define PCI_CB_MEMORY_BASE_0	0x1c
+#define PCI_CB_MEMORY_LIMIT_0	0x20
+#define PCI_CB_MEMORY_BASE_1	0x24
+#define PCI_CB_MEMORY_LIMIT_1	0x28
+#define PCI_CB_IO_BASE_0	0x2c
+#define PCI_CB_IO_BASE_0_HI	0x2e
+#define PCI_CB_IO_LIMIT_0	0x30
+#define PCI_CB_IO_LIMIT_0_HI	0x32
+#define PCI_CB_IO_BASE_1	0x34
+#define PCI_CB_IO_BASE_1_HI	0x36
+#define PCI_CB_IO_LIMIT_1	0x38
+#define PCI_CB_IO_LIMIT_1_HI	0x3a
+#define  PCI_CB_IO_RANGE_MASK	(~0x03)	/* all bits except the two lowest */
+/* 0x3c-0x3d are same as for htype 0 */
+#define PCI_CB_BRIDGE_CONTROL	0x3e
+#define  PCI_CB_BRIDGE_CTL_PARITY	0x01	/* Similar to standard bridge control register */
+#define  PCI_CB_BRIDGE_CTL_SERR		0x02
+#define  PCI_CB_BRIDGE_CTL_ISA		0x04
+#define  PCI_CB_BRIDGE_CTL_VGA		0x08
+#define  PCI_CB_BRIDGE_CTL_MASTER_ABORT	0x20
+#define  PCI_CB_BRIDGE_CTL_CB_RESET	0x40	/* CardBus reset */
+#define  PCI_CB_BRIDGE_CTL_16BIT_INT	0x80	/* Enable interrupt for 16-bit cards */
+#define  PCI_CB_BRIDGE_CTL_PREFETCH_MEM0 0x100	/* Prefetch enable for both memory regions */
+#define  PCI_CB_BRIDGE_CTL_PREFETCH_MEM1 0x200
+#define  PCI_CB_BRIDGE_CTL_POST_WRITES	0x400
+#define PCI_CB_SUBSYSTEM_VENDOR_ID 0x40
+#define PCI_CB_SUBSYSTEM_ID	0x42
+#define PCI_CB_LEGACY_MODE_BASE	0x44	/* 16-bit PC Card legacy mode base address (ExCa) */
+/* 0x48-0x7f reserved */
+
+/* Capability lists: byte offsets inside a capability header, plus the known capability ID values */
+
+#define PCI_CAP_LIST_ID		0	/* Capability ID */
+#define  PCI_CAP_ID_PM		0x01	/* Power Management */
+#define  PCI_CAP_ID_AGP		0x02	/* Accelerated Graphics Port */
+#define  PCI_CAP_ID_VPD		0x03	/* Vital Product Data */
+#define  PCI_CAP_ID_SLOTID	0x04	/* Slot Identification */
+#define  PCI_CAP_ID_MSI		0x05	/* Message Signaled Interrupts */
+#define  PCI_CAP_ID_CHSWP	0x06	/* CompactPCI HotSwap */
+#define  PCI_CAP_ID_PCIX        0x07    /* PCI-X */
+#define  PCI_CAP_ID_HT          0x08    /* HyperTransport */
+#define  PCI_CAP_ID_VNDR	0x09	/* Vendor specific */
+#define  PCI_CAP_ID_DBG		0x0A	/* Debug port */
+#define  PCI_CAP_ID_CCRC	0x0B	/* CompactPCI Central Resource Control */
+#define  PCI_CAP_ID_HOTPLUG	0x0C	/* PCI hot-plug */
+#define  PCI_CAP_ID_SSVID	0x0D	/* Bridge subsystem vendor/device ID */
+#define  PCI_CAP_ID_AGP3	0x0E	/* AGP 8x */
+#define  PCI_CAP_ID_SECURE	0x0F	/* Secure device (?) */
+#define  PCI_CAP_ID_EXP		0x10	/* PCI Express */
+#define  PCI_CAP_ID_MSIX	0x11	/* MSI-X */
+#define  PCI_CAP_ID_SATA	0x12	/* Serial-ATA HBA */
+#define  PCI_CAP_ID_AF		0x13	/* Advanced features of PCI devices integrated in PCIe root cplx */
+#define PCI_CAP_LIST_NEXT	1	/* Next capability in the list */
+#define PCI_CAP_FLAGS		2	/* Capability defined flags (16 bits) */
+#define PCI_CAP_SIZEOF		4	/* Size of the generic header above: ID byte, next byte, 16-bit flags */
+
+/* Capabilities residing in the PCI Express extended configuration space (ID values; 0x0c and 0x11-0x16 have no name in this list) */
+
+#define PCI_EXT_CAP_ID_AER	0x01	/* Advanced Error Reporting */
+#define PCI_EXT_CAP_ID_VC	0x02	/* Virtual Channel */
+#define PCI_EXT_CAP_ID_DSN	0x03	/* Device Serial Number */
+#define PCI_EXT_CAP_ID_PB	0x04	/* Power Budgeting */
+#define PCI_EXT_CAP_ID_RCLINK	0x05	/* Root Complex Link Declaration */
+#define PCI_EXT_CAP_ID_RCILINK	0x06	/* Root Complex Internal Link Declaration */
+#define PCI_EXT_CAP_ID_RCECOLL	0x07	/* Root Complex Event Collector */
+#define PCI_EXT_CAP_ID_MFVC	0x08	/* Multi-Function Virtual Channel */
+#define PCI_EXT_CAP_ID_VC2	0x09	/* Virtual Channel (2nd ID) */
+#define PCI_EXT_CAP_ID_RBCB	0x0a	/* Root Bridge Control Block */
+#define PCI_EXT_CAP_ID_VNDR	0x0b	/* Vendor specific */
+#define PCI_EXT_CAP_ID_ACS	0x0d	/* Access Controls */
+#define PCI_EXT_CAP_ID_ARI	0x0e	/* Alternative Routing-ID Interpretation */
+#define PCI_EXT_CAP_ID_ATS	0x0f	/* Address Translation Service */
+#define PCI_EXT_CAP_ID_SRIOV	0x10	/* Single Root I/O Virtualization */
+#define PCI_EXT_CAP_ID_TPH	0x17	/* Transaction processing hints */
+#define PCI_EXT_CAP_ID_LTR	0x18	/* Latency Tolerance Reporting */
+
+/*** Definitions of capabilities ***/
+
+/* Power Management Registers: the PCI_PM_CAP_* masks have no register offset of their own here (presumably the 16-bit word at PCI_CAP_FLAGS -- confirm) */
+
+#define  PCI_PM_CAP_VER_MASK	0x0007	/* Version (2=PM1.1) */
+#define  PCI_PM_CAP_PME_CLOCK	0x0008	/* Clock required for PME generation */
+#define  PCI_PM_CAP_DSI		0x0020	/* Device specific initialization required */
+#define  PCI_PM_CAP_AUX_C_MASK	0x01c0	/* Maximum aux current required in D3cold */
+#define  PCI_PM_CAP_D1		0x0200	/* D1 power state support */
+#define  PCI_PM_CAP_D2		0x0400	/* D2 power state support */
+#define  PCI_PM_CAP_PME_D0	0x0800	/* PME can be asserted from D0 */
+#define  PCI_PM_CAP_PME_D1	0x1000	/* PME can be asserted from D1 */
+#define  PCI_PM_CAP_PME_D2	0x2000	/* PME can be asserted from D2 */
+#define  PCI_PM_CAP_PME_D3_HOT	0x4000	/* PME can be asserted from D3hot */
+#define  PCI_PM_CAP_PME_D3_COLD	0x8000	/* PME can be asserted from D3cold */
+#define PCI_PM_CTRL		4	/* PM control and status register */
+#define  PCI_PM_CTRL_STATE_MASK	0x0003	/* Current power state (D0 to D3) */
+#define  PCI_PM_CTRL_NO_SOFT_RST	0x0008	/* No Soft Reset from D3hot to D0 */
+#define  PCI_PM_CTRL_PME_ENABLE	0x0100	/* PME pin enable */
+#define  PCI_PM_CTRL_DATA_SEL_MASK	0x1e00	/* PM table data index */
+#define  PCI_PM_CTRL_DATA_SCALE_MASK	0x6000	/* PM table data scaling factor */
+#define  PCI_PM_CTRL_PME_STATUS	0x8000	/* PME pin status */
+#define PCI_PM_PPB_EXTENSIONS	6	/* PPB support extensions */
+#define  PCI_PM_PPB_B2_B3	0x40	/* If bridge enters D3hot, bus enters: 0=B3, 1=B2 */
+#define  PCI_PM_BPCC_ENABLE	0x80	/* Secondary bus is power managed */
+#define PCI_PM_DATA_REGISTER	7	/* PM table contents read here */
+#define PCI_PM_SIZEOF		8	/* One past the last register offset (data register at 7) */
+
+/* AGP registers: offsets within the AGP capability; STATUS and COMMAND masks share most of their bit positions */
+
+#define PCI_AGP_VERSION		2	/* BCD version number */
+#define PCI_AGP_RFU		3	/* Rest of capability flags */
+#define PCI_AGP_STATUS		4	/* Status register */
+#define  PCI_AGP_STATUS_RQ_MASK	0xff000000	/* Maximum number of requests - 1 */
+#define  PCI_AGP_STATUS_ISOCH	0x10000	/* Isochronous transactions supported */
+#define  PCI_AGP_STATUS_ARQSZ_MASK	0xe000	/* log2(optimum async req size in bytes) - 4 */
+#define  PCI_AGP_STATUS_CAL_MASK	0x1c00	/* Calibration cycle timing */
+#define  PCI_AGP_STATUS_SBA	0x0200	/* Sideband addressing supported */
+#define  PCI_AGP_STATUS_ITA_COH	0x0100	/* In-aperture accesses always coherent */
+#define  PCI_AGP_STATUS_GART64	0x0080	/* 64-bit GART entries supported */
+#define  PCI_AGP_STATUS_HTRANS	0x0040	/* If 0, core logic can xlate host CPU accesses thru aperture */
+#define  PCI_AGP_STATUS_64BIT	0x0020	/* 64-bit addressing cycles supported */
+#define  PCI_AGP_STATUS_FW	0x0010	/* Fast write transfers supported */
+#define  PCI_AGP_STATUS_AGP3	0x0008	/* AGP3 mode supported */
+#define  PCI_AGP_STATUS_RATE4	0x0004	/* 4x transfer rate supported (RFU in AGP3 mode) */
+#define  PCI_AGP_STATUS_RATE2	0x0002	/* 2x transfer rate supported (8x in AGP3 mode) */
+#define  PCI_AGP_STATUS_RATE1	0x0001	/* 1x transfer rate supported (4x in AGP3 mode) */
+#define PCI_AGP_COMMAND		8	/* Control register */
+#define  PCI_AGP_COMMAND_RQ_MASK 0xff000000  /* Master: Maximum number of requests */
+#define  PCI_AGP_COMMAND_ARQSZ_MASK	0xe000	/* log2(optimum async req size in bytes) - 4 */
+#define  PCI_AGP_COMMAND_CAL_MASK	0x1c00	/* Calibration cycle timing */
+#define  PCI_AGP_COMMAND_SBA	0x0200	/* Sideband addressing enabled */
+#define  PCI_AGP_COMMAND_AGP	0x0100	/* Allow processing of AGP transactions */
+#define  PCI_AGP_COMMAND_GART64	0x0080	/* 64-bit GART entries enabled */
+#define  PCI_AGP_COMMAND_64BIT	0x0020 	/* Allow generation of 64-bit addr cycles */
+#define  PCI_AGP_COMMAND_FW	0x0010 	/* Enable FW transfers */
+#define  PCI_AGP_COMMAND_RATE4	0x0004	/* Use 4x rate (RFU in AGP3 mode) */
+#define  PCI_AGP_COMMAND_RATE2	0x0002	/* Use 2x rate (8x in AGP3 mode) */
+#define  PCI_AGP_COMMAND_RATE1	0x0001	/* Use 1x rate (4x in AGP3 mode) */
+#define PCI_AGP_SIZEOF		12	/* One past the 32-bit command register at offset 8 */
+
+/* Vital Product Data: address/flag word at offset 2, data dword at offset 4 */
+
+#define PCI_VPD_ADDR		2	/* Address to access (15 bits!) */
+#define  PCI_VPD_ADDR_MASK	0x7fff	/* Address mask */
+#define  PCI_VPD_ADDR_F		0x8000	/* Write 0, 1 indicates completion */
+#define PCI_VPD_DATA		4	/* 32-bits of data returned here */
+
+/* Slot Identification: expansion slot register at offset 2, chassis number at offset 3 */
+
+#define PCI_SID_ESR		2	/* Expansion Slot Register */
+#define  PCI_SID_ESR_NSLOTS	0x1f	/* Number of expansion slots available */
+#define  PCI_SID_ESR_FIC	0x20	/* First In Chassis Flag */
+#define PCI_SID_CHASSIS_NR	3	/* Chassis Number */
+
+/* Message Signaled Interrupts registers: each _64 offset is its _32 twin plus 4, since PCI_MSI_ADDRESS_HI exists only with PCI_MSI_FLAGS_64BIT */
+
+#define PCI_MSI_FLAGS		2	/* Various flags */
+#define  PCI_MSI_FLAGS_MASK_BIT	0x100	/* interrupt masking & reporting supported */
+#define  PCI_MSI_FLAGS_64BIT	0x080	/* 64-bit addresses allowed */
+#define  PCI_MSI_FLAGS_QSIZE	0x070	/* Message queue size configured */
+#define  PCI_MSI_FLAGS_QMASK	0x00e	/* Maximum queue size available */
+#define  PCI_MSI_FLAGS_ENABLE	0x001	/* MSI feature enabled */
+#define PCI_MSI_RFU		3	/* Rest of capability flags */
+#define PCI_MSI_ADDRESS_LO	4	/* Lower 32 bits */
+#define PCI_MSI_ADDRESS_HI	8	/* Upper 32 bits (if PCI_MSI_FLAGS_64BIT set) */
+#define PCI_MSI_DATA_32		8	/* 16 bits of data for 32-bit devices */
+#define PCI_MSI_DATA_64		12	/* 16 bits of data for 64-bit devices */
+#define PCI_MSI_MASK_BIT_32	12	/* per-vector masking for 32-bit devices */
+#define PCI_MSI_MASK_BIT_64	16	/* per-vector masking for 64-bit devices */
+#define PCI_MSI_PENDING_32	16	/* per-vector interrupt pending for 32-bit devices */
+#define PCI_MSI_PENDING_64	20	/* per-vector interrupt pending for 64-bit devices */
+
+/* PCI-X: command word at offset 2 and status dword at offset 4, each followed by its field masks */
+#define PCI_PCIX_COMMAND                                                2 /* Command register offset */
+#define PCI_PCIX_COMMAND_DPERE                                     0x0001 /* Data Parity Error Recover Enable */
+#define PCI_PCIX_COMMAND_ERO                                       0x0002 /* Enable Relaxed Ordering */
+#define PCI_PCIX_COMMAND_MAX_MEM_READ_BYTE_COUNT                   0x000c /* Maximum Memory Read Byte Count */
+#define PCI_PCIX_COMMAND_MAX_OUTSTANDING_SPLIT_TRANS               0x0070
+#define PCI_PCIX_COMMAND_RESERVED                                   0xf80
+#define PCI_PCIX_STATUS                                                 4 /* Status register offset */
+#define PCI_PCIX_STATUS_FUNCTION                               0x00000007
+#define PCI_PCIX_STATUS_DEVICE                                 0x000000f8
+#define PCI_PCIX_STATUS_BUS                                    0x0000ff00
+#define PCI_PCIX_STATUS_64BIT                                  0x00010000
+#define PCI_PCIX_STATUS_133MHZ                                 0x00020000
+#define PCI_PCIX_STATUS_SC_DISCARDED                           0x00040000 /* Split Completion Discarded */
+#define PCI_PCIX_STATUS_UNEXPECTED_SC                          0x00080000 /* Unexpected Split Completion */
+#define PCI_PCIX_STATUS_DEVICE_COMPLEXITY                      0x00100000 /* 0 = simple device, 1 = bridge device */
+#define PCI_PCIX_STATUS_DESIGNED_MAX_MEM_READ_BYTE_COUNT       0x00600000 /* 0 = 512 bytes, 1 = 1024, 2 = 2048, 3 = 4096 */
+#define PCI_PCIX_STATUS_DESIGNED_MAX_OUTSTANDING_SPLIT_TRANS   0x03800000
+#define PCI_PCIX_STATUS_DESIGNED_MAX_CUMULATIVE_READ_SIZE      0x1c000000
+#define PCI_PCIX_STATUS_RCVD_SC_ERR_MESS                       0x20000000 /* Received Split Completion Error Message */
+#define PCI_PCIX_STATUS_266MHZ				       0x40000000 /* 266 MHz capable */
+#define PCI_PCIX_STATUS_533MHZ				       0x80000000 /* 533 MHz capable */
+#define PCI_PCIX_SIZEOF		4	/* NOTE(review): the status dword at offset 4 ends at byte 7, so this is not an end offset -- confirm how callers use it */
+
+/* PCI-X Bridges: secondary status word at 2, primary status dword at 4, split transaction control registers at 8 and 12 */
+#define PCI_PCIX_BRIDGE_SEC_STATUS                                      2 /* Secondary bus status register offset */
+#define PCI_PCIX_BRIDGE_SEC_STATUS_64BIT                           0x0001
+#define PCI_PCIX_BRIDGE_SEC_STATUS_133MHZ                          0x0002
+#define PCI_PCIX_BRIDGE_SEC_STATUS_SC_DISCARDED                    0x0004 /* Split Completion Discarded on secondary bus */
+#define PCI_PCIX_BRIDGE_SEC_STATUS_UNEXPECTED_SC                   0x0008 /* Unexpected Split Completion on secondary bus */
+#define PCI_PCIX_BRIDGE_SEC_STATUS_SC_OVERRUN                      0x0010 /* Split Completion Overrun on secondary bus */
+#define PCI_PCIX_BRIDGE_SEC_STATUS_SPLIT_REQUEST_DELAYED           0x0020
+#define PCI_PCIX_BRIDGE_SEC_STATUS_CLOCK_FREQ                      0x01c0
+#define PCI_PCIX_BRIDGE_SEC_STATUS_RESERVED                        0xfe00
+#define PCI_PCIX_BRIDGE_STATUS                                          4 /* Primary bus status register offset */
+#define PCI_PCIX_BRIDGE_STATUS_FUNCTION                        0x00000007
+#define PCI_PCIX_BRIDGE_STATUS_DEVICE                          0x000000f8
+#define PCI_PCIX_BRIDGE_STATUS_BUS                             0x0000ff00
+#define PCI_PCIX_BRIDGE_STATUS_64BIT                           0x00010000
+#define PCI_PCIX_BRIDGE_STATUS_133MHZ                          0x00020000
+#define PCI_PCIX_BRIDGE_STATUS_SC_DISCARDED                    0x00040000 /* Split Completion Discarded */
+#define PCI_PCIX_BRIDGE_STATUS_UNEXPECTED_SC                   0x00080000 /* Unexpected Split Completion */
+#define PCI_PCIX_BRIDGE_STATUS_SC_OVERRUN                      0x00100000 /* Split Completion Overrun */
+#define PCI_PCIX_BRIDGE_STATUS_SPLIT_REQUEST_DELAYED           0x00200000
+#define PCI_PCIX_BRIDGE_STATUS_RESERVED                        0xffc00000
+#define PCI_PCIX_BRIDGE_UPSTREAM_SPLIT_TRANS_CTRL                       8 /* Upstream Split Transaction Register offset */
+#define PCI_PCIX_BRIDGE_DOWNSTREAM_SPLIT_TRANS_CTRL                    12 /* Downstream Split Transaction Register offset */
+#define PCI_PCIX_BRIDGE_STR_CAPACITY                           0x0000ffff /* Low half; presumably applies to both split transaction registers */
+#define PCI_PCIX_BRIDGE_STR_COMMITMENT_LIMIT                   0xffff0000 /* High half; presumably applies to both split transaction registers */
+#define PCI_PCIX_BRIDGE_SIZEOF 12 /* NOTE(review): equals the downstream register's own offset, so this is not an end offset -- confirm how callers use it */
+
+/* HyperTransport (as of spec rev. 2.00): command-register type codes, then bit layouts shared by the per-interface registers */
+#define PCI_HT_CMD		2	/* Command Register */
+#define  PCI_HT_CMD_TYP_HI	0xe000	/* Capability Type high part (top 3 bits) */
+#define  PCI_HT_CMD_TYP_HI_PRI	0x0000	/* Slave or Primary Interface */
+#define  PCI_HT_CMD_TYP_HI_SEC	0x2000	/* Host or Secondary Interface */
+#define  PCI_HT_CMD_TYP		0xf800	/* Capability Type (top 5 bits; values follow) */
+#define  PCI_HT_CMD_TYP_SW	0x4000	/* Switch */
+#define  PCI_HT_CMD_TYP_IDC	0x8000	/* Interrupt Discovery and Configuration */
+#define  PCI_HT_CMD_TYP_RID	0x8800	/* Revision ID */
+#define  PCI_HT_CMD_TYP_UIDC	0x9000	/* UnitID Clumping */
+#define  PCI_HT_CMD_TYP_ECSA	0x9800	/* Extended Configuration Space Access */
+#define  PCI_HT_CMD_TYP_AM	0xa000	/* Address Mapping */
+#define  PCI_HT_CMD_TYP_MSIM	0xa800	/* MSI Mapping */
+#define  PCI_HT_CMD_TYP_DR	0xb000	/* DirectRoute */
+#define  PCI_HT_CMD_TYP_VCS	0xb800	/* VCSet */
+#define  PCI_HT_CMD_TYP_RM	0xc000	/* Retry Mode */
+#define  PCI_HT_CMD_TYP_X86	0xc800	/* X86 (reserved) */
+
+					/* Link Control Register bits (the register's offset is given per interface type) */
+#define  PCI_HT_LCTR_CFLE	0x0002	/* CRC Flood Enable */
+#define  PCI_HT_LCTR_CST	0x0004	/* CRC Start Test */
+#define  PCI_HT_LCTR_CFE	0x0008	/* CRC Force Error */
+#define  PCI_HT_LCTR_LKFAIL	0x0010	/* Link Failure */
+#define  PCI_HT_LCTR_INIT	0x0020	/* Initialization Complete */
+#define  PCI_HT_LCTR_EOC	0x0040	/* End of Chain */
+#define  PCI_HT_LCTR_TXO	0x0080	/* Transmitter Off */
+#define  PCI_HT_LCTR_CRCERR	0x0f00	/* CRC Error */
+#define  PCI_HT_LCTR_ISOCEN	0x1000	/* Isochronous Flow Control Enable */
+#define  PCI_HT_LCTR_LSEN	0x2000	/* LDTSTOP# Tristate Enable */
+#define  PCI_HT_LCTR_EXTCTL	0x4000	/* Extended CTL Time */
+#define  PCI_HT_LCTR_64B	0x8000	/* 64-bit Addressing Enable */
+
+					/* Link Configuration Register: PCI_HT_LCNF_LW_* are field values, the others are masks */
+#define  PCI_HT_LCNF_MLWI	0x0007	/* Max Link Width In */
+#define  PCI_HT_LCNF_LW_8B	0x0	/* Link Width 8 bits */
+#define  PCI_HT_LCNF_LW_16B	0x1	/* Link Width 16 bits */
+#define  PCI_HT_LCNF_LW_32B	0x3	/* Link Width 32 bits */
+#define  PCI_HT_LCNF_LW_2B	0x4	/* Link Width 2 bits */
+#define  PCI_HT_LCNF_LW_4B	0x5	/* Link Width 4 bits */
+#define  PCI_HT_LCNF_LW_NC	0x7	/* Link physically not connected */
+#define  PCI_HT_LCNF_DFI	0x0008	/* Doubleword Flow Control In */
+#define  PCI_HT_LCNF_MLWO	0x0070	/* Max Link Width Out */
+#define  PCI_HT_LCNF_DFO	0x0080	/* Doubleword Flow Control Out */
+#define  PCI_HT_LCNF_LWI	0x0700	/* Link Width In */
+#define  PCI_HT_LCNF_DFIE	0x0800	/* Doubleword Flow Control In Enable */
+#define  PCI_HT_LCNF_LWO	0x7000	/* Link Width Out */
+#define  PCI_HT_LCNF_DFOE	0x8000	/* Doubleword Flow Control Out Enable */
+
+					/* Revision ID Register */
+#define  PCI_HT_RID_MIN		0x1f	/* Minor Revision */
+#define  PCI_HT_RID_MAJ		0xe0	/* Major Revision */
+
+					/* Link Frequency/Error Register: low nibble is the frequency code, high nibble the error bits */
+#define  PCI_HT_LFRER_FREQ	0x0f	/* Transmitter Clock Frequency */
+#define  PCI_HT_LFRER_200	0x00	/* 200MHz */
+#define  PCI_HT_LFRER_300	0x01	/* 300MHz */
+#define  PCI_HT_LFRER_400	0x02	/* 400MHz */
+#define  PCI_HT_LFRER_500	0x03	/* 500MHz */
+#define  PCI_HT_LFRER_600	0x04	/* 600MHz */
+#define  PCI_HT_LFRER_800	0x05	/* 800MHz */
+#define  PCI_HT_LFRER_1000	0x06	/* 1.0GHz */
+#define  PCI_HT_LFRER_1200	0x07	/* 1.2GHz */
+#define  PCI_HT_LFRER_1400	0x08	/* 1.4GHz */
+#define  PCI_HT_LFRER_1600	0x09	/* 1.6GHz */
+#define  PCI_HT_LFRER_VEND	0x0f	/* Vendor-Specific */
+#define  PCI_HT_LFRER_ERR	0xf0	/* Link Error */
+#define  PCI_HT_LFRER_PROT	0x10	/* Protocol Error */
+#define  PCI_HT_LFRER_OV	0x20	/* Overflow Error */
+#define  PCI_HT_LFRER_EOC	0x40	/* End of Chain Error */
+#define  PCI_HT_LFRER_CTLT	0x80	/* CTL Timeout */
+
+					/* Link Frequency Capability Register: bit n corresponds to frequency code n of the list above */
+#define  PCI_HT_LFCAP_200	0x0001	/* 200MHz */
+#define  PCI_HT_LFCAP_300	0x0002	/* 300MHz */
+#define  PCI_HT_LFCAP_400	0x0004	/* 400MHz */
+#define  PCI_HT_LFCAP_500	0x0008	/* 500MHz */
+#define  PCI_HT_LFCAP_600	0x0010	/* 600MHz */
+#define  PCI_HT_LFCAP_800	0x0020	/* 800MHz */
+#define  PCI_HT_LFCAP_1000	0x0040	/* 1.0GHz */
+#define  PCI_HT_LFCAP_1200	0x0080	/* 1.2GHz */
+#define  PCI_HT_LFCAP_1400	0x0100	/* 1.4GHz */
+#define  PCI_HT_LFCAP_1600	0x0200	/* 1.6GHz */
+#define  PCI_HT_LFCAP_VEND	0x8000	/* Vendor-Specific */
+
+					/* Feature Register */
+#define  PCI_HT_FTR_ISOCFC	0x0001	/* Isochronous Flow Control Mode */
+#define  PCI_HT_FTR_LDTSTOP	0x0002	/* LDTSTOP# Supported */
+#define  PCI_HT_FTR_CRCTM	0x0004	/* CRC Test Mode */
+#define  PCI_HT_FTR_ECTLT	0x0008	/* Extended CTL Time Required */
+#define  PCI_HT_FTR_64BA	0x0010	/* 64-bit Addressing */
+#define  PCI_HT_FTR_UIDRD	0x0020	/* UnitID Reorder Disable */
+
+					/* Error Handling Register */
+#define  PCI_HT_EH_PFLE		0x0001	/* Protocol Error Flood Enable */
+#define  PCI_HT_EH_OFLE		0x0002	/* Overflow Error Flood Enable */
+#define  PCI_HT_EH_PFE		0x0004	/* Protocol Error Fatal Enable */
+#define  PCI_HT_EH_OFE		0x0008	/* Overflow Error Fatal Enable */
+#define  PCI_HT_EH_EOCFE	0x0010	/* End of Chain Error Fatal Enable */
+#define  PCI_HT_EH_RFE		0x0020	/* Response Error Fatal Enable */
+#define  PCI_HT_EH_CRCFE	0x0040	/* CRC Error Fatal Enable */
+#define  PCI_HT_EH_SERRFE	0x0080	/* System Error Fatal Enable */
+#define  PCI_HT_EH_CF		0x0100	/* Chain Fail */
+#define  PCI_HT_EH_RE		0x0200	/* Response Error */
+#define  PCI_HT_EH_PNFE		0x0400	/* Protocol Error Nonfatal Enable */
+#define  PCI_HT_EH_ONFE		0x0800	/* Overflow Error Nonfatal Enable */
+#define  PCI_HT_EH_EOCNFE	0x1000	/* End of Chain Error Nonfatal Enable */
+#define  PCI_HT_EH_RNFE		0x2000	/* Response Error Nonfatal Enable */
+#define  PCI_HT_EH_CRCNFE	0x4000	/* CRC Error Nonfatal Enable */
+#define  PCI_HT_EH_SERRNFE	0x8000	/* System Error Nonfatal Enable */
+
+/* HyperTransport: Slave or Primary Interface (command bits, then offsets; link registers come in 0/1 pairs) */
+#define PCI_HT_PRI_CMD		2	/* Command Register */
+#define  PCI_HT_PRI_CMD_BUID	0x001f	/* Base UnitID */
+#define  PCI_HT_PRI_CMD_UC	0x03e0	/* Unit Count */
+#define  PCI_HT_PRI_CMD_MH	0x0400	/* Master Host */
+#define  PCI_HT_PRI_CMD_DD	0x0800	/* Default Direction */
+#define  PCI_HT_PRI_CMD_DUL	0x1000	/* Drop on Uninitialized Link */
+
+#define PCI_HT_PRI_LCTR0	4	/* Link Control 0 Register */
+#define PCI_HT_PRI_LCNF0	6	/* Link Config 0 Register */
+#define PCI_HT_PRI_LCTR1	8	/* Link Control 1 Register */
+#define PCI_HT_PRI_LCNF1	10	/* Link Config 1 Register */
+#define PCI_HT_PRI_RID		12	/* Revision ID Register */
+#define PCI_HT_PRI_LFRER0	13	/* Link Frequency/Error 0 Register */
+#define PCI_HT_PRI_LFCAP0	14	/* Link Frequency Capability 0 Register */
+#define PCI_HT_PRI_FTR		16	/* Feature Register */
+#define PCI_HT_PRI_LFRER1	17	/* Link Frequency/Error 1 Register */
+#define PCI_HT_PRI_LFCAP1	18	/* Link Frequency Capability 1 Register */
+#define PCI_HT_PRI_ES		20	/* Enumeration Scratchpad Register */
+#define PCI_HT_PRI_EH		22	/* Error Handling Register */
+#define PCI_HT_PRI_MBU		24	/* Memory Base Upper Register */
+#define PCI_HT_PRI_MLU		25	/* Memory Limit Upper Register */
+#define PCI_HT_PRI_BN		26	/* Bus Number Register */
+#define PCI_HT_PRI_SIZEOF	28
+
+/* HyperTransport: Host or Secondary Interface (command register bit masks, then register offsets) */
+#define PCI_HT_SEC_CMD		2	/* Command Register */
+#define  PCI_HT_SEC_CMD_WR	0x0001	/* Warm Reset */
+#define  PCI_HT_SEC_CMD_DE	0x0002	/* Double-Ended */
+#define  PCI_HT_SEC_CMD_DN	0x007c	/* Device Number (bits 6:2); was 0x0076, which overlapped DE and skipped bit 3 */
+#define  PCI_HT_SEC_CMD_CS	0x0080	/* Chain Side */
+#define  PCI_HT_SEC_CMD_HH	0x0100	/* Host Hide */
+#define  PCI_HT_SEC_CMD_AS	0x0400	/* Act as Slave */
+#define  PCI_HT_SEC_CMD_HIECE	0x0800	/* Host Inbound End of Chain Error */
+#define  PCI_HT_SEC_CMD_DUL	0x1000	/* Drop on Uninitialized Link */
+
+#define PCI_HT_SEC_LCTR		4	/* Link Control Register */
+#define PCI_HT_SEC_LCNF		6	/* Link Config Register */
+#define PCI_HT_SEC_RID		8	/* Revision ID Register */
+#define PCI_HT_SEC_LFRER	9	/* Link Frequency/Error Register */
+#define PCI_HT_SEC_LFCAP	10	/* Link Frequency Capability Register */
+#define PCI_HT_SEC_FTR		12	/* Feature Register */
+#define  PCI_HT_SEC_FTR_EXTRS	0x0100	/* Extended Register Set */
+#define  PCI_HT_SEC_FTR_UCNFE	0x0200	/* Upstream Configuration Enable */
+#define PCI_HT_SEC_ES		16	/* Enumeration Scratchpad Register */
+#define PCI_HT_SEC_EH		18	/* Error Handling Register */
+#define PCI_HT_SEC_MBU		20	/* Memory Base Upper Register */
+#define PCI_HT_SEC_MLU		21	/* Memory Limit Upper Register */
+#define PCI_HT_SEC_SIZEOF	24
+
+/* HyperTransport: Switch (direct register offsets, then the index values that go with the indexed data registers) */
+#define PCI_HT_SW_CMD		2	/* Switch Command Register */
+#define  PCI_HT_SW_CMD_VIBERR	0x0080	/* VIB Error */
+#define  PCI_HT_SW_CMD_VIBFL	0x0100	/* VIB Flood */
+#define  PCI_HT_SW_CMD_VIBFT	0x0200	/* VIB Fatal */
+#define  PCI_HT_SW_CMD_VIBNFT	0x0400	/* VIB Nonfatal */
+#define PCI_HT_SW_PMASK		4	/* Partition Mask Register */
+#define PCI_HT_SW_SWINF		8	/* Switch Info Register */
+#define  PCI_HT_SW_SWINF_DP	0x0000001f /* Default Port */
+#define  PCI_HT_SW_SWINF_EN	0x00000020 /* Enable Decode */
+#define  PCI_HT_SW_SWINF_CR	0x00000040 /* Cold Reset */
+#define  PCI_HT_SW_SWINF_PCIDX	0x00000f00 /* Performance Counter Index */
+#define  PCI_HT_SW_SWINF_BLRIDX	0x0003f000 /* Base/Limit Range Index */
+#define  PCI_HT_SW_SWINF_SBIDX	0x00002000 /* Secondary Base Range Index; NOTE(review): this bit also lies inside PCI_HT_SW_SWINF_BLRIDX -- confirm against the spec */
+#define  PCI_HT_SW_SWINF_HP	0x00040000 /* Hot Plug */
+#define  PCI_HT_SW_SWINF_HIDE	0x00080000 /* Hide Port */
+#define PCI_HT_SW_PCD		12	/* Performance Counter Data Register */
+#define PCI_HT_SW_BLRD		16	/* Base/Limit Range Data Register */
+#define PCI_HT_SW_SBD		20	/* Secondary Base Data Register */
+#define PCI_HT_SW_SIZEOF	24
+
+					/* Counter indices (presumably values for the PCI_HT_SW_SWINF_PCIDX field) */
+#define  PCI_HT_SW_PC_PCR	0x0	/* Posted Command Receive */
+#define  PCI_HT_SW_PC_NPCR	0x1	/* Nonposted Command Receive */
+#define  PCI_HT_SW_PC_RCR	0x2	/* Response Command Receive */
+#define  PCI_HT_SW_PC_PDWR	0x3	/* Posted DW Receive */
+#define  PCI_HT_SW_PC_NPDWR	0x4	/* Nonposted DW Receive */
+#define  PCI_HT_SW_PC_RDWR	0x5	/* Response DW Receive */
+#define  PCI_HT_SW_PC_PCT	0x6	/* Posted Command Transmit */
+#define  PCI_HT_SW_PC_NPCT	0x7	/* Nonposted Command Transmit */
+#define  PCI_HT_SW_PC_RCT	0x8	/* Response Command Transmit */
+#define  PCI_HT_SW_PC_PDWT	0x9	/* Posted DW Transmit */
+#define  PCI_HT_SW_PC_NPDWT	0xa	/* Nonposted DW Transmit */
+#define  PCI_HT_SW_PC_RDWT	0xb	/* Response DW Transmit */
+
+					/* Base/Limit Range indices (presumably values for the PCI_HT_SW_SWINF_BLRIDX field) */
+#define  PCI_HT_SW_BLR_BASE0_LO	0x0	/* Base 0[31:1], Enable */
+#define  PCI_HT_SW_BLR_BASE0_HI	0x1	/* Base 0 Upper */
+#define  PCI_HT_SW_BLR_LIM0_LO	0x2	/* Limit 0 Lower */
+#define  PCI_HT_SW_BLR_LIM0_HI	0x3	/* Limit 0 Upper */
+
+					/* Secondary Base indices */
+#define  PCI_HT_SW_SB_LO	0x0	/* Secondary Base[31:1], Enable */
+#define  PCI_HT_SW_S0_HI	0x1	/* Secondary Base Upper; NOTE(review): name breaks the PCI_HT_SW_SB_* pattern, likely a typo for SB_HI */
+
+/* HyperTransport: Interrupt Discovery and Configuration (index register plus data register) */
+#define PCI_HT_IDC_IDX		2	/* Index Register */
+#define PCI_HT_IDC_DATA		4	/* Data Register */
+#define PCI_HT_IDC_SIZEOF	8
+
+					/* Register indices */
+#define  PCI_HT_IDC_IDX_LINT	0x01	/* Last Interrupt Register */
+#define   PCI_HT_IDC_LINT	0x00ff0000 /* Last interrupt definition */
+#define  PCI_HT_IDC_IDX_IDR	0x10	/* Interrupt Definition Registers */
+					/* Low part (at index): the six masks tile the 32-bit word; a stray 0x10000000 colliding with II_24 was dropped from the first four */
+#define   PCI_HT_IDC_IDR_MASK	0x00000001 /* Mask */
+#define   PCI_HT_IDC_IDR_POL	0x00000002 /* Polarity */
+#define   PCI_HT_IDC_IDR_II_2	0x0000001c /* IntrInfo[4:2]: Message Type */
+#define   PCI_HT_IDC_IDR_II_5	0x00000020 /* IntrInfo[5]: Request EOI */
+#define   PCI_HT_IDC_IDR_II_6	0x00ffffc0 /* IntrInfo[23:6] */
+#define   PCI_HT_IDC_IDR_II_24	0xff000000 /* IntrInfo[31:24] */
+					/* High part (at index + 1) */
+#define   PCI_HT_IDC_IDR_II_32	0x00ffffff /* IntrInfo[55:32] */
+#define   PCI_HT_IDC_IDR_PASSPW	0x40000000 /* PassPW setting for messages */
+#define   PCI_HT_IDC_IDR_WEOI	0x80000000 /* Waiting for EOI */
+
+/* HyperTransport: Revision ID capability (value presumably decoded with PCI_HT_RID_MIN / PCI_HT_RID_MAJ) */
+#define PCI_HT_RID_RID		2	/* Revision Register */
+#define PCI_HT_RID_SIZEOF	4
+
+/* HyperTransport: UnitID Clumping (two registers at offsets 4 and 8) */
+#define PCI_HT_UIDC_CS		4	/* Clumping Support Register */
+#define PCI_HT_UIDC_CE		8	/* Clumping Enable Register */
+#define PCI_HT_UIDC_SIZEOF	12
+
+/* HyperTransport: Extended Configuration Space Access (address register fields, then the data register) */
+#define PCI_HT_ECSA_ADDR	4	/* Configuration Address Register */
+#define  PCI_HT_ECSA_ADDR_REG	0x00000ffc /* Register */
+#define  PCI_HT_ECSA_ADDR_FUN	0x00007000 /* Function */
+#define  PCI_HT_ECSA_ADDR_DEV	0x000f8000 /* Device (bits 19:15); was 0x000f1000, which overlapped Function and skipped bit 15 */
+#define  PCI_HT_ECSA_ADDR_BUS	0x0ff00000 /* Bus Number */
+#define  PCI_HT_ECSA_ADDR_TYPE	0x10000000 /* Access Type */
+#define PCI_HT_ECSA_DATA	8	/* Configuration Data Register */
+#define PCI_HT_ECSA_SIZEOF	12
+
+/* HyperTransport: Address Mapping (common command register; PCI_HT_AM_CMD_MT_* are values of the PCI_HT_AM_CMD_MT field) */
+#define PCI_HT_AM_CMD		2	/* Command Register */
+#define  PCI_HT_AM_CMD_NDMA	0x000f	/* Number of DMA Mappings */
+#define  PCI_HT_AM_CMD_IOSIZ	0x01f0	/* I/O Size */
+#define  PCI_HT_AM_CMD_MT	0x0600	/* Map Type */
+#define  PCI_HT_AM_CMD_MT_40B	0x0000	/* 40-bit */
+#define  PCI_HT_AM_CMD_MT_64B	0x0200	/* 64-bit */
+
+					/* Window Control Register bits (a 4-bit field; cf. PCI_HT_AM40_SBW_CTR and PCI_HT_AM64_W_CTR) */
+#define  PCI_HT_AM_SBW_CTR_COMP	0x1	/* Compat */
+#define  PCI_HT_AM_SBW_CTR_NCOH	0x2	/* NonCoherent */
+#define  PCI_HT_AM_SBW_CTR_ISOC	0x4	/* Isochronous */
+#define  PCI_HT_AM_SBW_CTR_EN	0x8	/* Enable */
+
+/* HyperTransport: 40-bit Address Mapping (two window registers, then DMA window entries starting at offset 12) */
+#define PCI_HT_AM40_SBNPW	4	/* Secondary Bus Non-Prefetchable Window Register */
+#define  PCI_HT_AM40_SBW_BASE	0x000fffff /* Window Base */
+#define  PCI_HT_AM40_SBW_CTR	0xf0000000 /* Window Control */
+#define PCI_HT_AM40_SBPW	8	/* Secondary Bus Prefetchable Window Register */
+#define PCI_HT_AM40_DMA_PBASE0	12	/* DMA Window Primary Base 0 Register */
+#define PCI_HT_AM40_DMA_CTR0	15	/* DMA Window Control 0 Register */
+#define  PCI_HT_AM40_DMA_CTR_CTR 0xf0	/* Window Control */
+#define PCI_HT_AM40_DMA_SLIM0	16	/* DMA Window Secondary Limit 0 Register */
+#define PCI_HT_AM40_DMA_SBASE0	18	/* DMA Window Secondary Base 0 Register */
+#define PCI_HT_AM40_SIZEOF	12	/* size is variable: 12 + 8 * NDMA */
+
+/* HyperTransport: 64-bit Address Mapping (an index register plus lower/upper data registers, followed by the index values) */
+#define PCI_HT_AM64_IDX		4	/* Index Register */
+#define PCI_HT_AM64_DATA_LO	8	/* Data Lower Register */
+#define PCI_HT_AM64_DATA_HI	12	/* Data Upper Register */
+#define PCI_HT_AM64_SIZEOF	16
+
+					/* Register indices */
+#define  PCI_HT_AM64_IDX_SBNPW	0x00	/* Secondary Bus Non-Prefetchable Window Register */
+#define   PCI_HT_AM64_W_BASE_LO	0xfff00000 /* Window Base Lower */
+#define   PCI_HT_AM64_W_CTR	0x0000000f /* Window Control */
+#define  PCI_HT_AM64_IDX_SBPW	0x01	/* Secondary Bus Prefetchable Window Register */
+#define   PCI_HT_AM64_IDX_PBNPW	0x02	/* Primary Bus Non-Prefetchable Window Register */
+#define   PCI_HT_AM64_IDX_DMAPB0 0x04	/* DMA Window Primary Base 0 Register */
+#define   PCI_HT_AM64_IDX_DMASB0 0x05	/* DMA Window Secondary Base 0 Register */
+#define   PCI_HT_AM64_IDX_DMASL0 0x06	/* DMA Window Secondary Limit 0 Register */
+
+/* HyperTransport: MSI Mapping (command word, then the mapping address as lower and upper registers) */
+#define PCI_HT_MSIM_CMD		2	/* Command Register */
+#define  PCI_HT_MSIM_CMD_EN	0x0001	/* Mapping Active */
+#define  PCI_HT_MSIM_CMD_FIXD	0x0002	/* MSI Mapping Address Fixed */
+#define PCI_HT_MSIM_ADDR_LO	4	/* MSI Mapping Address Lower Register */
+#define PCI_HT_MSIM_ADDR_HI	8	/* MSI Mapping Address Upper Register */
+#define PCI_HT_MSIM_SIZEOF	12
+
+/* HyperTransport: DirectRoute (command, enable vector and data registers, followed by register index values) */
+#define PCI_HT_DR_CMD		2	/* Command Register */
+#define  PCI_HT_DR_CMD_NDRS	0x000f	/* Number of DirectRoute Spaces */
+#define  PCI_HT_DR_CMD_IDX	0x01f0	/* Index */
+#define PCI_HT_DR_EN		4	/* Enable Vector Register */
+#define PCI_HT_DR_DATA		8	/* Data Register */
+#define PCI_HT_DR_SIZEOF	12
+
+					/* Register indices (presumably selected through PCI_HT_DR_CMD_IDX -- confirm) */
+#define  PCI_HT_DR_IDX_BASE_LO	0x00	/* DirectRoute Base Lower Register */
+#define   PCI_HT_DR_OTNRD	0x00000001 /* Opposite to Normal Request Direction */
+#define   PCI_HT_DR_BL_LO	0xffffff00 /* Base/Limit Lower */
+#define  PCI_HT_DR_IDX_BASE_HI	0x01	/* DirectRoute Base Upper Register */
+#define  PCI_HT_DR_IDX_LIMIT_LO	0x02	/* DirectRoute Limit Lower Register */
+#define  PCI_HT_DR_IDX_LIMIT_HI	0x03	/* DirectRoute Limit Upper Register */
+
+/* HyperTransport: VCSet (register offsets; PCI_HT_VCS_SSUP_* are values of the register at offset 10) */
+#define PCI_HT_VCS_SUP		4	/* VCSets Supported Register */
+#define PCI_HT_VCS_L1EN		5	/* Link 1 VCSets Enabled Register */
+#define PCI_HT_VCS_L0EN		6	/* Link 0 VCSets Enabled Register */
+#define PCI_HT_VCS_SBD		8	/* Stream Bucket Depth Register */
+#define PCI_HT_VCS_SINT		9	/* Stream Interval Register */
+#define PCI_HT_VCS_SSUP		10	/* Number of Streaming VCs Supported Register */
+#define  PCI_HT_VCS_SSUP_0	0x00	/* Streaming VC 0 */
+#define  PCI_HT_VCS_SSUP_3	0x01	/* Streaming VCs 0-3 */
+#define  PCI_HT_VCS_SSUP_15	0x02	/* Streaming VCs 0-15 */
+#define PCI_HT_VCS_NFCBD	12	/* Non-FC Bucket Depth Register */
+#define PCI_HT_VCS_NFCINT	13	/* Non-FC Bucket Interval Register */
+#define PCI_HT_VCS_SIZEOF	16
+
+/* HyperTransport: Retry Mode (control, status and count registers in 0/1 pairs; the CTR_/STS_ masks presumably apply to both copies) */
+#define PCI_HT_RM_CTR0		4	/* Control 0 Register */
+#define  PCI_HT_RM_CTR_LRETEN	0x01	/* Link Retry Enable */
+#define  PCI_HT_RM_CTR_FSER	0x02	/* Force Single Error */
+#define  PCI_HT_RM_CTR_ROLNEN	0x04	/* Rollover Nonfatal Enable */
+#define  PCI_HT_RM_CTR_FSS	0x08	/* Force Single Stomp */
+#define  PCI_HT_RM_CTR_RETNEN	0x10	/* Retry Nonfatal Enable */
+#define  PCI_HT_RM_CTR_RETFEN	0x20	/* Retry Fatal Enable */
+#define  PCI_HT_RM_CTR_AA	0xc0	/* Allowed Attempts */
+#define PCI_HT_RM_STS0		5	/* Status 0 Register */
+#define  PCI_HT_RM_STS_RETSNT	0x01	/* Retry Sent */
+#define  PCI_HT_RM_STS_CNTROL	0x02	/* Count Rollover */
+#define  PCI_HT_RM_STS_SRCV	0x04	/* Stomp Received */
+#define PCI_HT_RM_CTR1		6	/* Control 1 Register */
+#define PCI_HT_RM_STS1		7	/* Status 1 Register */
+#define PCI_HT_RM_CNT0		8	/* Retry Count 0 Register */
+#define PCI_HT_RM_CNT1		10	/* Retry Count 1 Register */
+#define PCI_HT_RM_SIZEOF	12
+
+/* Vendor-Specific Capability (see PCI_EVNDR_xxx for the PCIe version) */
+#define PCI_VNDR_LENGTH		2	/* Offset of the length byte */
+
+/* PCI Express */
+#define PCI_EXP_FLAGS		0x2	/* Capabilities register */
+#define PCI_EXP_FLAGS_VERS	0x000f	/* Capability version */
+#define PCI_EXP_FLAGS_TYPE	0x00f0	/* Device/Port type */
+#define  PCI_EXP_TYPE_ENDPOINT	0x0	/* Express Endpoint */
+#define  PCI_EXP_TYPE_LEG_END	0x1	/* Legacy Endpoint */
+#define  PCI_EXP_TYPE_ROOT_PORT 0x4	/* Root Port */
+#define  PCI_EXP_TYPE_UPSTREAM	0x5	/* Upstream Port */
+#define  PCI_EXP_TYPE_DOWNSTREAM 0x6	/* Downstream Port */
+#define  PCI_EXP_TYPE_PCI_BRIDGE 0x7	/* PCI/PCI-X Bridge */
+#define  PCI_EXP_TYPE_PCIE_BRIDGE 0x8	/* PCI/PCI-X to PCIE Bridge */
+#define  PCI_EXP_TYPE_ROOT_INT_EP 0x9	/* Root Complex Integrated Endpoint */
+#define  PCI_EXP_TYPE_ROOT_EC 0xa	/* Root Complex Event Collector */
+#define PCI_EXP_FLAGS_SLOT	0x0100	/* Slot implemented */
+#define PCI_EXP_FLAGS_IRQ	0x3e00	/* Interrupt message number */
+#define PCI_EXP_DEVCAP		0x4	/* Device capabilities */
+#define  PCI_EXP_DEVCAP_PAYLOAD	0x07	/* Max_Payload_Size */
+#define  PCI_EXP_DEVCAP_PHANTOM	0x18	/* Phantom functions */
+#define  PCI_EXP_DEVCAP_EXT_TAG	0x20	/* Extended tags */
+#define  PCI_EXP_DEVCAP_L0S	0x1c0	/* L0s Acceptable Latency */
+#define  PCI_EXP_DEVCAP_L1	0xe00	/* L1 Acceptable Latency */
+#define  PCI_EXP_DEVCAP_ATN_BUT	0x1000	/* Attention Button Present */
+#define  PCI_EXP_DEVCAP_ATN_IND	0x2000	/* Attention Indicator Present */
+#define  PCI_EXP_DEVCAP_PWR_IND	0x4000	/* Power Indicator Present */
+#define  PCI_EXP_DEVCAP_RBE	0x8000	/* Role-Based Error Reporting */
+#define  PCI_EXP_DEVCAP_PWR_VAL	0x3fc0000 /* Slot Power Limit Value */
+#define  PCI_EXP_DEVCAP_PWR_SCL	0xc000000 /* Slot Power Limit Scale */
+#define  PCI_EXP_DEVCAP_FLRESET	0x10000000 /* Function-Level Reset */
+#define PCI_EXP_DEVCTL		0x8	/* Device Control */
+#define  PCI_EXP_DEVCTL_CERE	0x0001	/* Correctable Error Reporting En. */
+#define  PCI_EXP_DEVCTL_NFERE	0x0002	/* Non-Fatal Error Reporting Enable */
+#define  PCI_EXP_DEVCTL_FERE	0x0004	/* Fatal Error Reporting Enable */
+#define  PCI_EXP_DEVCTL_URRE	0x0008	/* Unsupported Request Reporting En. */
+#define  PCI_EXP_DEVCTL_RELAXED	0x0010	/* Enable Relaxed Ordering */
+#define  PCI_EXP_DEVCTL_PAYLOAD	0x00e0	/* Max_Payload_Size */
+#define  PCI_EXP_DEVCTL_EXT_TAG	0x0100	/* Extended Tag Field Enable */
+#define  PCI_EXP_DEVCTL_PHANTOM	0x0200	/* Phantom Functions Enable */
+#define  PCI_EXP_DEVCTL_AUX_PME	0x0400	/* Auxiliary Power PM Enable */
+#define  PCI_EXP_DEVCTL_NOSNOOP	0x0800	/* Enable No Snoop */
+#define  PCI_EXP_DEVCTL_READRQ	0x7000	/* Max_Read_Request_Size */
+#define  PCI_EXP_DEVCTL_BCRE	0x8000	/* Bridge Configuration Retry Enable */
+#define  PCI_EXP_DEVCTL_FLRESET	0x8000	/* Function-Level Reset [bit shared with BCRE] */
+#define PCI_EXP_DEVSTA		0xa	/* Device Status */
+#define  PCI_EXP_DEVSTA_CED	0x01	/* Correctable Error Detected */
+#define  PCI_EXP_DEVSTA_NFED	0x02	/* Non-Fatal Error Detected */
+#define  PCI_EXP_DEVSTA_FED	0x04	/* Fatal Error Detected */
+#define  PCI_EXP_DEVSTA_URD	0x08	/* Unsupported Request Detected */
+#define  PCI_EXP_DEVSTA_AUXPD	0x10	/* AUX Power Detected */
+#define  PCI_EXP_DEVSTA_TRPND	0x20	/* Transactions Pending */
+#define PCI_EXP_LNKCAP		0xc	/* Link Capabilities */
+#define  PCI_EXP_LNKCAP_SPEED	0x0000f	/* Maximum Link Speed */
+#define  PCI_EXP_LNKCAP_WIDTH	0x003f0	/* Maximum Link Width */
+#define  PCI_EXP_LNKCAP_ASPM	0x00c00	/* Active State Power Management */
+#define  PCI_EXP_LNKCAP_L0S	0x07000	/* L0s Acceptable Latency */
+#define  PCI_EXP_LNKCAP_L1	0x38000	/* L1 Acceptable Latency */
+#define  PCI_EXP_LNKCAP_CLOCKPM	0x40000	/* Clock Power Management */
+#define  PCI_EXP_LNKCAP_SURPRISE 0x80000 /* Surprise Down Error Reporting */
+#define  PCI_EXP_LNKCAP_DLLA	0x100000 /* Data Link Layer Active Reporting */
+#define  PCI_EXP_LNKCAP_LBNC	0x200000 /* Link Bandwidth Notification Capability */
+#define  PCI_EXP_LNKCAP_PORT	0xff000000 /* Port Number */
+#define PCI_EXP_LNKCTL		0x10	/* Link Control */
+#define  PCI_EXP_LNKCTL_ASPM	0x0003	/* ASPM Control */
+#define  PCI_EXP_LNKCTL_RCB	0x0008	/* Read Completion Boundary */
+#define  PCI_EXP_LNKCTL_DISABLE	0x0010	/* Link Disable */
+#define  PCI_EXP_LNKCTL_RETRAIN	0x0020	/* Retrain Link */
+#define  PCI_EXP_LNKCTL_CLOCK	0x0040	/* Common Clock Configuration */
+#define  PCI_EXP_LNKCTL_XSYNCH	0x0080	/* Extended Synch */
+#define  PCI_EXP_LNKCTL_CLOCKPM	0x0100	/* Clock Power Management */
+#define  PCI_EXP_LNKCTL_HWAUTWD	0x0200	/* Hardware Autonomous Width Disable */
+#define  PCI_EXP_LNKCTL_BWMIE	0x0400	/* Bandwidth Mgmt Interrupt Enable */
+#define  PCI_EXP_LNKCTL_AUTBWIE	0x0800	/* Autonomous Bandwidth Mgmt Interrupt Enable */
+#define PCI_EXP_LNKSTA		0x12	/* Link Status */
+#define  PCI_EXP_LNKSTA_SPEED	0x000f	/* Negotiated Link Speed */
+#define  PCI_EXP_LNKSTA_WIDTH	0x03f0	/* Negotiated Link Width */
+#define  PCI_EXP_LNKSTA_TR_ERR	0x0400	/* Training Error (obsolete) */
+#define  PCI_EXP_LNKSTA_TRAIN	0x0800	/* Link Training */
+#define  PCI_EXP_LNKSTA_SL_CLK	0x1000	/* Slot Clock Configuration */
+#define  PCI_EXP_LNKSTA_DL_ACT	0x2000	/* Data Link Layer in DL_Active State */
+#define  PCI_EXP_LNKSTA_BWMGMT	0x4000	/* Bandwidth Mgmt Status */
+#define  PCI_EXP_LNKSTA_AUTBW	0x8000	/* Autonomous Bandwidth Mgmt Status */
+#define PCI_EXP_SLTCAP		0x14	/* Slot Capabilities */
+#define  PCI_EXP_SLTCAP_ATNB	0x0001	/* Attention Button Present */
+#define  PCI_EXP_SLTCAP_PWRC	0x0002	/* Power Controller Present */
+#define  PCI_EXP_SLTCAP_MRL	0x0004	/* MRL Sensor Present */
+#define  PCI_EXP_SLTCAP_ATNI	0x0008	/* Attention Indicator Present */
+#define  PCI_EXP_SLTCAP_PWRI	0x0010	/* Power Indicator Present */
+#define  PCI_EXP_SLTCAP_HPS	0x0020	/* Hot-Plug Surprise */
+#define  PCI_EXP_SLTCAP_HPC	0x0040	/* Hot-Plug Capable */
+#define  PCI_EXP_SLTCAP_PWR_VAL	0x00007f80 /* Slot Power Limit Value */
+#define  PCI_EXP_SLTCAP_PWR_SCL	0x00018000 /* Slot Power Limit Scale */
+#define  PCI_EXP_SLTCAP_INTERLOCK 0x020000 /* Electromechanical Interlock Present */
+#define  PCI_EXP_SLTCAP_NOCMDCOMP 0x040000 /* No Command Completed Support */
+#define  PCI_EXP_SLTCAP_PSN	0xfff80000 /* Physical Slot Number */
+#define PCI_EXP_SLTCTL		0x18	/* Slot Control */
+#define  PCI_EXP_SLTCTL_ATNB	0x0001	/* Attention Button Pressed Enable */
+#define  PCI_EXP_SLTCTL_PWRF	0x0002	/* Power Fault Detected Enable */
+#define  PCI_EXP_SLTCTL_MRLS	0x0004	/* MRL Sensor Changed Enable */
+#define  PCI_EXP_SLTCTL_PRSD	0x0008	/* Presence Detect Changed Enable */
+#define  PCI_EXP_SLTCTL_CMDC	0x0010	/* Command Completed Interrupt Enable */
+#define  PCI_EXP_SLTCTL_HPIE	0x0020	/* Hot-Plug Interrupt Enable */
+#define  PCI_EXP_SLTCTL_ATNI	0x00c0	/* Attention Indicator Control */
+#define  PCI_EXP_SLTCTL_PWRI	0x0300	/* Power Indicator Control */
+#define  PCI_EXP_SLTCTL_PWRC	0x0400	/* Power Controller Control */
+#define  PCI_EXP_SLTCTL_INTERLOCK 0x0800 /* Electromechanical Interlock Control */
+#define  PCI_EXP_SLTCTL_LLCHG	0x1000	/* Data Link Layer State Changed Enable */
+#define PCI_EXP_SLTSTA		0x1a	/* Slot Status */
+#define  PCI_EXP_SLTSTA_ATNB	0x0001	/* Attention Button Pressed */
+#define  PCI_EXP_SLTSTA_PWRF	0x0002	/* Power Fault Detected */
+#define  PCI_EXP_SLTSTA_MRLS	0x0004	/* MRL Sensor Changed */
+#define  PCI_EXP_SLTSTA_PRSD	0x0008	/* Presence Detect Changed */
+#define  PCI_EXP_SLTSTA_CMDC	0x0010	/* Command Completed */
+#define  PCI_EXP_SLTSTA_MRL_ST	0x0020	/* MRL Sensor State */
+#define  PCI_EXP_SLTSTA_PRES	0x0040	/* Presence Detect State */
+#define  PCI_EXP_SLTSTA_INTERLOCK 0x0080 /* Electromechanical Interlock Status */
+#define  PCI_EXP_SLTSTA_LLCHG	0x0100	/* Data Link Layer State Changed */
+#define PCI_EXP_RTCTL		0x1c	/* Root Control */
+#define  PCI_EXP_RTCTL_SECEE	0x0001	/* System Error on Correctable Error */
+#define  PCI_EXP_RTCTL_SENFEE	0x0002	/* System Error on Non-Fatal Error */
+#define  PCI_EXP_RTCTL_SEFEE	0x0004	/* System Error on Fatal Error */
+#define  PCI_EXP_RTCTL_PMEIE	0x0008	/* PME Interrupt Enable */
+#define  PCI_EXP_RTCTL_CRSVIS	0x0010	/* Configuration Request Retry Status Visible to SW */
+#define PCI_EXP_RTCAP		0x1e	/* Root Capabilities */
+#define  PCI_EXP_RTCAP_CRSVIS	0x0001	/* CRS Software Visibility capable: bit 0 here (0x0010 is the enable bit in Root Control) */
+#define PCI_EXP_RTSTA		0x20	/* Root Status */
+#define  PCI_EXP_RTSTA_PME_REQID   0x0000ffff /* PME Requester ID */
+#define  PCI_EXP_RTSTA_PME_STATUS  0x00010000 /* PME Status */
+#define  PCI_EXP_RTSTA_PME_PENDING 0x00020000 /* PME is Pending */
+#define PCI_EXP_DEVCAP2			0x24	/* Device capabilities 2 */
+#define PCI_EXP_DEVCTL2			0x28	/* Device Control 2 */
+#define  PCI_EXP_DEV2_TIMEOUT_RANGE(x)	((x) & 0xf) /* Completion Timeout Ranges Supported */
+#define  PCI_EXP_DEV2_TIMEOUT_VALUE(x)	((x) & 0xf) /* Completion Timeout Value */
+#define  PCI_EXP_DEV2_TIMEOUT_DIS	0x0010	/* Completion Timeout Disable Supported */
+#define  PCI_EXP_DEV2_ARI		0x0020	/* ARI Forwarding */
+#define PCI_EXP_DEVSTA2			0x2a	/* Device Status 2 */
+#define PCI_EXP_LNKCAP2			0x2c	/* Link Capabilities 2 */
+#define PCI_EXP_LNKCTL2			0x30	/* Link Control 2 */
+#define  PCI_EXP_LNKCTL2_SPEED(x)	((x) & 0xf) /* Target Link Speed */
+#define  PCI_EXP_LNKCTL2_CMPLNC		0x0010	/* Enter Compliance */
+#define  PCI_EXP_LNKCTL2_SPEED_DIS	0x0020	/* Hardware Autonomous Speed Disable */
+#define  PCI_EXP_LNKCTL2_DEEMPHASIS(x)	(((x) >> 6) & 1) /* Selectable De-emphasis */
+#define  PCI_EXP_LNKCTL2_MARGIN(x)	(((x) >> 7) & 7) /* Transmit Margin */
+#define  PCI_EXP_LNKCTL2_MOD_CMPLNC	0x0400	/* Enter Modified Compliance */
+#define  PCI_EXP_LNKCTL2_CMPLNC_SOS	0x0800	/* Compliance SOS */
+#define  PCI_EXP_LNKCTL2_COM_DEEMPHASIS(x) (((x) >> 12) & 1) /* Compliance De-emphasis */
+#define PCI_EXP_LNKSTA2			0x32	/* Link Status 2 */
+#define  PCI_EXP_LINKSTA2_DEEMPHASIS(x)	((x) & 1)	/* Current De-emphasis Level (macro is spelled LINKSTA2, not LNKSTA2) */
+#define PCI_EXP_SLTCAP2			0x34	/* Slot Capabilities 2 */
+#define PCI_EXP_SLTCTL2			0x38	/* Slot Control 2 */
+#define PCI_EXP_SLTSTA2			0x3a	/* Slot Status 2 */
+
+/* MSI-X */
+#define  PCI_MSIX_ENABLE	0x8000
+#define  PCI_MSIX_MASK		0x4000
+#define  PCI_MSIX_TABSIZE	0x07ff
+#define PCI_MSIX_TABLE		4
+#define PCI_MSIX_PBA		8
+#define  PCI_MSIX_BIR		0x7
+
+/* Subsystem vendor/device ID for PCI bridges */
+#define PCI_SSVID_VENDOR	4
+#define PCI_SSVID_DEVICE	6
+
+/* PCI Advanced Features */
+#define PCI_AF_CAP		3
+#define  PCI_AF_CAP_TP		0x01
+#define  PCI_AF_CAP_FLR		0x02
+#define PCI_AF_CTRL		4
+#define  PCI_AF_CTRL_FLR	0x01
+#define PCI_AF_STATUS		5
+#define  PCI_AF_STATUS_TP	0x01
+
+/* SATA Host Bus Adapter */
+#define PCI_SATA_HBA_BARS	4
+#define PCI_SATA_HBA_REG0	8
+
+/*** Definitions of extended capabilities ***/
+
+/* Advanced Error Reporting */
+#define PCI_ERR_UNCOR_STATUS	4	/* Uncorrectable Error Status */
+#define  PCI_ERR_UNC_TRAIN	0x00000001	/* Undefined in PCIe rev1.1 & 2.0 spec */
+#define  PCI_ERR_UNC_DLP	0x00000010	/* Data Link Protocol */
+#define  PCI_ERR_UNC_SDES	0x00000020	/* Surprise Down Error */
+#define  PCI_ERR_UNC_POISON_TLP	0x00001000	/* Poisoned TLP */
+#define  PCI_ERR_UNC_FCP	0x00002000	/* Flow Control Protocol */
+#define  PCI_ERR_UNC_COMP_TIME	0x00004000	/* Completion Timeout */
+#define  PCI_ERR_UNC_COMP_ABORT	0x00008000	/* Completer Abort */
+#define  PCI_ERR_UNC_UNX_COMP	0x00010000	/* Unexpected Completion */
+#define  PCI_ERR_UNC_RX_OVER	0x00020000	/* Receiver Overflow */
+#define  PCI_ERR_UNC_MALF_TLP	0x00040000	/* Malformed TLP */
+#define  PCI_ERR_UNC_ECRC	0x00080000	/* ECRC Error Status */
+#define  PCI_ERR_UNC_UNSUP	0x00100000	/* Unsupported Request */
+#define  PCI_ERR_UNC_ACS_VIOL	0x00200000	/* ACS Violation */
+#define PCI_ERR_UNCOR_MASK	8	/* Uncorrectable Error Mask */
+	/* Same bits as above */
+#define PCI_ERR_UNCOR_SEVER	12	/* Uncorrectable Error Severity */
+	/* Same bits as above */
+#define PCI_ERR_COR_STATUS	16	/* Correctable Error Status */
+#define  PCI_ERR_COR_RCVR	0x00000001	/* Receiver Error Status */
+#define  PCI_ERR_COR_BAD_TLP	0x00000040	/* Bad TLP Status */
+#define  PCI_ERR_COR_BAD_DLLP	0x00000080	/* Bad DLLP Status */
+#define  PCI_ERR_COR_REP_ROLL	0x00000100	/* REPLAY_NUM Rollover */
+#define  PCI_ERR_COR_REP_TIMER	0x00001000	/* Replay Timer Timeout */
+#define  PCI_ERR_COR_REP_ANFE	0x00002000	/* Advisory Non-Fatal Error */
+#define PCI_ERR_COR_MASK	20	/* Correctable Error Mask */
+	/* Same bits as above */
+#define PCI_ERR_CAP		24	/* Advanced Error Capabilities */
+#define  PCI_ERR_CAP_FEP(x)	((x) & 31)	/* First Error Pointer */
+#define  PCI_ERR_CAP_ECRC_GENC	0x00000020	/* ECRC Generation Capable */
+#define  PCI_ERR_CAP_ECRC_GENE	0x00000040	/* ECRC Generation Enable */
+#define  PCI_ERR_CAP_ECRC_CHKC	0x00000080	/* ECRC Check Capable */
+#define  PCI_ERR_CAP_ECRC_CHKE	0x00000100	/* ECRC Check Enable */
+#define PCI_ERR_HEADER_LOG	28	/* Header Log Register (16 bytes) */
+#define PCI_ERR_ROOT_COMMAND	44	/* Root Error Command */
+#define PCI_ERR_ROOT_STATUS	48
+#define PCI_ERR_ROOT_COR_SRC	52
+#define PCI_ERR_ROOT_SRC	54
+
+/* Virtual Channel */
+#define PCI_VC_PORT_REG1	4
+#define PCI_VC_PORT_REG2	8
+#define PCI_VC_PORT_CTRL	12
+#define PCI_VC_PORT_STATUS	14
+#define PCI_VC_RES_CAP		16
+#define PCI_VC_RES_CTRL		20
+#define PCI_VC_RES_STATUS	26
+
+/* Power Budgeting */
+#define PCI_PWR_DSR		4	/* Data Select Register */
+#define PCI_PWR_DATA		8	/* Data Register */
+#define  PCI_PWR_DATA_BASE(x)	((x) & 0xff)	    /* Base Power */
+#define  PCI_PWR_DATA_SCALE(x)	(((x) >> 8) & 3)    /* Data Scale */
+#define  PCI_PWR_DATA_PM_SUB(x)	(((x) >> 10) & 7)   /* PM Sub State */
+#define  PCI_PWR_DATA_PM_STATE(x) (((x) >> 13) & 3) /* PM State */
+#define  PCI_PWR_DATA_TYPE(x)	(((x) >> 15) & 7)   /* Type */
+#define  PCI_PWR_DATA_RAIL(x)	(((x) >> 18) & 7)   /* Power Rail */
+#define PCI_PWR_CAP		12	/* Capability */
+#define  PCI_PWR_CAP_BUDGET(x)	((x) & 1)	/* Included in system budget */
+
+/* Root Complex Link */
+#define PCI_RCLINK_ESD		4	/* Element Self Description */
+#define PCI_RCLINK_LINK1	16	/* First Link Entry */
+#define  PCI_RCLINK_LINK_DESC	0	/* Link Entry: Description */
+#define  PCI_RCLINK_LINK_ADDR	8	/* Link Entry: Address (64-bit) */
+#define  PCI_RCLINK_LINK_SIZE	16	/* Link Entry: sizeof */
+
+/* PCIe Vendor-Specific Capability */
+#define PCI_EVNDR_HEADER	4	/* Vendor-Specific Header */
+#define PCI_EVNDR_REGISTERS	8	/* Vendor-Specific Registers */
+
+/* Access Control Services */
+#define PCI_ACS_CAP		0x04	/* ACS Capability Register */
+#define PCI_ACS_CAP_VALID	0x0001	/* ACS Source Validation */
+#define PCI_ACS_CAP_BLOCK	0x0002	/* ACS Translation Blocking */
+#define PCI_ACS_CAP_REQ_RED	0x0004	/* ACS P2P Request Redirect */
+#define PCI_ACS_CAP_CMPLT_RED	0x0008	/* ACS P2P Completion Redirect */
+#define PCI_ACS_CAP_FORWARD	0x0010	/* ACS Upstream Forwarding */
+#define PCI_ACS_CAP_EGRESS	0x0020	/* ACS P2P Egress Control */
+#define PCI_ACS_CAP_TRANS	0x0040	/* ACS Direct Translated P2P */
+#define PCI_ACS_CAP_VECTOR(x)	(((x) >> 8) & 0xff) /* Egress Control Vector Size */
+#define PCI_ACS_CTRL		0x06	/* ACS Control Register */
+#define PCI_ACS_CTRL_VALID	0x0001	/* ACS Source Validation Enable */
+#define PCI_ACS_CTRL_BLOCK	0x0002	/* ACS Translation Blocking Enable */
+#define PCI_ACS_CTRL_REQ_RED	0x0004	/* ACS P2P Request Redirect Enable */
+#define PCI_ACS_CTRL_CMPLT_RED	0x0008	/* ACS P2P Completion Redirect Enable */
+#define PCI_ACS_CTRL_FORWARD	0x0010	/* ACS Upstream Forwarding Enable */
+#define PCI_ACS_CTRL_EGRESS	0x0020	/* ACS P2P Egress Control Enable */
+#define PCI_ACS_CTRL_TRANS	0x0040	/* ACS Direct Translated P2P Enable */
+#define PCI_ACS_EGRESS_CTRL	0x08	/* Egress Control Vector */
+
+/* Alternative Routing-ID Interpretation */
+#define PCI_ARI_CAP		0x04	/* ARI Capability Register */
+#define  PCI_ARI_CAP_MFVC	0x0001	/* MFVC Function Groups Capability */
+#define  PCI_ARI_CAP_ACS	0x0002	/* ACS Function Groups Capability */
+#define  PCI_ARI_CAP_NFN(x)	(((x) >> 8) & 0xff) /* Next Function Number */
+#define PCI_ARI_CTRL		0x06	/* ARI Control Register */
+#define  PCI_ARI_CTRL_MFVC	0x0001	/* MFVC Function Groups Enable */
+#define  PCI_ARI_CTRL_ACS	0x0002	/* ACS Function Groups Enable */
+#define  PCI_ARI_CTRL_FG(x)	(((x) >> 4) & 7) /* Function Group */
+
+/* Address Translation Service */
+#define PCI_ATS_CAP		0x04	/* ATS Capability Register */
+#define  PCI_ATS_CAP_IQD(x)	((x) & 0x1f) /* Invalidate Queue Depth */
+#define PCI_ATS_CTRL		0x06	/* ATS Control Register */
+#define  PCI_ATS_CTRL_STU(x)	((x) & 0x1f) /* Smallest Translation Unit */
+#define  PCI_ATS_CTRL_ENABLE	0x8000	/* ATS Enable */
+
+/* Single Root I/O Virtualization */
+#define PCI_IOV_CAP		0x04	/* SR-IOV Capability Register */
+#define  PCI_IOV_CAP_VFM	0x00000001 /* VF Migration Capable */
+#define  PCI_IOV_CAP_IMN(x)	((x) >> 21) /* VF Migration Interrupt Message Number */
+#define PCI_IOV_CTRL		0x08	/* SR-IOV Control Register */
+#define  PCI_IOV_CTRL_VFE	0x0001	/* VF Enable */
+#define  PCI_IOV_CTRL_VFME	0x0002	/* VF Migration Enable */
+#define  PCI_IOV_CTRL_VFMIE	0x0004	/* VF Migration Interrupt Enable */
+#define  PCI_IOV_CTRL_MSE	0x0008	/* VF MSE */
+#define  PCI_IOV_CTRL_ARI	0x0010	/* ARI Capable Hierarchy */
+#define PCI_IOV_STATUS		0x0a	/* SR-IOV Status Register */
+#define  PCI_IOV_STATUS_MS	0x0001	/* VF Migration Status */
+#define PCI_IOV_INITIALVF	0x0c	/* Number of VFs that are initially associated */
+#define PCI_IOV_TOTALVF		0x0e	/* Maximum number of VFs that could be associated */
+#define PCI_IOV_NUMVF		0x10	/* Number of VFs that are available */
+#define PCI_IOV_FDL		0x12	/* Function Dependency Link */
+#define PCI_IOV_OFFSET		0x14	/* First VF Offset */
+#define PCI_IOV_STRIDE		0x16	/* Routing ID offset from one VF to the next one */
+#define PCI_IOV_DID		0x1a	/* VF Device ID */
+#define PCI_IOV_SUPPS		0x1c	/* Supported Page Sizes */
+#define PCI_IOV_SYSPS		0x20	/* System Page Size */
+#define PCI_IOV_BAR_BASE	0x24	/* VF BAR0, VF BAR1, ... VF BAR5 */
+#define PCI_IOV_NUM_BAR		6	/* Number of VF BARs */
+#define PCI_IOV_MSAO		0x3c	/* VF Migration State Array Offset */
+#define PCI_IOV_MSA_BIR(x)	((x) & 7) /* VF Migration State BIR */
+#define PCI_IOV_MSA_OFFSET(x)	((x) & 0xfffffff8) /* VF Migration State Offset */
+
+/* Transaction Processing Hints */
+#define PCI_TPH_CAPABILITIES	4
+#define   PCI_TPH_INTVEC_SUP	(1<<1)	/* Supports interrupt vector mode */
+#define   PCI_TPH_DEV_SUP      	(1<<2)	/* Device specific mode supported */
+#define   PCI_TPH_EXT_REQ_SUP	(1<<8)	/* Supports extended requests */
+#define   PCI_TPH_ST_LOC_MASK	(3<<9)	/* Steering table location bits */
+#define     PCI_TPH_ST_NONE	(0<<9)	/* No steering table */
+#define     PCI_TPH_ST_CAP	(1<<9)	/* Steering table in TPH cap */
+#define     PCI_TPH_ST_MSIX	(2<<9)	/* Steering table in MSI-X table */
+#define   PCI_TPH_ST_SIZE_SHIFT	(16)	/* Encoded as size - 1 */
+
+/* Latency Tolerance Reporting */
+#define PCI_LTR_MAX_SNOOP	4	/* 16 bit value */
+#define   PCI_LTR_VALUE_MASK	(0x3ff)
+#define   PCI_LTR_SCALE_SHIFT	(10)
+#define   PCI_LTR_SCALE_MASK	(7)
+#define PCI_LTR_MAX_NOSNOOP	6	/* 16 bit value */
+
+/*
+ * The PCI interface treats multi-function devices as independent
+ * devices.  The slot/function address of each device is encoded
+ * in a single byte as follows:
+ *
+ *	7:3 = slot
+ *	2:0 = function
+ */
+#define PCI_DEVFN(slot,func)	((((slot) & 0x1f) << 3) | ((func) & 0x07))
+#define PCI_SLOT(devfn)		(((devfn) >> 3) & 0x1f)
+#define PCI_FUNC(devfn)		((devfn) & 0x07)
+
+/* Device classes and subclasses */
+
+#define PCI_CLASS_NOT_DEFINED		0x0000
+#define PCI_CLASS_NOT_DEFINED_VGA	0x0001
+
+#define PCI_BASE_CLASS_STORAGE		0x01
+#define PCI_CLASS_STORAGE_SCSI		0x0100
+#define PCI_CLASS_STORAGE_IDE		0x0101
+#define PCI_CLASS_STORAGE_FLOPPY	0x0102
+#define PCI_CLASS_STORAGE_IPI		0x0103
+#define PCI_CLASS_STORAGE_RAID		0x0104
+#define PCI_CLASS_STORAGE_ATA		0x0105
+#define PCI_CLASS_STORAGE_SATA		0x0106
+#define PCI_CLASS_STORAGE_SAS		0x0107
+#define PCI_CLASS_STORAGE_OTHER		0x0180
+
+#define PCI_BASE_CLASS_NETWORK		0x02
+#define PCI_CLASS_NETWORK_ETHERNET	0x0200
+#define PCI_CLASS_NETWORK_TOKEN_RING	0x0201
+#define PCI_CLASS_NETWORK_FDDI		0x0202
+#define PCI_CLASS_NETWORK_ATM		0x0203
+#define PCI_CLASS_NETWORK_ISDN		0x0204
+#define PCI_CLASS_NETWORK_OTHER		0x0280
+
+#define PCI_BASE_CLASS_DISPLAY		0x03
+#define PCI_CLASS_DISPLAY_VGA		0x0300
+#define PCI_CLASS_DISPLAY_XGA		0x0301
+#define PCI_CLASS_DISPLAY_3D		0x0302
+#define PCI_CLASS_DISPLAY_OTHER		0x0380
+
+#define PCI_BASE_CLASS_MULTIMEDIA	0x04
+#define PCI_CLASS_MULTIMEDIA_VIDEO	0x0400
+#define PCI_CLASS_MULTIMEDIA_AUDIO	0x0401
+#define PCI_CLASS_MULTIMEDIA_PHONE	0x0402
+#define PCI_CLASS_MULTIMEDIA_AUDIO_DEV	0x0403
+#define PCI_CLASS_MULTIMEDIA_OTHER	0x0480
+
+#define PCI_BASE_CLASS_MEMORY		0x05
+#define  PCI_CLASS_MEMORY_RAM		0x0500
+#define  PCI_CLASS_MEMORY_FLASH		0x0501
+#define  PCI_CLASS_MEMORY_OTHER		0x0580
+
+#define PCI_BASE_CLASS_BRIDGE		0x06
+#define  PCI_CLASS_BRIDGE_HOST		0x0600
+#define  PCI_CLASS_BRIDGE_ISA		0x0601
+#define  PCI_CLASS_BRIDGE_EISA		0x0602
+#define  PCI_CLASS_BRIDGE_MC		0x0603
+#define  PCI_CLASS_BRIDGE_PCI		0x0604
+#define  PCI_CLASS_BRIDGE_PCMCIA	0x0605
+#define  PCI_CLASS_BRIDGE_NUBUS		0x0606
+#define  PCI_CLASS_BRIDGE_CARDBUS	0x0607
+#define  PCI_CLASS_BRIDGE_RACEWAY	0x0608
+#define  PCI_CLASS_BRIDGE_PCI_SEMI	0x0609
+#define  PCI_CLASS_BRIDGE_IB_TO_PCI	0x060a
+#define  PCI_CLASS_BRIDGE_OTHER		0x0680
+
+#define PCI_BASE_CLASS_COMMUNICATION	0x07
+#define PCI_CLASS_COMMUNICATION_SERIAL	0x0700
+#define PCI_CLASS_COMMUNICATION_PARALLEL 0x0701
+#define PCI_CLASS_COMMUNICATION_MSERIAL	0x0702
+#define PCI_CLASS_COMMUNICATION_MODEM	0x0703
+#define PCI_CLASS_COMMUNICATION_OTHER	0x0780
+
+#define PCI_BASE_CLASS_SYSTEM		0x08
+#define PCI_CLASS_SYSTEM_PIC		0x0800
+#define PCI_CLASS_SYSTEM_DMA		0x0801
+#define PCI_CLASS_SYSTEM_TIMER		0x0802
+#define PCI_CLASS_SYSTEM_RTC		0x0803
+#define PCI_CLASS_SYSTEM_PCI_HOTPLUG	0x0804
+#define PCI_CLASS_SYSTEM_OTHER		0x0880
+
+#define PCI_BASE_CLASS_INPUT		0x09
+#define PCI_CLASS_INPUT_KEYBOARD	0x0900
+#define PCI_CLASS_INPUT_PEN		0x0901
+#define PCI_CLASS_INPUT_MOUSE		0x0902
+#define PCI_CLASS_INPUT_SCANNER		0x0903
+#define PCI_CLASS_INPUT_GAMEPORT	0x0904
+#define PCI_CLASS_INPUT_OTHER		0x0980
+
+#define PCI_BASE_CLASS_DOCKING		0x0a
+#define PCI_CLASS_DOCKING_GENERIC	0x0a00
+#define PCI_CLASS_DOCKING_OTHER		0x0a80
+
+#define PCI_BASE_CLASS_PROCESSOR	0x0b
+#define PCI_CLASS_PROCESSOR_386		0x0b00
+#define PCI_CLASS_PROCESSOR_486		0x0b01
+#define PCI_CLASS_PROCESSOR_PENTIUM	0x0b02
+#define PCI_CLASS_PROCESSOR_ALPHA	0x0b10
+#define PCI_CLASS_PROCESSOR_POWERPC	0x0b20
+#define PCI_CLASS_PROCESSOR_MIPS	0x0b30
+#define PCI_CLASS_PROCESSOR_CO		0x0b40
+
+#define PCI_BASE_CLASS_SERIAL		0x0c
+#define PCI_CLASS_SERIAL_FIREWIRE	0x0c00
+#define PCI_CLASS_SERIAL_ACCESS		0x0c01
+#define PCI_CLASS_SERIAL_SSA		0x0c02
+#define PCI_CLASS_SERIAL_USB		0x0c03
+#define PCI_CLASS_SERIAL_FIBER		0x0c04
+#define PCI_CLASS_SERIAL_SMBUS		0x0c05
+#define PCI_CLASS_SERIAL_INFINIBAND	0x0c06
+
+#define PCI_BASE_CLASS_WIRELESS		0x0d
+#define PCI_CLASS_WIRELESS_IRDA		0x0d00
+#define PCI_CLASS_WIRELESS_CONSUMER_IR	0x0d01
+#define PCI_CLASS_WIRELESS_RF		0x0d10
+#define PCI_CLASS_WIRELESS_OTHER	0x0d80
+
+#define PCI_BASE_CLASS_INTELLIGENT	0x0e
+#define PCI_CLASS_INTELLIGENT_I2O	0x0e00
+
+#define PCI_BASE_CLASS_SATELLITE	0x0f
+#define PCI_CLASS_SATELLITE_TV		0x0f00
+#define PCI_CLASS_SATELLITE_AUDIO	0x0f01
+#define PCI_CLASS_SATELLITE_VOICE	0x0f03
+#define PCI_CLASS_SATELLITE_DATA	0x0f04
+
+#define PCI_BASE_CLASS_CRYPT		0x10
+#define PCI_CLASS_CRYPT_NETWORK		0x1000
+#define PCI_CLASS_CRYPT_ENTERTAINMENT	0x1010
+#define PCI_CLASS_CRYPT_OTHER		0x1080
+
+#define PCI_BASE_CLASS_SIGNAL		0x11
+#define PCI_CLASS_SIGNAL_DPIO		0x1100
+#define PCI_CLASS_SIGNAL_PERF_CTR	0x1101
+#define PCI_CLASS_SIGNAL_SYNCHRONIZER	0x1110
+#define PCI_CLASS_SIGNAL_OTHER		0x1180
+
+#define PCI_CLASS_OTHERS		0xff
+
+/* Several ID's we need in the library */
+
+#define PCI_VENDOR_ID_INTEL		0x8086
+#define PCI_VENDOR_ID_COMPAQ		0x0e11
diff --git a/ext/hwloc/include/pci/pci.h b/ext/hwloc/include/pci/pci.h
new file mode 100644
index 0000000..7a5a6b8
--- /dev/null
+++ b/ext/hwloc/include/pci/pci.h
@@ -0,0 +1,240 @@
+/*
+ *	The PCI Library
+ *
+ *	Copyright (c) 1997--2009 Martin Mares <mj at ucw.cz>
+ *
+ *	Can be freely distributed and used under the terms of the GNU GPL.
+ */
+
+#ifndef _PCI_LIB_H
+#define _PCI_LIB_H
+
+#ifndef PCI_CONFIG_H
+#include "config.h"
+#endif
+
+#include "header.h"
+#include "types.h"
+
+#define PCI_LIB_VERSION 0x030100
+
+#ifndef PCI_ABI
+#define PCI_ABI
+#endif
+
+/*
+ *	PCI Access Structure
+ */
+
+struct pci_methods;
+
+enum pci_access_type {
+  /* Known access methods, remember to update access.c as well */
+  PCI_ACCESS_AUTO,			/* Autodetection */
+  PCI_ACCESS_SYS_BUS_PCI,		/* Linux /sys/bus/pci */
+  PCI_ACCESS_PROC_BUS_PCI,		/* Linux /proc/bus/pci */
+  PCI_ACCESS_I386_TYPE1,		/* i386 ports, type 1 */
+  PCI_ACCESS_I386_TYPE2,		/* i386 ports, type 2 */
+  PCI_ACCESS_FBSD_DEVICE,		/* FreeBSD /dev/pci */
+  PCI_ACCESS_AIX_DEVICE,		/* /dev/pci0, /dev/bus0, etc. */
+  PCI_ACCESS_NBSD_LIBPCI,		/* NetBSD libpci */
+  PCI_ACCESS_OBSD_DEVICE,		/* OpenBSD /dev/pci */
+  PCI_ACCESS_DUMP,			/* Dump file */
+  PCI_ACCESS_MAX
+};
+
+struct pci_access {
+  /* Options you can change: */
+  unsigned int method;			/* Access method */
+  int writeable;			/* Open in read/write mode */
+  int buscentric;			/* Bus-centric view of the world */
+
+  char *id_file_name;			/* Name of ID list file (use pci_set_name_list_path()) */
+  int free_id_name;			/* Set if id_file_name is malloced */
+  int numeric_ids;			/* Enforce PCI_LOOKUP_NUMERIC (>1 => PCI_LOOKUP_MIXED) */
+
+  unsigned int id_lookup_mode;		/* pci_lookup_mode flags which are set automatically */
+					/* Default: PCI_LOOKUP_CACHE */
+
+  int debugging;			/* Turn on debugging messages */
+
+  /* Functions you can override: */
+  void (*error)(char *msg, ...) PCI_PRINTF(1,2);	/* Write error message and quit */
+  void (*warning)(char *msg, ...) PCI_PRINTF(1,2);	/* Write a warning message */
+  void (*debug)(char *msg, ...) PCI_PRINTF(1,2);	/* Write a debugging message */
+
+  struct pci_dev *devices;		/* Devices found on this bus */
+
+  /* Fields used internally: */
+  struct pci_methods *methods;
+  struct pci_param *params;
+  struct id_entry **id_hash;		/* names.c */
+  struct id_bucket *current_id_bucket;
+  int id_load_failed;
+  int id_cache_status;			/* 0=not read, 1=read, 2=dirty */
+  int fd;				/* proc/sys: fd for config space */
+  int fd_rw;				/* proc/sys: fd opened read-write */
+  int fd_pos;				/* proc/sys: current position */
+  int fd_vpd;				/* sys: fd for VPD */
+  struct pci_dev *cached_dev;		/* proc/sys: device the fds are for */
+};
+
+/* Initialize PCI access */
+struct pci_access *pci_alloc(void) PCI_ABI;
+void pci_init(struct pci_access *) PCI_ABI;
+void pci_cleanup(struct pci_access *) PCI_ABI;
+
+/* Scanning of devices */
+void pci_scan_bus(struct pci_access *acc) PCI_ABI;
+struct pci_dev *pci_get_dev(struct pci_access *acc, int domain, int bus, int dev, int func) PCI_ABI; /* Raw access to specified device */
+void pci_free_dev(struct pci_dev *) PCI_ABI;
+
+/* Names of access methods */
+int pci_lookup_method(char *name) PCI_ABI;	/* Returns -1 if not found */
+char *pci_get_method_name(int index) PCI_ABI;	/* Returns "" if unavailable, NULL if index out of range */
+
+/*
+ *	Named parameters
+ */
+
+struct pci_param {
+  struct pci_param *next;		/* Please use pci_walk_params() for traversing the list */
+  char *param;				/* Name of the parameter */
+  char *value;				/* Value of the parameter */
+  int value_malloced;			/* used internally */
+  char *help;				/* Explanation of the parameter */
+};
+
+char *pci_get_param(struct pci_access *acc, char *param) PCI_ABI;
+int pci_set_param(struct pci_access *acc, char *param, char *value) PCI_ABI;	/* 0 on success, -1 if no such parameter */
+/* To traverse the list, call pci_walk_params repeatedly, first with prev=NULL, and do not modify the parameters during traversal. */
+struct pci_param *pci_walk_params(struct pci_access *acc, struct pci_param *prev) PCI_ABI;
+
+/*
+ *	Devices
+ */
+
+struct pci_dev {
+  struct pci_dev *next;			/* Next device in the chain */
+  u16 domain;				/* PCI domain (host bridge) */
+  u8 bus, dev, func;			/* Bus inside domain, device and function */
+
+  /* These fields are set by pci_fill_info() */
+  int known_fields;			/* Set of info fields already known */
+  u16 vendor_id, device_id;		/* Identity of the device */
+  u16 device_class;			/* PCI device class */
+  int irq;				/* IRQ number */
+  pciaddr_t base_addr[6];		/* Base addresses including flags in lower bits */
+  pciaddr_t size[6];			/* Region sizes */
+  pciaddr_t rom_base_addr;		/* Expansion ROM base address */
+  pciaddr_t rom_size;			/* Expansion ROM size */
+  struct pci_cap *first_cap;		/* List of capabilities */
+  char *phy_slot;			/* Physical slot */
+
+  /* Fields used internally: */
+  struct pci_access *access;
+  struct pci_methods *methods;
+  u8 *cache;				/* Cached config registers */
+  int cache_len;
+  int hdrtype;				/* Cached low 7 bits of header type, -1 if unknown */
+  void *aux;				/* Auxiliary data */
+};
+
+#define PCI_ADDR_IO_MASK (~(pciaddr_t) 0x3)
+#define PCI_ADDR_MEM_MASK (~(pciaddr_t) 0xf)
+#define PCI_ADDR_FLAG_MASK 0xf
+
+u8 pci_read_byte(struct pci_dev *, int pos) PCI_ABI; /* Access to configuration space */
+u16 pci_read_word(struct pci_dev *, int pos) PCI_ABI;
+u32 pci_read_long(struct pci_dev *, int pos) PCI_ABI;
+int pci_read_block(struct pci_dev *, int pos, u8 *buf, int len) PCI_ABI;
+int pci_read_vpd(struct pci_dev *d, int pos, u8 *buf, int len) PCI_ABI;
+int pci_write_byte(struct pci_dev *, int pos, u8 data) PCI_ABI;
+int pci_write_word(struct pci_dev *, int pos, u16 data) PCI_ABI;
+int pci_write_long(struct pci_dev *, int pos, u32 data) PCI_ABI;
+int pci_write_block(struct pci_dev *, int pos, u8 *buf, int len) PCI_ABI;
+
+int pci_fill_info(struct pci_dev *, int flags) PCI_ABI; /* Fill in device information */
+
+#define PCI_FILL_IDENT		1
+#define PCI_FILL_IRQ		2
+#define PCI_FILL_BASES		4
+#define PCI_FILL_ROM_BASE	8
+#define PCI_FILL_SIZES		16
+#define PCI_FILL_CLASS		32
+#define PCI_FILL_CAPS		64
+#define PCI_FILL_EXT_CAPS	128
+#define PCI_FILL_PHYS_SLOT	256
+#define PCI_FILL_RESCAN		0x10000
+
+void pci_setup_cache(struct pci_dev *, u8 *cache, int len) PCI_ABI;
+
+/*
+ *	Capabilities
+ */
+
+struct pci_cap {
+  struct pci_cap *next;
+  u16 id;				/* PCI_CAP_ID_xxx */
+  u16 type;				/* PCI_CAP_xxx */
+  unsigned int addr;			/* Position in the config space */
+};
+
+#define PCI_CAP_NORMAL		1	/* Traditional PCI capabilities */
+#define PCI_CAP_EXTENDED	2	/* PCIe extended capabilities */
+
+struct pci_cap *pci_find_cap(struct pci_dev *, unsigned int id, unsigned int type) PCI_ABI;
+
+/*
+ *	Filters
+ */
+
+struct pci_filter {
+  int domain, bus, slot, func;			/* -1 = ANY */
+  int vendor, device;
+};
+
+void pci_filter_init(struct pci_access *, struct pci_filter *) PCI_ABI;
+char *pci_filter_parse_slot(struct pci_filter *, char *) PCI_ABI;
+char *pci_filter_parse_id(struct pci_filter *, char *) PCI_ABI;
+int pci_filter_match(struct pci_filter *, struct pci_dev *) PCI_ABI;
+
+/*
+ *	Conversion of PCI ID's to names (according to the pci.ids file)
+ *
+ *	Call pci_lookup_name() to identify different types of ID's:
+ *
+ *	VENDOR				(vendorID) -> vendor
+ *	DEVICE				(vendorID, deviceID) -> device
+ *	VENDOR | DEVICE			(vendorID, deviceID) -> combined vendor and device
+ *	SUBSYSTEM | VENDOR		(subvendorID) -> subsystem vendor
+ *	SUBSYSTEM | DEVICE		(vendorID, deviceID, subvendorID, subdevID) -> subsystem device
+ *	SUBSYSTEM | VENDOR | DEVICE	(vendorID, deviceID, subvendorID, subdevID) -> combined subsystem v+d
+ *	SUBSYSTEM | ...			(-1, -1, subvendorID, subdevID) -> generic subsystem
+ *	CLASS				(classID) -> class
+ *	PROGIF				(classID, progif) -> programming interface
+ */
+
+char *pci_lookup_name(struct pci_access *a, char *buf, int size, int flags, ...) PCI_ABI;
+
+int pci_load_name_list(struct pci_access *a) PCI_ABI;	/* Called automatically by pci_lookup_*() when needed; returns success */
+void pci_free_name_list(struct pci_access *a) PCI_ABI;	/* Called automatically by pci_cleanup() */
+void pci_set_name_list_path(struct pci_access *a, char *name, int to_be_freed) PCI_ABI;
+void pci_id_cache_flush(struct pci_access *a) PCI_ABI;
+
+enum pci_lookup_mode {
+  PCI_LOOKUP_VENDOR = 1,		/* Vendor name (args: vendorID) */
+  PCI_LOOKUP_DEVICE = 2,		/* Device name (args: vendorID, deviceID) */
+  PCI_LOOKUP_CLASS = 4,			/* Device class (args: classID) */
+  PCI_LOOKUP_SUBSYSTEM = 8,		/* Subsystem modifier: OR with VENDOR and/or DEVICE to look up subsystem names */
+  PCI_LOOKUP_PROGIF = 16,		/* Programming interface (args: classID, prog_if) */
+  PCI_LOOKUP_NUMERIC = 0x10000,		/* Want only formatted numbers; default if access->numeric_ids is set */
+  PCI_LOOKUP_NO_NUMBERS = 0x20000,	/* Return NULL if not found in the database; default is to print numerically */
+  PCI_LOOKUP_MIXED = 0x40000,		/* Include both numbers and names */
+  PCI_LOOKUP_NETWORK = 0x80000,		/* Try to resolve unknown ID's by DNS */
+  PCI_LOOKUP_SKIP_LOCAL = 0x100000,	/* Do not consult local database */
+  PCI_LOOKUP_CACHE = 0x200000,		/* Consult the local cache before using DNS */
+  PCI_LOOKUP_REFRESH_CACHE = 0x400000,	/* Forget all previously cached entries, but still allow updating the cache */
+};
+
+#endif
diff --git a/ext/hwloc/include/pci/types.h b/ext/hwloc/include/pci/types.h
new file mode 100644
index 0000000..4d23e69
--- /dev/null
+++ b/ext/hwloc/include/pci/types.h
@@ -0,0 +1,65 @@
+/*
+ *	The PCI Library -- Types and Format Strings
+ *
+ *	Copyright (c) 1997--2008 Martin Mares <mj at ucw.cz>
+ *
+ *	Can be freely distributed and used under the terms of the GNU GPL.
+ */
+
+#include <sys/types.h>
+
+#ifndef PCI_HAVE_Uxx_TYPES
+
+#ifdef PCI_OS_WINDOWS
+#include <windef.h>
+typedef BYTE u8;
+typedef WORD u16;
+typedef DWORD u32;
+#elif defined(PCI_HAVE_STDINT_H) || (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L)
+#include <stdint.h>
+typedef uint8_t u8;
+typedef uint16_t u16;
+typedef uint32_t u32;
+#else	/* neither Windows nor <stdint.h>: use the u_intN_t names (presumably provided by <sys/types.h> -- confirm) */
+typedef u_int8_t u8;
+typedef u_int16_t u16;
+typedef u_int32_t u32;
+#endif	/* PCI_OS_WINDOWS / <stdint.h> / fallback */
+
+#ifdef PCI_HAVE_64BIT_ADDRESS
+#include <limits.h>
+#if ULONG_MAX > 0xffffffff
+typedef unsigned long u64;
+#define PCI_U64_FMT "l"
+#else	/* unsigned long is not wider than 32 bits */
+typedef unsigned long long u64;
+#define PCI_U64_FMT "ll"
+#endif	/* ULONG_MAX > 0xffffffff */
+#endif	/* PCI_HAVE_64BIT_ADDRESS */
+
+#endif	/* PCI_HAVE_Uxx_TYPES */
+
+#ifdef PCI_HAVE_64BIT_ADDRESS
+typedef u64 pciaddr_t;
+#define PCIADDR_T_FMT "%08" PCI_U64_FMT "x"
+#define PCIADDR_PORT_FMT "%04" PCI_U64_FMT "x"
+#else	/* addresses are stored in a u32 */
+typedef u32 pciaddr_t;
+#define PCIADDR_T_FMT "%08x"
+#define PCIADDR_PORT_FMT "%04x"
+#endif	/* PCI_HAVE_64BIT_ADDRESS */
+
+#ifdef PCI_ARCH_SPARC64
+/* On sparc64 Linux the kernel reports remapped port addresses and IRQ numbers */
+#undef PCIADDR_PORT_FMT
+#define PCIADDR_PORT_FMT PCIADDR_T_FMT
+#define PCIIRQ_FMT "%08x"
+#else
+#define PCIIRQ_FMT "%d"
+#endif	/* PCI_ARCH_SPARC64 */
+
+#if defined(__GNUC__) && __GNUC__ > 2
+#define PCI_PRINTF(x,y) __attribute__((format(printf, x, y)))
+#else	/* no format attribute: PCI_PRINTF expands to nothing */
+#define PCI_PRINTF(x,y)
+#endif	/* __GNUC__ > 2 */
diff --git a/ext/hwloc/include/private/autogen/README.txt b/ext/hwloc/include/private/autogen/README.txt
new file mode 100644
index 0000000..17f7f60
--- /dev/null
+++ b/ext/hwloc/include/private/autogen/README.txt
@@ -0,0 +1,3 @@
+This directory needs to exist in the repo so that the Autotools can
+generate a file here.  We have put a token file in this directory so
+that git doesn't ignore the empty directory in the repository.
diff --git a/ext/hwloc/include/private/autogen/config.h b/ext/hwloc/include/private/autogen/config.h
new file mode 100644
index 0000000..633e7ef
--- /dev/null
+++ b/ext/hwloc/include/private/autogen/config.h
@@ -0,0 +1,761 @@
+/* include/private/autogen/config.h.  Generated from config.h.in by configure.  */
+/* include/private/autogen/config.h.in.  Generated from configure.ac by autoheader.  */
+
+/* -*- c -*-
+ *
+ * Copyright © 2009, 2011, 2012 CNRS, inria., Université Bordeaux  All rights reserved.
+ * Copyright © 2009-2014 Cisco Systems, Inc.  All rights reserved.
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ *
+ * This file is automatically generated by configure.  Edits will be lost
+ * the next time you run configure!
+ */
+
+#ifndef HWLOC_CONFIGURE_H
+#define HWLOC_CONFIGURE_H
+
+
+/* Define to 1 if gcc's __atomic builtins are available */
+/* #undef HAVE_ATOMIC_BUILTINS */
+
+/* Define to 1 if the system has the type `CACHE_DESCRIPTOR'. */
+/* #undef HAVE_CACHE_DESCRIPTOR */
+
+/* Define to 1 if the system has the type `CACHE_RELATIONSHIP'. */
+/* #undef HAVE_CACHE_RELATIONSHIP */
+
+/* Define to 1 if you have the `close' function. */
+#define HAVE_CLOSE 1
+
+/* Define to 1 if you have the `clz' function. */
+/* #undef HAVE_CLZ */
+
+/* Define to 1 if you have the `clzl' function. */
+/* #undef HAVE_CLZL */
+
+/* Define to 1 if you have the <CL/cl_ext.h> header file. */
+/* #undef HAVE_CL_CL_EXT_H */
+
+/* Define to 1 if you have the `cpuset_setaffinity' function. */
+/* #undef HAVE_CPUSET_SETAFFINITY */
+
+/* Define to 1 if you have the `cpuset_setid' function. */
+/* #undef HAVE_CPUSET_SETID */
+
+/* Define to 1 if you have the <ctype.h> header file. */
+#define HAVE_CTYPE_H 1
+
+/* Define to 1 if we have -lcuda */
+/* #undef HAVE_CUDA */
+
+/* Define to 1 if you have the <cuda.h> header file. */
+/* #undef HAVE_CUDA_H */
+
+/* Define to 1 if you have the <cuda_runtime_api.h> header file. */
+/* #undef HAVE_CUDA_RUNTIME_API_H */
+
+/* Define to 1 if you have the declaration of `CL_DEVICE_TOPOLOGY_AMD', and to
+   0 if you don't. */
+/* #undef HAVE_DECL_CL_DEVICE_TOPOLOGY_AMD */
+
+/* Define to 1 if you have the declaration of `CTL_HW', and to 0 if you don't.
+   */
+#define HAVE_DECL_CTL_HW 0
+
+/* Define to 1 if you have the declaration of `fabsf', and to 0 if you don't.
+   */
+#define HAVE_DECL_FABSF 1
+
+/* Define to 1 if you have the declaration of `getexecname', and to 0 if you
+   don't. */
+#define HAVE_DECL_GETEXECNAME 0
+
+/* Define to 1 if you have the declaration of `GetModuleFileName', and to 0 if
+   you don't. */
+#define HAVE_DECL_GETMODULEFILENAME 0
+
+/* Define to 1 if you have the declaration of `getprogname', and to 0 if you
+   don't. */
+#define HAVE_DECL_GETPROGNAME 0
+
+/* Define to 1 if you have the declaration of `HW_NCPU', and to 0 if you
+   don't. */
+#define HAVE_DECL_HW_NCPU 0
+
+/* Define to 1 if you have the declaration of
+   `nvmlDeviceGetMaxPcieLinkGeneration', and to 0 if you don't. */
+/* #undef HAVE_DECL_NVMLDEVICEGETMAXPCIELINKGENERATION */
+
+/* Define to 1 if you have the declaration of `pthread_getaffinity_np', and to
+   0 if you don't. */
+#define HAVE_DECL_PTHREAD_GETAFFINITY_NP 1
+
+/* Define to 1 if you have the declaration of `pthread_setaffinity_np', and to
+   0 if you don't. */
+#define HAVE_DECL_PTHREAD_SETAFFINITY_NP 1
+
+/* Define to 1 if you have the declaration of `strtoull', and to 0 if you
+   don't. */
+#define HAVE_DECL_STRTOULL 1
+
+/* Define to 1 if you have the declaration of `_SC_LARGE_PAGESIZE', and to 0
+   if you don't. */
+#define HAVE_DECL__SC_LARGE_PAGESIZE 0
+
+/* Define to 1 if you have the declaration of `_SC_NPROCESSORS_CONF', and to 0
+   if you don't. */
+#define HAVE_DECL__SC_NPROCESSORS_CONF 1
+
+/* Define to 1 if you have the declaration of `_SC_NPROCESSORS_ONLN', and to 0
+   if you don't. */
+#define HAVE_DECL__SC_NPROCESSORS_ONLN 1
+
+/* Define to 1 if you have the declaration of `_SC_NPROC_CONF', and to 0 if
+   you don't. */
+#define HAVE_DECL__SC_NPROC_CONF 0
+
+/* Define to 1 if you have the declaration of `_SC_NPROC_ONLN', and to 0 if
+   you don't. */
+#define HAVE_DECL__SC_NPROC_ONLN 0
+
+/* Define to 1 if you have the declaration of `_SC_PAGESIZE', and to 0 if you
+   don't. */
+#define HAVE_DECL__SC_PAGESIZE 1
+
+/* Define to 1 if you have the declaration of `_SC_PAGE_SIZE', and to 0 if you
+   don't. */
+#define HAVE_DECL__SC_PAGE_SIZE 1
+
+/* Define to 1 if you have the <dirent.h> header file. */
+#define HAVE_DIRENT_H 1
+
+/* Define to 1 if you have the <dlfcn.h> header file. */
+#define HAVE_DLFCN_H 1
+
+/* Define to 1 if you have the <endian.h> header file. */
+#define HAVE_ENDIAN_H 1
+
+/* Define to 1 if you have the <fcntl.h> header file. */
+#define HAVE_FCNTL_H 1
+
+/* Define to 1 if you have the `ffs' function. */
+#define HAVE_FFS 1
+
+/* Define to 1 if you have the `ffsl' function. */
+#define HAVE_FFSL 1
+
+/* Define to 1 if you have the `fls' function. */
+/* #undef HAVE_FLS */
+
+/* Define to 1 if you have the `flsl' function. */
+/* #undef HAVE_FLSL */
+
+/* Define to 1 if you have the `getpagesize' function. */
+#define HAVE_GETPAGESIZE 1
+
+/* Define to 1 if you have the `getpid' function. */
+#define HAVE_GETPID 1
+
+/* Define to 1 if you have the `gettimeofday' function. */
+#define HAVE_GETTIMEOFDAY 1
+
+/* Define to 1 if the system has the type `GROUP_AFFINITY'. */
+/* #undef HAVE_GROUP_AFFINITY */
+
+/* Define to 1 if the system has the type `GROUP_RELATIONSHIP'. */
+/* #undef HAVE_GROUP_RELATIONSHIP */
+
+/* Define to 1 if you have the `host_info' function. */
+/* #undef HAVE_HOST_INFO */
+
+/* Define to 1 if you have the <infiniband/verbs.h> header file. */
+/* #undef HAVE_INFINIBAND_VERBS_H */
+
+/* Define to 1 if you have the <inttypes.h> header file. */
+#define HAVE_INTTYPES_H 1
+
+/* Define to 1 if the system has the type `KAFFINITY'. */
+/* #undef HAVE_KAFFINITY */
+
+/* Define to 1 if you have the <kstat.h> header file. */
+/* #undef HAVE_KSTAT_H */
+
+/* Define to 1 if you have the <langinfo.h> header file. */
+#define HAVE_LANGINFO_H 1
+
+/* Define to 1 if we have -lgdi32 */
+/* #undef HAVE_LIBGDI32 */
+
+/* Define to 1 if we have -libverbs */
+/* #undef HAVE_LIBIBVERBS */
+
+/* Define to 1 if we have -lkstat */
+/* #undef HAVE_LIBKSTAT */
+
+/* Define to 1 if we have -llgrp */
+/* #undef HAVE_LIBLGRP */
+
+/* Define to 1 if you have the <libudev.h> header file. */
+/* #undef HAVE_LIBUDEV_H */
+
+/* Define to 1 if you have the `localeconv' function. */
+#define HAVE_LOCALECONV 1
+
+/* Define to 1 if you have the <locale.h> header file. */
+#define HAVE_LOCALE_H 1
+
+/* Define to 1 if the system has the type `LOGICAL_PROCESSOR_RELATIONSHIP'. */
+/* #undef HAVE_LOGICAL_PROCESSOR_RELATIONSHIP */
+
+/* Define to 1 if the system has the type 'long long int'. */
+#define HAVE_LONG_LONG_INT 1
+
+/* Define to 1 if you have the <mach/mach_host.h> header file. */
+/* #undef HAVE_MACH_MACH_HOST_H */
+
+/* Define to 1 if you have the <mach/mach_init.h> header file. */
+/* #undef HAVE_MACH_MACH_INIT_H */
+
+/* Define to 1 if you have the <malloc.h> header file. */
+#define HAVE_MALLOC_H 1
+
+/* Define to 1 if you have the `memalign' function. */
+#define HAVE_MEMALIGN 1
+
+/* Define to 1 if you have the <memory.h> header file. */
+#define HAVE_MEMORY_H 1
+
+/* Define to 1 if we have -lmyriexpress */
+/* #undef HAVE_MYRIEXPRESS */
+
+/* Define to 1 if you have the <myriexpress.h> header file. */
+/* #undef HAVE_MYRIEXPRESS_H */
+
+/* Define to 1 if you have the `nl_langinfo' function. */
+#define HAVE_NL_LANGINFO 1
+
+/* Define to 1 if you have the <numaif.h> header file. */
+/* #undef HAVE_NUMAIF_H */
+
+/* Define to 1 if the system has the type `NUMA_NODE_RELATIONSHIP'. */
+/* #undef HAVE_NUMA_NODE_RELATIONSHIP */
+
+/* Define to 1 if you have the <NVCtrl/NVCtrl.h> header file. */
+/* #undef HAVE_NVCTRL_NVCTRL_H */
+
+/* Define to 1 if you have the <nvml.h> header file. */
+/* #undef HAVE_NVML_H */
+
+/* Define to 1 if you have the `open' function. */
+#define HAVE_OPEN 1
+
+/* Define to 1 if you have the `openat' function. */
+#define HAVE_OPENAT 1
+
+/* Define to 1 if you have the <picl.h> header file. */
+/* #undef HAVE_PICL_H */
+
+/* Define to 1 if you have the `posix_memalign' function. */
+#define HAVE_POSIX_MEMALIGN 1
+
+/* Define to 1 if the system has the type `PROCESSOR_CACHE_TYPE'. */
+/* #undef HAVE_PROCESSOR_CACHE_TYPE */
+
+/* Define to 1 if the system has the type `PROCESSOR_GROUP_INFO'. */
+/* #undef HAVE_PROCESSOR_GROUP_INFO */
+
+/* Define to 1 if the system has the type `PROCESSOR_RELATIONSHIP'. */
+/* #undef HAVE_PROCESSOR_RELATIONSHIP */
+
+/* Define to '1' if program_invocation_name is present and usable */
+#define HAVE_PROGRAM_INVOCATION_NAME 1
+
+/* Define to 1 if the system has the type `PSAPI_WORKING_SET_EX_BLOCK'. */
+/* #undef HAVE_PSAPI_WORKING_SET_EX_BLOCK */
+
+/* Define to 1 if the system has the type `PSAPI_WORKING_SET_EX_INFORMATION'.
+   */
+/* #undef HAVE_PSAPI_WORKING_SET_EX_INFORMATION */
+
+/* Define to 1 if you have the <pthread_np.h> header file. */
+/* #undef HAVE_PTHREAD_NP_H */
+
+/* Define to 1 if the system has the type `pthread_t'. */
+#define HAVE_PTHREAD_T 1
+
+/* Define to 1 if you have the `putwc' function. */
+#define HAVE_PUTWC 1
+
+/* Define to 1 if you have the `read' function. */
+#define HAVE_READ 1
+
+/* Define to 1 if the system has the type `RelationProcessorPackage'. */
+/* #undef HAVE_RELATIONPROCESSORPACKAGE */
+
+/* Define to 1 if you have the <sched.h> header file. */
+#define HAVE_SCHED_H 1
+
+/* Define to 1 if you have the `sched_yield' function. */
+#define HAVE_SCHED_YIELD 1
+
+/* Define to 1 if you have the `setlocale' function. */
+#define HAVE_SETLOCALE 1
+
+/* Define to 1 if you have the <stdint.h> header file. */
+#define HAVE_STDINT_H 1
+
+/* Define to 1 if you have the <stdlib.h> header file. */
+#define HAVE_STDLIB_H 1
+
+/* Define to 1 if you have the `strftime' function. */
+#define HAVE_STRFTIME 1
+
+/* Define to 1 if you have the <strings.h> header file. */
+#define HAVE_STRINGS_H 1
+
+/* Define to 1 if you have the <string.h> header file. */
+#define HAVE_STRING_H 1
+
+/* Define to 1 if you have the `strncasecmp' function. */
+#define HAVE_STRNCASECMP 1
+
+/* Define to 1 if you have the `strtoll' function. */
+#define HAVE_STRTOLL 1
+
+/* Define to 1 if gcc's __sync builtins are available */
+#define HAVE_SYNC_BUILTINS 1
+
+/* Define to '1' if sysctl is present and usable */
+#define HAVE_SYSCTL 1
+
+/* Define to '1' if sysctlbyname is present and usable */
+/* #undef HAVE_SYSCTLBYNAME */
+
+/* Define to 1 if the system has the type
+   `SYSTEM_LOGICAL_PROCESSOR_INFORMATION'. */
+/* #undef HAVE_SYSTEM_LOGICAL_PROCESSOR_INFORMATION */
+
+/* Define to 1 if the system has the type
+   `SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX'. */
+/* #undef HAVE_SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX */
+
+/* Define to 1 if you have the <sys/cpuset.h> header file. */
+/* #undef HAVE_SYS_CPUSET_H */
+
+/* Define to 1 if you have the <sys/lgrp_user.h> header file. */
+/* #undef HAVE_SYS_LGRP_USER_H */
+
+/* Define to 1 if you have the <sys/mman.h> header file. */
+#define HAVE_SYS_MMAN_H 1
+
+/* Define to 1 if you have the <sys/param.h> header file. */
+#define HAVE_SYS_PARAM_H 1
+
+/* Define to 1 if you have the <sys/stat.h> header file. */
+#define HAVE_SYS_STAT_H 1
+
+/* Define to 1 if you have the <sys/sysctl.h> header file. */
+#define HAVE_SYS_SYSCTL_H 1
+
+/* Define to 1 if you have the <sys/time.h> header file. */
+#define HAVE_SYS_TIME_H 1
+
+/* Define to 1 if you have the <sys/types.h> header file. */
+#define HAVE_SYS_TYPES_H 1
+
+/* Define to 1 if you have the <sys/utsname.h> header file. */
+#define HAVE_SYS_UTSNAME_H 1
+
+/* Define to 1 if you have the `uname' function. */
+#define HAVE_UNAME 1
+
+/* Define to 1 if you have the <unistd.h> header file. */
+#define HAVE_UNISTD_H 1
+
+/* Define to 1 if the system has the type 'unsigned long long int'. */
+#define HAVE_UNSIGNED_LONG_LONG_INT 1
+
+/* Define to 1 if you have the `uselocale' function. */
+#define HAVE_USELOCALE 1
+
+/* Define to 1 if the system has the type `wchar_t'. */
+#define HAVE_WCHAR_T 1
+
+/* Define to 1 if you have the <X11/keysym.h> header file. */
+#define HAVE_X11_KEYSYM_H 1
+
+/* Define to 1 if you have the <X11/Xlib.h> header file. */
+#define HAVE_X11_XLIB_H 1
+
+/* Define to 1 if you have the <X11/Xutil.h> header file. */
+#define HAVE_X11_XUTIL_H 1
+
+/* Define to 1 if you have the <xlocale.h> header file. */
+#define HAVE_XLOCALE_H 1
+
+/* Define to '1' if __progname is present and usable */
+#define HAVE___PROGNAME 1
+
+/* Define to 1 on AIX */
+/* #undef HWLOC_AIX_SYS */
+
+/* Define to 1 on BlueGene/Q */
+/* #undef HWLOC_BGQ_SYS */
+
+/* Whether C compiler supports symbol visibility or not */
+#define HWLOC_C_HAVE_VISIBILITY 1
+
+/* Define to 1 on Darwin */
+/* #undef HWLOC_DARWIN_SYS */
+
+/* Whether we are in debugging mode or not */
+/* #undef HWLOC_DEBUG */
+
+/* Define to 1 on *FREEBSD */
+/* #undef HWLOC_FREEBSD_SYS */
+
+/* Whether your compiler has __attribute__ or not */
+#define HWLOC_HAVE_ATTRIBUTE 1
+
+/* Whether your compiler has __attribute__ aligned or not */
+#define HWLOC_HAVE_ATTRIBUTE_ALIGNED 1
+
+/* Whether your compiler has __attribute__ always_inline or not */
+#define HWLOC_HAVE_ATTRIBUTE_ALWAYS_INLINE 1
+
+/* Whether your compiler has __attribute__ cold or not */
+#define HWLOC_HAVE_ATTRIBUTE_COLD 1
+
+/* Whether your compiler has __attribute__ const or not */
+#define HWLOC_HAVE_ATTRIBUTE_CONST 1
+
+/* Whether your compiler has __attribute__ deprecated or not */
+#define HWLOC_HAVE_ATTRIBUTE_DEPRECATED 1
+
+/* Whether your compiler has __attribute__ format or not */
+#define HWLOC_HAVE_ATTRIBUTE_FORMAT 1
+
+/* Whether your compiler has __attribute__ hot or not */
+#define HWLOC_HAVE_ATTRIBUTE_HOT 1
+
+/* Whether your compiler has __attribute__ malloc or not */
+#define HWLOC_HAVE_ATTRIBUTE_MALLOC 1
+
+/* Whether your compiler has __attribute__ may_alias or not */
+#define HWLOC_HAVE_ATTRIBUTE_MAY_ALIAS 1
+
+/* Whether your compiler has __attribute__ nonnull or not */
+#define HWLOC_HAVE_ATTRIBUTE_NONNULL 1
+
+/* Whether your compiler has __attribute__ noreturn or not */
+#define HWLOC_HAVE_ATTRIBUTE_NORETURN 1
+
+/* Whether your compiler has __attribute__ no_instrument_function or not */
+#define HWLOC_HAVE_ATTRIBUTE_NO_INSTRUMENT_FUNCTION 1
+
+/* Whether your compiler has __attribute__ packed or not */
+#define HWLOC_HAVE_ATTRIBUTE_PACKED 1
+
+/* Whether your compiler has __attribute__ pure or not */
+#define HWLOC_HAVE_ATTRIBUTE_PURE 1
+
+/* Whether your compiler has __attribute__ sentinel or not */
+#define HWLOC_HAVE_ATTRIBUTE_SENTINEL 1
+
+/* Whether your compiler has __attribute__ unused or not */
+#define HWLOC_HAVE_ATTRIBUTE_UNUSED 1
+
+/* Whether your compiler has __attribute__ warn unused result or not */
+#define HWLOC_HAVE_ATTRIBUTE_WARN_UNUSED_RESULT 1
+
+/* Whether your compiler has __attribute__ weak alias or not */
+#define HWLOC_HAVE_ATTRIBUTE_WEAK_ALIAS 1
+
+/* Define to 1 if your `ffs' function is known to be broken. */
+/* #undef HWLOC_HAVE_BROKEN_FFS */
+
+/* Define to 1 if you have the `cairo' library. */
+#define HWLOC_HAVE_CAIRO 1
+
+/* Define to 1 if you have the `clz' function. */
+/* #undef HWLOC_HAVE_CLZ */
+
+/* Define to 1 if you have the `clzl' function. */
+/* #undef HWLOC_HAVE_CLZL */
+
+/* Define to 1 if the CPU_SET macro works */
+#define HWLOC_HAVE_CPU_SET 1
+
+/* Define to 1 if the CPU_SET_S macro works */
+#define HWLOC_HAVE_CPU_SET_S 1
+
+/* Define to 1 if you have the `cudart' SDK. */
+/* #undef HWLOC_HAVE_CUDART */
+
+/* Define to 1 if function `clz' is declared by system headers */
+/* #undef HWLOC_HAVE_DECL_CLZ */
+
+/* Define to 1 if function `clzl' is declared by system headers */
+/* #undef HWLOC_HAVE_DECL_CLZL */
+
+/* Define to 1 if function `ffs' is declared by system headers */
+#define HWLOC_HAVE_DECL_FFS 1
+
+/* Define to 1 if function `ffsl' is declared by system headers */
+#define HWLOC_HAVE_DECL_FFSL 1
+
+/* Define to 1 if function `fls' is declared by system headers */
+/* #undef HWLOC_HAVE_DECL_FLS */
+
+/* Define to 1 if function `flsl' is declared by system headers */
+/* #undef HWLOC_HAVE_DECL_FLSL */
+
+/* Define to 1 if function `strncasecmp' is declared by system headers */
+#define HWLOC_HAVE_DECL_STRNCASECMP 1
+
+/* Define to 1 if you have the `ffs' function. */
+#define HWLOC_HAVE_FFS 1
+
+/* Define to 1 if you have the `ffsl' function. */
+#define HWLOC_HAVE_FFSL 1
+
+/* Define to 1 if you have the `fls' function. */
+/* #undef HWLOC_HAVE_FLS */
+
+/* Define to 1 if you have the `flsl' function. */
+/* #undef HWLOC_HAVE_FLSL */
+
+/* Define to 1 if you have the GL module components. */
+/* #undef HWLOC_HAVE_GL */
+
+/* Define to 1 if you have a library providing the termcap interface */
+/* #undef HWLOC_HAVE_LIBTERMCAP */
+
+/* Define to 1 if you have the `libxml2' library. */
+/* #undef HWLOC_HAVE_LIBXML2 */
+
+/* Define to 1 if building the Linux PCI component */
+#define HWLOC_HAVE_LINUXPCI 1
+
+/* Define to 1 if mbind is available. */
+/* #undef HWLOC_HAVE_MBIND */
+
+/* Define to 1 if migrate_pages is available. */
+/* #undef HWLOC_HAVE_MIGRATE_PAGES */
+
+/* Define to 1 if you have the `NVML' library. */
+/* #undef HWLOC_HAVE_NVML */
+
+/* Define to 1 if glibc provides the old prototype (without length) of
+   sched_setaffinity() */
+/* #undef HWLOC_HAVE_OLD_SCHED_SETAFFINITY */
+
+/* Define to 1 if you have the `OpenCL' library. */
+/* #undef HWLOC_HAVE_OPENCL */
+
+/* Define to 1 if the hwloc library should support dynamically-loaded plugins
+   */
+/* #undef HWLOC_HAVE_PLUGINS */
+
+/* `Define to 1 if you have pthread_getthrds_np' */
+/* #undef HWLOC_HAVE_PTHREAD_GETTHRDS_NP */
+
+/* Define to 1 if pthread mutexes are available */
+#define HWLOC_HAVE_PTHREAD_MUTEX 1
+
+/* Define to 1 if glibc provides a prototype of sched_setaffinity() */
+#define HWLOC_HAVE_SCHED_SETAFFINITY 1
+
+/* Define to 1 if set_mempolicy is available. */
+/* #undef HWLOC_HAVE_SET_MEMPOLICY */
+
+/* Define to 1 if you have the <stdint.h> header file. */
+#define HWLOC_HAVE_STDINT_H 1
+
+/* Define to 1 if you have the `windows.h' header. */
+/* #undef HWLOC_HAVE_WINDOWS_H */
+
+/* Define to 1 if X11 headers including Xutil.h and keysym.h are available. */
+#define HWLOC_HAVE_X11_KEYSYM 1
+
+/* Define to 1 if you have x86 cpuid */
+#define HWLOC_HAVE_X86_CPUID 1
+
+/* Define to 1 if the _syscall3 macro works */
+/* #undef HWLOC_HAVE__SYSCALL3 */
+
+/* Define to 1 on HP-UX */
+/* #undef HWLOC_HPUX_SYS */
+
+/* Define to 1 on Irix */
+/* #undef HWLOC_IRIX_SYS */
+
+/* Define to 1 on Linux */
+#define HWLOC_LINUX_SYS 1
+
+/* Define to 1 on *NETBSD */
+/* #undef HWLOC_NETBSD_SYS */
+
+/* Define to 1 on OSF */
+/* #undef HWLOC_OSF_SYS */
+
+/* The size of `unsigned int', as computed by sizeof */
+#define HWLOC_SIZEOF_UNSIGNED_INT 4
+
+/* The size of `unsigned long', as computed by sizeof */
+#define HWLOC_SIZEOF_UNSIGNED_LONG 8
+
+/* Define to 1 on Solaris */
+/* #undef HWLOC_SOLARIS_SYS */
+
+/* The hwloc symbol prefix */
+#define HWLOC_SYM_PREFIX hwloc_
+
+/* The hwloc symbol prefix in all caps */
+#define HWLOC_SYM_PREFIX_CAPS HWLOC_
+
+/* Whether we need to re-define all the hwloc public symbols or not */
+#define HWLOC_SYM_TRANSFORM 0
+
+/* Define to 1 on unsupported systems */
+/* #undef HWLOC_UNSUPPORTED_SYS */
+
+/* Define to 1 if ncurses works, preferred over curses */
+/* #undef HWLOC_USE_NCURSES */
+
+/* The library version, always available, even in embedded mode, contrary to
+   VERSION */
+#define HWLOC_VERSION "2.0.0a1-git"
+
+/* Define to 1 on WINDOWS */
+/* #undef HWLOC_WIN_SYS */
+
+/* Define to 1 on x86_32 */
+/* #undef HWLOC_X86_32_ARCH */
+
+/* Define to 1 on x86_64 */
+#define HWLOC_X86_64_ARCH 1
+
+/* Define to the sub-directory in which libtool stores uninstalled libraries.
+   */
+#define LT_OBJDIR ".libs/"
+
+/* Name of package */
+#define PACKAGE "hwloc"
+
+/* Define to the address where bug reports for this package should be sent. */
+#define PACKAGE_BUGREPORT "http://www.open-mpi.org/projects/hwloc/"
+
+/* Define to the full name of this package. */
+#define PACKAGE_NAME "hwloc"
+
+/* Define to the full name and version of this package. */
+#define PACKAGE_STRING "hwloc 2.0.0a1-git"
+
+/* Define to the one symbol short name of this package. */
+#define PACKAGE_TARNAME "hwloc"
+
+/* Define to the home page for this package. */
+#define PACKAGE_URL ""
+
+/* Define to the version of this package. */
+#define PACKAGE_VERSION "2.0.0a1-git"
+
+/* The size of `unsigned int', as computed by sizeof. */
+#define SIZEOF_UNSIGNED_INT 4
+
+/* The size of `unsigned long', as computed by sizeof. */
+#define SIZEOF_UNSIGNED_LONG 8
+
+/* The size of `void *', as computed by sizeof. */
+#define SIZEOF_VOID_P 8
+
+/* Define to 1 if you have the ANSI C header files. */
+#define STDC_HEADERS 1
+
+/* Enable extensions on HP-UX. */
+#ifndef _HPUX_SOURCE
+# define _HPUX_SOURCE 1
+#endif
+
+
+/* Enable extensions on AIX 3, Interix.  */
+#ifndef _ALL_SOURCE
+# define _ALL_SOURCE 1
+#endif
+/* Enable GNU extensions on systems that have them.  */
+#ifndef _GNU_SOURCE
+# define _GNU_SOURCE 1
+#endif
+/* Enable threading extensions on Solaris.  */
+#ifndef _POSIX_PTHREAD_SEMANTICS
+# define _POSIX_PTHREAD_SEMANTICS 1
+#endif
+/* Enable extensions on HP NonStop.  */
+#ifndef _TANDEM_SOURCE
+# define _TANDEM_SOURCE 1
+#endif
+/* Enable general extensions on Solaris.  */
+#ifndef __EXTENSIONS__
+# define __EXTENSIONS__ 1
+#endif
+
+
+/* Define to 1 if /dev/urandom should be used for seeding the hash function */
+#define USE_URANDOM 1
+
+/* Define to 1 if CryptGenRandom should be used for seeding the hash function
+   */
+#define USE_WINDOWS_CRYPTOAPI 1
+
+/* Version number of package */
+#define VERSION "2.0.0a1-git"
+
+/* Define to 1 if the X Window System is missing or not being used. */
+/* #undef X_DISPLAY_MISSING */
+
+/* Are we building for HP-UX? */
+#define _HPUX_SOURCE 1
+
+/* Define to 1 if on MINIX. */
+/* #undef _MINIX */
+
+/* Define to 2 if the system does not provide POSIX.1 features except with
+   this defined. */
+/* #undef _POSIX_1_SOURCE */
+
+/* Define to 1 if you need to in order for `stat' and other things to work. */
+/* #undef _POSIX_SOURCE */
+
+/* Define for Solaris 2.5.1 so the uint32_t typedef from <sys/synch.h>,
+   <pthread.h>, or <semaphore.h> is not used. If the typedef were allowed, the
+   #define below would cause a syntax error. */
+/* #undef _UINT32_T */
+
+/* Define this to the process ID type */
+#define hwloc_pid_t pid_t
+
+/* Define this to the thread ID type */
+#define hwloc_thread_t pthread_t
+
+/* Define to `__inline__' or `__inline' if that's what the C compiler
+   calls it, or to nothing if 'inline' is not supported under any name.  */
+#ifndef __cplusplus
+/* #undef inline */
+#endif
+
+/* Define to the type of a signed integer type of width exactly 32 bits if
+   such a type exists and the standard includes do not define it. */
+/* #undef int32_t */
+
+/* Define to the type of an unsigned integer type of width exactly 32 bits if
+   such a type exists and the standard includes do not define it. */
+/* #undef uint32_t */
+
+
+#endif /* HWLOC_CONFIGURE_H */
+
diff --git a/ext/hwloc/include/private/components.h b/ext/hwloc/include/private/components.h
new file mode 100644
index 0000000..b366345
--- /dev/null
+++ b/ext/hwloc/include/private/components.h
@@ -0,0 +1,40 @@
+/*
+ * Copyright © 2012 Inria.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+
+#ifdef HWLOC_INSIDE_PLUGIN
+/*
+ * these declarations are internal only, they are not available to plugins
+ * (many functions below are internal static symbols).
+ */
+#error This file should not be used in plugins
+#endif
+
+
+#ifndef PRIVATE_COMPONENTS_H
+#define PRIVATE_COMPONENTS_H 1
+
+#include <hwloc/plugins.h>
+
+struct hwloc_topology;
+
+extern int hwloc_disc_component_force_enable(struct hwloc_topology *topology,
+					     int envvar_forced, /* 1 if forced through envvar, 0 if forced through API */
+					     int type, const char *name,
+					     const void *data1, const void *data2, const void *data3);
+extern void hwloc_disc_components_enable_others(struct hwloc_topology *topology);
+
+/* Compute the topology is_thissystem flag based on enabled backends */
+extern void hwloc_backends_is_thissystem(struct hwloc_topology *topology);
+
+/* Disable and destroy all backends used by a topology */
+extern void hwloc_backends_disable_all(struct hwloc_topology *topology);
+
+/* Used by the core to setup/destroy the list of components */
+extern void hwloc_components_init(struct hwloc_topology *topology); /* increases components refcount, should be called exactly once per topology (during init) */
+extern void hwloc_components_destroy_all(struct hwloc_topology *topology); /* decreases components refcount, should be called exactly once per topology (during destroy) */
+
+#endif /* PRIVATE_COMPONENTS_H */
+
diff --git a/ext/hwloc/include/private/cpuid-x86.h b/ext/hwloc/include/private/cpuid-x86.h
new file mode 100644
index 0000000..8a8c48e
--- /dev/null
+++ b/ext/hwloc/include/private/cpuid-x86.h
@@ -0,0 +1,89 @@
+/*
+ * Copyright © 2010-2012, 2014 Université Bordeaux
+ * Copyright © 2010 Cisco Systems, Inc.  All rights reserved.
+ * Copyright © 2014 Inria.  All rights reserved.
+ *
+ * See COPYING in top-level directory.
+ */
+
+/* Internals for x86's cpuid.  */
+
+#ifndef HWLOC_PRIVATE_CPUID_X86_H
+#define HWLOC_PRIVATE_CPUID_X86_H
+
+#if (defined HWLOC_X86_32_ARCH) && (!defined HWLOC_HAVE_MSVC_CPUIDEX)
+static __hwloc_inline int hwloc_have_x86_cpuid(void)
+{
+  int ret;             /* result: 1 if the ID flag bit (0x00200000) could be toggled both ways, 0 otherwise */
+  unsigned tmp, tmp2;  /* scratch operands %1 and %2 of the asm below */
+  __asm__(
+      "mov $0,%0\n\t"   /* Not supported a priori */
+
+      "pushfl   \n\t"   /* Save flags */
+
+      "pushfl   \n\t"                                           \
+      "pop %1   \n\t"   /* Get flags */                         \
+
+#define TRY_TOGGLE                                              \
+      "xor $0x00200000,%1\n\t"        /* Try to toggle ID */    \
+      "mov %1,%2\n\t"   /* Save expected value */               \
+      "push %1  \n\t"                                           \
+      "popfl    \n\t"   /* Try to toggle */                     \
+      "pushfl   \n\t"                                           \
+      "pop %1   \n\t"                                           \
+      "cmp %1,%2\n\t"   /* Compare with expected value */       \
+      "jnz 0f\n\t"   /* Unexpected, failure */               \
+
+      TRY_TOGGLE        /* Try to set/clear */
+      TRY_TOGGLE        /* Try to clear/set */
+
+      "mov $1,%0\n\t"   /* Passed the test! */
+
+      "0: \n\t"         /* Common exit, reached both on failure (jnz 0f) and on success */
+      "popfl    \n\t"   /* Restore flags */
+
+      : "=r" (ret), "=&r" (tmp), "=&r" (tmp2));
+  return ret;
+}
+#endif /* defined HWLOC_X86_32_ARCH && !defined HWLOC_HAVE_MSVC_CPUIDEX */
+#if (defined HWLOC_X86_64_ARCH) || (defined HWLOC_HAVE_MSVC_CPUIDEX)
+static __hwloc_inline int hwloc_have_x86_cpuid(void) { return 1; } /* no runtime probe in this configuration */
+#endif /* HWLOC_X86_64_ARCH || HWLOC_HAVE_MSVC_CPUIDEX */
+
+static __hwloc_inline void hwloc_x86_cpuid(unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx)
+{
+#ifdef HWLOC_HAVE_MSVC_CPUIDEX
+  int regs[4];                  /* receives eax, ebx, ecx, edx in that order (see the stores below) */
+  __cpuidex(regs, *eax, *ecx);  /* *eax and *ecx are the inputs of the query */
+  *eax = regs[0];
+  *ebx = regs[1];
+  *ecx = regs[2];
+  *edx = regs[3];
+#else /* HWLOC_HAVE_MSVC_CPUIDEX */
+  /* Note: gcc might want to use bx or the stack for %1 addressing, so we can't
+   * use them :/ bx is therefore parked in scratch operand %2 around cpuid. */
+#ifdef HWLOC_X86_64_ARCH
+  hwloc_uint64_t sav_rbx;
+  __asm__(
+  "mov %%rbx,%2\n\t"    /* park the caller's rbx in %2 */
+  "cpuid\n\t"           /* inputs *eax/*ecx ("+a", "+c"); results land in eax, ebx, ecx, edx */
+  "xchg %2,%%rbx\n\t"   /* put rbx back; %2 now holds the ebx result */
+  "movl %k2,%1\n\t"     /* store the low 32 bits of %2 into *ebx */
+  : "+a" (*eax), "=m" (*ebx), "=&r"(sav_rbx),
+    "+c" (*ecx), "=&d" (*edx));
+#elif defined(HWLOC_X86_32_ARCH)
+  unsigned long sav_ebx;
+  __asm__(
+  "mov %%ebx,%2\n\t"    /* park the caller's ebx in %2 */
+  "cpuid\n\t"           /* inputs *eax/*ecx ("+a", "+c"); results land in eax, ebx, ecx, edx */
+  "xchg %2,%%ebx\n\t"   /* put ebx back; %2 now holds the ebx result */
+  "movl %k2,%1\n\t"     /* store %2 into *ebx */
+  : "+a" (*eax), "=m" (*ebx), "=&r"(sav_ebx),
+    "+c" (*ecx), "=&d" (*edx));
+#else
+#error unknown architecture
+#endif
+#endif /* HWLOC_HAVE_MSVC_CPUIDEX */
+}
+
+#endif /* HWLOC_PRIVATE_CPUID_X86_H */
diff --git a/ext/hwloc/include/private/cpuid.h b/ext/hwloc/include/private/cpuid.h
new file mode 100644
index 0000000..214ab38
--- /dev/null
+++ b/ext/hwloc/include/private/cpuid.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright © 2010-2012 Université Bordeaux 1
+ * Copyright © 2010 Cisco Systems, Inc.  All rights reserved.
+ * Copyright © 2014 Inria.  All rights reserved.
+ *
+ * See COPYING in top-level directory.
+ */
+
+/* Internals for x86's cpuid.  */
+
+#ifndef HWLOC_PRIVATE_CPUID_H
+#define HWLOC_PRIVATE_CPUID_H
+
+#ifdef HWLOC_X86_32_ARCH /* 32-bit x86: probe at run time whether the ID flag can be toggled */
+static __hwloc_inline int hwloc_have_cpuid(void)
+{
+  int ret;             /* result: 1 if the ID flag bit (0x00200000) could be toggled both ways, 0 otherwise */
+  unsigned tmp, tmp2;  /* scratch operands %1 and %2 of the asm below */
+  __asm__(             /* __asm__ rather than asm: the plain keyword is unavailable in strict -std= modes */
+      "mov $0,%0\n\t"   /* Not supported a priori */
+
+      "pushfl   \n\t"   /* Save flags */
+
+      "pushfl   \n\t"                                           \
+      "pop %1   \n\t"   /* Get flags */                         \
+
+#define TRY_TOGGLE                                              \
+      "xor $0x00200000,%1\n\t"        /* Try to toggle ID */    \
+      "mov %1,%2\n\t"   /* Save expected value */               \
+      "push %1  \n\t"                                           \
+      "popfl    \n\t"   /* Try to toggle */                     \
+      "pushfl   \n\t"                                           \
+      "pop %1   \n\t"                                           \
+      "cmp %1,%2\n\t"   /* Compare with expected value */       \
+      "jnz 0f\n\t"   /* Unexpected, failure */               \
+
+      TRY_TOGGLE        /* Try to set/clear */
+      TRY_TOGGLE        /* Try to clear/set */
+
+      "mov $1,%0\n\t"   /* Passed the test! */
+
+      "0: \n\t"         /* Numeric local label: no duplicate symbol if this asm is emitted more than once */
+      "popfl    \n\t"   /* Restore flags */
+
+      : "=r" (ret), "=&r" (tmp), "=&r" (tmp2));
+  return ret;
+}
+#endif /* HWLOC_X86_32_ARCH */
+#ifdef HWLOC_X86_64_ARCH
+static __hwloc_inline int hwloc_have_cpuid(void) { return 1; } /* no runtime probe on x86_64 */
+#endif /* HWLOC_X86_64_ARCH */
+
+static __hwloc_inline void hwloc_cpuid(unsigned *eax, unsigned *ebx, unsigned *ecx, unsigned *edx)
+{
+  /* Note: gcc might want to use bx or the stack for %1 addressing, so we can't
+   * use them :/ bx is therefore parked in scratch operand %2 around cpuid. */
+#ifdef HWLOC_X86_64_ARCH
+  hwloc_uint64_t sav_rbx;
+  __asm__(              /* __asm__ rather than asm: the plain keyword is unavailable in strict -std= modes */
+  "mov %%rbx,%2\n\t"    /* park the caller's rbx in %2 */
+  "cpuid\n\t"           /* inputs *eax/*ecx ("+a", "+c"); results land in eax, ebx, ecx, edx */
+  "xchg %2,%%rbx\n\t"   /* put rbx back; %2 now holds the ebx result */
+  "movl %k2,%1\n\t"     /* store the low 32 bits of %2 into *ebx */
+  : "+a" (*eax), "=m" (*ebx), "=&r"(sav_rbx),
+    "+c" (*ecx), "=&d" (*edx));
+#elif defined(HWLOC_X86_32_ARCH)
+  unsigned long sav_ebx;
+  __asm__(              /* __asm__ rather than asm: the plain keyword is unavailable in strict -std= modes */
+  "mov %%ebx,%2\n\t"    /* park the caller's ebx in %2 */
+  "cpuid\n\t"           /* inputs *eax/*ecx ("+a", "+c"); results land in eax, ebx, ecx, edx */
+  "xchg %2,%%ebx\n\t"   /* put ebx back; %2 now holds the ebx result */
+  "movl %k2,%1\n\t"     /* store %2 into *ebx */
+  : "+a" (*eax), "=m" (*ebx), "=&r"(sav_ebx),
+    "+c" (*ecx), "=&d" (*edx));
+#else
+#error unknown architecture
+#endif
+}
+
+#endif /* HWLOC_PRIVATE_CPUID_H */
diff --git a/ext/hwloc/include/private/debug.h b/ext/hwloc/include/private/debug.h
new file mode 100644
index 0000000..4de91bf
--- /dev/null
+++ b/ext/hwloc/include/private/debug.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2012 Inria.  All rights reserved.
+ * Copyright © 2009, 2011 Université Bordeaux
+ * Copyright © 2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/* Debug output helpers (they only print when HWLOC_DEBUG is defined). */
+
+#ifndef HWLOC_DEBUG_H
+#define HWLOC_DEBUG_H
+
+#include <private/autogen/config.h>
+
+#ifdef HWLOC_DEBUG
+#include <stdarg.h>
+#include <stdio.h>
+#endif
+
+static __hwloc_inline void hwloc_debug(const char *s __hwloc_attribute_unused, ...)
+{
+#ifdef HWLOC_DEBUG
+    va_list args;
+    /* Send the printf-style message to stderr; without HWLOC_DEBUG this body is empty. */
+    va_start(args, s);
+    vfprintf(stderr, s, args);
+    va_end(args);
+#endif
+}
+
+#ifdef HWLOC_DEBUG /* debug build: print a bitmap, rendered as a string, through the printf-style fmt */
+#define hwloc_debug_bitmap(fmt, bitmap) do { \
+  char *hwloc__dbg_str; /* prefixed so an argument named `s` is not captured by this local */ \
+  hwloc_bitmap_asprintf(&hwloc__dbg_str, bitmap); \
+  fprintf(stderr, fmt, hwloc__dbg_str); \
+  free(hwloc__dbg_str); \
+} while (0)
+#define hwloc_debug_1arg_bitmap(fmt, arg1, bitmap) do { \
+  char *hwloc__dbg_str; \
+  hwloc_bitmap_asprintf(&hwloc__dbg_str, bitmap); \
+  fprintf(stderr, fmt, arg1, hwloc__dbg_str); \
+  free(hwloc__dbg_str); \
+} while (0)
+#define hwloc_debug_2args_bitmap(fmt, arg1, arg2, bitmap) do { \
+  char *hwloc__dbg_str; \
+  hwloc_bitmap_asprintf(&hwloc__dbg_str, bitmap); \
+  fprintf(stderr, fmt, arg1, arg2, hwloc__dbg_str); \
+  free(hwloc__dbg_str); \
+} while (0)
+#else /* !HWLOC_DEBUG: expand to empty statements; no argument is evaluated */
+#define hwloc_debug_bitmap(s, bitmap) do { } while(0)
+#define hwloc_debug_1arg_bitmap(s, arg1, bitmap) do { } while(0)
+#define hwloc_debug_2args_bitmap(s, arg1, arg2, bitmap) do { } while(0)
+#endif
+
+#endif /* HWLOC_DEBUG_H */
diff --git a/ext/hwloc/include/private/map.h b/ext/hwloc/include/private/map.h
new file mode 100644
index 0000000..77c18a5
--- /dev/null
+++ b/ext/hwloc/include/private/map.h
@@ -0,0 +1,110 @@
+/*
+ * Copyright © 2013 Inria.  All rights reserved.
+ * Copyright © 2013 Cisco Systems, Inc.  All rights reserved.
+ * Copyright © 2013-2014 University of Wisconsin-La Crosse.
+ *                         All rights reserved.
+ *
+ * See COPYING in top-level directory.
+ *
+ * $HEADER$
+ */
+
+#ifndef _PRIVATE_NETLOC_MAP_H_
+#define _PRIVATE_NETLOC_MAP_H_
+
+#include <hwloc.h>
+#include <netloc.h>
+
+
+struct netloc_map__subnet;
+struct netloc_map__server;
+
+struct netloc_map__port { /* one port, with back-pointers to the subnet and server it belongs to */
+  struct netloc_map__subnet * subnet;
+  struct netloc_map__server * server;
+
+  netloc_edge_t * edge;
+
+  unsigned hwloc_obj_depth;
+  unsigned hwloc_obj_index;
+  hwloc_obj_t hwloc_obj; /* cached from depth/index above,
+			  * only non-NULL if the topology hasn't been compressed in the meantime.
+			  */
+
+  struct netloc_map__port *prev, *next; /* list links; presumably the port_first/port_last list of the subnet -- confirm */
+
+  char id[]; /* C99 flexible array member (was the GNU zero-length form id[0]); must stay the last member */
+};
+
+struct netloc_map__subnet { /* one subnet: its netloc topology plus the list of ports attached to it */
+  netloc_topology_t topology;
+  netloc_network_type_t type;
+
+  int port_by_id_ready; /* presumably non-zero once port_by_id below has been filled -- confirm */
+  struct netloc_dt_lookup_table port_by_id;
+
+  struct netloc_map__subnet *prev, *next;
+
+  struct netloc_map__port *port_first, *port_last;
+  unsigned ports_nr;
+
+  char id[]; /* C99 flexible array member (was the GNU zero-length form id[0]); must stay the last member */
+};
+
+struct netloc_map__server { /* one server: its hwloc topology and the ports it owns */
+  hwloc_topology_t topology; /* NULL if compressed */
+#if HWLOC_API_VERSION >= 0x00010800
+  hwloc_topology_diff_t topology_diff;
+  struct netloc_map__server *topology_diff_refserver;
+#endif
+
+  int usecount; /* references from the application,
+		 * or from topology diff for other servers.
+		 * no compression when > 0
+		 */
+
+  unsigned nr_ports;
+  unsigned nr_ports_allocated;
+  struct netloc_map__port ** ports; /* array of port pointers; see nr_ports / nr_ports_allocated above */
+
+  struct netloc_map__server *prev, *next;
+  struct netloc_map *map;
+
+  char name[]; /* C99 flexible array member (was the GNU zero-length form name[0]); must stay the last member */
+};
+
+enum netloc_map_verbose_flags_e {
+  NETLOC_MAP_VERBOSE_FLAG_COMPRESS = (1<<0)
+};
+
+struct netloc_map {
+  unsigned long flags;
+  unsigned long verbose_flags;
+
+  unsigned server_ports_nr; /* needed during build, to create large-enough hash tables */
+
+  char *hwloc_xml_path;
+  struct netloc_dt_lookup_table server_by_name;
+  struct netloc_map__server *server_first, *server_last;
+  unsigned servers_nr;
+
+  char *netloc_data_path;
+  struct netloc_dt_lookup_table subnet_by_id[NETLOC_NETWORK_TYPE_INVALID]; /* enough room for existing types */
+  struct netloc_map__subnet *subnet_first, *subnet_last;
+  unsigned subnets_nr;
+
+  int merged;
+};
+
+struct netloc_map__paths {
+  struct netloc_map *map;
+  unsigned long flags;
+  unsigned nr_paths;
+  struct netloc_map__path {
+    /* FIXME: cache the subnet */
+    unsigned nr_edges;
+    struct netloc_map_edge_s *edges;
+  } * paths;
+};
+
+#endif /* _PRIVATE_NETLOC_MAP_H_ */
diff --git a/ext/hwloc/include/private/misc.h b/ext/hwloc/include/private/misc.h
new file mode 100644
index 0000000..d0e6a46
--- /dev/null
+++ b/ext/hwloc/include/private/misc.h
@@ -0,0 +1,382 @@
+/*
+ * Copyright © 2009 CNRS
+ * Copyright © 2009-2014 Inria.  All rights reserved.
+ * Copyright © 2009-2012 Université Bordeaux
+ * Copyright © 2011 Cisco Systems, Inc.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+/* Misc macros and inlines.  */
+
+#ifndef HWLOC_PRIVATE_MISC_H
+#define HWLOC_PRIVATE_MISC_H
+
+#include <hwloc/autogen/config.h>
+#include <private/autogen/config.h>
+
+#ifdef HWLOC_HAVE_DECL_STRNCASECMP
+#ifdef HAVE_STRINGS_H
+#include <strings.h>
+#endif
+#else
+#ifdef HAVE_CTYPE_H
+#include <ctype.h>
+#endif
+#endif
+
+/* Compile-time assertion */
+#define HWLOC_BUILD_ASSERT(condition) ((void)sizeof(char[1 - 2*!(condition)]))
+
+#define HWLOC_BITS_PER_LONG (HWLOC_SIZEOF_UNSIGNED_LONG * 8)
+#define HWLOC_BITS_PER_INT (HWLOC_SIZEOF_UNSIGNED_INT * 8)
+
+#if (HWLOC_BITS_PER_LONG != 32) && (HWLOC_BITS_PER_LONG != 64)
+#error "unknown size for unsigned long."
+#endif
+
+#if (HWLOC_BITS_PER_INT != 16) && (HWLOC_BITS_PER_INT != 32) && (HWLOC_BITS_PER_INT != 64)
+#error "unknown size for unsigned int."
+#endif
+
+
+/**
+ * ffsl helpers.
+ */
+
+#if defined(HWLOC_HAVE_BROKEN_FFS)
+
+/* System has a broken ffs().
+ * We must check this before __GNUC__ or HWLOC_HAVE_FFSL
+ */
+#    define HWLOC_NO_FFS
+
+#elif defined(__GNUC__)
+
+#  if (__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))
+     /* Starting from 3.4, gcc has a long variant.  */
+#    define hwloc_ffsl(x) __builtin_ffsl(x)
+#  else
+#    define hwloc_ffs(x) __builtin_ffs(x)
+#    define HWLOC_NEED_FFSL
+#  endif
+
+#elif defined(HWLOC_HAVE_FFSL)
+
+#  ifndef HWLOC_HAVE_DECL_FFSL
+extern int ffsl(long) __hwloc_attribute_const;
+#  endif
+
+#  define hwloc_ffsl(x) ffsl(x)
+
+#elif defined(HWLOC_HAVE_FFS)
+
+#  ifndef HWLOC_HAVE_DECL_FFS
+extern int ffs(int) __hwloc_attribute_const;
+#  endif
+
+#  define hwloc_ffs(x) ffs(x)
+#  define HWLOC_NEED_FFSL
+
+#else /* no ffs implementation */
+
+#    define HWLOC_NO_FFS
+
+#endif
+
+#ifdef HWLOC_NO_FFS
+
+/* no ffs or it is known to be broken */
+static __hwloc_inline int
+hwloc_ffsl_manual(unsigned long x) __hwloc_attribute_const;
+static __hwloc_inline int
+hwloc_ffsl_manual(unsigned long x)
+{
+	/* 1-based index of the lowest set bit of x, or 0 when x is 0. */
+	int pos = 1;
+	if (x == 0)
+		return 0;
+
+	/* Binary search: whenever the low half is empty, count it and drop it. */
+#if HWLOC_BITS_PER_LONG >= 64
+	if ((x & 0xfffffffful) == 0) {
+		pos += 32;
+		x >>= 32;
+	}
+#endif
+	if ((x & 0xffffu) == 0) {
+		pos += 16;
+		x >>= 16;
+	}
+	if ((x & 0xff) == 0) {
+		pos += 8;
+		x >>= 8;
+	}
+	if ((x & 0xf) == 0) {
+		pos += 4;
+		x >>= 4;
+	}
+	if ((x & 0x3) == 0) {
+		pos += 2;
+		x >>= 2;
+	}
+	if ((x & 0x1) == 0) {
+		/* last step: x is not read again, so no shift is needed */
+		pos += 1;
+	}
+
+	return pos;
+}
+/* always define hwloc_ffsl as a macro, to avoid renaming breakage */
+#define hwloc_ffsl hwloc_ffsl_manual
+
+#elif defined(HWLOC_NEED_FFSL)
+
+/* We only have an int ffs(int) implementation, build a long one.  */
+
+/* First make it 32 bits if it was only 16.  */
+static __hwloc_inline int
+hwloc_ffs32(unsigned long x) __hwloc_attribute_const;
+static __hwloc_inline int
+hwloc_ffs32(unsigned long x)
+{
+#if HWLOC_BITS_PER_INT == 16
+	/* hwloc_ffs() only covers 16 bits here: try the low half, then the high half. */
+	int bit = hwloc_ffs(x & 0xfffful);
+
+	if (bit != 0)
+		return bit;
+
+	bit = hwloc_ffs(x >> 16);
+	if (bit != 0)
+		return 16 + bit;
+
+	return 0;
+#else
+	return hwloc_ffs(x);
+#endif
+}
+
+/* Then make it 64 bit if longs are.  */
+static __hwloc_inline int
+hwloc_ffsl_from_ffs32(unsigned long x) __hwloc_attribute_const;
+static __hwloc_inline int
+hwloc_ffsl_from_ffs32(unsigned long x)
+{
+#if HWLOC_BITS_PER_LONG == 64
+	/* Search the low 32 bits first, then fall back to the high 32 bits. */
+	int bit = hwloc_ffs32(x & 0xfffffffful);
+
+	if (bit != 0)
+		return bit;
+
+	bit = hwloc_ffs32(x >> 32);
+	if (bit != 0)
+		return 32 + bit;
+
+	return 0;
+#else
+	return hwloc_ffs32(x);
+#endif
+}
+/* always define hwloc_ffsl as a macro, to avoid renaming breakage */
+#define hwloc_ffsl hwloc_ffsl_from_ffs32
+
+#endif
+
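+/**
+ * flsl helpers.  NOTE(review): `__GNUC_____` (five trailing underscores) below is not a macro GCC defines, so the builtin branch is never selected -- confirm whether this is intentional.
+ */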
+#ifdef __GNUC_____
+
+#  if (__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4))
+#    define hwloc_flsl(x) (x ? 8*sizeof(long) - __builtin_clzl(x) : 0)
+#  else
+#    define hwloc_fls(x) (x ? 8*sizeof(int) - __builtin_clz(x) : 0)
+#    define HWLOC_NEED_FLSL
+#  endif
+
+#elif defined(HWLOC_HAVE_FLSL)
+
+#  ifndef HWLOC_HAVE_DECL_FLSL
+extern int flsl(long) __hwloc_attribute_const;
+#  endif
+
+#  define hwloc_flsl(x) flsl(x)
+
+#elif defined(HWLOC_HAVE_CLZL)
+
+#  ifndef HWLOC_HAVE_DECL_CLZL
+extern int clzl(long) __hwloc_attribute_const;
+#  endif
+
+#  define hwloc_flsl(x) ((x) ? 8*sizeof(long) - clzl(x) : 0) /* (x) is parenthesized so an expression argument is tested as a whole; x is still evaluated twice */
+
+#elif defined(HWLOC_HAVE_FLS)
+
+#  ifndef HWLOC_HAVE_DECL_FLS
+extern int fls(int) __hwloc_attribute_const;
+#  endif
+
+#  define hwloc_fls(x) fls(x)
+#  define HWLOC_NEED_FLSL
+
+#elif defined(HWLOC_HAVE_CLZ)
+
+#  ifndef HWLOC_HAVE_DECL_CLZ
+extern int clz(int) __hwloc_attribute_const;
+#  endif
+
+#  define hwloc_fls(x) ((x) ? 8*sizeof(int) - clz(x) : 0) /* (x) is parenthesized so an expression argument is tested as a whole; x is still evaluated twice */
+#  define HWLOC_NEED_FLSL
+
+#else /* no fls implementation */
+
+static __hwloc_inline int
+hwloc_flsl_manual(unsigned long x) __hwloc_attribute_const;
+static __hwloc_inline int
+hwloc_flsl_manual(unsigned long x)
+{
+	/* 1-based index of the highest set bit of x, or 0 when x is 0. */
+	int pos = 1;
+	if (x == 0)
+		return 0;
+
+	/* Binary search: whenever the high half is populated, count and drop the low half. */
+#if HWLOC_BITS_PER_LONG >= 64
+	if ((x & 0xffffffff00000000ul) != 0) {
+		pos += 32;
+		x >>= 32;
+	}
+#endif
+	if ((x & 0xffff0000u) != 0) {
+		pos += 16;
+		x >>= 16;
+	}
+	if ((x & 0xff00) != 0) {
+		pos += 8;
+		x >>= 8;
+	}
+	if ((x & 0xf0) != 0) {
+		pos += 4;
+		x >>= 4;
+	}
+	if ((x & 0xc) != 0) {
+		pos += 2;
+		x >>= 2;
+	}
+	if ((x & 0x2) != 0) {
+		/* last step: x is not read again, so no shift is needed */
+		pos += 1;
+	}
+
+	return pos;
+}
+/* always define hwloc_flsl as a macro, to avoid renaming breakage */
+#define hwloc_flsl hwloc_flsl_manual
+
+#endif
+
+#ifdef HWLOC_NEED_FLSL
+
+/* We only have an int fls(int) implementation, build a long one.  */
+
+/* First make it 32 bits if it was only 16.  */
+static __hwloc_inline int
+hwloc_fls32(unsigned long x) __hwloc_attribute_const;
+static __hwloc_inline int
+hwloc_fls32(unsigned long x)
+{
+#if HWLOC_BITS_PER_INT == 16
+	/* hwloc_fls() only covers 16 bits here: the high half takes precedence. */
+	int bit = hwloc_fls(x >> 16);
+
+	if (bit != 0)
+		return 16 + bit;
+
+	bit = hwloc_fls(x & 0xfffful);
+	if (bit != 0)
+		return bit;
+
+	return 0;
+#else
+	return hwloc_fls(x);
+#endif
+}
+
+/* Then make it 64 bit if longs are.  */
+static __hwloc_inline int
+hwloc_flsl_from_fls32(unsigned long x) __hwloc_attribute_const;
+static __hwloc_inline int
+hwloc_flsl_from_fls32(unsigned long x)
+{
+#if HWLOC_BITS_PER_LONG == 64
+	/* Search the high 32 bits first, then fall back to the low 32 bits. */
+	int bit = hwloc_fls32(x >> 32);
+
+	if (bit != 0)
+		return 32 + bit;
+
+	bit = hwloc_fls32(x & 0xfffffffful);
+	if (bit != 0)
+		return bit;
+
+	return 0;
+#else
+	return hwloc_fls32(x);
+#endif
+}
+/* always define hwloc_flsl as a macro, to avoid renaming breakage */
+#define hwloc_flsl hwloc_flsl_from_fls32
+
+#endif
+
+static __hwloc_inline int
+hwloc_weight_long(unsigned long w) __hwloc_attribute_const;
+static __hwloc_inline int
+hwloc_weight_long(unsigned long w)
+{ /* Return the number of bits set in w. */
+#if HWLOC_BITS_PER_LONG == 32
+#if (__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__) >= 4)
+	return __builtin_popcount(w);
+#else
+	unsigned int res = (w & 0x55555555) + ((w >> 1) & 0x55555555); /* bit counts per 2-bit field */
+	res = (res & 0x33333333) + ((res >> 2) & 0x33333333); /* per 4-bit field */
+	res = (res & 0x0F0F0F0F) + ((res >> 4) & 0x0F0F0F0F); /* per byte */
+	res = (res & 0x00FF00FF) + ((res >> 8) & 0x00FF00FF); /* per 16-bit half */
+	return (res & 0x0000FFFF) + ((res >> 16) & 0x0000FFFF); /* total */
+#endif
+#else /* HWLOC_BITS_PER_LONG != 32 */
+#if (__GNUC__ >= 4) || ((__GNUC__ == 3) && (__GNUC_MINOR__) >= 4)
+	return __builtin_popcountll(w);
+#else
+	unsigned long res;
+	res = (w & 0x5555555555555555ul) + ((w >> 1) & 0x5555555555555555ul); /* bit counts per 2-bit field */
+	res = (res & 0x3333333333333333ul) + ((res >> 2) & 0x3333333333333333ul); /* per 4-bit field */
+	res = (res & 0x0F0F0F0F0F0F0F0Ful) + ((res >> 4) & 0x0F0F0F0F0F0F0F0Ful); /* per byte */
+	res = (res & 0x00FF00FF00FF00FFul) + ((res >> 8) & 0x00FF00FF00FF00FFul); /* per 16-bit field */
+	res = (res & 0x0000FFFF0000FFFFul) + ((res >> 16) & 0x0000FFFF0000FFFFul); /* per 32-bit half */
+	return (res & 0x00000000FFFFFFFFul) + ((res >> 32) & 0x00000000FFFFFFFFul); /* total */
+#endif
+#endif /* HWLOC_BITS_PER_LONG == 32 */
+}
+
+#if !HAVE_DECL_STRTOULL
+unsigned long long int strtoull(const char *nptr, char **endptr, int base);
+#endif
+
+static __hwloc_inline int hwloc_strncasecmp(const char *s1, const char *s2, size_t n)
+{ /* Case-insensitively compare at most n characters of s1 and s2: 0 if equal, else the difference at the first mismatch. */
+#ifdef HWLOC_HAVE_DECL_STRNCASECMP
+  return strncasecmp(s1, s2, n);
+#else
+  while (n) {
+    int c1 = tolower((unsigned char) *s1), c2 = tolower((unsigned char) *s2); /* tolower() is undefined for negative char values, so convert first */
+    if (!c1 || !c2 || c1 != c2)
+      return c1-c2;
+    n--; s1++; s2++;
+  }
+  return 0;
+#endif
+}
+
+#endif /* HWLOC_PRIVATE_MISC_H */
diff --git a/ext/hwloc/include/private/private.h b/ext/hwloc/include/private/private.h
new file mode 100644
index 0000000..fa344ac
--- /dev/null
+++ b/ext/hwloc/include/private/private.h
@@ -0,0 +1,335 @@
+/*
+ * Copyright © 2009      CNRS
+ * Copyright © 2009-2015 Inria.  All rights reserved.
+ * Copyright © 2009-2012 Université Bordeaux
+ * Copyright © 2009-2011 Cisco Systems, Inc.  All rights reserved.
+ *
+ * See COPYING in top-level directory.
+ */
+
+/* Internal types and helpers. */
+
+
+#ifdef HWLOC_INSIDE_PLUGIN
+/*
+ * these declarations are internal only, they are not available to plugins
+ * (many functions below are internal static symbols).
+ */
+#error This file should not be used in plugins
+#endif
+
+
+#ifndef HWLOC_PRIVATE_H
+#define HWLOC_PRIVATE_H
+
+#include <private/autogen/config.h>
+#include <hwloc.h>
+#include <hwloc/bitmap.h>
+#include <private/components.h>
+#include <private/debug.h>
+#include <sys/types.h>
+#ifdef HAVE_UNISTD_H
+#include <unistd.h>
+#endif
+#ifdef HAVE_STDINT_H
+#include <stdint.h>
+#endif
+#ifdef HAVE_SYS_UTSNAME_H
+#include <sys/utsname.h>
+#endif
+#include <string.h>
+
+enum hwloc_ignore_type_e {
+  HWLOC_IGNORE_TYPE_NEVER = 0,
+  HWLOC_IGNORE_TYPE_KEEP_STRUCTURE,
+  HWLOC_IGNORE_TYPE_ALWAYS
+};
+
+#define HWLOC_DEPTH_MAX 128
+
+struct hwloc_topology {
+  unsigned nb_levels;					/* Number of horizontal levels */
+  unsigned next_group_depth;				/* Depth of the next Group object that we may create */
+  unsigned level_nbobjects[HWLOC_DEPTH_MAX]; 		/* Number of objects on each horizontal level */
+  struct hwloc_obj **levels[HWLOC_DEPTH_MAX];		/* Direct access to levels, levels[l = 0 .. nb_levels-1][0..level_nbobjects[l]] */
+  unsigned long flags;
+  int type_depth[HWLOC_OBJ_TYPE_MAX];
+  enum hwloc_ignore_type_e ignored_types[HWLOC_OBJ_TYPE_MAX];
+  int is_thissystem;
+  int is_loaded;
+  int modified;                                         /* >0 if objects were added/removed recently, which means a reconnect is needed */
+  hwloc_pid_t pid;                                      /* Process ID the topology is viewed from, 0 for self */
+  void *userdata;
+
+  unsigned bridge_nbobjects;
+  struct hwloc_obj **bridge_level;
+  struct hwloc_obj *first_bridge, *last_bridge;
+  unsigned pcidev_nbobjects;
+  struct hwloc_obj **pcidev_level;
+  struct hwloc_obj *first_pcidev, *last_pcidev;
+  unsigned osdev_nbobjects;
+  struct hwloc_obj **osdev_level;
+  struct hwloc_obj *first_osdev, *last_osdev;
+  unsigned misc_nbobjects;
+  struct hwloc_obj **misc_level;
+  struct hwloc_obj *first_misc, *last_misc;
+
+  struct hwloc_binding_hooks { /* function pointers for the CPU-binding, last-CPU-location and memory-binding operations */
+    int (*set_thisproc_cpubind)(hwloc_topology_t topology, hwloc_const_cpuset_t set, int flags);
+    int (*get_thisproc_cpubind)(hwloc_topology_t topology, hwloc_cpuset_t set, int flags);
+    int (*set_thisthread_cpubind)(hwloc_topology_t topology, hwloc_const_cpuset_t set, int flags);
+    int (*get_thisthread_cpubind)(hwloc_topology_t topology, hwloc_cpuset_t set, int flags);
+    int (*set_proc_cpubind)(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_const_cpuset_t set, int flags);
+    int (*get_proc_cpubind)(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_cpuset_t set, int flags);
+#ifdef hwloc_thread_t
+    int (*set_thread_cpubind)(hwloc_topology_t topology, hwloc_thread_t tid, hwloc_const_cpuset_t set, int flags);
+    int (*get_thread_cpubind)(hwloc_topology_t topology, hwloc_thread_t tid, hwloc_cpuset_t set, int flags);
+#endif
+
+    int (*get_thisproc_last_cpu_location)(hwloc_topology_t topology, hwloc_cpuset_t set, int flags);
+    int (*get_thisthread_last_cpu_location)(hwloc_topology_t topology, hwloc_cpuset_t set, int flags);
+    int (*get_proc_last_cpu_location)(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_cpuset_t set, int flags);
+
+    int (*set_thisproc_membind)(hwloc_topology_t topology, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags);
+    int (*get_thisproc_membind)(hwloc_topology_t topology, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags);
+    int (*set_thisthread_membind)(hwloc_topology_t topology, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags);
+    int (*get_thisthread_membind)(hwloc_topology_t topology, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags);
+    int (*set_proc_membind)(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags);
+    int (*get_proc_membind)(hwloc_topology_t topology, hwloc_pid_t pid, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags);
+    int (*set_area_membind)(hwloc_topology_t topology, const void *addr, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags);
+    int (*get_area_membind)(hwloc_topology_t topology, const void *addr, size_t len, hwloc_nodeset_t nodeset, hwloc_membind_policy_t * policy, int flags);
+    /* This has to return the same kind of pointer as alloc_membind, so that free_membind can be used on it */
+    void *(*alloc)(hwloc_topology_t topology, size_t len);
+    /* alloc_membind has to always succeed if !(flags & HWLOC_MEMBIND_STRICT).
+     * see hwloc_alloc_or_fail which is convenient for that.  */
+    void *(*alloc_membind)(hwloc_topology_t topology, size_t len, hwloc_const_nodeset_t nodeset, hwloc_membind_policy_t policy, int flags);
+    int (*free_membind)(hwloc_topology_t topology, void *addr, size_t len);
+  } binding_hooks;
+
+  struct hwloc_topology_support support;
+
+  void (*userdata_export_cb)(void *reserved, struct hwloc_topology *topology, struct hwloc_obj *obj);
+  void (*userdata_import_cb)(struct hwloc_topology *topology, struct hwloc_obj *obj, const char *name, const void *buffer, size_t length);
+
+  struct hwloc_os_distances_s {
+    hwloc_obj_type_t type;
+    int nbobjs;
+    unsigned *indexes; /* array of OS indexes before we can convert them into objs. always available.
+			*/
+    struct hwloc_obj **objs; /* array of objects, in the same order as above.
+			      * either given (by a backend) together with the indexes array above.
+			      * or built from the above indexes array when not given (by the user).
+			      */
+    float *distances; /* distance matrices, ordered according to the above indexes/objs array.
+		       * distance from i to j is stored in slot i*nbobjs+j.
+		       * will be copied into the main logical-index-ordered distance at the end of the discovery.
+		       */
+    int forced; /* set if the user forced a matrix to ignore the OS one */
+
+    struct hwloc_os_distances_s *prev, *next;
+  } *first_osdist, *last_osdist;
+
+  /* list of enabled backends. */
+  struct hwloc_backend * backends;
+};
+
+extern void hwloc_alloc_obj_cpusets(hwloc_obj_t obj);
+extern void hwloc_setup_pu_level(struct hwloc_topology *topology, unsigned nb_pus);
+extern int hwloc_get_sysctlbyname(const char *name, int64_t *n);
+extern int hwloc_get_sysctl(int name[], unsigned namelen, int *n);
+extern unsigned hwloc_fallback_nbprocessors(struct hwloc_topology *topology);
+extern void hwloc_connect_children(hwloc_obj_t obj);
+extern int hwloc_connect_levels(hwloc_topology_t topology);
+
+extern int hwloc__object_cpusets_compare_first(hwloc_obj_t obj1, hwloc_obj_t obj2);
+extern void hwloc__reorder_children(hwloc_obj_t parent);
+
+extern void hwloc_topology_setup_defaults(struct hwloc_topology *topology);
+extern void hwloc_topology_clear(struct hwloc_topology *topology);
+
+extern void hwloc__add_info(struct hwloc_obj_info_s **infosp, unsigned *countp, const char *name, const char *value);
+extern char ** hwloc__find_info_slot(struct hwloc_obj_info_s **infosp, unsigned *countp, const char *name);
+extern void hwloc__move_infos(struct hwloc_obj_info_s **dst_infosp, unsigned *dst_countp, struct hwloc_obj_info_s **src_infosp, unsigned *src_countp);
+extern void hwloc__free_infos(struct hwloc_obj_info_s *infos, unsigned count);
+
+/* set native OS binding hooks */
+extern void hwloc_set_native_binding_hooks(struct hwloc_binding_hooks *hooks, struct hwloc_topology_support *support);
+/* set either native OS binding hooks (if thissystem), or dummy ones */
+extern void hwloc_set_binding_hooks(struct hwloc_topology *topology);
+
+#if defined(HWLOC_LINUX_SYS)
+extern void hwloc_set_linuxfs_hooks(struct hwloc_binding_hooks *binding_hooks, struct hwloc_topology_support *support);
+#endif /* HWLOC_LINUX_SYS */
+
+#if defined(HWLOC_BGQ_SYS)
+extern void hwloc_set_bgq_hooks(struct hwloc_binding_hooks *binding_hooks, struct hwloc_topology_support *support);
+#endif /* HWLOC_BGQ_SYS */
+
+#ifdef HWLOC_SOLARIS_SYS
+extern void hwloc_set_solaris_hooks(struct hwloc_binding_hooks *binding_hooks, struct hwloc_topology_support *support);
+#endif /* HWLOC_SOLARIS_SYS */
+
+#ifdef HWLOC_AIX_SYS
+extern void hwloc_set_aix_hooks(struct hwloc_binding_hooks *binding_hooks, struct hwloc_topology_support *support);
+#endif /* HWLOC_AIX_SYS */
+
+#ifdef HWLOC_OSF_SYS
+extern void hwloc_set_osf_hooks(struct hwloc_binding_hooks *binding_hooks, struct hwloc_topology_support *support);
+#endif /* HWLOC_OSF_SYS */
+
+#ifdef HWLOC_WIN_SYS
+extern void hwloc_set_windows_hooks(struct hwloc_binding_hooks *binding_hooks, struct hwloc_topology_support *support);
+#endif /* HWLOC_WIN_SYS */
+
+#ifdef HWLOC_DARWIN_SYS
+extern void hwloc_set_darwin_hooks(struct hwloc_binding_hooks *binding_hooks, struct hwloc_topology_support *support);
+#endif /* HWLOC_DARWIN_SYS */
+
+#ifdef HWLOC_FREEBSD_SYS
+extern void hwloc_set_freebsd_hooks(struct hwloc_binding_hooks *binding_hooks, struct hwloc_topology_support *support);
+#endif /* HWLOC_FREEBSD_SYS */
+
+#ifdef HWLOC_NETBSD_SYS
+extern void hwloc_set_netbsd_hooks(struct hwloc_binding_hooks *binding_hooks, struct hwloc_topology_support *support);
+#endif /* HWLOC_NETBSD_SYS */
+
+#ifdef HWLOC_HPUX_SYS
+extern void hwloc_set_hpux_hooks(struct hwloc_binding_hooks *binding_hooks, struct hwloc_topology_support *support);
+#endif /* HWLOC_HPUX_SYS */
+
+/* Insert uname-specific names/values in the object infos array.
+ * If cached_uname isn't NULL, it is used as a struct utsname instead of recalling uname.
+ * Any field that starts with \0 is ignored.
+ */
+extern void hwloc_add_uname_info(struct hwloc_topology *topology, void *cached_uname);
+
+/* Free obj and its attributes assuming it doesn't have any children/parent anymore */
+extern void hwloc_free_unlinked_object(hwloc_obj_t obj);
+
+/* Duplicate src and its children under newparent in newtopology */
+extern void hwloc__duplicate_objects(struct hwloc_topology *newtopology, struct hwloc_obj *newparent, struct hwloc_obj *src);
+
+/* This can be used for the alloc field to get allocated data that can be freed by free() */
+void *hwloc_alloc_heap(hwloc_topology_t topology, size_t len);
+
+/* This can be used for the alloc field to get allocated data that can be freed by munmap() */
+void *hwloc_alloc_mmap(hwloc_topology_t topology, size_t len);
+
+/* This can be used for the free_membind field to free data using free() */
+int hwloc_free_heap(hwloc_topology_t topology, void *addr, size_t len);
+
+/* This can be used for the free_membind field to free data using munmap() */
+int hwloc_free_mmap(hwloc_topology_t topology, void *addr, size_t len);
+
+/* Fallback allocator: yields NULL when HWLOC_MEMBIND_STRICT is set in flags,
+ * and otherwise returns unbound memory obtained from hwloc_alloc(). */
+static __hwloc_inline void *
+hwloc_alloc_or_fail(hwloc_topology_t topology, size_t len, int flags)
+{
+  const int strict = (flags & HWLOC_MEMBIND_STRICT) != 0;
+
+  return strict ? NULL : hwloc_alloc(topology, len);
+}
+
+extern void hwloc_distances_init(struct hwloc_topology *topology);
+extern void hwloc_distances_destroy(struct hwloc_topology *topology);
+extern void hwloc_distances_set(struct hwloc_topology *topology, hwloc_obj_type_t type, unsigned nbobjs, unsigned *indexes, hwloc_obj_t *objs, float *distances, int force);
+extern void hwloc_distances_set_from_env(struct hwloc_topology *topology);
+extern void hwloc_distances_restrict_os(struct hwloc_topology *topology);
+extern void hwloc_distances_restrict(struct hwloc_topology *topology, unsigned long flags);
+extern void hwloc_distances_finalize_os(struct hwloc_topology *topology);
+extern void hwloc_distances_finalize_logical(struct hwloc_topology *topology);
+extern void hwloc_clear_object_distances(struct hwloc_obj *obj);
+extern void hwloc_clear_object_distances_one(struct hwloc_distances_s *distances);
+extern void hwloc_group_by_distances(struct hwloc_topology *topology);
+
+#ifdef HAVE_USELOCALE
+#include "locale.h"
+#ifdef HAVE_XLOCALE_H
+#include "xlocale.h"
+#endif
+#define hwloc_localeswitch_declare locale_t __old_locale = (locale_t)0, __new_locale
+#define hwloc_localeswitch_init() do {                     \
+  __new_locale = newlocale(LC_ALL_MASK, "C", (locale_t)0); \
+  if (__new_locale != (locale_t)0)                         \
+    __old_locale = uselocale(__new_locale);                \
+} while (0)
+#define hwloc_localeswitch_fini() do { \
+  if (__new_locale != (locale_t)0) {   \
+    uselocale(__old_locale);           \
+    freelocale(__new_locale);          \
+  }                                    \
+} while(0)
+#else /* HAVE_USELOCALE */
+#define hwloc_localeswitch_declare int __dummy_nolocale __hwloc_attribute_unused
+#define hwloc_localeswitch_init()
+#define hwloc_localeswitch_fini()
+#endif /* HAVE_USELOCALE */
+
+#if !HAVE_DECL_FABSF
+#define fabsf(f) fabs((double)(f))
+#endif
+
+#if HAVE_DECL__SC_PAGE_SIZE
+#define hwloc_getpagesize() sysconf(_SC_PAGE_SIZE)
+#elif HAVE_DECL__SC_PAGESIZE
+#define hwloc_getpagesize() sysconf(_SC_PAGESIZE)
+#elif defined HAVE_GETPAGESIZE
+#define hwloc_getpagesize() getpagesize()
+#else
+#undef hwloc_getpagesize
+#endif
+
+/* encode src buffer into target buffer.
+ * targsize must be at least 4*((srclength+2)/3)+1.
+ * target will be 0-terminated.
+ */
+extern int hwloc_encode_to_base64(const char *src, size_t srclength, char *target, size_t targsize);
+/* decode src buffer into target buffer.
+ * src is 0-terminated.
+ * targsize must be at least srclength*3/4+1 (srclength not including \0)
+ * but only srclength*3/4 characters will be meaningful
+ * (the next one may be partially written during decoding, but it should be ignored).
+ */
+extern int hwloc_decode_from_base64(char const *src, char *target, size_t targsize);
+
+/* Check whether needle matches the beginning of haystack, at least n, and up
+ * to a colon or \0 */
+extern int hwloc_namecoloncmp(const char *haystack, const char *needle, size_t n);
+
+#ifdef HWLOC_HAVE_ATTRIBUTE_FORMAT
+# if HWLOC_HAVE_ATTRIBUTE_FORMAT
+#  define __hwloc_attribute_format(type, str, arg)  __attribute__((__format__(type, str, arg)))
+# else
+#  define __hwloc_attribute_format(type, str, arg)
+# endif
+#else
+# define __hwloc_attribute_format(type, str, arg)
+#endif
+
+#define hwloc_memory_size_printf_value(_size, _verbose) /* _size scaled, rounding to nearest, to the unit picked by hwloc_memory_size_printf_unit(); _verbose is parenthesized so an expression argument stays one operand */ \
+  ((_size) < (10ULL<<20) || (_verbose) ? (((_size)>>9)+1)>>1 : (_size) < (10ULL<<30) ? (((_size)>>19)+1)>>1 : (_size) < (10ULL<<40) ? (((_size)>>29)+1)>>1 : (((_size)>>39)+1)>>1)
+#define hwloc_memory_size_printf_unit(_size, _verbose) /* unit label: "KB" below 10<<20 or whenever _verbose is set, then "MB", "GB", "TB"; _verbose is parenthesized so an expression argument stays one operand */ \
+  ((_size) < (10ULL<<20) || (_verbose) ? "KB" : (_size) < (10ULL<<30) ? "MB" : (_size) < (10ULL<<40) ? "GB" : "TB")
+
+/* On some systems, snprintf returns the size of written data, not the actually
+ * required size.  hwloc_snprintf always reports the actually required size. */
+extern int hwloc_snprintf(char *str, size_t size, const char *format, ...) __hwloc_attribute_format(printf, 3, 4);
+
+extern void hwloc_obj_add_info_nodup(hwloc_obj_t obj, const char *name, const char *value, int nodup);
+
+/* Return the name of the currently running program, if supported.
+ * If not NULL, must be freed by the caller.
+ */
+extern char * hwloc_progname(struct hwloc_topology *topology);
+
+#define HWLOC_BITMAP_EQUAL 0       /* Bitmaps are equal */
+#define HWLOC_BITMAP_INCLUDED 1    /* First bitmap included in second */
+#define HWLOC_BITMAP_CONTAINS 2    /* First bitmap contains second */
+#define HWLOC_BITMAP_INTERSECTS 3  /* Bitmaps intersect without any inclusion */
+#define HWLOC_BITMAP_DIFFERENT  4  /* Bitmaps do not intersect */
+
+/** \brief Compare bitmaps \p bitmap1 and \p bitmap2 from an inclusion point of view.
+ */
+HWLOC_DECLSPEC int hwloc_bitmap_compare_inclusion(hwloc_const_bitmap_t bitmap1, hwloc_const_bitmap_t bitmap2) __hwloc_attribute_pure;
+#endif /* HWLOC_PRIVATE_H */
diff --git a/ext/hwloc/include/private/solaris-chiptype.h b/ext/hwloc/include/private/solaris-chiptype.h
new file mode 100644
index 0000000..4af80d8
--- /dev/null
+++ b/ext/hwloc/include/private/solaris-chiptype.h
@@ -0,0 +1,59 @@
+/*
+ * Copyright © 2009-2010 Oracle and/or its affiliates.  All rights reserved.
+ *
+ * $COPYRIGHT$
+ *
+ * Additional copyrights may follow
+ *
+ * $HEADER$
+ */
+
+
+#ifdef HWLOC_INSIDE_PLUGIN
+/*
+ * these declarations are internal only, they are not available to plugins
+ * (functions below are internal static symbols).
+ */
+#error This file should not be used in plugins
+#endif
+
+
+#ifndef HWLOC_PRIVATE_SOLARIS_CHIPTYPE_H
+#define HWLOC_PRIVATE_SOLARIS_CHIPTYPE_H
+
+/* SPARC Chip Modes. */
+#define MODE_UNKNOWN            0
+#define MODE_SPITFIRE           1
+#define MODE_BLACKBIRD          2
+#define MODE_CHEETAH            3
+#define MODE_SPARC64_VI         4
+#define MODE_T1                 5
+#define MODE_T2                 6
+#define MODE_SPARC64_VII        7
+#define MODE_ROCK               8
+
+/* SPARC Chip Implementations. */
+#define IMPL_SPARC64_VI         0x6
+#define IMPL_SPARC64_VII        0x7
+#define IMPL_SPITFIRE           0x10
+#define IMPL_BLACKBIRD          0x11
+#define IMPL_SABRE              0x12
+#define IMPL_HUMMINGBIRD        0x13
+#define IMPL_CHEETAH            0x14
+#define IMPL_CHEETAHPLUS        0x15
+#define IMPL_JALAPENO           0x16
+#define IMPL_JAGUAR             0x18
+#define IMPL_PANTHER            0x19
+#define IMPL_NIAGARA            0x23
+#define IMPL_NIAGARA_2          0x24
+#define IMPL_ROCK               0x25
+
+/* Default Mfg, Cache, Speed settings */
+#define TI_MANUFACTURER         0x17
+#define TWO_MEG_CACHE           2097152
+#define SPITFIRE_SPEED          142943750
+
+char* hwloc_solaris_get_chip_type(void);
+char* hwloc_solaris_get_chip_model(void);
+
+#endif /* HWLOC_PRIVATE_SOLARIS_CHIPTYPE_H */
diff --git a/ext/hwloc/include/private/xml.h b/ext/hwloc/include/private/xml.h
new file mode 100644
index 0000000..75c6c43
--- /dev/null
+++ b/ext/hwloc/include/private/xml.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright © 2009-2014 Inria.  All rights reserved.
+ * See COPYING in top-level directory.
+ */
+
+#ifndef PRIVATE_XML_H
+#define PRIVATE_XML_H 1
+
+#include <hwloc.h>
+
+#include <sys/types.h>
+
+HWLOC_DECLSPEC int hwloc__xml_verbose(void);
+
+/**************
+ * XML import *
+ **************/
+
+typedef struct hwloc__xml_import_state_s {
+  struct hwloc__xml_import_state_s *parent;
+
+  /* globals shared by the entire stack of states during import */
+  struct hwloc_xml_backend_data_s *global;
+
+  /* opaque data used to store backend-specific data.
+   * statically allocated to allow stack-allocation by the common code without knowing actual backend needs.
+   */
+  char data[32];
+} * hwloc__xml_import_state_t;
+
+HWLOC_DECLSPEC int hwloc__xml_import_diff(hwloc__xml_import_state_t state, hwloc_topology_diff_t *firstdiffp);
+
+struct hwloc_xml_backend_data_s {
+  /* xml backend parameters */
+  int (*look_init)(struct hwloc_xml_backend_data_s *bdata, struct hwloc__xml_import_state_s *state);
+  void (*look_failed)(struct hwloc_xml_backend_data_s *bdata);
+  void (*backend_exit)(struct hwloc_xml_backend_data_s *bdata);
+  int (*next_attr)(struct hwloc__xml_import_state_s * state, char **namep, char **valuep);
+  int (*find_child)(struct hwloc__xml_import_state_s * state, struct hwloc__xml_import_state_s * childstate, char **tagp);
+  int (*close_tag)(struct hwloc__xml_import_state_s * state); /* look for an explicit closing tag </name> */
+  void (*close_child)(struct hwloc__xml_import_state_s * state);
+  int (*get_content)(struct hwloc__xml_import_state_s * state, char **beginp, size_t expected_length);
+  void (*close_content)(struct hwloc__xml_import_state_s * state);
+  char * msgprefix;
+  void *data; /* libxml2 doc, or nolibxml buffer */
+  int nbnumanodes;
+  struct hwloc_xml_imported_distances_s {
+    hwloc_obj_t root;
+    struct hwloc_distances_s distances;
+    struct hwloc_xml_imported_distances_s *prev, *next;
+  } *first_distances, *last_distances;
+};
+
+/**************
+ * XML export *
+ **************/
+
+typedef struct hwloc__xml_export_state_s {
+  struct hwloc__xml_export_state_s *parent;
+
+  void (*new_child)(struct hwloc__xml_export_state_s *parentstate, struct hwloc__xml_export_state_s *state, const char *name);
+  void (*new_prop)(struct hwloc__xml_export_state_s *state, const char *name, const char *value);
+  void (*add_content)(struct hwloc__xml_export_state_s *state, const char *buffer, size_t length);
+  void (*end_object)(struct hwloc__xml_export_state_s *state, const char *name);
+
+  /* opaque data used to store backend-specific data.
+   * statically allocated to allow stack-allocation by the common code without knowing actual backend needs.
+   */
+  char data[40];
+} * hwloc__xml_export_state_t;
+
+HWLOC_DECLSPEC void hwloc__xml_export_object (hwloc__xml_export_state_t state, struct hwloc_topology *topology, struct hwloc_obj *obj);
+
+HWLOC_DECLSPEC void hwloc__xml_export_diff(hwloc__xml_export_state_t parentstate, hwloc_topology_diff_t diff);
+
+/******************
+ * XML components *
+ ******************/
+
+struct hwloc_xml_callbacks {
+  int (*backend_init)(struct hwloc_xml_backend_data_s *bdata, const char *xmlpath, const char *xmlbuffer, int xmlbuflen);
+  int (*export_file)(struct hwloc_topology *topology, const char *filename);
+  int (*export_buffer)(struct hwloc_topology *topology, char **xmlbuffer, int *buflen);
+  void (*free_buffer)(void *xmlbuffer);
+  int (*import_diff)(struct hwloc__xml_import_state_s *state, const char *xmlpath, const char *xmlbuffer, int xmlbuflen, hwloc_topology_diff_t *diff, char **refnamep);
+  int (*export_diff_file)(union hwloc_topology_diff_u *diff, const char *refname, const char *filename);
+  int (*export_diff_buffer)(union hwloc_topology_diff_u *diff, const char *refname, char **xmlbuffer, int *buflen);
+};
+
+struct hwloc_xml_component {
+  struct hwloc_xml_callbacks *nolibxml_callbacks;
+  struct hwloc_xml_callbacks *libxml_callbacks;
+};
+
+HWLOC_DECLSPEC void hwloc_xml_callbacks_register(struct hwloc_xml_component *component);
+HWLOC_DECLSPEC void hwloc_xml_callbacks_reset(void);
+
+#endif /* PRIVATE_XML_H */
diff --git a/ext/hwloc/include/static-components.h b/ext/hwloc/include/static-components.h
new file mode 100644
index 0000000..ad23185
--- /dev/null
+++ b/ext/hwloc/include/static-components.h
@@ -0,0 +1,17 @@
+HWLOC_DECLSPEC extern const struct hwloc_component hwloc_noos_component;
+//HWLOC_DECLSPEC extern const struct hwloc_component hwloc_xml_component;
+HWLOC_DECLSPEC extern const struct hwloc_component hwloc_synthetic_component;
+//HWLOC_DECLSPEC extern const struct hwloc_component hwloc_xml_nolibxml_component;
+HWLOC_DECLSPEC extern const struct hwloc_component hwloc_linux_component;
+HWLOC_DECLSPEC extern const struct hwloc_component hwloc_linuxpci_component;
+HWLOC_DECLSPEC extern const struct hwloc_component hwloc_x86_component;
+static const struct hwloc_component * hwloc_static_components[] = { /* 'static' in a header: every including file gets its own copy */
+  &hwloc_noos_component,
+//  &hwloc_xml_component,  (disabled: left out of the list)
+  &hwloc_synthetic_component,
+//  &hwloc_xml_nolibxml_component,  (disabled: left out of the list)
+  &hwloc_linux_component,
+  &hwloc_linuxpci_component,
+  &hwloc_x86_component,
+  NULL /* the list ends with a NULL entry */
+};
diff --git a/ext/lua/Makefile b/ext/lua/Makefile
new file mode 100644
index 0000000..3e60d68
--- /dev/null
+++ b/ext/lua/Makefile
@@ -0,0 +1,66 @@
+SRC_DIRS    = ./src
+MAKE_DIR   = ../../make
+
+#DO NOT EDIT BELOW
+
+include ../../config.mk
+include $(MAKE_DIR)/include_$(COMPILER).mk
+
+CFLAGS    = -O2 -Wall -fPIC
+INCLUDES  = -I./includes
+DEFINES   = -DLUA_COMPAT_ALL -DLUA_USE_LINUX
+LIBS      = -lm -Wl,-E -ldl
+#LFLAGS    =
+Q         ?= @
+ifeq ($(COMPILER),MIC)
+CFLAGS += -mmic
+LFLAGS += -mmic
+endif
+
+
+#CONFIGURE BUILD SYSTEM
+BUILD_DIR  = ./$(COMPILER)
+
+VPATH     = $(SRC_DIRS)
+FILES     = $(notdir $(foreach dir,$(SRC_DIRS),$(wildcard $(dir)/*.c)))
+OBJ       = $(patsubst %.c, $(BUILD_DIR)/%.o, $(FILES))
+
+CPPFLAGS := $(CPPFLAGS) $(DEFINES) $(INCLUDES)
+
+STATIC_LIBLUA = liblua.a
+SHARED_LIBLUA = liblua.so
+INTERPRETER = lua
+
+all: $(BUILD_DIR) $(OBJ) $(INTERPRETER)
+
+$(BUILD_DIR):
+	@mkdir -p $(BUILD_DIR)
+# Archive depends on the objects so it is refreshed when a source changes; 'r' replaces members instead of appending duplicates.
+$(STATIC_LIBLUA): $(OBJ)
+	$(Q)${AR} -crs $(STATIC_LIBLUA) $(OBJ)
+# Shared library is relinked whenever an object changes.
+$(SHARED_LIBLUA): $(OBJ)
+	$(Q)$(CC) $(LFLAGS) -shared -fPIC -Wl,-soname,$(SHARED_LIBLUA).$(VERSION) -o $(SHARED_LIBLUA) $(OBJ)
+# Stand-alone interpreter: lua.o linked against the static library.
+$(INTERPRETER): $(STATIC_LIBLUA) $(BUILD_DIR)/lua.o
+	$(Q)$(CC) -o $@ $(LFLAGS) $(BUILD_DIR)/lua.o $(STATIC_LIBLUA) $(LIBS)
+# The order-only prerequisite below creates the build directory before any object is compiled.
+#PATTERN RULES
+$(BUILD_DIR)/%.o:  %.c | $(BUILD_DIR)
+	${Q}$(CC) -c  $(CFLAGS) $(CPPFLAGS) $< -o $@
+	${Q}$(CC) $(CPPFLAGS) -MT $(@:.d=.o) -MM  $< > $(BUILD_DIR)/$*.d
+
+ifeq ($(findstring $(MAKECMDGOALS),clean),)
+-include $(OBJ:.o=.d)
+endif
+
+.PHONY: clean distclean $(INTERPRETER)
+
+clean:
+	@rm -rf $(BUILD_DIR)
+
+distclean: clean
+	@rm -f $(TARGET) $(INTERPRETER) $(STATIC_LIBLUA) $(SHARED_LIBLUA)
+
+
+
diff --git a/ext/lua/includes/lapi.h b/ext/lua/includes/lapi.h
new file mode 100644
index 0000000..0909a39
--- /dev/null
+++ b/ext/lua/includes/lapi.h
@@ -0,0 +1,24 @@
+/*
+** $Id: lapi.h,v 2.7 2009/11/27 15:37:59 roberto Exp $
+** Auxiliary functions from Lua API
+** See Copyright Notice in lua.h
+*/
+
+#ifndef lapi_h
+#define lapi_h
+
+
+#include "llimits.h"
+#include "lstate.h"
+/* advance L->top by one, then api_check that it did not pass L->ci->top; expands to a { } block */
+#define api_incr_top(L)   {L->top++; api_check(L, L->top <= L->ci->top, \
+				"stack overflow");}
+
+#define adjustresults(L,nres) \
+    { if ((nres) == LUA_MULTRET && L->ci->top < L->top) L->ci->top = L->top; }
+
+#define api_checknelems(L,n)	api_check(L, (n) < (L->top - L->ci->func), \
+				  "not enough elements in the stack")
+
+
+#endif
diff --git a/ext/lua/includes/lauxlib.h b/ext/lua/includes/lauxlib.h
new file mode 100644
index 0000000..ac4d15f
--- /dev/null
+++ b/ext/lua/includes/lauxlib.h
@@ -0,0 +1,212 @@
+/*
+** $Id: lauxlib.h,v 1.120 2011/11/29 15:55:08 roberto Exp $
+** Auxiliary functions for building Lua libraries
+** See Copyright Notice in lua.h
+*/
+
+
+#ifndef lauxlib_h
+#define lauxlib_h
+
+
+#include <stddef.h>
+#include <stdio.h>
+
+#include "lua.h"
+
+
+
+/* extra error code for `luaL_load' */
+#define LUA_ERRFILE     (LUA_ERRERR+1)
+
+
+typedef struct luaL_Reg {
+  const char *name;
+  lua_CFunction func;
+} luaL_Reg;
+
+
+LUALIB_API void (luaL_checkversion_) (lua_State *L, lua_Number ver);
+#define luaL_checkversion(L)	luaL_checkversion_(L, LUA_VERSION_NUM)
+
+LUALIB_API int (luaL_getmetafield) (lua_State *L, int obj, const char *e);
+LUALIB_API int (luaL_callmeta) (lua_State *L, int obj, const char *e);
+LUALIB_API const char *(luaL_tolstring) (lua_State *L, int idx, size_t *len);
+LUALIB_API int (luaL_argerror) (lua_State *L, int numarg, const char *extramsg);
+LUALIB_API const char *(luaL_checklstring) (lua_State *L, int numArg,
+                                                          size_t *l);
+LUALIB_API const char *(luaL_optlstring) (lua_State *L, int numArg,
+                                          const char *def, size_t *l);
+LUALIB_API lua_Number (luaL_checknumber) (lua_State *L, int numArg);
+LUALIB_API lua_Number (luaL_optnumber) (lua_State *L, int nArg, lua_Number def);
+
+LUALIB_API lua_Integer (luaL_checkinteger) (lua_State *L, int numArg);
+LUALIB_API lua_Integer (luaL_optinteger) (lua_State *L, int nArg,
+                                          lua_Integer def);
+LUALIB_API lua_Unsigned (luaL_checkunsigned) (lua_State *L, int numArg);
+LUALIB_API lua_Unsigned (luaL_optunsigned) (lua_State *L, int numArg,
+                                            lua_Unsigned def);
+
+LUALIB_API void (luaL_checkstack) (lua_State *L, int sz, const char *msg);
+LUALIB_API void (luaL_checktype) (lua_State *L, int narg, int t);
+LUALIB_API void (luaL_checkany) (lua_State *L, int narg);
+
+LUALIB_API int   (luaL_newmetatable) (lua_State *L, const char *tname);
+LUALIB_API void  (luaL_setmetatable) (lua_State *L, const char *tname);
+LUALIB_API void *(luaL_testudata) (lua_State *L, int ud, const char *tname);
+LUALIB_API void *(luaL_checkudata) (lua_State *L, int ud, const char *tname);
+
+LUALIB_API void (luaL_where) (lua_State *L, int lvl);
+LUALIB_API int (luaL_error) (lua_State *L, const char *fmt, ...);
+
+LUALIB_API int (luaL_checkoption) (lua_State *L, int narg, const char *def,
+                                   const char *const lst[]);
+
+LUALIB_API int (luaL_fileresult) (lua_State *L, int stat, const char *fname);
+LUALIB_API int (luaL_execresult) (lua_State *L, int stat);
+
+/* pre-defined references */
+#define LUA_NOREF       (-2)
+#define LUA_REFNIL      (-1)
+
+LUALIB_API int (luaL_ref) (lua_State *L, int t);
+LUALIB_API void (luaL_unref) (lua_State *L, int t, int ref);
+
+LUALIB_API int (luaL_loadfilex) (lua_State *L, const char *filename,
+                                               const char *mode);
+
+#define luaL_loadfile(L,f)	luaL_loadfilex(L,f,NULL)
+
+LUALIB_API int (luaL_loadbufferx) (lua_State *L, const char *buff, size_t sz,
+                                   const char *name, const char *mode);
+LUALIB_API int (luaL_loadstring) (lua_State *L, const char *s);
+
+LUALIB_API lua_State *(luaL_newstate) (void);
+
+LUALIB_API int (luaL_len) (lua_State *L, int idx);
+
+LUALIB_API const char *(luaL_gsub) (lua_State *L, const char *s, const char *p,
+                                                  const char *r);
+
+LUALIB_API void (luaL_setfuncs) (lua_State *L, const luaL_Reg *l, int nup);
+
+LUALIB_API int (luaL_getsubtable) (lua_State *L, int idx, const char *fname);
+
+LUALIB_API void (luaL_traceback) (lua_State *L, lua_State *L1,
+                                  const char *msg, int level);
+
+LUALIB_API void (luaL_requiref) (lua_State *L, const char *modname,
+                                 lua_CFunction openf, int glb);
+
+/*
+** ===============================================================
+** some useful macros
+** ===============================================================
+*/
+
+/* 'l' must be a real array (sizeof is applied to it), not a pointer; the size hint is its element count minus one */
+#define luaL_newlibtable(L,l)	\
+  lua_createtable(L, 0, sizeof(l)/sizeof((l)[0]) - 1)
+
+#define luaL_newlib(L,l)	(luaL_newlibtable(L,l), luaL_setfuncs(L,l,0))
+
+#define luaL_argcheck(L, cond,numarg,extramsg)	\
+		((void)((cond) || luaL_argerror(L, (numarg), (extramsg))))
+#define luaL_checkstring(L,n)	(luaL_checklstring(L, (n), NULL))
+#define luaL_optstring(L,n,d)	(luaL_optlstring(L, (n), (d), NULL))
+#define luaL_checkint(L,n)	((int)luaL_checkinteger(L, (n)))
+#define luaL_optint(L,n,d)	((int)luaL_optinteger(L, (n), (d)))
+#define luaL_checklong(L,n)	((long)luaL_checkinteger(L, (n)))
+#define luaL_optlong(L,n,d)	((long)luaL_optinteger(L, (n), (d)))
+
+#define luaL_typename(L,i)	lua_typename(L, lua_type(L,(i)))
+
+#define luaL_dofile(L, fn) \
+	(luaL_loadfile(L, fn) || lua_pcall(L, 0, LUA_MULTRET, 0))
+
+#define luaL_dostring(L, s) \
+	(luaL_loadstring(L, s) || lua_pcall(L, 0, LUA_MULTRET, 0))
+
+#define luaL_getmetatable(L,n)	(lua_getfield(L, LUA_REGISTRYINDEX, (n)))
+
+#define luaL_opt(L,f,n,d)	(lua_isnoneornil(L,(n)) ? (d) : f(L,(n)))
+
+#define luaL_loadbuffer(L,s,sz,n)	luaL_loadbufferx(L,s,sz,n,NULL)
+
+
+/*
+** {======================================================
+** Generic Buffer manipulation
+** =======================================================
+*/
+
+typedef struct luaL_Buffer {
+  char *b;  /* buffer address */
+  size_t size;  /* buffer size */
+  size_t n;  /* number of characters in buffer */
+  lua_State *L;
+  char initb[LUAL_BUFFERSIZE];  /* initial buffer */
+} luaL_Buffer;
+
+/* store c at b[n++], calling luaL_prepbuffsize(B, 1) first when n is not below size; B is evaluated several times */
+#define luaL_addchar(B,c) \
+  ((void)((B)->n < (B)->size || luaL_prepbuffsize((B), 1)), \
+   ((B)->b[(B)->n++] = (c)))
+
+#define luaL_addsize(B,s)	((B)->n += (s))
+
+LUALIB_API void (luaL_buffinit) (lua_State *L, luaL_Buffer *B);
+LUALIB_API char *(luaL_prepbuffsize) (luaL_Buffer *B, size_t sz);
+LUALIB_API void (luaL_addlstring) (luaL_Buffer *B, const char *s, size_t l);
+LUALIB_API void (luaL_addstring) (luaL_Buffer *B, const char *s);
+LUALIB_API void (luaL_addvalue) (luaL_Buffer *B);
+LUALIB_API void (luaL_pushresult) (luaL_Buffer *B);
+LUALIB_API void (luaL_pushresultsize) (luaL_Buffer *B, size_t sz);
+LUALIB_API char *(luaL_buffinitsize) (lua_State *L, luaL_Buffer *B, size_t sz);
+
+#define luaL_prepbuffer(B)	luaL_prepbuffsize(B, LUAL_BUFFERSIZE)
+
+/* }====================================================== */
+
+
+
+/*
+** {======================================================
+** File handles for IO library
+** =======================================================
+*/
+
+/*
+** A file handle is a userdata with metatable 'LUA_FILEHANDLE' and
+** initial structure 'luaL_Stream' (it may contain other fields
+** after that initial structure).
+*/
+
+#define LUA_FILEHANDLE          "FILE*"
+
+
+typedef struct luaL_Stream {
+  FILE *f;  /* stream (NULL for incompletely created streams) */
+  lua_CFunction closef;  /* to close stream (NULL for closed streams) */
+} luaL_Stream;
+
+/* }====================================================== */
+
+
+
+/* compatibility with old module system */
+#if defined(LUA_COMPAT_MODULE)
+
+LUALIB_API void (luaL_pushmodule) (lua_State *L, const char *modname,
+                                   int sizehint);
+LUALIB_API void (luaL_openlib) (lua_State *L, const char *libname,
+                                const luaL_Reg *l, int nup);
+
+#define luaL_register(L,n,l)	(luaL_openlib(L,(n),(l),0))
+
+#endif
+
+
+#endif
+
+
diff --git a/ext/lua/includes/lcode.h b/ext/lua/includes/lcode.h
new file mode 100644
index 0000000..5a1fa9f
--- /dev/null
+++ b/ext/lua/includes/lcode.h
@@ -0,0 +1,83 @@
+/*
+** $Id: lcode.h,v 1.58 2011/08/30 16:26:41 roberto Exp $
+** Code generator for Lua
+** See Copyright Notice in lua.h
+*/
+
+#ifndef lcode_h
+#define lcode_h
+
+#include "llex.h"
+#include "lobject.h"
+#include "lopcodes.h"
+#include "lparser.h"
+
+
+/*
+** Marks the end of a patch list. It is an invalid value both as an absolute
+** address, and as a list link (would link an element to itself).
+*/
+#define NO_JUMP (-1)
+
+
+/*
+** grep "ORDER OPR" if you change these enums  (ORDER OP)
+*/
+typedef enum BinOpr {
+  OPR_ADD, OPR_SUB, OPR_MUL, OPR_DIV, OPR_MOD, OPR_POW,
+  OPR_CONCAT,
+  OPR_EQ, OPR_LT, OPR_LE,
+  OPR_NE, OPR_GT, OPR_GE,
+  OPR_AND, OPR_OR,
+  OPR_NOBINOPR
+} BinOpr;
+
+
+typedef enum UnOpr { OPR_MINUS, OPR_NOT, OPR_LEN, OPR_NOUNOPR } UnOpr;
+
+
+#define getcode(fs,e)	((fs)->f->code[(e)->u.info])
+/* signed-operand variant: adds MAXARG_sBx to sBx and forwards to luaK_codeABx */
+#define luaK_codeAsBx(fs,o,A,sBx)	luaK_codeABx(fs,o,A,(sBx)+MAXARG_sBx)
+
+#define luaK_setmultret(fs,e)	luaK_setreturns(fs, e, LUA_MULTRET)
+
+#define luaK_jumpto(fs,t)	luaK_patchlist(fs, luaK_jump(fs), t)
+
+LUAI_FUNC int luaK_codeABx (FuncState *fs, OpCode o, int A, unsigned int Bx);
+LUAI_FUNC int luaK_codeABC (FuncState *fs, OpCode o, int A, int B, int C);
+LUAI_FUNC int luaK_codek (FuncState *fs, int reg, int k);
+LUAI_FUNC void luaK_fixline (FuncState *fs, int line);
+LUAI_FUNC void luaK_nil (FuncState *fs, int from, int n);
+LUAI_FUNC void luaK_reserveregs (FuncState *fs, int n);
+LUAI_FUNC void luaK_checkstack (FuncState *fs, int n);
+LUAI_FUNC int luaK_stringK (FuncState *fs, TString *s);
+LUAI_FUNC int luaK_numberK (FuncState *fs, lua_Number r);
+LUAI_FUNC void luaK_dischargevars (FuncState *fs, expdesc *e);
+LUAI_FUNC int luaK_exp2anyreg (FuncState *fs, expdesc *e);
+LUAI_FUNC void luaK_exp2anyregup (FuncState *fs, expdesc *e);
+LUAI_FUNC void luaK_exp2nextreg (FuncState *fs, expdesc *e);
+LUAI_FUNC void luaK_exp2val (FuncState *fs, expdesc *e);
+LUAI_FUNC int luaK_exp2RK (FuncState *fs, expdesc *e);
+LUAI_FUNC void luaK_self (FuncState *fs, expdesc *e, expdesc *key);
+LUAI_FUNC void luaK_indexed (FuncState *fs, expdesc *t, expdesc *k);
+LUAI_FUNC void luaK_goiftrue (FuncState *fs, expdesc *e);
+LUAI_FUNC void luaK_goiffalse (FuncState *fs, expdesc *e);
+LUAI_FUNC void luaK_storevar (FuncState *fs, expdesc *var, expdesc *e);
+LUAI_FUNC void luaK_setreturns (FuncState *fs, expdesc *e, int nresults);
+LUAI_FUNC void luaK_setoneret (FuncState *fs, expdesc *e);
+LUAI_FUNC int luaK_jump (FuncState *fs);
+LUAI_FUNC void luaK_ret (FuncState *fs, int first, int nret);
+LUAI_FUNC void luaK_patchlist (FuncState *fs, int list, int target);
+LUAI_FUNC void luaK_patchtohere (FuncState *fs, int list);
+LUAI_FUNC void luaK_patchclose (FuncState *fs, int list, int level);
+LUAI_FUNC void luaK_concat (FuncState *fs, int *l1, int l2);
+LUAI_FUNC int luaK_getlabel (FuncState *fs);
+LUAI_FUNC void luaK_prefix (FuncState *fs, UnOpr op, expdesc *v, int line);
+LUAI_FUNC void luaK_infix (FuncState *fs, BinOpr op, expdesc *v);
+LUAI_FUNC void luaK_posfix (FuncState *fs, BinOpr op, expdesc *v1,
+                            expdesc *v2, int line);
+LUAI_FUNC void luaK_setlist (FuncState *fs, int base, int nelems, int tostore);
+
+
+#endif
diff --git a/ext/lua/includes/lctype.h b/ext/lua/includes/lctype.h
new file mode 100644
index 0000000..99c7d12
--- /dev/null
+++ b/ext/lua/includes/lctype.h
@@ -0,0 +1,95 @@
+/*
+** $Id: lctype.h,v 1.12 2011/07/15 12:50:29 roberto Exp $
+** 'ctype' functions for Lua
+** See Copyright Notice in lua.h
+*/
+
+#ifndef lctype_h
+#define lctype_h
+
+#include "lua.h"
+
+
+/*
+** WARNING: the functions defined here do not necessarily correspond
+** to the similar functions in the standard C ctype.h. They are
+** optimized for the specific needs of Lua
+*/
+
+#if !defined(LUA_USE_CTYPE)
+
+#if 'A' == 65 && '0' == 48
+/* ASCII case: can use its own tables; faster and fixed */
+#define LUA_USE_CTYPE	0
+#else
+/* must use standard C ctype */
+#define LUA_USE_CTYPE	1
+#endif
+
+#endif
+
+
+#if !LUA_USE_CTYPE	/* { */
+
+#include <limits.h>
+
+#include "llimits.h"
+
+
+#define ALPHABIT	0
+#define DIGITBIT	1
+#define PRINTBIT	2
+#define SPACEBIT	3
+#define XDIGITBIT	4
+
+
+#define MASK(B)		(1 << (B))
+
+
+/*
+** add 1 to char to allow index -1 (EOZ)
+*/
+#define testprop(c,p)	(luai_ctype_[(c)+1] & (p))
+
+/*
+** 'lalpha' (Lua alphabetic) and 'lalnum' (Lua alphanumeric) both include '_'
+*/
+#define lislalpha(c)	testprop(c, MASK(ALPHABIT))
+#define lislalnum(c)	testprop(c, (MASK(ALPHABIT) | MASK(DIGITBIT)))
+#define lisdigit(c)	testprop(c, MASK(DIGITBIT))
+#define lisspace(c)	testprop(c, MASK(SPACEBIT))
+#define lisprint(c)	testprop(c, MASK(PRINTBIT))
+#define lisxdigit(c)	testprop(c, MASK(XDIGITBIT))
+
+/*
+** this 'ltolower' only works for alphabetic characters
+*/
+#define ltolower(c)	((c) | ('A' ^ 'a'))
+
+
+/* two more entries for 0 and -1 (EOZ) */
+LUAI_DDEC const lu_byte luai_ctype_[UCHAR_MAX + 2];
+
+
+#else			/* }{ */
+
+/*
+** use standard C ctypes
+*/
+
+#include <ctype.h>
+
+
+#define lislalpha(c)	(isalpha(c) || (c) == '_')
+#define lislalnum(c)	(isalnum(c) || (c) == '_')
+#define lisdigit(c)	(isdigit(c))
+#define lisspace(c)	(isspace(c))
+#define lisprint(c)	(isprint(c))
+#define lisxdigit(c)	(isxdigit(c))
+
+#define ltolower(c)	(tolower(c))
+
+#endif			/* } */
+
+#endif
+
diff --git a/ext/lua/includes/ldebug.h b/ext/lua/includes/ldebug.h
new file mode 100644
index 0000000..fe39556
--- /dev/null
+++ b/ext/lua/includes/ldebug.h
@@ -0,0 +1,34 @@
+/*
+** $Id: ldebug.h,v 2.7 2011/10/07 20:45:19 roberto Exp $
+** Auxiliary functions from Debug Interface module
+** See Copyright Notice in lua.h
+*/
+
+#ifndef ldebug_h
+#define ldebug_h
+
+
+#include "lstate.h"
+
+
+#define pcRel(pc, p)	(cast(int, (pc) - (p)->code) - 1)
+
+#define getfuncline(f,pc)	(((f)->lineinfo) ? (f)->lineinfo[pc] : 0)
+
+#define resethookcount(L)	(L->hookcount = L->basehookcount)
+
+/* Active Lua function (given call info) */
+#define ci_func(ci)		(clLvalue((ci)->func))
+
+
+LUAI_FUNC l_noret luaG_typeerror (lua_State *L, const TValue *o,
+                                                const char *opname);
+LUAI_FUNC l_noret luaG_concaterror (lua_State *L, StkId p1, StkId p2);
+LUAI_FUNC l_noret luaG_aritherror (lua_State *L, const TValue *p1,
+                                                 const TValue *p2);
+LUAI_FUNC l_noret luaG_ordererror (lua_State *L, const TValue *p1,
+                                                 const TValue *p2);
+LUAI_FUNC l_noret luaG_runerror (lua_State *L, const char *fmt, ...);
+LUAI_FUNC l_noret luaG_errormsg (lua_State *L);
+
+#endif
diff --git a/ext/lua/includes/ldo.h b/ext/lua/includes/ldo.h
new file mode 100644
index 0000000..27b837d
--- /dev/null
+++ b/ext/lua/includes/ldo.h
@@ -0,0 +1,46 @@
+/*
+** $Id: ldo.h,v 2.20 2011/11/29 15:55:08 roberto Exp $
+** Stack and Call structure of Lua
+** See Copyright Notice in lua.h
+*/
+
+#ifndef ldo_h
+#define ldo_h
+
+
+#include "lobject.h"
+#include "lstate.h"
+#include "lzio.h"
+
+/* calls luaD_growstack(L, n) when stack_last - top <= n, else condmovestack(L); bare if/else with its own ';' -- never use as an unbraced if-body */
+#define luaD_checkstack(L,n)	if (L->stack_last - L->top <= (n)) \
+				    luaD_growstack(L, n); else condmovestack(L);
+
+
+#define incr_top(L) {L->top++; luaD_checkstack(L,0);}
+
+#define savestack(L,p)		((char *)(p) - (char *)L->stack)
+#define restorestack(L,n)	((TValue *)((char *)L->stack + (n)))
+
+
+/* type of protected functions, to be run by `runprotected' */
+typedef void (*Pfunc) (lua_State *L, void *ud);
+
+LUAI_FUNC int luaD_protectedparser (lua_State *L, ZIO *z, const char *name,
+                                                  const char *mode);
+LUAI_FUNC void luaD_hook (lua_State *L, int event, int line);
+LUAI_FUNC int luaD_precall (lua_State *L, StkId func, int nresults);
+LUAI_FUNC void luaD_call (lua_State *L, StkId func, int nResults,
+                                        int allowyield);
+LUAI_FUNC int luaD_pcall (lua_State *L, Pfunc func, void *u,
+                                        ptrdiff_t oldtop, ptrdiff_t ef);
+LUAI_FUNC int luaD_poscall (lua_State *L, StkId firstResult);
+LUAI_FUNC void luaD_reallocstack (lua_State *L, int newsize);
+LUAI_FUNC void luaD_growstack (lua_State *L, int n);
+LUAI_FUNC void luaD_shrinkstack (lua_State *L);
+
+LUAI_FUNC l_noret luaD_throw (lua_State *L, int errcode);
+LUAI_FUNC int luaD_rawrunprotected (lua_State *L, Pfunc f, void *ud);
+
+#endif
+
diff --git a/ext/lua/includes/lfunc.h b/ext/lua/includes/lfunc.h
new file mode 100644
index 0000000..e236a71
--- /dev/null
+++ b/ext/lua/includes/lfunc.h
@@ -0,0 +1,33 @@
+/*
+** $Id: lfunc.h,v 2.8 2012/05/08 13:53:33 roberto Exp $
+** Auxiliary functions to manipulate prototypes and closures
+** See Copyright Notice in lua.h
+*/
+
+#ifndef lfunc_h
+#define lfunc_h
+
+
+#include "lobject.h"
+
+
+#define sizeCclosure(n)	(cast(int, sizeof(CClosure)) + \
+                         cast(int, sizeof(TValue)*((n)-1)))
+
+#define sizeLclosure(n)	(cast(int, sizeof(LClosure)) + \
+                         cast(int, sizeof(TValue *)*((n)-1)))
+
+
+LUAI_FUNC Proto *luaF_newproto (lua_State *L);
+LUAI_FUNC Closure *luaF_newCclosure (lua_State *L, int nelems);
+LUAI_FUNC Closure *luaF_newLclosure (lua_State *L, int nelems);
+LUAI_FUNC UpVal *luaF_newupval (lua_State *L);
+LUAI_FUNC UpVal *luaF_findupval (lua_State *L, StkId level);
+LUAI_FUNC void luaF_close (lua_State *L, StkId level);
+LUAI_FUNC void luaF_freeproto (lua_State *L, Proto *f);
+LUAI_FUNC void luaF_freeupval (lua_State *L, UpVal *uv);
+LUAI_FUNC const char *luaF_getlocalname (const Proto *func, int local_number,
+                                         int pc);
+
+
+#endif
diff --git a/ext/lua/includes/lgc.h b/ext/lua/includes/lgc.h
new file mode 100644
index 0000000..dee270b
--- /dev/null
+++ b/ext/lua/includes/lgc.h
@@ -0,0 +1,157 @@
+/*
+** $Id: lgc.h,v 2.58 2012/09/11 12:53:08 roberto Exp $
+** Garbage Collector
+** See Copyright Notice in lua.h
+*/
+
+#ifndef lgc_h
+#define lgc_h
+
+
+#include "lobject.h"
+#include "lstate.h"
+
+/*
+** Collectable objects may have one of three colors: white, which
+** means the object is not marked; gray, which means the
+** object is marked, but its references may be not marked; and
+** black, which means that the object and all its references are marked.
+** The main invariant of the garbage collector, while marking objects,
+** is that a black object can never point to a white one. Moreover,
+** any gray object must be in a "gray list" (gray, grayagain, weak,
+** allweak, ephemeron) so that it can be visited again before finishing
+** the collection cycle. These lists have no meaning when the invariant
+** is not being enforced (e.g., sweep phase).
+*/
+
+
+
+/* how much to allocate before next GC step */
+#if !defined(GCSTEPSIZE)
+/* ~100 small strings */
+#define GCSTEPSIZE	(cast_int(100 * sizeof(TString)))
+#endif
+
+
+/*
+** Possible states of the Garbage Collector
+*/
+#define GCSpropagate	0
+#define GCSatomic	1
+#define GCSsweepstring	2
+#define GCSsweepudata	3
+#define GCSsweep	4
+#define GCSpause	5
+
+
+#define issweepphase(g)  \
+	(GCSsweepstring <= (g)->gcstate && (g)->gcstate <= GCSsweep)
+
+#define isgenerational(g)	((g)->gckind == KGC_GEN)
+
+/*
+** macros to tell when main invariant (black objects cannot point to white
+** ones) must be kept. During a non-generational collection, the sweep
+** phase may break the invariant, as still-black objects may point to
+** objects already turned white. The invariant is restored when sweep ends
+** and all objects are white again. During a generational collection, the
+** invariant must be kept all times.
+*/
+
+#define keepinvariant(g)	(isgenerational(g) || (g)->gcstate <= GCSatomic)
+
+
+/*
+** Outside the collector, the state in generational mode is kept in
+** 'propagate', so 'keepinvariant' is always true (asserted via check_exp).
+*/
+#define keepinvariantout(g)  \
+  check_exp((g)->gcstate == GCSpropagate || !isgenerational(g),  \
+            (g)->gcstate <= GCSatomic)
+
+
+/*
+** some useful bit tricks
+*/
+#define resetbits(x,m)		((x) &= cast(lu_byte, ~(m)))
+#define setbits(x,m)		((x) |= (m))
+#define testbits(x,m)		((x) & (m))
+#define bitmask(b)		(1<<(b))
+#define bit2mask(b1,b2)		(bitmask(b1) | bitmask(b2))
+#define l_setbit(x,b)		setbits(x, bitmask(b))
+#define resetbit(x,b)		resetbits(x, bitmask(b))
+#define testbit(x,b)		testbits(x, bitmask(b))
+
+
+/* Layout for bit use in `marked' field: */
+#define WHITE0BIT	0  /* object is white (type 0) */
+#define WHITE1BIT	1  /* object is white (type 1) */
+#define BLACKBIT	2  /* object is black */
+#define FINALIZEDBIT	3  /* object has been separated for finalization */
+#define SEPARATED	4  /* object is in 'finobj' list or in 'tobefnz' */
+#define FIXEDBIT	5  /* object is fixed (should not be collected) */
+#define OLDBIT		6  /* object is old (only in generational mode) */
+/* bit 7 is currently used by tests (luaL_checkmemory) */
+
+#define WHITEBITS	bit2mask(WHITE0BIT, WHITE1BIT)
+
+
+#define iswhite(x)      testbits((x)->gch.marked, WHITEBITS)
+#define isblack(x)      testbit((x)->gch.marked, BLACKBIT)
+#define isgray(x)  /* neither white nor black */  \
+	(!testbits((x)->gch.marked, WHITEBITS | bitmask(BLACKBIT)))
+
+#define isold(x)	testbit((x)->gch.marked, OLDBIT)
+
+/* MOVE OLD rule: whenever an object is moved to the beginning of
+   a GC list, its old bit must be cleared */
+#define resetoldbit(o)	resetbit((o)->gch.marked, OLDBIT)
+
+#define otherwhite(g)	((g)->currentwhite ^ WHITEBITS)  /* currentwhite with both white bits flipped */
+#define isdeadm(ow,m)	(!(((m) ^ WHITEBITS) & (ow)))
+#define isdead(g,v)	isdeadm(otherwhite(g), (v)->gch.marked)
+
+#define changewhite(x)	((x)->gch.marked ^= WHITEBITS)
+#define gray2black(x)	l_setbit((x)->gch.marked, BLACKBIT)
+
+#define valiswhite(x)	(iscollectable(x) && iswhite(gcvalue(x)))
+
+#define luaC_white(g)	cast(lu_byte, (g)->currentwhite & WHITEBITS)
+
+
+#define luaC_condGC(L,c) \
+	{if (G(L)->GCdebt > 0) {c;}; condchangemem(L);}
+#define luaC_checkGC(L)		luaC_condGC(L, luaC_step(L);)
+
+
+#define luaC_barrier(L,p,v) { if (valiswhite(v) && isblack(obj2gco(p)))  \
+	luaC_barrier_(L,obj2gco(p),gcvalue(v)); }
+
+#define luaC_barrierback(L,p,v) { if (valiswhite(v) && isblack(obj2gco(p)))  \
+	luaC_barrierback_(L,p); }
+
+#define luaC_objbarrier(L,p,o)  \
+	{ if (iswhite(obj2gco(o)) && isblack(obj2gco(p))) \
+		luaC_barrier_(L,obj2gco(p),obj2gco(o)); }
+
+#define luaC_objbarrierback(L,p,o)  \
+   { if (iswhite(obj2gco(o)) && isblack(obj2gco(p))) luaC_barrierback_(L,p); }
+
+#define luaC_barrierproto(L,p,c) \
+   { if (isblack(obj2gco(p))) luaC_barrierproto_(L,p,c); }
+
+LUAI_FUNC void luaC_freeallobjects (lua_State *L);
+LUAI_FUNC void luaC_step (lua_State *L);
+LUAI_FUNC void luaC_forcestep (lua_State *L);
+LUAI_FUNC void luaC_runtilstate (lua_State *L, int statesmask);
+LUAI_FUNC void luaC_fullgc (lua_State *L, int isemergency);
+LUAI_FUNC GCObject *luaC_newobj (lua_State *L, int tt, size_t sz,
+                                 GCObject **list, int offset);
+LUAI_FUNC void luaC_barrier_ (lua_State *L, GCObject *o, GCObject *v);
+LUAI_FUNC void luaC_barrierback_ (lua_State *L, GCObject *o);
+LUAI_FUNC void luaC_barrierproto_ (lua_State *L, Proto *p, Closure *c);
+LUAI_FUNC void luaC_checkfinalizer (lua_State *L, GCObject *o, Table *mt);
+LUAI_FUNC void luaC_checkupvalcolor (global_State *g, UpVal *uv);
+LUAI_FUNC void luaC_changemode (lua_State *L, int mode);
+
+#endif
diff --git a/ext/lua/includes/llex.h b/ext/lua/includes/llex.h
new file mode 100644
index 0000000..9ca8a29
--- /dev/null
+++ b/ext/lua/includes/llex.h
@@ -0,0 +1,78 @@
+/*
+** $Id: llex.h,v 1.72 2011/11/30 12:43:51 roberto Exp $
+** Lexical Analyzer
+** See Copyright Notice in lua.h
+*/
+
+#ifndef llex_h
+#define llex_h
+
+#include "lobject.h"
+#include "lzio.h"
+
+
+#define FIRST_RESERVED	257
+
+
+
+/*
+* WARNING: if you change the order of this enumeration,
+* grep "ORDER RESERVED"
+*/
+enum RESERVED {
+  /* terminal symbols denoted by reserved words */
+  TK_AND = FIRST_RESERVED, TK_BREAK,
+  TK_DO, TK_ELSE, TK_ELSEIF, TK_END, TK_FALSE, TK_FOR, TK_FUNCTION,
+  TK_GOTO, TK_IF, TK_IN, TK_LOCAL, TK_NIL, TK_NOT, TK_OR, TK_REPEAT,
+  TK_RETURN, TK_THEN, TK_TRUE, TK_UNTIL, TK_WHILE,
+  /* other terminal symbols */
+  TK_CONCAT, TK_DOTS, TK_EQ, TK_GE, TK_LE, TK_NE, TK_DBCOLON, TK_EOS,
+  TK_NUMBER, TK_NAME, TK_STRING
+};
+
+/* number of reserved words */
+#define NUM_RESERVED	(cast(int, TK_WHILE-FIRST_RESERVED+1))
+
+
+typedef union {
+  lua_Number r;
+  TString *ts;
+} SemInfo;  /* semantics information */
+
+
+typedef struct Token {
+  int token;
+  SemInfo seminfo;
+} Token;
+
+
+/* state of the lexer plus state of the parser when shared by all
+   functions */
+typedef struct LexState {
+  int current;  /* current character (charint) */
+  int linenumber;  /* input line counter */
+  int lastline;  /* line of last token `consumed' */
+  Token t;  /* current token */
+  Token lookahead;  /* look ahead token */
+  struct FuncState *fs;  /* current function (parser) */
+  struct lua_State *L;
+  ZIO *z;  /* input stream */
+  Mbuffer *buff;  /* buffer for tokens */
+  struct Dyndata *dyd;  /* dynamic structures used by the parser */
+  TString *source;  /* current source name */
+  TString *envn;  /* environment variable name */
+  char decpoint;  /* locale decimal point */
+} LexState;
+
+
+LUAI_FUNC void luaX_init (lua_State *L);
+LUAI_FUNC void luaX_setinput (lua_State *L, LexState *ls, ZIO *z,
+                              TString *source, int firstchar);
+LUAI_FUNC TString *luaX_newstring (LexState *ls, const char *str, size_t l);
+LUAI_FUNC void luaX_next (LexState *ls);
+LUAI_FUNC int luaX_lookahead (LexState *ls);
+LUAI_FUNC l_noret luaX_syntaxerror (LexState *ls, const char *s);
+LUAI_FUNC const char *luaX_token2str (LexState *ls, int token);
+
+
+#endif
diff --git a/ext/lua/includes/llimits.h b/ext/lua/includes/llimits.h
new file mode 100644
index 0000000..1b8c79b
--- /dev/null
+++ b/ext/lua/includes/llimits.h
@@ -0,0 +1,309 @@
+/*
+** $Id: llimits.h,v 1.103 2013/02/20 14:08:56 roberto Exp $
+** Limits, basic types, and some other `installation-dependent' definitions
+** See Copyright Notice in lua.h
+*/
+
+#ifndef llimits_h
+#define llimits_h
+
+
+#include <limits.h>
+#include <stddef.h>
+
+
+#include "lua.h"
+
+
+typedef unsigned LUA_INT32 lu_int32;
+
+typedef LUAI_UMEM lu_mem;
+
+typedef LUAI_MEM l_mem;
+
+
+
+/* chars used as small naturals (so that `char' is reserved for characters) */
+typedef unsigned char lu_byte;
+
+
+#define MAX_SIZET	((size_t)(~(size_t)0)-2)
+
+#define MAX_LUMEM	((lu_mem)(~(lu_mem)0)-2)
+
+#define MAX_LMEM	((l_mem) ((MAX_LUMEM >> 1) - 2))
+
+
+#define MAX_INT (INT_MAX-2)  /* maximum value of an int (-2 for safety) */
+
+/*
+** conversion of pointer to integer
+** this is for hashing only; there is no problem if the integer
+** cannot hold the whole pointer value
+*/
+#define IntPoint(p)  ((unsigned int)(lu_mem)(p))
+
+
+
+/* type to ensure maximum alignment */
+#if !defined(LUAI_USER_ALIGNMENT_T)
+#define LUAI_USER_ALIGNMENT_T	union { double u; void *s; long l; }
+#endif
+
+typedef LUAI_USER_ALIGNMENT_T L_Umaxalign;
+
+
+/* result of a `usual argument conversion' over lua_Number */
+typedef LUAI_UACNUMBER l_uacNumber;
+
+
+/* internal assertions for in-house debugging */
+#if defined(lua_assert)
+#define check_exp(c,e)		(lua_assert(c), (e))
+/* statement form, to avoid problems with conditions too long; do/while(0) keeps it safe in an unbraced if/else */
+#define lua_longassert(c)	do { if (!(c)) lua_assert(0); } while (0)
+#else
+#define lua_assert(c)		((void)0)
+#define check_exp(c,e)		(e)
+#define lua_longassert(c)	((void)0)
+#endif
+
+/*
+** assertion for checking API calls
+*/
+#if !defined(luai_apicheck)
+
+#if defined(LUA_USE_APICHECK)
+#include <assert.h>
+#define luai_apicheck(L,e)	assert(e)
+#else
+#define luai_apicheck(L,e)	lua_assert(e)
+#endif
+
+#endif
+
+#define api_check(l,e,msg)	luai_apicheck(l,(e) && msg)
+
+
+#if !defined(UNUSED)
+#define UNUSED(x)	((void)(x))	/* to avoid warnings */
+#endif
+
+
+#define cast(t, exp)	((t)(exp))
+
+#define cast_byte(i)	cast(lu_byte, (i))
+#define cast_num(i)	cast(lua_Number, (i))
+#define cast_int(i)	cast(int, (i))
+#define cast_uchar(i)	cast(unsigned char, (i))
+
+
+/*
+** non-return type
+*/
+#if defined(__GNUC__)
+#define l_noret		void __attribute__((noreturn))
+#elif defined(_MSC_VER)
+#define l_noret		void __declspec(noreturn)
+#else
+#define l_noret		void
+#endif
+
+
+
+/*
+** maximum depth for nested C calls and syntactical nested non-terminals
+** in a program. (Value must fit in an unsigned short int.)
+*/
+#if !defined(LUAI_MAXCCALLS)
+#define LUAI_MAXCCALLS		200
+#endif
+
+/*
+** maximum number of upvalues in a closure (both C and Lua). (Value
+** must fit in an unsigned char.)
+*/
+#define MAXUPVAL	UCHAR_MAX
+
+
+/*
+** type for virtual-machine instructions
+** must be an unsigned with (at least) 4 bytes (see details in lopcodes.h)
+*/
+typedef lu_int32 Instruction;
+
+
+
+/* maximum stack for a Lua function */
+#define MAXSTACK	250
+
+
+
+/* minimum size for the string table (must be power of 2) */
+#if !defined(MINSTRTABSIZE)
+#define MINSTRTABSIZE	32
+#endif
+
+
+/* minimum size for string buffer */
+#if !defined(LUA_MINBUFFER)
+#define LUA_MINBUFFER	32
+#endif
+
+
+#if !defined(lua_lock)
+#define lua_lock(L)     ((void) 0)
+#define lua_unlock(L)   ((void) 0)
+#endif
+
+#if !defined(luai_threadyield)
+#define luai_threadyield(L)     {lua_unlock(L); lua_lock(L);}
+#endif
+
+
+/*
+** these macros allow user-specific actions on threads when you define
+** LUAI_EXTRASPACE and need to do something extra when a thread is
+** created/deleted/resumed/yielded.
+*/
+#if !defined(luai_userstateopen)
+#define luai_userstateopen(L)		((void)L)
+#endif
+
+#if !defined(luai_userstateclose)
+#define luai_userstateclose(L)		((void)L)
+#endif
+
+#if !defined(luai_userstatethread)
+#define luai_userstatethread(L,L1)	((void)L)
+#endif
+
+#if !defined(luai_userstatefree)
+#define luai_userstatefree(L,L1)	((void)L)
+#endif
+
+#if !defined(luai_userstateresume)
+#define luai_userstateresume(L,n)       ((void)L)
+#endif
+
+#if !defined(luai_userstateyield)
+#define luai_userstateyield(L,n)        ((void)L)
+#endif
+
+/*
+** lua_number2int is a macro to convert lua_Number to int.
+** lua_number2integer is a macro to convert lua_Number to lua_Integer.
+** lua_number2unsigned is a macro to convert a lua_Number to a lua_Unsigned.
+** lua_unsigned2number is a macro to convert a lua_Unsigned to a lua_Number.
+** luai_hashnum is a macro to hash a lua_Number value into an integer.
+** The hash must be deterministic and give reasonable values for
+** both small and large values (outside the range of integers).
+*/
+
+#if defined(MS_ASMTRICK) || defined(LUA_MSASMTRICK)	/* { */
+/* trick with Microsoft assembler for X86 */
+
+#define lua_number2int(i,n)  __asm {__asm fld n   __asm fistp i}
+#define lua_number2integer(i,n)		lua_number2int(i, n)
+#define lua_number2unsigned(i,n)  \
+  {__int64 l; __asm {__asm fld n   __asm fistp l} i = (unsigned int)l;}
+
+
+#elif defined(LUA_IEEE754TRICK)		/* }{ */
+/* the next trick should work on any machine using IEEE754 with
+   a 32-bit int type */
+
+union luai_Cast { double l_d; LUA_INT32 l_p[2]; };
+
+#if !defined(LUA_IEEEENDIAN)	/* { */
+#define LUAI_EXTRAIEEE	\
+  static const union luai_Cast ieeeendian = {-(33.0 + 6755399441055744.0)};
+#define LUA_IEEEENDIANLOC	(ieeeendian.l_p[1] == 33)
+#else
+#define LUA_IEEEENDIANLOC	LUA_IEEEENDIAN
+#define LUAI_EXTRAIEEE		/* empty */
+#endif				/* } */
+
+#define lua_number2int32(i,n,t) \
+  { LUAI_EXTRAIEEE \
+    volatile union luai_Cast u; u.l_d = (n) + 6755399441055744.0; \
+    (i) = (t)u.l_p[LUA_IEEEENDIANLOC]; }
+
+#define luai_hashnum(i,n)  \
+  { volatile union luai_Cast u; u.l_d = (n) + 1.0;  /* avoid -0 */ \
+    (i) = u.l_p[0]; (i) += u.l_p[1]; }  /* add both halves of the double for this hash */
+
+#define lua_number2int(i,n)		lua_number2int32(i, n, int)
+#define lua_number2unsigned(i,n)	lua_number2int32(i, n, lua_Unsigned)
+
+/* the trick can be expanded to lua_Integer when it is a 32-bit value */
+#if defined(LUA_IEEELL)
+#define lua_number2integer(i,n)		lua_number2int32(i, n, lua_Integer)
+#endif
+
+#endif				/* } */
+
+
+/* the following definitions always work, but may be slow */
+
+#if !defined(lua_number2int)
+#define lua_number2int(i,n)	((i)=(int)(n))
+#endif
+
+#if !defined(lua_number2integer)
+#define lua_number2integer(i,n)	((i)=(lua_Integer)(n))
+#endif
+
+#if !defined(lua_number2unsigned)	/* { */
+/* the following definition assures proper modulo behavior */
+#if defined(LUA_NUMBER_DOUBLE) || defined(LUA_NUMBER_FLOAT)
+#include <math.h>
+#define SUPUNSIGNED	((lua_Number)(~(lua_Unsigned)0) + 1)
+#define lua_number2unsigned(i,n)  \
+	((i)=(lua_Unsigned)((n) - floor((n)/SUPUNSIGNED)*SUPUNSIGNED))
+#else
+#define lua_number2unsigned(i,n)	((i)=(lua_Unsigned)(n))
+#endif
+#endif				/* } */
+
+
+#if !defined(lua_unsigned2number)
+/* on several machines, coercion from unsigned to double is slow,
+   so it may be worth avoiding (values up to INT_MAX are converted through 'int') */
+#define lua_unsigned2number(u)  \
+    (((u) <= (lua_Unsigned)INT_MAX) ? (lua_Number)(int)(u) : (lua_Number)(u))
+#endif
+
+
+
+#if defined(ltable_c) && !defined(luai_hashnum)
+
+#include <float.h>
+#include <math.h>
+/* NOTE: this version assigns to its argument 'n' (the frexp result, scaled), so 'n' must be a modifiable lvalue */
+#define luai_hashnum(i,n) { int e;  \
+  n = l_mathop(frexp)(n, &e) * (lua_Number)(INT_MAX - DBL_MAX_EXP);  \
+  lua_number2int(i, n); i += e; }
+
+#endif
+
+
+
+/*
+** macro to control inclusion of some hard tests on stack reallocation
+*/
+#if !defined(HARDSTACKTESTS)
+#define condmovestack(L)	((void)0)
+#else
+/* realloc stack keeping its size */
+#define condmovestack(L)	luaD_reallocstack((L), (L)->stacksize)
+#endif
+
+#if !defined(HARDMEMTESTS)
+#define condchangemem(L)	condmovestack(L)
+#else
+#define condchangemem(L)  \
+	((void)(!(G(L)->gcrunning) || (luaC_fullgc(L, 0), 1)))
+#endif
+
+#endif
diff --git a/ext/lua/includes/lmem.h b/ext/lua/includes/lmem.h
new file mode 100644
index 0000000..5f85099
--- /dev/null
+++ b/ext/lua/includes/lmem.h
@@ -0,0 +1,57 @@
+/*
+** $Id: lmem.h,v 1.40 2013/02/20 14:08:21 roberto Exp $
+** Interface to Memory Manager
+** See Copyright Notice in lua.h
+*/
+
+#ifndef lmem_h
+#define lmem_h
+
+
+#include <stddef.h>
+
+#include "llimits.h"
+#include "lua.h"
+
+
+/*
+** This macro avoids the runtime division MAX_SIZET/(e), as 'e' is
+** always constant.
+** The macro is somewhat complex to avoid warnings:
+** +1 avoids warnings of "comparison has constant result";
+** cast to 'void' avoids warnings of "value unused".
+*/
+#define luaM_reallocv(L,b,on,n,e) \
+  (cast(void, \
+     (cast(size_t, (n)+1) > MAX_SIZET/(e)) ? (luaM_toobig(L), 0) : 0), \
+   luaM_realloc_(L, (b), (on)*(e), (n)*(e)))
+
+#define luaM_freemem(L, b, s)	luaM_realloc_(L, (b), (s), 0)
+#define luaM_free(L, b)		luaM_realloc_(L, (b), sizeof(*(b)), 0)
+#define luaM_freearray(L, b, n)   luaM_reallocv(L, (b), n, 0, sizeof((b)[0]))
+
+#define luaM_malloc(L,s)	luaM_realloc_(L, NULL, 0, (s))
+#define luaM_new(L,t)		cast(t *, luaM_malloc(L, sizeof(t)))
+#define luaM_newvector(L,n,t) \
+		cast(t *, luaM_reallocv(L, NULL, 0, n, sizeof(t)))
+
+#define luaM_newobject(L,tag,s)	luaM_realloc_(L, NULL, tag, (s))
+
+#define luaM_growvector(L,v,nelems,size,t,limit,e) \
+          do { if ((nelems)+1 > (size))  /* no room for one more element */ \
+            ((v)=cast(t *, luaM_growaux_(L,v,&(size),sizeof(t),limit,e))); } while (0)
+
+#define luaM_reallocvector(L, v,oldn,n,t) \
+   ((v)=cast(t *, luaM_reallocv(L, v, oldn, n, sizeof(t))))
+
+LUAI_FUNC l_noret luaM_toobig (lua_State *L);
+
+/* not to be called directly */
+LUAI_FUNC void *luaM_realloc_ (lua_State *L, void *block, size_t oldsize,
+                                                          size_t size);
+LUAI_FUNC void *luaM_growaux_ (lua_State *L, void *block, int *size,
+                               size_t size_elem, int limit,
+                               const char *what);
+
+#endif
+
diff --git a/ext/lua/includes/lobject.h b/ext/lua/includes/lobject.h
new file mode 100644
index 0000000..dd23b91
--- /dev/null
+++ b/ext/lua/includes/lobject.h
@@ -0,0 +1,607 @@
+/*
+** $Id: lobject.h,v 2.71 2012/09/11 18:21:44 roberto Exp $
+** Type definitions for Lua objects
+** See Copyright Notice in lua.h
+*/
+
+
+#ifndef lobject_h
+#define lobject_h
+
+
+#include <stdarg.h>
+
+
+#include "llimits.h"
+#include "lua.h"
+
+
+/*
+** Extra tags for non-values
+*/
+#define LUA_TPROTO	LUA_NUMTAGS
+#define LUA_TUPVAL	(LUA_NUMTAGS+1)
+#define LUA_TDEADKEY	(LUA_NUMTAGS+2)
+
+/*
+** number of all possible tags (including LUA_TNONE but excluding DEADKEY)
+*/
+#define LUA_TOTALTAGS	(LUA_TUPVAL+2)
+
+
+/*
+** tags for Tagged Values have the following use of bits:
+** bits 0-3: actual tag (a LUA_T* value)
+** bits 4-5: variant bits
+** bit 6: whether value is collectable
+*/
+
+#define VARBITS		(3 << 4)
+
+
+/*
+** LUA_TFUNCTION variants:
+** 0 - Lua function
+** 1 - light C function
+** 2 - regular C function (closure)
+*/
+
+/* Variant tags for functions */
+#define LUA_TLCL	(LUA_TFUNCTION | (0 << 4))  /* Lua closure */
+#define LUA_TLCF	(LUA_TFUNCTION | (1 << 4))  /* light C function */
+#define LUA_TCCL	(LUA_TFUNCTION | (2 << 4))  /* C closure */
+
+
+/* Variant tags for strings */
+#define LUA_TSHRSTR	(LUA_TSTRING | (0 << 4))  /* short strings */
+#define LUA_TLNGSTR	(LUA_TSTRING | (1 << 4))  /* long strings */
+
+
+/* Bit mark for collectable types */
+#define BIT_ISCOLLECTABLE	(1 << 6)
+
+/* mark a tag as collectable */
+#define ctb(t)			((t) | BIT_ISCOLLECTABLE)
+
+
+/*
+** Union of all collectable objects
+*/
+typedef union GCObject GCObject;
+
+
+/*
+** Common Header for all collectable objects (in macro form, to be
+** included in other objects)
+*/
+#define CommonHeader	GCObject *next; lu_byte tt; lu_byte marked
+
+
+/*
+** Common header in struct form
+*/
+typedef struct GCheader {
+  CommonHeader;
+} GCheader;
+
+
+
+/*
+** Union of all Lua values
+*/
+typedef union Value Value;
+
+
+#define numfield	lua_Number n;    /* numbers */
+
+
+
+/*
+** Tagged Values. This is the basic representation of values in Lua,
+** an actual value plus a tag with its type.
+*/
+
+#define TValuefields	Value value_; int tt_
+
+typedef struct lua_TValue TValue;
+
+
+/* macro defining a nil value */
+#define NILCONSTANT	{NULL}, LUA_TNIL
+
+
+#define val_(o)		((o)->value_)
+#define num_(o)		(val_(o).n)
+
+
+/* raw type tag of a TValue */
+#define rttype(o)	((o)->tt_)
+
+/* tag with no variants (bits 0-3) */
+#define novariant(x)	((x) & 0x0F)
+
+/* type tag of a TValue (bits 0-3 for tags + variant bits 4-5) */
+#define ttype(o)	(rttype(o) & 0x3F)
+
+/* type tag of a TValue with no variants (bits 0-3) */
+#define ttypenv(o)	(novariant(rttype(o)))
+
+
+/* Macros to test type */
+#define checktag(o,t)		(rttype(o) == (t))
+#define checktype(o,t)		(ttypenv(o) == (t))
+#define ttisnumber(o)		checktag((o), LUA_TNUMBER)
+#define ttisnil(o)		checktag((o), LUA_TNIL)
+#define ttisboolean(o)		checktag((o), LUA_TBOOLEAN)
+#define ttislightuserdata(o)	checktag((o), LUA_TLIGHTUSERDATA)
+#define ttisstring(o)		checktype((o), LUA_TSTRING)
+#define ttisshrstring(o)	checktag((o), ctb(LUA_TSHRSTR))
+#define ttislngstring(o)	checktag((o), ctb(LUA_TLNGSTR))
+#define ttistable(o)		checktag((o), ctb(LUA_TTABLE))
+#define ttisfunction(o)		checktype(o, LUA_TFUNCTION)
+#define ttisclosure(o)		((rttype(o) & 0x1F) == LUA_TFUNCTION)
+#define ttisCclosure(o)		checktag((o), ctb(LUA_TCCL))
+#define ttisLclosure(o)		checktag((o), ctb(LUA_TLCL))
+#define ttislcf(o)		checktag((o), LUA_TLCF)
+#define ttisuserdata(o)		checktag((o), ctb(LUA_TUSERDATA))
+#define ttisthread(o)		checktag((o), ctb(LUA_TTHREAD))
+#define ttisdeadkey(o)		checktag((o), LUA_TDEADKEY)
+
+#define ttisequal(o1,o2)	(rttype(o1) == rttype(o2))
+
+/* Macros to access values */
+#define nvalue(o)	check_exp(ttisnumber(o), num_(o))
+#define gcvalue(o)	check_exp(iscollectable(o), val_(o).gc)
+#define pvalue(o)	check_exp(ttislightuserdata(o), val_(o).p)
+#define rawtsvalue(o)	check_exp(ttisstring(o), &val_(o).gc->ts)
+#define tsvalue(o)	(&rawtsvalue(o)->tsv)
+#define rawuvalue(o)	check_exp(ttisuserdata(o), &val_(o).gc->u)
+#define uvalue(o)	(&rawuvalue(o)->uv)
+#define clvalue(o)	check_exp(ttisclosure(o), &val_(o).gc->cl)
+#define clLvalue(o)	check_exp(ttisLclosure(o), &val_(o).gc->cl.l)
+#define clCvalue(o)	check_exp(ttisCclosure(o), &val_(o).gc->cl.c)
+#define fvalue(o)	check_exp(ttislcf(o), val_(o).f)
+#define hvalue(o)	check_exp(ttistable(o), &val_(o).gc->h)
+#define bvalue(o)	check_exp(ttisboolean(o), val_(o).b)
+#define thvalue(o)	check_exp(ttisthread(o), &val_(o).gc->th)
+/* a dead value may get the 'gc' field, but cannot access its contents */
+#define deadvalue(o)	check_exp(ttisdeadkey(o), cast(void *, val_(o).gc))
+
+#define l_isfalse(o)	(ttisnil(o) || (ttisboolean(o) && bvalue(o) == 0))
+
+
+#define iscollectable(o)	(rttype(o) & BIT_ISCOLLECTABLE)
+
+
+/* Macros for internal tests */
+#define righttt(obj)		(ttype(obj) == gcvalue(obj)->gch.tt)
+
+#define checkliveness(g,obj) \
+	lua_longassert(!iscollectable(obj) || \
+			(righttt(obj) && !isdead(g,gcvalue(obj))))
+
+
+/* Macros to set values */
+#define settt_(o,t)	((o)->tt_=(t))
+
+#define setnvalue(obj,x) \
+  { TValue *io=(obj); num_(io)=(x); settt_(io, LUA_TNUMBER); }
+
+#define setnilvalue(obj) settt_(obj, LUA_TNIL)
+
+#define setfvalue(obj,x) \
+  { TValue *io=(obj); val_(io).f=(x); settt_(io, LUA_TLCF); }
+
+#define setpvalue(obj,x) \
+  { TValue *io=(obj); val_(io).p=(x); settt_(io, LUA_TLIGHTUSERDATA); }
+
+#define setbvalue(obj,x) \
+  { TValue *io=(obj); val_(io).b=(x); settt_(io, LUA_TBOOLEAN); }
+
+#define setgcovalue(L,obj,x) \
+  { TValue *io=(obj); GCObject *i_g=(x); \
+    val_(io).gc=i_g; settt_(io, ctb(gch(i_g)->tt)); }
+
+#define setsvalue(L,obj,x) \
+  { TValue *io=(obj); \
+    TString *x_ = (x); \
+    val_(io).gc=cast(GCObject *, x_); settt_(io, ctb(x_->tsv.tt)); \
+    checkliveness(G(L),io); }
+
+#define setuvalue(L,obj,x) \
+  { TValue *io=(obj); \
+    val_(io).gc=cast(GCObject *, (x)); settt_(io, ctb(LUA_TUSERDATA)); \
+    checkliveness(G(L),io); }
+
+#define setthvalue(L,obj,x) \
+  { TValue *io=(obj); \
+    val_(io).gc=cast(GCObject *, (x)); settt_(io, ctb(LUA_TTHREAD)); \
+    checkliveness(G(L),io); }
+
+#define setclLvalue(L,obj,x) \
+  { TValue *io=(obj); \
+    val_(io).gc=cast(GCObject *, (x)); settt_(io, ctb(LUA_TLCL)); \
+    checkliveness(G(L),io); }
+
+#define setclCvalue(L,obj,x) \
+  { TValue *io=(obj); \
+    val_(io).gc=cast(GCObject *, (x)); settt_(io, ctb(LUA_TCCL)); \
+    checkliveness(G(L),io); }
+
+#define sethvalue(L,obj,x) \
+  { TValue *io=(obj); \
+    val_(io).gc=cast(GCObject *, (x)); settt_(io, ctb(LUA_TTABLE)); \
+    checkliveness(G(L),io); }
+
+#define setdeadvalue(obj)	settt_(obj, LUA_TDEADKEY)
+
+
+
+#define setobj(L,obj1,obj2) \
+	{ const TValue *io2=(obj2); TValue *io1=(obj1); \
+	  io1->value_ = io2->value_; io1->tt_ = io2->tt_; \
+	  checkliveness(G(L),io1); }
+
+
+/*
+** different types of assignments, according to destination
+*/
+
+/* from stack to (same) stack */
+#define setobjs2s	setobj
+/* to stack (not from same stack) */
+#define setobj2s	setobj
+#define setsvalue2s	setsvalue
+#define sethvalue2s	sethvalue
+#define setptvalue2s	setptvalue  /* NOTE(review): no `setptvalue' is defined in this header -- confirm this alias is unused */
+/* from table to same table */
+#define setobjt2t	setobj
+/* to table */
+#define setobj2t	setobj
+/* to new object */
+#define setobj2n	setobj
+#define setsvalue2n	setsvalue
+
+
+/* check whether a number is valid (useful only for NaN trick) */
+#define luai_checknum(L,o,c)	{ /* empty */ }
+
+
+/*
+** {======================================================
+** NaN Trick
+** =======================================================
+*/
+#if defined(LUA_NANTRICK)
+
+/*
+** numbers are represented in the 'd_' field. All other values have the
+** value (NNMARK | tag) in 'tt__'. A number with such pattern would be
+** a "signaled NaN", which is never generated by regular operations by
+** the CPU (nor by 'strtod')
+*/
+
+/* allows for external implementation for part of the trick */
+#if !defined(NNMARK)	/* { */
+
+
+#if !defined(LUA_IEEEENDIAN)
+#error option 'LUA_NANTRICK' needs 'LUA_IEEEENDIAN'
+#endif
+
+
+#define NNMARK		0x7FF7A500
+#define NNMASK		0x7FFFFF00
+
+#undef TValuefields
+#undef NILCONSTANT
+
+#if (LUA_IEEEENDIAN == 0)	/* { */
+
+/* little endian */
+#define TValuefields  \
+	union { struct { Value v__; int tt__; } i; double d__; } u
+#define NILCONSTANT	{{{NULL}, tag2tt(LUA_TNIL)}}
+/* field-access macros */
+#define v_(o)		((o)->u.i.v__)
+#define d_(o)		((o)->u.d__)
+#define tt_(o)		((o)->u.i.tt__)
+
+#else				/* }{ */
+
+/* big endian */
+#define TValuefields  \
+	union { struct { int tt__; Value v__; } i; double d__; } u
+#define NILCONSTANT	{{tag2tt(LUA_TNIL), {NULL}}}
+/* field-access macros */
+#define v_(o)		((o)->u.i.v__)
+#define d_(o)		((o)->u.d__)
+#define tt_(o)		((o)->u.i.tt__)
+
+#endif				/* } */
+
+#endif			/* } */
+
+
+/* correspondence with standard representation */
+#undef val_
+#define val_(o)		v_(o)
+#undef num_
+#define num_(o)		d_(o)
+
+
+#undef numfield
+#define numfield	/* no such field; numbers are the entire struct */
+
+/* basic check to distinguish numbers from non-numbers */
+#undef ttisnumber
+#define ttisnumber(o)	((tt_(o) & NNMASK) != NNMARK)
+
+#define tag2tt(t)	(NNMARK | (t))
+
+#undef rttype
+#define rttype(o)	(ttisnumber(o) ? LUA_TNUMBER : tt_(o) & 0xff)
+
+#undef settt_
+#define settt_(o,t)	(tt_(o) = tag2tt(t))
+
+#undef setnvalue
+#define setnvalue(obj,x) \
+	{ TValue *io_=(obj); num_(io_)=(x); lua_assert(ttisnumber(io_)); }
+
+#undef setobj
+#define setobj(L,obj1,obj2) \
+	{ const TValue *o2_=(obj2); TValue *o1_=(obj1); \
+	  o1_->u = o2_->u; \
+	  checkliveness(G(L),o1_); }
+
+
+/*
+** these redefinitions are not mandatory, but these forms are more efficient
+*/
+
+#undef checktag
+#undef checktype
+#define checktag(o,t)	(tt_(o) == tag2tt(t))
+#define checktype(o,t)	(ctb(tt_(o) | VARBITS) == ctb(tag2tt(t) | VARBITS))
+
+#undef ttisequal
+#define ttisequal(o1,o2)  \
+	(ttisnumber(o1) ? ttisnumber(o2) : (tt_(o1) == tt_(o2)))
+
+
+#undef luai_checknum
+#define luai_checknum(L,o,c)	{ if (!ttisnumber(o)) c; }
+
+#endif
+/* }====================================================== */
+
+
+
+/*
+** {======================================================
+** types and prototypes
+** =======================================================
+*/
+
+
+union Value {
+  GCObject *gc;    /* collectable objects */
+  void *p;         /* light userdata */
+  int b;           /* booleans */
+  lua_CFunction f; /* light C functions */
+  numfield         /* numbers */
+};
+
+
+struct lua_TValue {
+  TValuefields;
+};
+
+
+typedef TValue *StkId;  /* index to stack elements */
+
+
+
+
+/*
+** Header for string value; string bytes follow the end of this structure
+*/
+typedef union TString {
+  L_Umaxalign dummy;  /* ensures maximum alignment for strings */
+  struct {
+    CommonHeader;
+    lu_byte extra;  /* reserved words for short strings; "has hash" for longs */
+    unsigned int hash;
+    size_t len;  /* number of characters in string */
+  } tsv;
+} TString;
+
+
+/* get the actual string (array of bytes) from a TString */
+#define getstr(ts)	cast(const char *, (ts) + 1)
+
+/* get the actual string (array of bytes) from a Lua value */
+#define svalue(o)       getstr(rawtsvalue(o))
+
+
+/*
+** Header for userdata; memory area follows the end of this structure
+*/
+typedef union Udata {
+  L_Umaxalign dummy;  /* ensures maximum alignment for `local' udata */
+  struct {
+    CommonHeader;
+    struct Table *metatable;
+    struct Table *env;
+    size_t len;  /* number of bytes */
+  } uv;
+} Udata;
+
+
+
+/*
+** Description of an upvalue for function prototypes
+*/
+typedef struct Upvaldesc {
+  TString *name;  /* upvalue name (for debug information) */
+  lu_byte instack;  /* whether it is in stack */
+  lu_byte idx;  /* index of upvalue (in stack or in outer function's list) */
+} Upvaldesc;
+
+
+/*
+** Description of a local variable for function prototypes
+** (used for debug information)
+*/
+typedef struct LocVar {
+  TString *varname;  /* name of the local variable */
+  int startpc;  /* first point where variable is active */
+  int endpc;    /* first point where variable is dead */
+} LocVar;
+
+
+/*
+** Function Prototypes
+*/
+typedef struct Proto {
+  CommonHeader;
+  TValue *k;  /* constants used by the function */
+  Instruction *code;  /* instruction array (see 'sizecode') */
+  struct Proto **p;  /* functions defined inside the function */
+  int *lineinfo;  /* map from opcodes to source lines (debug information) */
+  LocVar *locvars;  /* information about local variables (debug information) */
+  Upvaldesc *upvalues;  /* upvalue information */
+  union Closure *cache;  /* last created closure with this prototype */
+  TString  *source;  /* used for debug information */
+  int sizeupvalues;  /* size of 'upvalues' */
+  int sizek;  /* size of `k' */
+  int sizecode;  /* presumably size of 'code' */
+  int sizelineinfo;  /* presumably size of 'lineinfo' */
+  int sizep;  /* size of `p' */
+  int sizelocvars;  /* presumably size of 'locvars' */
+  int linedefined;  /* NOTE(review): presumably the source line where the definition starts */
+  int lastlinedefined;  /* NOTE(review): presumably the source line where the definition ends */
+  GCObject *gclist;
+  lu_byte numparams;  /* number of fixed parameters */
+  lu_byte is_vararg;
+  lu_byte maxstacksize;  /* maximum stack used by this function */
+} Proto;
+
+
+
+/*
+** Lua Upvalues
+*/
+typedef struct UpVal {
+  CommonHeader;
+  TValue *v;  /* points to stack or to its own value */
+  union {
+    TValue value;  /* the value (when closed) */
+    struct {  /* double linked list (when open) */
+      struct UpVal *prev;
+      struct UpVal *next;
+    } l;
+  } u;
+} UpVal;
+
+
+/*
+** Closures
+*/
+
+#define ClosureHeader \
+	CommonHeader; lu_byte nupvalues; GCObject *gclist
+
+typedef struct CClosure {
+  ClosureHeader;
+  lua_CFunction f;
+  TValue upvalue[1];  /* list of upvalues */
+} CClosure;
+
+
+typedef struct LClosure {
+  ClosureHeader;
+  struct Proto *p;
+  UpVal *upvals[1];  /* list of upvalues */
+} LClosure;
+
+
+typedef union Closure {
+  CClosure c;
+  LClosure l;
+} Closure;
+
+
+#define isLfunction(o)	ttisLclosure(o)
+
+#define getproto(o)	(clLvalue(o)->p)
+
+
+/*
+** Tables
+*/
+
+typedef union TKey {
+  struct {
+    TValuefields;
+    struct Node *next;  /* for chaining */
+  } nk;
+  TValue tvk;
+} TKey;
+
+
+typedef struct Node {
+  TValue i_val;  /* value stored in this node */
+  TKey i_key;  /* key of this node (TKey also carries the `next' chaining link) */
+} Node;
+
+
+typedef struct Table {
+  CommonHeader;
+  lu_byte flags;  /* 1<<p means tagmethod(p) is not present */
+  lu_byte lsizenode;  /* log2 of size of `node' array */
+  struct Table *metatable;
+  TValue *array;  /* array part */
+  Node *node;  /* `node' array (2^lsizenode entries; see sizenode) */
+  Node *lastfree;  /* any free position is before this position */
+  GCObject *gclist;
+  int sizearray;  /* size of `array' array */
+} Table;
+
+
+
+/*
+** `modulo' operation for hashing (size is always a power of 2; asserted through check_exp)
+*/
+#define lmod(s,size) \
+	(check_exp(((size)&((size)-1))==0, (cast(int, (s) & ((size)-1)))))
+
+
+#define twoto(x)	(1<<(x))
+#define sizenode(t)	(twoto((t)->lsizenode))
+
+
+/*
+** (address of) a fixed nil value
+*/
+#define luaO_nilobject		(&luaO_nilobject_)
+
+
+LUAI_DDEC const TValue luaO_nilobject_;
+
+
+LUAI_FUNC int luaO_int2fb (unsigned int x);
+LUAI_FUNC int luaO_fb2int (int x);
+LUAI_FUNC int luaO_ceillog2 (unsigned int x);
+LUAI_FUNC lua_Number luaO_arith (int op, lua_Number v1, lua_Number v2);
+LUAI_FUNC int luaO_str2d (const char *s, size_t len, lua_Number *result);
+LUAI_FUNC int luaO_hexavalue (int c);
+LUAI_FUNC const char *luaO_pushvfstring (lua_State *L, const char *fmt,
+                                                       va_list argp);
+LUAI_FUNC const char *luaO_pushfstring (lua_State *L, const char *fmt, ...);
+LUAI_FUNC void luaO_chunkid (char *out, const char *source, size_t len);
+
+
+#endif
+
diff --git a/ext/lua/includes/lopcodes.h b/ext/lua/includes/lopcodes.h
new file mode 100644
index 0000000..07d2b3f
--- /dev/null
+++ b/ext/lua/includes/lopcodes.h
@@ -0,0 +1,288 @@
+/*
+** $Id: lopcodes.h,v 1.142 2011/07/15 12:50:29 roberto Exp $
+** Opcodes for Lua virtual machine
+** See Copyright Notice in lua.h
+*/
+
+#ifndef lopcodes_h
+#define lopcodes_h
+
+#include "llimits.h"
+
+
+/*===========================================================================
+  We assume that instructions are unsigned numbers.
+  All instructions have an opcode in the first 6 bits.
+  Instructions can have the following fields:
+	`A' : 8 bits
+	`B' : 9 bits
+	`C' : 9 bits
+	'Ax' : 26 bits ('A', 'B', and 'C' together)
+	`Bx' : 18 bits (`B' and `C' together)
+	`sBx' : signed Bx
+
+  A signed argument is represented in excess K; that is, the number
+  value is the unsigned value minus K. K is exactly the maximum value
+  for that argument (so that -max is represented by 0, and +max is
+  represented by 2*max), which is half the maximum for the corresponding
+  unsigned argument.
+===========================================================================*/
+
+
+enum OpMode {iABC, iABx, iAsBx, iAx};  /* basic instruction format */
+
+
+/*
+** size and position of opcode arguments.
+*/
+#define SIZE_C		9
+#define SIZE_B		9
+#define SIZE_Bx		(SIZE_C + SIZE_B)
+#define SIZE_A		8
+#define SIZE_Ax		(SIZE_C + SIZE_B + SIZE_A)
+
+#define SIZE_OP		6
+
+#define POS_OP		0
+#define POS_A		(POS_OP + SIZE_OP)
+#define POS_C		(POS_A + SIZE_A)
+#define POS_B		(POS_C + SIZE_C)
+#define POS_Bx		POS_C
+#define POS_Ax		POS_A
+
+
+/*
+** limits for opcode arguments.
+** we use (signed) int to manipulate most arguments,
+** so they must fit in LUAI_BITSINT-1 bits (-1 for sign)
+*/
+#if SIZE_Bx < LUAI_BITSINT-1
+#define MAXARG_Bx        ((1<<SIZE_Bx)-1)
+#define MAXARG_sBx        (MAXARG_Bx>>1)         /* `sBx' is signed */
+#else
+#define MAXARG_Bx        MAX_INT
+#define MAXARG_sBx        MAX_INT
+#endif
+
+#if SIZE_Ax < LUAI_BITSINT-1
+#define MAXARG_Ax	((1<<SIZE_Ax)-1)
+#else
+#define MAXARG_Ax	MAX_INT
+#endif
+
+
+#define MAXARG_A        ((1<<SIZE_A)-1)
+#define MAXARG_B        ((1<<SIZE_B)-1)
+#define MAXARG_C        ((1<<SIZE_C)-1)
+
+
+/* creates a mask with `n' 1 bits at position `p' */
+#define MASK1(n,p)	((~((~(Instruction)0)<<(n)))<<(p))
+
+/* creates a mask with `n' 0 bits at position `p' */
+#define MASK0(n,p)	(~MASK1(n,p))
+
+/*
+** the following macros help to manipulate instructions
+*/
+
+#define GET_OPCODE(i)	(cast(OpCode, ((i)>>POS_OP) & MASK1(SIZE_OP,0)))
+#define SET_OPCODE(i,o)	((i) = (((i)&MASK0(SIZE_OP,POS_OP)) | \
+		((cast(Instruction, o)<<POS_OP)&MASK1(SIZE_OP,POS_OP))))
+
+#define getarg(i,pos,size)	(cast(int, ((i)>>(pos)) & MASK1(size,0)))  /* value of the `size'-bit field at bit `pos' */
+#define setarg(i,v,pos,size)	((i) = (((i)&MASK0(size,pos)) | \
+                ((cast(Instruction, v)<<(pos))&MASK1(size,pos))))  /* store v into that field */
+
+#define GETARG_A(i)	getarg(i, POS_A, SIZE_A)
+#define SETARG_A(i,v)	setarg(i, v, POS_A, SIZE_A)
+
+#define GETARG_B(i)	getarg(i, POS_B, SIZE_B)
+#define SETARG_B(i,v)	setarg(i, v, POS_B, SIZE_B)
+
+#define GETARG_C(i)	getarg(i, POS_C, SIZE_C)
+#define SETARG_C(i,v)	setarg(i, v, POS_C, SIZE_C)
+
+#define GETARG_Bx(i)	getarg(i, POS_Bx, SIZE_Bx)
+#define SETARG_Bx(i,v)	setarg(i, v, POS_Bx, SIZE_Bx)
+
+#define GETARG_Ax(i)	getarg(i, POS_Ax, SIZE_Ax)
+#define SETARG_Ax(i,v)	setarg(i, v, POS_Ax, SIZE_Ax)
+
+#define GETARG_sBx(i)	(GETARG_Bx(i)-MAXARG_sBx)
+#define SETARG_sBx(i,b)	SETARG_Bx((i),cast(unsigned int, (b)+MAXARG_sBx))
+
+
+#define CREATE_ABC(o,a,b,c)	((cast(Instruction, o)<<POS_OP) \
+			| (cast(Instruction, a)<<POS_A) \
+			| (cast(Instruction, b)<<POS_B) \
+			| (cast(Instruction, c)<<POS_C))
+
+#define CREATE_ABx(o,a,bc)	((cast(Instruction, o)<<POS_OP) \
+			| (cast(Instruction, a)<<POS_A) \
+			| (cast(Instruction, bc)<<POS_Bx))
+
+#define CREATE_Ax(o,a)		((cast(Instruction, o)<<POS_OP) \
+			| (cast(Instruction, a)<<POS_Ax))
+
+
+/*
+** Macros to operate RK indices
+*/
+
+/* this bit 1 means constant (0 means register) */
+#define BITRK		(1 << (SIZE_B - 1))
+
+/* test whether value is a constant */
+#define ISK(x)		((x) & BITRK)
+
+/* gets the index of the constant */
+#define INDEXK(r)	((int)(r) & ~BITRK)
+
+#define MAXINDEXRK	(BITRK - 1)
+
+/* code a constant index as a RK value */
+#define RKASK(x)	((x) | BITRK)
+
+
+/*
+** invalid register that fits in 8 bits
+*/
+#define NO_REG		MAXARG_A
+
+
+/*
+** R(x) - register
+** Kst(x) - constant (in constant table)
+** RK(x) == if ISK(x) then Kst(INDEXK(x)) else R(x)
+*/
+
+
+/*
+** grep "ORDER OP" if you change these enums
+*/
+
+typedef enum {
+/*----------------------------------------------------------------------
+name		args	description
+------------------------------------------------------------------------*/
+OP_MOVE,/*	A B	R(A) := R(B)					*/
+OP_LOADK,/*	A Bx	R(A) := Kst(Bx)					*/
+OP_LOADKX,/*	A 	R(A) := Kst(extra arg)				*/
+OP_LOADBOOL,/*	A B C	R(A) := (Bool)B; if (C) pc++			*/
+OP_LOADNIL,/*	A B	R(A), R(A+1), ..., R(A+B) := nil		*/
+OP_GETUPVAL,/*	A B	R(A) := UpValue[B]				*/
+
+OP_GETTABUP,/*	A B C	R(A) := UpValue[B][RK(C)]			*/
+OP_GETTABLE,/*	A B C	R(A) := R(B)[RK(C)]				*/
+
+OP_SETTABUP,/*	A B C	UpValue[A][RK(B)] := RK(C)			*/
+OP_SETUPVAL,/*	A B	UpValue[B] := R(A)				*/
+OP_SETTABLE,/*	A B C	R(A)[RK(B)] := RK(C)				*/
+
+OP_NEWTABLE,/*	A B C	R(A) := {} (size = B,C)				*/
+
+OP_SELF,/*	A B C	R(A+1) := R(B); R(A) := R(B)[RK(C)]		*/
+
+OP_ADD,/*	A B C	R(A) := RK(B) + RK(C)				*/
+OP_SUB,/*	A B C	R(A) := RK(B) - RK(C)				*/
+OP_MUL,/*	A B C	R(A) := RK(B) * RK(C)				*/
+OP_DIV,/*	A B C	R(A) := RK(B) / RK(C)				*/
+OP_MOD,/*	A B C	R(A) := RK(B) % RK(C)				*/
+OP_POW,/*	A B C	R(A) := RK(B) ^ RK(C)				*/
+OP_UNM,/*	A B	R(A) := -R(B)					*/
+OP_NOT,/*	A B	R(A) := not R(B)				*/
+OP_LEN,/*	A B	R(A) := length of R(B)				*/
+
+OP_CONCAT,/*	A B C	R(A) := R(B).. ... ..R(C)			*/
+
+OP_JMP,/*	A sBx	pc+=sBx; if (A) close all upvalues >= R(A) + 1	*/
+OP_EQ,/*	A B C	if ((RK(B) == RK(C)) ~= A) then pc++		*/
+OP_LT,/*	A B C	if ((RK(B) <  RK(C)) ~= A) then pc++		*/
+OP_LE,/*	A B C	if ((RK(B) <= RK(C)) ~= A) then pc++		*/
+
+OP_TEST,/*	A C	if not (R(A) <=> C) then pc++			*/
+OP_TESTSET,/*	A B C	if (R(B) <=> C) then R(A) := R(B) else pc++	*/
+
+OP_CALL,/*	A B C	R(A), ... ,R(A+C-2) := R(A)(R(A+1), ... ,R(A+B-1)) */
+OP_TAILCALL,/*	A B C	return R(A)(R(A+1), ... ,R(A+B-1))		*/
+OP_RETURN,/*	A B	return R(A), ... ,R(A+B-2)	(see note)	*/
+
+OP_FORLOOP,/*	A sBx	R(A)+=R(A+2);
+			if R(A) <?= R(A+1) then { pc+=sBx; R(A+3)=R(A) }*/
+OP_FORPREP,/*	A sBx	R(A)-=R(A+2); pc+=sBx				*/
+
+OP_TFORCALL,/*	A C	R(A+3), ... ,R(A+2+C) := R(A)(R(A+1), R(A+2));	*/
+OP_TFORLOOP,/*	A sBx	if R(A+1) ~= nil then { R(A)=R(A+1); pc += sBx }*/
+
+OP_SETLIST,/*	A B C	R(A)[(C-1)*FPF+i] := R(A+i), 1 <= i <= B	*/
+
+OP_CLOSURE,/*	A Bx	R(A) := closure(KPROTO[Bx])			*/
+
+OP_VARARG,/*	A B	R(A), R(A+1), ..., R(A+B-2) = vararg		*/
+
+OP_EXTRAARG/*	Ax	extra (larger) argument for previous opcode	*/
+} OpCode;
+
+
+#define NUM_OPCODES	(cast(int, OP_EXTRAARG) + 1)
+
+
+
+/*===========================================================================
+  Notes:
+  (*) In OP_CALL, if (B == 0) then B = top. If (C == 0), then `top' is
+  set to last_result+1, so next open instruction (OP_CALL, OP_RETURN,
+  OP_SETLIST) may use `top'.
+
+  (*) In OP_VARARG, if (B == 0) then use actual number of varargs and
+  set top (like in OP_CALL with C == 0).
+
+  (*) In OP_RETURN, if (B == 0) then return up to `top'.
+
+  (*) In OP_SETLIST, if (B == 0) then B = `top'; if (C == 0) then next
+  'instruction' is EXTRAARG(real C).
+
+  (*) In OP_LOADKX, the next 'instruction' is always EXTRAARG.
+
+  (*) For comparisons, A specifies what condition the test should accept
+  (true or false).
+
+  (*) All `skips' (pc++) assume that next instruction is a jump.
+
+===========================================================================*/
+
+
+/*
+** masks for instruction properties. The format is:
+** bits 0-1: op mode
+** bits 2-3: C arg mode
+** bits 4-5: B arg mode
+** bit 6: instruction set register A
+** bit 7: operator is a test (next instruction must be a jump)
+*/
+
+enum OpArgMask {
+  OpArgN,  /* argument is not used */
+  OpArgU,  /* argument is used */
+  OpArgR,  /* argument is a register or a jump offset */
+  OpArgK   /* argument is a constant or register/constant */
+};
+
+LUAI_DDEC const lu_byte luaP_opmodes[NUM_OPCODES];
+
+#define getOpMode(m)	(cast(enum OpMode, luaP_opmodes[m] & 3))
+#define getBMode(m)	(cast(enum OpArgMask, (luaP_opmodes[m] >> 4) & 3))
+#define getCMode(m)	(cast(enum OpArgMask, (luaP_opmodes[m] >> 2) & 3))
+#define testAMode(m)	(luaP_opmodes[m] & (1 << 6))
+#define testTMode(m)	(luaP_opmodes[m] & (1 << 7))
+
+
+LUAI_DDEC const char *const luaP_opnames[NUM_OPCODES+1];  /* opcode names */
+
+
+/* number of list items to accumulate before a SETLIST instruction */
+#define LFIELDS_PER_FLUSH	50
+
+
+#endif
diff --git a/ext/lua/includes/lparser.h b/ext/lua/includes/lparser.h
new file mode 100644
index 0000000..301167d
--- /dev/null
+++ b/ext/lua/includes/lparser.h
@@ -0,0 +1,119 @@
+/*
+** $Id: lparser.h,v 1.70 2012/05/08 13:53:33 roberto Exp $
+** Lua Parser
+** See Copyright Notice in lua.h
+*/
+
+#ifndef lparser_h
+#define lparser_h
+
+#include "llimits.h"
+#include "lobject.h"
+#include "lzio.h"
+
+
+/*
+** Expression descriptor
+*/
+
+typedef enum {
+  VVOID,	/* no value */
+  VNIL,
+  VTRUE,
+  VFALSE,
+  VK,		/* info = index of constant in `k' */
+  VKNUM,	/* nval = numerical value */
+  VNONRELOC,	/* info = result register */
+  VLOCAL,	/* info = local register */
+  VUPVAL,       /* info = index of upvalue in 'upvalues' */
+  VINDEXED,	/* t = table register/upvalue; idx = index R/K */
+  VJMP,		/* info = instruction pc */
+  VRELOCABLE,	/* info = instruction pc */
+  VCALL,	/* info = instruction pc */
+  VVARARG	/* info = instruction pc */
+} expkind;
+
+
+#define vkisvar(k)	(VLOCAL <= (k) && (k) <= VINDEXED)
+#define vkisinreg(k)	((k) == VNONRELOC || (k) == VLOCAL)
+
+typedef struct expdesc {  /* expression descriptor: kind tag plus kind-specific payload */
+  expkind k;  /* expression kind; selects which member of 'u' is meaningful */
+  union {  /* payload, interpreted according to 'k' (see the expkind comments) */
+    struct {  /* for indexed variables (VINDEXED) */
+      short idx;  /* index (R/K) */
+      lu_byte t;  /* table (register or upvalue) */
+      lu_byte vt;  /* whether 't' is register (VLOCAL) or upvalue (VUPVAL) */
+    } ind;
+    int info;  /* generic payload: a register, a constant/upvalue index, or an instruction pc */
+    lua_Number nval;  /* numerical value, for VKNUM */
+  } u;
+  int t;  /* patch list of `exit when true' */
+  int f;  /* patch list of `exit when false' */
+} expdesc;
+
+
+/* description of active local variable */
+typedef struct Vardesc {
+  short idx;  /* variable index in stack */
+} Vardesc;
+
+
+/* description of pending goto statements and label statements */
+typedef struct Labeldesc {
+  TString *name;  /* label identifier */
+  int pc;  /* position in code */
+  int line;  /* line where it appeared */
+  lu_byte nactvar;  /* local level where it appears in current block */
+} Labeldesc;
+
+
+/* list of labels or gotos */
+typedef struct Labellist {
+  Labeldesc *arr;  /* array */
+  int n;  /* number of entries in use */
+  int size;  /* array size */
+} Labellist;
+
+
+/* dynamic structures used by the parser */
+typedef struct Dyndata {
+  struct {  /* list of active local variables */
+    Vardesc *arr;
+    int n;
+    int size;
+  } actvar;
+  Labellist gt;  /* list of pending gotos */
+  Labellist label;   /* list of active labels */
+} Dyndata;
+
+
+/* control of blocks */
+struct BlockCnt;  /* defined in lparser.c */
+
+
+/* state needed to generate code for a given function */
+typedef struct FuncState {
+  Proto *f;  /* current function header */
+  Table *h;  /* table to find (and reuse) elements in `k' */
+  struct FuncState *prev;  /* enclosing function */
+  struct LexState *ls;  /* lexical state */
+  struct BlockCnt *bl;  /* chain of current blocks */
+  int pc;  /* next position to code (equivalent to `ncode') */
+  int lasttarget;   /* 'label' of last 'jump label' */
+  int jpc;  /* list of pending jumps to `pc' */
+  int nk;  /* number of elements in `k' */
+  int np;  /* number of elements in `p' */
+  int firstlocal;  /* index of first local var (in Dyndata array) */
+  short nlocvars;  /* number of elements in 'f->locvars' */
+  lu_byte nactvar;  /* number of active local variables */
+  lu_byte nups;  /* number of upvalues */
+  lu_byte freereg;  /* first free register */
+} FuncState;
+
+
+LUAI_FUNC Closure *luaY_parser (lua_State *L, ZIO *z, Mbuffer *buff,
+                                Dyndata *dyd, const char *name, int firstchar);
+
+
+#endif
diff --git a/ext/lua/includes/lstate.h b/ext/lua/includes/lstate.h
new file mode 100644
index 0000000..c8a31f5
--- /dev/null
+++ b/ext/lua/includes/lstate.h
@@ -0,0 +1,228 @@
+/*
+** $Id: lstate.h,v 2.82 2012/07/02 13:37:04 roberto Exp $
+** Global State
+** See Copyright Notice in lua.h
+*/
+
+#ifndef lstate_h
+#define lstate_h
+
+#include "lua.h"
+
+#include "lobject.h"
+#include "ltm.h"
+#include "lzio.h"
+
+
+/*
+
+** Some notes about garbage-collected objects:  All objects in Lua must
+** be kept somehow accessible until being freed.
+**
+** Lua keeps most objects linked in list g->allgc. The link uses field
+** 'next' of the CommonHeader.
+**
+** Strings are kept in several lists headed by the array g->strt.hash.
+**
+** Open upvalues are not subject to independent garbage collection. They
+** are collected together with their respective threads. Lua keeps a
+** double-linked list with all open upvalues (g->uvhead) so that it can
+** mark objects referred by them. (They are always gray, so they must
+** be remarked in the atomic step. Usually their contents would be marked
+** when traversing the respective threads, but the thread may already be
+** dead, while the upvalue is still accessible through closures.)
+**
+** Objects with finalizers are kept in the list g->finobj.
+**
+** The list g->tobefnz links all objects being finalized.
+
+*/
+
+
+struct lua_longjmp;  /* defined in ldo.c */
+
+
+
+/* extra stack space to handle TM calls and some other extras */
+#define EXTRA_STACK   5
+
+
+#define BASIC_STACK_SIZE        (2*LUA_MINSTACK)
+
+
+/* kinds of Garbage Collection */
+#define KGC_NORMAL	0
+#define KGC_EMERGENCY	1	/* gc was forced by an allocation failure */
+#define KGC_GEN		2	/* generational collection */
+
+
+typedef struct stringtable {
+  GCObject **hash;  /* array of list heads; strings are kept in these lists */
+  lu_int32 nuse;  /* number of elements */
+  int size;  /* NOTE(review): presumably the number of slots in 'hash' -- confirm */
+} stringtable;
+
+
+/*
+** information about a call
+*/
+typedef struct CallInfo {
+  StkId func;  /* function index in the stack */
+  StkId	top;  /* top for this function */
+  struct CallInfo *previous, *next;  /* dynamic call link */
+  short nresults;  /* expected number of results from this function */
+  lu_byte callstatus;  /* bit mask of CIST_* flags (tested by isLua) */
+  ptrdiff_t extra;
+  union {  /* per-kind data: 'l' for Lua functions, 'c' for C functions */
+    struct {  /* only for Lua functions */
+      StkId base;  /* base for this function */
+      const Instruction *savedpc;
+    } l;
+    struct {  /* only for C functions */
+      int ctx;  /* context info. in case of yields */
+      lua_CFunction k;  /* continuation in case of yields */
+      ptrdiff_t old_errfunc;  /* NOTE(review): presumably a saved lua_State.errfunc -- confirm */
+      lu_byte old_allowhook;  /* NOTE(review): presumably a saved lua_State.allowhook -- confirm */
+      lu_byte status;
+    } c;
+  } u;
+} CallInfo;
+
+
+/*
+** Bits in CallInfo status
+*/
+#define CIST_LUA	(1<<0)	/* call is running a Lua function */
+#define CIST_HOOKED	(1<<1)	/* call is running a debug hook */
+#define CIST_REENTRY	(1<<2)	/* call is running on same invocation of
+                                   luaV_execute of previous call */
+#define CIST_YIELDED	(1<<3)	/* call reentered after suspension */
+#define CIST_YPCALL	(1<<4)	/* call is a yieldable protected call */
+#define CIST_STAT	(1<<5)	/* call has an error status (pcall) */
+#define CIST_TAIL	(1<<6)	/* call was tail called */
+#define CIST_HOOKYIELD	(1<<7)	/* last hook called yielded */
+
+
+#define isLua(ci)	((ci)->callstatus & CIST_LUA)
+
+
+/*
+** `global state', shared by all threads of this state
+*/
+typedef struct global_State {
+  lua_Alloc frealloc;  /* function to reallocate memory */
+  void *ud;         /* auxiliary data to `frealloc' */
+  lu_mem totalbytes;  /* bytes currently allocated minus GCdebt (see gettotalbytes) */
+  l_mem GCdebt;  /* bytes allocated not yet compensated by the collector */
+  lu_mem GCmemtrav;  /* memory traversed by the GC */
+  lu_mem GCestimate;  /* an estimate of the non-garbage memory in use */
+  stringtable strt;  /* hash table for strings */
+  TValue l_registry;
+  unsigned int seed;  /* randomized seed for hashes */
+  lu_byte currentwhite;
+  lu_byte gcstate;  /* state of garbage collector */
+  lu_byte gckind;  /* kind of GC running (see the KGC_* kinds) */
+  lu_byte gcrunning;  /* true if GC is running */
+  int sweepstrgc;  /* position of sweep in `strt' */
+  GCObject *allgc;  /* list of all collectable objects */
+  GCObject *finobj;  /* list of collectable objects with finalizers */
+  GCObject **sweepgc;  /* current position of sweep in list 'allgc' */
+  GCObject **sweepfin;  /* current position of sweep in list 'finobj' */
+  GCObject *gray;  /* list of gray objects */
+  GCObject *grayagain;  /* list of objects to be traversed atomically */
+  GCObject *weak;  /* list of tables with weak values */
+  GCObject *ephemeron;  /* list of ephemeron tables (weak keys) */
+  GCObject *allweak;  /* list of all-weak tables */
+  GCObject *tobefnz;  /* list of all objects being finalized */
+  UpVal uvhead;  /* head of double-linked list of all open upvalues */
+  Mbuffer buff;  /* temporary buffer for string concatenation */
+  int gcpause;  /* size of pause between successive GCs */
+  int gcmajorinc;  /* pause between major collections (only in gen. mode) */
+  int gcstepmul;  /* GC `granularity' */
+  lua_CFunction panic;  /* to be called in unprotected errors */
+  struct lua_State *mainthread;
+  const lua_Number *version;  /* pointer to version number */
+  TString *memerrmsg;  /* memory-error message */
+  TString *tmname[TM_N];  /* array with tag-method names, indexed by event (see gfasttm) */
+  struct Table *mt[LUA_NUMTAGS];  /* metatables for basic types */
+} global_State;
+
+
+/*
+** `per thread' state
+*/
+struct lua_State {
+  CommonHeader;
+  lu_byte status;  /* NOTE(review): presumably a thread status code (LUA_OK, LUA_YIELD, ...) -- confirm */
+  StkId top;  /* first free slot in the stack */
+  global_State *l_G;  /* global state shared by all threads; reached through G(L) */
+  CallInfo *ci;  /* call info for current function */
+  const Instruction *oldpc;  /* last pc traced */
+  StkId stack_last;  /* last free slot in the stack */
+  StkId stack;  /* stack base */
+  int stacksize;
+  unsigned short nny;  /* number of non-yieldable calls in stack */
+  unsigned short nCcalls;  /* number of nested C calls */
+  lu_byte hookmask;  /* NOTE(review): presumably a combination of LUA_MASK* bits -- confirm */
+  lu_byte allowhook;
+  int basehookcount;
+  int hookcount;
+  lua_Hook hook;  /* NOTE(review): presumably the function installed by lua_sethook -- confirm */
+  GCObject *openupval;  /* list of open upvalues in this stack */
+  GCObject *gclist;
+  struct lua_longjmp *errorJmp;  /* current error recover point */
+  ptrdiff_t errfunc;  /* current error handling function (stack index) */
+  CallInfo base_ci;  /* CallInfo for first level (C calling Lua) */
+};
+
+
+#define G(L)	(L->l_G)
+
+
+/*
+** Union of all collectable objects
+*/
+union GCObject {
+  GCheader gch;  /* common header */
+  union TString ts;
+  union Udata u;
+  union Closure cl;
+  struct Table h;
+  struct Proto p;
+  struct UpVal uv;
+  struct lua_State th;  /* thread */
+};
+
+
+#define gch(o)		(&(o)->gch)
+
+/* macros to convert a GCObject into a specific value */
+#define rawgco2ts(o)  \
+	check_exp(novariant((o)->gch.tt) == LUA_TSTRING, &((o)->ts))
+#define gco2ts(o)	(&rawgco2ts(o)->tsv)
+#define rawgco2u(o)	check_exp((o)->gch.tt == LUA_TUSERDATA, &((o)->u))
+#define gco2u(o)	(&rawgco2u(o)->uv)
+#define gco2lcl(o)	check_exp((o)->gch.tt == LUA_TLCL, &((o)->cl.l))
+#define gco2ccl(o)	check_exp((o)->gch.tt == LUA_TCCL, &((o)->cl.c))
+#define gco2cl(o)  \
+	check_exp(novariant((o)->gch.tt) == LUA_TFUNCTION, &((o)->cl))
+#define gco2t(o)	check_exp((o)->gch.tt == LUA_TTABLE, &((o)->h))
+#define gco2p(o)	check_exp((o)->gch.tt == LUA_TPROTO, &((o)->p))
+#define gco2uv(o)	check_exp((o)->gch.tt == LUA_TUPVAL, &((o)->uv))
+#define gco2th(o)	check_exp((o)->gch.tt == LUA_TTHREAD, &((o)->th))
+
+/* macro to convert any Lua object into a GCObject */
+#define obj2gco(v)	(cast(GCObject *, (v)))
+
+
+/* actual number of total bytes allocated */
+#define gettotalbytes(g)	((g)->totalbytes + (g)->GCdebt)
+
+LUAI_FUNC void luaE_setdebt (global_State *g, l_mem debt);
+LUAI_FUNC void luaE_freethread (lua_State *L, lua_State *L1);
+LUAI_FUNC CallInfo *luaE_extendCI (lua_State *L);
+LUAI_FUNC void luaE_freeCI (lua_State *L);
+
+
+#endif
+
diff --git a/ext/lua/includes/lstring.h b/ext/lua/includes/lstring.h
new file mode 100644
index 0000000..d312ff3
--- /dev/null
+++ b/ext/lua/includes/lstring.h
@@ -0,0 +1,46 @@
+/*
+** $Id: lstring.h,v 1.49 2012/02/01 21:57:15 roberto Exp $
+** String table (keep all strings handled by Lua)
+** See Copyright Notice in lua.h
+*/
+
+#ifndef lstring_h
+#define lstring_h
+
+#include "lgc.h"
+#include "lobject.h"
+#include "lstate.h"
+
+
+#define sizestring(s)	(sizeof(union TString)+((s)->len+1)*sizeof(char))
+
+#define sizeudata(u)	(sizeof(union Udata)+(u)->len)
+
+#define luaS_newliteral(L, s)	(luaS_newlstr(L, "" s, \
+                                 (sizeof(s)/sizeof(char))-1))
+
+#define luaS_fix(s)	l_setbit((s)->tsv.marked, FIXEDBIT)
+
+
+/*
+** test whether a string is a reserved word
+*/
+#define isreserved(s)	((s)->tsv.tt == LUA_TSHRSTR && (s)->tsv.extra > 0)
+
+
+/*
+** equality for short strings, which are always internalized
+*/
+#define eqshrstr(a,b)	check_exp((a)->tsv.tt == LUA_TSHRSTR, (a) == (b))
+
+
+LUAI_FUNC unsigned int luaS_hash (const char *str, size_t l, unsigned int seed);
+LUAI_FUNC int luaS_eqlngstr (TString *a, TString *b);
+LUAI_FUNC int luaS_eqstr (TString *a, TString *b);
+LUAI_FUNC void luaS_resize (lua_State *L, int newsize);
+LUAI_FUNC Udata *luaS_newudata (lua_State *L, size_t s, Table *e);
+LUAI_FUNC TString *luaS_newlstr (lua_State *L, const char *str, size_t l);
+LUAI_FUNC TString *luaS_new (lua_State *L, const char *str);
+
+
+#endif
diff --git a/ext/lua/includes/ltable.h b/ext/lua/includes/ltable.h
new file mode 100644
index 0000000..2f6f5c2
--- /dev/null
+++ b/ext/lua/includes/ltable.h
@@ -0,0 +1,41 @@
+/*
+** $Id: ltable.h,v 2.16 2011/08/17 20:26:47 roberto Exp $
+** Lua tables (hash)
+** See Copyright Notice in lua.h
+*/
+
+#ifndef ltable_h
+#define ltable_h
+
+#include "lobject.h"
+
+
+#define gnode(t,i)	(&(t)->node[i])
+#define gkey(n)		(&(n)->i_key.tvk)
+#define gval(n)		(&(n)->i_val)
+#define gnext(n)	((n)->i_key.nk.next)
+
+#define invalidateTMcache(t)	((t)->flags = 0)
+
+
+LUAI_FUNC const TValue *luaH_getint (Table *t, int key);
+LUAI_FUNC void luaH_setint (lua_State *L, Table *t, int key, TValue *value);
+LUAI_FUNC const TValue *luaH_getstr (Table *t, TString *key);
+LUAI_FUNC const TValue *luaH_get (Table *t, const TValue *key);
+LUAI_FUNC TValue *luaH_newkey (lua_State *L, Table *t, const TValue *key);
+LUAI_FUNC TValue *luaH_set (lua_State *L, Table *t, const TValue *key);
+LUAI_FUNC Table *luaH_new (lua_State *L);
+LUAI_FUNC void luaH_resize (lua_State *L, Table *t, int nasize, int nhsize);
+LUAI_FUNC void luaH_resizearray (lua_State *L, Table *t, int nasize);
+LUAI_FUNC void luaH_free (lua_State *L, Table *t);
+LUAI_FUNC int luaH_next (lua_State *L, Table *t, StkId key);
+LUAI_FUNC int luaH_getn (Table *t);
+
+
+#if defined(LUA_DEBUG)
+LUAI_FUNC Node *luaH_mainposition (const Table *t, const TValue *key);
+LUAI_FUNC int luaH_isdummy (Node *n);
+#endif
+
+
+#endif
diff --git a/ext/lua/includes/ltm.h b/ext/lua/includes/ltm.h
new file mode 100644
index 0000000..89bdc19
--- /dev/null
+++ b/ext/lua/includes/ltm.h
@@ -0,0 +1,57 @@
+/*
+** $Id: ltm.h,v 2.11 2011/02/28 17:32:10 roberto Exp $
+** Tag methods
+** See Copyright Notice in lua.h
+*/
+
+#ifndef ltm_h
+#define ltm_h
+
+
+#include "lobject.h"
+
+
+/*
+* WARNING: if you change the order of this enumeration,
+* grep "ORDER TM"
+*/
+typedef enum {
+  TM_INDEX,
+  TM_NEWINDEX,
+  TM_GC,
+  TM_MODE,
+  TM_LEN,
+  TM_EQ,  /* last tag method with `fast' access */
+  TM_ADD,
+  TM_SUB,
+  TM_MUL,
+  TM_DIV,
+  TM_MOD,
+  TM_POW,
+  TM_UNM,
+  TM_LT,
+  TM_LE,
+  TM_CONCAT,
+  TM_CALL,
+  TM_N		/* number of elements in the enum */
+} TMS;
+
+
+
+#define gfasttm(g,et,e) ((et) == NULL ? NULL : \
+  ((et)->flags & (1u<<(e))) ? NULL : luaT_gettm(et, e, (g)->tmname[e]))
+
+#define fasttm(l,et,e)	gfasttm(G(l), et, e)
+
+#define ttypename(x)	luaT_typenames_[(x) + 1]
+#define objtypename(x)	ttypename(ttypenv(x))
+
+LUAI_DDEC const char *const luaT_typenames_[LUA_TOTALTAGS];
+
+
+LUAI_FUNC const TValue *luaT_gettm (Table *events, TMS event, TString *ename);
+LUAI_FUNC const TValue *luaT_gettmbyobj (lua_State *L, const TValue *o,
+                                                       TMS event);
+LUAI_FUNC void luaT_init (lua_State *L);
+
+#endif
diff --git a/ext/lua/includes/lua.h b/ext/lua/includes/lua.h
new file mode 100644
index 0000000..eb0482b
--- /dev/null
+++ b/ext/lua/includes/lua.h
@@ -0,0 +1,444 @@
+/*
+** $Id: lua.h,v 1.285 2013/03/15 13:04:22 roberto Exp $
+** Lua - A Scripting Language
+** Lua.org, PUC-Rio, Brazil (http://www.lua.org)
+** See Copyright Notice at the end of this file
+*/
+
+
+#ifndef lua_h
+#define lua_h
+
+#include <stdarg.h>
+#include <stddef.h>
+
+
+#include "luaconf.h"
+
+
+#define LUA_VERSION_MAJOR	"5"
+#define LUA_VERSION_MINOR	"2"
+#define LUA_VERSION_NUM		502
+#define LUA_VERSION_RELEASE	"2"
+
+#define LUA_VERSION	"Lua " LUA_VERSION_MAJOR "." LUA_VERSION_MINOR
+#define LUA_RELEASE	LUA_VERSION "." LUA_VERSION_RELEASE
+#define LUA_COPYRIGHT	LUA_RELEASE "  Copyright (C) 1994-2013 Lua.org, PUC-Rio"
+#define LUA_AUTHORS	"R. Ierusalimschy, L. H. de Figueiredo, W. Celes"
+
+
+/* mark for precompiled code ('<esc>Lua') */
+#define LUA_SIGNATURE	"\033Lua"
+
+/* option for multiple returns in 'lua_pcall' and 'lua_call' */
+#define LUA_MULTRET	(-1)
+
+
+/*
+** pseudo-indices
+*/
+#define LUA_REGISTRYINDEX	LUAI_FIRSTPSEUDOIDX
+#define lua_upvalueindex(i)	(LUA_REGISTRYINDEX - (i))
+
+
+/* thread status */
+#define LUA_OK		0
+#define LUA_YIELD	1
+#define LUA_ERRRUN	2
+#define LUA_ERRSYNTAX	3
+#define LUA_ERRMEM	4
+#define LUA_ERRGCMM	5
+#define LUA_ERRERR	6
+
+
+typedef struct lua_State lua_State;
+
+typedef int (*lua_CFunction) (lua_State *L);
+
+
+/*
+** functions that read/write blocks when loading/dumping Lua chunks
+*/
+typedef const char * (*lua_Reader) (lua_State *L, void *ud, size_t *sz);
+
+typedef int (*lua_Writer) (lua_State *L, const void* p, size_t sz, void* ud);
+
+
+/*
+** prototype for memory-allocation functions
+*/
+typedef void * (*lua_Alloc) (void *ud, void *ptr, size_t osize, size_t nsize);
+
+
+/*
+** basic types
+*/
+#define LUA_TNONE		(-1)
+
+#define LUA_TNIL		0
+#define LUA_TBOOLEAN		1
+#define LUA_TLIGHTUSERDATA	2
+#define LUA_TNUMBER		3
+#define LUA_TSTRING		4
+#define LUA_TTABLE		5
+#define LUA_TFUNCTION		6
+#define LUA_TUSERDATA		7
+#define LUA_TTHREAD		8
+
+#define LUA_NUMTAGS		9
+
+
+
+/* minimum Lua stack available to a C function */
+#define LUA_MINSTACK	20
+
+
+/* predefined values in the registry */
+#define LUA_RIDX_MAINTHREAD	1
+#define LUA_RIDX_GLOBALS	2
+#define LUA_RIDX_LAST		LUA_RIDX_GLOBALS
+
+
+/* type of numbers in Lua */
+typedef LUA_NUMBER lua_Number;
+
+
+/* type for integer functions */
+typedef LUA_INTEGER lua_Integer;
+
+/* unsigned integer type */
+typedef LUA_UNSIGNED lua_Unsigned;
+
+
+
+/*
+** generic extra include file
+*/
+#if defined(LUA_USER_H)
+#include LUA_USER_H
+#endif
+
+
+/*
+** RCS ident string
+*/
+extern const char lua_ident[];
+
+
+/*
+** state manipulation
+*/
+LUA_API lua_State *(lua_newstate) (lua_Alloc f, void *ud);
+LUA_API void       (lua_close) (lua_State *L);
+LUA_API lua_State *(lua_newthread) (lua_State *L);
+
+LUA_API lua_CFunction (lua_atpanic) (lua_State *L, lua_CFunction panicf);
+
+
+LUA_API const lua_Number *(lua_version) (lua_State *L);
+
+
+/*
+** basic stack manipulation
+*/
+LUA_API int   (lua_absindex) (lua_State *L, int idx);
+LUA_API int   (lua_gettop) (lua_State *L);
+LUA_API void  (lua_settop) (lua_State *L, int idx);
+LUA_API void  (lua_pushvalue) (lua_State *L, int idx);
+LUA_API void  (lua_remove) (lua_State *L, int idx);
+LUA_API void  (lua_insert) (lua_State *L, int idx);
+LUA_API void  (lua_replace) (lua_State *L, int idx);
+LUA_API void  (lua_copy) (lua_State *L, int fromidx, int toidx);
+LUA_API int   (lua_checkstack) (lua_State *L, int sz);
+
+LUA_API void  (lua_xmove) (lua_State *from, lua_State *to, int n);
+
+
+/*
+** access functions (stack -> C)
+*/
+
+LUA_API int             (lua_isnumber) (lua_State *L, int idx);
+LUA_API int             (lua_isstring) (lua_State *L, int idx);
+LUA_API int             (lua_iscfunction) (lua_State *L, int idx);
+LUA_API int             (lua_isuserdata) (lua_State *L, int idx);
+LUA_API int             (lua_type) (lua_State *L, int idx);
+LUA_API const char     *(lua_typename) (lua_State *L, int tp);
+
+LUA_API lua_Number      (lua_tonumberx) (lua_State *L, int idx, int *isnum);
+LUA_API lua_Integer     (lua_tointegerx) (lua_State *L, int idx, int *isnum);
+LUA_API lua_Unsigned    (lua_tounsignedx) (lua_State *L, int idx, int *isnum);
+LUA_API int             (lua_toboolean) (lua_State *L, int idx);
+LUA_API const char     *(lua_tolstring) (lua_State *L, int idx, size_t *len);
+LUA_API size_t          (lua_rawlen) (lua_State *L, int idx);
+LUA_API lua_CFunction   (lua_tocfunction) (lua_State *L, int idx);
+LUA_API void	       *(lua_touserdata) (lua_State *L, int idx);
+LUA_API lua_State      *(lua_tothread) (lua_State *L, int idx);
+LUA_API const void     *(lua_topointer) (lua_State *L, int idx);
+
+
+/*
+** Comparison and arithmetic functions
+*/
+
+#define LUA_OPADD	0	/* ORDER TM */
+#define LUA_OPSUB	1
+#define LUA_OPMUL	2
+#define LUA_OPDIV	3
+#define LUA_OPMOD	4
+#define LUA_OPPOW	5
+#define LUA_OPUNM	6
+
+LUA_API void  (lua_arith) (lua_State *L, int op);
+
+#define LUA_OPEQ	0
+#define LUA_OPLT	1
+#define LUA_OPLE	2
+
+LUA_API int   (lua_rawequal) (lua_State *L, int idx1, int idx2);
+LUA_API int   (lua_compare) (lua_State *L, int idx1, int idx2, int op);
+
+
+/*
+** push functions (C -> stack)
+*/
+LUA_API void        (lua_pushnil) (lua_State *L);
+LUA_API void        (lua_pushnumber) (lua_State *L, lua_Number n);
+LUA_API void        (lua_pushinteger) (lua_State *L, lua_Integer n);
+LUA_API void        (lua_pushunsigned) (lua_State *L, lua_Unsigned n);
+LUA_API const char *(lua_pushlstring) (lua_State *L, const char *s, size_t l);
+LUA_API const char *(lua_pushstring) (lua_State *L, const char *s);
+LUA_API const char *(lua_pushvfstring) (lua_State *L, const char *fmt,
+                                                      va_list argp);
+LUA_API const char *(lua_pushfstring) (lua_State *L, const char *fmt, ...);
+LUA_API void  (lua_pushcclosure) (lua_State *L, lua_CFunction fn, int n);
+LUA_API void  (lua_pushboolean) (lua_State *L, int b);
+LUA_API void  (lua_pushlightuserdata) (lua_State *L, void *p);
+LUA_API int   (lua_pushthread) (lua_State *L);
+
+
+/*
+** get functions (Lua -> stack)
+*/
+LUA_API void  (lua_getglobal) (lua_State *L, const char *var);
+LUA_API void  (lua_gettable) (lua_State *L, int idx);
+LUA_API void  (lua_getfield) (lua_State *L, int idx, const char *k);
+LUA_API void  (lua_rawget) (lua_State *L, int idx);
+LUA_API void  (lua_rawgeti) (lua_State *L, int idx, int n);
+LUA_API void  (lua_rawgetp) (lua_State *L, int idx, const void *p);
+LUA_API void  (lua_createtable) (lua_State *L, int narr, int nrec);
+LUA_API void *(lua_newuserdata) (lua_State *L, size_t sz);
+LUA_API int   (lua_getmetatable) (lua_State *L, int objindex);
+LUA_API void  (lua_getuservalue) (lua_State *L, int idx);
+
+
+/*
+** set functions (stack -> Lua)
+*/
+LUA_API void  (lua_setglobal) (lua_State *L, const char *var);
+LUA_API void  (lua_settable) (lua_State *L, int idx);
+LUA_API void  (lua_setfield) (lua_State *L, int idx, const char *k);
+LUA_API void  (lua_rawset) (lua_State *L, int idx);
+LUA_API void  (lua_rawseti) (lua_State *L, int idx, int n);
+LUA_API void  (lua_rawsetp) (lua_State *L, int idx, const void *p);
+LUA_API int   (lua_setmetatable) (lua_State *L, int objindex);
+LUA_API void  (lua_setuservalue) (lua_State *L, int idx);
+
+
+/*
+** 'load' and 'call' functions (load and run Lua code)
+*/
+LUA_API void  (lua_callk) (lua_State *L, int nargs, int nresults, int ctx,
+                           lua_CFunction k);
+#define lua_call(L,n,r)		lua_callk(L, (n), (r), 0, NULL)
+
+LUA_API int   (lua_getctx) (lua_State *L, int *ctx);
+
+LUA_API int   (lua_pcallk) (lua_State *L, int nargs, int nresults, int errfunc,
+                            int ctx, lua_CFunction k);
+#define lua_pcall(L,n,r,f)	lua_pcallk(L, (n), (r), (f), 0, NULL)
+
+LUA_API int   (lua_load) (lua_State *L, lua_Reader reader, void *dt,
+                                        const char *chunkname,
+                                        const char *mode);
+
+LUA_API int (lua_dump) (lua_State *L, lua_Writer writer, void *data);
+
+
+/*
+** coroutine functions
+*/
+LUA_API int  (lua_yieldk) (lua_State *L, int nresults, int ctx,
+                           lua_CFunction k);
+#define lua_yield(L,n)		lua_yieldk(L, (n), 0, NULL)
+LUA_API int  (lua_resume) (lua_State *L, lua_State *from, int narg);
+LUA_API int  (lua_status) (lua_State *L);
+
+/*
+** garbage-collection function and options
+*/
+
+#define LUA_GCSTOP		0
+#define LUA_GCRESTART		1
+#define LUA_GCCOLLECT		2
+#define LUA_GCCOUNT		3
+#define LUA_GCCOUNTB		4
+#define LUA_GCSTEP		5
+#define LUA_GCSETPAUSE		6
+#define LUA_GCSETSTEPMUL	7
+#define LUA_GCSETMAJORINC	8
+#define LUA_GCISRUNNING		9
+#define LUA_GCGEN		10
+#define LUA_GCINC		11
+
+LUA_API int (lua_gc) (lua_State *L, int what, int data);
+
+
+/*
+** miscellaneous functions
+*/
+
+LUA_API int   (lua_error) (lua_State *L);
+
+LUA_API int   (lua_next) (lua_State *L, int idx);
+
+LUA_API void  (lua_concat) (lua_State *L, int n);
+LUA_API void  (lua_len)    (lua_State *L, int idx);
+
+LUA_API lua_Alloc (lua_getallocf) (lua_State *L, void **ud);
+LUA_API void      (lua_setallocf) (lua_State *L, lua_Alloc f, void *ud);
+
+
+
+/*
+** ===============================================================
+** some useful macros
+** ===============================================================
+*/
+
+#define lua_tonumber(L,i)	lua_tonumberx(L,i,NULL)
+#define lua_tointeger(L,i)	lua_tointegerx(L,i,NULL)
+#define lua_tounsigned(L,i)	lua_tounsignedx(L,i,NULL)
+
+#define lua_pop(L,n)		lua_settop(L, -(n)-1)
+
+#define lua_newtable(L)		lua_createtable(L, 0, 0)
+
+#define lua_register(L,n,f) (lua_pushcfunction(L, (f)), lua_setglobal(L, (n)))
+
+#define lua_pushcfunction(L,f)	lua_pushcclosure(L, (f), 0)
+
+#define lua_isfunction(L,n)	(lua_type(L, (n)) == LUA_TFUNCTION)
+#define lua_istable(L,n)	(lua_type(L, (n)) == LUA_TTABLE)
+#define lua_islightuserdata(L,n)	(lua_type(L, (n)) == LUA_TLIGHTUSERDATA)
+#define lua_isnil(L,n)		(lua_type(L, (n)) == LUA_TNIL)
+#define lua_isboolean(L,n)	(lua_type(L, (n)) == LUA_TBOOLEAN)
+#define lua_isthread(L,n)	(lua_type(L, (n)) == LUA_TTHREAD)
+#define lua_isnone(L,n)		(lua_type(L, (n)) == LUA_TNONE)
+#define lua_isnoneornil(L, n)	(lua_type(L, (n)) <= 0)
+
+#define lua_pushliteral(L, s)	\
+	lua_pushlstring(L, "" s, (sizeof(s)/sizeof(char))-1)
+
+#define lua_pushglobaltable(L)  \
+	lua_rawgeti(L, LUA_REGISTRYINDEX, LUA_RIDX_GLOBALS)
+
+#define lua_tostring(L,i)	lua_tolstring(L, (i), NULL)
+
+
+
+/*
+** {======================================================================
+** Debug API
+** =======================================================================
+*/
+
+
+/*
+** Event codes
+*/
+#define LUA_HOOKCALL	0
+#define LUA_HOOKRET	1
+#define LUA_HOOKLINE	2
+#define LUA_HOOKCOUNT	3
+#define LUA_HOOKTAILCALL 4
+
+
+/*
+** Event masks
+*/
+#define LUA_MASKCALL	(1 << LUA_HOOKCALL)
+#define LUA_MASKRET	(1 << LUA_HOOKRET)
+#define LUA_MASKLINE	(1 << LUA_HOOKLINE)
+#define LUA_MASKCOUNT	(1 << LUA_HOOKCOUNT)
+
+typedef struct lua_Debug lua_Debug;  /* activation record */
+
+
+/* Functions to be called by the debugger in specific events */
+typedef void (*lua_Hook) (lua_State *L, lua_Debug *ar);
+
+
+LUA_API int (lua_getstack) (lua_State *L, int level, lua_Debug *ar);
+LUA_API int (lua_getinfo) (lua_State *L, const char *what, lua_Debug *ar);
+LUA_API const char *(lua_getlocal) (lua_State *L, const lua_Debug *ar, int n);
+LUA_API const char *(lua_setlocal) (lua_State *L, const lua_Debug *ar, int n);
+LUA_API const char *(lua_getupvalue) (lua_State *L, int funcindex, int n);
+LUA_API const char *(lua_setupvalue) (lua_State *L, int funcindex, int n);
+
+LUA_API void *(lua_upvalueid) (lua_State *L, int fidx, int n);
+LUA_API void  (lua_upvaluejoin) (lua_State *L, int fidx1, int n1,
+                                               int fidx2, int n2);
+
+LUA_API int (lua_sethook) (lua_State *L, lua_Hook func, int mask, int count);
+LUA_API lua_Hook (lua_gethook) (lua_State *L);
+LUA_API int (lua_gethookmask) (lua_State *L);
+LUA_API int (lua_gethookcount) (lua_State *L);
+
+
+struct lua_Debug {  /* activation record */
+  int event;  /* NOTE(review): presumably one of the LUA_HOOK* event codes -- confirm */
+  const char *name;	/* (n) -- NOTE(review): "(x)" tags presumably name the lua_getinfo 'what' option; confirm */
+  const char *namewhat;	/* (n) 'global', 'local', 'field', 'method' */
+  const char *what;	/* (S) 'Lua', 'C', 'main', 'tail' */
+  const char *source;	/* (S) */
+  int currentline;	/* (l) */
+  int linedefined;	/* (S) */
+  int lastlinedefined;	/* (S) */
+  unsigned char nups;	/* (u) number of upvalues */
+  unsigned char nparams;/* (u) number of parameters */
+  char isvararg;        /* (u) */
+  char istailcall;	/* (t) */
+  char short_src[LUA_IDSIZE]; /* (S) */
+  /* private part */
+  struct CallInfo *i_ci;  /* active function */
+};
+
+/* }====================================================================== */
+
+
+/******************************************************************************
+* Copyright (C) 1994-2013 Lua.org, PUC-Rio.
+*
+* Permission is hereby granted, free of charge, to any person obtaining
+* a copy of this software and associated documentation files (the
+* "Software"), to deal in the Software without restriction, including
+* without limitation the rights to use, copy, modify, merge, publish,
+* distribute, sublicense, and/or sell copies of the Software, and to
+* permit persons to whom the Software is furnished to do so, subject to
+* the following conditions:
+*
+* The above copyright notice and this permission notice shall be
+* included in all copies or substantial portions of the Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
+* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+******************************************************************************/
+
+
+#endif
diff --git a/ext/lua/includes/luaconf.h b/ext/lua/includes/luaconf.h
new file mode 100644
index 0000000..5c1d0ff
--- /dev/null
+++ b/ext/lua/includes/luaconf.h
@@ -0,0 +1,551 @@
+/*
+** $Id: luaconf.h,v 1.176 2013/03/16 21:10:18 roberto Exp $
+** Configuration file for Lua
+** See Copyright Notice in lua.h
+*/
+
+
+#ifndef lconfig_h
+#define lconfig_h
+
+#include <limits.h>
+#include <stddef.h>
+
+
+/*
+** ==================================================================
+** Search for "@@" to find all configurable definitions.
+** ===================================================================
+*/
+
+
+/*
+@@ LUA_ANSI controls the use of non-ansi features.
+** CHANGE it (define it) if you want Lua to avoid the use of any
+** non-ansi feature or library.
+*/
+#if !defined(LUA_ANSI) && defined(__STRICT_ANSI__)
+#define LUA_ANSI
+#endif
+
+
+#if !defined(LUA_ANSI) && defined(_WIN32) && !defined(_WIN32_WCE)
+#define LUA_WIN		/* enable goodies for regular Windows platforms */
+#endif
+
+#if defined(LUA_WIN)
+#define LUA_DL_DLL
+#define LUA_USE_AFORMAT		/* assume 'printf' handles 'aA' specifiers */
+#endif
+
+
+
+#if defined(LUA_USE_LINUX)
+#define LUA_USE_POSIX
+#define LUA_USE_DLOPEN		/* needs an extra library: -ldl */
+//#define LUA_USE_READLINE	/* needs some extra libraries */
+#define LUA_USE_STRTODHEX	/* assume 'strtod' handles hex formats */
+#define LUA_USE_AFORMAT		/* assume 'printf' handles 'aA' specifiers */
+#define LUA_USE_LONGLONG	/* assume support for long long */
+#endif
+
+#if defined(LUA_USE_MACOSX)
+#define LUA_USE_POSIX
+#define LUA_USE_DLOPEN		/* does not need -ldl */
+#define LUA_USE_READLINE	/* needs an extra library: -lreadline */
+#define LUA_USE_STRTODHEX	/* assume 'strtod' handles hex formats */
+#define LUA_USE_AFORMAT		/* assume 'printf' handles 'aA' specifiers */
+#define LUA_USE_LONGLONG	/* assume support for long long */
+#endif
+
+
+
+/*
+@@ LUA_USE_POSIX includes all functionality listed as X/Open System
+@* Interfaces Extension (XSI).
+** CHANGE it (define it) if your system is XSI compatible.
+*/
+#if defined(LUA_USE_POSIX)
+#define LUA_USE_MKSTEMP
+#define LUA_USE_ISATTY
+#define LUA_USE_POPEN
+#define LUA_USE_ULONGJMP
+#define LUA_USE_GMTIME_R
+#endif
+
+
+
+/*
+@@ LUA_PATH_DEFAULT is the default path that Lua uses to look for
+@* Lua libraries.
+@@ LUA_CPATH_DEFAULT is the default path that Lua uses to look for
+@* C libraries.
+** CHANGE them if your machine has a non-conventional directory
+** hierarchy or if you want to install your libraries in
+** non-conventional directories.
+*/
+#if defined(_WIN32)	/* { */
+/*
+** In Windows, any exclamation mark ('!') in the path is replaced by the
+** path of the directory of the executable file of the current process.
+*/
+#define LUA_LDIR	"!\\lua\\"
+#define LUA_CDIR	"!\\"
+#define LUA_PATH_DEFAULT  \
+		LUA_LDIR"?.lua;"  LUA_LDIR"?\\init.lua;" \
+		LUA_CDIR"?.lua;"  LUA_CDIR"?\\init.lua;" ".\\?.lua"
+#define LUA_CPATH_DEFAULT \
+		LUA_CDIR"?.dll;" LUA_CDIR"loadall.dll;" ".\\?.dll"
+
+#else			/* }{ */
+
+#define LUA_VDIR	LUA_VERSION_MAJOR "." LUA_VERSION_MINOR "/"
+#define LUA_ROOT	"/usr/local/"
+#define LUA_LDIR	LUA_ROOT "share/lua/" LUA_VDIR
+#define LUA_CDIR	LUA_ROOT "lib/lua/" LUA_VDIR
+#define LUA_PATH_DEFAULT  \
+		LUA_LDIR"?.lua;"  LUA_LDIR"?/init.lua;" \
+		LUA_CDIR"?.lua;"  LUA_CDIR"?/init.lua;" "./?.lua"
+#define LUA_CPATH_DEFAULT \
+		LUA_CDIR"?.so;" LUA_CDIR"loadall.so;" "./?.so"
+#endif			/* } */
+
+
+/*
+@@ LUA_DIRSEP is the directory separator (for submodules).
+** CHANGE it if your machine does not use "/" as the directory separator
+** and is not Windows. (On Windows Lua automatically uses "\".)
+*/
+#if defined(_WIN32)
+#define LUA_DIRSEP	"\\"
+#else
+#define LUA_DIRSEP	"/"
+#endif
+
+
+/*
+@@ LUA_ENV is the name of the variable that holds the current
+@@ environment, used to access global names.
+** CHANGE it if you do not like this name.
+*/
+#define LUA_ENV		"_ENV"
+
+
+/*
+@@ LUA_API is a mark for all core API functions.
+@@ LUALIB_API is a mark for all auxiliary library functions.
+@@ LUAMOD_API is a mark for all standard library opening functions.
+** CHANGE them if you need to define those functions in some special way.
+** For instance, if you want to create one Windows DLL with the core and
+** the libraries, you may want to use the following definition (define
+** LUA_BUILD_AS_DLL to get it).
+*/
+#if defined(LUA_BUILD_AS_DLL)	/* { */
+
+#if defined(LUA_CORE) || defined(LUA_LIB)	/* { */
+#define LUA_API __declspec(dllexport)
+#else						/* }{ */
+#define LUA_API __declspec(dllimport)
+#endif						/* } */
+
+#else				/* }{ */
+
+#define LUA_API		extern
+
+#endif				/* } */
+
+
+/* more often than not the libs go together with the core */
+#define LUALIB_API	LUA_API
+#define LUAMOD_API	LUALIB_API
+
+
+/*
+@@ LUAI_FUNC is a mark for all extern functions that are not to be
+@* exported to outside modules.
+@@ LUAI_DDEF and LUAI_DDEC are marks for all extern (const) variables
+@* that are not to be exported to outside modules (LUAI_DDEF for
+@* definitions and LUAI_DDEC for declarations).
+** CHANGE them if you need to mark them in some special way. Elf/gcc
+** (versions 3.2 and later) mark them as "hidden" to optimize access
+** when Lua is compiled as a shared library. Not all elf targets support
+** this attribute. Unfortunately, gcc does not offer a way to check
+** whether the target offers that support, and those without support
+** give a warning about it. To avoid these warnings, change to the
+** default definition.
+*/
+#if defined(__GNUC__) && ((__GNUC__*100 + __GNUC_MINOR__) >= 302) && \
+    defined(__ELF__)		/* { */
+#define LUAI_FUNC	__attribute__((visibility("hidden"))) extern
+#define LUAI_DDEC	LUAI_FUNC
+#define LUAI_DDEF	/* empty */
+
+#else				/* }{ */
+#define LUAI_FUNC	extern
+#define LUAI_DDEC	extern
+#define LUAI_DDEF	/* empty */
+#endif				/* } */
+
+
+
+/*
+@@ LUA_QL describes how error messages quote program elements.
+** CHANGE it if you want a different appearance.
+*/
+#define LUA_QL(x)	"'" x "'"
+#define LUA_QS		LUA_QL("%s")
+
+
+/*
+@@ LUA_IDSIZE gives the maximum size for the description of the source
+@* of a function in debug information.
+** CHANGE it if you want a different size.
+*/
+#define LUA_IDSIZE	60
+
+
+/*
+@@ luai_writestring/luai_writeline define how 'print' prints its results.
+** They are only used in libraries and the stand-alone program. (The #if
+** avoids including 'stdio.h' everywhere.)
+*/
+#if defined(LUA_LIB) || defined(lua_c)
+#include <stdio.h>
+#define luai_writestring(s,l)	fwrite((s), sizeof(char), (l), stdout)
+#define luai_writeline()	(luai_writestring("\n", 1), fflush(stdout))
+#endif
+
+/*
+@@ luai_writestringerror defines how to print error messages.
+** (A format string with one argument is enough for Lua...)
+*/
+#define luai_writestringerror(s,p) \
+	(fprintf(stderr, (s), (p)), fflush(stderr))
+
+
+/*
+@@ LUAI_MAXSHORTLEN is the maximum length for short strings, that is,
+** strings that are internalized. (Cannot be smaller than reserved words
+** or tags for metamethods, as these strings must be internalized;
+** #("function") = 8, #("__newindex") = 10.)
+*/
+#define LUAI_MAXSHORTLEN        40
+
+
+
+/*
+** {==================================================================
+** Compatibility with previous versions
+** ===================================================================
+*/
+
+/*
+@@ LUA_COMPAT_ALL controls all compatibility options.
+** You can define it to get all options, or change specific options
+** to fit your specific needs.
+*/
+#if defined(LUA_COMPAT_ALL)	/* { */
+
+/*
+@@ LUA_COMPAT_UNPACK controls the presence of global 'unpack'.
+** You can replace it with 'table.unpack'.
+*/
+#define LUA_COMPAT_UNPACK
+
+/*
+@@ LUA_COMPAT_LOADERS controls the presence of table 'package.loaders'.
+** You can replace it with 'package.searchers'.
+*/
+#define LUA_COMPAT_LOADERS
+
+/*
+@@ macro 'lua_cpcall' emulates deprecated function lua_cpcall.
+** You can call your C function directly (with light C functions).
+*/
+#define lua_cpcall(L,f,u)  \
+	(lua_pushcfunction(L, (f)), \
+	 lua_pushlightuserdata(L,(u)), \
+	 lua_pcall(L,1,0,0))
+
+
+/*
+@@ LUA_COMPAT_LOG10 defines the function 'log10' in the math library.
+** You can rewrite 'log10(x)' as 'log(x, 10)'.
+*/
+#define LUA_COMPAT_LOG10
+
+/*
+@@ LUA_COMPAT_LOADSTRING defines the function 'loadstring' in the base
+** library. You can rewrite 'loadstring(s)' as 'load(s)'.
+*/
+#define LUA_COMPAT_LOADSTRING
+
+/*
+@@ LUA_COMPAT_MAXN defines the function 'maxn' in the table library.
+*/
+#define LUA_COMPAT_MAXN
+
+/*
+@@ The following macros supply trivial compatibility for some
+** changes in the API. The macros themselves document how to
+** change your code to avoid using them.
+*/
+#define lua_strlen(L,i)		lua_rawlen(L, (i))
+
+#define lua_objlen(L,i)		lua_rawlen(L, (i))
+
+#define lua_equal(L,idx1,idx2)		lua_compare(L,(idx1),(idx2),LUA_OPEQ)
+#define lua_lessthan(L,idx1,idx2)	lua_compare(L,(idx1),(idx2),LUA_OPLT)
+
+/*
+@@ LUA_COMPAT_MODULE controls compatibility with previous
+** module functions 'module' (Lua) and 'luaL_register' (C).
+*/
+#define LUA_COMPAT_MODULE
+
+#endif				/* } */
+
+/* }================================================================== */
+
+
+
+/*
+@@ LUAI_BITSINT defines the number of bits in an int.
+** CHANGE here if Lua cannot automatically detect the number of bits of
+** your machine. Probably you do not need to change this.
+*/
+/* avoid overflows in comparison */
+#if INT_MAX-20 < 32760		/* { */
+#define LUAI_BITSINT	16
+#elif INT_MAX > 2147483640L	/* }{ */
+/* int has at least 32 bits */
+#define LUAI_BITSINT	32
+#else				/* }{ */
+#error "you must define LUAI_BITSINT with number of bits in an integer"
+#endif				/* } */
+
+
+/*
+@@ LUA_INT32 is a signed integer with exactly 32 bits.
+@@ LUAI_UMEM is an unsigned integer big enough to count the total
+@* memory used by Lua.
+@@ LUAI_MEM is a signed integer big enough to count the total memory
+@* used by Lua.
+** CHANGE here if for some weird reason the default definitions are not
+** good enough for your machine. Probably you do not need to change
+** this.
+*/
+#if LUAI_BITSINT >= 32		/* { */
+#define LUA_INT32	int
+#define LUAI_UMEM	size_t
+#define LUAI_MEM	ptrdiff_t
+#else				/* }{ */
+/* 16-bit ints */
+#define LUA_INT32	long
+#define LUAI_UMEM	unsigned long
+#define LUAI_MEM	long
+#endif				/* } */
+
+
+/*
+@@ LUAI_MAXSTACK limits the size of the Lua stack.
+** CHANGE it if you need a different limit. This limit is arbitrary;
+** its only purpose is to stop Lua from consuming unlimited stack
+** space (and to reserve some numbers for pseudo-indices).
+*/
+#if LUAI_BITSINT >= 32
+#define LUAI_MAXSTACK		1000000
+#else
+#define LUAI_MAXSTACK		15000
+#endif
+
+/* reserve some space for error handling */
+#define LUAI_FIRSTPSEUDOIDX	(-LUAI_MAXSTACK - 1000)
+
+
+
+
+/*
+@@ LUAL_BUFFERSIZE is the buffer size used by the lauxlib buffer system.
+** CHANGE it if it uses too much C-stack space.
+*/
+#define LUAL_BUFFERSIZE		BUFSIZ
+
+
+
+
+/*
+** {==================================================================
+@@ LUA_NUMBER is the type of numbers in Lua.
+** CHANGE the following definitions only if you want to build Lua
+** with a number type different from double. You may also need to
+** change lua_number2int & lua_number2integer.
+** ===================================================================
+*/
+
+#define LUA_NUMBER_DOUBLE
+#define LUA_NUMBER	double
+
+/*
+@@ LUAI_UACNUMBER is the result of a 'usual argument conversion'
+@* over a number.
+*/
+#define LUAI_UACNUMBER	double
+
+
+/*
+@@ LUA_NUMBER_SCAN is the format for reading numbers.
+@@ LUA_NUMBER_FMT is the format for writing numbers.
+@@ lua_number2str converts a number to a string.
+@@ LUAI_MAXNUMBER2STR is maximum size of previous conversion.
+*/
+#define LUA_NUMBER_SCAN		"%lf"
+#define LUA_NUMBER_FMT		"%.14g"
+#define lua_number2str(s,n)	sprintf((s), LUA_NUMBER_FMT, (n))
+#define LUAI_MAXNUMBER2STR	32 /* 16 digits, sign, point, and \0 */
+
+
+/*
+@@ l_mathop allows the addition of an 'l' or 'f' to all math operations
+*/
+#define l_mathop(x)		(x)
+
+
+/*
+@@ lua_str2number converts a decimal numeric string to a number.
+@@ lua_strx2number converts a hexadecimal numeric string to a number.
+** In C99, 'strtod' does both conversions. C89, however, has no function
+** to convert floating hexadecimal strings to numbers. For these
+** systems, you can leave 'lua_strx2number' undefined and Lua will
+** provide its own implementation.
+*/
+#define lua_str2number(s,p)	strtod((s), (p))
+
+#if defined(LUA_USE_STRTODHEX)
+#define lua_strx2number(s,p)	strtod((s), (p))
+#endif
+
+
+/*
+@@ The luai_num* macros define the primitive operations over numbers.
+*/
+
+/* the following operations need the math library */
+#if defined(lobject_c) || defined(lvm_c)
+#include <math.h>
+#define luai_nummod(L,a,b)	((a) - l_mathop(floor)((a)/(b))*(b))
+#define luai_numpow(L,a,b)	(l_mathop(pow)(a,b))
+#endif
+
+/* these are quite standard operations */
+#if defined(LUA_CORE)
+#define luai_numadd(L,a,b)	((a)+(b))
+#define luai_numsub(L,a,b)	((a)-(b))
+#define luai_nummul(L,a,b)	((a)*(b))
+#define luai_numdiv(L,a,b)	((a)/(b))
+#define luai_numunm(L,a)	(-(a))
+#define luai_numeq(a,b)		((a)==(b))
+#define luai_numlt(L,a,b)	((a)<(b))
+#define luai_numle(L,a,b)	((a)<=(b))
+#define luai_numisnan(L,a)	(!luai_numeq((a), (a)))
+#endif
+
+
+
+/*
+@@ LUA_INTEGER is the integral type used by lua_pushinteger/lua_tointeger.
+** CHANGE that if ptrdiff_t is not adequate on your machine. (On most
+** machines, ptrdiff_t gives a good choice between int or long.)
+*/
+#define LUA_INTEGER	ptrdiff_t
+
+/*
+@@ LUA_UNSIGNED is the integral type used by lua_pushunsigned/lua_tounsigned.
+** It must have at least 32 bits.
+*/
+#define LUA_UNSIGNED	unsigned LUA_INT32
+
+
+
+/*
+** Some tricks with doubles
+*/
+
+#if defined(LUA_NUMBER_DOUBLE) && !defined(LUA_ANSI)	/* { */
+/*
+** The next definitions activate some tricks to speed up the
+** conversion from doubles to integer types, mainly to LUA_UNSIGNED.
+**
+@@ LUA_MSASMTRICK uses Microsoft assembler to avoid clashes with a
+** DirectX idiosyncrasy.
+**
+@@ LUA_IEEE754TRICK uses a trick that should work on any machine
+** using IEEE754 with a 32-bit integer type.
+**
+@@ LUA_IEEELL extends the trick to LUA_INTEGER; should only be
+** defined when LUA_INTEGER is a 32-bit integer.
+**
+@@ LUA_IEEEENDIAN is the endianness of doubles in your machine
+** (0 for little endian, 1 for big endian); if not defined, Lua will
+** check it dynamically for LUA_IEEE754TRICK (but not for LUA_NANTRICK).
+**
+@@ LUA_NANTRICK controls the use of a trick to pack all types into
+** a single double value, using NaN values to represent non-number
+** values. The trick only works on 32-bit machines (ints and pointers
+** are 32-bit values) with numbers represented as IEEE 754-2008 doubles
+** with conventional endianness (12345678 or 87654321), in CPUs that do
+** not produce signaling NaN values (all NaNs are quiet).
+*/
+
+/* Microsoft compiler on a Pentium (32 bit) ? */
+#if defined(LUA_WIN) && defined(_MSC_VER) && defined(_M_IX86)	/* { */
+
+#define LUA_MSASMTRICK
+#define LUA_IEEEENDIAN		0
+#define LUA_NANTRICK
+
+
+/* pentium 32 bits? */
+#elif defined(__i386__) || defined(__i386) || defined(__X86__) /* }{ */
+
+#define LUA_IEEE754TRICK
+#define LUA_IEEELL
+#define LUA_IEEEENDIAN		0
+#define LUA_NANTRICK
+
+/* pentium 64 bits? */
+#elif defined(__x86_64)						/* }{ */
+
+#define LUA_IEEE754TRICK
+#define LUA_IEEEENDIAN		0
+
+#elif defined(__POWERPC__) || defined(__ppc__)			/* }{ */
+
+#define LUA_IEEE754TRICK
+#define LUA_IEEEENDIAN		1
+
+#else								/* }{ */
+
+/* assume IEEE754 and a 32-bit integer type */
+#define LUA_IEEE754TRICK
+
+#endif								/* } */
+
+#endif							/* } */
+
+/* }================================================================== */
+
+
+
+
+/* =================================================================== */
+
+/*
+** Local configuration. You can use this space to add your redefinitions
+** without modifying the main part of the file.
+*/
+
+
+
+#endif
+
diff --git a/ext/lua/includes/lualib.h b/ext/lua/includes/lualib.h
new file mode 100644
index 0000000..9fd126b
--- /dev/null
+++ b/ext/lua/includes/lualib.h
@@ -0,0 +1,55 @@
+/*
+** $Id: lualib.h,v 1.43 2011/12/08 12:11:37 roberto Exp $
+** Lua standard libraries
+** See Copyright Notice in lua.h
+*/
+
+
+#ifndef lualib_h
+#define lualib_h
+
+#include "lua.h"
+
+
+
+LUAMOD_API int (luaopen_base) (lua_State *L);
+
+#define LUA_COLIBNAME	"coroutine"
+LUAMOD_API int (luaopen_coroutine) (lua_State *L);
+
+#define LUA_TABLIBNAME	"table"
+LUAMOD_API int (luaopen_table) (lua_State *L);
+
+#define LUA_IOLIBNAME	"io"
+LUAMOD_API int (luaopen_io) (lua_State *L);
+
+#define LUA_OSLIBNAME	"os"
+LUAMOD_API int (luaopen_os) (lua_State *L);
+
+#define LUA_STRLIBNAME	"string"
+LUAMOD_API int (luaopen_string) (lua_State *L);
+
+#define LUA_BITLIBNAME	"bit32"
+LUAMOD_API int (luaopen_bit32) (lua_State *L);
+
+#define LUA_MATHLIBNAME	"math"
+LUAMOD_API int (luaopen_math) (lua_State *L);
+
+#define LUA_DBLIBNAME	"debug"
+LUAMOD_API int (luaopen_debug) (lua_State *L);
+
+#define LUA_LOADLIBNAME	"package"
+LUAMOD_API int (luaopen_package) (lua_State *L);
+
+
+/* open all previous libraries */
+LUALIB_API void (luaL_openlibs) (lua_State *L);
+
+
+
+#if !defined(lua_assert)
+#define lua_assert(x)	((void)0)
+#endif
+
+
+#endif
diff --git a/ext/lua/includes/lundump.h b/ext/lua/includes/lundump.h
new file mode 100644
index 0000000..2b8acce
--- /dev/null
+++ b/ext/lua/includes/lundump.h
@@ -0,0 +1,28 @@
+/*
+** $Id: lundump.h,v 1.39 2012/05/08 13:53:33 roberto Exp $
+** load precompiled Lua chunks
+** See Copyright Notice in lua.h
+*/
+
+#ifndef lundump_h
+#define lundump_h
+
+#include "lobject.h"
+#include "lzio.h"
+
+/* load one chunk; from lundump.c */
+LUAI_FUNC Closure* luaU_undump (lua_State* L, ZIO* Z, Mbuffer* buff, const char* name);
+
+/* make header; from lundump.c */
+LUAI_FUNC void luaU_header (lu_byte* h);
+
+/* dump one chunk; from ldump.c */
+LUAI_FUNC int luaU_dump (lua_State* L, const Proto* f, lua_Writer w, void* data, int strip);
+
+/* data to catch conversion errors */
+#define LUAC_TAIL		"\x19\x93\r\n\x1a\n"
+
+/* size in bytes of header of binary files */
+#define LUAC_HEADERSIZE		(sizeof(LUA_SIGNATURE)-sizeof(char)+2+6+sizeof(LUAC_TAIL)-sizeof(char))
+
+#endif
diff --git a/ext/lua/includes/lvm.h b/ext/lua/includes/lvm.h
new file mode 100644
index 0000000..07e25f9
--- /dev/null
+++ b/ext/lua/includes/lvm.h
@@ -0,0 +1,44 @@
+/*
+** $Id: lvm.h,v 2.18 2013/01/08 14:06:55 roberto Exp $
+** Lua virtual machine
+** See Copyright Notice in lua.h
+*/
+
+#ifndef lvm_h
+#define lvm_h
+
+
+#include "ldo.h"
+#include "lobject.h"
+#include "ltm.h"
+
+
+#define tostring(L,o) (ttisstring(o) || (luaV_tostring(L, o)))
+
+#define tonumber(o,n)	(ttisnumber(o) || (((o) = luaV_tonumber(o,n)) != NULL))
+
+#define equalobj(L,o1,o2)  (ttisequal(o1, o2) && luaV_equalobj_(L, o1, o2))
+
+#define luaV_rawequalobj(o1,o2)		equalobj(NULL,o1,o2)
+
+
+/* not to be called directly */
+LUAI_FUNC int luaV_equalobj_ (lua_State *L, const TValue *t1, const TValue *t2);
+
+
+LUAI_FUNC int luaV_lessthan (lua_State *L, const TValue *l, const TValue *r);
+LUAI_FUNC int luaV_lessequal (lua_State *L, const TValue *l, const TValue *r);
+LUAI_FUNC const TValue *luaV_tonumber (const TValue *obj, TValue *n);
+LUAI_FUNC int luaV_tostring (lua_State *L, StkId obj);
+LUAI_FUNC void luaV_gettable (lua_State *L, const TValue *t, TValue *key,
+                                            StkId val);
+LUAI_FUNC void luaV_settable (lua_State *L, const TValue *t, TValue *key,
+                                            StkId val);
+LUAI_FUNC void luaV_finishOp (lua_State *L);
+LUAI_FUNC void luaV_execute (lua_State *L);
+LUAI_FUNC void luaV_concat (lua_State *L, int total);
+LUAI_FUNC void luaV_arith (lua_State *L, StkId ra, const TValue *rb,
+                           const TValue *rc, TMS op);
+LUAI_FUNC void luaV_objlen (lua_State *L, StkId ra, const TValue *rb);
+
+#endif
diff --git a/ext/lua/includes/lzio.h b/ext/lua/includes/lzio.h
new file mode 100644
index 0000000..0868230
--- /dev/null
+++ b/ext/lua/includes/lzio.h
@@ -0,0 +1,65 @@
+/*
+** $Id: lzio.h,v 1.26 2011/07/15 12:48:03 roberto Exp $
+** Buffered streams
+** See Copyright Notice in lua.h
+*/
+
+
+#ifndef lzio_h
+#define lzio_h
+
+#include "lua.h"
+
+#include "lmem.h"
+
+
+#define EOZ	(-1)			/* end of stream */
+
+typedef struct Zio ZIO;
+
+#define zgetc(z)  (((z)->n--)>0 ?  cast_uchar(*(z)->p++) : luaZ_fill(z))
+
+
+typedef struct Mbuffer {
+  char *buffer;
+  size_t n;
+  size_t buffsize;
+} Mbuffer;
+
+#define luaZ_initbuffer(L, buff) ((buff)->buffer = NULL, (buff)->buffsize = 0)
+
+#define luaZ_buffer(buff)	((buff)->buffer)
+#define luaZ_sizebuffer(buff)	((buff)->buffsize)
+#define luaZ_bufflen(buff)	((buff)->n)
+
+#define luaZ_resetbuffer(buff) ((buff)->n = 0)
+
+
+#define luaZ_resizebuffer(L, buff, size) \
+	(luaM_reallocvector(L, (buff)->buffer, (buff)->buffsize, size, char), \
+	(buff)->buffsize = size)
+
+#define luaZ_freebuffer(L, buff)	luaZ_resizebuffer(L, buff, 0)
+
+
+LUAI_FUNC char *luaZ_openspace (lua_State *L, Mbuffer *buff, size_t n);
+LUAI_FUNC void luaZ_init (lua_State *L, ZIO *z, lua_Reader reader,
+                                        void *data);
+LUAI_FUNC size_t luaZ_read (ZIO* z, void* b, size_t n);	/* read next n bytes */
+
+
+
+/* --------- Private Part ------------------ */
+
+struct Zio {
+  size_t n;			/* bytes still unread */
+  const char *p;		/* current position in buffer */
+  lua_Reader reader;		/* reader function */
+  void* data;			/* additional data */
+  lua_State *L;			/* Lua state (for reader) */
+};
+
+
+LUAI_FUNC int luaZ_fill (ZIO *z);
+
+#endif
diff --git a/ext/lua/includes/readline/chardefs.h b/ext/lua/includes/readline/chardefs.h
new file mode 100644
index 0000000..2a9403a
--- /dev/null
+++ b/ext/lua/includes/readline/chardefs.h
@@ -0,0 +1,152 @@
+/* chardefs.h -- Character definitions for readline. */
+
+/* Copyright (C) 1994-2009 Free Software Foundation, Inc.
+
+   This file is part of the GNU Readline Library (Readline), a library
+   for reading lines of text with interactive input and history editing.
+
+   Readline is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   Readline is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with Readline.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _CHARDEFS_H_
+#define _CHARDEFS_H_
+
+#include <ctype.h>
+
+#include <string.h>
+
+#ifndef whitespace
+#define whitespace(c) (((c) == ' ') || ((c) == '\t'))
+#endif
+
+#ifdef CTRL
+#  undef CTRL
+#endif
+#ifdef UNCTRL
+#  undef UNCTRL
+#endif
+
+/* Some character stuff. */
+#define control_character_threshold 0x020   /* Smaller than this is control. */
+#define control_character_mask 0x1f	    /* 0x20 - 1 */
+#define meta_character_threshold 0x07f	    /* Larger than this is Meta. */
+#define control_character_bit 0x40	    /* 0x000000, must be off. */
+#define meta_character_bit 0x080	    /* x0000000, must be on. */
+#define largest_char 255		    /* Largest character value. */
+
+#define CTRL_CHAR(c) ((c) < control_character_threshold && (((c) & 0x80) == 0))
+#define META_CHAR(c) ((c) > meta_character_threshold && (c) <= largest_char)
+
+#define CTRL(c) ((c) & control_character_mask)
+#define META(c) ((c) | meta_character_bit)
+
+#define UNMETA(c) ((c) & (~meta_character_bit))
+#define UNCTRL(c) _rl_to_upper(((c)|control_character_bit))
+
+#if defined STDC_HEADERS || (!defined (isascii) && !defined (HAVE_ISASCII))
+#  define IN_CTYPE_DOMAIN(c) 1
+#else
+#  define IN_CTYPE_DOMAIN(c) isascii(c)
+#endif
+
+#if !defined (isxdigit) && !defined (HAVE_ISXDIGIT)
+#  define isxdigit(c)   (isdigit((c)) || ((c) >= 'a' && (c) <= 'f') || ((c) >= 'A' && (c) <= 'F'))
+#endif
+
+#if defined (CTYPE_NON_ASCII)
+#  define NON_NEGATIVE(c) 1
+#else
+#  define NON_NEGATIVE(c) ((unsigned char)(c) == (c))
+#endif
+
+/* Some systems define these; we want our definitions. */
+#undef ISPRINT
+
+/* Beware:  these only work with single-byte ASCII characters. */
+
+#define ISALNUM(c)	(IN_CTYPE_DOMAIN (c) && isalnum (c))
+#define ISALPHA(c)	(IN_CTYPE_DOMAIN (c) && isalpha (c))
+#define ISDIGIT(c)	(IN_CTYPE_DOMAIN (c) && isdigit (c))
+#define ISLOWER(c)	(IN_CTYPE_DOMAIN (c) && islower (c))
+#define ISPRINT(c)	(IN_CTYPE_DOMAIN (c) && isprint (c))
+#define ISUPPER(c)	(IN_CTYPE_DOMAIN (c) && isupper (c))
+#define ISXDIGIT(c)	(IN_CTYPE_DOMAIN (c) && isxdigit (c))
+
+#define _rl_lowercase_p(c)	(NON_NEGATIVE(c) && ISLOWER(c))
+#define _rl_uppercase_p(c)	(NON_NEGATIVE(c) && ISUPPER(c))
+#define _rl_digit_p(c)		((c) >= '0' && (c) <= '9')
+
+#define _rl_pure_alphabetic(c)	(NON_NEGATIVE(c) && ISALPHA(c))
+#define ALPHABETIC(c)		(NON_NEGATIVE(c) && ISALNUM(c))
+
+#ifndef _rl_to_upper
+#  define _rl_to_upper(c) (_rl_lowercase_p(c) ? toupper((unsigned char)(c)) : (c))
+#  define _rl_to_lower(c) (_rl_uppercase_p(c) ? tolower((unsigned char)(c)) : (c))
+#endif
+
+#ifndef _rl_digit_value
+#  define _rl_digit_value(x) ((x) - '0')
+#endif
+
+#ifndef _rl_isident
+#  define _rl_isident(c) (ISALNUM(c) || (c) == '_')
+#endif
+
+#ifndef ISOCTAL
+#  define ISOCTAL(c)	((c) >= '0' && (c) <= '7')
+#endif
+#define OCTVALUE(c)	((c) - '0')
+
+#define HEXVALUE(c) \
+  (((c) >= 'a' && (c) <= 'f') \
+  	? (c)-'a'+10 \
+  	: (c) >= 'A' && (c) <= 'F' ? (c)-'A'+10 : (c)-'0')
+
+#ifndef NEWLINE
+#define NEWLINE '\n'
+#endif
+
+#ifndef RETURN
+#define RETURN CTRL('M')
+#endif
+
+#ifndef RUBOUT
+#define RUBOUT 0x7f
+#endif
+
+#ifndef TAB
+#define TAB '\t'
+#endif
+
+#ifdef ABORT_CHAR
+#undef ABORT_CHAR
+#endif
+#define ABORT_CHAR CTRL('G')
+
+#ifdef PAGE
+#undef PAGE
+#endif
+#define PAGE CTRL('L')
+
+#ifdef SPACE
+#undef SPACE
+#endif
+#define SPACE ' '	/* XXX - was 0x20 */
+
+#ifdef ESC
+#undef ESC
+#endif
+#define ESC CTRL('[')
+
+#endif  /* _CHARDEFS_H_ */
diff --git a/ext/lua/includes/readline/history.h b/ext/lua/includes/readline/history.h
new file mode 100644
index 0000000..a9e907a
--- /dev/null
+++ b/ext/lua/includes/readline/history.h
@@ -0,0 +1,267 @@
+/* history.h -- the names of functions that you can call in history. */
+
+/* Copyright (C) 1989-2009 Free Software Foundation, Inc.
+
+   This file contains the GNU History Library (History), a set of
+   routines for managing the text of previously typed lines.
+
+   History is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   History is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with History.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _HISTORY_H_
+#define _HISTORY_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <time.h>		/* XXX - for history timestamp code */
+
+#if defined READLINE_LIBRARY
+#  include "rlstdc.h"
+#  include "rltypedefs.h"
+#else
+#  include <stdio.h>
+#  include <readline/rlstdc.h>
+#  include <readline/rltypedefs.h>
+#endif
+
+#ifdef __STDC__
+typedef void *histdata_t;
+#else
+typedef char *histdata_t;
+#endif
+
+/* The structure used to store a history entry. */
+typedef struct _hist_entry {
+  char *line;
+  char *timestamp;		/* char * rather than time_t for read/write */
+  histdata_t data;
+} HIST_ENTRY;
+
+/* Size of the history-library-managed space in history entry HS. */
+#define HISTENT_BYTES(hs)	(strlen ((hs)->line) + strlen ((hs)->timestamp))
+
+/* A structure used to pass the current state of the history stuff around. */
+typedef struct _hist_state {
+  HIST_ENTRY **entries;		/* Pointer to the entries themselves. */
+  int offset;			/* The location pointer within this array. */
+  int length;			/* Number of elements within this array. */
+  int size;			/* Number of slots allocated to this array. */
+  int flags;
+} HISTORY_STATE;
+
+/* Flag values for the `flags' member of HISTORY_STATE. */
+#define HS_STIFLED	0x01
+
+/* Initialization and state management. */
+
+/* Begin a session in which the history functions might be used.  This
+   just initializes the interactive variables. */
+extern void using_history PARAMS((void));
+
+/* Return the current HISTORY_STATE of the history. */
+extern HISTORY_STATE *history_get_history_state PARAMS((void));
+
+/* Set the state of the current history array to STATE. */
+extern void history_set_history_state PARAMS((HISTORY_STATE *));
+
+/* Manage the history list. */
+
+/* Place STRING at the end of the history list.
+   The associated data field (if any) is set to NULL. */
+extern void add_history PARAMS((const char *));
+
+/* Change the timestamp associated with the most recent history entry to
+   STRING. */
+extern void add_history_time PARAMS((const char *));
+
+/* A reasonably useless function, only here for completeness.  WHICH
+   is the magic number that tells us which element to delete.  The
+   elements are numbered from 0. */
+extern HIST_ENTRY *remove_history PARAMS((int));
+
+/* Free the history entry H and return any application-specific data
+   associated with it. */
+extern histdata_t free_history_entry PARAMS((HIST_ENTRY *));
+
+/* Make the history entry at WHICH have LINE and DATA.  This returns
+   the old entry so you can dispose of the data.  In the case of an
+   invalid WHICH, a NULL pointer is returned. */
+extern HIST_ENTRY *replace_history_entry PARAMS((int, const char *, histdata_t));
+
+/* Clear the history list and start over. */
+extern void clear_history PARAMS((void));
+
+/* Stifle the history list, remembering only MAX number of entries. */
+extern void stifle_history PARAMS((int));
+
+/* Stop stifling the history.  This returns the previous amount the
+   history was stifled by.  The value is positive if the history was
+   stifled, negative if it wasn't. */
+extern int unstifle_history PARAMS((void));
+
+/* Return 1 if the history is stifled, 0 if it is not. */
+extern int history_is_stifled PARAMS((void));
+
+/* Information about the history list. */
+
+/* Return a NULL terminated array of HIST_ENTRY which is the current input
+   history.  Element 0 of this list is the beginning of time.  If there
+   is no history, return NULL. */
+extern HIST_ENTRY **history_list PARAMS((void));
+
+/* Returns the number which says what history element we are now
+   looking at.  */
+extern int where_history PARAMS((void));
+  
+/* Return the history entry at the current position, as determined by
+   history_offset.  If there is no entry there, return a NULL pointer. */
+extern HIST_ENTRY *current_history PARAMS((void));
+
+/* Return the history entry which is logically at OFFSET in the history
+   array.  OFFSET is relative to history_base. */
+extern HIST_ENTRY *history_get PARAMS((int));
+
+/* Return the timestamp associated with the HIST_ENTRY * passed as an
+   argument */
+extern time_t history_get_time PARAMS((HIST_ENTRY *));
+
+/* Return the number of bytes that the primary history entries are using.
+   This just adds up the lengths of the_history->lines. */
+extern int history_total_bytes PARAMS((void));
+
+/* Moving around the history list. */
+
+/* Set the position in the history list to POS. */
+extern int history_set_pos PARAMS((int));
+
+/* Back up history_offset to the previous history entry, and return
+   a pointer to that entry.  If there is no previous entry, return
+   a NULL pointer. */
+extern HIST_ENTRY *previous_history PARAMS((void));
+
+/* Move history_offset forward to the next item in the input_history,
+   and return a pointer to that entry.  If there is no next entry,
+   return a NULL pointer. */
+extern HIST_ENTRY *next_history PARAMS((void));
+
+/* Searching the history list. */
+
+/* Search the history for STRING, starting at history_offset.
+   If DIRECTION < 0, then the search is through previous entries,
+   else through subsequent.  If the string is found, then
+   current_history () is the history entry, and the value of this function
+   is the offset in the line of that history entry that the string was
+   found in.  Otherwise, nothing is changed, and a -1 is returned. */
+extern int history_search PARAMS((const char *, int));
+
+/* Search the history for STRING, starting at history_offset.
+   The search is anchored: matching lines must begin with string.
+   DIRECTION is as in history_search(). */
+extern int history_search_prefix PARAMS((const char *, int));
+
+/* Search for STRING in the history list, starting at POS, an
+   absolute index into the list.  DIR, if negative, says to search
+   backwards from POS, else forwards.
+   Returns the absolute index of the history element where STRING
+   was found, or -1 otherwise. */
+extern int history_search_pos PARAMS((const char *, int, int));
+
+/* Managing the history file. */
+
+/* Add the contents of FILENAME to the history list, a line at a time.
+   If FILENAME is NULL, then read from ~/.history.  Returns 0 if
+   successful, or errno if not. */
+extern int read_history PARAMS((const char *));
+
+/* Read a range of lines from FILENAME, adding them to the history list.
+   Start reading at the FROM'th line and end at the TO'th.  If FROM
+   is zero, start at the beginning.  If TO is less than FROM, read
+   until the end of the file.  If FILENAME is NULL, then read from
+   ~/.history.  Returns 0 if successful, or errno if not. */
+extern int read_history_range PARAMS((const char *, int, int));
+
+/* Write the current history to FILENAME.  If FILENAME is NULL,
+   then write the history list to ~/.history.  Values returned
+   are as in read_history ().  */
+extern int write_history PARAMS((const char *));
+
+/* Append NELEMENT entries to FILENAME.  The entries appended are from
+   the end of the list minus NELEMENTs up to the end of the list. */
+extern int append_history PARAMS((int, const char *));
+
+/* Truncate the history file, leaving only the last NLINES lines. */
+extern int history_truncate_file PARAMS((const char *, int));
+
+/* History expansion. */
+
+/* Expand the string STRING, placing the result into OUTPUT, a pointer
+   to a string.  Returns:
+
+   0) If no expansions took place (or, if the only change in
+      the text was the de-slashifying of the history expansion
+      character)
+   1) If expansions did take place
+  -1) If there was an error in expansion.
+   2) If the returned line should just be printed.
+
+  If an error occurred in expansion, then OUTPUT contains a descriptive
+  error message. */
+extern int history_expand PARAMS((char *, char **));
+
+/* Extract a string segment consisting of the FIRST through LAST
+   arguments present in STRING.  Arguments are broken up as in
+   the shell. */
+extern char *history_arg_extract PARAMS((int, int, const char *));
+
+/* Return the text of the history event beginning at the current
+   offset into STRING.  Pass STRING with *INDEX equal to the
+   history_expansion_char that begins this specification.
+   DELIMITING_QUOTE is a character that is allowed to end the string
+   specification for what to search for in addition to the normal
+   characters `:', ` ', `\t', `\n', and sometimes `?'. */
+extern char *get_history_event PARAMS((const char *, int *, int));
+
+/* Return an array of tokens, much as the shell might.  The tokens are
+   parsed out of STRING. */
+extern char **history_tokenize PARAMS((const char *));
+
+/* Exported history variables. */
+extern int history_base;
+extern int history_length;
+extern int history_max_entries;
+extern char history_expansion_char;
+extern char history_subst_char;
+extern char *history_word_delimiters;
+extern char history_comment_char;
+extern char *history_no_expand_chars;
+extern char *history_search_delimiter_chars;
+extern int history_quotes_inhibit_expansion;
+
+extern int history_write_timestamps;
+
+/* Backwards compatibility */
+extern int max_input_history;
+
+/* If set, this function is called to decide whether or not a particular
+   history expansion should be treated as a special case for the calling
+   application and not expanded. */
+extern rl_linebuf_func_t *history_inhibit_expansion_function;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* !_HISTORY_H_ */
diff --git a/ext/lua/includes/readline/keymaps.h b/ext/lua/includes/readline/keymaps.h
new file mode 100644
index 0000000..af8d5d9
--- /dev/null
+++ b/ext/lua/includes/readline/keymaps.h
@@ -0,0 +1,97 @@
+/* keymaps.h -- Manipulation of readline keymaps. */
+
+/* Copyright (C) 1987, 1989, 1992 Free Software Foundation, Inc.
+
+   This file is part of the GNU Readline Library (Readline), a library
+   for reading lines of text with interactive input and history editing.      
+
+   Readline is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   Readline is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with Readline.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _KEYMAPS_H_
+#define _KEYMAPS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined (READLINE_LIBRARY)
+#  include "rlstdc.h"
+#  include "chardefs.h"
+#  include "rltypedefs.h"
+#else
+#  include <readline/rlstdc.h>
+#  include <readline/chardefs.h>
+#  include <readline/rltypedefs.h>
+#endif
+
+/* A keymap contains one entry for each key in the ASCII set.
+   Each entry consists of a type and a pointer.
+   FUNCTION is the address of a function to run, or the
+   address of a keymap to indirect through.
+   TYPE says which kind of thing FUNCTION is. */
+typedef struct _keymap_entry {
+  char type;			/* What FUNCTION is: ISFUNC, ISKMAP or ISMACR. */
+  rl_command_func_t *function;	/* Function to run, or keymap to indirect through. */
+} KEYMAP_ENTRY;
+
+/* This must be large enough to hold bindings for all of the characters
+   in a desired character set (e.g., 128 for ASCII, 256 for ISO Latin-x,
+   and so on) plus one for subsequence matching. */
+#define KEYMAP_SIZE 257
+#define ANYOTHERKEY (KEYMAP_SIZE-1)	/* Index of the extra, final entry. */
+
+typedef KEYMAP_ENTRY KEYMAP_ENTRY_ARRAY[KEYMAP_SIZE];
+typedef KEYMAP_ENTRY *Keymap;
+
+/* The values that TYPE can have in a keymap entry. */
+#define ISFUNC 0
+#define ISKMAP 1
+#define ISMACR 2
+
+extern KEYMAP_ENTRY_ARRAY emacs_standard_keymap, emacs_meta_keymap, emacs_ctlx_keymap;
+extern KEYMAP_ENTRY_ARRAY vi_insertion_keymap, vi_movement_keymap;
+
+/* Return a new, empty keymap.
+   Free it with free() when you are done. */
+extern Keymap rl_make_bare_keymap PARAMS((void));
+
+/* Return a new keymap which is a copy of MAP. */
+extern Keymap rl_copy_keymap PARAMS((Keymap));
+
+/* Return a new keymap with the printing characters bound to rl_insert,
+   the lowercase Meta characters bound to run their equivalents, and
+   the Meta digits bound to produce numeric arguments. */
+extern Keymap rl_make_keymap PARAMS((void));
+
+/* Free the storage associated with a keymap. */
+extern void rl_discard_keymap PARAMS((Keymap));
+
+/* These functions actually appear in bind.c */
+
+/* Return the keymap corresponding to a given name.  Names look like
+   `emacs' or `emacs-meta' or `vi-insert'.  */
+extern Keymap rl_get_keymap_by_name PARAMS((const char *));
+
+/* Return the current keymap. */
+extern Keymap rl_get_keymap PARAMS((void));
+
+/* Set the current keymap to MAP. */
+extern void rl_set_keymap PARAMS((Keymap));
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _KEYMAPS_H_ */
diff --git a/ext/lua/includes/readline/readline.h b/ext/lua/includes/readline/readline.h
new file mode 100644
index 0000000..ff6d3cc
--- /dev/null
+++ b/ext/lua/includes/readline/readline.h
@@ -0,0 +1,894 @@
+/* Readline.h -- the names of functions callable from within readline. */
+
+/* Copyright (C) 1987-2011 Free Software Foundation, Inc.
+
+   This file is part of the GNU Readline Library (Readline), a library
+   for reading lines of text with interactive input and history editing.      
+
+   Readline is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   Readline is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with Readline.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#if !defined (_READLINE_H_)
+#define _READLINE_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined (READLINE_LIBRARY)
+#  include "rlstdc.h"
+#  include "rltypedefs.h"
+#  include "keymaps.h"
+#  include "tilde.h"
+#else
+#  include <stdio.h>
+#  include <readline/rlstdc.h>
+#  include <readline/rltypedefs.h>
+#  include <readline/keymaps.h>
+#  include <readline/tilde.h>
+#endif
+
+/* Hex-encoded Readline version number. */
+#define RL_READLINE_VERSION	0x0602		/* Readline 6.2 */
+#define RL_VERSION_MAJOR	6
+#define RL_VERSION_MINOR	2
+
+/* Readline data structures. */
+
+/* Maintaining the state of undo.  We remember individual deletes and inserts
+   on a chain of things to do. */
+
+/* The actions that undo knows how to undo.  Notice that UNDO_DELETE means
+   to insert some text, and UNDO_INSERT means to delete some text.   I.e.,
+   the code tells undo what to undo, not how to undo it. */
+enum undo_code { UNDO_DELETE, UNDO_INSERT, UNDO_BEGIN, UNDO_END };
+
+/* What an element of THE_UNDO_LIST looks like. */
+typedef struct undo_list {
+  struct undo_list *next;	/* Link to the next element of the undo chain. */
+  int start, end;		/* Where the change took place. */
+  char *text;			/* The text to insert, if undoing a delete. */
+  enum undo_code what;		/* Delete, Insert, Begin, End. */
+} UNDO_LIST;
+
+/* The current undo list for RL_LINE_BUFFER. */
+extern UNDO_LIST *rl_undo_list;
+
+/* The data structure for mapping textual names to code addresses. */
+typedef struct _funmap {
+  const char *name;		/* Textual name of the command. */
+  rl_command_func_t *function;	/* Code address that NAME maps to. */
+} FUNMAP;
+
+extern FUNMAP **funmap;
+
+/* **************************************************************** */
+/*								    */
+/*	     Functions available to bind to key sequences	    */
+/*								    */
+/* **************************************************************** */
+
+/* Bindable commands for numeric arguments. */
+extern int rl_digit_argument PARAMS((int, int));
+extern int rl_universal_argument PARAMS((int, int));
+
+/* Bindable commands for moving the cursor. */
+extern int rl_forward_byte PARAMS((int, int));
+extern int rl_forward_char PARAMS((int, int));
+extern int rl_forward PARAMS((int, int));
+extern int rl_backward_byte PARAMS((int, int));
+extern int rl_backward_char PARAMS((int, int));
+extern int rl_backward PARAMS((int, int));
+extern int rl_beg_of_line PARAMS((int, int));
+extern int rl_end_of_line PARAMS((int, int));
+extern int rl_forward_word PARAMS((int, int));
+extern int rl_backward_word PARAMS((int, int));
+extern int rl_refresh_line PARAMS((int, int));
+extern int rl_clear_screen PARAMS((int, int));
+extern int rl_skip_csi_sequence PARAMS((int, int));
+extern int rl_arrow_keys PARAMS((int, int));
+
+/* Bindable commands for inserting and deleting text. */
+extern int rl_insert PARAMS((int, int));
+extern int rl_quoted_insert PARAMS((int, int));
+extern int rl_tab_insert PARAMS((int, int));
+extern int rl_newline PARAMS((int, int));
+extern int rl_do_lowercase_version PARAMS((int, int));
+extern int rl_rubout PARAMS((int, int));
+extern int rl_delete PARAMS((int, int));
+extern int rl_rubout_or_delete PARAMS((int, int));
+extern int rl_delete_horizontal_space PARAMS((int, int));
+extern int rl_delete_or_show_completions PARAMS((int, int));
+extern int rl_insert_comment PARAMS((int, int));
+
+/* Bindable commands for changing case. */
+extern int rl_upcase_word PARAMS((int, int));
+extern int rl_downcase_word PARAMS((int, int));
+extern int rl_capitalize_word PARAMS((int, int));
+
+/* Bindable commands for transposing characters and words. */
+extern int rl_transpose_words PARAMS((int, int));
+extern int rl_transpose_chars PARAMS((int, int));
+
+/* Bindable commands for searching within a line. */
+extern int rl_char_search PARAMS((int, int));
+extern int rl_backward_char_search PARAMS((int, int));
+
+/* Bindable commands for readline's interface to the command history. */
+extern int rl_beginning_of_history PARAMS((int, int));
+extern int rl_end_of_history PARAMS((int, int));
+extern int rl_get_next_history PARAMS((int, int));
+extern int rl_get_previous_history PARAMS((int, int));
+
+/* Bindable commands for managing the mark and region. */
+extern int rl_set_mark PARAMS((int, int));
+extern int rl_exchange_point_and_mark PARAMS((int, int));
+
+/* Bindable commands to set the editing mode (emacs or vi). */
+extern int rl_vi_editing_mode PARAMS((int, int));
+extern int rl_emacs_editing_mode PARAMS((int, int));
+
+/* Bindable commands to change the insert mode (insert or overwrite) */
+extern int rl_overwrite_mode PARAMS((int, int));
+
+/* Bindable commands for managing key bindings. */
+extern int rl_re_read_init_file PARAMS((int, int));
+extern int rl_dump_functions PARAMS((int, int));
+extern int rl_dump_macros PARAMS((int, int));
+extern int rl_dump_variables PARAMS((int, int));
+
+/* Bindable commands for word completion. */
+extern int rl_complete PARAMS((int, int));
+extern int rl_possible_completions PARAMS((int, int));
+extern int rl_insert_completions PARAMS((int, int));
+extern int rl_old_menu_complete PARAMS((int, int));
+extern int rl_menu_complete PARAMS((int, int));
+extern int rl_backward_menu_complete PARAMS((int, int));
+
+/* Bindable commands for killing and yanking text, and managing the kill ring. */
+extern int rl_kill_word PARAMS((int, int));
+extern int rl_backward_kill_word PARAMS((int, int));
+extern int rl_kill_line PARAMS((int, int));
+extern int rl_backward_kill_line PARAMS((int, int));
+extern int rl_kill_full_line PARAMS((int, int));
+extern int rl_unix_word_rubout PARAMS((int, int));
+extern int rl_unix_filename_rubout PARAMS((int, int));
+extern int rl_unix_line_discard PARAMS((int, int));
+extern int rl_copy_region_to_kill PARAMS((int, int));
+extern int rl_kill_region PARAMS((int, int));
+extern int rl_copy_forward_word PARAMS((int, int));
+extern int rl_copy_backward_word PARAMS((int, int));
+extern int rl_yank PARAMS((int, int));
+extern int rl_yank_pop PARAMS((int, int));
+extern int rl_yank_nth_arg PARAMS((int, int));
+extern int rl_yank_last_arg PARAMS((int, int));
+/* Not available unless __CYGWIN__ is defined. */
+#ifdef __CYGWIN__
+extern int rl_paste_from_clipboard PARAMS((int, int));
+#endif
+
+/* Bindable commands for incremental searching. */
+extern int rl_reverse_search_history PARAMS((int, int));
+extern int rl_forward_search_history PARAMS((int, int));
+
+/* Bindable keyboard macro commands. */
+extern int rl_start_kbd_macro PARAMS((int, int));
+extern int rl_end_kbd_macro PARAMS((int, int));
+extern int rl_call_last_kbd_macro PARAMS((int, int));
+
+/* Bindable undo commands. */
+extern int rl_revert_line PARAMS((int, int));
+extern int rl_undo_command PARAMS((int, int));
+
+/* Bindable tilde expansion commands. */
+extern int rl_tilde_expand PARAMS((int, int));
+
+/* Bindable terminal control commands. */
+extern int rl_restart_output PARAMS((int, int));
+extern int rl_stop_output PARAMS((int, int));
+
+/* Miscellaneous bindable commands. */
+extern int rl_abort PARAMS((int, int));
+extern int rl_tty_status PARAMS((int, int));
+
+/* Bindable commands for incremental and non-incremental history searching. */
+extern int rl_history_search_forward PARAMS((int, int));
+extern int rl_history_search_backward PARAMS((int, int));
+extern int rl_noninc_forward_search PARAMS((int, int));
+extern int rl_noninc_reverse_search PARAMS((int, int));
+extern int rl_noninc_forward_search_again PARAMS((int, int));
+extern int rl_noninc_reverse_search_again PARAMS((int, int));
+
+/* Bindable command used when inserting a matching close character. */
+extern int rl_insert_close PARAMS((int, int));
+
+/* Not available unless READLINE_CALLBACKS is defined. */
+extern void rl_callback_handler_install PARAMS((const char *, rl_vcpfunc_t *));
+extern void rl_callback_read_char PARAMS((void));
+extern void rl_callback_handler_remove PARAMS((void));
+
+/* Things for vi mode. Not available unless readline is compiled -DVI_MODE. */
+/* VI-mode bindable commands. */
+extern int rl_vi_redo PARAMS((int, int));
+extern int rl_vi_undo PARAMS((int, int));
+extern int rl_vi_yank_arg PARAMS((int, int));
+extern int rl_vi_fetch_history PARAMS((int, int));
+extern int rl_vi_search_again PARAMS((int, int));
+extern int rl_vi_search PARAMS((int, int));
+extern int rl_vi_complete PARAMS((int, int));
+extern int rl_vi_tilde_expand PARAMS((int, int));
+extern int rl_vi_prev_word PARAMS((int, int));
+extern int rl_vi_next_word PARAMS((int, int));
+extern int rl_vi_end_word PARAMS((int, int));
+extern int rl_vi_insert_beg PARAMS((int, int));
+extern int rl_vi_append_mode PARAMS((int, int));
+extern int rl_vi_append_eol PARAMS((int, int));
+extern int rl_vi_eof_maybe PARAMS((int, int));
+extern int rl_vi_insertion_mode PARAMS((int, int));
+extern int rl_vi_insert_mode PARAMS((int, int));
+extern int rl_vi_movement_mode PARAMS((int, int));
+extern int rl_vi_arg_digit PARAMS((int, int));
+extern int rl_vi_change_case PARAMS((int, int));
+extern int rl_vi_put PARAMS((int, int));
+extern int rl_vi_column PARAMS((int, int));
+extern int rl_vi_delete_to PARAMS((int, int));
+extern int rl_vi_change_to PARAMS((int, int));
+extern int rl_vi_yank_to PARAMS((int, int));
+extern int rl_vi_rubout PARAMS((int, int));
+extern int rl_vi_delete PARAMS((int, int));
+extern int rl_vi_back_to_indent PARAMS((int, int));
+extern int rl_vi_first_print PARAMS((int, int));
+extern int rl_vi_char_search PARAMS((int, int));
+extern int rl_vi_match PARAMS((int, int));
+extern int rl_vi_change_char PARAMS((int, int));
+extern int rl_vi_subst PARAMS((int, int));
+extern int rl_vi_overstrike PARAMS((int, int));
+extern int rl_vi_overstrike_delete PARAMS((int, int));
+extern int rl_vi_replace PARAMS((int, int));
+extern int rl_vi_set_mark PARAMS((int, int));
+extern int rl_vi_goto_mark PARAMS((int, int));
+
+/* VI-mode utility functions. */
+extern int rl_vi_check PARAMS((void));
+extern int rl_vi_domove PARAMS((int, int *));
+extern int rl_vi_bracktype PARAMS((int));
+
+extern void rl_vi_start_inserting PARAMS((int, int, int));
+
+/* VI-mode pseudo-bindable commands, used as utility functions. */
+extern int rl_vi_fWord PARAMS((int, int));
+extern int rl_vi_bWord PARAMS((int, int));
+extern int rl_vi_eWord PARAMS((int, int));
+extern int rl_vi_fword PARAMS((int, int));
+extern int rl_vi_bword PARAMS((int, int));
+extern int rl_vi_eword PARAMS((int, int));
+
+/* **************************************************************** */
+/*								    */
+/*			Well Published Functions		    */
+/*								    */
+/* **************************************************************** */
+
+/* Readline functions. */
+/* Read a line of input.  Prompt with PROMPT.  A NULL PROMPT means none. */
+extern char *readline PARAMS((const char *));
+
+extern int rl_set_prompt PARAMS((const char *));
+extern int rl_expand_prompt PARAMS((char *));
+
+extern int rl_initialize PARAMS((void));
+
+/* Undocumented; unused by readline */
+extern int rl_discard_argument PARAMS((void));
+
+/* Utility functions to bind keys to readline commands. */
+extern int rl_add_defun PARAMS((const char *, rl_command_func_t *, int));
+extern int rl_bind_key PARAMS((int, rl_command_func_t *));
+extern int rl_bind_key_in_map PARAMS((int, rl_command_func_t *, Keymap));
+extern int rl_unbind_key PARAMS((int));
+extern int rl_unbind_key_in_map PARAMS((int, Keymap));
+extern int rl_bind_key_if_unbound PARAMS((int, rl_command_func_t *));
+extern int rl_bind_key_if_unbound_in_map PARAMS((int, rl_command_func_t *, Keymap));
+extern int rl_unbind_function_in_map PARAMS((rl_command_func_t *, Keymap));
+extern int rl_unbind_command_in_map PARAMS((const char *, Keymap));
+extern int rl_bind_keyseq PARAMS((const char *, rl_command_func_t *));
+extern int rl_bind_keyseq_in_map PARAMS((const char *, rl_command_func_t *, Keymap));
+extern int rl_bind_keyseq_if_unbound PARAMS((const char *, rl_command_func_t *));
+extern int rl_bind_keyseq_if_unbound_in_map PARAMS((const char *, rl_command_func_t *, Keymap));
+extern int rl_generic_bind PARAMS((int, const char *, char *, Keymap));
+
+extern char *rl_variable_value PARAMS((const char *));
+extern int rl_variable_bind PARAMS((const char *, const char *));
+
+/* Backwards compatibility, use rl_bind_keyseq_in_map instead. */
+extern int rl_set_key PARAMS((const char *, rl_command_func_t *, Keymap));
+
+/* Backwards compatibility, use rl_generic_bind instead. */
+extern int rl_macro_bind PARAMS((const char *, const char *, Keymap));
+
+/* Undocumented in the texinfo manual; not really useful to programs. */
+extern int rl_translate_keyseq PARAMS((const char *, char *, int *));
+extern char *rl_untranslate_keyseq PARAMS((int));
+
+extern rl_command_func_t *rl_named_function PARAMS((const char *));
+extern rl_command_func_t *rl_function_of_keyseq PARAMS((const char *, Keymap, int *));
+
+extern void rl_list_funmap_names PARAMS((void));
+extern char **rl_invoking_keyseqs_in_map PARAMS((rl_command_func_t *, Keymap));
+extern char **rl_invoking_keyseqs PARAMS((rl_command_func_t *));
+ 
+extern void rl_function_dumper PARAMS((int));
+extern void rl_macro_dumper PARAMS((int));
+extern void rl_variable_dumper PARAMS((int));
+
+extern int rl_read_init_file PARAMS((const char *));
+extern int rl_parse_and_bind PARAMS((char *));
+
+/* Functions for manipulating keymaps. */
+extern Keymap rl_make_bare_keymap PARAMS((void));
+extern Keymap rl_copy_keymap PARAMS((Keymap));
+extern Keymap rl_make_keymap PARAMS((void));
+extern void rl_discard_keymap PARAMS((Keymap));
+
+extern Keymap rl_get_keymap_by_name PARAMS((const char *));
+extern char *rl_get_keymap_name PARAMS((Keymap));
+extern void rl_set_keymap PARAMS((Keymap));
+extern Keymap rl_get_keymap PARAMS((void));
+/* Undocumented; used internally only. */
+extern void rl_set_keymap_from_edit_mode PARAMS((void));
+extern char *rl_get_keymap_name_from_edit_mode PARAMS((void));
+
+/* Functions for manipulating the funmap, which maps command names to functions. */
+extern int rl_add_funmap_entry PARAMS((const char *, rl_command_func_t *));
+extern const char **rl_funmap_names PARAMS((void));
+/* Undocumented, only used internally -- there is only one funmap, and this
+   function may be called only once. */
+extern void rl_initialize_funmap PARAMS((void));
+
+/* Utility functions for managing keyboard macros. */
+extern void rl_push_macro_input PARAMS((char *));
+
+/* Functions for undoing, from undo.c */
+extern void rl_add_undo PARAMS((enum undo_code, int, int, char *));
+extern void rl_free_undo_list PARAMS((void));
+extern int rl_do_undo PARAMS((void));
+extern int rl_begin_undo_group PARAMS((void));
+extern int rl_end_undo_group PARAMS((void));
+extern int rl_modifying PARAMS((int, int));
+
+/* Functions for redisplay. */
+extern void rl_redisplay PARAMS((void));
+extern int rl_on_new_line PARAMS((void));
+extern int rl_on_new_line_with_prompt PARAMS((void));
+extern int rl_forced_update_display PARAMS((void));
+extern int rl_clear_message PARAMS((void));
+extern int rl_reset_line_state PARAMS((void));
+extern int rl_crlf PARAMS((void));
+
+#if defined (USE_VARARGS) && defined (PREFER_STDARG)
+extern int rl_message (const char *, ...)  __rl_attribute__((__format__ (printf, 1, 2)));
+#else
+extern int rl_message ();
+#endif
+
+extern int rl_show_char PARAMS((int));
+
+/* Undocumented in texinfo manual. */
+extern int rl_character_len PARAMS((int, int));
+
+/* Save and restore internal prompt redisplay information. */
+extern void rl_save_prompt PARAMS((void));
+extern void rl_restore_prompt PARAMS((void));
+
+/* Modifying text. */
+extern void rl_replace_line PARAMS((const char *, int));
+extern int rl_insert_text PARAMS((const char *));
+extern int rl_delete_text PARAMS((int, int));
+extern int rl_kill_text PARAMS((int, int));
+extern char *rl_copy_text PARAMS((int, int));
+
+/* Terminal and tty mode management. */
+extern void rl_prep_terminal PARAMS((int));
+extern void rl_deprep_terminal PARAMS((void));
+extern void rl_tty_set_default_bindings PARAMS((Keymap));
+extern void rl_tty_unset_default_bindings PARAMS((Keymap));
+
+extern int rl_reset_terminal PARAMS((const char *));
+extern void rl_resize_terminal PARAMS((void));
+extern void rl_set_screen_size PARAMS((int, int));
+extern void rl_get_screen_size PARAMS((int *, int *));
+extern void rl_reset_screen_size PARAMS((void));
+
+extern char *rl_get_termcap PARAMS((const char *));
+
+/* Functions for character input. */
+extern int rl_stuff_char PARAMS((int));
+extern int rl_execute_next PARAMS((int));
+extern int rl_clear_pending_input PARAMS((void));
+extern int rl_read_key PARAMS((void));
+extern int rl_getc PARAMS((FILE *));
+extern int rl_set_keyboard_input_timeout PARAMS((int));
+
+/* `Public' utility functions. */
+extern void rl_extend_line_buffer PARAMS((int));
+extern int rl_ding PARAMS((void));
+extern int rl_alphabetic PARAMS((int));
+extern void rl_free PARAMS((void *));
+
+/* Readline signal handling, from signals.c */
+extern int rl_set_signals PARAMS((void));
+extern int rl_clear_signals PARAMS((void));
+extern void rl_cleanup_after_signal PARAMS((void));
+extern void rl_reset_after_signal PARAMS((void));
+extern void rl_free_line_state PARAMS((void));
+
+extern void rl_echo_signal_char PARAMS((int)); 
+
+extern int rl_set_paren_blink_timeout PARAMS((int));
+
+/* Undocumented. */
+extern int rl_maybe_save_line PARAMS((void));
+extern int rl_maybe_unsave_line PARAMS((void));
+extern int rl_maybe_replace_line PARAMS((void));
+
+/* Completion functions. */
+extern int rl_complete_internal PARAMS((int));
+extern void rl_display_match_list PARAMS((char **, int, int));
+
+extern char **rl_completion_matches PARAMS((const char *, rl_compentry_func_t *));
+extern char *rl_username_completion_function PARAMS((const char *, int));
+extern char *rl_filename_completion_function PARAMS((const char *, int));
+
+extern int rl_completion_mode PARAMS((rl_command_func_t *));
+
+#if 0
+/* Backwards compatibility (compat.c).  These will go away sometime. */
+extern void free_undo_list PARAMS((void));
+extern int maybe_save_line PARAMS((void));
+extern int maybe_unsave_line PARAMS((void));
+extern int maybe_replace_line PARAMS((void));
+
+extern int ding PARAMS((void));
+extern int alphabetic PARAMS((int));
+extern int crlf PARAMS((void));
+
+extern char **completion_matches PARAMS((char *, rl_compentry_func_t *));
+extern char *username_completion_function PARAMS((const char *, int));
+extern char *filename_completion_function PARAMS((const char *, int));
+#endif
+
+/* **************************************************************** */
+/*								    */
+/*			Well Published Variables		    */
+/*								    */
+/* **************************************************************** */
+
+/* The version of this incarnation of the readline library. */
+extern const char *rl_library_version;		/* e.g., "4.2" */
+extern int rl_readline_version;			/* e.g., 0x0402 */
+
+/* True if this is real GNU readline. */
+extern int rl_gnu_readline_p;
+
+/* Flags word encapsulating the current readline state. */
+extern int rl_readline_state;
+
+/* Says which editing mode readline is currently using.  1 means emacs mode;
+   0 means vi mode. */
+extern int rl_editing_mode;
+
+/* Insert or overwrite mode for emacs mode.  1 means insert mode; 0 means
+   overwrite mode.  Reset to insert mode on each input line. */
+extern int rl_insert_mode;
+
+/* The name of the calling program.  You should initialize this to
+   whatever was in argv[0].  It is used when parsing conditionals. */
+extern const char *rl_readline_name;
+
+/* The prompt readline uses.  This is set from the argument to
+   readline (), and should not be assigned to directly. */
+extern char *rl_prompt;
+
+/* The prompt string that is actually displayed by rl_redisplay.  Public so
+   applications can more easily supply their own redisplay functions. */
+extern char *rl_display_prompt;
+
+/* The line buffer that is in use. */
+extern char *rl_line_buffer;
+
+/* The location of point, and end. */
+extern int rl_point;
+extern int rl_end;
+
+/* The mark, or saved cursor position. */
+extern int rl_mark;
+
+/* Flag to indicate that readline has finished with the current input
+   line and should return it. */
+extern int rl_done;
+
+/* If set to a character value, that will be the next keystroke read. */
+extern int rl_pending_input;
+
+/* Non-zero if we called this function from _rl_dispatch().  It's present
+   so functions can find out whether they were called from a key binding
+   or directly from an application. */
+extern int rl_dispatching;
+
+/* Non-zero if the user typed a numeric argument before executing the
+   current function. */
+extern int rl_explicit_arg;
+
+/* The current value of the numeric argument specified by the user. */
+extern int rl_numeric_arg;
+
+/* The address of the last command function Readline executed. */
+extern rl_command_func_t *rl_last_func;
+
+/* The name of the terminal to use. */
+extern const char *rl_terminal_name;
+
+/* The input and output streams. */
+extern FILE *rl_instream;
+extern FILE *rl_outstream;
+
+/* If non-zero, Readline gives values of LINES and COLUMNS from the environment
+   greater precedence than values fetched from the kernel when computing the
+   screen dimensions. */
+extern int rl_prefer_env_winsize;
+
+/* If non-zero, then this is the address of a function to call just
+   before readline_internal () prints the first prompt. */
+extern rl_hook_func_t *rl_startup_hook;
+
+/* If non-zero, this is the address of a function to call just before
+   readline_internal_setup () returns and readline_internal starts
+   reading input characters. */
+extern rl_hook_func_t *rl_pre_input_hook;
+      
+/* The address of a function to call periodically while Readline is
+   awaiting character input, or NULL, for no event handling. */
+extern rl_hook_func_t *rl_event_hook;
+
+/* The address of the function to call to fetch a character from the current
+   Readline input stream */
+extern rl_getc_func_t *rl_getc_function;
+
+extern rl_voidfunc_t *rl_redisplay_function;
+
+extern rl_vintfunc_t *rl_prep_term_function;
+extern rl_voidfunc_t *rl_deprep_term_function;
+
+/* Dispatch variables. */
+extern Keymap rl_executing_keymap;
+extern Keymap rl_binding_keymap;
+
+/* Display variables. */
+/* If non-zero, readline will erase the entire line, including any prompt,
+   if the only thing typed on an otherwise-blank line is something bound to
+   rl_newline. */
+extern int rl_erase_empty_line;
+
+/* If non-zero, the application has already printed the prompt (rl_prompt)
+   before calling readline, so readline should not output it the first time
+   redisplay is done. */
+extern int rl_already_prompted;
+
+/* A non-zero value means to read only this many characters rather than
+   up to a character bound to accept-line. */
+extern int rl_num_chars_to_read;
+
+/* The text of a currently-executing keyboard macro. */
+extern char *rl_executing_macro;
+
+/* Variables to control readline signal handling. */
+/* If non-zero, readline will install its own signal handlers for
+   SIGINT, SIGTERM, SIGQUIT, SIGALRM, SIGTSTP, SIGTTIN, and SIGTTOU. */
+extern int rl_catch_signals;
+
+/* If non-zero, readline will install a signal handler for SIGWINCH
+   that also attempts to call any calling application's SIGWINCH signal
+   handler.  Note that the terminal is not cleaned up before the
+   application's signal handler is called; use rl_cleanup_after_signal()
+   to do that. */
+extern int rl_catch_sigwinch;
+
+/* Completion variables. */
+/* Pointer to the generator function for completion_matches ().
+   NULL means to use rl_filename_completion_function (), the default
+   filename completer. */
+extern rl_compentry_func_t *rl_completion_entry_function;
+
+/* Optional generator for menu completion.  Default is
+   rl_completion_entry_function (rl_filename_completion_function). */
+ extern rl_compentry_func_t *rl_menu_completion_entry_function;
+
+/* If rl_ignore_some_completions_function is non-NULL it is the address
+   of a function to call after all of the possible matches have been
+   generated, but before the actual completion is done to the input line.
+   The function is called with one argument; a NULL terminated array
+   of (char *).  If your function removes any of the elements, they
+   must be free()'ed. */
+extern rl_compignore_func_t *rl_ignore_some_completions_function;
+
+/* Pointer to alternative function to create matches.
+   Function is called with TEXT, START, and END.
+   START and END are indices in RL_LINE_BUFFER saying what the boundaries
+   of TEXT are.
+   If this function exists and returns NULL then call the value of
+   rl_completion_entry_function to try to match, otherwise use the
+   array of strings returned. */
+extern rl_completion_func_t *rl_attempted_completion_function;
+
+/* The basic list of characters that signal a break between words for the
+   completer routine.  The initial contents of this variable is what
+   breaks words in the shell, i.e. " \t\n\"\\'`@$><=;|&{(". */
+extern const char *rl_basic_word_break_characters;
+
+/* The list of characters that signal a break between words for
+   rl_complete_internal.  The default list is the contents of
+   rl_basic_word_break_characters.  */
+extern /*const*/ char *rl_completer_word_break_characters;
+
+/* Hook function to allow an application to set the completion word
+   break characters before readline breaks up the line.  Allows
+   position-dependent word break characters. */
+extern rl_cpvfunc_t *rl_completion_word_break_hook;
+
+/* List of characters which can be used to quote a substring of the line.
+   Completion occurs on the entire substring, and within the substring   
+   rl_completer_word_break_characters are treated as any other character,
+   unless they also appear within this list. */
+extern const char *rl_completer_quote_characters;
+
+/* List of quote characters which cause a word break. */
+extern const char *rl_basic_quote_characters;
+
+/* List of characters that need to be quoted in filenames by the completer. */
+extern const char *rl_filename_quote_characters;
+
+/* List of characters that are word break characters, but should be left
+   in TEXT when it is passed to the completion function.  The shell uses
+   this to help determine what kind of completing to do. */
+extern const char *rl_special_prefixes;
+
+/* If non-zero, then this is the address of a function to call when
+   completing on a directory name.  The function is called with
+   the address of a string (the current directory name) as an arg.  It
+   changes what is displayed when the possible completions are printed
+   or inserted.  The directory completion hook should perform
+   any necessary dequoting.  This function should return 1 if it modifies
+   the directory name pointer passed as an argument.  If the directory
+   completion hook returns 0, it should not modify the directory name
+   pointer passed as an argument. */
+extern rl_icppfunc_t *rl_directory_completion_hook;
+
+/* If non-zero, this is the address of a function to call when completing
+   a directory name.  This function takes the address of the directory name
+   to be modified as an argument.  Unlike rl_directory_completion_hook, it
+   only modifies the directory name used in opendir(2), not what is displayed
+   when the possible completions are printed or inserted.  If set, it takes
+   precedence over rl_directory_completion_hook.  The directory rewrite
+   hook should perform any necessary dequoting.  This function has the same
+   return value properties as the directory_completion_hook.
+
+   I'm not happy with how this works yet, so it's undocumented.  I'm trying
+   it in bash to see how well it goes. */
+extern rl_icppfunc_t *rl_directory_rewrite_hook;
+
+/* If non-zero, this is the address of a function to call when reading
+   directory entries from the filesystem for completion and comparing
+   them to the partial word to be completed.  The function should
+   either return its first argument (if no conversion takes place) or
+   newly-allocated memory.  This can, for instance, convert filenames
+   between character sets for comparison against what's typed at the
+   keyboard.  The returned value is what is added to the list of
+   matches.  The second argument is the length of the filename to be
+   converted. */
+extern rl_dequote_func_t *rl_filename_rewrite_hook;
+
+/* Backwards compatibility with previous versions of readline. */
+#define rl_symbolic_link_hook rl_directory_completion_hook
+
+/* If non-zero, then this is the address of a function to call when
+   completing a word would normally display the list of possible matches.
+   This function is called instead of actually doing the display.
+   It takes three arguments: (char **matches, int num_matches, int max_length)
+   where MATCHES is the array of strings that matched, NUM_MATCHES is the
+   number of strings in that array, and MAX_LENGTH is the length of the
+   longest string in that array. */
+extern rl_compdisp_func_t *rl_completion_display_matches_hook;
+
+/* Non-zero means that the results of the matches are to be treated
+   as filenames.  This is ALWAYS zero on entry, and can only be changed
+   within a completion entry finder function. */
+extern int rl_filename_completion_desired;
+
+/* Non-zero means that the results of the matches are to be quoted using
+   double quotes (or an application-specific quoting mechanism) if the
+   filename contains any characters in rl_filename_quote_characters.  This is
+   ALWAYS non-zero on entry, and can only be changed within a completion
+   entry finder function. */
+extern int rl_filename_quoting_desired;
+
+/* Set to a function to quote a filename in an application-specific fashion.
+   Called with the text to quote, the type of match found (single or multiple)
+   and a pointer to the quoting character to be used, which the function can
+   reset if desired. */
+extern rl_quote_func_t *rl_filename_quoting_function;
+
+/* Function to call to remove quoting characters from a filename.  Called
+   before completion is attempted, so the embedded quotes do not interfere
+   with matching names in the file system. */
+extern rl_dequote_func_t *rl_filename_dequoting_function;
+
+/* Function to call to decide whether or not a word break character is
+   quoted.  If a character is quoted, it does not break words for the
+   completer. */
+extern rl_linebuf_func_t *rl_char_is_quoted_p;
+
+/* Non-zero means to suppress normal filename completion after the
+   user-specified completion function has been called. */
+extern int rl_attempted_completion_over;
+
+/* Set to a character describing the type of completion being attempted by
+   rl_complete_internal; available for use by application completion
+   functions. */
+extern int rl_completion_type;
+
+/* Set to the last key used to invoke one of the completion functions */
+extern int rl_completion_invoking_key;
+
+/* Up to this many items will be displayed in response to a
+   possible-completions call.  After that, we ask the user if she
+   is sure she wants to see them all.  The default value is 100. */
+extern int rl_completion_query_items;
+
+/* Character appended to completed words when at the end of the line.  The
+   default is a space.  Nothing is added if this is '\0'. */
+extern int rl_completion_append_character;
+
+/* If set to non-zero by an application completion function,
+   rl_completion_append_character will not be appended. */
+extern int rl_completion_suppress_append;
+
+/* Set to any quote character readline thinks it finds before any application
+   completion function is called. */
+extern int rl_completion_quote_character;
+
+/* Set to a non-zero value if readline found quoting anywhere in the word to
+   be completed; set before any application completion function is called. */
+extern int rl_completion_found_quote;
+
+/* If non-zero, the completion functions don't append any closing quote.
+   This is set to 0 by rl_complete_internal and may be changed by an
+   application-specific completion function. */
+extern int rl_completion_suppress_quote;
+
+/* If non-zero, readline will sort the completion matches.  On by default. */
+extern int rl_sort_completion_matches;
+
+/* If non-zero, a slash will be appended to completed filenames that are
+   symbolic links to directory names, subject to the value of the
+   mark-directories variable (which is user-settable).  This exists so
+   that application completion functions can override the user's preference
+   (set via the mark-symlinked-directories variable) if appropriate.
+   It's set to the value of _rl_complete_mark_symlink_dirs in
+   rl_complete_internal before any application-specific completion
+   function is called, so without that function doing anything, the user's
+   preferences are honored. */
+extern int rl_completion_mark_symlink_dirs;
+
+/* If non-zero, then disallow duplicates in the matches. */
+extern int rl_ignore_completion_duplicates;
+
+/* If this is non-zero, completion is (temporarily) inhibited, and the
+   completion character will be inserted as any other. */
+extern int rl_inhibit_completion;
+
+/* Input error; can be returned by (*rl_getc_function) if readline is reading
+   a top-level command (RL_ISSTATE (RL_STATE_READCMD)). */
+#define READERR			(-2)
+
+/* Definitions available for use by readline clients. */
+#define RL_PROMPT_START_IGNORE	'\001'
+#define RL_PROMPT_END_IGNORE	'\002'
+
+/* Possible values for do_replace argument to rl_filename_quoting_function,
+   called by rl_complete_internal. */
+#define NO_MATCH        0
+#define SINGLE_MATCH    1
+#define MULT_MATCH      2
+
+/* Possible state values for rl_readline_state */
+#define RL_STATE_NONE		0x000000		/* no state; before first call */
+
+#define RL_STATE_INITIALIZING	0x0000001	/* initializing */
+#define RL_STATE_INITIALIZED	0x0000002	/* initialization done */
+#define RL_STATE_TERMPREPPED	0x0000004	/* terminal is prepped */
+#define RL_STATE_READCMD	0x0000008	/* reading a command key */
+#define RL_STATE_METANEXT	0x0000010	/* reading input after ESC */
+#define RL_STATE_DISPATCHING	0x0000020	/* dispatching to a command */
+#define RL_STATE_MOREINPUT	0x0000040	/* reading more input in a command function */
+#define RL_STATE_ISEARCH	0x0000080	/* doing incremental search */
+#define RL_STATE_NSEARCH	0x0000100	/* doing non-inc search */
+#define RL_STATE_SEARCH		0x0000200	/* doing a history search */
+#define RL_STATE_NUMERICARG	0x0000400	/* reading numeric argument */
+#define RL_STATE_MACROINPUT	0x0000800	/* getting input from a macro */
+#define RL_STATE_MACRODEF	0x0001000	/* defining keyboard macro */
+#define RL_STATE_OVERWRITE	0x0002000	/* overwrite mode */
+#define RL_STATE_COMPLETING	0x0004000	/* doing completion */
+#define RL_STATE_SIGHANDLER	0x0008000	/* in readline sighandler */
+#define RL_STATE_UNDOING	0x0010000	/* doing an undo */
+#define RL_STATE_INPUTPENDING	0x0020000	/* rl_execute_next called */
+#define RL_STATE_TTYCSAVED	0x0040000	/* tty special chars saved */
+#define RL_STATE_CALLBACK	0x0080000	/* using the callback interface */
+#define RL_STATE_VIMOTION	0x0100000	/* reading vi motion arg */
+#define RL_STATE_MULTIKEY	0x0200000	/* reading multiple-key command */
+#define RL_STATE_VICMDONCE	0x0400000	/* entered vi command mode at least once */
+#define RL_STATE_REDISPLAYING	0x0800000	/* updating terminal display */
+
+#define RL_STATE_DONE		0x1000000	/* done; accepted line */
+/* Set, clear and test the bits X in rl_readline_state. */
+#define RL_SETSTATE(x)		(rl_readline_state |= (x))
+#define RL_UNSETSTATE(x)	(rl_readline_state &= ~(x))
+#define RL_ISSTATE(x)		(rl_readline_state & (x))
+/* State snapshot; the type taken by rl_save_state () and rl_restore_state () below. */
+struct readline_state {
+  /* line state */
+  int point;
+  int end;
+  int mark;
+  char *buffer;
+  int buflen;
+  UNDO_LIST *ul;
+  char *prompt;
+
+  /* global state */
+  int rlstate;
+  int done;
+  Keymap kmap;
+
+  /* input state */
+  rl_command_func_t *lastfunc;
+  int insmode;
+  int edmode;
+  int kseqlen;
+  FILE *inf;
+  FILE *outf;
+  int pendingin;
+  char *macro;
+
+  /* signal state */
+  int catchsigs;
+  int catchsigwinch;
+
+  /* search state (no fields declared) */
+
+  /* completion state (no fields declared) */
+
+  /* options state (no fields declared) */
+
+  /* reserved for future expansion, so the struct size doesn't change */
+  char reserved[64];
+};
+
+extern int rl_save_state PARAMS((struct readline_state *));
+extern int rl_restore_state PARAMS((struct readline_state *));
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _READLINE_H_ */
diff --git a/ext/lua/includes/readline/rlconf.h b/ext/lua/includes/readline/rlconf.h
new file mode 100644
index 0000000..39f94db
--- /dev/null
+++ b/ext/lua/includes/readline/rlconf.h
@@ -0,0 +1,61 @@
+/* rlconf.h -- readline configuration definitions */
+
+/* Copyright (C) 1992-2009 Free Software Foundation, Inc.
+
+   This file is part of the GNU Readline Library (Readline), a library
+   for reading lines of text with interactive input and history editing.      
+
+   Readline is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   Readline is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with Readline.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#if !defined (_RLCONF_H_)
+#define _RLCONF_H_
+
+/* Define this if you want the vi-mode editing available. */
+#define VI_MODE
+
+/* Define this to get an indication of file type when listing completions. */
+#define VISIBLE_STATS
+
+/* This definition is needed by readline.c, rltty.c, and signals.c. */
+/* If on, then readline handles signals in a way that doesn't screw. */
+#define HANDLE_SIGNALS
+
+/* Ugly but working hack for binding prefix meta. */
+#define PREFIX_META_HACK
+
+/* The next-to-last-ditch effort file name for a user-specific init file. */
+#define DEFAULT_INPUTRC "~/.inputrc"
+
+/* The ultimate last-ditch filename for an init file -- system-wide. */
+#define SYS_INPUTRC "/etc/inputrc"
+
+/* If defined, expand tabs to spaces. */
+#define DISPLAY_TABS
+
+/* If defined, use the terminal escape sequence to move the cursor forward
+   over a character when updating the line rather than rewriting it. */
+/* #define HACK_TERMCAP_MOTION */
+
+/* The string inserted by the `insert comment' command. */
+#define RL_COMMENT_BEGIN_DEFAULT "#"
+
+/* Define this if you want code that allows readline to be used in an
+   X `callback' style. */
+#define READLINE_CALLBACKS
+
+/* Define this if you want the cursor to indicate insert or overwrite mode. */
+/* #define CURSOR_MODE */
+
+#endif /* _RLCONF_H_ */
diff --git a/ext/lua/includes/readline/rlstdc.h b/ext/lua/includes/readline/rlstdc.h
new file mode 100644
index 0000000..59d570b
--- /dev/null
+++ b/ext/lua/includes/readline/rlstdc.h
@@ -0,0 +1,45 @@
+/* rlstdc.h -- macros to make source compile on both ANSI C and K&R C compilers. */
+
+/* Copyright (C) 1993-2009 Free Software Foundation, Inc.
+
+   This file is part of the GNU Readline Library (Readline), a library
+   for reading lines of text with interactive input and history editing.      
+
+   Readline is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   Readline is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with Readline.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#if !defined (_RL_STDC_H_)
+#define _RL_STDC_H_
+
+/* Adapted from BSD /usr/include/sys/cdefs.h. */
+
+/* A function can be defined using prototypes and compile on both ANSI C
+   and traditional C compilers with something like this:
+	extern char *func PARAMS((char *, char *, int)); */
+
+#if !defined (PARAMS)
+#  if defined (__STDC__) || defined (__GNUC__) || defined (__cplusplus)
+#    define PARAMS(protos) protos
+#  else
+#    define PARAMS(protos) ()
+#  endif
+#endif
+/* Expands to __attribute__(x) when __GNUC__ >= 2, and to nothing otherwise. */
+#if defined(__GNUC__) && __GNUC__ >= 2
+#  define __rl_attribute__(x) __attribute__(x)
+#else
+#  define __rl_attribute__(x)
+#endif
+
+#endif /* !_RL_STDC_H_ */
diff --git a/ext/lua/includes/readline/rltypedefs.h b/ext/lua/includes/readline/rltypedefs.h
new file mode 100644
index 0000000..60f29a1
--- /dev/null
+++ b/ext/lua/includes/readline/rltypedefs.h
@@ -0,0 +1,93 @@
+/* rltypedefs.h -- Type declarations for readline functions. */
+
+/* Copyright (C) 2000-2009 Free Software Foundation, Inc.
+
+   This file is part of the GNU Readline Library (Readline), a library
+   for reading lines of text with interactive input and history editing.      
+
+   Readline is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   Readline is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with Readline.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#ifndef _RL_TYPEDEFS_H_
+#define _RL_TYPEDEFS_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* Old-style (unprototyped) function types. */
+
+#if !defined (_FUNCTION_DEF)
+#  define _FUNCTION_DEF
+
+typedef int Function ();
+typedef void VFunction ();
+typedef char *CPFunction ();
+typedef char **CPPFunction ();
+
+#endif /* _FUNCTION_DEF */
+
+/* New style. */
+
+#if !defined (_RL_FUNCTION_TYPEDEF)
+#  define _RL_FUNCTION_TYPEDEF
+
+/* Bindable functions */
+typedef int rl_command_func_t PARAMS((int, int));
+
+/* Typedefs for the completion system */
+typedef char *rl_compentry_func_t PARAMS((const char *, int));
+typedef char **rl_completion_func_t PARAMS((const char *, int, int));
+
+typedef char *rl_quote_func_t PARAMS((char *, int, char *));
+typedef char *rl_dequote_func_t PARAMS((char *, int));
+
+typedef int rl_compignore_func_t PARAMS((char **));
+
+typedef void rl_compdisp_func_t PARAMS((char **, int, int));
+
+/* Type for input and pre-read hook functions like rl_event_hook */
+typedef int rl_hook_func_t PARAMS((void));
+
+/* Input function type */
+typedef int rl_getc_func_t PARAMS((FILE *));
+
+/* Generic function that takes a character buffer (which could be the readline
+   line buffer) and an index into it (which could be rl_point) and returns
+   an int. */
+typedef int rl_linebuf_func_t PARAMS((char *, int));
+
+/* `Generic' function pointer typedefs */
+typedef int rl_intfunc_t PARAMS((int));
+#define rl_ivoidfunc_t rl_hook_func_t  /* alias: same signature, int (void) */
+typedef int rl_icpfunc_t PARAMS((char *));
+typedef int rl_icppfunc_t PARAMS((char **));
+
+typedef void rl_voidfunc_t PARAMS((void));
+typedef void rl_vintfunc_t PARAMS((int));
+typedef void rl_vcpfunc_t PARAMS((char *));
+typedef void rl_vcppfunc_t PARAMS((char **));
+
+typedef char *rl_cpvfunc_t PARAMS((void));
+typedef char *rl_cpifunc_t PARAMS((int));
+typedef char *rl_cpcpfunc_t PARAMS((char  *));
+typedef char *rl_cpcppfunc_t PARAMS((char  **));
+
+#endif /* _RL_FUNCTION_TYPEDEF */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _RL_TYPEDEFS_H_ */
diff --git a/ext/lua/includes/readline/tilde.h b/ext/lua/includes/readline/tilde.h
new file mode 100644
index 0000000..e26dd04
--- /dev/null
+++ b/ext/lua/includes/readline/tilde.h
@@ -0,0 +1,80 @@
+/* tilde.h: Externally available variables and functions in libtilde.a. */
+
+/* Copyright (C) 1992-2009 Free Software Foundation, Inc.
+
+   This file contains the Readline Library (Readline), a set of
+   routines for providing Emacs style line input to programs that ask
+   for it.
+
+   Readline is free software: you can redistribute it and/or modify
+   it under the terms of the GNU General Public License as published by
+   the Free Software Foundation, either version 3 of the License, or
+   (at your option) any later version.
+
+   Readline is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+   GNU General Public License for more details.
+
+   You should have received a copy of the GNU General Public License
+   along with Readline.  If not, see <http://www.gnu.org/licenses/>.
+*/
+
+#if !defined (_TILDE_H_)
+#  define _TILDE_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* A function can be defined using prototypes and compile on both ANSI C
+   and traditional C compilers with something like this:
+	extern char *func PARAMS((char *, char *, int)); */
+
+#if !defined (PARAMS)
+#  if defined (__STDC__) || defined (__GNUC__) || defined (__cplusplus)
+#    define PARAMS(protos) protos
+#  else
+#    define PARAMS(protos) ()
+#  endif
+#endif
+/* Type of the expansion hooks declared below: takes a char * and returns a char *. */
+typedef char *tilde_hook_func_t PARAMS((char *));
+
+/* If non-null, this contains the address of a function that the application
+   wants called before trying the standard tilde expansions.  The function
+   is called with the text sans tilde, and returns a malloc()'ed string
+   which is the expansion, or a NULL pointer if the expansion fails. */
+extern tilde_hook_func_t *tilde_expansion_preexpansion_hook;
+
+/* If non-null, this contains the address of a function to call if the
+   standard meaning for expanding a tilde fails.  The function is called
+   with the text (sans tilde, as in "foo"), and returns a malloc()'ed string
+   which is the expansion, or a NULL pointer if there is no expansion. */
+extern tilde_hook_func_t *tilde_expansion_failure_hook;
+
+/* When non-null, this is a NULL terminated array of strings which
+   are duplicates for a tilde prefix.  Bash uses this to expand
+   `=~' and `:~'. */
+extern char **tilde_additional_prefixes;
+
+/* When non-null, this is a NULL terminated array of strings which match
+   the end of a username, instead of just "/".  Bash sets this to
+   `:' and `=~'. */
+extern char **tilde_additional_suffixes;
+
+/* Return a new string which is the result of tilde expanding STRING. */
+extern char *tilde_expand PARAMS((const char *));
+
+/* Do the work of tilde expansion on FILENAME.  FILENAME starts with a
+   tilde.  If there is no expansion, call tilde_expansion_failure_hook. */
+extern char *tilde_expand_word PARAMS((const char *));
+
+/* Find the portion of the string beginning with ~ that should be expanded. */
+extern char *tilde_find_word PARAMS((const char *, int, int *));
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _TILDE_H_ */
diff --git a/ext/lua/src/lapi.c b/ext/lua/src/lapi.c
new file mode 100644
index 0000000..791d854
--- /dev/null
+++ b/ext/lua/src/lapi.c
@@ -0,0 +1,1284 @@
+/*
+** $Id: lapi.c,v 2.171 2013/03/16 21:10:18 roberto Exp $
+** Lua API
+** See Copyright Notice in lua.h
+*/
+
+
+#include <stdarg.h>
+#include <string.h>
+
+#define lapi_c
+#define LUA_CORE
+
+#include "lua.h"
+
+#include "lapi.h"
+#include "ldebug.h"
+#include "ldo.h"
+#include "lfunc.h"
+#include "lgc.h"
+#include "lmem.h"
+#include "lobject.h"
+#include "lstate.h"
+#include "lstring.h"
+#include "ltable.h"
+#include "ltm.h"
+#include "lundump.h"
+#include "lvm.h"
+
+
+/* Identification string built from LUA_COPYRIGHT and LUA_AUTHORS. */
+const char lua_ident[] =
+  "$LuaVersion: " LUA_COPYRIGHT " $"
+  "$LuaAuthors: " LUA_AUTHORS " $";
+
+
+/* value at a non-valid index */
+#define NONVALIDVALUE		cast(TValue *, luaO_nilobject)
+
+/* corresponding test */
+#define isvalid(o)	((o) != luaO_nilobject)
+
+/* test for pseudo index */
+#define ispseudo(i)		((i) <= LUA_REGISTRYINDEX)
+
+/* test for valid but not pseudo index */
+#define isstackindex(i, o)	(isvalid(o) && !ispseudo(i))
+/* api_check assertions built on the index tests above */
+#define api_checkvalidindex(L, o)  api_check(L, isvalid(o), "invalid index")
+
+#define api_checkstackindex(L, i, o)  \
+	api_check(L, isstackindex(i, o), "index not in the stack")
+
+/* Map an API index (stack slot, registry or upvalue pseudo-index) to its TValue. */
+static TValue *index2addr (lua_State *L, int idx) {
+  CallInfo *ci = L->ci;
+  if (idx > 0) {
+    TValue *o = ci->func + idx;  /* positive indices count up from the function slot */
+    api_check(L, idx <= ci->top - (ci->func + 1), "unacceptable index");
+    if (o >= L->top) return NONVALIDVALUE;  /* at or above the current top */
+    else return o;
+  }
+  else if (!ispseudo(idx)) {  /* negative index */
+    api_check(L, idx != 0 && -idx <= L->top - (ci->func + 1), "invalid index");
+    return L->top + idx;
+  }
+  else if (idx == LUA_REGISTRYINDEX)
+    return &G(L)->l_registry;
+  else {  /* upvalues */
+    idx = LUA_REGISTRYINDEX - idx;  /* turn pseudo-index into a 1-based upvalue number */
+    api_check(L, idx <= MAXUPVAL + 1, "upvalue index too large");
+    if (ttislcf(ci->func))  /* light C function? */
+      return NONVALIDVALUE;  /* it has no upvalues */
+    else {
+      CClosure *func = clCvalue(ci->func);
+      return (idx <= func->nupvalues) ? &func->upvalue[idx-1] : NONVALIDVALUE;
+    }
+  }
+}
+
+
+/*
+** to be called by 'lua_checkstack' in protected mode, to grow stack
+** capturing memory errors
+*/
+static void growstack (lua_State *L, void *ud) {
+  int size = *(int *)ud;  /* 'ud' carries a pointer to the requested size */
+  luaD_growstack(L, size);
+}
+
+/* Ensure room for 'size' extra stack slots; returns 1 on success, 0 otherwise. */
+LUA_API int lua_checkstack (lua_State *L, int size) {
+  int res;
+  CallInfo *ci = L->ci;
+  lua_lock(L);
+  if (L->stack_last - L->top > size)  /* stack large enough? */
+    res = 1;  /* yes; check is OK */
+  else {  /* no; need to grow stack */
+    int inuse = cast_int(L->top - L->stack) + EXTRA_STACK;  /* slots used + EXTRA_STACK */
+    if (inuse > LUAI_MAXSTACK - size)  /* can grow without overflow? */
+      res = 0;  /* no */
+    else  /* try to grow stack */
+      res = (luaD_rawrunprotected(L, &growstack, &size) == LUA_OK);
+  }
+  if (res && ci->top < L->top + size)
+    ci->top = L->top + size;  /* adjust frame top */
+  lua_unlock(L);
+  return res;
+}
+
+/* Pop 'n' values from 'from' and push them onto 'to' (nothing to do if same state). */
+LUA_API void lua_xmove (lua_State *from, lua_State *to, int n) {
+  int i;
+  if (from == to) return;
+  lua_lock(to);
+  api_checknelems(from, n);
+  api_check(from, G(from) == G(to), "moving among independent states");
+  api_check(from, to->ci->top - to->top >= n, "not enough elements to move");
+  from->top -= n;  /* drop the values from 'from' first ... */
+  for (i = 0; i < n; i++) {
+    setobj2s(to, to->top++, from->top + i);  /* ... then copy them in the same order */
+  }
+  lua_unlock(to);
+}
+
+/* Install 'panicf' as the global panic function; returns the one it replaces. */
+LUA_API lua_CFunction lua_atpanic (lua_State *L, lua_CFunction panicf) {
+  lua_CFunction old;
+  lua_lock(L);
+  old = G(L)->panic;
+  G(L)->panic = panicf;
+  lua_unlock(L);
+  return old;
+}
+
+/* Address of the version number: a static copy if L is NULL, else G(L)->version. */
+LUA_API const lua_Number *lua_version (lua_State *L) {
+  static const lua_Number version = LUA_VERSION_NUM;
+  if (L == NULL) return &version;
+  else return G(L)->version;
+}
+
+
+
+/*
+** basic stack manipulation
+*/
+
+
+/*
+** convert an acceptable stack index into an absolute index
+*/
+LUA_API int lua_absindex (lua_State *L, int idx) {
+  return (idx > 0 || ispseudo(idx))
+         ? idx  /* positive and pseudo indices are returned unchanged */
+         : cast_int(L->top - L->ci->func + idx);  /* otherwise rebase from the top */
+}
+
+/* Number of stack slots above the running function's own slot. */
+LUA_API int lua_gettop (lua_State *L) {
+  return cast_int(L->top - (L->ci->func + 1));
+}
+
+/* Set the top to 'idx': from the frame base if >= 0 (nil-filling), else relative to top. */
+LUA_API void lua_settop (lua_State *L, int idx) {
+  StkId func = L->ci->func;
+  lua_lock(L);
+  if (idx >= 0) {
+    api_check(L, idx <= L->stack_last - (func + 1), "new top too large");
+    while (L->top < (func + 1) + idx)
+      setnilvalue(L->top++);  /* growing: new slots start as nil */
+    L->top = (func + 1) + idx;  /* also covers the shrinking case */
+  }
+  else {
+    api_check(L, -(idx+1) <= (L->top - (func + 1)), "invalid new top");
+    L->top += idx+1;  /* `subtract' index (index is negative) */
+  }
+  lua_unlock(L);
+}
+
+/* Remove the element at 'idx', moving every element above it down one slot. */
+LUA_API void lua_remove (lua_State *L, int idx) {
+  StkId p;
+  lua_lock(L);
+  p = index2addr(L, idx);
+  api_checkstackindex(L, idx, p);
+  while (++p < L->top) setobjs2s(L, p-1, p);  /* overwrite each slot with its successor */
+  L->top--;
+  lua_unlock(L);
+}
+
+/* Move the top element to position 'idx', shifting the elements from 'idx' up one slot. */
+LUA_API void lua_insert (lua_State *L, int idx) {
+  StkId p;
+  StkId q;
+  lua_lock(L);
+  p = index2addr(L, idx);
+  api_checkstackindex(L, idx, p);
+  for (q = L->top; q > p; q--)  /* use L->top as a temporary */
+    setobjs2s(L, q, q - 1);
+  setobjs2s(L, p, L->top);  /* the temporary now holds the old top element */
+  lua_unlock(L);
+}
+
+/* Copy '*fr' into the slot addressed by 'idx'; upvalue targets also get a GC barrier. */
+static void moveto (lua_State *L, TValue *fr, int idx) {
+  TValue *to = index2addr(L, idx);
+  api_checkvalidindex(L, to);
+  setobj(L, to, fr);
+  if (idx < LUA_REGISTRYINDEX)  /* function upvalue? */
+    luaC_barrier(L, clCvalue(L->ci->func), fr);
+  /* LUA_REGISTRYINDEX does not need gc barrier
+     (collector revisits it before finishing collection) */
+}
+
+/* Store the top value at index 'idx', then pop it. */
+LUA_API void lua_replace (lua_State *L, int idx) {
+  lua_lock(L);
+  api_checknelems(L, 1);
+  moveto(L, L->top - 1, idx);
+  L->top--;
+  lua_unlock(L);
+}
+
+/* Copy the value at 'fromidx' into 'toidx'; the stack top is not touched here. */
+LUA_API void lua_copy (lua_State *L, int fromidx, int toidx) {
+  TValue *fr;
+  lua_lock(L);
+  fr = index2addr(L, fromidx);
+  moveto(L, fr, toidx);
+  lua_unlock(L);
+}
+
+/* Push a copy of the value at 'idx' onto the stack. */
+LUA_API void lua_pushvalue (lua_State *L, int idx) {
+  lua_lock(L);
+  setobj2s(L, L->top, index2addr(L, idx));
+  api_incr_top(L);
+  lua_unlock(L);
+}
+
+
+
+/*
+** access functions (stack -> C)
+*/
+
+/* Type tag of the value at 'idx', or LUA_TNONE when the index is not valid. */
+LUA_API int lua_type (lua_State *L, int idx) {
+  StkId o = index2addr(L, idx);
+  return (isvalid(o) ? ttypenv(o) : LUA_TNONE);
+}
+
+/* Name of type tag 't'; the state argument is not used. */
+LUA_API const char *lua_typename (lua_State *L, int t) {
+  UNUSED(L);
+  return ttypename(t);
+}
+
+/* Non-zero if the value at 'idx' is a light C function or a C closure. */
+LUA_API int lua_iscfunction (lua_State *L, int idx) {
+  StkId o = index2addr(L, idx);
+  return (ttislcf(o) || (ttisCclosure(o)));
+}
+
+/* Result of the 'tonumber' test on the value at 'idx'; 'n' is only scratch space. */
+LUA_API int lua_isnumber (lua_State *L, int idx) {
+  TValue n;
+  const TValue *o = index2addr(L, idx);
+  return tonumber(o, &n);
+}
+
+/* Non-zero if the value at 'idx' has type string or number. */
+LUA_API int lua_isstring (lua_State *L, int idx) {
+  int t = lua_type(L, idx);
+  return (t == LUA_TSTRING || t == LUA_TNUMBER);
+}
+
+/* Non-zero if the value at 'idx' is a userdata or a light userdata. */
+LUA_API int lua_isuserdata (lua_State *L, int idx) {
+  const TValue *o = index2addr(L, idx);
+  return (ttisuserdata(o) || ttislightuserdata(o));
+}
+
+/* luaV_rawequalobj on the two values; 0 if either index is not valid. */
+LUA_API int lua_rawequal (lua_State *L, int index1, int index2) {
+  StkId o1 = index2addr(L, index1);
+  StkId o2 = index2addr(L, index2);
+  return (isvalid(o1) && isvalid(o2)) ? luaV_rawequalobj(o1, o2) : 0;
+}
+
+/* Apply arithmetic 'op' to the operand(s) on top of the stack, leaving one value. */
+LUA_API void lua_arith (lua_State *L, int op) {
+  StkId o1;  /* 1st operand */
+  StkId o2;  /* 2nd operand */
+  lua_lock(L);
+  if (op != LUA_OPUNM) /* all other operations expect two operands */
+    api_checknelems(L, 2);
+  else {  /* for unary minus, add fake 2nd operand */
+    api_checknelems(L, 1);
+    setobjs2s(L, L->top, L->top - 1);
+    L->top++;
+  }
+  o1 = L->top - 2;
+  o2 = L->top - 1;
+  if (ttisnumber(o1) && ttisnumber(o2)) {
+    setnvalue(o1, luaO_arith(op, nvalue(o1), nvalue(o2)));  /* both numbers: compute here */
+  }
+  else
+    luaV_arith(L, o1, o1, o2, cast(TMS, op - LUA_OPADD + TM_ADD));
+  L->top--;  /* drop the 2nd operand; the o1 slot becomes the top element */
+  lua_unlock(L);
+}
+
+
+/*
+** Compares the values at 'index1' and 'index2' according to 'op'
+** (LUA_OPEQ, LUA_OPLT or LUA_OPLE) and returns the outcome. The result
+** is 0 when either index is not valid.
+*/
+LUA_API int lua_compare (lua_State *L, int index1, int index2, int op) {
+  StkId a, b;
+  int res = 0;
+  lua_lock(L);  /* may call tag method */
+  a = index2addr(L, index1);
+  b = index2addr(L, index2);
+  if (isvalid(a) && isvalid(b)) {
+    if (op == LUA_OPEQ)
+      res = equalobj(L, a, b);
+    else if (op == LUA_OPLT)
+      res = luaV_lessthan(L, a, b);
+    else if (op == LUA_OPLE)
+      res = luaV_lessequal(L, a, b);
+    else
+      api_check(L, 0, "invalid option");
+  }
+  lua_unlock(L);
+  return res;
+}
+
+
+/*
+** Returns the value at 'idx' as a lua_Number. When 'isnum' is not NULL
+** it receives 1 if the 'tonumber' test succeeded and 0 otherwise; on
+** failure the function returns 0.
+*/
+LUA_API lua_Number lua_tonumberx (lua_State *L, int idx, int *isnum) {
+  TValue scratch;
+  const TValue *v = index2addr(L, idx);
+  if (!tonumber(v, &scratch)) {  /* not a number and not convertible */
+    if (isnum) *isnum = 0;
+    return 0;
+  }
+  if (isnum) *isnum = 1;
+  return nvalue(v);
+}
+
+
+/*
+** Returns the value at 'idx' converted to a lua_Integer through
+** 'lua_number2integer'. When 'isnum' is not NULL it receives 1 if the
+** 'tonumber' test succeeded and 0 otherwise; on failure the function
+** returns 0.
+*/
+LUA_API lua_Integer lua_tointegerx (lua_State *L, int idx, int *isnum) {
+  TValue scratch;
+  lua_Integer res;
+  lua_Number num;
+  const TValue *v = index2addr(L, idx);
+  if (!tonumber(v, &scratch)) {  /* not a number and not convertible */
+    if (isnum) *isnum = 0;
+    return 0;
+  }
+  num = nvalue(v);
+  lua_number2integer(res, num);
+  if (isnum) *isnum = 1;
+  return res;
+}
+
+
+/*
+** Returns the value at 'idx' converted to a lua_Unsigned through
+** 'lua_number2unsigned'. When 'isnum' is not NULL it receives 1 if the
+** 'tonumber' test succeeded and 0 otherwise; on failure the function
+** returns 0.
+*/
+LUA_API lua_Unsigned lua_tounsignedx (lua_State *L, int idx, int *isnum) {
+  TValue scratch;
+  lua_Unsigned res;
+  lua_Number num;
+  const TValue *v = index2addr(L, idx);
+  if (!tonumber(v, &scratch)) {  /* not a number and not convertible */
+    if (isnum) *isnum = 0;
+    return 0;
+  }
+  num = nvalue(v);
+  lua_number2unsigned(res, num);
+  if (isnum) *isnum = 1;
+  return res;
+}
+
+
+/*
+** Returns 0 when the value at 'idx' satisfies 'l_isfalse' and 1
+** otherwise.
+*/
+LUA_API int lua_toboolean (lua_State *L, int idx) {
+  const TValue *v = index2addr(L, idx);
+  return l_isfalse(v) ? 0 : 1;
+}
+
+
+/*
+** Returns a pointer to the string held at index 'idx' and, when 'len'
+** is not NULL, stores the string length in '*len'. A value that is not
+** already a string is first handed to 'luaV_tostring'; if that
+** conversion fails the function returns NULL and sets '*len' to 0.
+*/
+LUA_API const char *lua_tolstring (lua_State *L, int idx, size_t *len) {
+  StkId o = index2addr(L, idx);
+  if (!ttisstring(o)) {
+    lua_lock(L);  /* `luaV_tostring' may create a new string */
+    if (!luaV_tostring(L, o)) {  /* conversion failed? */
+      if (len != NULL) *len = 0;
+      lua_unlock(L);
+      return NULL;
+    }
+    luaC_checkGC(L);
+    o = index2addr(L, idx);  /* previous call may reallocate the stack */
+    lua_unlock(L);
+  }
+  if (len != NULL) *len = tsvalue(o)->len;
+  return svalue(o);
+}
+
+
+/*
+** Returns the raw length of the value at 'idx': the stored length of
+** a string or full userdata, the result of 'luaH_getn' for a table,
+** and 0 for every other type.
+*/
+LUA_API size_t lua_rawlen (lua_State *L, int idx) {
+  StkId v = index2addr(L, idx);
+  int tag = ttypenv(v);
+  if (tag == LUA_TSTRING)
+    return tsvalue(v)->len;
+  if (tag == LUA_TUSERDATA)
+    return uvalue(v)->len;
+  if (tag == LUA_TTABLE)
+    return luaH_getn(hvalue(v));
+  return 0;
+}
+
+
+/*
+** Returns the C function stored at 'idx' (either a light C function
+** or the function of a C closure), or NULL when the value is neither.
+*/
+LUA_API lua_CFunction lua_tocfunction (lua_State *L, int idx) {
+  StkId v = index2addr(L, idx);
+  if (ttisCclosure(v))
+    return clCvalue(v)->f;
+  if (ttislcf(v))
+    return fvalue(v);
+  return NULL;  /* not a C function */
+}
+
+
+/*
+** Returns the address associated with a userdata at 'idx': for a full
+** userdata, the memory right after its header; for a light userdata,
+** the stored pointer. Any other type yields NULL.
+*/
+LUA_API void *lua_touserdata (lua_State *L, int idx) {
+  StkId v = index2addr(L, idx);
+  int tag = ttypenv(v);
+  if (tag == LUA_TUSERDATA)
+    return (rawuvalue(v) + 1);
+  if (tag == LUA_TLIGHTUSERDATA)
+    return pvalue(v);
+  return NULL;
+}
+
+
+/*
+** Returns the thread stored at 'idx', or NULL when the value there is
+** not a thread.
+*/
+LUA_API lua_State *lua_tothread (lua_State *L, int idx) {
+  StkId v = index2addr(L, idx);
+  if (ttisthread(v))
+    return thvalue(v);
+  return NULL;
+}
+
+
+/*
+** Returns a generic pointer for the value at 'idx': the object address
+** for tables, closures and threads, the function address for light C
+** functions, and the result of 'lua_touserdata' for userdata. All
+** other types yield NULL.
+*/
+LUA_API const void *lua_topointer (lua_State *L, int idx) {
+  StkId v = index2addr(L, idx);
+  const void *p = NULL;
+  switch (ttype(v)) {
+    case LUA_TTABLE:
+      p = hvalue(v);
+      break;
+    case LUA_TLCL:
+      p = clLvalue(v);
+      break;
+    case LUA_TCCL:
+      p = clCvalue(v);
+      break;
+    case LUA_TLCF:
+      p = cast(void *, cast(size_t, fvalue(v)));
+      break;
+    case LUA_TTHREAD:
+      p = thvalue(v);
+      break;
+    case LUA_TUSERDATA:
+    case LUA_TLIGHTUSERDATA:
+      p = lua_touserdata(L, idx);
+      break;
+    default:
+      break;
+  }
+  return p;
+}
+
+
+
+/*
+** push functions (C -> stack)
+*/
+
+
+/*
+** Pushes a nil value onto the stack.
+*/
+LUA_API void lua_pushnil (lua_State *L) {
+  StkId slot;
+  lua_lock(L);
+  slot = L->top;
+  setnilvalue(slot);
+  api_incr_top(L);
+  lua_unlock(L);
+}
+
+
+/*
+** Pushes the number 'n' onto the stack. The stored value is passed
+** through 'luai_checknum', whose failure action raises the
+** "signaling NaN" error below.
+*/
+LUA_API void lua_pushnumber (lua_State *L, lua_Number n) {
+  StkId slot;
+  lua_lock(L);
+  slot = L->top;
+  setnvalue(slot, n);
+  luai_checknum(L, slot,
+    luaG_runerror(L, "C API - attempt to push a signaling NaN"));
+  api_incr_top(L);
+  lua_unlock(L);
+}
+
+
+/*
+** Pushes the integer 'n', converted with 'cast_num', onto the stack.
+*/
+LUA_API void lua_pushinteger (lua_State *L, lua_Integer n) {
+  StkId slot;
+  lua_lock(L);
+  slot = L->top;
+  setnvalue(slot, cast_num(n));
+  api_incr_top(L);
+  lua_unlock(L);
+}
+
+
+/*
+** Pushes the unsigned value 'u', converted with
+** 'lua_unsigned2number', onto the stack.
+*/
+LUA_API void lua_pushunsigned (lua_State *L, lua_Unsigned u) {
+  lua_Number converted;
+  StkId slot;
+  lua_lock(L);
+  converted = lua_unsigned2number(u);
+  slot = L->top;
+  setnvalue(slot, converted);
+  api_incr_top(L);
+  lua_unlock(L);
+}
+
+
+/*
+** Creates a Lua string from the 'len' bytes at 's', pushes it onto
+** the stack and returns a pointer to the contents of that new string.
+*/
+LUA_API const char *lua_pushlstring (lua_State *L, const char *s, size_t len) {
+  TString *str;
+  const char *contents;
+  lua_lock(L);
+  luaC_checkGC(L);
+  str = luaS_newlstr(L, s, len);
+  setsvalue2s(L, L->top, str);
+  api_incr_top(L);
+  contents = getstr(str);
+  lua_unlock(L);
+  return contents;
+}
+
+
+/*
+** Pushes the zero-terminated string 's' onto the stack and returns a
+** pointer to the contents of the new Lua string. A NULL 's' pushes
+** nil instead and returns NULL.
+*/
+LUA_API const char *lua_pushstring (lua_State *L, const char *s) {
+  TString *str;
+  if (s == NULL) {  /* no string: push nil */
+    lua_pushnil(L);
+    return NULL;
+  }
+  lua_lock(L);
+  luaC_checkGC(L);
+  str = luaS_new(L, s);
+  setsvalue2s(L, L->top, str);
+  api_incr_top(L);
+  lua_unlock(L);
+  return getstr(str);
+}
+
+
+/*
+** Formats 'fmt' with the arguments in 'argp' through
+** 'luaO_pushvfstring' and returns that function's result.
+*/
+LUA_API const char *lua_pushvfstring (lua_State *L, const char *fmt,
+                                      va_list argp) {
+  const char *formatted;
+  lua_lock(L);
+  luaC_checkGC(L);
+  formatted = luaO_pushvfstring(L, fmt, argp);
+  lua_unlock(L);
+  return formatted;
+}
+
+
+/*
+** Variadic front end to 'luaO_pushvfstring': formats 'fmt' with the
+** trailing arguments and returns that function's result.
+*/
+LUA_API const char *lua_pushfstring (lua_State *L, const char *fmt, ...) {
+  va_list args;
+  const char *formatted;
+  lua_lock(L);
+  luaC_checkGC(L);
+  va_start(args, fmt);
+  formatted = luaO_pushvfstring(L, fmt, args);
+  va_end(args);
+  lua_unlock(L);
+  return formatted;
+}
+
+
+/*
+** Pushes the C function 'fn' onto the stack. With 'n' == 0 it is
+** pushed as a light C function; otherwise a C closure is created, the
+** top 'n' stack values are moved into its upvalues, and the closure
+** replaces them on the stack.
+*/
+LUA_API void lua_pushcclosure (lua_State *L, lua_CFunction fn, int n) {
+  lua_lock(L);
+  if (n != 0) {
+    Closure *closure;
+    api_checknelems(L, n);
+    api_check(L, n <= MAXUPVAL, "upvalue index too large");
+    luaC_checkGC(L);
+    closure = luaF_newCclosure(L, n);
+    closure->c.f = fn;
+    L->top -= n;  /* top now points at the first upvalue */
+    while (n--)
+      setobj2n(L, &closure->c.upvalue[n], L->top + n);
+    setclCvalue(L, L->top, closure);
+  }
+  else {  /* no upvalues: light C function */
+    setfvalue(L->top, fn);
+  }
+  api_incr_top(L);
+  lua_unlock(L);
+}
+
+
+/*
+** Pushes a boolean onto the stack: true for any non-zero 'b'.
+*/
+LUA_API void lua_pushboolean (lua_State *L, int b) {
+  int truth = (b != 0);  /* ensure that true is 1 */
+  lua_lock(L);
+  setbvalue(L->top, truth);
+  api_incr_top(L);
+  lua_unlock(L);
+}
+
+
+/*
+** Pushes the pointer 'p' onto the stack as a light userdata.
+*/
+LUA_API void lua_pushlightuserdata (lua_State *L, void *p) {
+  StkId slot;
+  lua_lock(L);
+  slot = L->top;
+  setpvalue(slot, p);
+  api_incr_top(L);
+  lua_unlock(L);
+}
+
+
+/*
+** Pushes the thread 'L' onto its own stack. Returns 1 when 'L' is the
+** main thread of its global state, 0 otherwise.
+*/
+LUA_API int lua_pushthread (lua_State *L) {
+  int is_main;
+  lua_lock(L);
+  setthvalue(L, L->top, L);
+  api_incr_top(L);
+  is_main = (G(L)->mainthread == L);
+  lua_unlock(L);
+  return is_main;
+}
+
+
+
+/*
+** get functions (Lua -> stack)
+*/
+
+
+/*
+** Pushes onto the stack the result of indexing the globals table
+** (registry[LUA_RIDX_GLOBALS]) with the string 'var', using
+** 'luaV_gettable'.
+*/
+LUA_API void lua_getglobal (lua_State *L, const char *var) {
+  Table *reg = hvalue(&G(L)->l_registry);
+  const TValue *gt;  /* global table */
+  lua_lock(L);
+  gt = luaH_getint(reg, LUA_RIDX_GLOBALS);
+  /* push the key exactly as 'lua_getfield' does: build the string
+     first, then grow the stack through 'api_incr_top' so the API
+     stack-overflow check also covers this function */
+  setsvalue2s(L, L->top, luaS_new(L, var));
+  api_incr_top(L);
+  luaV_gettable(L, gt, L->top - 1, L->top - 1);  /* key slot gets result */
+  lua_unlock(L);
+}
+
+
+/*
+** Indexes the value at 'idx' with the key on top of the stack (via
+** 'luaV_gettable'); the result overwrites the key slot.
+*/
+LUA_API void lua_gettable (lua_State *L, int idx) {
+  StkId tbl;
+  StkId key;
+  lua_lock(L);
+  tbl = index2addr(L, idx);
+  key = L->top - 1;
+  luaV_gettable(L, tbl, key, key);
+  lua_unlock(L);
+}
+
+
+/*
+** Pushes the string 'k' and then indexes the value at 'idx' with it
+** (via 'luaV_gettable'); the result overwrites the pushed key.
+*/
+LUA_API void lua_getfield (lua_State *L, int idx, const char *k) {
+  StkId tbl;
+  lua_lock(L);
+  tbl = index2addr(L, idx);
+  setsvalue2s(L, L->top, luaS_new(L, k));  /* push key */
+  api_incr_top(L);
+  luaV_gettable(L, tbl, L->top - 1, L->top - 1);
+  lua_unlock(L);
+}
+
+
+/*
+** Looks up the key on top of the stack in the table at 'idx' with
+** 'luaH_get' and overwrites the key slot with the value found.
+*/
+LUA_API void lua_rawget (lua_State *L, int idx) {
+  StkId tbl;
+  StkId key;
+  lua_lock(L);
+  tbl = index2addr(L, idx);
+  api_check(L, ttistable(tbl), "table expected");
+  key = L->top - 1;
+  setobj2s(L, key, luaH_get(hvalue(tbl), key));
+  lua_unlock(L);
+}
+
+
+/*
+** Pushes the value that 'luaH_getint' finds for integer key 'n' in
+** the table at 'idx'.
+*/
+LUA_API void lua_rawgeti (lua_State *L, int idx, int n) {
+  StkId tbl;
+  const TValue *found;
+  lua_lock(L);
+  tbl = index2addr(L, idx);
+  api_check(L, ttistable(tbl), "table expected");
+  found = luaH_getint(hvalue(tbl), n);
+  setobj2s(L, L->top, found);
+  api_incr_top(L);
+  lua_unlock(L);
+}
+
+
+/*
+** Pushes the value that 'luaH_get' finds in the table at 'idx' when
+** the pointer 'p' is used, as a light userdata, for the key.
+*/
+LUA_API void lua_rawgetp (lua_State *L, int idx, const void *p) {
+  StkId tbl;
+  TValue key;
+  const TValue *found;
+  lua_lock(L);
+  tbl = index2addr(L, idx);
+  api_check(L, ttistable(tbl), "table expected");
+  setpvalue(&key, cast(void *, p));
+  found = luaH_get(hvalue(tbl), &key);
+  setobj2s(L, L->top, found);
+  api_incr_top(L);
+  lua_unlock(L);
+}
+
+
+/*
+** Creates a new table and pushes it onto the stack. When either
+** 'narray' or 'nrec' is positive the table is resized to those sizes.
+*/
+LUA_API void lua_createtable (lua_State *L, int narray, int nrec) {
+  Table *tbl;
+  lua_lock(L);
+  luaC_checkGC(L);
+  tbl = luaH_new(L);
+  sethvalue(L, L->top, tbl);
+  api_incr_top(L);
+  if (!(narray <= 0 && nrec <= 0))  /* any size requested? */
+    luaH_resize(L, tbl, narray, nrec);
+  lua_unlock(L);
+}
+
+
+/*
+** Pushes the metatable of the value at 'objindex' and returns 1, or
+** pushes nothing and returns 0 when there is no metatable. Tables and
+** full userdata carry their own metatable; every other type uses the
+** per-type entry in the global state.
+*/
+LUA_API int lua_getmetatable (lua_State *L, int objindex) {
+  const TValue *obj;
+  Table *mt;
+  int tag;
+  int found = 0;
+  lua_lock(L);
+  obj = index2addr(L, objindex);
+  tag = ttypenv(obj);
+  if (tag == LUA_TTABLE)
+    mt = hvalue(obj)->metatable;
+  else if (tag == LUA_TUSERDATA)
+    mt = uvalue(obj)->metatable;
+  else
+    mt = G(L)->mt[tag];
+  if (mt != NULL) {
+    sethvalue(L, L->top, mt);
+    api_incr_top(L);
+    found = 1;
+  }
+  lua_unlock(L);
+  return found;
+}
+
+
+/*
+** Pushes the 'env' table attached to the userdata at 'idx', or nil
+** when that field is NULL.
+*/
+LUA_API void lua_getuservalue (lua_State *L, int idx) {
+  StkId ud;
+  lua_lock(L);
+  ud = index2addr(L, idx);
+  api_check(L, ttisuserdata(ud), "userdata expected");
+  if (uvalue(ud)->env == NULL) {
+    setnilvalue(L->top);
+  }
+  else {
+    sethvalue(L, L->top, uvalue(ud)->env);
+  }
+  api_incr_top(L);
+  lua_unlock(L);
+}
+
+
+/*
+** set functions (stack -> Lua)
+*/
+
+
+/*
+** Pops the value on top of the stack and assigns it, through
+** 'luaV_settable', to the field named 'var' of the globals table
+** (registry[LUA_RIDX_GLOBALS]).
+*/
+LUA_API void lua_setglobal (lua_State *L, const char *var) {
+  Table *reg = hvalue(&G(L)->l_registry);
+  const TValue *gt;  /* global table */
+  lua_lock(L);
+  api_checknelems(L, 1);
+  gt = luaH_getint(reg, LUA_RIDX_GLOBALS);
+  /* push the key as 'lua_getfield' does: build the string first, then
+     grow the stack through 'api_incr_top' so the API stack-overflow
+     check also covers this temporary slot */
+  setsvalue2s(L, L->top, luaS_new(L, var));
+  api_incr_top(L);
+  luaV_settable(L, gt, L->top - 1, L->top - 2);  /* gt[key] = value */
+  L->top -= 2;  /* pop value and key */
+  lua_unlock(L);
+}
+
+
+/*
+** Performs t[k] = v through 'luaV_settable', where 't' is the value
+** at 'idx', 'v' is the top of the stack and 'k' is the value just
+** below it; both are popped afterwards.
+*/
+LUA_API void lua_settable (lua_State *L, int idx) {
+  StkId tbl;
+  StkId val;
+  lua_lock(L);
+  api_checknelems(L, 2);
+  tbl = index2addr(L, idx);
+  val = L->top - 1;
+  luaV_settable(L, tbl, val - 1, val);
+  L->top -= 2;  /* pop index and value */
+  lua_unlock(L);
+}
+
+
+/*
+** Pops the value on top of the stack and assigns it, through
+** 'luaV_settable', to field 'k' of the value at 'idx'.
+*/
+LUA_API void lua_setfield (lua_State *L, int idx, const char *k) {
+  StkId t;
+  lua_lock(L);
+  api_checknelems(L, 1);
+  t = index2addr(L, idx);
+  /* push the key as 'lua_getfield' does: build the string first, then
+     grow the stack through 'api_incr_top' so the API stack-overflow
+     check also covers this temporary slot */
+  setsvalue2s(L, L->top, luaS_new(L, k));
+  api_incr_top(L);
+  luaV_settable(L, t, L->top - 1, L->top - 2);  /* t[key] = value */
+  L->top -= 2;  /* pop value and key */
+  lua_unlock(L);
+}
+
+
+/*
+** Stores the value on top of the stack into the table at 'idx' under
+** the key just below it, using 'luaH_set' directly. Both key and
+** value are popped.
+*/
+LUA_API void lua_rawset (lua_State *L, int idx) {
+  StkId tbl;
+  TValue *cell;
+  lua_lock(L);
+  api_checknelems(L, 2);
+  tbl = index2addr(L, idx);
+  api_check(L, ttistable(tbl), "table expected");
+  cell = luaH_set(L, hvalue(tbl), L->top - 2);  /* slot for the key */
+  setobj2t(L, cell, L->top - 1);
+  invalidateTMcache(hvalue(tbl));
+  luaC_barrierback(L, gcvalue(tbl), L->top - 1);
+  L->top -= 2;
+  lua_unlock(L);
+}
+
+
+/*
+** Stores the value on top of the stack into the table at 'idx' under
+** integer key 'n' (via 'luaH_setint') and pops the value.
+*/
+LUA_API void lua_rawseti (lua_State *L, int idx, int n) {
+  StkId tbl;
+  StkId val;
+  lua_lock(L);
+  api_checknelems(L, 1);
+  tbl = index2addr(L, idx);
+  api_check(L, ttistable(tbl), "table expected");
+  val = L->top - 1;
+  luaH_setint(L, hvalue(tbl), n, val);
+  luaC_barrierback(L, gcvalue(tbl), val);
+  L->top--;
+  lua_unlock(L);
+}
+
+
+/*
+** Stores the value on top of the stack into the table at 'idx', using
+** the pointer 'p' (as a light userdata) for the key, and pops the
+** value.
+*/
+LUA_API void lua_rawsetp (lua_State *L, int idx, const void *p) {
+  StkId tbl;
+  TValue key;
+  TValue *cell;
+  lua_lock(L);
+  api_checknelems(L, 1);
+  tbl = index2addr(L, idx);
+  api_check(L, ttistable(tbl), "table expected");
+  setpvalue(&key, cast(void *, p));
+  cell = luaH_set(L, hvalue(tbl), &key);  /* slot for the key */
+  setobj2t(L, cell, L->top - 1);
+  luaC_barrierback(L, gcvalue(tbl), L->top - 1);
+  L->top--;
+  lua_unlock(L);
+}
+
+
+/*
+** Pops the value on top of the stack (a table or nil) and installs it
+** as the metatable of the value at 'objindex'. Tables and full
+** userdata store it in the object itself; every other type stores it
+** in the per-type slot of the global state. Always returns 1.
+*/
+LUA_API int lua_setmetatable (lua_State *L, int objindex) {
+  TValue *obj;
+  Table *mt = NULL;  /* stays NULL when the new metatable is nil */
+  lua_lock(L);
+  api_checknelems(L, 1);
+  obj = index2addr(L, objindex);
+  if (!ttisnil(L->top - 1)) {
+    api_check(L, ttistable(L->top - 1), "table expected");
+    mt = hvalue(L->top - 1);
+  }
+  switch (ttypenv(obj)) {
+    case LUA_TTABLE: {
+      hvalue(obj)->metatable = mt;
+      if (mt != NULL) {
+        luaC_objbarrierback(L, gcvalue(obj), mt);
+        luaC_checkfinalizer(L, gcvalue(obj), mt);
+      }
+      break;
+    }
+    case LUA_TUSERDATA: {
+      uvalue(obj)->metatable = mt;
+      if (mt != NULL) {
+        luaC_objbarrier(L, rawuvalue(obj), mt);
+        luaC_checkfinalizer(L, gcvalue(obj), mt);
+      }
+      break;
+    }
+    default: {  /* shared metatable for this basic type */
+      G(L)->mt[ttypenv(obj)] = mt;
+      break;
+    }
+  }
+  L->top--;
+  lua_unlock(L);
+  return 1;
+}
+
+
+/*
+** Pops the value on top of the stack (a table or nil) and stores it
+** in the 'env' field of the userdata at 'idx' (NULL for nil).
+*/
+LUA_API void lua_setuservalue (lua_State *L, int idx) {
+  StkId ud;
+  lua_lock(L);
+  api_checknelems(L, 1);
+  ud = index2addr(L, idx);
+  api_check(L, ttisuserdata(ud), "userdata expected");
+  if (!ttisnil(L->top - 1)) {
+    api_check(L, ttistable(L->top - 1), "table expected");
+    uvalue(ud)->env = hvalue(L->top - 1);
+    luaC_objbarrier(L, gcvalue(ud), hvalue(L->top - 1));
+  }
+  else
+    uvalue(ud)->env = NULL;
+  L->top--;
+  lua_unlock(L);
+}
+
+
+/*
+** `load' and `call' functions (run Lua code)
+*/
+
+
+/*
+** API check used before a call: unless 'nr' is LUA_MULTRET, the space
+** left in the current frame (L->ci->top - L->top) must be at least the
+** net stack growth of the call, i.e. 'nr' results minus 'na' arguments.
+*/
+#define checkresults(L,na,nr) \
+     api_check(L, (nr) == LUA_MULTRET || (L->ci->top - L->top >= (nr) - (na)), \
+	"results from function overflow current stack size")
+
+
+/*
+** When the current call info is flagged CIST_YIELDED, stores its saved
+** context in '*ctx' (if 'ctx' is not NULL) and returns its saved
+** status; otherwise returns LUA_OK and leaves '*ctx' alone.
+*/
+LUA_API int lua_getctx (lua_State *L, int *ctx) {
+  CallInfo *ci = L->ci;
+  if (!(ci->callstatus & CIST_YIELDED))
+    return LUA_OK;
+  if (ctx) *ctx = ci->u.c.ctx;
+  return ci->u.c.status;
+}
+
+
+/*
+** Calls the function sitting below its 'nargs' arguments on the
+** stack, asking for 'nresults' results. When a continuation 'k' is
+** given and the thread may yield (L->nny == 0), 'k' and 'ctx' are
+** saved in the current call info and the call is made with yields
+** allowed.
+*/
+LUA_API void lua_callk (lua_State *L, int nargs, int nresults, int ctx,
+                        lua_CFunction k) {
+  StkId func;
+  int allowyield = 0;
+  lua_lock(L);
+  api_check(L, k == NULL || !isLua(L->ci),
+    "cannot use continuations inside hooks");
+  api_checknelems(L, nargs+1);
+  api_check(L, L->status == LUA_OK, "cannot do calls on non-normal thread");
+  checkresults(L, nargs, nresults);
+  func = L->top - (nargs+1);
+  if (k != NULL && L->nny == 0) {  /* need to prepare continuation? */
+    L->ci->u.c.k = k;  /* save continuation */
+    L->ci->u.c.ctx = ctx;  /* save context */
+    allowyield = 1;
+  }
+  luaD_call(L, func, nresults, allowyield);  /* do the call */
+  adjustresults(L, nresults);
+  lua_unlock(L);
+}
+
+
+
+/*
+** Execute a protected call.
+*/
+struct CallS {  /* data to `f_call' */
+  StkId func;  /* stack slot of the function to be called */
+  int nresults;  /* number of results requested from the call */
+};
+
+
+/*
+** Callback for 'luaD_pcall': unpacks the 'struct CallS' passed in 'ud'
+** and performs the call with yields disallowed.
+*/
+static void f_call (lua_State *L, void *ud) {
+  struct CallS *call = cast(struct CallS *, ud);
+  StkId func = call->func;
+  luaD_call(L, func, call->nresults, 0);
+}
+
+
+
+/*
+** Protected call of the function sitting below its 'nargs' arguments.
+** 'errfunc' is the stack index of a message handler (0 for none).
+** Without a continuation, or when the thread cannot yield, the call
+** runs under 'luaD_pcall' and its status is returned. Otherwise the
+** continuation data and the state needed for error recovery are saved
+** in the current call info, the call is made with yields allowed, and
+** LUA_OK is returned once it completes.
+*/
+LUA_API int lua_pcallk (lua_State *L, int nargs, int nresults, int errfunc,
+                        int ctx, lua_CFunction k) {
+  struct CallS c;
+  int status;
+  ptrdiff_t func;  /* saved stack offset of the message handler */
+  lua_lock(L);
+  api_check(L, k == NULL || !isLua(L->ci),
+    "cannot use continuations inside hooks");
+  api_checknelems(L, nargs+1);
+  api_check(L, L->status == LUA_OK, "cannot do calls on non-normal thread");
+  checkresults(L, nargs, nresults);
+  if (errfunc == 0)
+    func = 0;
+  else {
+    StkId o = index2addr(L, errfunc);
+    api_checkstackindex(L, errfunc, o);
+    func = savestack(L, o);
+  }
+  c.func = L->top - (nargs+1);  /* function to be called */
+  if (k == NULL || L->nny > 0) {  /* no continuation or no yieldable? */
+    c.nresults = nresults;  /* do a 'conventional' protected call */
+    status = luaD_pcall(L, f_call, &c, savestack(L, c.func), func);
+  }
+  else {  /* prepare continuation (call is already protected by 'resume') */
+    CallInfo *ci = L->ci;
+    ci->u.c.k = k;  /* save continuation */
+    ci->u.c.ctx = ctx;  /* save context */
+    /* save information for error recovery */
+    ci->extra = savestack(L, c.func);
+    ci->u.c.old_allowhook = L->allowhook;
+    ci->u.c.old_errfunc = L->errfunc;
+    L->errfunc = func;
+    /* mark that function may do error recovery */
+    ci->callstatus |= CIST_YPCALL;
+    luaD_call(L, c.func, nresults, 1);  /* do the call */
+    ci->callstatus &= ~CIST_YPCALL;
+    L->errfunc = ci->u.c.old_errfunc;  /* restore previous handler */
+    status = LUA_OK;  /* if it is here, there were no errors */
+  }
+  adjustresults(L, nresults);
+  lua_unlock(L);
+  return status;
+}
+
+
+/*
+** Parses a chunk read through 'reader'/'data' with
+** 'luaD_protectedparser' and returns its status. A NULL 'chunkname'
+** is replaced by "?". On success, if the new function on top of the
+** stack has exactly one upvalue, that upvalue is set to the globals
+** table taken from the registry.
+*/
+LUA_API int lua_load (lua_State *L, lua_Reader reader, void *data,
+                      const char *chunkname, const char *mode) {
+  ZIO z;
+  int status;
+  lua_lock(L);
+  if (chunkname == NULL)
+    chunkname = "?";
+  luaZ_init(L, &z, reader, data);
+  status = luaD_protectedparser(L, &z, chunkname, mode);
+  if (status == LUA_OK) {  /* no errors? */
+    LClosure *fn = clLvalue(L->top - 1);  /* get newly created function */
+    if (fn->nupvalues == 1) {  /* does it have one upvalue? */
+      /* get global table from registry */
+      const TValue *globals =
+        luaH_getint(hvalue(&G(L)->l_registry), LUA_RIDX_GLOBALS);
+      /* set global table as 1st upvalue of 'fn' (may be LUA_ENV) */
+      setobj(L, fn->upvals[0]->v, globals);
+      luaC_barrier(L, fn->upvals[0], globals);
+    }
+  }
+  lua_unlock(L);
+  return status;
+}
+
+
+/*
+** Dumps the Lua function on top of the stack through 'writer'/'data'
+** using 'luaU_dump' and returns its status. Returns 1 when the value
+** on top is not a Lua function.
+*/
+LUA_API int lua_dump (lua_State *L, lua_Writer writer, void *data) {
+  int status = 1;  /* result when there is no Lua function to dump */
+  TValue *fn;
+  lua_lock(L);
+  api_checknelems(L, 1);
+  fn = L->top - 1;
+  if (isLfunction(fn))
+    status = luaU_dump(L, getproto(fn), writer, data, 0);
+  lua_unlock(L);
+  return status;
+}
+
+
+/*
+** Returns the 'status' field of thread 'L'.
+*/
+LUA_API int lua_status (lua_State *L) {
+  return L->status;
+}
+
+
+/*
+** Garbage-collection function
+*/
+
+/*
+** Controls the garbage collector according to 'what':
+**   LUA_GCSTOP / LUA_GCRESTART  clear / set the 'gcrunning' flag
+**                               (restart also resets the debt to 0);
+**   LUA_GCCOLLECT               runs a full collection;
+**   LUA_GCCOUNT / LUA_GCCOUNTB  return total bytes in use divided by
+**                               1024 / its remainder modulo 1024;
+**   LUA_GCSTEP                  forces one collector step ('data' is a
+**                               size in Kbytes in incremental mode);
+**   LUA_GCSETPAUSE, LUA_GCSETMAJORINC, LUA_GCSETSTEPMUL
+**                               store 'data' in the matching parameter
+**                               and return its previous value;
+**   LUA_GCISRUNNING             returns the 'gcrunning' flag;
+**   LUA_GCGEN / LUA_GCINC       switch collector mode.
+** Any other option returns -1; options with no specific result
+** return 0.
+*/
+LUA_API int lua_gc (lua_State *L, int what, int data) {
+  int res = 0;
+  global_State *g;
+  lua_lock(L);
+  g = G(L);
+  switch (what) {
+    case LUA_GCSTOP: {
+      g->gcrunning = 0;
+      break;
+    }
+    case LUA_GCRESTART: {
+      luaE_setdebt(g, 0);
+      g->gcrunning = 1;
+      break;
+    }
+    case LUA_GCCOLLECT: {
+      luaC_fullgc(L, 0);
+      break;
+    }
+    case LUA_GCCOUNT: {
+      /* GC values are expressed in Kbytes: #bytes/2^10 */
+      res = cast_int(gettotalbytes(g) >> 10);
+      break;
+    }
+    case LUA_GCCOUNTB: {
+      /* remainder of the byte count modulo 2^10 */
+      res = cast_int(gettotalbytes(g) & 0x3ff);
+      break;
+    }
+    case LUA_GCSTEP: {
+      if (g->gckind == KGC_GEN) {  /* generational mode? */
+        res = (g->GCestimate == 0);  /* true if it will do major collection */
+        luaC_forcestep(L);  /* do a single step */
+      }
+      else {
+       /* convert 'data' from Kbytes to bytes, less one basic step */
+       lu_mem debt = cast(lu_mem, data) * 1024 - GCSTEPSIZE;
+       if (g->gcrunning)
+         debt += g->GCdebt;  /* include current debt */
+       luaE_setdebt(g, debt);
+       luaC_forcestep(L);
+       if (g->gcstate == GCSpause)  /* end of cycle? */
+         res = 1;  /* signal it */
+      }
+      break;
+    }
+    case LUA_GCSETPAUSE: {
+      res = g->gcpause;  /* previous value is the result */
+      g->gcpause = data;
+      break;
+    }
+    case LUA_GCSETMAJORINC: {
+      res = g->gcmajorinc;  /* previous value is the result */
+      g->gcmajorinc = data;
+      break;
+    }
+    case LUA_GCSETSTEPMUL: {
+      res = g->gcstepmul;  /* previous value is the result */
+      g->gcstepmul = data;
+      break;
+    }
+    case LUA_GCISRUNNING: {
+      res = g->gcrunning;
+      break;
+    }
+    case LUA_GCGEN: {  /* change collector to generational mode */
+      luaC_changemode(L, KGC_GEN);
+      break;
+    }
+    case LUA_GCINC: {  /* change collector to incremental mode */
+      luaC_changemode(L, KGC_NORMAL);
+      break;
+    }
+    default: res = -1;  /* invalid option */
+  }
+  lua_unlock(L);
+  return res;
+}
+
+
+
+/*
+** miscellaneous functions
+*/
+
+
+/*
+** Raises an error through 'luaG_errormsg'; one value (the error
+** object) must be on top of the stack. The 'return' exists only to
+** satisfy the compiler, as the comments below explain.
+*/
+LUA_API int lua_error (lua_State *L) {
+  lua_lock(L);
+  api_checknelems(L, 1);
+  luaG_errormsg(L);
+  /* code unreachable; will unlock when control actually leaves the kernel */
+  return 0;  /* to avoid warnings */
+}
+
+
+/*
+** Advances a traversal of the table at 'idx' using the key on top of
+** the stack. When 'luaH_next' reports another entry the stack grows
+** by one slot and a non-zero value is returned; otherwise the key is
+** removed and 0 is returned.
+*/
+LUA_API int lua_next (lua_State *L, int idx) {
+  StkId tbl;
+  int more;
+  lua_lock(L);
+  tbl = index2addr(L, idx);
+  api_check(L, ttistable(tbl), "table expected");
+  more = luaH_next(L, hvalue(tbl), L->top - 1);
+  if (!more)  /* no more elements */
+    L->top -= 1;  /* remove key */
+  else {
+    api_incr_top(L);
+  }
+  lua_unlock(L);
+  return more;
+}
+
+
+/*
+** Concatenates the 'n' values on top of the stack with 'luaV_concat'.
+** For 'n' == 0 an empty string is pushed; for 'n' == 1 the stack is
+** left as it is.
+*/
+LUA_API void lua_concat (lua_State *L, int n) {
+  lua_lock(L);
+  api_checknelems(L, n);
+  if (n == 0) {  /* push empty string */
+    setsvalue2s(L, L->top, luaS_newlstr(L, "", 0));
+    api_incr_top(L);
+  }
+  else if (n >= 2) {
+    luaC_checkGC(L);
+    luaV_concat(L, n);
+  }
+  /* else n == 1; nothing to do */
+  lua_unlock(L);
+}
+
+
+/*
+** Pushes the length of the value at 'idx', as computed by
+** 'luaV_objlen'.
+*/
+LUA_API void lua_len (lua_State *L, int idx) {
+  StkId obj;
+  lua_lock(L);
+  obj = index2addr(L, idx);
+  luaV_objlen(L, L->top, obj);  /* result goes into the free top slot */
+  api_incr_top(L);
+  lua_unlock(L);
+}
+
+
+/*
+** Returns the allocation function stored in the global state. When
+** 'ud' is not NULL, '*ud' receives the allocator's user data.
+*/
+LUA_API lua_Alloc lua_getallocf (lua_State *L, void **ud) {
+  lua_Alloc allocf;
+  lua_lock(L);
+  if (ud != NULL)
+    *ud = G(L)->ud;
+  allocf = G(L)->frealloc;
+  lua_unlock(L);
+  return allocf;
+}
+
+
+/*
+** Replaces the allocation function and its user data in the global
+** state with 'f' and 'ud'.
+*/
+LUA_API void lua_setallocf (lua_State *L, lua_Alloc f, void *ud) {
+  global_State *g;
+  lua_lock(L);
+  g = G(L);
+  g->ud = ud;
+  g->frealloc = f;
+  lua_unlock(L);
+}
+
+
+/*
+** Creates a full userdata of 'size' bytes, pushes it onto the stack
+** and returns the address right after its header.
+*/
+LUA_API void *lua_newuserdata (lua_State *L, size_t size) {
+  Udata *ud;
+  lua_lock(L);
+  luaC_checkGC(L);
+  ud = luaS_newudata(L, size, NULL);
+  setuvalue(L, L->top, ud);
+  api_incr_top(L);
+  lua_unlock(L);
+  return ud + 1;  /* skip the header */
+}
+
+
+
+/*
+** Locates upvalue number 'n' (1-based) of the closure 'fi'. On success
+** stores the address of its value in '*val' and, when 'owner' is not
+** NULL, the object holding that value in '*owner', then returns the
+** upvalue name ("" for C closures and for unnamed upvalues). Returns
+** NULL when 'fi' is not a closure or 'n' is out of range.
+*/
+static const char *aux_upvalue (StkId fi, int n, TValue **val,
+                                GCObject **owner) {
+  int tag = ttype(fi);
+  if (tag == LUA_TCCL) {  /* C closure */
+    CClosure *cc = clCvalue(fi);
+    if (n < 1 || n > cc->nupvalues) return NULL;
+    *val = &cc->upvalue[n-1];
+    if (owner) *owner = obj2gco(cc);
+    return "";
+  }
+  if (tag == LUA_TLCL) {  /* Lua closure */
+    LClosure *lc = clLvalue(fi);
+    Proto *p = lc->p;
+    TString *name;
+    if (n < 1 || n > p->sizeupvalues) return NULL;
+    *val = lc->upvals[n-1]->v;
+    if (owner) *owner = obj2gco(lc->upvals[n - 1]);
+    name = p->upvalues[n-1].name;
+    if (name == NULL)
+      return "";
+    return getstr(name);
+  }
+  return NULL;  /* not a closure */
+}
+
+
+/*
+** Pushes upvalue 'n' of the closure at 'funcindex' and returns its
+** name, as reported by 'aux_upvalue'. Pushes nothing and returns NULL
+** when there is no such upvalue.
+*/
+LUA_API const char *lua_getupvalue (lua_State *L, int funcindex, int n) {
+  TValue *val = NULL;  /* to avoid warnings */
+  const char *name;
+  lua_lock(L);
+  name = aux_upvalue(index2addr(L, funcindex), n, &val, NULL);
+  if (name != NULL) {
+    setobj2s(L, L->top, val);
+    api_incr_top(L);
+  }
+  lua_unlock(L);
+  return name;
+}
+
+
+/*
+** Pops the value on top of the stack into upvalue 'n' of the closure
+** at 'funcindex' and returns the upvalue name. When there is no such
+** upvalue nothing is popped and NULL is returned.
+*/
+LUA_API const char *lua_setupvalue (lua_State *L, int funcindex, int n) {
+  GCObject *owner = NULL;  /* to avoid warnings */
+  TValue *val = NULL;  /* to avoid warnings */
+  const char *name;
+  StkId fi;
+  lua_lock(L);
+  fi = index2addr(L, funcindex);
+  api_checknelems(L, 1);
+  name = aux_upvalue(fi, n, &val, &owner);
+  if (name != NULL) {
+    L->top--;  /* top now addresses the value being stored */
+    setobj(L, val, L->top);
+    luaC_barrier(L, owner, L->top);
+  }
+  lua_unlock(L);
+  return name;
+}
+
+
+/*
+** Returns the address of the slot holding upvalue 'n' (1-based) of
+** the Lua closure at index 'fidx'. When 'pf' is not NULL, '*pf'
+** receives the closure itself.
+*/
+static UpVal **getupvalref (lua_State *L, int fidx, int n, LClosure **pf) {
+  StkId fi = index2addr(L, fidx);
+  LClosure *cl;
+  api_check(L, ttisLclosure(fi), "Lua function expected");
+  cl = clLvalue(fi);
+  api_check(L, (1 <= n && n <= cl->p->sizeupvalues), "invalid upvalue index");
+  if (pf != NULL)
+    *pf = cl;
+  return &cl->upvals[n - 1];  /* get its upvalue pointer */
+}
+
+
+/*
+** Returns a pointer identifying upvalue 'n' of the closure at 'fidx':
+** the UpVal object for a Lua closure, or the address of the upvalue
+** slot for a C closure. Returns NULL for any other value.
+*/
+LUA_API void *lua_upvalueid (lua_State *L, int fidx, int n) {
+  StkId fi = index2addr(L, fidx);
+  int tag = ttype(fi);
+  if (tag == LUA_TLCL) {  /* lua closure */
+    return *getupvalref(L, fidx, n, NULL);
+  }
+  if (tag == LUA_TCCL) {  /* C closure */
+    CClosure *cc = clCvalue(fi);
+    api_check(L, 1 <= n && n <= cc->nupvalues, "invalid upvalue index");
+    return &cc->upvalue[n - 1];
+  }
+  api_check(L, 0, "closure expected");
+  return NULL;
+}
+
+
+/*
+** Makes upvalue 'n1' of the Lua closure at 'fidx1' refer to the same
+** UpVal object as upvalue 'n2' of the Lua closure at 'fidx2'.
+*/
+LUA_API void lua_upvaluejoin (lua_State *L, int fidx1, int n1,
+                                            int fidx2, int n2) {
+  LClosure *f1;
+  UpVal **up1 = getupvalref(L, fidx1, n1, &f1);
+  UpVal **up2 = getupvalref(L, fidx2, n2, NULL);
+  *up1 = *up2;  /* share the upvalue */
+  luaC_objbarrier(L, f1, *up2);  /* 'f1' now references '*up2' */
+}
+
diff --git a/ext/lua/src/lauxlib.c b/ext/lua/src/lauxlib.c
new file mode 100644
index 0000000..2e989d6
--- /dev/null
+++ b/ext/lua/src/lauxlib.c
@@ -0,0 +1,959 @@
+/*
+** $Id: lauxlib.c,v 1.248 2013/03/21 13:54:57 roberto Exp $
+** Auxiliary functions for building Lua libraries
+** See Copyright Notice in lua.h
+*/
+
+
+#include <errno.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+
+/* This file uses only the official API of Lua.
+** Any function declared here could be written as an application function.
+*/
+
+#define lauxlib_c
+#define LUA_LIB
+
+#include "lua.h"
+
+#include "lauxlib.h"
+
+
+/*
+** {======================================================
+** Traceback
+** =======================================================
+*/
+
+
+#define LEVELS1	12	/* levels shown before the "..." in a long traceback */
+#define LEVELS2	10	/* levels shown after the "..." in a long traceback */
+
+
+
+/*
+** Searches the table at index -1 for the object at index 'objidx',
+** looking through string keys only and descending at most 'level'
+** tables deep. When the object is found, returns 1 and leaves a
+** (possibly dotted) name for it on top of the stack; otherwise
+** returns 0.
+*/
+static int findfield (lua_State *L, int objidx, int level) {
+  if (level == 0 || !lua_istable(L, -1))
+    return 0;  /* not found */
+  lua_pushnil(L);  /* start 'next' loop */
+  while (lua_next(L, -2)) {  /* for each pair in table */
+    if (lua_type(L, -2) != LUA_TSTRING) {  /* ignore non-string keys */
+      lua_pop(L, 1);  /* remove value */
+      continue;
+    }
+    if (lua_rawequal(L, objidx, -1)) {  /* found object? */
+      lua_pop(L, 1);  /* remove value (but keep name) */
+      return 1;
+    }
+    if (findfield(L, objidx, level - 1)) {  /* try recursively */
+      lua_remove(L, -2);  /* remove table (but keep name) */
+      lua_pushliteral(L, ".");
+      lua_insert(L, -2);  /* place '.' between the two names */
+      lua_concat(L, 3);
+      return 1;
+    }
+    lua_pop(L, 1);  /* remove value */
+  }
+  return 0;  /* not found */
+}
+
+
+/*
+** Looks for the function described by 'ar' in the global table (two
+** levels deep, via 'findfield'). On success leaves its name on top of
+** the stack and returns 1; otherwise restores the stack and
+** returns 0.
+*/
+static int pushglobalfuncname (lua_State *L, lua_Debug *ar) {
+  int base = lua_gettop(L);
+  lua_getinfo(L, "f", ar);  /* push function */
+  lua_pushglobaltable(L);
+  if (!findfield(L, base + 1, 2)) {
+    lua_settop(L, base);  /* remove function and global table */
+    return 0;
+  }
+  lua_copy(L, -1, base + 1);  /* move name to proper place */
+  lua_pop(L, 2);  /* remove pushed values */
+  return 1;
+}
+
+
+/*
+** Pushes a human-readable description of the function described by
+** 'ar': its name when one is known, "main chunk", a global name found
+** for a C function (or "?"), or its source location.
+*/
+static void pushfuncname (lua_State *L, lua_Debug *ar) {
+  if (*ar->namewhat != '\0') {  /* is there a name? */
+    lua_pushfstring(L, "function " LUA_QS, ar->name);
+    return;
+  }
+  if (*ar->what == 'm') {  /* main? */
+    lua_pushliteral(L, "main chunk");
+    return;
+  }
+  if (*ar->what != 'C') {  /* Lua function: report where it was defined */
+    lua_pushfstring(L, "function <%s:%d>", ar->short_src, ar->linedefined);
+    return;
+  }
+  if (!pushglobalfuncname(L, ar)) {
+    lua_pushliteral(L, "?");
+    return;
+  }
+  lua_pushfstring(L, "function " LUA_QS, lua_tostring(L, -1));
+  lua_remove(L, -2);  /* remove name */
+}
+
+
+/*
+** Returns the number of the last level for which 'lua_getstack'
+** succeeds on 'L': doubles an upper bound until the probe fails, then
+** binary-searches between the last good level and that bound.
+*/
+static int countlevels (lua_State *L) {
+  lua_Debug ar;
+  int lo = 1, hi = 1;
+  /* find an upper bound */
+  for (; lua_getstack(L, hi, &ar); hi *= 2)
+    lo = hi;
+  /* do a binary search */
+  while (lo < hi) {
+    int mid = (lo + hi)/2;
+    if (!lua_getstack(L, mid, &ar))
+      hi = mid;
+    else
+      lo = mid + 1;
+  }
+  return hi - 1;
+}
+
+
+/*
+** Builds a traceback of thread 'L1', starting at 'level', and leaves
+** it as a single string on top of the stack of 'L'. When 'msg' is not
+** NULL it is placed, followed by a newline, before the traceback. If
+** 'L1' has more than LEVELS1 + LEVELS2 levels, the middle ones are
+** replaced by a "..." line.
+*/
+LUALIB_API void luaL_traceback (lua_State *L, lua_State *L1,
+                                const char *msg, int level) {
+  lua_Debug ar;
+  int top = lua_gettop(L);
+  int numlevels = countlevels(L1);
+  /* 'mark' is the level at which to elide; 0 means never elide */
+  int mark = (numlevels > LEVELS1 + LEVELS2) ? LEVELS1 : 0;
+  if (msg) lua_pushfstring(L, "%s\n", msg);
+  lua_pushliteral(L, "stack traceback:");
+  while (lua_getstack(L1, level++, &ar)) {
+    if (level == mark) {  /* too many levels? */
+      lua_pushliteral(L, "\n\t...");  /* add a '...' */
+      level = numlevels - LEVELS2;  /* and skip to last ones */
+    }
+    else {
+      lua_getinfo(L1, "Slnt", &ar);
+      lua_pushfstring(L, "\n\t%s:", ar.short_src);
+      if (ar.currentline > 0)
+        lua_pushfstring(L, "%d:", ar.currentline);
+      lua_pushliteral(L, " in ");
+      pushfuncname(L, &ar);
+      if (ar.istailcall)
+        lua_pushliteral(L, "\n\t(...tail calls...)");
+      /* fold everything pushed so far into one string */
+      lua_concat(L, lua_gettop(L) - top);
+    }
+  }
+  lua_concat(L, lua_gettop(L) - top);
+}
+
+/* }====================================================== */
+
+
+/*
+** {======================================================
+** Error-report functions
+** =======================================================
+*/
+
+/*
+** Raises a "bad argument" error for argument 'narg' of the running
+** function, with 'extramsg' as the explanation. For method calls the
+** implicit 'self' is not counted, and an error in 'self' itself gets
+** its own message. When the function has no known name, a global name
+** is looked up, falling back to "?".
+*/
+LUALIB_API int luaL_argerror (lua_State *L, int narg, const char *extramsg) {
+  lua_Debug ar;
+  if (!lua_getstack(L, 0, &ar))  /* no stack frame? */
+    return luaL_error(L, "bad argument #%d (%s)", narg, extramsg);
+  lua_getinfo(L, "n", &ar);
+  /* for methods, do not count `self'; is the error in `self' itself? */
+  if (strcmp(ar.namewhat, "method") == 0 && --narg == 0)
+    return luaL_error(L, "calling " LUA_QS " on bad self (%s)",
+                         ar.name, extramsg);
+  if (ar.name == NULL) {
+    if (pushglobalfuncname(L, &ar))
+      ar.name = lua_tostring(L, -1);
+    else
+      ar.name = "?";
+  }
+  return luaL_error(L, "bad argument #%d to " LUA_QS " (%s)",
+                        narg, ar.name, extramsg);
+}
+
+
+/*
+** Raises an argument error for 'narg' saying that 'tname' was
+** expected and reporting the type name actually found.
+*/
+static int typeerror (lua_State *L, int narg, const char *tname) {
+  const char *got = luaL_typename(L, narg);
+  const char *msg = lua_pushfstring(L, "%s expected, got %s", tname, got);
+  return luaL_argerror(L, narg, msg);
+}
+
+
+/*
+** Raises a type error for argument 'narg', naming the expected type
+** from its tag 'tag'.
+*/
+static void tag_error (lua_State *L, int narg, int tag) {
+  const char *expected = lua_typename(L, tag);
+  typeerror(L, narg, expected);
+}
+
+
+/*
+** Pushes a string of the form "source:line: " for the function
+** running at stack level 'level', or an empty string when no line
+** information is available.
+*/
+LUALIB_API void luaL_where (lua_State *L, int level) {
+  lua_Debug ar;
+  int known = 0;
+  if (lua_getstack(L, level, &ar)) {  /* check function at level */
+    lua_getinfo(L, "Sl", &ar);  /* get info about it */
+    known = (ar.currentline > 0);  /* is there info? */
+  }
+  if (known)
+    lua_pushfstring(L, "%s:%d: ", ar.short_src, ar.currentline);
+  else
+    lua_pushliteral(L, "");  /* no information available... */
+}
+
+
+/*
+** Raises an error whose message is the position reported by
+** 'luaL_where' for level 1 followed by 'fmt' formatted with the
+** trailing arguments. Returns the result of 'lua_error'.
+*/
+LUALIB_API int luaL_error (lua_State *L, const char *fmt, ...) {
+  va_list argp;
+  va_start(argp, fmt);
+  luaL_where(L, 1);  /* push position prefix */
+  lua_pushvfstring(L, fmt, argp);  /* push formatted message */
+  va_end(argp);
+  lua_concat(L, 2);  /* prefix .. message */
+  return lua_error(L);
+}
+
+
+/*
+** Pushes the result values for a file operation. When 'stat' is
+** non-zero pushes true and returns 1. Otherwise pushes nil, a message
+** built from 'strerror' of the saved 'errno' (prefixed by 'fname' when
+** given) and the error number, and returns 3.
+*/
+LUALIB_API int luaL_fileresult (lua_State *L, int stat, const char *fname) {
+  int en = errno;  /* calls to Lua API may change this value */
+  if (stat) {
+    lua_pushboolean(L, 1);
+    return 1;
+  }
+  lua_pushnil(L);
+  if (fname == NULL)
+    lua_pushstring(L, strerror(en));
+  else
+    lua_pushfstring(L, "%s: %s", fname, strerror(en));
+  lua_pushinteger(L, en);
+  return 3;
+}
+
+
+#if !defined(inspectstat)	/* { */
+
+#if defined(LUA_USE_POSIX)
+
+#include <sys/wait.h>
+
+/*
+** Use the wait-status macros to interpret a 'pclose' return status:
+** after a normal exit 'stat' becomes the exit status; after
+** termination by a signal 'stat' becomes the signal number and 'what'
+** is set to "signal".
+*/
+#define inspectstat(stat,what)  \
+   if (WIFEXITED(stat)) { stat = WEXITSTATUS(stat); } \
+   else if (WIFSIGNALED(stat)) { stat = WTERMSIG(stat); what = "signal"; }
+
+#else
+
+#define inspectstat(stat,what)  /* no op */
+
+#endif
+
+#endif				/* } */
+
+
+/*
+** Pushes the result values for a process-execution status 'stat'.
+** A 'stat' of -1 is reported through 'luaL_fileresult'. Otherwise,
+** after 'inspectstat' has interpreted the status, pushes true (for an
+** "exit" with status 0) or nil, then the termination kind and the
+** code, and returns 3.
+*/
+LUALIB_API int luaL_execresult (lua_State *L, int stat) {
+  const char *what = "exit";  /* type of termination */
+  if (stat == -1)  /* error? */
+    return luaL_fileresult(L, 0, NULL);
+  inspectstat(stat, what);  /* interpret result */
+  if (stat != 0 || *what != 'e')  /* unsuccessful termination? */
+    lua_pushnil(L);
+  else
+    lua_pushboolean(L, 1);
+  lua_pushstring(L, what);
+  lua_pushinteger(L, stat);
+  return 3;  /* return true/nil,what,code */
+}
+
+/* }====================================================== */
+
+
+/*
+** {======================================================
+** Userdata's metatable manipulation
+** =======================================================
+*/
+
+/*
+** Ensures the registry holds a metatable under 'tname' and leaves it
+** on top of the stack. Returns 1 when a new table was created and
+** registered, 0 when the name was already in use.
+*/
+LUALIB_API int luaL_newmetatable (lua_State *L, const char *tname) {
+  luaL_getmetatable(L, tname);  /* try to get metatable */
+  if (lua_isnil(L, -1)) {  /* name still free? */
+    lua_pop(L, 1);
+    lua_newtable(L);  /* create metatable */
+    lua_pushvalue(L, -1);
+    lua_setfield(L, LUA_REGISTRYINDEX, tname);  /* registry.name = metatable */
+    return 1;
+  }
+  return 0;  /* leave previous value on top, but return 0 */
+}
+
+/* Give the value on top of the stack the metatable stored under 'tname'. */
+LUALIB_API void luaL_setmetatable (lua_State *L, const char *tname) {
+  luaL_getmetatable(L, tname);
+  lua_setmetatable(L, -2);  /* -2: the value that was on top at entry */
+}
+
+/* Return the userdata at 'ud' when its metatable is the one registered
+** under 'tname'; otherwise NULL. */
+LUALIB_API void *luaL_testudata (lua_State *L, int ud, const char *tname) {
+  void *block = lua_touserdata(L, ud);
+  int same;
+  if (block == NULL)  /* value is not a userdata */
+    return NULL;
+  if (!lua_getmetatable(L, ud))  /* userdata without a metatable */
+    return NULL;
+  luaL_getmetatable(L, tname);  /* get correct metatable */
+  same = lua_rawequal(L, -1, -2);
+  lua_pop(L, 2);  /* remove both metatables */
+  return same ? block : NULL;
+}
+
+/* Like luaL_testudata, but calls typeerror() when the test yields NULL. */
+LUALIB_API void *luaL_checkudata (lua_State *L, int ud, const char *tname) {
+  void *p = luaL_testudata(L, ud, tname);
+  if (p == NULL) typeerror(L, ud, tname);
+  return p;
+}
+
+/* }====================================================== */
+
+
+/*
+** {======================================================
+** Argument check functions
+** =======================================================
+*/
+/* Index in 'lst' of the option named by argument 'narg' (or by 'def'). */
+LUALIB_API int luaL_checkoption (lua_State *L, int narg, const char *def,
+                                 const char *const lst[]) {
+  const char *opt;
+  int pos = 0;
+  if (def != NULL) opt = luaL_optstring(L, narg, def);
+  else opt = luaL_checkstring(L, narg);
+  for (; lst[pos] != NULL; pos++)  /* 'lst' ends with a NULL entry */
+    if (strcmp(lst[pos], opt) == 0) return pos;
+  return luaL_argerror(L, narg,
+                       lua_pushfstring(L, "invalid option " LUA_QS, opt));
+}
+
+/* Ensure 'space' free stack slots (plus a reserve) or raise an error. */
+LUALIB_API void luaL_checkstack (lua_State *L, int space, const char *msg) {
+  /* keep LUA_MINSTACK extra slots to run error routines, if needed */
+  int wanted = space + LUA_MINSTACK;
+  if (lua_checkstack(L, wanted))
+    return;  /* enough room */
+  if (msg != NULL)
+    luaL_error(L, "stack overflow (%s)", msg);
+  else
+    luaL_error(L, "stack overflow");
+}
+
+/* Call tag_error() unless the value at 'narg' has type tag 't'. */
+LUALIB_API void luaL_checktype (lua_State *L, int narg, int t) {
+  if (lua_type(L, narg) != t)
+    tag_error(L, narg, t);
+}
+
+/* Report an argument error when no value at all (LUA_TNONE) is at 'narg'. */
+LUALIB_API void luaL_checkany (lua_State *L, int narg) {
+  if (lua_type(L, narg) == LUA_TNONE)
+    luaL_argerror(L, narg, "value expected");
+}
+
+/* String at 'narg' via lua_tolstring; tag_error() if that yields NULL. */
+LUALIB_API const char *luaL_checklstring (lua_State *L, int narg, size_t *len) {
+  const char *s = lua_tolstring(L, narg, len);
+  if (!s) tag_error(L, narg, LUA_TSTRING);
+  return s;
+}
+
+/* String at 'narg', or 'def' (and its length) if the argument is none/nil. */
+LUALIB_API const char *luaL_optlstring (lua_State *L, int narg,
+                                        const char *def, size_t *len) {
+  if (!lua_isnoneornil(L, narg))  /* argument given: it must be a string */
+    return luaL_checklstring(L, narg, len);
+  /* fall back to the default; a NULL default has length 0 */
+  if (len != NULL)
+    *len = (def != NULL) ? strlen(def) : 0;
+  return def;
+}
+
+/* Number at 'narg'; calls tag_error() when the conversion fails. */
+LUALIB_API lua_Number luaL_checknumber (lua_State *L, int narg) {
+  int isnum;
+  lua_Number d = lua_tonumberx(L, narg, &isnum);
+  if (!isnum)
+    tag_error(L, narg, LUA_TNUMBER);
+  return d;
+}
+
+/* Optional-argument form of luaL_checknumber, built with luaL_opt. */
+LUALIB_API lua_Number luaL_optnumber (lua_State *L, int narg, lua_Number def) {
+  return luaL_opt(L, luaL_checknumber, narg, def);
+}
+
+/* Integer at 'narg'; calls tag_error() when the conversion fails. */
+LUALIB_API lua_Integer luaL_checkinteger (lua_State *L, int narg) {
+  int isnum;
+  lua_Integer d = lua_tointegerx(L, narg, &isnum);
+  if (!isnum)
+    tag_error(L, narg, LUA_TNUMBER);
+  return d;
+}
+
+/* Unsigned integer at 'narg'; calls tag_error() when the conversion fails. */
+LUALIB_API lua_Unsigned luaL_checkunsigned (lua_State *L, int narg) {
+  int isnum;
+  lua_Unsigned d = lua_tounsignedx(L, narg, &isnum);
+  if (!isnum)
+    tag_error(L, narg, LUA_TNUMBER);
+  return d;
+}
+
+/* Optional-argument form of luaL_checkinteger, built with luaL_opt. */
+LUALIB_API lua_Integer luaL_optinteger (lua_State *L, int narg,
+                                                      lua_Integer def) {
+  return luaL_opt(L, luaL_checkinteger, narg, def);
+}
+
+/* Optional-argument form of luaL_checkunsigned, built with luaL_opt. */
+LUALIB_API lua_Unsigned luaL_optunsigned (lua_State *L, int narg,
+                                                        lua_Unsigned def) {
+  return luaL_opt(L, luaL_checkunsigned, narg, def);
+}
+
+/* }====================================================== */
+
+
+/*
+** {======================================================
+** Generic Buffer manipulation
+** =======================================================
+*/
+
+/*
+** true when the buffer no longer uses its embedded 'initb' array, i.e.
+** its contents live in a userdata (see luaL_prepbuffsize) on the stack
+*/
+#define buffonstack(B)	((B)->b != (B)->initb)
+
+
+/*
+** returns a pointer to a free area with at least 'sz' bytes (may grow 'B')
+*/
+LUALIB_API char *luaL_prepbuffsize (luaL_Buffer *B, size_t sz) {
+  lua_State *L = B->L;
+  if (B->size - B->n < sz) {  /* not enough space? */
+    char *newbuff;
+    size_t newsize = B->size * 2;  /* double buffer size */
+    if (newsize - B->n < sz)  /* not big enough? */
+      newsize = B->n + sz;
+    if (newsize < B->n || newsize - B->n < sz)  /* presumably size_t wrap */
+      luaL_error(L, "buffer too large");
+    /* create larger buffer */
+    newbuff = (char *)lua_newuserdata(L, newsize * sizeof(char));
+    /* move content to new buffer */
+    memcpy(newbuff, B->b, B->n * sizeof(char));
+    if (buffonstack(B))
+      lua_remove(L, -2);  /* remove old buffer */
+    B->b = newbuff;  /* from now on buffonstack(B) holds */
+    B->size = newsize;
+  }
+  return &B->b[B->n];  /* first unused byte */
+}
+
+/* Append the 'l' bytes starting at 's' to the buffer. */
+LUALIB_API void luaL_addlstring (luaL_Buffer *B, const char *s, size_t l) {
+  char *b = luaL_prepbuffsize(B, l);
+  memcpy(b, s, l * sizeof(char));
+  luaL_addsize(B, l);
+}
+
+/* Append the zero-terminated string 's' (without its terminator). */
+LUALIB_API void luaL_addstring (luaL_Buffer *B, const char *s) {
+  luaL_addlstring(B, s, strlen(s));
+}
+
+/* Push the buffer contents as a Lua string, dropping any userdata box. */
+LUALIB_API void luaL_pushresult (luaL_Buffer *B) {
+  lua_State *L = B->L;
+  lua_pushlstring(L, B->b, B->n);
+  if (buffonstack(B))
+    lua_remove(L, -2);  /* remove old buffer */
+}
+
+/* Account for 'sz' more bytes via luaL_addsize, then push the result. */
+LUALIB_API void luaL_pushresultsize (luaL_Buffer *B, size_t sz) {
+  luaL_addsize(B, sz);
+  luaL_pushresult(B);
+}
+
+/* Append the string form of the value on top, then remove that value. */
+LUALIB_API void luaL_addvalue (luaL_Buffer *B) {
+  lua_State *L = B->L;
+  size_t l;
+  const char *s = lua_tolstring(L, -1, &l);
+  if (buffonstack(B))
+    lua_insert(L, -2);  /* put value below buffer */
+  luaL_addlstring(B, s, l);  /* may move the buffer into a new userdata */
+  lua_remove(L, (buffonstack(B)) ? -2 : -1);  /* remove value */
+}
+
+/* Reset 'B' to an empty buffer backed by its embedded 'initb' array. */
+LUALIB_API void luaL_buffinit (lua_State *L, luaL_Buffer *B) {
+  B->L = L;
+  B->b = B->initb;
+  B->n = 0;
+  B->size = LUAL_BUFFERSIZE;
+}
+
+/* luaL_buffinit followed by luaL_prepbuffsize(B, sz). */
+LUALIB_API char *luaL_buffinitsize (lua_State *L, luaL_Buffer *B, size_t sz) {
+  luaL_buffinit(L, B);
+  return luaL_prepbuffsize(B, sz);
+}
+
+/* }====================================================== */
+
+
+/*
+** {======================================================
+** Reference system
+** =======================================================
+*/
+
+/* index, in the reference table, of the head of the list of freed refs */
+#define freelist	0
+
+/* Pop the value on top, store it in table 't' and return its integer key. */
+LUALIB_API int luaL_ref (lua_State *L, int t) {
+  int ref;
+  if (lua_isnil(L, -1)) {
+    lua_pop(L, 1);  /* remove from stack */
+    return LUA_REFNIL;  /* `nil' has a unique fixed reference */
+  }
+  t = lua_absindex(L, t);  /* pushes below must not shift a relative 't' */
+  lua_rawgeti(L, t, freelist);  /* get first free element */
+  ref = (int)lua_tointeger(L, -1);  /* ref = t[freelist] */
+  lua_pop(L, 1);  /* remove it from stack */
+  if (ref != 0) {  /* any free element? */
+    lua_rawgeti(L, t, ref);  /* remove it from list */
+    lua_rawseti(L, t, freelist);  /* (t[freelist] = t[ref]) */
+  }
+  else  /* no free elements */
+    ref = (int)lua_rawlen(L, t) + 1;  /* get a new reference */
+  lua_rawseti(L, t, ref);  /* t[ref] = value that was on top at entry */
+  return ref;
+}
+
+/* Release 'ref' in table 't' by linking its slot into the free list. */
+LUALIB_API void luaL_unref (lua_State *L, int t, int ref) {
+  if (ref >= 0) {  /* negative references are ignored */
+    t = lua_absindex(L, t);
+    lua_rawgeti(L, t, freelist);
+    lua_rawseti(L, t, ref);  /* t[ref] = t[freelist] */
+    lua_pushinteger(L, ref);
+    lua_rawseti(L, t, freelist);  /* t[freelist] = ref */
+  }
+}
+
+/* }====================================================== */
+
+
+/*
+** {======================================================
+** Load functions
+** =======================================================
+*/
+/* Reader state used by getF/luaL_loadfilex: a stream plus a block buffer. */
+typedef struct LoadF {
+  int n;  /* number of pre-read characters */
+  FILE *f;  /* file being read */
+  char buff[LUAL_BUFFERSIZE];  /* area for reading file */
+} LoadF;
+
+/* Reader over a LoadF: pre-read characters first, then blocks of the file. */
+static const char *getF (lua_State *L, void *ud, size_t *size) {
+  LoadF *src = (LoadF *)ud;
+  (void)L;  /* not used */
+  if (src->n <= 0) {  /* nothing pre-read: fetch a block from the file */
+    /* 'fread' can return > 0 *and* set the EOF flag. If next call to
+       'getF' called 'fread', it might still wait for user input.
+       The next check avoids this problem. */
+    if (feof(src->f)) return NULL;
+    *size = fread(src->buff, 1, sizeof(src->buff), src->f);  /* read block */
+  }
+  else {  /* hand over the characters already in the buffer */
+    *size = src->n;
+    src->n = 0;  /* no more pre-read characters */
+  }
+  return src->buff;
+}
+
+/* Swap the chunk name at 'fnameindex' for a "cannot <what> ..." message. */
+static int errfile (lua_State *L, const char *what, int fnameindex) {
+  const char *serr = strerror(errno);
+  const char *filename = lua_tostring(L, fnameindex) + 1;  /* skip '@'/'=' */
+  lua_pushfstring(L, "cannot %s %s: %s", what, filename, serr);
+  lua_remove(L, fnameindex);  /* the message stays on top */
+  return LUA_ERRFILE;
+}
+
+/* Skip an optional UTF-8 BOM; return the first character not part of it. */
+static int skipBOM (LoadF *lf) {
+  const char *p = "\xEF\xBB\xBF";  /* Utf8 BOM mark */
+  int c;
+  lf->n = 0;
+  do {
+    c = getc(lf->f);
+    if (c == EOF || c != *(const unsigned char *)p++) return c;  /* no BOM */
+    lf->buff[lf->n++] = c;  /* to be read by the parser */
+  } while (*p != '\0');
+  lf->n = 0;  /* prefix matched; discard it */
+  return getc(lf->f);  /* return next character */
+}
+
+
+/*
+** reads the first character of file 'f' and skips an optional BOM mark
+** in its beginning plus its first line if it starts with '#'. Returns
+** true if it skipped the first line.  In any case, '*cp' has the
+** first "valid" character of the file (after the optional BOM and
+** a first-line comment).
+*/
+static int skipcomment (LoadF *lf, int *cp) {
+  int ch = skipBOM(lf);
+  *cp = ch;
+  if (ch != '#')  /* no comment */
+    return 0;
+  /* first line is a comment (Unix exec. file): skip it */
+  while (ch != EOF && ch != '\n')
+    ch = getc(lf->f);
+  *cp = getc(lf->f);  /* skip end-of-line, if present */
+  return 1;  /* there was a comment */
+}
+
+/* Load (without running) the chunk in 'filename', or stdin if it is NULL. */
+LUALIB_API int luaL_loadfilex (lua_State *L, const char *filename,
+                                             const char *mode) {
+  LoadF lf;
+  int status, readstatus;
+  int c;
+  int fnameindex = lua_gettop(L) + 1;  /* index of filename on the stack */
+  if (filename == NULL) {
+    lua_pushliteral(L, "=stdin");  /* chunk name for standard input */
+    lf.f = stdin;
+  }
+  else {
+    lua_pushfstring(L, "@%s", filename);  /* chunk name for a real file */
+    lf.f = fopen(filename, "r");
+    if (lf.f == NULL) return errfile(L, "open", fnameindex);
+  }
+  if (skipcomment(&lf, &c))  /* read initial portion */
+    lf.buff[lf.n++] = '\n';  /* add line to correct line numbers */
+  if (c == LUA_SIGNATURE[0] && filename) {  /* binary file? */
+    lf.f = freopen(filename, "rb", lf.f);  /* reopen in binary mode */
+    if (lf.f == NULL) return errfile(L, "reopen", fnameindex);
+    skipcomment(&lf, &c);  /* re-read initial portion */
+  }
+  if (c != EOF)
+    lf.buff[lf.n++] = c;  /* 'c' is the first character of the stream */
+  status = lua_load(L, getF, &lf, lua_tostring(L, -1), mode);
+  readstatus = ferror(lf.f);  /* sampled before the stream is closed */
+  if (filename) fclose(lf.f);  /* close file (even in case of errors) */
+  if (readstatus) {
+    lua_settop(L, fnameindex);  /* ignore results from `lua_load' */
+    return errfile(L, "read", fnameindex);
+  }
+  lua_remove(L, fnameindex);  /* drop chunk name; lua_load result stays */
+  return status;
+}
+
+/* Reader state used by getS: a memory block and how much of it is left. */
+typedef struct LoadS {
+  const char *s;
+  size_t size;
+} LoadS;
+
+/* Reader for lua_load that hands out the whole LoadS block in one call. */
+static const char *getS (lua_State *L, void *ud, size_t *size) {
+  LoadS *ls = (LoadS *)ud;
+  (void)L;  /* not used */
+  if (ls->size == 0) return NULL;  /* block already delivered (or empty) */
+  *size = ls->size;
+  ls->size = 0;  /* next call reports end of input */
+  return ls->s;
+}
+
+/* Load the 'size' bytes at 'buff' as a chunk called 'name' via lua_load. */
+LUALIB_API int luaL_loadbufferx (lua_State *L, const char *buff, size_t size,
+                                 const char *name, const char *mode) {
+  LoadS ls;
+  ls.s = buff;
+  ls.size = size;
+  return lua_load(L, getS, &ls, name, mode);
+}
+
+/* Load the zero-terminated string 's'; the string doubles as chunk name. */
+LUALIB_API int luaL_loadstring (lua_State *L, const char *s) {
+  return luaL_loadbuffer(L, s, strlen(s), s);
+}
+
+/* }====================================================== */
+
+
+/* Push metatable(obj)[event] and return 1; push nothing and return 0 if absent. */
+LUALIB_API int luaL_getmetafield (lua_State *L, int obj, const char *event) {
+  if (!lua_getmetatable(L, obj))  /* no metatable? */
+    return 0;
+  lua_pushstring(L, event);
+  lua_rawget(L, -2);  /* raw access on the metatable */
+  if (lua_isnil(L, -1)) {
+    lua_pop(L, 2);  /* remove metatable and metafield */
+    return 0;
+  }
+  else {
+    lua_remove(L, -2);  /* remove only metatable */
+    return 1;
+  }
+}
+
+/* Call metafield 'event' of 'obj' with 'obj' as argument; 0 if no such field. */
+LUALIB_API int luaL_callmeta (lua_State *L, int obj, const char *event) {
+  obj = lua_absindex(L, obj);  /* stays valid after the pushes below */
+  if (!luaL_getmetafield(L, obj, event))  /* no metafield? */
+    return 0;
+  lua_pushvalue(L, obj);
+  lua_call(L, 1, 1);  /* the single result is left on the stack */
+  return 1;
+}
+
+/* Length of the value at 'idx' as an int; error if it is not a number. */
+LUALIB_API int luaL_len (lua_State *L, int idx) {
+  int l;
+  int isnum;
+  lua_len(L, idx);  /* pushes the length */
+  l = (int)lua_tointegerx(L, -1, &isnum);
+  if (!isnum)
+    luaL_error(L, "object length is not a number");
+  lua_pop(L, 1);  /* remove the length value */
+  return l;
+}
+
+/* Push a string form of the value at 'idx' and return it ('*len' is set). */
+LUALIB_API const char *luaL_tolstring (lua_State *L, int idx, size_t *len) {
+  if (!luaL_callmeta(L, idx, "__tostring")) {  /* no metafield? */
+    switch (lua_type(L, idx)) {
+      case LUA_TNUMBER:
+      case LUA_TSTRING:
+        lua_pushvalue(L, idx);  /* converted by lua_tolstring below */
+        break;
+      case LUA_TBOOLEAN:
+        lua_pushstring(L, (lua_toboolean(L, idx) ? "true" : "false"));
+        break;
+      case LUA_TNIL:
+        lua_pushliteral(L, "nil");
+        break;
+      default:  /* any other type: type name plus address */
+        lua_pushfstring(L, "%s: %p", luaL_typename(L, idx),
+                                            lua_topointer(L, idx));
+        break;
+    }
+  }
+  return lua_tolstring(L, -1, len);
+}
+
+
+/*
+** {======================================================
+** Compatibility with 5.1 module functions
+** =======================================================
+*/
+#if defined(LUA_COMPAT_MODULE)
+/* Walk (creating tables) the dotted path 'fname'; NULL on success. */
+static const char *luaL_findtable (lua_State *L, int idx,
+                                   const char *fname, int szhint) {
+  const char *e;
+  if (idx) lua_pushvalue(L, idx);  /* idx 0: start table is already on top */
+  do {
+    e = strchr(fname, '.');
+    if (e == NULL) e = fname + strlen(fname);  /* last component */
+    lua_pushlstring(L, fname, e - fname);
+    lua_rawget(L, -2);
+    if (lua_isnil(L, -1)) {  /* no such field? */
+      lua_pop(L, 1);  /* remove this nil */
+      lua_createtable(L, 0, (*e == '.' ? 1 : szhint)); /* new table for field */
+      lua_pushlstring(L, fname, e - fname);
+      lua_pushvalue(L, -2);
+      lua_settable(L, -4);  /* set new table into field */
+    }
+    else if (!lua_istable(L, -1)) {  /* field has a non-table value? */
+      lua_pop(L, 2);  /* remove table and value */
+      return fname;  /* return problematic part of the name */
+    }
+    lua_remove(L, -2);  /* remove previous table */
+    fname = e + 1;
+  } while (*e == '.');
+  return NULL;  /* the innermost table is on top */
+}
+
+
+/*
+** Count number of elements in a luaL_Reg list.
+*/
+static int libsize (const luaL_Reg *l) {
+  int count = 0;
+  while (l != NULL && l->name != NULL) { l++; count++; }
+  return count;
+}
+
+
+/*
+** Find or create a module table with a given name. The function
+** first looks at the _LOADED table and, if that fails, tries a
+** global variable with that name. In any case, leaves on the stack
+** the module table.
+*/
+LUALIB_API void luaL_pushmodule (lua_State *L, const char *modname,
+                                 int sizehint) {
+  luaL_findtable(L, LUA_REGISTRYINDEX, "_LOADED", 1);  /* get _LOADED table */
+  lua_getfield(L, -1, modname);  /* get _LOADED[modname] */
+  if (!lua_istable(L, -1)) {  /* not found? */
+    lua_pop(L, 1);  /* remove previous result */
+    /* try global variable (and create one if it does not exist) */
+    lua_pushglobaltable(L);
+    if (luaL_findtable(L, 0, modname, sizehint) != NULL)
+      luaL_error(L, "name conflict for module " LUA_QS, modname);
+    lua_pushvalue(L, -1);  /* copy consumed by lua_setfield below */
+    lua_setfield(L, -3, modname);  /* _LOADED[modname] = new table */
+  }
+  lua_remove(L, -2);  /* remove _LOADED table */
+}
+
+/* 5.1-style registration of list 'l' into module 'libname' (or table on stack). */
+LUALIB_API void luaL_openlib (lua_State *L, const char *libname,
+                               const luaL_Reg *l, int nup) {
+  luaL_checkversion(L);
+  if (libname) {
+    luaL_pushmodule(L, libname, libsize(l));  /* get/create library table */
+    lua_insert(L, -(nup + 1));  /* move library table to below upvalues */
+  }
+  if (l)
+    luaL_setfuncs(L, l, nup);
+  else
+    lua_pop(L, nup);  /* remove upvalues */
+}
+
+#endif
+/* }====================================================== */
+
+/*
+** set functions from list 'l' into table at top - 'nup'; each
+** function gets the 'nup' elements at the top as upvalues.
+** Returns with only the table on the stack.
+*/
+LUALIB_API void luaL_setfuncs (lua_State *L, const luaL_Reg *l, int nup) {
+  luaL_checkversion(L);
+  luaL_checkstack(L, nup, "too many upvalues");
+  for (; l->name != NULL; l++) {  /* fill the table with given functions */
+    int i;
+    for (i = 0; i < nup; i++)  /* copy upvalues to the top */
+      lua_pushvalue(L, -nup);  /* each push shifts -nup to the next upvalue */
+    lua_pushcclosure(L, l->func, nup);  /* closure with those upvalues */
+    lua_setfield(L, -(nup + 2), l->name);  /* table is below upvalues+closure */
+  }
+  lua_pop(L, nup);  /* remove upvalues */
+}
+
+
+/*
+** ensure that stack[idx][fname] has a table and push that table
+** onto the stack; returns 1 if it was already there, 0 if created
+*/
+LUALIB_API int luaL_getsubtable (lua_State *L, int idx, const char *fname) {
+  lua_getfield(L, idx, fname);
+  if (lua_istable(L, -1)) return 1;  /* table already there */
+  else {
+    lua_pop(L, 1);  /* remove previous result */
+    idx = lua_absindex(L, idx);  /* stays valid after the pushes below */
+    lua_newtable(L);
+    lua_pushvalue(L, -1);  /* copy to be left at top */
+    lua_setfield(L, idx, fname);  /* assign new table to field */
+    return 0;  /* false, because did not find table there */
+  }
+}
+
+
+/*
+** stripped-down 'require'. Calls 'openf' to open a module,
+** registers the result in the registry's "_LOADED" table and, if 'glb'
+** is true, also registers the result in the global table.
+** Leaves resulting module on the top.
+*/
+LUALIB_API void luaL_requiref (lua_State *L, const char *modname,
+                               lua_CFunction openf, int glb) {
+  lua_pushcfunction(L, openf);
+  lua_pushstring(L, modname);  /* argument to open function */
+  lua_call(L, 1, 1);  /* open module */
+  luaL_getsubtable(L, LUA_REGISTRYINDEX, "_LOADED");
+  lua_pushvalue(L, -2);  /* make copy of module (call result) */
+  lua_setfield(L, -2, modname);  /* _LOADED[modname] = module */
+  lua_pop(L, 1);  /* remove _LOADED table */
+  if (glb) {
+    lua_pushvalue(L, -1);  /* copy of 'mod' */
+    lua_setglobal(L, modname);  /* _G[modname] = module */
+  }
+}
+
+/* Push and return a copy of 's' with each occurrence of 'p' replaced by 'r'. */
+LUALIB_API const char *luaL_gsub (lua_State *L, const char *s, const char *p,
+                                                               const char *r) {
+  const char *wild;
+  size_t l = strlen(p);  /* 0 for an empty pattern, which is never matched */
+  luaL_Buffer b;
+  luaL_buffinit(L, &b);
+  while (l > 0 && (wild = strstr(s, p)) != NULL) {  /* l == 0 would not advance */
+    luaL_addlstring(&b, s, wild - s);  /* push prefix */
+    luaL_addstring(&b, r);  /* push replacement in place of pattern */
+    s = wild + l;  /* continue after `p' */
+  }
+  luaL_addstring(&b, s);  /* push last suffix */
+  luaL_pushresult(&b);
+  return lua_tostring(L, -1);
+}
+
+/* Default allocator: free() when 'nsize' is 0, realloc() otherwise. */
+static void *l_alloc (void *ud, void *ptr, size_t osize, size_t nsize) {
+  (void)ud;  /* not used */
+  (void)osize;  /* not used */
+  if (nsize != 0)  /* allocate or resize the block */
+    return realloc(ptr, nsize);
+  /* a zero size asks for the block to be released */
+  free(ptr);
+  return NULL;
+}
+
+/* Panic handler installed by luaL_newstate: reports the value on top. */
+static int panic (lua_State *L) {
+  luai_writestringerror("PANIC: unprotected error in call to Lua API (%s)\n",
+                   lua_tostring(L, -1));
+  return 0;  /* return to Lua to abort */
+}
+
+/* New state using l_alloc, with 'panic' installed; NULL if creation fails. */
+LUALIB_API lua_State *luaL_newstate (void) {
+  lua_State *L = lua_newstate(l_alloc, NULL);
+  if (L) lua_atpanic(L, &panic);
+  return L;
+}
+
+/* Check that the core in use matches 'ver' and converts numbers sanely. */
+LUALIB_API void luaL_checkversion_ (lua_State *L, lua_Number ver) {
+  const lua_Number *v = lua_version(L);
+  if (v != lua_version(NULL))  /* compared by address, not by value */
+    luaL_error(L, "multiple Lua VMs detected");
+  else if (*v != ver)
+    luaL_error(L, "version mismatch: app. needs %f, Lua core provides %f",
+                  ver, *v);
+  /* check conversions number -> integer types */
+  lua_pushnumber(L, -(lua_Number)0x1234);
+  if (lua_tointeger(L, -1) != -0x1234 ||
+      lua_tounsigned(L, -1) != (lua_Unsigned)-0x1234)
+    luaL_error(L, "bad conversion number->int;"
+                  " must recompile Lua with proper settings");
+  lua_pop(L, 1);  /* remove the probe number */
+}
+
diff --git a/ext/lua/src/lbaselib.c b/ext/lua/src/lbaselib.c
new file mode 100644
index 0000000..540e9a5
--- /dev/null
+++ b/ext/lua/src/lbaselib.c
@@ -0,0 +1,458 @@
+/*
+** $Id: lbaselib.c,v 1.276 2013/02/21 13:44:53 roberto Exp $
+** Basic library
+** See Copyright Notice in lua.h
+*/
+
+
+
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define lbaselib_c
+#define LUA_LIB
+
+#include "lua.h"
+
+#include "lauxlib.h"
+#include "lualib.h"
+
+/* print(...): write tostring() of every argument, separated by tabs. */
+static int luaB_print (lua_State *L) {
+  int n = lua_gettop(L);  /* number of arguments */
+  int i;
+  lua_getglobal(L, "tostring");  /* looked up once, reused for each argument */
+  for (i=1; i<=n; i++) {
+    const char *s;
+    size_t l;
+    lua_pushvalue(L, -1);  /* function to be called */
+    lua_pushvalue(L, i);   /* value to print */
+    lua_call(L, 1, 1);
+    s = lua_tolstring(L, -1, &l);  /* get result */
+    if (s == NULL)
+      return luaL_error(L,
+         LUA_QL("tostring") " must return a string to " LUA_QL("print"));
+    if (i>1) luai_writestring("\t", 1);
+    luai_writestring(s, l);
+    lua_pop(L, 1);  /* pop result */
+  }
+  luai_writeline();
+  return 0;
+}
+
+/* characters that luaB_tonumber skips around a numeral given with a base */
+#define SPACECHARS	" \f\n\r\t\v"
+/* tonumber(v [, base]): number for 'v', or nil if it cannot be converted. */
+static int luaB_tonumber (lua_State *L) {
+  if (lua_isnoneornil(L, 2)) {  /* standard conversion */
+    int isnum;
+    lua_Number n = lua_tonumberx(L, 1, &isnum);
+    if (isnum) {
+      lua_pushnumber(L, n);
+      return 1;
+    }  /* else not a number; must be something */
+    luaL_checkany(L, 1);
+  }
+  else {
+    size_t l;
+    const char *s = luaL_checklstring(L, 1, &l);
+    const char *e = s + l;  /* end point for 's' */
+    int base = luaL_checkint(L, 2);
+    int neg = 0;
+    luaL_argcheck(L, 2 <= base && base <= 36, 2, "base out of range");
+    s += strspn(s, SPACECHARS);  /* skip initial spaces */
+    if (*s == '-') { s++; neg = 1; }  /* handle sign */
+    else if (*s == '+') s++;
+    if (isalnum((unsigned char)*s)) {
+      lua_Number n = 0;
+      do {
+        int digit = (isdigit((unsigned char)*s)) ? *s - '0'
+                       : toupper((unsigned char)*s) - 'A' + 10;
+        if (digit >= base) break;  /* invalid numeral; force a fail */
+        n = n * (lua_Number)base + (lua_Number)digit;
+        s++;
+      } while (isalnum((unsigned char)*s));
+      s += strspn(s, SPACECHARS);  /* skip trailing spaces */
+      if (s == e) {  /* no invalid trailing characters? */
+        lua_pushnumber(L, (neg) ? -n : n);
+        return 1;
+      }  /* else not a number */
+    }  /* else not a number */
+  }
+  lua_pushnil(L);  /* not a number */
+  return 1;
+}
+
+/* error(msg [, level]): raise 'msg'; strings get position info prepended. */
+static int luaB_error (lua_State *L) {
+  int level = luaL_optint(L, 2, 1);
+  lua_settop(L, 1);  /* keep only the message */
+  if (lua_isstring(L, 1) && level > 0) {  /* add extra information? */
+    luaL_where(L, level);
+    lua_pushvalue(L, 1);
+    lua_concat(L, 2);  /* result of luaL_where followed by the message */
+  }
+  return lua_error(L);
+}
+
+/* getmetatable(v): nil, the "__metatable" field, or the metatable itself. */
+static int luaB_getmetatable (lua_State *L) {
+  luaL_checkany(L, 1);
+  if (!lua_getmetatable(L, 1)) {
+    lua_pushnil(L);
+    return 1;  /* no metatable */
+  }
+  luaL_getmetafield(L, 1, "__metatable");  /* pushes the field only if set */
+  return 1;  /* returns either __metatable field (if present) or metatable */
+}
+
+/* setmetatable(t, mt): refuse if 't' has a "__metatable" field; returns 't'. */
+static int luaB_setmetatable (lua_State *L) {
+  int t = lua_type(L, 2);
+  luaL_checktype(L, 1, LUA_TTABLE);
+  luaL_argcheck(L, t == LUA_TNIL || t == LUA_TTABLE, 2,
+                    "nil or table expected");
+  if (luaL_getmetafield(L, 1, "__metatable"))
+    return luaL_error(L, "cannot change a protected metatable");
+  lua_settop(L, 2);  /* new metatable (or nil) is now on top */
+  lua_setmetatable(L, 1);
+  return 1;
+}
+
+/* rawequal(a, b): boolean result of lua_rawequal on the two arguments. */
+static int luaB_rawequal (lua_State *L) {
+  luaL_checkany(L, 1);
+  luaL_checkany(L, 2);
+  lua_pushboolean(L, lua_rawequal(L, 1, 2));
+  return 1;
+}
+
+/* rawlen(v): lua_rawlen of a table or string argument. */
+static int luaB_rawlen (lua_State *L) {
+  int t = lua_type(L, 1);
+  luaL_argcheck(L, t == LUA_TTABLE || t == LUA_TSTRING, 1,
+                   "table or string expected");
+  lua_pushinteger(L, lua_rawlen(L, 1));
+  return 1;
+}
+
+/* rawget(t, k): t[k] fetched with lua_rawget. */
+static int luaB_rawget (lua_State *L) {
+  luaL_checktype(L, 1, LUA_TTABLE);
+  luaL_checkany(L, 2);
+  lua_settop(L, 2);  /* key must be on top for lua_rawget */
+  lua_rawget(L, 1);
+  return 1;
+}
+
+static int luaB_rawset (lua_State *L) {  /* rawset(t, k, v) */
+  luaL_checktype(L, 1, LUA_TTABLE);
+  luaL_checkany(L, 2);
+  luaL_checkany(L, 3);
+  lua_settop(L, 3);  /* key and value must be the two topmost slots */
+  lua_rawset(L, 1);
+  return 1;  /* presumably the table, the only slot left -- TODO confirm */
+}
+
+/* collectgarbage([opt [, arg]]): Lua front end to lua_gc. */
+static int luaB_collectgarbage (lua_State *L) {
+  static const char *const opts[] = {"stop", "restart", "collect",
+    "count", "step", "setpause", "setstepmul",
+    "setmajorinc", "isrunning", "generational", "incremental", NULL};
+  static const int optsnum[] = {LUA_GCSTOP, LUA_GCRESTART, LUA_GCCOLLECT,
+    LUA_GCCOUNT, LUA_GCSTEP, LUA_GCSETPAUSE, LUA_GCSETSTEPMUL,
+    LUA_GCSETMAJORINC, LUA_GCISRUNNING, LUA_GCGEN, LUA_GCINC};
+  int o = optsnum[luaL_checkoption(L, 1, "collect", opts)];
+  int ex = luaL_optint(L, 2, 0);  /* extra argument handed to lua_gc */
+  int res = lua_gc(L, o, ex);
+  switch (o) {
+    case LUA_GCCOUNT: {
+      int b = lua_gc(L, LUA_GCCOUNTB, 0);
+      lua_pushnumber(L, res + ((lua_Number)b/1024));
+      lua_pushinteger(L, b);
+      return 2;
+    }
+    case LUA_GCSTEP: case LUA_GCISRUNNING: {
+      lua_pushboolean(L, res);  /* these two report a truth value */
+      return 1;
+    }
+    default: {
+      lua_pushinteger(L, res);
+      return 1;
+    }
+  }
+}
+
+/* type(v): name of the type of the (mandatory) first argument. */
+static int luaB_type (lua_State *L) {
+  luaL_checkany(L, 1);
+  lua_pushstring(L, luaL_typename(L, 1));
+  return 1;
+}
+
+/* Shared body of pairs/ipairs: metamethod 'method' or the iterator 'iter'. */
+static int pairsmeta (lua_State *L, const char *method, int iszero,
+                      lua_CFunction iter) {
+  if (!luaL_getmetafield(L, 1, method)) {  /* no metamethod? */
+    luaL_checktype(L, 1, LUA_TTABLE);  /* argument must be a table */
+    lua_pushcfunction(L, iter);  /* will return generator, */
+    lua_pushvalue(L, 1);  /* state, */
+    if (iszero) lua_pushinteger(L, 0);  /* and initial value */
+    else lua_pushnil(L);
+  }
+  else {
+    lua_pushvalue(L, 1);  /* argument 'self' to metamethod */
+    lua_call(L, 1, 3);  /* get 3 values from metamethod */
+  }
+  return 3;
+}
+
+/* next(t [, k]): the pair following key 'k' in 't', or nil at the end. */
+static int luaB_next (lua_State *L) {
+  luaL_checktype(L, 1, LUA_TTABLE);
+  lua_settop(L, 2);  /* create a 2nd argument if there isn't one */
+  if (!lua_next(L, 1)) {
+    /* nothing follows the given key */
+    lua_pushnil(L);
+    return 1;
+  }
+  return 2;  /* key and value */
+}
+
+/* pairs(t): honours "__pairs"; otherwise returns luaB_next, t, nil. */
+static int luaB_pairs (lua_State *L) {
+  return pairsmeta(L, "__pairs", 0, luaB_next);
+}
+
+/* ipairs step: returns i+1, t[i+1]; a lone nil once t[i+1] is nil. */
+static int ipairsaux (lua_State *L) {
+  int next = luaL_checkint(L, 2) + 1;  /* next value */
+  luaL_checktype(L, 1, LUA_TTABLE);
+  lua_pushinteger(L, next);
+  lua_rawgeti(L, 1, next);
+  if (lua_isnil(L, -1)) return 1;  /* only the nil on top is returned */
+  return 2;
+}
+
+/* ipairs(t): honours "__ipairs"; otherwise returns ipairsaux, t, 0. */
+static int luaB_ipairs (lua_State *L) {
+  return pairsmeta(L, "__ipairs", 1, ipairsaux);
+}
+
+/* Shape the results of a load: the function, or nil plus error message. */
+static int load_aux (lua_State *L, int status, int envidx) {
+  if (status != LUA_OK) {  /* error (message is on top of the stack) */
+    lua_pushnil(L);
+    lua_insert(L, -2);  /* put before error message */
+    return 2;  /* return nil plus error message */
+  }
+  /* success: the loaded function is on top of the stack */
+  if (envidx == 0)  /* no 'env' parameter? */
+    return 1;
+  lua_pushvalue(L, envidx);  /* environment for loaded function */
+  /* set it as 1st upvalue; if that fails the copy is still on top */
+  if (!lua_setupvalue(L, -2, 1))
+    lua_pop(L, 1);  /* remove 'env' if not used by previous call */
+  return 1;
+}
+
+/* loadfile([filename [, mode [, env]]]): load a file, do not run it. */
+static int luaB_loadfile (lua_State *L) {
+  const char *fname = luaL_optstring(L, 1, NULL);
+  const char *mode = luaL_optstring(L, 2, NULL);
+  int env = (!lua_isnone(L, 3) ? 3 : 0);  /* 'env' index or 0 if no 'env' */
+  int status = luaL_loadfilex(L, fname, mode);
+  return load_aux(L, status, env);
+}
+
+
+/*
+** {======================================================
+** Generic Read function
+** =======================================================
+*/
+
+
+/*
+** reserved slot, above all arguments, to hold a copy of the returned
+** string to avoid it being collected while parsed. 'load' has four
+** optional arguments (chunk, source name, mode, and environment),
+** so the first slot above them is number 5. */
+#define RESERVEDSLOT	5
+
+
+/*
+** Reader for generic `load' function: `lua_load' uses the
+** stack for internal stuff, so the reader cannot change the
+** stack top. Instead, it keeps its resulting string in a
+** reserved slot inside the stack.
+*/
+static const char *generic_reader (lua_State *L, void *ud, size_t *size) {
+  (void)(ud);  /* not used */
+  luaL_checkstack(L, 2, "too many nested functions");
+  lua_pushvalue(L, 1);  /* get function */
+  lua_call(L, 0, 1);  /* call it */
+  if (lua_isnil(L, -1)) {  /* nil from the function ends the chunk */
+    lua_pop(L, 1);  /* pop result */
+    *size = 0;
+    return NULL;
+  }
+  else if (!lua_isstring(L, -1))
+    luaL_error(L, "reader function must return a string");
+  lua_replace(L, RESERVEDSLOT);  /* save string in reserved slot */
+  return lua_tolstring(L, RESERVEDSLOT, size);
+}
+
+/* load(chunk [, chunkname [, mode [, env]]]): chunk is a string or function. */
+static int luaB_load (lua_State *L) {
+  int status;
+  size_t l;
+  const char *s = lua_tolstring(L, 1, &l);
+  const char *mode = luaL_optstring(L, 3, "bt");
+  int env = (!lua_isnone(L, 4) ? 4 : 0);  /* 'env' index or 0 if no 'env' */
+  if (s != NULL) {  /* loading a string? */
+    const char *chunkname = luaL_optstring(L, 2, s);  /* default: the text */
+    status = luaL_loadbufferx(L, s, l, chunkname, mode);
+  }
+  else {  /* loading from a reader function */
+    const char *chunkname = luaL_optstring(L, 2, "=(load)");
+    luaL_checktype(L, 1, LUA_TFUNCTION);
+    lua_settop(L, RESERVEDSLOT);  /* create reserved slot */
+    status = lua_load(L, generic_reader, NULL, chunkname, mode);
+  }
+  return load_aux(L, status, env);
+}
+
+/* }====================================================== */
+
+/* Result count for dofile: everything above slot 1 (the file name). */
+static int dofilecont (lua_State *L) {
+  return lua_gettop(L) - 1;
+}
+
+/* dofile([filename]): load and run a file, returning all of its results. */
+static int luaB_dofile (lua_State *L) {
+  const char *fname = luaL_optstring(L, 1, NULL);
+  lua_settop(L, 1);  /* slot 1 stays below the results (see dofilecont) */
+  if (luaL_loadfile(L, fname) != LUA_OK)
+    return lua_error(L);  /* re-raise the load error message */
+  lua_callk(L, 0, LUA_MULTRET, 0, dofilecont);
+  return dofilecont(L);
+}
+
+/* assert(v [, message]): error if 'v' is false, else return all arguments. */
+static int luaB_assert (lua_State *L) {
+  if (!lua_toboolean(L, 1))
+    return luaL_error(L, "%s", luaL_optstring(L, 2, "assertion failed!"));
+  return lua_gettop(L);
+}
+
+/* select('#', ...) counts the extra arguments; select(i, ...) returns
+** the arguments from position 'i' on (negative 'i' counts from the end). */
+static int luaB_select (lua_State *L) {
+  int top = lua_gettop(L);
+  int pos;
+  if (lua_type(L, 1) == LUA_TSTRING && *lua_tostring(L, 1) == '#') {
+    lua_pushinteger(L, top-1);
+    return 1;
+  }
+  pos = luaL_checkint(L, 1);
+  if (pos < 0) pos = top + pos;
+  else if (pos > top) pos = top;
+  luaL_argcheck(L, 1 <= pos, 1, "index out of range");
+  return top - pos;
+}
+
+/* Put boolean 'status' in slot 1 and return every value on the stack. */
+static int finishpcall (lua_State *L, int status) {
+  if (!lua_checkstack(L, 1)) {  /* no space for extra boolean? */
+    lua_settop(L, 0);  /* create space for return values */
+    lua_pushboolean(L, 0);
+    lua_pushstring(L, "stack overflow");
+    return 2;  /* return false, msg */
+  }
+  lua_pushboolean(L, status);  /* first result (status) */
+  lua_replace(L, 1);  /* put first result in first slot */
+  return lua_gettop(L);
+}
+
+/* Continuation function handed to lua_pcallk by pcall and xpcall. */
+static int pcallcont (lua_State *L) {
+  int status = lua_getctx(L, NULL);
+  return finishpcall(L, (status == LUA_YIELD));  /* LUA_YIELD means success */
+}
+
+/* pcall(f, ...): call 'f' in protected mode; status boolean comes first. */
+static int luaB_pcall (lua_State *L) {
+  int status;
+  luaL_checkany(L, 1);
+  lua_pushnil(L);
+  lua_insert(L, 1);  /* create space for status result */
+  status = lua_pcallk(L, lua_gettop(L) - 2, LUA_MULTRET, 0, 0, pcallcont);
+  return finishpcall(L, (status == LUA_OK));
+}
+
+/* xpcall(f, msgh, ...): like pcall, with 'msgh' as the message handler. */
+static int luaB_xpcall (lua_State *L) {
+  int status;
+  int n = lua_gettop(L);
+  luaL_argcheck(L, n >= 2, 2, "value expected");
+  lua_pushvalue(L, 1);  /* exchange function... */
+  lua_copy(L, 2, 1);  /* ...and error handler */
+  lua_replace(L, 2);
+  status = lua_pcallk(L, n - 2, LUA_MULTRET, 1, 0, pcallcont);
+  return finishpcall(L, (status == LUA_OK));
+}
+
+/* tostring(v): the string that luaL_tolstring pushes for argument 1. */
+static int luaB_tostring (lua_State *L) {
+  luaL_checkany(L, 1);
+  luaL_tolstring(L, 1, NULL);
+  return 1;
+}
+
+/* Functions that luaopen_base registers, keyed by their Lua-visible name. */
+static const luaL_Reg base_funcs[] = {
+  {"assert", luaB_assert},
+  {"collectgarbage", luaB_collectgarbage},
+  {"dofile", luaB_dofile},
+  {"error", luaB_error},
+  {"getmetatable", luaB_getmetatable},
+  {"ipairs", luaB_ipairs},
+  {"loadfile", luaB_loadfile},
+  {"load", luaB_load},
+#if defined(LUA_COMPAT_LOADSTRING)
+  {"loadstring", luaB_load},  /* compatibility alias of 'load' */
+#endif
+  {"next", luaB_next},
+  {"pairs", luaB_pairs},
+  {"pcall", luaB_pcall},
+  {"print", luaB_print},
+  {"rawequal", luaB_rawequal},
+  {"rawlen", luaB_rawlen},
+  {"rawget", luaB_rawget},
+  {"rawset", luaB_rawset},
+  {"select", luaB_select},
+  {"setmetatable", luaB_setmetatable},
+  {"tonumber", luaB_tonumber},
+  {"tostring", luaB_tostring},
+  {"type", luaB_type},
+  {"xpcall", luaB_xpcall},
+  {NULL, NULL}  /* sentinel */
+};
+
+/* Open the basic library into the global table; that table is returned. */
+LUAMOD_API int luaopen_base (lua_State *L) {
+  /* set global _G */
+  lua_pushglobaltable(L);
+  lua_pushglobaltable(L);
+  lua_setfield(L, -2, "_G");
+  /* open lib into global table */
+  luaL_setfuncs(L, base_funcs, 0);
+  lua_pushliteral(L, LUA_VERSION);
+  lua_setfield(L, -2, "_VERSION");  /* set global _VERSION */
+  return 1;
+}
+
diff --git a/ext/lua/src/lbitlib.c b/ext/lua/src/lbitlib.c
new file mode 100644
index 0000000..9637532
--- /dev/null
+++ b/ext/lua/src/lbitlib.c
@@ -0,0 +1,211 @@
+/*
+** $Id: lbitlib.c,v 1.18 2013/03/19 13:19:12 roberto Exp $
+** Standard library for bitwise operations
+** See Copyright Notice in lua.h
+*/
+
+#define lbitlib_c
+#define LUA_LIB
+
+#include "lua.h"
+
+#include "lauxlib.h"
+#include "lualib.h"
+
+
+/* number of bits to consider in a number (can be predefined by the build) */
+#if !defined(LUA_NBITS)
+#define LUA_NBITS	32
+#endif
+
+/* LUA_NBITS low bits set; shifting in two steps never shifts by LUA_NBITS at once */
+#define ALLONES		(~(((~(lua_Unsigned)0) << (LUA_NBITS - 1)) << 1))
+
+/* macro to trim extra bits (keeps only the low LUA_NBITS bits of 'x') */
+#define trim(x)		((x) & ALLONES)
+
+
+/* builds a number with 'n' ones (1 <= n <= LUA_NBITS) */
+#define mask(n)		(~((ALLONES << 1) << ((n) - 1)))
+
+/* unsigned type in which every bit32 function computes */
+typedef lua_Unsigned b_uint;
+
+
+/* AND of all arguments, trimmed to LUA_NBITS bits (all ones without arguments) */
+static b_uint andaux (lua_State *L) {
+  b_uint acc = ~(b_uint)0;  /* identity element for AND */
+  int arg, nargs = lua_gettop(L);
+  for (arg = 1; arg <= nargs; ++arg)
+    acc &= luaL_checkunsigned(L, arg);
+  return trim(acc);
+}
+
+/* bit32.band(...): pushes the AND of all arguments */
+static int b_and (lua_State *L) {
+  lua_pushunsigned(L, andaux(L));
+  /* exactly one result: the combined value */
+  return 1;
+}
+
+/* bit32.btest(...): pushes true when the AND of all arguments is non-zero */
+static int b_test (lua_State *L) {
+  const int anybit = (andaux(L) != 0);
+  lua_pushboolean(L, anybit);
+  return 1;
+}
+
+/* bit32.bor(...): pushes the OR of all arguments (zero without arguments) */
+static int b_or (lua_State *L) {
+  b_uint acc = 0;  /* identity element for OR */
+  int arg, nargs = lua_gettop(L);
+  for (arg = 1; arg <= nargs; ++arg)
+    acc |= luaL_checkunsigned(L, arg);
+  lua_pushunsigned(L, trim(acc));
+  return 1;
+}
+
+/* bit32.bxor(...): pushes the XOR of all arguments (zero without arguments) */
+static int b_xor (lua_State *L) {
+  b_uint acc = 0;  /* identity element for XOR */
+  int arg, nargs = lua_gettop(L);
+  for (arg = 1; arg <= nargs; ++arg)
+    acc ^= luaL_checkunsigned(L, arg);
+  lua_pushunsigned(L, trim(acc));
+  return 1;
+}
+
+/* bit32.bnot(x): pushes the complement of 'x' within LUA_NBITS bits */
+static int b_not (lua_State *L) {
+  b_uint x = luaL_checkunsigned(L, 1);
+  lua_pushunsigned(L, trim(~x));
+  return 1;
+}
+
+/* pushes 'r' shifted by 'i' bits: left when 'i' >= 0, logically right otherwise */
+static int b_shift (lua_State *L, b_uint r, int i) {
+  if (i >= 0) {  /* shift left */
+    /* a count of LUA_NBITS or more clears every bit */
+    r = (i >= LUA_NBITS) ? 0 : (r << i);
+    r = trim(r);  /* drop whatever moved past bit LUA_NBITS - 1 */
+  }
+  else {  /* shift right */
+    int count = -i;
+    r = trim(r);  /* extra high bits must not slide into the result */
+    /* a count of LUA_NBITS or more clears every bit */
+    r = (count >= LUA_NBITS) ? 0 : (r >> count);
+  }
+  lua_pushunsigned(L, r);
+  return 1;
+}
+
+/* bit32.lshift(x, disp): b_shift with the displacement passed as given */
+static int b_lshift (lua_State *L) {
+  return b_shift(L, luaL_checkunsigned(L, 1), luaL_checkint(L, 2));
+}
+
+/* bit32.rshift(x, disp): b_shift with the displacement negated */
+static int b_rshift (lua_State *L) {
+  return b_shift(L, luaL_checkunsigned(L, 1), -luaL_checkint(L, 2));
+}
+
+/* bit32.arshift(x, disp): shift right keeping the top bit (negative 'disp' shifts left) */
+static int b_arshift (lua_State *L) {
+  b_uint value = luaL_checkunsigned(L, 1);
+  int disp = luaL_checkint(L, 2);
+  const b_uint topbit = (b_uint)1 << (LUA_NBITS - 1);
+  if (disp < 0 || (value & topbit) == 0)  /* left shift or top bit clear? */
+    return b_shift(L, value, -disp);  /* a logical shift is enough */
+  if (disp < LUA_NBITS)  /* 'negative' number: fill vacated bits with ones */
+    value = trim((value >> disp) | ~(~(b_uint)0 >> disp));
+  else
+    value = ALLONES;
+  lua_pushunsigned(L, value);
+  return 1;
+}
+
+/* pushes argument 1 rotated left by 'i' bits ('i' is reduced modulo LUA_NBITS) */
+static int b_rot (lua_State *L, int i) {
+  b_uint r = trim(luaL_checkunsigned(L, 1));  /* rotate only the low LUA_NBITS */
+  i &= (LUA_NBITS - 1);  /* i = i % NBITS */
+  if (i != 0)  /* else 'r >> LUA_NBITS' is undefined if b_uint has LUA_NBITS bits */
+    r = (r << i) | (r >> (LUA_NBITS - i));
+  lua_pushunsigned(L, trim(r));
+  return 1;
+}
+
+/* bit32.lrotate(x, disp): b_rot with the displacement read from argument 2 */
+static int b_lrot (lua_State *L) {
+  return b_rot(L, luaL_checkint(L, 2));
+}
+
+/* bit32.rrotate(x, disp): b_rot with the displacement from argument 2 negated */
+static int b_rrot (lua_State *L) {
+  return b_rot(L, -luaL_checkint(L, 2));
+}
+
+
+/*
+** get field (index 'farg') and width (index 'farg + 1', default 1) arguments
+** for field-manipulation functions, checking whether they are valid.
+** ('luaL_error' called without 'return' to avoid later warnings about
+** 'width' being used uninitialized.) Returns the field; width goes to '*width'.
+*/
+static int fieldargs (lua_State *L, int farg, int *width) {
+  int f = luaL_checkint(L, farg);
+  int w = luaL_optint(L, farg + 1, 1);
+  luaL_argcheck(L, 0 <= f, farg, "field cannot be negative");
+  luaL_argcheck(L, 0 < w, farg + 1, "width must be positive");
+  if (f > LUA_NBITS - w)  /* f + w > LUA_NBITS, written so 'int' cannot overflow */
+    luaL_error(L, "trying to access non-existent bits");
+  *width = w;
+  return f;
+}
+
+/* bit32.extract(n, field [, width]): pushes bits field..field+width-1 of 'n' */
+static int b_extract (lua_State *L) {
+  int width;
+  b_uint n = luaL_checkunsigned(L, 1);
+  int field = fieldargs(L, 2, &width);  /* validates field and width */
+  b_uint bits = (n >> field) & mask(width);
+  lua_pushunsigned(L, bits);
+  return 1;
+}
+
+/* bit32.replace(n, v, field [, width]): pushes 'n' with the given field set to 'v' */
+static int b_replace (lua_State *L) {
+  int w;
+  b_uint r = trim(luaL_checkunsigned(L, 1));  /* result stays within LUA_NBITS */
+  b_uint v = luaL_checkunsigned(L, 2);
+  int f = fieldargs(L, 3, &w);
+  b_uint m = mask(w);  /* unsigned: as 'int' a full mask is negative, 'm << f' undefined */
+  v &= m;  /* erase bits outside given width */
+  r = (r & ~(m << f)) | (v << f);
+  lua_pushunsigned(L, r);
+  return 1;
+}
+
+/* name/function pairs passed to luaL_newlib by luaopen_bit32; NULL pair ends the list */
+static const luaL_Reg bitlib[] = {
+  {"arshift", b_arshift},
+  {"band", b_and},
+  {"bnot", b_not},
+  {"bor", b_or},
+  {"bxor", b_xor},
+  {"btest", b_test},
+  {"extract", b_extract},
+  {"lrotate", b_lrot},
+  {"lshift", b_lshift},
+  {"replace", b_replace},
+  {"rrotate", b_rrot},
+  {"rshift", b_rshift},
+  {NULL, NULL}
+};
+
+
+/* opens the bit32 library: builds a table from 'bitlib' and returns it */
+LUAMOD_API int luaopen_bit32 (lua_State *L) {
+  luaL_newlib(L, bitlib);
+  return 1;
+}
+
diff --git a/ext/lua/src/lcode.c b/ext/lua/src/lcode.c
new file mode 100644
index 0000000..56c26ac
--- /dev/null
+++ b/ext/lua/src/lcode.c
@@ -0,0 +1,881 @@
+/*
+** $Id: lcode.c,v 2.62 2012/08/16 17:34:28 roberto Exp $
+** Code generator for Lua
+** See Copyright Notice in lua.h
+*/
+
+
+#include <stdlib.h>
+
+#define lcode_c
+#define LUA_CORE
+
+#include "lua.h"
+
+#include "lcode.h"
+#include "ldebug.h"
+#include "ldo.h"
+#include "lgc.h"
+#include "llex.h"
+#include "lmem.h"
+#include "lobject.h"
+#include "lopcodes.h"
+#include "lparser.h"
+#include "lstring.h"
+#include "ltable.h"
+#include "lvm.h"
+
+/* true when the 't' and 'f' jump lists of expression 'e' differ */
+#define hasjumps(e)	((e)->t != (e)->f)
+
+/* true when 'e' is a VKNUM expression whose 't' and 'f' jump lists are both empty */
+static int isnumeral(expdesc *e) {
+  return (e->k == VKNUM && e->t == NO_JUMP && e->f == NO_JUMP);
+}
+
+/* sets 'n' registers from 'from' on to nil, widening a preceding OP_LOADNIL when ranges touch */
+void luaK_nil (FuncState *fs, int from, int n) {
+  Instruction *previous;
+  int l = from + n - 1;  /* last register to set nil */
+  if (fs->pc > fs->lasttarget) {  /* no jumps to current position? */
+    previous = &fs->f->code[fs->pc-1];
+    if (GET_OPCODE(*previous) == OP_LOADNIL) {
+      int pfrom = GETARG_A(*previous);  /* first register of previous LOADNIL */
+      int pl = pfrom + GETARG_B(*previous);  /* its last register */
+      if ((pfrom <= from && from <= pl + 1) ||
+          (from <= pfrom && pfrom <= l + 1)) {  /* can connect both? */
+        if (pfrom < from) from = pfrom;  /* from = min(from, pfrom) */
+        if (pl > l) l = pl;  /* l = max(l, pl) */
+        SETARG_A(*previous, from);
+        SETARG_B(*previous, l - from);  /* B holds (register count - 1) */
+        return;
+      }
+    }  /* else go through */
+  }
+  luaK_codeABC(fs, OP_LOADNIL, from, n - 1, 0);  /* else no optimization */
+}
+
+/* emits an OP_JMP without destination and returns its position; the pending fs->jpc list is chained after it */
+int luaK_jump (FuncState *fs) {
+  int jpc = fs->jpc;  /* save list of jumps to here */
+  int j;
+  fs->jpc = NO_JUMP;  /* detach the list before the new instruction is coded */
+  j = luaK_codeAsBx(fs, OP_JMP, 0, NO_JUMP);
+  luaK_concat(fs, &j, jpc);  /* keep them on hold */
+  return j;
+}
+
+/* emits OP_RETURN for 'nret' values starting at register 'first' */
+void luaK_ret (FuncState *fs, int first, int nret) {
+  luaK_codeABC(fs, OP_RETURN, first, nret+1, 0);  /* operand B is nret + 1 */
+}
+
+/* emits instruction 'op' followed by a jump; returns the position of that jump */
+static int condjump (FuncState *fs, OpCode op, int A, int B, int C) {
+  luaK_codeABC(fs, op, A, B, C);
+  return luaK_jump(fs);
+}
+
+/* makes the jump at 'pc' go to 'dest', encoded as an offset from pc + 1 */
+static void fixjump (FuncState *fs, int pc, int dest) {
+  Instruction *target = &fs->f->code[pc];
+  int delta = dest - (pc + 1);  /* relative to the instruction after the jump */
+  lua_assert(dest != NO_JUMP);
+  if (abs(delta) > MAXARG_sBx)  /* does not fit in operand sBx? */
+    luaX_syntaxerror(fs->ls, "control structure too long");
+  SETARG_sBx(*target, delta);
+}
+
+
+/*
+** returns current `pc' and marks it as a jump target (to avoid wrong
+** optimizations with consecutive instructions not in the same basic block).
+*/
+int luaK_getlabel (FuncState *fs) {
+  fs->lasttarget = fs->pc;  /* luaK_nil compares pc with this before merging */
+  return fs->pc;
+}
+
+/* returns the element that follows 'pc' in a jump list, or NO_JUMP at its end */
+static int getjump (FuncState *fs, int pc) {
+  const int delta = GETARG_sBx(fs->f->code[pc]);
+  /* an offset equal to NO_JUMP (pointing to itself) terminates the list */
+  if (delta != NO_JUMP)
+    return (pc+1)+delta;  /* turn offset into absolute position */
+  return NO_JUMP;
+}
+
+/* returns the instruction controlling the jump at 'pc': the one before it if that is a test, else the jump */
+static Instruction *getjumpcontrol (FuncState *fs, int pc) {
+  Instruction *jmp = &fs->f->code[pc];
+  /* only look backwards when there is a previous instruction */
+  if (pc < 1 || !testTMode(GET_OPCODE(*(jmp-1))))
+    return jmp;
+  return jmp-1;
+}
+
+
+/*
+** returns 1 when some jump in 'list' is not controlled by an OP_TESTSET
+** (such a jump does not produce a value, or produces an inverted one)
+*/
+static int need_value (FuncState *fs, int list) {
+  while (list != NO_JUMP) {
+    if (GET_OPCODE(*getjumpcontrol(fs, list)) != OP_TESTSET) return 1;
+    list = getjump(fs, list);
+  }
+  return 0;  /* every jump is controlled by a TESTSET */
+}
+
+/* adjusts the OP_TESTSET controlling jump 'node' for register 'reg'; returns 0 if there is none */
+static int patchtestreg (FuncState *fs, int node, int reg) {
+  Instruction *i = getjumpcontrol(fs, node);
+  if (GET_OPCODE(*i) != OP_TESTSET)
+    return 0;  /* cannot patch other instructions */
+  if (reg != NO_REG && reg != GETARG_B(*i))
+    SETARG_A(*i, reg);  /* operand A becomes the destination register */
+  else  /* no register to put value or register already has the value */
+    *i = CREATE_ABC(OP_TEST, GETARG_B(*i), 0, GETARG_C(*i));
+
+  return 1;
+}
+
+/* patches every jump in 'list' with NO_REG, turning its OP_TESTSET into OP_TEST */
+static void removevalues (FuncState *fs, int list) {
+  for (; list != NO_JUMP; list = getjump(fs, list))
+      patchtestreg(fs, list, NO_REG);
+}
+
+/* sends each jump in 'list' to 'vtarget' if its test could be patched with 'reg', else to 'dtarget' */
+static void patchlistaux (FuncState *fs, int list, int vtarget, int reg,
+                          int dtarget) {
+  int next;
+  for (; list != NO_JUMP; list = next) {
+    int dest;
+    /* fetch the link first: fixjump overwrites the offset that stores it */
+    next = getjump(fs, list);
+    dest = patchtestreg(fs, list, reg) ? vtarget : dtarget;
+    fixjump(fs, list, dest);
+  }
+}
+
+/* patches all jumps pending in fs->jpc to the current pc and empties that list */
+static void dischargejpc (FuncState *fs) {
+  patchlistaux(fs, fs->jpc, fs->pc, NO_REG, fs->pc);
+  fs->jpc = NO_JUMP;
+}
+
+/* makes every jump in 'list' go to 'target' (which cannot be after fs->pc) */
+void luaK_patchlist (FuncState *fs, int list, int target) {
+  if (target == fs->pc) {  /* current position: queue the list in fs->jpc */
+    luaK_patchtohere(fs, list);
+    return;
+  }
+  lua_assert(target < fs->pc);
+  patchlistaux(fs, list, target, NO_REG, target);
+}
+
+/* stores level + 1 in operand A of every OP_JMP in 'list' */
+LUAI_FUNC void luaK_patchclose (FuncState *fs, int list, int level) {
+  level++;  /* argument is +1 to reserve 0 as non-op */
+  while (list != NO_JUMP) {
+    int next = getjump(fs, list);
+    lua_assert(GET_OPCODE(fs->f->code[list]) == OP_JMP &&
+                (GETARG_A(fs->f->code[list]) == 0 ||
+                 GETARG_A(fs->f->code[list]) >= level));
+    SETARG_A(fs->f->code[list], level);
+    list = next;
+  }
+}
+
+/* marks the current pc as a jump target and appends 'list' to the pending fs->jpc list */
+void luaK_patchtohere (FuncState *fs, int list) {
+  luaK_getlabel(fs);
+  luaK_concat(fs, &fs->jpc, list);
+}
+
+/* appends jump list 'l2' to the end of jump list '*l1' */
+void luaK_concat (FuncState *fs, int *l1, int l2) {
+  int last, next;
+  if (l2 == NO_JUMP)  /* nothing to append */
+    return;
+  if (*l1 == NO_JUMP) {  /* empty destination: 'l2' becomes the list */
+    *l1 = l2;
+    return;
+  }
+  for (last = *l1; (next = getjump(fs, last)) != NO_JUMP; last = next)
+    ;  /* find last element */
+  fixjump(fs, last, l2);  /* link it to the head of 'l2' */
+}
+
+/* appends instruction 'i' and its line number to the function being compiled; returns its position */
+static int luaK_code (FuncState *fs, Instruction i) {
+  Proto *f = fs->f;
+  dischargejpc(fs);  /* `pc' will change */
+  /* put new instruction in code array */
+  luaM_growvector(fs->ls->L, f->code, fs->pc, f->sizecode, Instruction,
+                  MAX_INT, "opcodes");
+  f->code[fs->pc] = i;
+  /* save corresponding line information */
+  luaM_growvector(fs->ls->L, f->lineinfo, fs->pc, f->sizelineinfo, int,
+                  MAX_INT, "opcodes");
+  f->lineinfo[fs->pc] = fs->ls->lastline;
+  return fs->pc++;  /* result is the slot just filled; pc moves past it */
+}
+
+/* emits an iABC-format instruction built from 'o', 'a', 'b', 'c'; returns its position */
+int luaK_codeABC (FuncState *fs, OpCode o, int a, int b, int c) {
+  lua_assert(getOpMode(o) == iABC);
+  lua_assert(getBMode(o) != OpArgN || b == 0);  /* unused operand must be 0 */
+  lua_assert(getCMode(o) != OpArgN || c == 0);  /* unused operand must be 0 */
+  lua_assert(a <= MAXARG_A && b <= MAXARG_B && c <= MAXARG_C);
+  return luaK_code(fs, CREATE_ABC(o, a, b, c));
+}
+
+/* emits an iABx- or iAsBx-format instruction built from 'o', 'a', 'bc'; returns its position */
+int luaK_codeABx (FuncState *fs, OpCode o, int a, unsigned int bc) {
+  lua_assert(getOpMode(o) == iABx || getOpMode(o) == iAsBx);
+  lua_assert(getCMode(o) == OpArgN);
+  lua_assert(a <= MAXARG_A && bc <= MAXARG_Bx);
+  return luaK_code(fs, CREATE_ABx(o, a, bc));
+}
+
+/* emits an OP_EXTRAARG instruction carrying 'a'; returns its position */
+static int codeextraarg (FuncState *fs, int a) {
+  lua_assert(a <= MAXARG_Ax);
+  return luaK_code(fs, CREATE_Ax(OP_EXTRAARG, a));
+}
+
+/* emits code loading constant number 'k' into register 'reg'; returns its position */
+int luaK_codek (FuncState *fs, int reg, int k) {
+  int pos;
+  if (k <= MAXARG_Bx)  /* index fits in operand Bx */
+    return luaK_codeABx(fs, OP_LOADK, reg, k);
+  /* otherwise the index travels in a following OP_EXTRAARG */
+  pos = luaK_codeABx(fs, OP_LOADKX, reg, 0);
+  codeextraarg(fs, k);
+  return pos;
+}
+
+/* raises the function's 'maxstacksize' so that 'n' registers above 'freereg' fit */
+void luaK_checkstack (FuncState *fs, int n) {
+  const int needed = fs->freereg + n;
+  if (needed <= fs->f->maxstacksize)
+    return;  /* already accounted for */
+  if (needed >= MAXSTACK)
+    luaX_syntaxerror(fs->ls, "function or expression too complex");
+  fs->f->maxstacksize = cast_byte(needed);
+}
+
+/* reserves 'n' registers by advancing fs->freereg, after checking the stack limit */
+void luaK_reserveregs (FuncState *fs, int n) {
+  luaK_checkstack(fs, n);
+  fs->freereg += n;
+}
+
+/* releases register 'reg' unless it is a constant index (ISK) or lies below fs->nactvar */
+static void freereg (FuncState *fs, int reg) {
+  if (!ISK(reg) && reg >= fs->nactvar) {
+    fs->freereg--;
+    lua_assert(reg == fs->freereg);  /* registers must be freed top-first */
+  }
+}
+
+/* releases the register used by 'e' when 'e' is a VNONRELOC expression */
+static void freeexp (FuncState *fs, expdesc *e) {
+  if (e->k == VNONRELOC)
+    freereg(fs, e->u.info);
+}
+
+/* returns the index in f->k of constant 'v', adding it when absent; table fs->h maps 'key' to that index */
+static int addk (FuncState *fs, TValue *key, TValue *v) {
+  lua_State *L = fs->ls->L;
+  TValue *idx = luaH_set(L, fs->h, key);  /* slot for 'key' in table fs->h */
+  Proto *f = fs->f;
+  int k, oldsize;
+  if (ttisnumber(idx)) {  /* 'key' already has an index recorded? */
+    lua_Number n = nvalue(idx);
+    lua_number2int(k, n);
+    if (luaV_rawequalobj(&f->k[k], v))
+      return k;
+    /* else may be a collision (e.g., between 0.0 and "\0\0\0\0\0\0\0\0");
+       go through and create a new entry for this value */
+  }
+  /* constant not found; create a new entry */
+  oldsize = f->sizek;
+  k = fs->nk;
+  /* numerical value does not need GC barrier;
+     table has no metatable, so it does not need to invalidate cache */
+  setnvalue(idx, cast_num(k));
+  luaM_growvector(L, f->k, k, f->sizek, TValue, MAXARG_Ax, "constants");
+  while (oldsize < f->sizek) setnilvalue(&f->k[oldsize++]);  /* clear new slots */
+  setobj(L, &f->k[k], v);
+  fs->nk++;
+  luaC_barrier(L, f, v);
+  return k;
+}
+
+/* adds string 's' to the constants (the string is its own lookup key); returns its index */
+int luaK_stringK (FuncState *fs, TString *s) {
+  TValue o;
+  setsvalue(fs->ls->L, &o, s);
+  return addk(fs, &o, &o);
+}
+
+/* adds number 'r' to the constants and returns its index; zero and NaN get a string key */
+int luaK_numberK (FuncState *fs, lua_Number r) {
+  int n;
+  lua_State *L = fs->ls->L;
+  TValue o;
+  setnvalue(&o, r);
+  if (r == 0 || luai_numisnan(NULL, r)) {  /* handle -0 and NaN */
+    /* use raw representation as key to avoid numeric problems */
+    setsvalue(L, L->top++, luaS_newlstr(L, (char *)&r, sizeof(r)));
+    n = addk(fs, L->top - 1, &o);  /* key is the string just placed on the stack */
+    L->top--;  /* remove that temporary key */
+  }
+  else
+    n = addk(fs, &o, &o);  /* regular case */
+  return n;
+}
+
+/* adds boolean 'b' to the constants (the value is its own lookup key); returns its index */
+static int boolK (FuncState *fs, int b) {
+  TValue o;
+  setbvalue(&o, b);
+  return addk(fs, &o, &o);
+}
+
+/* adds nil to the constants and returns its index */
+static int nilK (FuncState *fs) {
+  TValue k, v;
+  setnilvalue(&v);
+  /* cannot use nil as key; instead use table itself to represent nil */
+  sethvalue(fs->ls->L, &k, fs->h);
+  return addk(fs, &k, &v);
+}
+
+/* fixes the number of results of an open call or vararg expression to 'nresults' */
+void luaK_setreturns (FuncState *fs, expdesc *e, int nresults) {
+  if (e->k == VCALL) {  /* expression is an open function call? */
+    SETARG_C(getcode(fs, e), nresults+1);
+  }
+  else if (e->k == VVARARG) {
+    SETARG_B(getcode(fs, e), nresults+1);
+    SETARG_A(getcode(fs, e), fs->freereg);  /* results start at first free register */
+    luaK_reserveregs(fs, 1);
+  }
+}
+
+/* adjusts an open call or vararg expression so that it stands for a single value */
+void luaK_setoneret (FuncState *fs, expdesc *e) {
+  if (e->k == VCALL) {  /* expression is an open function call? */
+    e->k = VNONRELOC;
+    e->u.info = GETARG_A(getcode(fs, e));  /* register taken from operand A */
+  }
+  else if (e->k == VVARARG) {
+    SETARG_B(getcode(fs, e), 2);
+    e->k = VRELOCABLE;  /* can relocate its simple result */
+  }
+}
+
+/* converts variable, call and vararg expressions into expressions holding one value */
+void luaK_dischargevars (FuncState *fs, expdesc *e) {
+  switch (e->k) {
+    case VLOCAL: {
+      e->k = VNONRELOC;  /* no code emitted; u.info is kept as is */
+      break;
+    }
+    case VUPVAL: {
+      e->u.info = luaK_codeABC(fs, OP_GETUPVAL, 0, e->u.info, 0);
+      e->k = VRELOCABLE;  /* operand A (0 for now) is set later */
+      break;
+    }
+    case VINDEXED: {
+      OpCode op = OP_GETTABUP;  /* assume 't' is in an upvalue */
+      freereg(fs, e->u.ind.idx);
+      if (e->u.ind.vt == VLOCAL) {  /* 't' is in a register? */
+        freereg(fs, e->u.ind.t);
+        op = OP_GETTABLE;
+      }
+      e->u.info = luaK_codeABC(fs, op, 0, e->u.ind.t, e->u.ind.idx);
+      e->k = VRELOCABLE;
+      break;
+    }
+    case VVARARG:
+    case VCALL: {
+      luaK_setoneret(fs, e);
+      break;
+    }
+    default: break;  /* there is one value available (somewhere) */
+  }
+}
+
+/* marks the current pc as a jump target and emits OP_LOADBOOL A b jump; returns its position */
+static int code_label (FuncState *fs, int A, int b, int jump) {
+  luaK_getlabel(fs);  /* those instructions may be jump targets */
+  return luaK_codeABC(fs, OP_LOADBOOL, A, b, jump);
+}
+
+/* emits code placing the value of 'e' in register 'reg'; VVOID and VJMP are left untouched */
+static void discharge2reg (FuncState *fs, expdesc *e, int reg) {
+  luaK_dischargevars(fs, e);
+  switch (e->k) {
+    case VNIL: {
+      luaK_nil(fs, reg, 1);
+      break;
+    }
+    case VFALSE: case VTRUE: {
+      luaK_codeABC(fs, OP_LOADBOOL, reg, e->k == VTRUE, 0);
+      break;
+    }
+    case VK: {
+      luaK_codek(fs, reg, e->u.info);
+      break;
+    }
+    case VKNUM: {
+      luaK_codek(fs, reg, luaK_numberK(fs, e->u.nval));
+      break;
+    }
+    case VRELOCABLE: {
+      Instruction *pc = &getcode(fs, e);
+      SETARG_A(*pc, reg);  /* retarget the existing instruction */
+      break;
+    }
+    case VNONRELOC: {
+      if (reg != e->u.info)  /* value sits in a different register? */
+        luaK_codeABC(fs, OP_MOVE, reg, e->u.info, 0);
+      break;
+    }
+    default: {
+      lua_assert(e->k == VVOID || e->k == VJMP);
+      return;  /* nothing to do... */
+    }
+  }
+  e->u.info = reg;
+  e->k = VNONRELOC;
+}
+
+/* makes sure the value of 'e' is in some register, reserving a new one if needed */
+static void discharge2anyreg (FuncState *fs, expdesc *e) {
+  if (e->k == VNONRELOC)
+    return;  /* value is already in a register */
+  luaK_reserveregs(fs, 1);
+  discharge2reg(fs, e, fs->freereg-1);
+}
+
+/* places the final value of 'e' in register 'reg', resolving its pending 't' and 'f' jump lists */
+static void exp2reg (FuncState *fs, expdesc *e, int reg) {
+  discharge2reg(fs, e, reg);
+  if (e->k == VJMP)
+    luaK_concat(fs, &e->t, e->u.info);  /* put this jump in `t' list */
+  if (hasjumps(e)) {
+    int final;  /* position after whole expression */
+    int p_f = NO_JUMP;  /* position of an eventual LOAD false */
+    int p_t = NO_JUMP;  /* position of an eventual LOAD true */
+    if (need_value(fs, e->t) || need_value(fs, e->f)) {
+      int fj = (e->k == VJMP) ? NO_JUMP : luaK_jump(fs);  /* skip over the loads */
+      p_f = code_label(fs, reg, 0, 1);
+      p_t = code_label(fs, reg, 1, 0);
+      luaK_patchtohere(fs, fj);
+    }
+    final = luaK_getlabel(fs);
+    patchlistaux(fs, e->f, final, reg, p_f);
+    patchlistaux(fs, e->t, final, reg, p_t);
+  }
+  e->f = e->t = NO_JUMP;
+  e->u.info = reg;
+  e->k = VNONRELOC;
+}
+
+/* places the value of 'e' in a freshly reserved register */
+void luaK_exp2nextreg (FuncState *fs, expdesc *e) {
+  luaK_dischargevars(fs, e);
+  freeexp(fs, e);  /* release its register first so it may be reused */
+  luaK_reserveregs(fs, 1);
+  exp2reg(fs, e, fs->freereg - 1);
+}
+
+/* ensures the final value of 'e' is in a register and returns that register */
+int luaK_exp2anyreg (FuncState *fs, expdesc *e) {
+  luaK_dischargevars(fs, e);
+  if (e->k == VNONRELOC) {
+    if (!hasjumps(e)) return e->u.info;  /* exp is already in a register */
+    if (e->u.info >= fs->nactvar) {  /* reg. is not a local? */
+      exp2reg(fs, e, e->u.info);  /* put value on it */
+      return e->u.info;
+    }
+  }
+  luaK_exp2nextreg(fs, e);  /* default */
+  return e->u.info;
+}
+
+/* like luaK_exp2anyreg, except that a VUPVAL expression without jumps is left as is */
+void luaK_exp2anyregup (FuncState *fs, expdesc *e) {
+  if (e->k != VUPVAL || hasjumps(e))
+    luaK_exp2anyreg(fs, e);
+}
+
+/* ensures 'e' denotes a value: in a register if it has jumps, merely discharged otherwise */
+void luaK_exp2val (FuncState *fs, expdesc *e) {
+  if (hasjumps(e))
+    luaK_exp2anyreg(fs, e);
+  else
+    luaK_dischargevars(fs, e);
+}
+
+/* returns an RK operand for 'e': RKASK(constant index) when it fits, else a register */
+int luaK_exp2RK (FuncState *fs, expdesc *e) {
+  luaK_exp2val(fs, e);
+  switch (e->k) {
+    case VTRUE:
+    case VFALSE:
+    case VNIL: {
+      if (fs->nk <= MAXINDEXRK) {  /* constant fits in RK operand? */
+        e->u.info = (e->k == VNIL) ? nilK(fs) : boolK(fs, (e->k == VTRUE));
+        e->k = VK;
+        return RKASK(e->u.info);
+      }
+      else break;
+    }
+    case VKNUM: {
+      e->u.info = luaK_numberK(fs, e->u.nval);
+      e->k = VK;
+      /* go through */
+    }
+    case VK: {
+      if (e->u.info <= MAXINDEXRK)  /* constant fits in argC? */
+        return RKASK(e->u.info);
+      else break;
+    }
+    default: break;
+  }
+  /* not a constant in the right range: put it in a register */
+  return luaK_exp2anyreg(fs, e);
+}
+
+/* emits code storing the value of 'ex' into variable 'var' (local, upvalue or indexed) */
+void luaK_storevar (FuncState *fs, expdesc *var, expdesc *ex) {
+  switch (var->k) {
+    case VLOCAL: {
+      freeexp(fs, ex);
+      exp2reg(fs, ex, var->u.info);  /* compute directly into the local's register */
+      return;
+    }
+    case VUPVAL: {
+      int e = luaK_exp2anyreg(fs, ex);
+      luaK_codeABC(fs, OP_SETUPVAL, e, var->u.info, 0);
+      break;
+    }
+    case VINDEXED: {
+      OpCode op = (var->u.ind.vt == VLOCAL) ? OP_SETTABLE : OP_SETTABUP;
+      int e = luaK_exp2RK(fs, ex);
+      luaK_codeABC(fs, op, var->u.ind.t, var->u.ind.idx, e);
+      break;
+    }
+    default: {
+      lua_assert(0);  /* invalid var kind to store */
+      break;
+    }
+  }
+  freeexp(fs, ex);
+}
+
+/* emits OP_SELF for 'e:key'; 'e' ends as VNONRELOC at the base of two reserved registers */
+void luaK_self (FuncState *fs, expdesc *e, expdesc *key) {
+  int ereg;
+  luaK_exp2anyreg(fs, e);
+  ereg = e->u.info;  /* register where 'e' was placed */
+  freeexp(fs, e);
+  e->u.info = fs->freereg;  /* base register for op_self */
+  e->k = VNONRELOC;
+  luaK_reserveregs(fs, 2);  /* function and 'self' produced by op_self */
+  luaK_codeABC(fs, OP_SELF, e->u.info, ereg, luaK_exp2RK(fs, key));
+  freeexp(fs, key);
+}
+
+/* negates operand A of the instruction that controls the jump of 'e' */
+static void invertjump (FuncState *fs, expdesc *e) {
+  Instruction *pc = getjumpcontrol(fs, e->u.info);
+  lua_assert(testTMode(GET_OPCODE(*pc)) && GET_OPCODE(*pc) != OP_TESTSET &&
+                                           GET_OPCODE(*pc) != OP_TEST);
+  SETARG_A(*pc, !(GETARG_A(*pc)));
+}
+
+/* emits a test of 'e' followed by a jump, using 'cond' as test operand; returns the jump position */
+static int jumponcond (FuncState *fs, expdesc *e, int cond) {
+  if (e->k == VRELOCABLE) {
+    Instruction ie = getcode(fs, e);
+    if (GET_OPCODE(ie) == OP_NOT) {
+      fs->pc--;  /* remove previous OP_NOT */
+      return condjump(fs, OP_TEST, GETARG_B(ie), 0, !cond);  /* test operand of the NOT, inverted */
+    }
+    /* else go through */
+  }
+  discharge2anyreg(fs, e);
+  freeexp(fs, e);
+  return condjump(fs, OP_TESTSET, NO_REG, e->u.info, cond);
+}
+
+/* emits code that falls through when 'e' is true; the jump taken otherwise joins the 'f' list */
+void luaK_goiftrue (FuncState *fs, expdesc *e) {
+  int pc;  /* pc of last jump */
+  luaK_dischargevars(fs, e);
+  switch (e->k) {
+    case VJMP: {
+      invertjump(fs, e);
+      pc = e->u.info;
+      break;
+    }
+    case VK: case VKNUM: case VTRUE: {
+      pc = NO_JUMP;  /* always true; do nothing */
+      break;
+    }
+    default: {
+      pc = jumponcond(fs, e, 0);
+      break;
+    }
+  }
+  luaK_concat(fs, &e->f, pc);  /* insert last jump in `f' list */
+  luaK_patchtohere(fs, e->t);  /* jumps on true land at the current position */
+  e->t = NO_JUMP;
+}
+
+/* emits code that falls through when 'e' is false; the jump taken otherwise joins the 't' list */
+void luaK_goiffalse (FuncState *fs, expdesc *e) {
+  int pc;  /* pc of last jump */
+  luaK_dischargevars(fs, e);
+  switch (e->k) {
+    case VJMP: {
+      pc = e->u.info;
+      break;
+    }
+    case VNIL: case VFALSE: {
+      pc = NO_JUMP;  /* always false; do nothing */
+      break;
+    }
+    default: {
+      pc = jumponcond(fs, e, 1);
+      break;
+    }
+  }
+  luaK_concat(fs, &e->t, pc);  /* insert last jump in `t' list */
+  luaK_patchtohere(fs, e->f);  /* jumps on false land at the current position */
+  e->f = NO_JUMP;
+}
+
+/* applies logical 'not' to 'e': folds constants, inverts a jump, or emits OP_NOT */
+static void codenot (FuncState *fs, expdesc *e) {
+  luaK_dischargevars(fs, e);
+  switch (e->k) {
+    case VNIL: case VFALSE: {
+      e->k = VTRUE;
+      break;
+    }
+    case VK: case VKNUM: case VTRUE: {
+      e->k = VFALSE;
+      break;
+    }
+    case VJMP: {
+      invertjump(fs, e);
+      break;
+    }
+    case VRELOCABLE:
+    case VNONRELOC: {
+      discharge2anyreg(fs, e);
+      freeexp(fs, e);
+      e->u.info = luaK_codeABC(fs, OP_NOT, 0, e->u.info, 0);
+      e->k = VRELOCABLE;  /* operand A (0 for now) is set later */
+      break;
+    }
+    default: {
+      lua_assert(0);  /* cannot happen */
+      break;
+    }
+  }
+  /* interchange true and false lists */
+  { int temp = e->f; e->f = e->t; e->t = temp; }
+  removevalues(fs, e->f);
+  removevalues(fs, e->t);
+}
+
+/* turns 't' into a VINDEXED expression for 't[k]'; 'k' is converted to an RK operand */
+void luaK_indexed (FuncState *fs, expdesc *t, expdesc *k) {
+  lua_assert(!hasjumps(t));
+  t->u.ind.t = t->u.info;
+  t->u.ind.idx = luaK_exp2RK(fs, k);
+  t->u.ind.vt = (t->k == VUPVAL) ? VUPVAL
+                                 : check_exp(vkisinreg(t->k), VLOCAL);
+  t->k = VINDEXED;
+}
+
+/* tries to compute 'e1 op e2' at compile time; on success stores the result in 'e1' and returns 1 */
+static int constfolding (OpCode op, expdesc *e1, expdesc *e2) {
+  const int divides = (op == OP_DIV || op == OP_MOD);
+  if (!(isnumeral(e1) && isnumeral(e2)))
+    return 0;  /* both operands must be numeric constants without jumps */
+  if (divides && e2->u.nval == 0)
+    return 0;  /* do not attempt to divide by 0 */
+  e1->u.nval = luaO_arith(op - OP_ADD + LUA_OPADD, e1->u.nval, e2->u.nval);
+  return 1;
+}
+
+/* emits arithmetic instruction 'op' on 'e1' and 'e2' unless it folds; result is left in 'e1' */
+static void codearith (FuncState *fs, OpCode op,
+                       expdesc *e1, expdesc *e2, int line) {
+  if (constfolding(op, e1, e2))
+    return;
+  else {
+    int o2 = (op != OP_UNM && op != OP_LEN) ? luaK_exp2RK(fs, e2) : 0;
+    int o1 = luaK_exp2RK(fs, e1);
+    if (o1 > o2) {  /* free the higher operand first (freereg asserts this order) */
+      freeexp(fs, e1);
+      freeexp(fs, e2);
+    }
+    else {
+      freeexp(fs, e2);
+      freeexp(fs, e1);
+    }
+    e1->u.info = luaK_codeABC(fs, op, 0, o1, o2);
+    e1->k = VRELOCABLE;
+    luaK_fixline(fs, line);  /* attribute the instruction to 'line' */
+  }
+}
+
+/* emits comparison 'op' on 'e1' and 'e2' plus a jump; 'e1' becomes a VJMP expression */
+static void codecomp (FuncState *fs, OpCode op, int cond, expdesc *e1,
+                                                          expdesc *e2) {
+  int o1 = luaK_exp2RK(fs, e1);
+  int o2 = luaK_exp2RK(fs, e2);
+  freeexp(fs, e2);
+  freeexp(fs, e1);
+  if (cond == 0 && op != OP_EQ) {
+    int temp;  /* exchange args to replace by `<' or `<=' */
+    temp = o1; o1 = o2; o2 = temp;  /* o1 <==> o2 */
+    cond = 1;
+  }
+  e1->u.info = condjump(fs, op, cond, o1, o2);
+  e1->k = VJMP;
+}
+
+/* applies unary operator 'op' to 'e', folding a negated numeric constant when possible */
+void luaK_prefix (FuncState *fs, UnOpr op, expdesc *e, int line) {
+  expdesc e2;
+  e2.t = e2.f = NO_JUMP; e2.k = VKNUM; e2.u.nval = 0;  /* dummy second operand */
+  switch (op) {
+    case OPR_MINUS: {
+      if (isnumeral(e))  /* minus constant? */
+        e->u.nval = luai_numunm(NULL, e->u.nval);  /* fold it */
+      else {
+        luaK_exp2anyreg(fs, e);
+        codearith(fs, OP_UNM, e, &e2, line);
+      }
+      break;
+    }
+    case OPR_NOT: codenot(fs, e); break;
+    case OPR_LEN: {
+      luaK_exp2anyreg(fs, e);  /* cannot operate on constants */
+      codearith(fs, OP_LEN, e, &e2, line);
+      break;
+    }
+    default: lua_assert(0);
+  }
+}
+
+/* prepares first operand 'v' of binary operator 'op', before the second operand is handled */
+void luaK_infix (FuncState *fs, BinOpr op, expdesc *v) {
+  switch (op) {
+    case OPR_AND: {
+      luaK_goiftrue(fs, v);
+      break;
+    }
+    case OPR_OR: {
+      luaK_goiffalse(fs, v);
+      break;
+    }
+    case OPR_CONCAT: {
+      luaK_exp2nextreg(fs, v);  /* operand must be on the `stack' */
+      break;
+    }
+    case OPR_ADD: case OPR_SUB: case OPR_MUL: case OPR_DIV:
+    case OPR_MOD: case OPR_POW: {
+      if (!isnumeral(v)) luaK_exp2RK(fs, v);  /* numerals stay as they are */
+      break;
+    }
+    default: {
+      luaK_exp2RK(fs, v);
+      break;
+    }
+  }
+}
+
+/* finishes binary operation 'e1 op e2' once both operands are known; the result is left in 'e1' */
+void luaK_posfix (FuncState *fs, BinOpr op,
+                  expdesc *e1, expdesc *e2, int line) {
+  switch (op) {
+    case OPR_AND: {
+      lua_assert(e1->t == NO_JUMP);  /* list must be closed */
+      luaK_dischargevars(fs, e2);
+      luaK_concat(fs, &e2->f, e1->f);  /* merge both 'f' lists */
+      *e1 = *e2;
+      break;
+    }
+    case OPR_OR: {
+      lua_assert(e1->f == NO_JUMP);  /* list must be closed */
+      luaK_dischargevars(fs, e2);
+      luaK_concat(fs, &e2->t, e1->t);  /* merge both 't' lists */
+      *e1 = *e2;
+      break;
+    }
+    case OPR_CONCAT: {
+      luaK_exp2val(fs, e2);
+      if (e2->k == VRELOCABLE && GET_OPCODE(getcode(fs, e2)) == OP_CONCAT) {
+        lua_assert(e1->u.info == GETARG_B(getcode(fs, e2))-1);
+        freeexp(fs, e1);
+        SETARG_B(getcode(fs, e2), e1->u.info);  /* extend existing CONCAT to start at 'e1' */
+        e1->k = VRELOCABLE; e1->u.info = e2->u.info;
+      }
+      else {
+        luaK_exp2nextreg(fs, e2);  /* operand must be on the 'stack' */
+        codearith(fs, OP_CONCAT, e1, e2, line);
+      }
+      break;
+    }
+    case OPR_ADD: case OPR_SUB: case OPR_MUL: case OPR_DIV:
+    case OPR_MOD: case OPR_POW: {
+      codearith(fs, cast(OpCode, op - OPR_ADD + OP_ADD), e1, e2, line);
+      break;
+    }
+    case OPR_EQ: case OPR_LT: case OPR_LE: {
+      codecomp(fs, cast(OpCode, op - OPR_EQ + OP_EQ), 1, e1, e2);
+      break;
+    }
+    case OPR_NE: case OPR_GT: case OPR_GE: {
+      codecomp(fs, cast(OpCode, op - OPR_NE + OP_EQ), 0, e1, e2);
+      break;
+    }
+    default: lua_assert(0);
+  }
+}
+
+/* overrides the line number recorded for the most recently emitted instruction */
+void luaK_fixline (FuncState *fs, int line) {
+  fs->f->lineinfo[fs->pc - 1] = line;
+}
+
+/* emits OP_SETLIST for a table constructor and frees the registers above 'base' */
+void luaK_setlist (FuncState *fs, int base, int nelems, int tostore) {
+  int c =  (nelems - 1)/LFIELDS_PER_FLUSH + 1;  /* 1-based batch number */
+  int b = (tostore == LUA_MULTRET) ? 0 : tostore;
+  lua_assert(tostore != 0);
+  if (c <= MAXARG_C)
+    luaK_codeABC(fs, OP_SETLIST, base, b, c);
+  else if (c <= MAXARG_Ax) {  /* too big for operand C: pass it in an extra argument */
+    luaK_codeABC(fs, OP_SETLIST, base, b, 0);
+    codeextraarg(fs, c);
+  }
+  else
+    luaX_syntaxerror(fs->ls, "constructor too long");
+  fs->freereg = base + 1;  /* free registers with list values */
+}
+
diff --git a/ext/lua/src/lcorolib.c b/ext/lua/src/lcorolib.c
new file mode 100644
index 0000000..1326c81
--- /dev/null
+++ b/ext/lua/src/lcorolib.c
@@ -0,0 +1,155 @@
+/*
+** $Id: lcorolib.c,v 1.5 2013/02/21 13:44:53 roberto Exp $
+** Coroutine Library
+** See Copyright Notice in lua.h
+*/
+
+
+#include <stdlib.h>
+
+
+#define lcorolib_c
+#define LUA_LIB
+
+#include "lua.h"
+
+#include "lauxlib.h"
+#include "lualib.h"
+
+/* resumes 'co' with 'narg' values from L; returns the result count moved to L, or -1 with a message on L */
+static int auxresume (lua_State *L, lua_State *co, int narg) {
+  int status;
+  if (!lua_checkstack(co, narg)) {
+    lua_pushliteral(L, "too many arguments to resume");
+    return -1;  /* error flag */
+  }
+  if (lua_status(co) == LUA_OK && lua_gettop(co) == 0) {
+    lua_pushliteral(L, "cannot resume dead coroutine");
+    return -1;  /* error flag */
+  }
+  lua_xmove(L, co, narg);  /* hand the arguments over to 'co' */
+  status = lua_resume(co, L, narg);
+  if (status == LUA_OK || status == LUA_YIELD) {
+    int nres = lua_gettop(co);
+    if (!lua_checkstack(L, nres + 1)) {
+      lua_pop(co, nres);  /* remove results anyway */
+      lua_pushliteral(L, "too many results to resume");
+      return -1;  /* error flag */
+    }
+    lua_xmove(co, L, nres);  /* move yielded values */
+    return nres;
+  }
+  else {
+    lua_xmove(co, L, 1);  /* move error message */
+    return -1;  /* error flag */
+  }
+}
+
+
+/*
+** coroutine.resume(co, ...): resumes 'co' with the remaining arguments.
+** Returns true followed by the values handed back by auxresume, or
+** false followed by the error message when auxresume reports failure.
+*/
+static int luaB_coresume (lua_State *L) {
+  lua_State *co = lua_tothread(L, 1);
+  int r, ok, nvals;
+  luaL_argcheck(L, co, 1, "coroutine expected");
+  r = auxresume(L, co, lua_gettop(L) - 1);
+  ok = (r >= 0);  /* a negative count is the error flag */
+  nvals = ok ? r : 1;  /* `resume' returns, or the single error message */
+  lua_pushboolean(L, ok);
+  lua_insert(L, -(nvals + 1));  /* slide the flag below those values */
+  return nvals + 1;  /* status flag + values */
+}
+
+/* body of the function made by coroutine.wrap: resumes the thread in upvalue 1, raising its errors */
+static int luaB_auxwrap (lua_State *L) {
+  lua_State *co = lua_tothread(L, lua_upvalueindex(1));
+  int r = auxresume(L, co, lua_gettop(L));  /* every argument goes to 'co' */
+  if (r < 0) {
+    if (lua_isstring(L, -1)) {  /* error object is a string? */
+      luaL_where(L, 1);  /* add extra info */
+      lua_insert(L, -2);  /* position info goes before the message */
+      lua_concat(L, 2);
+    }
+    return lua_error(L);  /* propagate error */
+  }
+  return r;
+}
+
+/* coroutine.create(f): returns a new thread whose stack holds function 'f' */
+static int luaB_cocreate (lua_State *L) {
+  lua_State *NL;
+  luaL_checktype(L, 1, LUA_TFUNCTION);
+  NL = lua_newthread(L);
+  lua_pushvalue(L, 1);  /* move function to top */
+  lua_xmove(L, NL, 1);  /* move function from L to NL */
+  return 1;  /* the thread pushed by lua_newthread is left on top */
+}
+
+/* coroutine.wrap(f): returns a luaB_auxwrap closure holding a new coroutine as upvalue */
+static int luaB_cowrap (lua_State *L) {
+  luaB_cocreate(L);  /* leaves the new thread on top of the stack */
+  lua_pushcclosure(L, luaB_auxwrap, 1);  /* thread becomes upvalue 1 */
+  return 1;
+}
+
+/* coroutine.yield(...): yields with every argument on the stack as a yielded value */
+static int luaB_yield (lua_State *L) {
+  return lua_yield(L, lua_gettop(L));
+}
+
+/* coroutine.status(co): pushes "running", "suspended", "normal" or "dead" */
+static int luaB_costatus (lua_State *L) {
+  lua_State *co = lua_tothread(L, 1);
+  luaL_argcheck(L, co, 1, "coroutine expected");
+  if (L == co) lua_pushliteral(L, "running");  /* asking about the caller itself */
+  else {
+    switch (lua_status(co)) {
+      case LUA_YIELD:
+        lua_pushliteral(L, "suspended");
+        break;
+      case LUA_OK: {
+        lua_Debug ar;
+        if (lua_getstack(co, 0, &ar) > 0)  /* does it have frames? */
+          lua_pushliteral(L, "normal");  /* it is running */
+        else if (lua_gettop(co) == 0)  /* no frames and an empty stack */
+            lua_pushliteral(L, "dead");
+        else
+          lua_pushliteral(L, "suspended");  /* initial state */
+        break;
+      }
+      default:  /* some error occurred */
+        lua_pushliteral(L, "dead");
+        break;
+    }
+  }
+  return 1;
+}
+
+/* coroutine.running(): returns the calling thread plus the flag given by lua_pushthread */
+static int luaB_corunning (lua_State *L) {
+  int ismain = lua_pushthread(L);
+  lua_pushboolean(L, ismain);
+  return 2;
+}
+
+/* name/function pairs passed to luaL_newlib by luaopen_coroutine; NULL pair ends the list */
+static const luaL_Reg co_funcs[] = {
+  {"create", luaB_cocreate},
+  {"resume", luaB_coresume},
+  {"running", luaB_corunning},
+  {"status", luaB_costatus},
+  {"wrap", luaB_cowrap},
+  {"yield", luaB_yield},
+  {NULL, NULL}
+};
+
+
+/* opens the coroutine library: builds a table from 'co_funcs' and returns it */
+LUAMOD_API int luaopen_coroutine (lua_State *L) {
+  luaL_newlib(L, co_funcs);
+  return 1;
+}
+
diff --git a/ext/lua/src/lctype.c b/ext/lua/src/lctype.c
new file mode 100644
index 0000000..55e433a
--- /dev/null
+++ b/ext/lua/src/lctype.c
@@ -0,0 +1,52 @@
+/*
+** $Id: lctype.c,v 1.11 2011/10/03 16:19:23 roberto Exp $
+** 'ctype' functions for Lua
+** See Copyright Notice in lua.h
+*/
+
+#define lctype_c
+#define LUA_CORE
+
+#include "lctype.h"
+
+#if !LUA_USE_CTYPE	/* { */
+
+#include <limits.h>
+
+LUAI_DDEF const lu_byte luai_ctype_[UCHAR_MAX + 2] = {  /* one EOZ slot, then one entry per unsigned char value */
+  0x00,  /* EOZ */
+  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,	/* 0. */
+  0x00,  0x08,  0x08,  0x08,  0x08,  0x08,  0x00,  0x00,
+  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,	/* 1. */
+  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,
+  0x0c,  0x04,  0x04,  0x04,  0x04,  0x04,  0x04,  0x04,	/* 2. */
+  0x04,  0x04,  0x04,  0x04,  0x04,  0x04,  0x04,  0x04,
+  0x16,  0x16,  0x16,  0x16,  0x16,  0x16,  0x16,  0x16,	/* 3. */
+  0x16,  0x16,  0x04,  0x04,  0x04,  0x04,  0x04,  0x04,
+  0x04,  0x15,  0x15,  0x15,  0x15,  0x15,  0x15,  0x05,	/* 4. */
+  0x05,  0x05,  0x05,  0x05,  0x05,  0x05,  0x05,  0x05,
+  0x05,  0x05,  0x05,  0x05,  0x05,  0x05,  0x05,  0x05,	/* 5. */
+  0x05,  0x05,  0x05,  0x04,  0x04,  0x04,  0x04,  0x05,
+  0x04,  0x15,  0x15,  0x15,  0x15,  0x15,  0x15,  0x05,	/* 6. */
+  0x05,  0x05,  0x05,  0x05,  0x05,  0x05,  0x05,  0x05,
+  0x05,  0x05,  0x05,  0x05,  0x05,  0x05,  0x05,  0x05,	/* 7. */
+  0x05,  0x05,  0x05,  0x04,  0x04,  0x04,  0x04,  0x00,
+  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,	/* 8. */
+  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,
+  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,	/* 9. */
+  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,
+  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,	/* a. */
+  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,
+  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,	/* b. */
+  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,
+  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,	/* c. */
+  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,
+  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,	/* d. */
+  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,
+  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,	/* e. */
+  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,
+  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,	/* f. */
+  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,  0x00,
+};
+
+#endif			/* } */
diff --git a/ext/lua/src/ldblib.c b/ext/lua/src/ldblib.c
new file mode 100644
index 0000000..c022694
--- /dev/null
+++ b/ext/lua/src/ldblib.c
@@ -0,0 +1,398 @@
+/*
+** $Id: ldblib.c,v 1.132 2012/01/19 20:14:44 roberto Exp $
+** Interface from Lua to its debug API
+** See Copyright Notice in lua.h
+*/
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define ldblib_c
+#define LUA_LIB
+
+#include "lua.h"
+
+#include "lauxlib.h"
+#include "lualib.h"
+
+
+#define HOOKKEY		"_HKEY"
+
+
+
+static int db_getregistry (lua_State *L) {
+  lua_pushvalue(L, LUA_REGISTRYINDEX);  /* push the value at the registry pseudo-index */
+  return 1;
+}
+
+
+static int db_getmetatable (lua_State *L) {
+  luaL_checkany(L, 1);  /* any type is accepted, but an argument must be present */
+  if (!lua_getmetatable(L, 1)) {
+    lua_pushnil(L);  /* no metatable */
+  }
+  return 1;  /* the metatable, or nil */
+}
+
+
+static int db_setmetatable (lua_State *L) {
+  int kind = lua_type(L, 2);  /* type tag of the proposed metatable */
+  int acceptable = (kind == LUA_TNIL || kind == LUA_TTABLE);
+  luaL_argcheck(L, acceptable, 2, "nil or table expected");
+  lua_settop(L, 2);  /* keep exactly the object and the metatable */
+  lua_setmetatable(L, 1);  /* consumes the metatable from the top */
+  return 1;  /* return 1st argument */
+}
+
+
+static int db_getuservalue (lua_State *L) {
+  if (lua_type(L, 1) == LUA_TUSERDATA)
+    lua_getuservalue(L, 1);  /* userdata: push the value associated with it */
+  else
+    lua_pushnil(L);  /* any other argument yields nil */
+  return 1;
+}
+
+
+static int db_setuservalue (lua_State *L) {
+  if (lua_type(L, 1) == LUA_TLIGHTUSERDATA)  /* light userdata gets its own, more specific message */
+    luaL_argerror(L, 1, "full userdata expected, got light userdata");
+  luaL_checktype(L, 1, LUA_TUSERDATA);
+  if (!lua_isnoneornil(L, 2))  /* a present, non-nil second argument must be a table */
+    luaL_checktype(L, 2, LUA_TTABLE);
+  lua_settop(L, 2);  /* exactly two slots: userdata and new value */
+  lua_setuservalue(L, 1);  /* consumes the value from the top */
+  return 1;  /* one result: what remains on top of the stack */
+}
+
+
+static void settabss (lua_State *L, const char *i, const char *v) {  /* t[i] = string v, t being the table on top */
+  lua_pushstring(L, v);
+  lua_setfield(L, -2, i);  /* the table is at -2 now that the value sits on top */
+}
+
+
+static void settabsi (lua_State *L, const char *i, int v) {  /* t[i] = integer v, t being the table on top */
+  lua_pushinteger(L, v);
+  lua_setfield(L, -2, i);  /* the table is at -2 now that the value sits on top */
+}
+
+
+static void settabsb (lua_State *L, const char *i, int v) {  /* t[i] = boolean v, t being the table on top */
+  lua_pushboolean(L, v);
+  lua_setfield(L, -2, i);  /* the table is at -2 now that the value sits on top */
+}
+
+
+static lua_State *getthread (lua_State *L, int *arg) {
+  if (!lua_isthread(L, 1)) {
+    *arg = 0;  /* no leading thread: remaining arguments are not shifted */
+    return L;  /* work on the calling thread */
+  }
+  else {
+    *arg = 1;  /* remaining arguments start one slot later */
+    return lua_tothread(L, 1);
+  }
+}
+
+
+static void treatstackoption (lua_State *L, lua_State *L1, const char *fname) {  /* store a value left by lua_getinfo into the result table as field 'fname' */
+  if (L == L1) {  /* same stack: the value lies just below the result table */
+    lua_pushvalue(L, -2);  /* copy it above the table */
+    lua_remove(L, -3);  /* drop the original from under the table */
+  }
+  else
+    lua_xmove(L1, L, 1);  /* other thread: take the value from L1's top */
+  lua_setfield(L, -2, fname);  /* table[fname] = value */
+}
+
+
+static int db_getinfo (lua_State *L) {
+  lua_Debug ar;
+  int arg;
+  lua_State *L1 = getthread(L, &arg);  /* thread to inspect; 'arg' offsets the remaining arguments */
+  const char *options = luaL_optstring(L, arg+2, "flnStu");
+  if (lua_isnumber(L, arg+1)) {  /* stack level given? */
+    if (!lua_getstack(L1, (int)lua_tointeger(L, arg+1), &ar)) {
+      lua_pushnil(L);  /* level out of range */
+      return 1;
+    }
+  }
+  else if (lua_isfunction(L, arg+1)) {  /* function given? */
+    lua_pushfstring(L, ">%s", options);  /* prefix '>' so lua_getinfo takes the function from the stack */
+    options = lua_tostring(L, -1);
+    lua_pushvalue(L, arg+1);
+    lua_xmove(L, L1, 1);  /* NOTE(review): no lua_checkstack on L1 before this move -- confirm space is guaranteed when L1 != L */
+  }
+  else
+    return luaL_argerror(L, arg+1, "function or level expected");
+  if (!lua_getinfo(L1, options, &ar))
+    return luaL_argerror(L, arg+2, "invalid option");
+  lua_createtable(L, 0, 2);  /* result table; created after lua_getinfo has pushed any 'f'/'L' values */
+  if (strchr(options, 'S')) {
+    settabss(L, "source", ar.source);
+    settabss(L, "short_src", ar.short_src);
+    settabsi(L, "linedefined", ar.linedefined);
+    settabsi(L, "lastlinedefined", ar.lastlinedefined);
+    settabss(L, "what", ar.what);
+  }
+  if (strchr(options, 'l'))
+    settabsi(L, "currentline", ar.currentline);
+  if (strchr(options, 'u')) {
+    settabsi(L, "nups", ar.nups);
+    settabsi(L, "nparams", ar.nparams);
+    settabsb(L, "isvararg", ar.isvararg);
+  }
+  if (strchr(options, 'n')) {
+    settabss(L, "name", ar.name);
+    settabss(L, "namewhat", ar.namewhat);
+  }
+  if (strchr(options, 't'))
+    settabsb(L, "istailcall", ar.istailcall);
+  if (strchr(options, 'L'))  /* 'L' before 'f': lua_getinfo pushes the line table last, so it is nearest the top */
+    treatstackoption(L, L1, "activelines");
+  if (strchr(options, 'f'))
+    treatstackoption(L, L1, "func");
+  return 1;  /* return table */
+}
+
+
+static int db_getlocal (lua_State *L) {
+  int arg;
+  lua_State *L1 = getthread(L, &arg);  /* thread to inspect; 'arg' offsets the remaining arguments */
+  lua_Debug ar;
+  const char *name;
+  int nvar = luaL_checkint(L, arg+2);  /* local-variable index */
+  if (lua_isfunction(L, arg + 1)) {  /* function argument? */
+    lua_pushvalue(L, arg + 1);  /* push function */
+    lua_pushstring(L, lua_getlocal(L, NULL, nvar));  /* push local name */
+    return 1;  /* only the name; no value is returned for a function argument */
+  }
+  else {  /* stack-level argument */
+    if (!lua_getstack(L1, luaL_checkint(L, arg+1), &ar))  /* out of range? */
+      return luaL_argerror(L, arg+1, "level out of range");
+    name = lua_getlocal(L1, &ar, nvar);  /* on success the value is pushed on L1 */
+    if (name) {
+      lua_xmove(L1, L, 1);  /* push local value */
+      lua_pushstring(L, name);  /* push name */
+      lua_pushvalue(L, -2);  /* re-order */
+      return 2;  /* name, value */
+    }
+    else {
+      lua_pushnil(L);  /* no name (nor value) */
+      return 1;
+    }
+  }
+}
+
+
+static int db_setlocal (lua_State *L) {
+  int arg;
+  lua_State *L1 = getthread(L, &arg);  /* thread to modify; 'arg' offsets the remaining arguments */
+  lua_Debug ar;
+  if (!lua_getstack(L1, luaL_checkint(L, arg+1), &ar))  /* out of range? */
+    return luaL_argerror(L, arg+1, "level out of range");
+  luaL_checkany(L, arg+3);  /* the new value is mandatory */
+  lua_settop(L, arg+3);  /* make the new value the top of L's stack */
+  lua_xmove(L, L1, 1);  /* hand it to the target thread */
+  lua_pushstring(L, lua_setlocal(L1, &ar, luaL_checkint(L, arg+2)));  /* result is whatever name lua_setlocal reports */
+  return 1;
+}
+
+
+static int auxupvalue (lua_State *L, int get) {  /* shared body: get != 0 reads the upvalue, get == 0 writes it */
+  const char *name;
+  int n = luaL_checkint(L, 2);  /* upvalue index */
+  luaL_checktype(L, 1, LUA_TFUNCTION);
+  name = get ? lua_getupvalue(L, 1, n) : lua_setupvalue(L, 1, n);
+  if (name == NULL) return 0;  /* no name reported: return nothing */
+  lua_pushstring(L, name);
+  lua_insert(L, -(get+1));  /* when getting, slide the name under the pushed value */
+  return get + 1;  /* name only, or name and value */
+}
+
+
+static int db_getupvalue (lua_State *L) {
+  return auxupvalue(L, 1);  /* 1 selects the "get" behaviour */
+}
+
+
+static int db_setupvalue (lua_State *L) {
+  luaL_checkany(L, 3);  /* the new value is mandatory */
+  return auxupvalue(L, 0);  /* 0 selects the "set" behaviour */
+}
+
+
+static int checkupval (lua_State *L, int argf, int argnup) {  /* validate a (function, upvalue index) argument pair */
+  lua_Debug ar;
+  int nup = luaL_checkint(L, argnup);
+  luaL_checktype(L, argf, LUA_TFUNCTION);
+  lua_pushvalue(L, argf);  /* copy for lua_getinfo, whose '>' option takes the function from the top */
+  lua_getinfo(L, ">u", &ar);  /* 'u' fills ar.nups */
+  luaL_argcheck(L, 1 <= nup && nup <= ar.nups, argnup, "invalid upvalue index");
+  return nup;  /* the checked index */
+}
+
+
+static int db_upvalueid (lua_State *L) {
+  int n = checkupval(L, 1, 2);  /* function at 1, upvalue index at 2 */
+  lua_pushlightuserdata(L, lua_upvalueid(L, 1, n));  /* the id is returned as a light userdata */
+  return 1;
+}
+
+
+static int db_upvaluejoin (lua_State *L) {
+  int n1 = checkupval(L, 1, 2);  /* first function and its upvalue index */
+  int n2 = checkupval(L, 3, 4);  /* second function and its upvalue index */
+  luaL_argcheck(L, !lua_iscfunction(L, 1), 1, "Lua function expected");
+  luaL_argcheck(L, !lua_iscfunction(L, 3), 3, "Lua function expected");
+  lua_upvaluejoin(L, 1, n1, 3, n2);
+  return 0;  /* no results */
+}
+
+
+#define gethooktable(L)	luaL_getsubtable(L, LUA_REGISTRYINDEX, HOOKKEY)
+
+
+static void hookf (lua_State *L, lua_Debug *ar) {  /* C hook that forwards the event to the Lua function stored for this thread */
+  static const char *const hooknames[] =
+    {"call", "return", "line", "count", "tail call"};  /* indexed by ar->event */
+  gethooktable(L);
+  lua_pushthread(L);
+  lua_rawget(L, -2);  /* hooktable[L] */
+  if (lua_isfunction(L, -1)) {
+    lua_pushstring(L, hooknames[(int)ar->event]);  /* 1st argument: event name */
+    if (ar->currentline >= 0)
+      lua_pushinteger(L, ar->currentline);  /* 2nd argument: line number... */
+    else lua_pushnil(L);  /* ...or nil when the line is negative */
+    lua_assert(lua_getinfo(L, "lS", ar));  /* NOTE(review): the call lives inside lua_assert -- presumably compiled out when assertions are off; confirm */
+    lua_call(L, 2, 0);  /* hook(event, line), results discarded */
+  }
+}
+
+
+static int makemask (const char *smask, int count) {
+  int bits = (count > 0) ? LUA_MASKCOUNT : 0;  /* count hook only for a positive count */
+  /* one flag for each recognised letter present in 'smask' */
+  if (strchr(smask, 'c') != NULL) bits |= LUA_MASKCALL;
+  if (strchr(smask, 'r') != NULL) bits |= LUA_MASKRET;
+  if (strchr(smask, 'l') != NULL) bits |= LUA_MASKLINE;
+  return bits;
+}
+
+
+static char *unmakemask (int mask, char *smask) {
+  char *out = smask;  /* write cursor into the caller's buffer */
+  if (mask & LUA_MASKCALL) *out++ = 'c';
+  if (mask & LUA_MASKRET) *out++ = 'r';
+  if (mask & LUA_MASKLINE) *out++ = 'l';
+  *out = '\0';  /* terminate after at most three letters */
+  return smask;
+}
+
+
+static int db_sethook (lua_State *L) {
+  int arg, mask, count;
+  lua_Hook func;
+  lua_State *L1 = getthread(L, &arg);  /* thread to hook; 'arg' offsets the remaining arguments */
+  if (lua_isnoneornil(L, arg+1)) {  /* no hook function given */
+    lua_settop(L, arg+1);  /* ensure slot arg+1 exists (nil) for the table store below */
+    func = NULL; mask = 0; count = 0;  /* turn off hooks */
+  }
+  else {
+    const char *smask = luaL_checkstring(L, arg+2);
+    luaL_checktype(L, arg+1, LUA_TFUNCTION);
+    count = luaL_optint(L, arg+3, 0);
+    func = hookf; mask = makemask(smask, count);  /* hookf dispatches to the stored Lua function */
+  }
+  if (gethooktable(L) == 0) {  /* creating hook table? */
+    lua_pushstring(L, "k");
+    lua_setfield(L, -2, "__mode");  /** hooktable.__mode = "k" */
+    lua_pushvalue(L, -1);
+    lua_setmetatable(L, -2);  /* setmetatable(hooktable) = hooktable */
+  }
+  lua_pushthread(L1); lua_xmove(L1, L, 1);  /* key: the hooked thread, pushed on L1 then moved to L */
+  lua_pushvalue(L, arg+1);  /* value: the hook function (or nil) */
+  lua_rawset(L, -3);  /* set new hook */
+  lua_sethook(L1, func, mask, count);  /* set hooks */
+  return 0;
+}
+
+
+static int db_gethook (lua_State *L) {
+  int arg;
+  lua_State *L1 = getthread(L, &arg);
+  char buff[5];  /* unmakemask writes at most three letters plus the terminator */
+  int mask = lua_gethookmask(L1);
+  lua_Hook hook = lua_gethook(L1);
+  if (hook != NULL && hook != hookf)  /* external hook? */
+    lua_pushliteral(L, "external hook");
+  else {
+    gethooktable(L);
+    lua_pushthread(L1); lua_xmove(L1, L, 1);  /* key: the inspected thread */
+    lua_rawget(L, -2);   /* get hook */
+    lua_remove(L, -2);  /* remove hook table */
+  }
+  lua_pushstring(L, unmakemask(mask, buff));  /* mask rendered as letters */
+  lua_pushinteger(L, lua_gethookcount(L1));
+  return 3;  /* hook, mask string, count */
+}
+
+
+static int db_debug (lua_State *L) {
+  for (;;) {  /* read-and-run loop; ends when input ends or a line is exactly "cont" */
+    char buffer[250];
+    luai_writestringerror("%s", "lua_debug> ");  /* prompt */
+    if (fgets(buffer, sizeof(buffer), stdin) == 0 ||
+        strcmp(buffer, "cont\n") == 0)
+      return 0;
+    if (luaL_loadbuffer(L, buffer, strlen(buffer), "=(debug command)") ||
+        lua_pcall(L, 0, 0, 0))
+      luai_writestringerror("%s\n", lua_tostring(L, -1));  /* report a load or run failure */
+    lua_settop(L, 0);  /* remove eventual returns */
+  }
+}
+
+
+static int db_traceback (lua_State *L) {
+  int arg;
+  lua_State *L1 = getthread(L, &arg);  /* thread to trace; 'arg' offsets the remaining arguments */
+  const char *msg = lua_tostring(L, arg + 1);
+  if (msg == NULL && !lua_isnoneornil(L, arg + 1))  /* non-string 'msg'? */
+    lua_pushvalue(L, arg + 1);  /* return it untouched */
+  else {
+    int level = luaL_optint(L, arg + 2, (L == L1) ? 1 : 0);  /* default level: 1 for the calling thread, 0 for another */
+    luaL_traceback(L, L1, msg, level);
+  }
+  return 1;
+}
+
+
+static const luaL_Reg dblib[] = {  /* Lua-visible name paired with its C implementation */
+  {"debug", db_debug},
+  {"getuservalue", db_getuservalue},
+  {"gethook", db_gethook},
+  {"getinfo", db_getinfo},
+  {"getlocal", db_getlocal},
+  {"getregistry", db_getregistry},
+  {"getmetatable", db_getmetatable},
+  {"getupvalue", db_getupvalue},
+  {"upvaluejoin", db_upvaluejoin},
+  {"upvalueid", db_upvalueid},
+  {"setuservalue", db_setuservalue},
+  {"sethook", db_sethook},
+  {"setlocal", db_setlocal},
+  {"setmetatable", db_setmetatable},
+  {"setupvalue", db_setupvalue},
+  {"traceback", db_traceback},
+  {NULL, NULL}  /* end-of-list marker */
+};
+
+
+LUAMOD_API int luaopen_debug (lua_State *L) {
+  luaL_newlib(L, dblib);  /* build the library table from dblib */
+  return 1;  /* the library table */
+}
+
diff --git a/ext/lua/src/ldebug.c b/ext/lua/src/ldebug.c
new file mode 100644
index 0000000..7e04f9d
--- /dev/null
+++ b/ext/lua/src/ldebug.c
@@ -0,0 +1,580 @@
+/*
+** $Id: ldebug.c,v 2.90 2012/08/16 17:34:28 roberto Exp $
+** Debug Interface
+** See Copyright Notice in lua.h
+*/
+
+
+#include <stdarg.h>
+#include <stddef.h>
+#include <string.h>
+
+
+#define ldebug_c
+#define LUA_CORE
+
+#include "lua.h"
+
+#include "lapi.h"
+#include "lcode.h"
+#include "ldebug.h"
+#include "ldo.h"
+#include "lfunc.h"
+#include "lobject.h"
+#include "lopcodes.h"
+#include "lstate.h"
+#include "lstring.h"
+#include "ltable.h"
+#include "ltm.h"
+#include "lvm.h"
+
+
+
+#define noLuaClosure(f)		((f) == NULL || (f)->c.tt == LUA_TCCL)
+
+
+static const char *getfuncname (lua_State *L, CallInfo *ci, const char **name);
+
+
+static int currentpc (CallInfo *ci) {
+  lua_assert(isLua(ci));  /* callers must pass a frame for which isLua holds */
+  return pcRel(ci->u.l.savedpc, ci_func(ci)->p);  /* saved pc expressed relative to the function's prototype */
+}
+
+
+static int currentline (CallInfo *ci) {
+  return getfuncline(ci_func(ci)->p, currentpc(ci));  /* line looked up for the frame's current pc */
+}
+
+
+/*
+** this function can be called asynchronously (e.g. during a signal)
+*/
+LUA_API int lua_sethook (lua_State *L, lua_Hook func, int mask, int count) {
+  if (func == NULL || mask == 0) {  /* turn off hooks? */
+    mask = 0;  /* normalise: either condition disables both */
+    func = NULL;
+  }
+  if (isLua(L->ci))
+    L->oldpc = L->ci->u.l.savedpc;  /* record the current pc of the running Lua frame */
+  L->hook = func;
+  L->basehookcount = count;
+  resethookcount(L);
+  L->hookmask = cast_byte(mask);
+  return 1;  /* always 1 */
+}
+
+
+LUA_API lua_Hook lua_gethook (lua_State *L) {
+  return L->hook;  /* as stored by lua_sethook (NULL when hooks are off) */
+}
+
+
+LUA_API int lua_gethookmask (lua_State *L) {
+  return L->hookmask;  /* as stored by lua_sethook */
+}
+
+
+LUA_API int lua_gethookcount (lua_State *L) {
+  return L->basehookcount;  /* the 'count' given to lua_sethook */
+}
+
+
+LUA_API int lua_getstack (lua_State *L, int level, lua_Debug *ar) {
+  int found;
+  CallInfo *ci;
+  if (level < 0) return 0;  /* invalid (negative) level */
+  lua_lock(L);
+  ci = L->ci;  /* level 0 is the current frame */
+  while (level > 0 && ci != &L->base_ci) {  /* one frame outwards per level */
+    ci = ci->previous;
+    level--;
+  }
+  found = (level == 0 && ci != &L->base_ci);  /* stopped on a real frame? */
+  if (found) ar->i_ci = ci;  /* remember the frame for later queries */
+  lua_unlock(L);
+  return found;
+}
+
+
+static const char *upvalname (Proto *p, int uv) {
+  TString *nm = check_exp(uv < p->sizeupvalues, p->upvalues[uv].name);
+  /* an upvalue with no recorded name is reported as "?" */
+  return (nm != NULL) ? getstr(nm) : "?";
+}
+
+
+static const char *findvararg (CallInfo *ci, int n, StkId *pos) {
+  int nfixed = clLvalue(ci->func)->p->numparams;  /* fixed parameters of the function */
+  if (n < ci->u.l.base - ci->func - nfixed) {  /* within the extra arguments? */
+    *pos = ci->func + nfixed + n;
+    return "(*vararg)";  /* generic name for any vararg */
+  }
+  else
+    return NULL;  /* no such vararg */
+}
+
+
+static const char *findlocal (lua_State *L, CallInfo *ci, int n,
+                              StkId *pos) {
+  const char *name = NULL;
+  StkId base;
+  if (isLua(ci)) {
+    if (n < 0)  /* access to vararg values? */
+      return findvararg(ci, -n, pos);
+    else {
+      base = ci->u.l.base;
+      name = luaF_getlocalname(ci_func(ci)->p, n, currentpc(ci));  /* declared name of local 'n' at the current pc, if any */
+    }
+  }
+  else
+    base = ci->func + 1;  /* non-Lua frame: slots start right after the function */
+  if (name == NULL) {  /* no 'standard' name? */
+    StkId limit = (ci == L->ci) ? L->top : ci->next->func;  /* upper bound of this frame's slots */
+    if (limit - base >= n && n > 0)  /* is 'n' inside 'ci' stack? */
+      name = "(*temporary)";  /* generic name for any valid slot */
+    else
+      return NULL;  /* no name */
+  }
+  *pos = base + (n - 1);  /* 'n' is 1-based */
+  return name;
+}
+
+
+LUA_API const char *lua_getlocal (lua_State *L, const lua_Debug *ar, int n) {
+  const char *name;
+  lua_lock(L);
+  if (ar == NULL) {  /* information about non-active function? */
+    if (!isLfunction(L->top - 1))  /* not a Lua function? */
+      name = NULL;
+    else  /* consider live variables at function start (parameters) */
+      name = luaF_getlocalname(clLvalue(L->top - 1)->p, n, 0);  /* the function on top is only inspected, not popped here */
+  }
+  else {  /* active function; get information through 'ar' */
+    StkId pos = 0;  /* to avoid warnings */
+    name = findlocal(L, ar->i_ci, n, &pos);
+    if (name) {
+      setobj2s(L, L->top, pos);  /* push the local's value */
+      api_incr_top(L);
+    }
+  }
+  lua_unlock(L);
+  return name;  /* NULL when there is no such local */
+}
+
+
+LUA_API const char *lua_setlocal (lua_State *L, const lua_Debug *ar, int n) {  /* assign the value on top of the stack to local 'n' of frame 'ar' */
+  StkId pos = 0;  /* to avoid warnings */
+  const char *name;  /* name of the local, or NULL when there is none */
+  lua_lock(L);  /* take the lock before findlocal, which reads L->ci and L->top (as lua_getlocal does) */
+  name = findlocal(L, ar->i_ci, n, &pos);
+  if (name) setobjs2s(L, pos, L->top - 1);  /* store only when the local exists */
+  L->top--;  /* pop value */
+  lua_unlock(L);
+  return name;
+}
+
+
+static void funcinfo (lua_Debug *ar, Closure *cl) {  /* fill the source-related fields of 'ar' */
+  if (noLuaClosure(cl)) {  /* NULL or a C closure */
+    ar->source = "=[C]";
+    ar->linedefined = -1;
+    ar->lastlinedefined = -1;
+    ar->what = "C";
+  }
+  else {
+    Proto *p = cl->l.p;
+    ar->source = p->source ? getstr(p->source) : "=?";  /* "=?" when the prototype has no source */
+    ar->linedefined = p->linedefined;
+    ar->lastlinedefined = p->lastlinedefined;
+    ar->what = (ar->linedefined == 0) ? "main" : "Lua";  /* line 0 marks a main chunk */
+  }
+  luaO_chunkid(ar->short_src, ar->source, LUA_IDSIZE);  /* derive short_src from source */
+}
+
+
+static void collectvalidlines (lua_State *L, Closure *f) {  /* push a table of lines that have code, or nil */
+  if (noLuaClosure(f)) {  /* NULL or a C closure: nothing to report */
+    setnilvalue(L->top);
+    api_incr_top(L);
+  }
+  else {
+    int i;
+    TValue v;
+    int *lineinfo = f->l.p->lineinfo;
+    Table *t = luaH_new(L);  /* new table to store active lines */
+    sethvalue(L, L->top, t);  /* push it on stack */
+    api_incr_top(L);
+    setbvalue(&v, 1);  /* boolean 'true' to be the value of all indices */
+    for (i = 0; i < f->l.p->sizelineinfo; i++)  /* for all lines with code */
+      luaH_setint(L, t, lineinfo[i], &v);  /* table[line] = true */
+  }
+}
+
+
+static int auxgetinfo (lua_State *L, const char *what, lua_Debug *ar,
+                       Closure *f, CallInfo *ci) {
+  int status = 1;  /* becomes 0 if any option letter is unknown */
+  for (; *what; what++) {  /* one option letter at a time */
+    switch (*what) {
+      case 'S': {  /* source information */
+        funcinfo(ar, f);
+        break;
+      }
+      case 'l': {  /* current line; -1 without a Lua frame */
+        ar->currentline = (ci && isLua(ci)) ? currentline(ci) : -1;
+        break;
+      }
+      case 'u': {  /* upvalue and parameter counts */
+        ar->nups = (f == NULL) ? 0 : f->c.nupvalues;
+        if (noLuaClosure(f)) {  /* not a Lua closure: reported as vararg with no fixed parameters */
+          ar->isvararg = 1;
+          ar->nparams = 0;
+        }
+        else {
+          ar->isvararg = f->l.p->is_vararg;
+          ar->nparams = f->l.p->numparams;
+        }
+        break;
+      }
+      case 't': {  /* tail-call flag; 0 without a frame */
+        ar->istailcall = (ci) ? ci->callstatus & CIST_TAIL : 0;
+        break;
+      }
+      case 'n': {  /* function name, deduced from the calling instruction */
+        /* calling function is a known Lua function? */
+        if (ci && !(ci->callstatus & CIST_TAIL) && isLua(ci->previous))
+          ar->namewhat = getfuncname(L, ci->previous, &ar->name);
+        else
+          ar->namewhat = NULL;
+        if (ar->namewhat == NULL) {
+          ar->namewhat = "";  /* not found */
+          ar->name = NULL;
+        }
+        break;
+      }
+      case 'L':
+      case 'f':  /* handled by lua_getinfo */
+        break;
+      default: status = 0;  /* invalid option */
+    }
+  }
+  return status;
+}
+
+
+LUA_API int lua_getinfo (lua_State *L, const char *what, lua_Debug *ar) {
+  int status;
+  Closure *cl;
+  CallInfo *ci;
+  StkId func;
+  lua_lock(L);
+  if (*what == '>') {  /* function supplied on the stack instead of through 'ar' */
+    ci = NULL;  /* no call frame in this mode */
+    func = L->top - 1;
+    api_check(L, ttisfunction(func), "function expected");
+    what++;  /* skip the '>' */
+    L->top--;  /* pop function */
+  }
+  else {
+    ci = ar->i_ci;  /* frame recorded earlier in 'ar' */
+    func = ci->func;
+    lua_assert(ttisfunction(ci->func));
+  }
+  cl = ttisclosure(func) ? clvalue(func) : NULL;  /* NULL when the function is not a closure */
+  status = auxgetinfo(L, what, ar, cl, ci);
+  if (strchr(what, 'f')) {  /* 'f': push the function itself */
+    setobjs2s(L, L->top, func);
+    api_incr_top(L);
+  }
+  if (strchr(what, 'L'))  /* 'L': push the table of valid lines (after any 'f' value) */
+    collectvalidlines(L, cl);
+  lua_unlock(L);
+  return status;  /* 0 if 'what' contained an invalid option */
+}
+
+
+/*
+** {======================================================
+** Symbolic Execution
+** =======================================================
+*/
+
+static const char *getobjname (Proto *p, int lastpc, int reg,
+                               const char **name);
+
+
+/*
+** find a "name" for the RK value 'c'; '*name' is always set ("?" as fallback)
+*/
+static void kname (Proto *p, int pc, int c, const char **name) {
+  if (ISK(c)) {  /* is 'c' a constant? */
+    TValue *kvalue = &p->k[INDEXK(c)];
+    if (ttisstring(kvalue)) {  /* literal constant? */
+      *name = svalue(kvalue);  /* it is its own name */
+      return;
+    }
+    /* else no reasonable name found */
+  }
+  else {  /* 'c' is a register */
+    const char *what = getobjname(p, pc, c, name); /* search for 'c' */
+    if (what && *what == 'c') {  /* found a constant name? */
+      return;  /* 'name' already filled */
+    }
+    /* else no reasonable name found */
+  }
+  *name = "?";  /* no reasonable name found */
+}
+
+
+/*
+** try to find last instruction before 'lastpc' that modified register 'reg'
+** (returns its index, or -1 when no such instruction is found)
+*/
+static int findsetreg (Proto *p, int lastpc, int reg) {
+  int pc;
+  int setreg = -1;  /* keep last instruction that changed 'reg' */
+  for (pc = 0; pc < lastpc; pc++) {  /* scan the code from the start up to 'lastpc' */
+    Instruction i = p->code[pc];
+    OpCode op = GET_OPCODE(i);
+    int a = GETARG_A(i);
+    switch (op) {
+      case OP_LOADNIL: {
+        int b = GETARG_B(i);
+        if (a <= reg && reg <= a + b)  /* set registers from 'a' to 'a+b' */
+          setreg = pc;
+        break;
+      }
+      case OP_TFORCALL: {
+        if (reg >= a + 2) setreg = pc;  /* affect all regs above its base */
+        break;
+      }
+      case OP_CALL:
+      case OP_TAILCALL: {
+        if (reg >= a) setreg = pc;  /* affect all registers above base */
+        break;
+      }
+      case OP_JMP: {
+        int b = GETARG_sBx(i);
+        int dest = pc + 1 + b;
+        /* jump is forward and do not skip `lastpc'? */
+        if (pc < dest && dest <= lastpc)
+          pc += b;  /* do the jump */
+        break;
+      }
+      case OP_TEST: {
+        if (reg == a) setreg = pc;  /* jumped code can change 'a' */
+        break;
+      }
+      default:
+        if (testAMode(op) && reg == a)  /* any instruction that set A */
+          setreg = pc;
+        break;
+    }
+  }
+  return setreg;
+}
+
+
+static const char *getobjname (Proto *p, int lastpc, int reg,
+                               const char **name) {  /* returns the kind of name found for 'reg', or NULL */
+  int pc;
+  *name = luaF_getlocalname(p, reg + 1, lastpc);  /* local-variable numbers are 1-based */
+  if (*name)  /* is a local? */
+    return "local";
+  /* else try symbolic execution */
+  pc = findsetreg(p, lastpc, reg);
+  if (pc != -1) {  /* could find instruction? */
+    Instruction i = p->code[pc];
+    OpCode op = GET_OPCODE(i);
+    switch (op) {
+      case OP_MOVE: {
+        int b = GETARG_B(i);  /* move from 'b' to 'a' */
+        if (b < GETARG_A(i))
+          return getobjname(p, pc, b, name);  /* get name for 'b' */
+        break;
+      }
+      case OP_GETTABUP:
+      case OP_GETTABLE: {
+        int k = GETARG_C(i);  /* key index */
+        int t = GETARG_B(i);  /* table index */
+        const char *vn = (op == OP_GETTABLE)  /* name of indexed variable */
+                         ? luaF_getlocalname(p, t + 1, pc)
+                         : upvalname(p, t);
+        kname(p, pc, k, name);  /* the key supplies the reported name */
+        return (vn && strcmp(vn, LUA_ENV) == 0) ? "global" : "field";  /* indexing the LUA_ENV variable counts as a global access */
+      }
+      case OP_GETUPVAL: {
+        *name = upvalname(p, GETARG_B(i));
+        return "upvalue";
+      }
+      case OP_LOADK:
+      case OP_LOADKX: {
+        int b = (op == OP_LOADK) ? GETARG_Bx(i)
+                                 : GETARG_Ax(p->code[pc + 1]);  /* LOADKX takes its index from the following instruction */
+        if (ttisstring(&p->k[b])) {
+          *name = svalue(&p->k[b]);
+          return "constant";
+        }
+        break;
+      }
+      case OP_SELF: {
+        int k = GETARG_C(i);  /* key index */
+        kname(p, pc, k, name);
+        return "method";
+      }
+      default: break;  /* go through to return NULL */
+    }
+  }
+  return NULL;  /* could not find reasonable name */
+}
+
+
+static const char *getfuncname (lua_State *L, CallInfo *ci, const char **name) {  /* name the function called by frame 'ci' */
+  TMS tm;
+  Proto *p = ci_func(ci)->p;  /* calling function */
+  int pc = currentpc(ci);  /* calling instruction index */
+  Instruction i = p->code[pc];  /* calling instruction */
+  switch (GET_OPCODE(i)) {
+    case OP_CALL:
+    case OP_TAILCALL:  /* get function name */
+      return getobjname(p, pc, GETARG_A(i), name);
+    case OP_TFORCALL: {  /* for iterator */
+      *name = "for iterator";
+       return "for iterator";
+    }
+    /* all other instructions can call only through metamethods */
+    case OP_SELF:
+    case OP_GETTABUP:
+    case OP_GETTABLE: tm = TM_INDEX; break;
+    case OP_SETTABUP:
+    case OP_SETTABLE: tm = TM_NEWINDEX; break;
+    case OP_EQ: tm = TM_EQ; break;
+    case OP_ADD: tm = TM_ADD; break;
+    case OP_SUB: tm = TM_SUB; break;
+    case OP_MUL: tm = TM_MUL; break;
+    case OP_DIV: tm = TM_DIV; break;
+    case OP_MOD: tm = TM_MOD; break;
+    case OP_POW: tm = TM_POW; break;
+    case OP_UNM: tm = TM_UNM; break;
+    case OP_LEN: tm = TM_LEN; break;
+    case OP_LT: tm = TM_LT; break;
+    case OP_LE: tm = TM_LE; break;
+    case OP_CONCAT: tm = TM_CONCAT; break;
+    default:
+      return NULL;  /* else no useful name can be found */
+  }
+  *name = getstr(G(L)->tmname[tm]);  /* the metamethod's own name */
+  return "metamethod";
+}
+
+/* }====================================================== */
+
+
+
+/*
+** only ANSI way to check whether a pointer points to an array
+** (used only for error messages, so efficiency is not a big concern)
+*/
+static int isinstack (CallInfo *ci, const TValue *o) {
+  StkId slot = ci->u.l.base;  /* first slot of the frame */
+  while (slot < ci->top)  /* compare against each slot up to the frame's top */
+    if (o == slot++) return 1;
+  return 0;
+}
+
+
+static const char *getupvalname (CallInfo *ci, const TValue *o,
+                                 const char **name) {
+  LClosure *cl = ci_func(ci);
+  int idx = 0;
+  while (idx < cl->nupvalues && cl->upvals[idx]->v != o)
+    idx++;  /* advance to the first upvalue whose value is 'o' */
+  if (idx < cl->nupvalues) {  /* found one before running off the end? */
+    *name = upvalname(cl->p, idx);
+    return "upvalue";
+  }
+  return NULL;  /* 'o' is not an upvalue of this closure */
+}
+
+
+l_noret luaG_typeerror (lua_State *L, const TValue *o, const char *op) {  /* raise "attempt to <op> ..." for value 'o' */
+  CallInfo *ci = L->ci;
+  const char *name = NULL;
+  const char *t = objtypename(o);
+  const char *kind = NULL;  /* stays NULL when no variable description is found */
+  if (isLua(ci)) {
+    kind = getupvalname(ci, o, &name);  /* check whether 'o' is an upvalue */
+    if (!kind && isinstack(ci, o))  /* no? try a register */
+      kind = getobjname(ci_func(ci)->p, currentpc(ci),
+                        cast_int(o - ci->u.l.base), &name);  /* register number = offset from the frame base */
+  }
+  if (kind)
+    luaG_runerror(L, "attempt to %s %s " LUA_QS " (a %s value)",
+                op, kind, name, t);
+  else
+    luaG_runerror(L, "attempt to %s a %s value", op, t);
+}
+
+
+l_noret luaG_concaterror (lua_State *L, StkId p1, StkId p2) {
+  if (ttisstring(p1) || ttisnumber(p1)) p1 = p2;
+  lua_assert(!ttisstring(p1) && !ttisnumber(p2));
+  luaG_typeerror(L, p1, "concatenate");
+}
+
+
+l_noret luaG_aritherror (lua_State *L, const TValue *p1, const TValue *p2) {  /* raise a type error for failed arithmetic */
+  TValue temp;  /* scratch slot for the conversion attempt */
+  if (luaV_tonumber(p1, &temp) == NULL)
+    p2 = p1;  /* first operand is wrong */
+  luaG_typeerror(L, p2, "perform arithmetic on");  /* otherwise the second operand is blamed */
+}
+
+
+l_noret luaG_ordererror (lua_State *L, const TValue *p1, const TValue *p2) {
+  const char *lhs = objtypename(p1);
+  const char *rhs = objtypename(p2);
+  if (lhs != rhs)  /* pointer comparison of the two type-name strings */
+    luaG_runerror(L, "attempt to compare %s with %s", lhs, rhs);
+  else
+    luaG_runerror(L, "attempt to compare two %s values", lhs);
+}
+
+
+static void addinfo (lua_State *L, const char *msg) {  /* push "chunk:line: msg" when the current frame is Lua code */
+  CallInfo *ci = L->ci;
+  if (isLua(ci)) {  /* is Lua code? */
+    char buff[LUA_IDSIZE];  /* add file:line information */
+    int line = currentline(ci);
+    TString *src = ci_func(ci)->p->source;
+    if (src)
+      luaO_chunkid(buff, getstr(src), LUA_IDSIZE);
+    else {  /* no source available; use "?" instead */
+      buff[0] = '?'; buff[1] = '\0';
+    }
+    luaO_pushfstring(L, "%s:%d: %s", buff, line, msg);
+  }  /* otherwise nothing is pushed here */
+}
+
+
+l_noret luaG_errormsg (lua_State *L) {  /* raise a runtime error with the value on top of the stack */
+  if (L->errfunc != 0) {  /* is there an error handling function? */
+    StkId errfunc = restorestack(L, L->errfunc);
+    if (!ttisfunction(errfunc)) luaD_throw(L, LUA_ERRERR);  /* handler is not callable */
+    setobjs2s(L, L->top, L->top - 1);  /* move argument */
+    setobjs2s(L, L->top - 1, errfunc);  /* push function */
+    L->top++;  /* stack top is now: handler, error value */
+    luaD_call(L, L->top - 2, 1, 0);  /* call it */
+  }
+  luaD_throw(L, LUA_ERRRUN);
+}
+
+
+l_noret luaG_runerror (lua_State *L, const char *fmt, ...) {  /* format a message, add position info, raise it */
+  va_list argp;
+  va_start(argp, fmt);
+  addinfo(L, luaO_pushvfstring(L, fmt, argp));  /* formatted message is pushed, then passed to addinfo */
+  va_end(argp);
+  luaG_errormsg(L);
+}
+
diff --git a/ext/lua/src/ldo.c b/ext/lua/src/ldo.c
new file mode 100644
index 0000000..aafa3dc
--- /dev/null
+++ b/ext/lua/src/ldo.c
@@ -0,0 +1,673 @@
+/*
+** $Id: ldo.c,v 2.108 2012/10/01 14:05:04 roberto Exp $
+** Stack and Call structure of Lua
+** See Copyright Notice in lua.h
+*/
+
+
+#include <setjmp.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define ldo_c
+#define LUA_CORE
+
+#include "lua.h"
+
+#include "lapi.h"
+#include "ldebug.h"
+#include "ldo.h"
+#include "lfunc.h"
+#include "lgc.h"
+#include "lmem.h"
+#include "lobject.h"
+#include "lopcodes.h"
+#include "lparser.h"
+#include "lstate.h"
+#include "lstring.h"
+#include "ltable.h"
+#include "ltm.h"
+#include "lundump.h"
+#include "lvm.h"
+#include "lzio.h"
+
+
+
+
+/*
+** {======================================================
+** Error-recovery functions
+** =======================================================
+*/
+
+/*
+** LUAI_THROW/LUAI_TRY define how Lua does exception handling. By
+** default, Lua handles errors with exceptions when compiling as
+** C++ code, with _longjmp/_setjmp when asked to use them, and with
+** longjmp/setjmp otherwise.
+*/
+#if !defined(LUAI_THROW)
+
+#if defined(__cplusplus) && !defined(LUA_USE_LONGJMP)
+/* C++ exceptions */
+#define LUAI_THROW(L,c)		throw(c)
+#define LUAI_TRY(L,c,a) \
+	try { a } catch(...) { if ((c)->status == 0) (c)->status = -1; }
+#define luai_jmpbuf		int  /* dummy variable */
+
+#elif defined(LUA_USE_ULONGJMP)
+/* in Unix, try _longjmp/_setjmp (more efficient) */
+#define LUAI_THROW(L,c)		_longjmp((c)->b, 1)
+#define LUAI_TRY(L,c,a)		if (_setjmp((c)->b) == 0) { a }
+#define luai_jmpbuf		jmp_buf
+
+#else
+/* default handling with long jumps */
+#define LUAI_THROW(L,c)		longjmp((c)->b, 1)
+#define LUAI_TRY(L,c,a)		if (setjmp((c)->b) == 0) { a }
+#define luai_jmpbuf		jmp_buf
+
+#endif
+
+#endif
+
+
+
+/* chain list of long jump buffers */
+struct lua_longjmp {
+  struct lua_longjmp *previous;  /* handler that was active before this one */
+  luai_jmpbuf b;  /* buffer used by LUAI_TRY/LUAI_THROW (a dummy int in C++) */
+  volatile int status;  /* error code */
+};
+
+/* Put the error value for 'errcode' at 'oldtop'; top becomes oldtop + 1 */
+static void seterrorobj (lua_State *L, int errcode, StkId oldtop) {
+  switch (errcode) {
+    case LUA_ERRMEM: {  /* memory error? */
+      setsvalue2s(L, oldtop, G(L)->memerrmsg); /* reuse preregistered msg. */
+      break;
+    }
+    case LUA_ERRERR: {  /* this status always gets the same fixed message */
+      setsvalue2s(L, oldtop, luaS_newliteral(L, "error in error handling"));
+      break;
+    }
+    default: {
+      setobjs2s(L, oldtop, L->top - 1);  /* error message on current top */
+      break;
+    }
+  }
+  L->top = oldtop + 1;  /* only the error value remains above 'oldtop' */
+}
+
+/* Raise 'errcode' in L's handler, else in the main thread's, else abort */
+l_noret luaD_throw (lua_State *L, int errcode) {
+  if (L->errorJmp) {  /* thread has an error handler? */
+    L->errorJmp->status = errcode;  /* set status */
+    LUAI_THROW(L, L->errorJmp);  /* jump to it */
+  }
+  else {  /* thread has no error handler */
+    L->status = cast_byte(errcode);  /* mark it as dead */
+    if (G(L)->mainthread->errorJmp) {  /* main thread has a handler? */
+      setobjs2s(L, G(L)->mainthread->top++, L->top - 1);  /* copy error obj. */
+      luaD_throw(G(L)->mainthread, errcode);  /* re-throw in main thread */
+    }
+    else {  /* no handler at all; abort */
+      if (G(L)->panic) {  /* panic function? */
+        lua_unlock(L);
+        G(L)->panic(L);  /* call it (last chance to jump out) */
+      }
+      abort();  /* reached when there is no panic function or it returned */
+    }
+  }
+}
+
+/* Run (*f)(L, ud) under a new error handler; returns the resulting status */
+int luaD_rawrunprotected (lua_State *L, Pfunc f, void *ud) {
+  unsigned short oldnCcalls = L->nCcalls;
+  struct lua_longjmp lj;
+  lj.status = LUA_OK;  /* stays LUA_OK unless a throw overwrites it */
+  lj.previous = L->errorJmp;  /* chain new error handler */
+  L->errorJmp = &lj;
+  LUAI_TRY(L, &lj,
+    (*f)(L, ud);
+  );
+  L->errorJmp = lj.previous;  /* restore old error handler */
+  L->nCcalls = oldnCcalls;  /* back to the count saved on entry */
+  return lj.status;
+}
+
+/* }====================================================== */
+
+/* Rebase all pointers into the stack after it moved away from 'oldstack' */
+static void correctstack (lua_State *L, TValue *oldstack) {
+  CallInfo *ci;
+  GCObject *up;
+  L->top = (L->top - oldstack) + L->stack;
+  for (up = L->openupval; up != NULL; up = up->gch.next)  /* open upvalues */
+    gco2uv(up)->v = (gco2uv(up)->v - oldstack) + L->stack;
+  for (ci = L->ci; ci != NULL; ci = ci->previous) {  /* every call frame */
+    ci->top = (ci->top - oldstack) + L->stack;
+    ci->func = (ci->func - oldstack) + L->stack;
+    if (isLua(ci))
+      ci->u.l.base = (ci->u.l.base - oldstack) + L->stack;
+  }
+}
+
+
+/* some space for error handling */
+#define ERRORSTACKSIZE	(LUAI_MAXSTACK + 200)
+
+/* Resize the stack to 'newsize' slots, nil any new part, rebase pointers */
+void luaD_reallocstack (lua_State *L, int newsize) {
+  TValue *oldstack = L->stack;  /* kept to compute offsets in correctstack */
+  int lim = L->stacksize;  /* first slot that may be new after the resize */
+  lua_assert(newsize <= LUAI_MAXSTACK || newsize == ERRORSTACKSIZE);
+  lua_assert(L->stack_last - L->stack == L->stacksize - EXTRA_STACK);
+  luaM_reallocvector(L, L->stack, L->stacksize, newsize, TValue);
+  for (; lim < newsize; lim++)
+    setnilvalue(L->stack + lim); /* erase new segment */
+  L->stacksize = newsize;
+  L->stack_last = L->stack + newsize - EXTRA_STACK;
+  correctstack(L, oldstack);
+}
+
+/* Grow the stack to fit 'n' more slots, or raise "stack overflow" */
+void luaD_growstack (lua_State *L, int n) {
+  int size = L->stacksize;
+  if (size > LUAI_MAXSTACK)  /* error after extra size? */
+    luaD_throw(L, LUA_ERRERR);
+  else {
+    int needed = cast_int(L->top - L->stack) + n + EXTRA_STACK;
+    int newsize = 2 * size;  /* first choice: double the current size */
+    if (newsize > LUAI_MAXSTACK) newsize = LUAI_MAXSTACK;
+    if (newsize < needed) newsize = needed;
+    if (newsize > LUAI_MAXSTACK) {  /* stack overflow? */
+      luaD_reallocstack(L, ERRORSTACKSIZE);  /* extra room to handle error */
+      luaG_runerror(L, "stack overflow");
+    }
+    else
+      luaD_reallocstack(L, newsize);
+  }
+}
+
+/* Count the stack slots in use: up to the highest 'top' of L or any frame */
+static int stackinuse (lua_State *L) {
+  StkId highest = L->top;
+  CallInfo *frame;
+  for (frame = L->ci; frame != NULL; frame = frame->previous) {
+    lua_assert(frame->top <= L->stack_last);
+    highest = (frame->top > highest) ? frame->top : highest;
+  }
+  return 1 + cast_int(highest - L->stack);  /* part of stack in use */
+}
+
+/* Shrink the stack to a size derived from the part currently in use */
+void luaD_shrinkstack (lua_State *L) {
+  int inuse = stackinuse(L);
+  int goodsize = inuse + (inuse / 8) + 2*EXTRA_STACK;  /* in use + slack */
+  if (goodsize > LUAI_MAXSTACK) goodsize = LUAI_MAXSTACK;
+  if (inuse > LUAI_MAXSTACK ||  /* handling stack overflow? */
+      goodsize >= L->stacksize)  /* would grow instead of shrink? */
+    condmovestack(L);  /* don't change stack (change only for debugging) */
+  else
+    luaD_reallocstack(L, goodsize);  /* shrink it */
+}
+
+/* Call the user hook for 'event'/'line', preserving 'top' and 'ci->top' */
+void luaD_hook (lua_State *L, int event, int line) {
+  lua_Hook hook = L->hook;
+  if (hook && L->allowhook) {
+    CallInfo *ci = L->ci;
+    ptrdiff_t top = savestack(L, L->top);  /* saved through savestack, */
+    ptrdiff_t ci_top = savestack(L, ci->top);  /* restored after the hook */
+    lua_Debug ar;
+    ar.event = event;
+    ar.currentline = line;
+    ar.i_ci = ci;
+    luaD_checkstack(L, LUA_MINSTACK);  /* ensure minimum stack size */
+    ci->top = L->top + LUA_MINSTACK;
+    lua_assert(ci->top <= L->stack_last);
+    L->allowhook = 0;  /* cannot call hooks inside a hook */
+    ci->callstatus |= CIST_HOOKED;
+    lua_unlock(L);
+    (*hook)(L, &ar);
+    lua_lock(L);
+    lua_assert(!L->allowhook);
+    L->allowhook = 1;
+    ci->top = restorestack(L, ci_top);
+    L->top = restorestack(L, top);
+    ci->callstatus &= ~CIST_HOOKED;
+  }
+}
+
+/* Run the call hook for Lua frame 'ci', flagging tail calls as such */
+static void callhook (lua_State *L, CallInfo *ci) {
+  int hook = LUA_HOOKCALL;
+  ci->u.l.savedpc++;  /* hooks assume 'pc' is already incremented */
+  if (isLua(ci->previous) &&
+      GET_OPCODE(*(ci->previous->u.l.savedpc - 1)) == OP_TAILCALL) {
+    ci->callstatus |= CIST_TAIL;  /* caller's last instruction: OP_TAILCALL */
+    hook = LUA_HOOKTAILCALL;
+  }
+  luaD_hook(L, hook, -1);
+  ci->u.l.savedpc--;  /* correct 'pc' */
+}
+
+/* Copy the fixed parameters to the stack top; return where the copies start */
+static StkId adjust_varargs (lua_State *L, Proto *p, int actual) {
+  int i;
+  int nfixargs = p->numparams;
+  StkId base, fixed;
+  lua_assert(actual >= nfixargs);
+  /* move fixed parameters to final position */
+  fixed = L->top - actual;  /* first fixed argument */
+  base = L->top;  /* final position of first argument */
+  for (i=0; i<nfixargs; i++) {
+    setobjs2s(L, L->top++, fixed + i);
+    setnilvalue(fixed + i);  /* the original slot is cleared after the copy */
+  }
+  return base;
+}
+
+/* Make func's TM_CALL tag method the callee and 'func' its first argument */
+static StkId tryfuncTM (lua_State *L, StkId func) {
+  const TValue *tm = luaT_gettmbyobj(L, func, TM_CALL);
+  StkId p;
+  ptrdiff_t funcr = savestack(L, func);
+  if (!ttisfunction(tm))
+    luaG_typeerror(L, func, "call");
+  /* Open a hole inside the stack at `func' */
+  for (p = L->top; p > func; p--) setobjs2s(L, p, p-1);
+  incr_top(L);
+  func = restorestack(L, funcr);  /* previous call may change stack */
+  setobj2s(L, func, tm);  /* tag method is the new function to be called */
+  return func;
+}
+
+
+
+#define next_ci(L) (L->ci = (L->ci->next ? L->ci->next : luaE_extendCI(L)))
+
+
+/*
+** Start a call to 'func': returns true iff it was a C function (already run)
+*/
+int luaD_precall (lua_State *L, StkId func, int nresults) {
+  lua_CFunction f;
+  CallInfo *ci;
+  int n;  /* number of arguments (Lua) or returns (C) */
+  ptrdiff_t funcr = savestack(L, func);
+  switch (ttype(func)) {
+    case LUA_TLCF:  /* light C function */
+      f = fvalue(func);
+      goto Cfunc;
+    case LUA_TCCL: {  /* C closure */
+      f = clCvalue(func)->f;
+     Cfunc:
+      luaD_checkstack(L, LUA_MINSTACK);  /* ensure minimum stack size */
+      ci = next_ci(L);  /* now 'enter' new function */
+      ci->nresults = nresults;
+      ci->func = restorestack(L, funcr);
+      ci->top = L->top + LUA_MINSTACK;
+      lua_assert(ci->top <= L->stack_last);
+      ci->callstatus = 0;
+      luaC_checkGC(L);  /* stack grow uses memory */
+      if (L->hookmask & LUA_MASKCALL)
+        luaD_hook(L, LUA_HOOKCALL, -1);
+      lua_unlock(L);
+      n = (*f)(L);  /* do the actual call */
+      lua_lock(L);
+      api_checknelems(L, n);
+      luaD_poscall(L, L->top - n);
+      return 1;
+    }
+    case LUA_TLCL: {  /* Lua function: prepare its call */
+      StkId base;  /* a vararg call puts 'base' above the padded arguments, */
+      Proto *p = clLvalue(func)->p;  /* so reserve 'numparams' extra slots */
+      luaD_checkstack(L, p->maxstacksize + p->numparams);
+      func = restorestack(L, funcr);
+      n = cast_int(L->top - func) - 1;  /* number of real arguments */
+      for (; n < p->numparams; n++)
+        setnilvalue(L->top++);  /* complete missing arguments */
+      base = (!p->is_vararg) ? func + 1 : adjust_varargs(L, p, n);
+      ci = next_ci(L);  /* now 'enter' new function */
+      ci->nresults = nresults;
+      ci->func = func;
+      ci->u.l.base = base;
+      ci->top = base + p->maxstacksize;
+      lua_assert(ci->top <= L->stack_last);
+      ci->u.l.savedpc = p->code;  /* starting point */
+      ci->callstatus = CIST_LUA;
+      L->top = ci->top;
+      luaC_checkGC(L);  /* stack grow uses memory */
+      if (L->hookmask & LUA_MASKCALL)
+        callhook(L, ci);
+      return 0;
+    }
+    default: {  /* not a function */
+      func = tryfuncTM(L, func);  /* retry with 'function' tag method */
+      return luaD_precall(L, func, nresults);  /* now it must be a function */
+    }
+  }
+}
+
+/* Finish a call: move its results down to 'func' and return to the caller */
+int luaD_poscall (lua_State *L, StkId firstResult) {
+  StkId res;
+  int wanted, i;
+  CallInfo *ci = L->ci;
+  if (L->hookmask & (LUA_MASKRET | LUA_MASKLINE)) {
+    if (L->hookmask & LUA_MASKRET) {
+      ptrdiff_t fr = savestack(L, firstResult);  /* hook may change stack */
+      luaD_hook(L, LUA_HOOKRET, -1);
+      firstResult = restorestack(L, fr);
+    }
+    L->oldpc = ci->previous->u.l.savedpc;  /* 'oldpc' for caller function */
+  }
+  res = ci->func;  /* res == final position of 1st result */
+  wanted = ci->nresults;
+  L->ci = ci = ci->previous;  /* back to caller */
+  /* move results to correct place */
+  for (i = wanted; i != 0 && firstResult < L->top; i--)
+    setobjs2s(L, res++, firstResult++);
+  while (i-- > 0)  /* fewer results than wanted: fill the rest with nil */
+    setnilvalue(res++);
+  L->top = res;
+  return (wanted - LUA_MULTRET);  /* 0 iff wanted == LUA_MULTRET */
+}
+
+
+/*
+** Call a function (C or Lua). The function to be called is at *func and
+** its arguments are on the stack, right after it. On return, the results
+** are on the stack, starting at the original function position. Yields
+** inside the call are disallowed (via 'nny') unless 'allowyield' is set.
+*/
+void luaD_call (lua_State *L, StkId func, int nResults, int allowyield) {
+  if (++L->nCcalls >= LUAI_MAXCCALLS) {
+    if (L->nCcalls == LUAI_MAXCCALLS)
+      luaG_runerror(L, "C stack overflow");
+    else if (L->nCcalls >= (LUAI_MAXCCALLS + (LUAI_MAXCCALLS>>3)))
+      luaD_throw(L, LUA_ERRERR);  /* error while handling stack error */
+  }
+  if (!allowyield) L->nny++;
+  if (!luaD_precall(L, func, nResults))  /* is a Lua function? */
+    luaV_execute(L);  /* call it */
+  if (!allowyield) L->nny--;
+  L->nCcalls--;
+}
+
+/* Complete the C call at L->ci by running its continuation function */
+static void finishCcall (lua_State *L) {
+  CallInfo *ci = L->ci;
+  int n;  /* number of values returned by the continuation */
+  lua_assert(ci->u.c.k != NULL);  /* must have a continuation */
+  lua_assert(L->nny == 0);
+  if (ci->callstatus & CIST_YPCALL) {  /* was inside a pcall? */
+    ci->callstatus &= ~CIST_YPCALL;  /* finish 'lua_pcall' */
+    L->errfunc = ci->u.c.old_errfunc;
+  }
+  /* finish 'lua_callk'/'lua_pcall' */
+  adjustresults(L, ci->nresults);
+  /* call continuation function */
+  if (!(ci->callstatus & CIST_STAT))  /* no call status? */
+    ci->u.c.status = LUA_YIELD;  /* 'default' status */
+  lua_assert(ci->u.c.status != LUA_OK);
+  ci->callstatus = (ci->callstatus & ~(CIST_YPCALL | CIST_STAT)) | CIST_YIELDED;
+  lua_unlock(L);
+  n = (*ci->u.c.k)(L);
+  lua_lock(L);
+  api_checknelems(L, n);
+  /* finish 'luaD_precall' */
+  luaD_poscall(L, L->top - n);
+}
+
+/* Keep running the pending frames of L until only the base frame is left */
+static void unroll (lua_State *L, void *ud) {
+  UNUSED(ud);  /* parameter exists only to match the Pfunc call signature */
+  for (;;) {
+    if (L->ci == &L->base_ci)  /* stack is empty? */
+      return;  /* coroutine finished normally */
+    if (!isLua(L->ci))  /* C function? */
+      finishCcall(L);
+    else {  /* Lua function */
+      luaV_finishOp(L);  /* finish interrupted instruction */
+      luaV_execute(L);  /* execute down to higher C 'boundary' */
+    }
+  }
+}
+
+
+/*
+** Check whether thread has a suspended protected call: walk the frames
+** from the current one toward the base and return the first one flagged
+** CIST_YPCALL, or NULL when there is none
+*/
+static CallInfo *findpcall (lua_State *L) {
+  CallInfo *frame = L->ci;
+  while (frame != NULL && !(frame->callstatus & CIST_YPCALL))
+    frame = frame->previous;  /* not a pcall frame; keep searching */
+  return frame;  /* NULL: no pending pcall */
+}
+
+/* Unwind L to the frame given by findpcall; returns 0 if there is none */
+static int recover (lua_State *L, int status) {
+  StkId oldtop;
+  CallInfo *ci = findpcall(L);
+  if (ci == NULL) return 0;  /* no recovery point */
+  /* "finish" luaD_pcall */
+  oldtop = restorestack(L, ci->extra);
+  luaF_close(L, oldtop);
+  seterrorobj(L, status, oldtop);
+  L->ci = ci;
+  L->allowhook = ci->u.c.old_allowhook;
+  L->nny = 0;  /* should be zero to be yieldable */
+  luaD_shrinkstack(L);
+  L->errfunc = ci->u.c.old_errfunc;
+  ci->callstatus |= CIST_STAT;  /* call has error status */
+  ci->u.c.status = status;  /* (here it is) */
+  return 1;  /* continue running the coroutine */
+}
+
+
+/*
+** signal an error in the call to 'resume', not in the execution of the
+** coroutine itself. (Such errors should not be handled by any coroutine
+** error handler and should not kill the coroutine.)
+*/
+static l_noret resume_error (lua_State *L, const char *msg, StkId firstArg) {
+  L->top = firstArg;  /* remove args from the stack */
+  setsvalue2s(L, L->top, luaS_new(L, msg));  /* push error message */
+  api_incr_top(L);
+  luaD_throw(L, -1);  /* jump back to 'lua_resume', which tests for -1 */
+}
+
+
+/*
+** do the work for 'lua_resume' in protected mode
+*/
+static void resume (lua_State *L, void *ud) {
+  int nCcalls = L->nCcalls;
+  StkId firstArg = cast(StkId, ud);  /* 'ud' carries the first argument */
+  CallInfo *ci = L->ci;
+  if (nCcalls >= LUAI_MAXCCALLS)
+    resume_error(L, "C stack overflow", firstArg);
+  if (L->status == LUA_OK) {  /* may be starting a coroutine */
+    if (ci != &L->base_ci)  /* not in base level? */
+      resume_error(L, "cannot resume non-suspended coroutine", firstArg);
+    /* coroutine is in base level; start running it */
+    if (!luaD_precall(L, firstArg - 1, LUA_MULTRET))  /* Lua function? */
+      luaV_execute(L);  /* call it */
+  }
+  else if (L->status != LUA_YIELD)
+    resume_error(L, "cannot resume dead coroutine", firstArg);
+  else {  /* resuming from previous yield */
+    L->status = LUA_OK;
+    ci->func = restorestack(L, ci->extra);
+    if (isLua(ci))  /* yielded inside a hook? */
+      luaV_execute(L);  /* just continue running Lua code */
+    else {  /* 'common' yield */
+      if (ci->u.c.k != NULL) {  /* does it have a continuation? */
+        int n;
+        ci->u.c.status = LUA_YIELD;  /* 'default' status */
+        ci->callstatus |= CIST_YIELDED;
+        lua_unlock(L);
+        n = (*ci->u.c.k)(L);  /* call continuation */
+        lua_lock(L);
+        api_checknelems(L, n);
+        firstArg = L->top - n;  /* yield results come from continuation */
+      }
+      luaD_poscall(L, firstArg);  /* finish 'luaD_precall' */
+    }
+    unroll(L, NULL);  /* then run whatever frames are still pending */
+  }
+  lua_assert(nCcalls == L->nCcalls);
+}
+
+/* Start or resume coroutine L with 'nargs' args; returns its final status */
+LUA_API int lua_resume (lua_State *L, lua_State *from, int nargs) {
+  int status, oldnny = L->nny;  /* 'nny' is saved here and restored on exit */
+  lua_lock(L);
+  luai_userstateresume(L, nargs);
+  L->nCcalls = (from) ? from->nCcalls + 1 : 1;
+  L->nny = 0;  /* allow yields */
+  api_checknelems(L, (L->status == LUA_OK) ? nargs + 1 : nargs);
+  status = luaD_rawrunprotected(L, resume, L->top - nargs);
+  if (status == -1)  /* error calling 'lua_resume'? */
+    status = LUA_ERRRUN;
+  else {  /* yield or regular error */
+    while (status != LUA_OK && status != LUA_YIELD) {  /* error? */
+      if (recover(L, status))  /* recover point? */
+        status = luaD_rawrunprotected(L, unroll, NULL);  /* run continuation */
+      else {  /* unrecoverable error */
+        L->status = cast_byte(status);  /* mark thread as `dead' */
+        seterrorobj(L, status, L->top);
+        L->ci->top = L->top;
+        break;
+      }
+    }
+    lua_assert(status == L->status);
+  }
+  L->nny = oldnny;  /* restore; forcing 1 made a running coroutine unyieldable */
+  L->nCcalls--;
+  lua_assert(L->nCcalls == ((from) ? from->nCcalls : 0));
+  lua_unlock(L);
+  return status;
+}
+
+/* Yield 'nresults' values from L; 'k' and 'ctx' set the continuation, if any */
+LUA_API int lua_yieldk (lua_State *L, int nresults, int ctx, lua_CFunction k) {
+  CallInfo *ci = L->ci;
+  luai_userstateyield(L, nresults);
+  lua_lock(L);
+  api_checknelems(L, nresults);
+  if (L->nny > 0) {  /* yields are currently disallowed for this thread */
+    if (L != G(L)->mainthread)
+      luaG_runerror(L, "attempt to yield across a C-call boundary");
+    else
+      luaG_runerror(L, "attempt to yield from outside a coroutine");
+  }
+  L->status = LUA_YIELD;
+  ci->extra = savestack(L, ci->func);  /* save current 'func' */
+  if (isLua(ci)) {  /* inside a hook? */
+    api_check(L, k == NULL, "hooks cannot continue after yielding");
+  }
+  else {
+    if ((ci->u.c.k = k) != NULL)  /* is there a continuation? */
+      ci->u.c.ctx = ctx;  /* save context */
+    ci->func = L->top - nresults - 1;  /* protect stack below results */
+    luaD_throw(L, LUA_YIELD);
+  }
+  lua_assert(ci->callstatus & CIST_HOOKED);  /* must be inside a hook */
+  lua_unlock(L);
+  return 0;  /* return to 'luaD_hook' */
+}
+
+/* Protected call of func(L, u); on error, restores state as of 'old_top' */
+int luaD_pcall (lua_State *L, Pfunc func, void *u,
+                ptrdiff_t old_top, ptrdiff_t ef) {
+  int status;
+  CallInfo *old_ci = L->ci;
+  lu_byte old_allowhooks = L->allowhook;
+  unsigned short old_nny = L->nny;
+  ptrdiff_t old_errfunc = L->errfunc;
+  L->errfunc = ef;  /* 'ef' is the error function while 'func' runs */
+  status = luaD_rawrunprotected(L, func, u);
+  if (status != LUA_OK) {  /* an error occurred? */
+    StkId oldtop = restorestack(L, old_top);
+    luaF_close(L, oldtop);  /* close possible pending closures */
+    seterrorobj(L, status, oldtop);
+    L->ci = old_ci;
+    L->allowhook = old_allowhooks;
+    L->nny = old_nny;
+    luaD_shrinkstack(L);
+  }
+  L->errfunc = old_errfunc;  /* restored on both the success and error paths */
+  return status;
+}
+
+
+
+/*
+** Execute a protected parser.
+*/
+struct SParser {  /* data to `f_parser' */
+  ZIO *z;  /* input stream the chunk is read from */
+  Mbuffer buff;  /* dynamic structure used by the scanner */
+  Dyndata dyd;  /* dynamic structures used by the parser */
+  const char *mode;  /* allowed chunk kinds (see 'checkmode'); may be NULL */
+  const char *name;  /* chunk name handed to luaU_undump/luaY_parser */
+};
+
+/* Throw LUA_ERRSYNTAX unless 'mode' is NULL or contains x's first char */
+static void checkmode (lua_State *L, const char *mode, const char *x) {
+  if (mode && strchr(mode, x[0]) == NULL) {
+    luaO_pushfstring(L,
+       "attempt to load a %s chunk (mode is " LUA_QS ")", x, mode);
+    luaD_throw(L, LUA_ERRSYNTAX);
+  }
+}
+
+/* Load a binary or text chunk (chosen by its 1st char); create its upvalues */
+static void f_parser (lua_State *L, void *ud) {
+  int i;
+  Closure *cl;
+  struct SParser *p = cast(struct SParser *, ud);
+  int c = zgetc(p->z);  /* read first character */
+  if (c == LUA_SIGNATURE[0]) {  /* starts like a precompiled chunk */
+    checkmode(L, p->mode, "binary");
+    cl = luaU_undump(L, p->z, &p->buff, p->name);
+  }
+  else {
+    checkmode(L, p->mode, "text");
+    cl = luaY_parser(L, p->z, &p->buff, &p->dyd, p->name, c);
+  }
+  lua_assert(cl->l.nupvalues == cl->l.p->sizeupvalues);
+  for (i = 0; i < cl->l.nupvalues; i++) {  /* initialize upvalues */
+    UpVal *up = luaF_newupval(L);
+    cl->l.upvals[i] = up;
+    luaC_objbarrier(L, cl, up);
+  }
+}
+
+/* Run f_parser in protected mode with yields disabled; frees its buffers */
+int luaD_protectedparser (lua_State *L, ZIO *z, const char *name,
+                                        const char *mode) {
+  struct SParser p;
+  int status;
+  L->nny++;  /* cannot yield during parsing */
+  p.z = z; p.name = name; p.mode = mode;
+  p.dyd.actvar.arr = NULL; p.dyd.actvar.size = 0;
+  p.dyd.gt.arr = NULL; p.dyd.gt.size = 0;
+  p.dyd.label.arr = NULL; p.dyd.label.size = 0;
+  luaZ_initbuffer(L, &p.buff);
+  status = luaD_pcall(L, f_parser, &p, savestack(L, L->top), L->errfunc);
+  luaZ_freebuffer(L, &p.buff);  /* cleanup runs whatever 'status' is */
+  luaM_freearray(L, p.dyd.actvar.arr, p.dyd.actvar.size);
+  luaM_freearray(L, p.dyd.gt.arr, p.dyd.gt.size);
+  luaM_freearray(L, p.dyd.label.arr, p.dyd.label.size);
+  L->nny--;
+  return status;
+}
+
+
diff --git a/ext/lua/src/ldump.c b/ext/lua/src/ldump.c
new file mode 100644
index 0000000..d5e6a47
--- /dev/null
+++ b/ext/lua/src/ldump.c
@@ -0,0 +1,173 @@
+/*
+** $Id: ldump.c,v 2.17 2012/01/23 23:02:10 roberto Exp $
+** save precompiled Lua chunks
+** See Copyright Notice in lua.h
+*/
+
+#include <stddef.h>
+
+#define ldump_c
+#define LUA_CORE
+
+#include "lua.h"
+
+#include "lobject.h"
+#include "lstate.h"
+#include "lundump.h"
+
+typedef struct {
+ lua_State* L;  /* state passed to the writer */
+ lua_Writer writer;  /* output callback */
+ void* data;  /* opaque argument forwarded to 'writer' */
+ int strip;  /* nonzero: debug information is left out */
+ int status;  /* last writer result; once nonzero nothing more is written */
+} DumpState;
+
+#define DumpMem(b,n,size,D)	DumpBlock(b,(n)*(size),D)
+#define DumpVar(x,D)		DumpMem(&x,1,sizeof(x),D)
+
+static void DumpBlock(const void* b, size_t size, DumpState* D)
+{ /* hand 'size' bytes at 'b' to the writer and record its result */
+ if (D->status==0)  /* skipped after the writer has returned nonzero */
+ {
+  lua_unlock(D->L);
+  D->status=(*D->writer)(D->L,b,size,D->data);
+  lua_lock(D->L);
+ }
+}
+
+static void DumpChar(int y, DumpState* D)
+{ /* write 'y' converted to a single char */
+ char x=(char)y;
+ DumpVar(x,D);
+}
+
+static void DumpInt(int x, DumpState* D)
+{ /* write the sizeof(int) bytes of 'x' as they are in memory */
+ DumpVar(x,D);
+}
+
+static void DumpNumber(lua_Number x, DumpState* D)
+{ /* write the sizeof(lua_Number) bytes of 'x' as they are in memory */
+ DumpVar(x,D);
+}
+
+static void DumpVector(const void* b, int n, size_t size, DumpState* D)
+{ /* write the count 'n', then 'n' elements of 'size' bytes each from 'b' */
+ DumpInt(n,D);
+ DumpMem(b,n,size,D);
+}
+
+static void DumpString(const TString* s, DumpState* D)
+{ /* write a size_t length followed by the bytes; NULL is written as 0 */
+ if (s==NULL)
+ {
+  size_t size=0;
+  DumpVar(size,D);
+ }
+ else
+ {
+  size_t size=s->tsv.len+1;		/* include trailing '\0' */
+  DumpVar(size,D);
+  DumpBlock(getstr(s),size*sizeof(char),D);
+ }
+}
+
+#define DumpCode(f,D)	 DumpVector(f->code,f->sizecode,sizeof(Instruction),D)
+
+static void DumpFunction(const Proto* f, DumpState* D);
+
+static void DumpConstants(const Proto* f, DumpState* D)
+{ /* write the constant table of 'f', then its nested prototypes */
+ int i,n=f->sizek;
+ DumpInt(n,D);
+ for (i=0; i<n; i++)
+ {
+  const TValue* o=&f->k[i];
+  DumpChar(ttypenv(o),D);  /* type tag first, then the value (if any) */
+  switch (ttypenv(o))
+  {
+   case LUA_TNIL:
+	break;
+   case LUA_TBOOLEAN:
+	DumpChar(bvalue(o),D);
+	break;
+   case LUA_TNUMBER:
+	DumpNumber(nvalue(o),D);
+	break;
+   case LUA_TSTRING:
+	DumpString(rawtsvalue(o),D);
+	break;
+    default: lua_assert(0);
+  }
+ }
+ n=f->sizep;
+ DumpInt(n,D);
+ for (i=0; i<n; i++) DumpFunction(f->p[i],D);
+}
+
+static void DumpUpvalues(const Proto* f, DumpState* D)
+{ /* write the upvalue count, then 'instack' and 'idx' of each upvalue */
+ int i,n=f->sizeupvalues;
+ DumpInt(n,D);
+ for (i=0; i<n; i++)
+ {
+  DumpChar(f->upvalues[i].instack,D);
+  DumpChar(f->upvalues[i].idx,D);
+ }
+}
+
+static void DumpDebug(const Proto* f, DumpState* D)
+{ /* write the debug data of 'f'; with 'strip' set, empty placeholders */
+ int i,n;
+ DumpString((D->strip) ? NULL : f->source,D);
+ n= (D->strip) ? 0 : f->sizelineinfo;
+ DumpVector(f->lineinfo,n,sizeof(int),D);
+ n= (D->strip) ? 0 : f->sizelocvars;
+ DumpInt(n,D);
+ for (i=0; i<n; i++)  /* local variables: name and pc range of each */
+ {
+  DumpString(f->locvars[i].varname,D);
+  DumpInt(f->locvars[i].startpc,D);
+  DumpInt(f->locvars[i].endpc,D);
+ }
+ n= (D->strip) ? 0 : f->sizeupvalues;
+ DumpInt(n,D);
+ for (i=0; i<n; i++) DumpString(f->upvalues[i].name,D);
+}
+
+static void DumpFunction(const Proto* f, DumpState* D)
+{ /* write one prototype: header fields, code, constants, upvalues, debug */
+ DumpInt(f->linedefined,D);
+ DumpInt(f->lastlinedefined,D);
+ DumpChar(f->numparams,D);
+ DumpChar(f->is_vararg,D);
+ DumpChar(f->maxstacksize,D);
+ DumpCode(f,D);
+ DumpConstants(f,D);  /* also writes the nested prototypes */
+ DumpUpvalues(f,D);
+ DumpDebug(f,D);
+}
+
+static void DumpHeader(DumpState* D)
+{ /* write the LUAC_HEADERSIZE-byte header filled in by luaU_header */
+ lu_byte h[LUAC_HEADERSIZE];
+ luaU_header(h);
+ DumpBlock(h,LUAC_HEADERSIZE,D);
+}
+
+/*
+** dump Lua function as precompiled chunk; returns the final writer status
+*/
+int luaU_dump (lua_State* L, const Proto* f, lua_Writer w, void* data, int strip)
+{
+ DumpState D;
+ D.L=L;
+ D.writer=w;
+ D.data=data;
+ D.strip=strip;
+ D.status=0;  /* 0 lets DumpBlock write; a nonzero writer result stops it */
+ DumpHeader(&D);
+ DumpFunction(f,&D);
+ return D.status;
+}
diff --git a/ext/lua/src/lfunc.c b/ext/lua/src/lfunc.c
new file mode 100644
index 0000000..c212840
--- /dev/null
+++ b/ext/lua/src/lfunc.c
@@ -0,0 +1,161 @@
+/*
+** $Id: lfunc.c,v 2.30 2012/10/03 12:36:46 roberto Exp $
+** Auxiliary functions to manipulate prototypes and closures
+** See Copyright Notice in lua.h
+*/
+
+
+#include <stddef.h>
+
+#define lfunc_c
+#define LUA_CORE
+
+#include "lua.h"
+
+#include "lfunc.h"
+#include "lgc.h"
+#include "lmem.h"
+#include "lobject.h"
+#include "lstate.h"
+
+
+/* Create a new C closure object sized for 'n' upvalues */
+Closure *luaF_newCclosure (lua_State *L, int n) {
+  Closure *c = &luaC_newobj(L, LUA_TCCL, sizeCclosure(n), NULL, 0)->cl;
+  c->c.nupvalues = cast_byte(n);
+  return c;
+}
+
+/* Create a Lua closure with 'n' upvalue slots, all NULL, and no prototype */
+Closure *luaF_newLclosure (lua_State *L, int n) {
+  Closure *c = &luaC_newobj(L, LUA_TLCL, sizeLclosure(n), NULL, 0)->cl;
+  c->l.p = NULL;
+  c->l.nupvalues = cast_byte(n);
+  while (n--) c->l.upvals[n] = NULL;
+  return c;
+}
+
+/* Create an upvalue whose 'v' points to its own storage, which holds nil */
+UpVal *luaF_newupval (lua_State *L) {
+  UpVal *uv = &luaC_newobj(L, LUA_TUPVAL, sizeof(UpVal), NULL, 0)->uv;
+  uv->v = &uv->u.value;
+  setnilvalue(uv->v);
+  return uv;
+}
+
+/* Return the open upvalue for stack slot 'level', creating it if needed */
+UpVal *luaF_findupval (lua_State *L, StkId level) {
+  global_State *g = G(L);
+  GCObject **pp = &L->openupval;
+  UpVal *p;
+  UpVal *uv;
+  while (*pp != NULL && (p = gco2uv(*pp))->v >= level) {
+    GCObject *o = obj2gco(p);
+    lua_assert(p->v != &p->u.value);
+    lua_assert(!isold(o) || isold(obj2gco(L)));
+    if (p->v == level) {  /* found a corresponding upvalue? */
+      if (isdead(g, o))  /* is it dead? */
+        changewhite(o);  /* resurrect it */
+      return p;
+    }
+    pp = &p->next;
+  }
+  /* not found: create a new one */
+  uv = &luaC_newobj(L, LUA_TUPVAL, sizeof(UpVal), pp, 0)->uv;
+  uv->v = level;  /* current value lives in the stack */
+  uv->u.l.prev = &g->uvhead;  /* double link it in `uvhead' list */
+  uv->u.l.next = g->uvhead.u.l.next;
+  uv->u.l.next->u.l.prev = uv;
+  g->uvhead.u.l.next = uv;
+  lua_assert(uv->u.l.next->u.l.prev == uv && uv->u.l.prev->u.l.next == uv);
+  return uv;
+}
+
+/* Remove 'uv' from the doubly linked 'uvhead' list */
+static void unlinkupval (UpVal *uv) {
+  lua_assert(uv->u.l.next->u.l.prev == uv && uv->u.l.prev->u.l.next == uv);
+  uv->u.l.next->u.l.prev = uv->u.l.prev;  /* remove from `uvhead' list */
+  uv->u.l.prev->u.l.next = uv->u.l.next;
+}
+
+/* Free 'uv', first unlinking it from the 'uvhead' list if it is open */
+void luaF_freeupval (lua_State *L, UpVal *uv) {
+  if (uv->v != &uv->u.value)  /* is it open? */
+    unlinkupval(uv);  /* remove from open list */
+  luaM_free(L, uv);  /* free upvalue */
+}
+
+/* Close every open upvalue of L that refers to slot 'level' or above */
+void luaF_close (lua_State *L, StkId level) {
+  UpVal *uv;
+  global_State *g = G(L);
+  while (L->openupval != NULL && (uv = gco2uv(L->openupval))->v >= level) {
+    GCObject *o = obj2gco(uv);
+    lua_assert(!isblack(o) && uv->v != &uv->u.value);
+    L->openupval = uv->next;  /* remove from `open' list */
+    if (isdead(g, o))
+      luaF_freeupval(L, uv);  /* free upvalue */
+    else {
+      unlinkupval(uv);  /* remove upvalue from 'uvhead' list */
+      setobj(L, &uv->u.value, uv->v);  /* move value to upvalue slot */
+      uv->v = &uv->u.value;  /* now current value lives here */
+      gch(o)->next = g->allgc;  /* link upvalue into 'allgc' list */
+      g->allgc = o;
+      luaC_checkupvalcolor(g, uv);
+    }
+  }
+}
+
+/* Create a function prototype with every array NULL and every count zero */
+Proto *luaF_newproto (lua_State *L) {
+  Proto *f = &luaC_newobj(L, LUA_TPROTO, sizeof(Proto), NULL, 0)->p;
+  f->k = NULL;
+  f->sizek = 0;
+  f->p = NULL;
+  f->sizep = 0;
+  f->code = NULL;
+  f->cache = NULL;
+  f->sizecode = 0;
+  f->lineinfo = NULL;
+  f->sizelineinfo = 0;
+  f->upvalues = NULL;
+  f->sizeupvalues = 0;
+  f->numparams = 0;
+  f->is_vararg = 0;
+  f->maxstacksize = 0;
+  f->locvars = NULL;
+  f->sizelocvars = 0;
+  f->linedefined = 0;
+  f->lastlinedefined = 0;
+  f->source = NULL;
+  return f;
+}
+
+/* Free the six arrays owned by prototype 'f' and then 'f' itself */
+void luaF_freeproto (lua_State *L, Proto *f) {
+  luaM_freearray(L, f->code, f->sizecode);
+  luaM_freearray(L, f->p, f->sizep);
+  luaM_freearray(L, f->k, f->sizek);
+  luaM_freearray(L, f->lineinfo, f->sizelineinfo);
+  luaM_freearray(L, f->locvars, f->sizelocvars);
+  luaM_freearray(L, f->upvalues, f->sizeupvalues);
+  luaM_free(L, f);
+}
+
+
+/*
+** Look for the name of the 'local_number'-th local variable that is
+** active at instruction 'pc' of prototype 'f'. The scan stops at the
+** first variable whose 'startpc' is past 'pc'.
+** Returns NULL if not found.
+*/
+const char *luaF_getlocalname (const Proto *f, int local_number, int pc) {
+  int idx = 0;
+  while (idx < f->sizelocvars && f->locvars[idx].startpc <= pc) {
+    if (pc < f->locvars[idx].endpc && --local_number == 0)  /* active? */
+      return getstr(f->locvars[idx].varname);
+    idx++;
+  }
+  return NULL;  /* not found */
+}
+
diff --git a/ext/lua/src/lgc.c b/ext/lua/src/lgc.c
new file mode 100644
index 0000000..535e988
--- /dev/null
+++ b/ext/lua/src/lgc.c
@@ -0,0 +1,1213 @@
+/*
+** $Id: lgc.c,v 2.140 2013/03/16 21:10:18 roberto Exp $
+** Garbage Collector
+** See Copyright Notice in lua.h
+*/
+
+#include <string.h>
+
+#define lgc_c
+#define LUA_CORE
+
+#include "lua.h"
+
+#include "ldebug.h"
+#include "ldo.h"
+#include "lfunc.h"
+#include "lgc.h"
+#include "lmem.h"
+#include "lobject.h"
+#include "lstate.h"
+#include "lstring.h"
+#include "ltable.h"
+#include "ltm.h"
+
+
+
+/*
+** cost of sweeping one element (the size of a small object divided
+** by some adjust for the sweep speed)
+*/
+#define GCSWEEPCOST	((sizeof(TString) + 4) / 4)
+
+/* maximum number of elements to sweep in each single step */
+#define GCSWEEPMAX	(cast_int((GCSTEPSIZE / GCSWEEPCOST) / 4))
+
+/* maximum number of finalizers to call in each GC step */
+#define GCFINALIZENUM	4
+
+
+/*
+** macro to adjust 'stepmul': 'stepmul' is actually used like
+** 'stepmul / STEPMULADJ' (value chosen by tests)
+*/
+#define STEPMULADJ		200
+
+
+/*
+** macro to adjust 'pause': 'pause' is actually used like
+** 'pause / PAUSEADJ' (value chosen by tests)
+*/
+#define PAUSEADJ		100
+
+
+/*
+** 'makewhite' erases all color bits plus the old bit and then
+** sets only the current white bit
+*/
+#define maskcolors	(~(bit2mask(BLACKBIT, OLDBIT) | WHITEBITS))
+#define makewhite(g,x)	\
+ (gch(x)->marked = cast_byte((gch(x)->marked & maskcolors) | luaC_white(g)))
+
+#define white2gray(x)	resetbits(gch(x)->marked, WHITEBITS)
+#define black2gray(x)	resetbit(gch(x)->marked, BLACKBIT)
+
+
+#define isfinalized(x)		testbit(gch(x)->marked, FINALIZEDBIT)
+
+#define checkdeadkey(n)	lua_assert(!ttisdeadkey(gkey(n)) || ttisnil(gval(n)))
+
+
+#define checkconsistency(obj)  \
+  lua_longassert(!iscollectable(obj) || righttt(obj))
+
+/* mark the object held by value 'o' when 'valiswhite(o)' holds */
+#define markvalue(g,o) { checkconsistency(o); \
+  if (valiswhite(o)) reallymarkobject(g,gcvalue(o)); }
+/* mark object 't' when the pointer is not NULL and the object is white */
+#define markobject(g,t) { if ((t) && iswhite(obj2gco(t))) \
+		reallymarkobject(g, obj2gco(t)); }
+
+static void reallymarkobject (global_State *g, GCObject *o);
+
+
+/*
+** {======================================================
+** Generic functions
+** =======================================================
+*/
+
+
+/*
+** one after last element in a hash array
+*/
+#define gnodelast(h)	gnode(h, cast(size_t, sizenode(h)))
+
+
+/*
+** link table 'h' into list pointed by 'p'
+*/
+#define linktable(h,p)	((h)->gclist = *(p), *(p) = obj2gco(h))
+
+
+/*
+** 'n' must be an empty entry (nil value). If its key is still white
+** (unmarked), turn the key into a dead key, removing it from the table.
+*/
+static void removeentry (Node *n) {
+  lua_assert(ttisnil(gval(n)));  /* only empty entries may be removed */
+  if (valiswhite(gkey(n)))
+    setdeadvalue(gkey(n));  /* unused and unmarked key; remove it */
+}
+
+
+/*
+** Tell whether a key or value can be cleared from a weak table.
+** Non-collectable objects are never removed from weak tables. Strings
+** behave as `values': they are marked here and never removed either.
+** For other objects: if really collected, cannot keep them; for objects
+** being finalized, keep them in keys, but not in values.
+*/
+static int iscleared (global_State *g, const TValue *o) {
+  if (iscollectable(o)) {
+    if (!ttisstring(o))
+      return iswhite(gcvalue(o));  /* can be cleared iff still white */
+    markobject(g, rawtsvalue(o));  /* strings are `values', so are never weak */
+  }
+  return 0;  /* non-collectable objects and strings are never cleared */
+}
+
+
+/*
+** Forward barrier: black object 'o' now points to white object 'v'.
+** While the invariant is kept, mark 'v'; otherwise whiten 'o'.
+*/
+void luaC_barrier_ (lua_State *L, GCObject *o, GCObject *v) {
+  global_State *g = G(L);
+  lua_assert(isblack(o) && iswhite(v) && !isdead(g, v) && !isdead(g, o));
+  lua_assert(g->gcstate != GCSpause);
+  lua_assert(gch(o)->tt != LUA_TTABLE);
+  if (!keepinvariantout(g)) {  /* sweep phase: no invariant to keep */
+    lua_assert(issweepphase(g));
+    makewhite(g, o);  /* mark main obj. as white to avoid other barriers */
+    return;
+  }
+  reallymarkobject(g, v);  /* must keep invariant: restore it */
+}
+
+
+/*
+** barrier that moves collector backward: the black table 'o', which
+** now points to a white object, is made gray again and queued in the
+** 'grayagain' list. (Current implementation only works for tables;
+** access to 'gclist' is not uniform across different types.)
+*/
+void luaC_barrierback_ (lua_State *L, GCObject *o) {
+  global_State *g = G(L);
+  lua_assert(isblack(o) && !isdead(g, o) && gch(o)->tt == LUA_TTABLE);
+  black2gray(o);  /* make object gray (again) */
+  gco2t(o)->gclist = g->grayagain;  /* push the table onto 'grayagain' ... */
+  g->grayagain = o;  /* ... which is now headed by 'o' */
+}
+
+
+/*
+** Barrier for prototypes, called when black prototype 'p' is about to
+** reference closure 'c'. For the first closure (cache is NULL) use a
+** forward barrier: this may be the only closure of the prototype (a
+** "regular" function, with a single instance) and the prototype may be
+** big, so it is better to avoid traversing it again. Otherwise use a
+** backward barrier, to avoid marking all possible instances.
+*/
+LUAI_FUNC void luaC_barrierproto_ (lua_State *L, Proto *p, Closure *c) {
+  global_State *g = G(L);
+  lua_assert(isblack(obj2gco(p)));
+  if (p->cache != NULL) {  /* not the first time: use a backward barrier */
+    GCObject *po = obj2gco(p);
+    black2gray(po);  /* make prototype gray (again) */
+    p->gclist = g->grayagain;
+    g->grayagain = po;
+    return;
+  }
+  luaC_objbarrier(L, p, c);  /* first time: forward barrier */
+}
+
+
+/*
+** Fix the color of upvalue 'uv' right after it was closed, that is,
+** moved into the 'allgc' list. Only gray upvalues need any change.
+*/
+void luaC_checkupvalcolor (global_State *g, UpVal *uv) {
+  GCObject *o = obj2gco(uv);
+  lua_assert(!isblack(o));  /* open upvalues are never black */
+  if (!isgray(o))
+    return;  /* not gray: nothing to adjust */
+  if (!keepinvariant(g)) {  /* invariant not being kept? */
+    lua_assert(issweepphase(g));
+    makewhite(g, o);
+    return;
+  }
+  /* invariant is being kept: blacken the upvalue and mark its value */
+  resetoldbit(o);  /* see MOVE OLD rule */
+  gray2black(o);  /* it is being visited now */
+  markvalue(g, uv->v);
+}
+
+
+/*
+** Create a new collectable object of type 'tt' and size 'sz' and link
+** it at the head of '*list' ('allgc' when 'list' is NULL). 'offset'
+** tells how many bytes to skip before the object itself (used only by
+** states).
+*/
+GCObject *luaC_newobj (lua_State *L, int tt, size_t sz, GCObject **list,
+                       int offset) {
+  global_State *g = G(L);
+  char *block = cast(char *, luaM_newobject(L, novariant(tt), sz));
+  GCObject *obj = obj2gco(block + offset);
+  GCheader *hdr = gch(obj);
+  GCObject **head = (list != NULL) ? list : &g->allgc;  /* standard list */
+  hdr->marked = luaC_white(g);  /* new objects are born white */
+  hdr->tt = tt;
+  hdr->next = *head;  /* push onto the front of the chosen list */
+  *head = obj;
+  return obj;
+}
+
+/* }====================================================== */
+
+
+
+/*
+** {======================================================
+** Mark functions
+** =======================================================
+*/
+
+
+/*
+** mark an object. Userdata, strings, and closed upvalues are visited
+** and turned black here. Other objects are marked gray and added
+** to appropriate list to be visited (and turned black) later. (Open
+** upvalues are already linked in 'uvhead' list.)
+*/
+static void reallymarkobject (global_State *g, GCObject *o) {
+  lu_mem size;  /* memory accounted for objects blackened right here */
+  white2gray(o);
+  switch (gch(o)->tt) {
+    case LUA_TSHRSTR:
+    case LUA_TLNGSTR: {
+      size = sizestring(gco2ts(o));
+      break;  /* nothing else to mark; make it black */
+    }
+    case LUA_TUSERDATA: {
+      Table *mt = gco2u(o)->metatable;
+      markobject(g, mt);  /* mark its metatable (if any) ... */
+      markobject(g, gco2u(o)->env);  /* ... and its environment */
+      size = sizeudata(gco2u(o));
+      break;
+    }
+    case LUA_TUPVAL: {
+      UpVal *uv = gco2uv(o);
+      markvalue(g, uv->v);
+      if (uv->v != &uv->u.value)  /* open? */
+        return;  /* open upvalues remain gray */
+      size = sizeof(UpVal);
+      break;
+    }
+    case LUA_TLCL: {  /* Lua closure: push onto 'gray' list */
+      gco2lcl(o)->gclist = g->gray;
+      g->gray = o;
+      return;
+    }
+    case LUA_TCCL: {  /* C closure: push onto 'gray' list */
+      gco2ccl(o)->gclist = g->gray;
+      g->gray = o;
+      return;
+    }
+    case LUA_TTABLE: {  /* table: push onto 'gray' list */
+      linktable(gco2t(o), &g->gray);
+      return;
+    }
+    case LUA_TTHREAD: {  /* thread: push onto 'gray' list */
+      gco2th(o)->gclist = g->gray;
+      g->gray = o;
+      return;
+    }
+    case LUA_TPROTO: {  /* prototype: push onto 'gray' list */
+      gco2p(o)->gclist = g->gray;
+      g->gray = o;
+      return;
+    }
+    default: lua_assert(0); return;
+  }
+  gray2black(o);  /* reached only by the cases that 'break' */
+  g->GCmemtrav += size;
+}
+
+
+/*
+** mark the metatables of the basic types (all LUA_NUMTAGS slots of 'mt')
+*/
+static void markmt (global_State *g) {
+  struct Table **slot = g->mt;
+  struct Table **const end = slot + LUA_NUMTAGS;
+  for (; slot != end; slot++) markobject(g, *slot);
+}
+
+
+/* mark all objects in the list of being-finalized ('tobefnz'); each */
+/* one is first reset to the current white and then marked */
+static void markbeingfnz (global_State *g) {
+  GCObject *curr = g->tobefnz;
+  while (curr != NULL) {
+    makewhite(g, curr);
+    reallymarkobject(g, curr);
+    curr = gch(curr)->next;
+  }
+}
+
+
+/*
+** mark the values of all gray upvalues in 'uvhead'. (See 'lstate.h'.)
+*/
+static void remarkupvals (global_State *g) {
+  UpVal *uv = g->uvhead.u.l.next;
+  while (uv != &g->uvhead) {  /* stop when back at the list header */
+    if (isgray(obj2gco(uv)))
+      markvalue(g, uv->v);
+    uv = uv->u.l.next;
+  }
+}
+
+
+/*
+** mark root set and reset all gray lists, to start a new
+** incremental (or full) collection
+*/
+static void restartcollection (global_State *g) {
+  g->gray = g->grayagain = NULL;  /* no pending gray objects */
+  g->weak = g->allweak = g->ephemeron = NULL;  /* no pending weak tables */
+  markobject(g, g->mainthread);  /* roots: main thread, ... */
+  markvalue(g, &g->l_registry);  /* ... registry, ... */
+  markmt(g);  /* ... and metatables of basic types */
+  markbeingfnz(g);  /* mark any finalizing object left from previous cycle */
+}
+
+/* }====================================================== */
+
+
+/*
+** {======================================================
+** Traverse functions
+** =======================================================
+*/
+/* traverse a table with strong keys and weak values: keys get marked */
+static void traverseweakvalue (global_State *g, Table *h) {
+  Node *n, *limit = gnodelast(h);
+  /* if there is array part, assume it may have white values (do not
+     traverse it just to check) */
+  int hasclears = (h->sizearray > 0);
+  for (n = gnode(h, 0); n < limit; n++) {
+    checkdeadkey(n);
+    if (ttisnil(gval(n)))  /* entry is empty? */
+      removeentry(n);  /* remove it */
+    else {
+      lua_assert(!ttisnil(gkey(n)));
+      markvalue(g, gkey(n));  /* mark key */
+      if (!hasclears && iscleared(g, gval(n)))  /* is there a white value? */
+        hasclears = 1;  /* table will have to be cleared */
+    }
+  }
+  if (hasclears)
+    linktable(h, &g->weak);  /* has to be cleared later */
+  else  /* no white values */
+    linktable(h, &g->grayagain);  /* no need to clean */
+}
+/* Traverse an ephemeron table (weak keys, strong values). Returns true */
+/* when some object was marked, so the caller knows it must propagate.  */
+static int traverseephemeron (global_State *g, Table *h) {
+  int marked = 0;  /* true if an object is marked in this traversal */
+  int hasclears = 0;  /* true if table has white keys */
+  int prop = 0;  /* true if table has entry "white-key -> white-value" */
+  Node *n, *limit = gnodelast(h);
+  int i;
+  for (i = 0; i < h->sizearray; i++) {  /* array part (keys are 'strong') */
+    if (valiswhite(&h->array[i])) {
+      marked = 1;
+      reallymarkobject(g, gcvalue(&h->array[i]));
+    }
+  }
+  for (n = gnode(h, 0); n < limit; n++) {  /* hash part */
+    checkdeadkey(n);
+    if (ttisnil(gval(n)))  /* entry is empty? */
+      removeentry(n);  /* remove it */
+    else if (iscleared(g, gkey(n))) {  /* key is not marked (yet)? */
+      hasclears = 1;  /* table must be cleared */
+      if (valiswhite(gval(n)))  /* value not marked yet? */
+        prop = 1;  /* must propagate again */
+    }
+    else if (valiswhite(gval(n))) {  /* value not marked yet? */
+      marked = 1;
+      reallymarkobject(g, gcvalue(gval(n)));  /* mark it now */
+    }
+  }
+  /* before the atomic phase the table stays gray and may still change
+     or have keys marked later, so it must always be traversed again */
+  if (g->gcstate != GCSatomic || prop)
+    linktable(h, &g->ephemeron);  /* have to propagate again */
+  else if (hasclears)  /* does table have white keys? */
+    linktable(h, &g->allweak);  /* may have to clean white keys */
+  else  /* no white keys */
+    linktable(h, &g->grayagain);  /* no need to clean */
+  return marked;
+}
+
+/* traverse a table without weak mode: mark array values, keys and values */
+static void traversestrongtable (global_State *g, Table *h) {
+  Node *node, *last = gnodelast(h);
+  int idx;
+  for (idx = 0; idx < h->sizearray; idx++)  /* array part */
+    markvalue(g, &h->array[idx]);
+  for (node = gnode(h, 0); node < last; node++) {  /* hash part */
+    checkdeadkey(node);
+    if (ttisnil(gval(node))) {  /* entry is empty? */
+      removeentry(node);  /* remove it */
+      continue;
+    }
+    lua_assert(!ttisnil(gkey(node)));
+    markvalue(g, gkey(node));  /* mark key */
+    markvalue(g, gval(node));  /* mark value */
+  }
+}
+
+/* traverse table 'h' according to its weak mode; return its memory size */
+static lu_mem traversetable (global_State *g, Table *h) {
+  const char *weakkey, *weakvalue;
+  const TValue *mode = gfasttm(g, h->metatable, TM_MODE);
+  markobject(g, h->metatable);
+  if (mode && ttisstring(mode) &&  /* is there a weak mode? */
+      ((weakkey = strchr(svalue(mode), 'k')),
+       (weakvalue = strchr(svalue(mode), 'v')),
+       (weakkey || weakvalue))) {  /* is really weak? */
+    black2gray(obj2gco(h));  /* keep table gray */
+    if (!weakkey)  /* strong keys? */
+      traverseweakvalue(g, h);
+    else if (!weakvalue)  /* strong values? */
+      traverseephemeron(g, h);  /* (its result is not needed here) */
+    else  /* all weak */
+      linktable(h, &g->allweak);  /* nothing to traverse now */
+  }
+  else  /* not weak */
+    traversestrongtable(g, h);
+  return sizeof(Table) + sizeof(TValue) * h->sizearray +
+                         sizeof(Node) * cast(size_t, sizenode(h));
+}
+
+/* mark everything referenced by prototype 'f'; return its memory size */
+static int traverseproto (global_State *g, Proto *f) {
+  int i;
+  if (f->cache && iswhite(obj2gco(f->cache)))
+    f->cache = NULL;  /* allow cache to be collected */
+  markobject(g, f->source);  /* mark 'source' (may be NULL) */
+  for (i = 0; i < f->sizek; i++)  /* mark literals */
+    markvalue(g, &f->k[i]);
+  for (i = 0; i < f->sizeupvalues; i++)  /* mark upvalue names */
+    markobject(g, f->upvalues[i].name);
+  for (i = 0; i < f->sizep; i++)  /* mark nested protos */
+    markobject(g, f->p[i]);
+  for (i = 0; i < f->sizelocvars; i++)  /* mark local-variable names */
+    markobject(g, f->locvars[i].varname);
+  return sizeof(Proto) + sizeof(Instruction) * f->sizecode +
+                         sizeof(Proto *) * f->sizep +
+                         sizeof(TValue) * f->sizek +
+                         sizeof(int) * f->sizelineinfo +
+                         sizeof(LocVar) * f->sizelocvars +
+                         sizeof(Upvaldesc) * f->sizeupvalues;
+}
+
+/* mark all upvalues of C closure 'cl'; return 'sizeCclosure' for it */
+static lu_mem traverseCclosure (global_State *g, CClosure *cl) {
+  TValue *uv = cl->upvalue;
+  TValue *const end = uv + cl->nupvalues;
+  for (; uv < end; uv++) markvalue(g, uv);  /* mark its upvalues */
+  return sizeCclosure(cl->nupvalues);
+}
+/* mark prototype and upvalues of Lua closure 'cl'; return its size */
+static lu_mem traverseLclosure (global_State *g, LClosure *cl) {
+  UpVal **uv = cl->upvals;
+  UpVal **const end = uv + cl->nupvalues;
+  markobject(g, cl->p);  /* mark its prototype */
+  for (; uv != end; uv++) markobject(g, *uv);  /* mark its upvalues */
+  return sizeLclosure(cl->nupvalues);
+}
+
+/* mark the used part of the stack of thread 'th'; return its memory size */
+static lu_mem traversestack (global_State *g, lua_State *th) {
+  StkId o = th->stack;
+  if (o == NULL)
+    return 1;  /* stack not completely built yet */
+  for (; o < th->top; o++)  /* mark live slots, from base up to 'top' */
+    markvalue(g, o);
+  if (g->gcstate == GCSatomic) {  /* final traversal? */
+    StkId lim = th->stack + th->stacksize;  /* real end of stack */
+    for (; o < lim; o++)  /* clear not-marked stack slice */
+      setnilvalue(o);
+  }
+  return sizeof(lua_State) + sizeof(TValue) * th->stacksize;
+}
+
+
+/*
+** traverse one gray object (the head of the 'gray' list), turning it
+** to black (except for threads, which are always gray).
+*/
+static void propagatemark (global_State *g) {
+  lu_mem size;  /* memory traversed for this object */
+  GCObject *o = g->gray;
+  lua_assert(isgray(o));
+  gray2black(o);  /* weak tables and threads are turned gray again below */
+  switch (gch(o)->tt) {
+    case LUA_TTABLE: {
+      Table *h = gco2t(o);
+      g->gray = h->gclist;  /* remove from 'gray' list */
+      size = traversetable(g, h);
+      break;
+    }
+    case LUA_TLCL: {
+      LClosure *cl = gco2lcl(o);
+      g->gray = cl->gclist;  /* remove from 'gray' list */
+      size = traverseLclosure(g, cl);
+      break;
+    }
+    case LUA_TCCL: {
+      CClosure *cl = gco2ccl(o);
+      g->gray = cl->gclist;  /* remove from 'gray' list */
+      size = traverseCclosure(g, cl);
+      break;
+    }
+    case LUA_TTHREAD: {
+      lua_State *th = gco2th(o);
+      g->gray = th->gclist;  /* remove from 'gray' list */
+      th->gclist = g->grayagain;
+      g->grayagain = o;  /* insert into 'grayagain' list */
+      black2gray(o);  /* threads stay gray */
+      size = traversestack(g, th);
+      break;
+    }
+    case LUA_TPROTO: {
+      Proto *p = gco2p(o);
+      g->gray = p->gclist;  /* remove from 'gray' list */
+      size = traverseproto(g, p);
+      break;
+    }
+    default: lua_assert(0); return;
+  }
+  g->GCmemtrav += size;  /* account for the memory just traversed */
+}
+
+/* traverse gray objects until the 'gray' list is empty */
+static void propagateall (global_State *g) {
+  while (g->gray) propagatemark(g);
+}
+
+/* install list 'l' as the 'gray' list and traverse it completely */
+static void propagatelist (global_State *g, GCObject *l) {
+  lua_assert(g->gray == NULL);  /* no grays left */
+  g->gray = l;
+  propagateall(g);  /* traverse all elements from 'l' */
+}
+
+/*
+** retraverse all gray lists. Because tables may be reinserted in other
+** lists when traversed, traverse the original lists to avoid traversing
+** twice the same table (which is not wrong, but inefficient)
+*/
+static void retraversegrays (global_State *g) {
+  GCObject *weak = g->weak;  /* save original lists */
+  GCObject *grayagain = g->grayagain;
+  GCObject *ephemeron = g->ephemeron;
+  g->weak = g->grayagain = g->ephemeron = NULL;  /* start with empty lists */
+  propagateall(g);  /* traverse main gray list */
+  propagatelist(g, grayagain);  /* then each saved list, one at a time */
+  propagatelist(g, weak);
+  propagatelist(g, ephemeron);
+}
+
+/* retraverse the ephemeron tables until a whole pass marks nothing new */
+static void convergeephemerons (global_State *g) {
+  int changed;
+  do {
+    GCObject *w;
+    GCObject *next = g->ephemeron;  /* get ephemeron list */
+    g->ephemeron = NULL;  /* tables will return to this list when traversed */
+    changed = 0;
+    while ((w = next) != NULL) {
+      next = gco2t(w)->gclist;  /* read link before 'w' is relinked */
+      if (traverseephemeron(g, gco2t(w))) {  /* traverse marked some value? */
+        propagateall(g);  /* propagate changes */
+        changed = 1;  /* will have to revisit all ephemeron tables */
+      }
+    }
+  } while (changed);
+}
+
+/* }====================================================== */
+
+
+/*
+** {======================================================
+** Sweep Functions
+** =======================================================
+*/
+
+
+/* clear entries with unmarked keys from all weak tables in list 'l', */
+/* stopping when element 'f' is reached */
+static void clearkeys (global_State *g, GCObject *l, GCObject *f) {
+  while (l != f) {
+    Table *tbl = gco2t(l);
+    Node *node = gnode(tbl, 0);
+    Node *const last = gnodelast(tbl);
+    for (; node < last; node++) {
+      if (ttisnil(gval(node)) || !iscleared(g, gkey(node)))
+        continue;  /* empty entry, or key must be kept */
+      setnilvalue(gval(node));  /* remove value ... */
+      removeentry(node);  /* and remove entry from table */
+    }
+    l = tbl->gclist;
+  }
+}
+
+
+/*
+** clear entries with unmarked values from all weak tables in list 'l',
+** stopping when element 'f' is reached */
+static void clearvalues (global_State *g, GCObject *l, GCObject *f) {
+  while (l != f) {
+    Table *tbl = gco2t(l);
+    Node *node, *last = gnodelast(tbl);
+    int idx;
+    for (idx = 0; idx < tbl->sizearray; idx++) {  /* array part */
+      TValue *slot = &tbl->array[idx];
+      if (iscleared(g, slot))  /* value was collected? */
+        setnilvalue(slot);  /* remove value */
+    }
+    for (node = gnode(tbl, 0); node < last; node++) {  /* hash part */
+      if (ttisnil(gval(node)) || !iscleared(g, gval(node)))
+        continue;  /* empty entry, or value must be kept */
+      setnilvalue(gval(node));  /* remove value ... */
+      removeentry(node);  /* and remove entry from table */
+    }
+    l = tbl->gclist;
+  }
+}
+
+/* release object 'o', dispatching on its type tag */
+static void freeobj (lua_State *L, GCObject *o) {
+  switch (gch(o)->tt) {
+    case LUA_TPROTO: luaF_freeproto(L, gco2p(o)); break;
+    case LUA_TLCL: {
+      luaM_freemem(L, o, sizeLclosure(gco2lcl(o)->nupvalues));
+      break;
+    }
+    case LUA_TCCL: {
+      luaM_freemem(L, o, sizeCclosure(gco2ccl(o)->nupvalues));
+      break;
+    }
+    case LUA_TUPVAL: luaF_freeupval(L, gco2uv(o)); break;
+    case LUA_TTABLE: luaH_free(L, gco2t(o)); break;
+    case LUA_TTHREAD: luaE_freethread(L, gco2th(o)); break;
+    case LUA_TUSERDATA: luaM_freemem(L, o, sizeudata(gco2u(o))); break;
+    case LUA_TSHRSTR:
+      G(L)->strt.nuse--;  /* one less short string in the string table */
+      /* fall through */
+    case LUA_TLNGSTR: {
+      luaM_freemem(L, o, sizestring(gco2ts(o)));
+      break;
+    }
+    default: lua_assert(0);
+  }
+}
+
+/* sweep a list using the largest possible element count (MAX_LUMEM) */
+#define sweepwholelist(L,p)	sweeplist(L,p,MAX_LUMEM)
+static GCObject **sweeplist (lua_State *L, GCObject **p, lu_mem count);
+
+
+/*
+** sweep the (open) upvalues of thread 'L1' and shrink its stack and
+** list of call-info structures. 'L' is the state running the sweep.
+*/
+static void sweepthread (lua_State *L, lua_State *L1) {
+  if (L1->stack == NULL) return;  /* stack not completely built yet */
+  sweepwholelist(L, &L1->openupval);  /* sweep open upvalues */
+  luaE_freeCI(L1);  /* free extra CallInfo slots */
+  /* should not change the stack during an emergency gc cycle */
+  if (G(L)->gckind != KGC_EMERGENCY)
+    luaD_shrinkstack(L1);
+}
+
+
+/*
+** sweep at most 'count' elements from a list of GCObjects erasing dead
+** objects, where a dead (not alive) object is one marked with the "old"
+** (non current) white and not fixed.
+** In non-generational mode, change all non-dead objects back to white,
+** preparing for next collection cycle.
+** In generational mode, keep black objects black, and also mark them as
+** old; stop when hitting an old object, as all objects after that
+** one will be old too.
+** When object is a thread, sweep its list of open upvalues too.
+** Returns where to continue the sweep, or NULL when the list is done.
+*/
+static GCObject **sweeplist (lua_State *L, GCObject **p, lu_mem count) {
+  global_State *g = G(L);
+  int ow = otherwhite(g);
+  int toclear, toset;  /* bits to clear and to set in all live objects */
+  int tostop;  /* stop sweep when this is true */
+  if (isgenerational(g)) {  /* generational mode? */
+    toclear = ~0;  /* clear nothing */
+    toset = bitmask(OLDBIT);  /* set the old bit of all surviving objects */
+    tostop = bitmask(OLDBIT);  /* do not sweep old generation */
+  }
+  else {  /* normal mode */
+    toclear = maskcolors;  /* clear all color bits + old bit */
+    toset = luaC_white(g);  /* make object white */
+    tostop = 0;  /* do not stop */
+  }
+  while (*p != NULL && count-- > 0) {
+    GCObject *curr = *p;
+    int marked = gch(curr)->marked;
+    if (isdeadm(ow, marked)) {  /* is 'curr' dead? */
+      *p = gch(curr)->next;  /* remove 'curr' from list */
+      freeobj(L, curr);  /* erase 'curr' */
+    }
+    else {
+      if (testbits(marked, tostop))
+        return NULL;  /* stop sweeping this list */
+      if (gch(curr)->tt == LUA_TTHREAD)
+        sweepthread(L, gco2th(curr));  /* sweep thread's upvalues */
+      /* update marks */
+      gch(curr)->marked = cast_byte((marked & toclear) | toset);
+      p = &gch(curr)->next;  /* go to next element */
+    }
+  }
+  return (*p == NULL) ? NULL : p;
+}
+
+
+/*
+** sweep a list until a live object (or end of list); when 'n' is not
+** NULL, add to '*n' the number of 'sweeplist' calls that were made
+*/
+static GCObject **sweeptolive (lua_State *L, GCObject **p, int *n) {
+  GCObject ** old = p;
+  int i = 0;
+  do {
+    i++;
+    p = sweeplist(L, p, 1);
+  } while (p == old);  /* same position: a dead object was removed */
+  if (n) *n += i;
+  return p;
+}
+
+/* }====================================================== */
+
+
+/*
+** {======================================================
+** Finalization
+** =======================================================
+*/
+/* shrink the string table if sparsely used and free the buffer */
+static void checkSizes (lua_State *L) {
+  global_State *g = G(L);
+  int half;
+  if (g->gckind == KGC_EMERGENCY) return;  /* do not change sizes then */
+  half = g->strt.size / 2;  /* half the size of the string table */
+  if (g->strt.nuse < cast(lu_int32, half))  /* using less than that half? */
+    luaS_resize(L, half);  /* halve its size */
+  luaZ_freebuffer(L, &g->buff);  /* free concatenation buffer */
+}
+
+/* move the first object of 'tobefnz' back to 'allgc' and return it */
+static GCObject *udata2finalize (global_State *g) {
+  GCObject *o = g->tobefnz;  /* get first element */
+  lua_assert(isfinalized(o));
+  g->tobefnz = gch(o)->next;  /* remove it from 'tobefnz' list */
+  gch(o)->next = g->allgc;  /* return it to 'allgc' list */
+  g->allgc = o;
+  resetbit(gch(o)->marked, SEPARATED);  /* mark that it is not in 'tobefnz' */
+  lua_assert(!isold(o));  /* see MOVE OLD rule */
+  if (!keepinvariantout(g))  /* not keeping invariant? */
+    makewhite(g, o);  /* "sweep" object */
+  return o;
+}
+
+/* body for 'luaD_pcall' in GCTM: call the function pushed at 'top - 2' */
+static void dothecall (lua_State *L, void *ud) {
+  UNUSED(ud);
+  luaD_call(L, L->top - 2, 0, 0);
+}
+
+/* call the __gc metamethod (if any) of the next object in 'tobefnz' */
+static void GCTM (lua_State *L, int propagateerrors) {
+  global_State *g = G(L);
+  const TValue *tm;
+  TValue v;
+  setgcovalue(L, &v, udata2finalize(g));  /* object leaves 'tobefnz' here */
+  tm = luaT_gettmbyobj(L, &v, TM_GC);
+  if (tm != NULL && ttisfunction(tm)) {  /* is there a finalizer? */
+    int status;
+    lu_byte oldah = L->allowhook;
+    int running  = g->gcrunning;
+    L->allowhook = 0;  /* stop debug hooks during GC metamethod */
+    g->gcrunning = 0;  /* avoid GC steps */
+    setobj2s(L, L->top, tm);  /* push finalizer... */
+    setobj2s(L, L->top + 1, &v);  /* ... and its argument */
+    L->top += 2;  /* and (next line) call the finalizer */
+    status = luaD_pcall(L, dothecall, NULL, savestack(L, L->top - 2), 0);
+    L->allowhook = oldah;  /* restore hooks */
+    g->gcrunning = running;  /* restore state */
+    if (status != LUA_OK && propagateerrors) {  /* error while running __gc? */
+      if (status == LUA_ERRRUN) {  /* is there an error object? */
+        const char *msg = (ttisstring(L->top - 1))
+                            ? svalue(L->top - 1)
+                            : "no message";
+        luaO_pushfstring(L, "error in __gc metamethod (%s)", msg);
+        status = LUA_ERRGCMM;  /* error in __gc metamethod */
+      }
+      luaD_throw(L, status);  /* re-throw error */
+    }
+  }
+}
+
+
+/*
+** move all unreachable objects (or 'all' objects) that need
+** finalization from list 'finobj' to list 'tobefnz' (to be finalized)
+*/
+static void separatetobefnz (lua_State *L, int all) {
+  global_State *g = G(L);
+  GCObject **p = &g->finobj;  /* link that points to the current object */
+  GCObject *curr;
+  GCObject **lastnext = &g->tobefnz;
+  /* find last 'next' field in 'tobefnz' list (to add elements in its end) */
+  while (*lastnext != NULL)
+    lastnext = &gch(*lastnext)->next;
+  while ((curr = *p) != NULL) {  /* traverse all finalizable objects */
+    lua_assert(!isfinalized(curr));
+    lua_assert(testbit(gch(curr)->marked, SEPARATED));
+    if (!(iswhite(curr) || all))  /* not being collected? */
+      p = &gch(curr)->next;  /* don't bother with it */
+    else {
+      l_setbit(gch(curr)->marked, FINALIZEDBIT); /* won't be finalized again */
+      *p = gch(curr)->next;  /* remove 'curr' from 'finobj' list */
+      gch(curr)->next = *lastnext;  /* link at the end of 'tobefnz' list */
+      *lastnext = curr;
+      lastnext = &gch(curr)->next;  /* 'curr' is the new last element */
+    }
+  }
+}
+
+
+/*
+** if metatable 'mt' gives object 'o' a finalizer, remove 'o' from 'allgc'
+** list (must search the list to find it) and link it in 'finobj' list.
+*/
+void luaC_checkfinalizer (lua_State *L, GCObject *o, Table *mt) {
+  global_State *g = G(L);
+  if (testbit(gch(o)->marked, SEPARATED) || /* obj. is already separated... */
+      isfinalized(o) ||                           /* ... or is finalized... */
+      gfasttm(g, mt, TM_GC) == NULL)                /* or has no finalizer? */
+    return;  /* nothing to be done */
+  else {  /* move 'o' to 'finobj' list */
+    GCObject **p;
+    GCheader *ho = gch(o);
+    if (g->sweepgc == &ho->next) {  /* avoid removing current sweep object */
+      lua_assert(issweepphase(g));
+      g->sweepgc = sweeptolive(L, g->sweepgc, NULL);
+    }
+    /* search for pointer pointing to 'o' */
+    for (p = &g->allgc; *p != o; p = &gch(*p)->next) { /* empty */ }
+    *p = ho->next;  /* remove 'o' from root list */
+    ho->next = g->finobj;  /* link it in list 'finobj' */
+    g->finobj = o;
+    l_setbit(ho->marked, SEPARATED);  /* mark it as such */
+    if (!keepinvariantout(g))  /* not keeping invariant? */
+      makewhite(g, o);  /* "sweep" object */
+    else
+      resetoldbit(o);  /* see MOVE OLD rule */
+  }
+}
+
+/* }====================================================== */
+
+
+/*
+** {======================================================
+** GC control
+** =======================================================
+*/
+
+
+/*
+** set a reasonable "time" to wait before starting a new GC cycle;
+** cycle will start when memory use hits threshold
+*/
+static void setpause (global_State *g, l_mem estimate) {
+  l_mem debt, threshold;
+  /* NOTE(review): a zero result here would divide by zero below; confirm */
+  estimate = estimate / PAUSEADJ;  /* adjust 'estimate' */
+  threshold = (g->gcpause < MAX_LMEM / estimate)  /* overflow? */
+            ? estimate * g->gcpause  /* no overflow */
+            : MAX_LMEM;  /* overflow; truncate to maximum */
+  debt = -cast(l_mem, threshold - gettotalbytes(g));
+  luaE_setdebt(g, debt);
+}
+
+
+#define sweepphases  \
+	(bitmask(GCSsweepstring) | bitmask(GCSsweepudata) | bitmask(GCSsweep))
+
+
+/*
+** enter first sweep phase (strings) and prepare pointers for other
+** sweep phases.  The calls to 'sweeptolive' make pointers point to an
+** object inside the list (instead of to the header), so that the real
+** sweep does not need to skip objects created between "now" and the
+** start of the real sweep.
+** Returns how many objects it swept.
+*/
+static int entersweep (lua_State *L) {
+  global_State *g = G(L);
+  int n = 0;  /* accumulated by the 'sweeptolive' calls below */
+  g->gcstate = GCSsweepstring;
+  lua_assert(g->sweepgc == NULL && g->sweepfin == NULL);
+  /* prepare to sweep strings, finalizable objects, and regular objects */
+  g->sweepstrgc = 0;
+  g->sweepfin = sweeptolive(L, &g->finobj, &n);
+  g->sweepgc = sweeptolive(L, &g->allgc, &n);
+  return n;
+}
+
+
+/*
+** change GC mode to 'mode' (no-op when already in that mode)
+*/
+void luaC_changemode (lua_State *L, int mode) {
+  global_State *g = G(L);
+  if (mode == g->gckind) return;  /* nothing to change */
+  if (mode == KGC_GEN) {  /* change to generational mode */
+    /* make sure gray lists are consistent */
+    luaC_runtilstate(L, bitmask(GCSpropagate));
+    g->GCestimate = gettotalbytes(g);
+    g->gckind = KGC_GEN;
+  }
+  else {  /* change to incremental mode */
+    /* sweep all objects to turn them back to white
+       (as white has not changed, nothing extra will be collected) */
+    g->gckind = KGC_NORMAL;
+    entersweep(L);
+    luaC_runtilstate(L, ~sweepphases);  /* run until out of sweep phases */
+  }
+}
+
+
+/*
+** call all pending finalizers, that is, empty the 'tobefnz' list
+*/
+static void callallpendingfinalizers (lua_State *L, int propagateerrors) {
+  global_State *g = G(L);
+  while (g->tobefnz) {
+    resetoldbit(g->tobefnz);  /* 'udata2finalize' asserts it is not old */
+    GCTM(L, propagateerrors);  /* removes the first object from 'tobefnz' */
+  }
+}
+
+/* run all finalizers and then free every collectable object */
+void luaC_freeallobjects (lua_State *L) {
+  global_State *g = G(L);
+  int i;
+  separatetobefnz(L, 1);  /* separate all objects with finalizers */
+  lua_assert(g->finobj == NULL);
+  callallpendingfinalizers(L, 0);  /* errors in finalizers are not raised */
+  g->currentwhite = WHITEBITS; /* this "white" makes all objects look dead */
+  g->gckind = KGC_NORMAL;
+  sweepwholelist(L, &g->finobj);  /* finalizers can create objs. in 'finobj' */
+  sweepwholelist(L, &g->allgc);
+  for (i = 0; i < g->strt.size; i++)  /* free all string lists */
+    sweepwholelist(L, &g->strt.hash[i]);
+  lua_assert(g->strt.nuse == 0);
+}
+
+/* finish the mark phase in one step; return estimate of memory marked */
+static l_mem atomic (lua_State *L) {
+  global_State *g = G(L);
+  l_mem work = -cast(l_mem, g->GCmemtrav);  /* start counting work */
+  GCObject *origweak, *origall;
+  lua_assert(!iswhite(obj2gco(g->mainthread)));
+  markobject(g, L);  /* mark running thread */
+  /* registry and global metatables may be changed by API */
+  markvalue(g, &g->l_registry);
+  markmt(g);  /* mark basic metatables */
+  /* remark occasional upvalues of (maybe) dead threads */
+  remarkupvals(g);
+  propagateall(g);  /* propagate changes */
+  work += g->GCmemtrav;  /* stop counting (do not (re)count grays) */
+  /* traverse objects caught by write barrier and by 'remarkupvals' */
+  retraversegrays(g);
+  work -= g->GCmemtrav;  /* restart counting */
+  convergeephemerons(g);
+  /* at this point, all strongly accessible objects are marked. */
+  /* clear values from weak tables, before checking finalizers */
+  clearvalues(g, g->weak, NULL);
+  clearvalues(g, g->allweak, NULL);
+  origweak = g->weak; origall = g->allweak;  /* lists already cleared */
+  work += g->GCmemtrav;  /* stop counting (objects being finalized) */
+  separatetobefnz(L, 0);  /* separate objects to be finalized */
+  markbeingfnz(g);  /* mark objects that will be finalized */
+  propagateall(g);  /* remark, to propagate `preserveness' */
+  work -= g->GCmemtrav;  /* restart counting */
+  convergeephemerons(g);
+  /* at this point, all resurrected objects are marked. */
+  /* remove dead objects from weak tables */
+  clearkeys(g, g->ephemeron, NULL);  /* clear keys from all ephemeron tables */
+  clearkeys(g, g->allweak, NULL);  /* clear keys from all allweak tables */
+  /* clear values from resurrected weak tables */
+  clearvalues(g, g->weak, origweak);
+  clearvalues(g, g->allweak, origall);
+  g->currentwhite = cast_byte(otherwhite(g));  /* flip current white */
+  work += g->GCmemtrav;  /* complete counting */
+  return work;  /* estimate of memory marked by 'atomic' */
+}
+
+
+static lu_mem singlestep (lua_State *L) {  /* one collector step; returns its work */
+  global_State *g = G(L);
+  switch (g->gcstate) {  /* what a step does depends on the current GC state */
+    case GCSpause: {  /* between cycles: start a new collection */
+      /* start to count memory traversed */
+      g->GCmemtrav = g->strt.size * sizeof(GCObject*);
+      lua_assert(!isgenerational(g));
+      restartcollection(g);
+      g->gcstate = GCSpropagate;
+      return g->GCmemtrav;
+    }
+    case GCSpropagate: {  /* mark phase: work on the 'gray' list */
+      if (g->gray) {
+        lu_mem oldtrav = g->GCmemtrav;
+        propagatemark(g);
+        return g->GCmemtrav - oldtrav;  /* memory traversed in this step */
+      }
+      else {  /* no more `gray' objects */
+        lu_mem work;
+        int sw;  /* value returned by 'entersweep', charged as sweep work */
+        g->gcstate = GCSatomic;  /* finish mark phase */
+        g->GCestimate = g->GCmemtrav;  /* save what was counted */;
+        work = atomic(L);  /* add what was traversed by 'atomic' */
+        g->GCestimate += work;  /* estimate of total memory traversed */ 
+        sw = entersweep(L);
+        return work + sw * GCSWEEPCOST;
+      }
+    }
+    case GCSsweepstring: {  /* sweep at most GCSWEEPMAX string lists per step */
+      int i;
+      for (i = 0; i < GCSWEEPMAX && g->sweepstrgc + i < g->strt.size; i++)
+        sweepwholelist(L, &g->strt.hash[g->sweepstrgc + i]);
+      g->sweepstrgc += i;
+      if (g->sweepstrgc >= g->strt.size)  /* no more strings to sweep? */
+        g->gcstate = GCSsweepudata;
+      return i * GCSWEEPCOST;
+    }
+    case GCSsweepudata: {  /* sweep the list that 'sweepfin' walks */
+      if (g->sweepfin) {
+        g->sweepfin = sweeplist(L, g->sweepfin, GCSWEEPMAX);
+        return GCSWEEPMAX*GCSWEEPCOST;
+      }
+      else {
+        g->gcstate = GCSsweep;
+        return 0;
+      }
+    }
+    case GCSsweep: {  /* sweep the list that 'sweepgc' walks */
+      if (g->sweepgc) {
+        g->sweepgc = sweeplist(L, g->sweepgc, GCSWEEPMAX);
+        return GCSWEEPMAX*GCSWEEPCOST;
+      }
+      else {
+        /* sweep main thread */
+        GCObject *mt = obj2gco(g->mainthread);
+        sweeplist(L, &mt, 1);
+        checkSizes(L);
+        g->gcstate = GCSpause;  /* finish collection */
+        return GCSWEEPCOST;
+      }
+    }
+    default: lua_assert(0); return 0;  /* no other state is expected here */
+  }
+}
+
+
+/*
+** performs single collector steps until the GC state is one of those
+** whose bit is set in 'statesmask' (no step at all if it already is)
+*/
+void luaC_runtilstate (lua_State *L, int statesmask) {
+  global_State *gs = G(L);
+  if (testbit(statesmask, gs->gcstate)) return;  /* already in such a state */
+  do singlestep(L); while (!testbit(statesmask, gs->gcstate));
+}
+
+
+static void generationalcollection (lua_State *L) {  /* step used in generational mode */
+  global_State *g = G(L);
+  lua_assert(g->gcstate == GCSpropagate);
+  if (g->GCestimate == 0) {  /* signal for another major collection? */
+    luaC_fullgc(L, 0);  /* perform a full regular collection */
+    g->GCestimate = gettotalbytes(g);  /* update control */
+  }
+  else {
+    lu_mem estimate = g->GCestimate;  /* saved: 'singlestep' overwrites the field */
+    luaC_runtilstate(L, bitmask(GCSpause));  /* run complete (minor) cycle */
+    g->gcstate = GCSpropagate;  /* skip restart */
+    if (gettotalbytes(g) > (estimate / 100) * g->gcmajorinc)  /* 'gcmajorinc' is a % */
+      g->GCestimate = 0;  /* signal for a major collection */
+    else
+      g->GCestimate = estimate;  /* keep estimate from last major coll. */
+
+  }
+  setpause(g, gettotalbytes(g));  /* next pause is based on the bytes now in use */
+  lua_assert(g->gcstate == GCSpropagate);
+}
+
+
+static void incstep (lua_State *L) {  /* step used when not in generational mode */
+  global_State *g = G(L);
+  l_mem debt = g->GCdebt;
+  int stepmul = g->gcstepmul;
+  if (stepmul < 40) stepmul = 40;  /* avoid ridiculous low values (and 0) */
+  /* convert debt from Kb to 'work units' (avoid zero debt and overflows) */
+  debt = (debt / STEPMULADJ) + 1;
+  debt = (debt < MAX_LMEM / stepmul) ? debt * stepmul : MAX_LMEM;
+  do {  /* always perform at least one single step */
+    lu_mem work = singlestep(L);  /* do some work */
+    debt -= work;
+  } while (debt > -GCSTEPSIZE && g->gcstate != GCSpause);  /* paid, or cycle over */
+  if (g->gcstate == GCSpause)  /* did the cycle finish? */
+    setpause(g, g->GCestimate);  /* pause until next cycle */
+  else {
+    debt = (debt / stepmul) * STEPMULADJ;  /* convert 'work units' to Kb */
+    luaE_setdebt(g, debt);  /* what is left becomes the new debt */
+  }
+}
+
+
+/*
+** performs a basic GC step
+*/
+void luaC_forcestep (lua_State *L) {  /* does not test 'gcrunning'; 'luaC_step' does */
+  global_State *g = G(L);
+  int i;
+  if (isgenerational(g)) generationalcollection(L);  /* step kind follows GC mode */
+  else incstep(L);
+  /* run a few finalizers (or all of them at the end of a collect cycle) */
+  for (i = 0; g->tobefnz && (i < GCFINALIZENUM || g->gcstate == GCSpause); i++)
+    GCTM(L, 1);  /* call one finalizer */
+}
+
+
+/*
+** performs a basic GC step only if collector is running
+*/
+void luaC_step (lua_State *L) {
+  global_State *gs = G(L);
+  if (!gs->gcrunning) luaE_setdebt(gs, -GCSTEPSIZE);  /* only postpone next call */
+  else luaC_forcestep(L);  /* collector is running: do a real step */
+}
+
+
+
+/*
+** performs a full GC cycle; if "isemergency", does not call
+** finalizers (which could change stack positions)
+*/
+void luaC_fullgc (lua_State *L, int isemergency) {
+  global_State *g = G(L);
+  int origkind = g->gckind;  /* restored once the collection is done */
+  lua_assert(origkind != KGC_EMERGENCY);
+  if (isemergency)  /* do not run finalizers during emergency GC */
+    g->gckind = KGC_EMERGENCY;
+  else {
+    g->gckind = KGC_NORMAL;
+    callallpendingfinalizers(L, 1);  /* 'propagateerrors' is on here */
+  }
+  if (keepinvariant(g)) {  /* may there be some black objects? */
+    /* must sweep all objects to turn them back to white
+       (as white has not changed, nothing will be collected) */
+    entersweep(L);
+  }
+  /* finish any pending sweep phase to start a new cycle */
+  luaC_runtilstate(L, bitmask(GCSpause));
+  luaC_runtilstate(L, ~bitmask(GCSpause));  /* start new collection */
+  luaC_runtilstate(L, bitmask(GCSpause));  /* run entire collection */
+  if (origkind == KGC_GEN) {  /* generational mode? */
+    /* generational mode must be kept in propagate phase */
+    luaC_runtilstate(L, bitmask(GCSpropagate));
+  }
+  g->gckind = origkind;
+  setpause(g, gettotalbytes(g));  /* pause computed from the bytes now in use */
+  if (!isemergency)   /* do not run finalizers during emergency GC */
+    callallpendingfinalizers(L, 1);
+}
+
+/* }====================================================== */
+
+
diff --git a/ext/lua/src/linit.c b/ext/lua/src/linit.c
new file mode 100644
index 0000000..8d3aa65
--- /dev/null
+++ b/ext/lua/src/linit.c
@@ -0,0 +1,67 @@
+/*
+** $Id: linit.c,v 1.32 2011/04/08 19:17:36 roberto Exp $
+** Initialization of libraries for lua.c and other clients
+** See Copyright Notice in lua.h
+*/
+
+
+/*
+** If you embed Lua in your program and need to open the standard
+** libraries, call luaL_openlibs in your program. If you need a
+** different set of libraries, copy this file to your project and edit
+** it to suit your needs.
+*/
+
+
+#define linit_c
+#define LUA_LIB
+
+#include "lua.h"
+
+#include "lualib.h"
+#include "lauxlib.h"
+
+
+/*
+** these libs are loaded by lua.c and are readily available to any Lua
+** program
+*/
+static const luaL_Reg loadedlibs[] = {  /* {module name, open function} pairs */
+  {"_G", luaopen_base},
+  {LUA_LOADLIBNAME, luaopen_package},
+  {LUA_COLIBNAME, luaopen_coroutine},
+  {LUA_TABLIBNAME, luaopen_table},
+  {LUA_IOLIBNAME, luaopen_io},
+  {LUA_OSLIBNAME, luaopen_os},
+  {LUA_STRLIBNAME, luaopen_string},
+  {LUA_BITLIBNAME, luaopen_bit32},
+  {LUA_MATHLIBNAME, luaopen_math},
+  {LUA_DBLIBNAME, luaopen_debug},
+  {NULL, NULL}  /* sentinel: 'luaL_openlibs' stops at the first NULL 'func' */
+};
+
+
+/*
+** these libs are only preloaded: 'require' them before use (none by default)
+*/
+static const luaL_Reg preloadedlibs[] = {
+  {NULL, NULL}  /* sentinel; add {name, open function} entries above it */
+};
+
+
+LUALIB_API void luaL_openlibs (lua_State *L) {
+  const luaL_Reg *entry = loadedlibs;
+  /* open every library of 'loadedlibs', storing each module as a global */
+  for (; entry->func != NULL; entry++) {
+    luaL_requiref(L, entry->name, entry->func, 1);
+    lua_pop(L, 1);  /* drop the module left on the stack */
+  }
+  /* register the openers of 'preloadedlibs' in the registry table "_PRELOAD" */
+  luaL_getsubtable(L, LUA_REGISTRYINDEX, "_PRELOAD");
+  for (entry = preloadedlibs; entry->func != NULL; entry++) {
+    lua_pushcfunction(L, entry->func);
+    lua_setfield(L, -2, entry->name);  /* _PRELOAD[name] = opener */
+  }
+  lua_pop(L, 1);  /* remove _PRELOAD table */
+}
+
diff --git a/ext/lua/src/liolib.c b/ext/lua/src/liolib.c
new file mode 100644
index 0000000..3f80db1
--- /dev/null
+++ b/ext/lua/src/liolib.c
@@ -0,0 +1,665 @@
+/*
+** $Id: liolib.c,v 2.111 2013/03/21 13:57:27 roberto Exp $
+** Standard I/O (and system) library
+** See Copyright Notice in lua.h
+*/
+
+
+/*
+** POSIX idiosyncrasy!
+** This definition must come before the inclusion of 'stdio.h'; it
+** should not affect non-POSIX systems
+*/
+#if !defined(_FILE_OFFSET_BITS)
+#define _FILE_OFFSET_BITS 64
+#endif
+
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define liolib_c
+#define LUA_LIB
+
+#include "lua.h"
+
+#include "lauxlib.h"
+#include "lualib.h"
+
+
+#if !defined(lua_checkmode)
+
+/*
+** Check whether 'mode' matches '[rwa]%+?b?'. The macro advances 'mode'
+** while it checks, so give it a scratch copy of the pointer. Redefine
+** it to accept other modes for 'fopen' besides the standard ones.
+*/
+#define lua_checkmode(mode) \
+	(*mode != '\0' && strchr("rwa", *(mode++)) != NULL &&	\
+	(*mode != '+' || ++mode) &&  /* skip if char is '+' */	\
+	(*mode != 'b' || ++mode) &&  /* skip if char is 'b' */	\
+	(*mode == '\0'))
+
+#endif
+
+/*
+** {======================================================
+** lua_popen/lua_pclose: open and close a stream connected to
+** a new process (both can be predefined to override these).
+** =======================================================
+*/
+
+#if !defined(lua_popen)	/* { */
+
+#if defined(LUA_USE_POPEN)	/* { */
+
+#define lua_popen(L,c,m)	((void)L, fflush(NULL), popen(c,m))
+#define lua_pclose(L,file)	((void)L, pclose(file))
+
+#elif defined(LUA_WIN)		/* }{ */
+
+#define lua_popen(L,c,m)		((void)L, _popen(c,m))
+#define lua_pclose(L,file)		((void)L, _pclose(file))
+
+
+#else				/* }{ */
+
+#define lua_popen(L,c,m)		((void)((void)c, m),  \
+		luaL_error(L, LUA_QL("popen") " not supported"), (FILE*)0)
+#define lua_pclose(L,file)		((void)((void)L, file), -1)
+
+
+#endif				/* } */
+
+#endif			/* } */
+
+/* }====================================================== */
+
+
+/*
+** {======================================================
+** l_fseek/l_ftell/l_seeknum: configuration for longer offsets
+** =======================================================
+*/
+
+#if !defined(l_fseek) && !defined(lua_fseek)	/* { not configured elsewhere? */
+
+#if defined(LUA_USE_POSIX)
+
+#define l_fseek(f,o,w)		fseeko(f,o,w)
+#define l_ftell(f)		ftello(f)
+#define l_seeknum		off_t
+
+#elif defined(LUA_WIN) && !defined(_CRTIMP_TYPEINFO) \
+   && defined(_MSC_VER) && (_MSC_VER >= 1400)
+/* Windows (but not DDK) and Visual C++ 2005 or higher */
+
+#define l_fseek(f,o,w)		_fseeki64(f,o,w)
+#define l_ftell(f)		_ftelli64(f)
+#define l_seeknum		__int64
+
+#else
+
+#define l_fseek(f,o,w)		fseek(f,o,w)
+#define l_ftell(f)		ftell(f)
+#define l_seeknum		long
+
+#endif
+
+#endif			/* } */
+
+/* }====================================================== */
+
+
+#define IO_PREFIX	"_IO_"	/* 'getiofile' skips it when naming a file in errors */
+#define IO_INPUT	(IO_PREFIX "input")	/* registry key of the default input */
+#define IO_OUTPUT	(IO_PREFIX "output")	/* registry key of the default output */
+
+
+typedef luaL_Stream LStream;
+
+
+#define tolstream(L)	((LStream *)luaL_checkudata(L, 1, LUA_FILEHANDLE))
+
+#define isclosed(p)	((p)->closef == NULL)	/* no close function means closed */
+
+
+static int io_type (lua_State *L) {
+  LStream *stream;
+  luaL_checkany(L, 1);  /* an argument is mandatory */
+  stream = (LStream *)luaL_testudata(L, 1, LUA_FILEHANDLE);
+  if (stream != NULL) {  /* a file handle: tell whether it is still open */
+    if (isclosed(stream)) lua_pushliteral(L, "closed file");
+    else lua_pushliteral(L, "file");
+  }
+  else
+    lua_pushnil(L);  /* not a file */
+  return 1;
+}
+
+
+static int f_tostring (lua_State *L) {
+  LStream *stream = tolstream(L);
+  if (!isclosed(stream))  /* open: show the address of the C stream */
+    lua_pushfstring(L, "file (%p)", stream->f);
+  else
+    lua_pushliteral(L, "file (closed)");
+  return 1;
+}
+
+
+static FILE *tofile (lua_State *L) {
+  LStream *stream = tolstream(L);
+  if (isclosed(stream))  /* a closed handle cannot be used for I/O */
+    luaL_error(L, "attempt to use a closed file");
+  lua_assert(stream->f != NULL);
+  return stream->f;
+}
+
+
+/*
+** When creating file handles, always creates a `closed' file handle
+** before opening the actual file; so, if there is a memory error, the
+** file is not left opened.
+*/
+static LStream *newprefile (lua_State *L) {
+  LStream *stream = (LStream *)lua_newuserdata(L, sizeof *stream);
+  stream->closef = NULL;  /* no close function yet: handle counts as closed */
+  luaL_setmetatable(L, LUA_FILEHANDLE);  /* applies to the new userdata on top */
+  return stream;
+}
+
+
+static int aux_close (lua_State *L) {
+  LStream *stream = tolstream(L);
+  lua_CFunction closer = stream->closef;  /* fetch it before clearing the field */
+  stream->closef = NULL;  /* from now on the handle reads as closed */
+  return closer(L);  /* the close function produces the Lua results */
+}
+
+
+static int io_close (lua_State *L) {
+  if (lua_type(L, 1) == LUA_TNONE)  /* called without a file? */
+    lua_getfield(L, LUA_REGISTRYINDEX, IO_OUTPUT);  /* close the default output */
+  (void)tofile(L);  /* raises an error unless index 1 is an open stream */
+  return aux_close(L);
+}
+
+
+static int f_gc (lua_State *L) {
+  LStream *stream = tolstream(L);
+  /* closed and incompletely opened files are left alone */
+  if (!(isclosed(stream) || stream->f == NULL)) aux_close(L);
+  return 0;
+}
+
+
+/*
+** function to close regular files
+*/
+static int io_fclose (lua_State *L) {
+  LStream *stream = tolstream(L);
+  int ok = (fclose(stream->f) == 0);  /* fclose returns 0 on success */
+  return luaL_fileresult(L, ok, NULL);
+}
+
+
+static LStream *newfile (lua_State *L) {
+  LStream *stream = newprefile(L);
+  stream->closef = io_fclose;  /* regular files are closed by 'io_fclose' */
+  stream->f = NULL;  /* no C stream attached yet */
+  return stream;
+}
+
+
+static void opencheck (lua_State *L, const char *fname, const char *mode) {
+  LStream *stream = newfile(L);  /* the new handle stays on the stack */
+  FILE *cfile = stream->f = fopen(fname, mode);
+  if (cfile == NULL)  /* failure is raised as a Lua error, not returned */
+    luaL_error(L, "cannot open file " LUA_QS " (%s)", fname, strerror(errno));
+}
+
+
+static int io_open (lua_State *L) {
+  const char *filename = luaL_checkstring(L, 1);
+  const char *mode = luaL_optstring(L, 2, "r");
+  LStream *stream = newfile(L);
+  const char *cursor = mode;  /* scratch pointer consumed by the mode check */
+  luaL_argcheck(L, lua_checkmode(cursor), 2, "invalid mode");
+  if ((stream->f = fopen(filename, mode)) != NULL) return 1;  /* the new handle */
+  return luaL_fileresult(L, 0, filename);
+}
+
+
+/* close function for 'popen' files: the status given by lua_pclose
+** is turned into Lua results by luaL_execresult */
+static int io_pclose (lua_State *L) {
+  LStream *stream = tolstream(L);
+  int status = lua_pclose(L, stream->f);
+  return luaL_execresult(L, status);
+}
+
+
+static int io_popen (lua_State *L) {
+  const char *filename = luaL_checkstring(L, 1);  /* handed to lua_popen as is */
+  const char *mode = luaL_optstring(L, 2, "r");
+  LStream *stream = newprefile(L);
+  stream->f = lua_popen(L, filename, mode);
+  stream->closef = io_pclose;  /* set even on failure; 'f_gc' skips a NULL 'f' */
+  return (stream->f != NULL) ? 1 : luaL_fileresult(L, 0, filename);
+}
+
+
+static int io_tmpfile (lua_State *L) {
+  LStream *stream = newfile(L);
+  if ((stream->f = tmpfile()) != NULL) return 1;  /* new handle is the result */
+  return luaL_fileresult(L, 0, NULL);  /* report the failure */
+}
+
+
+static FILE *getiofile (lua_State *L, const char *findex) {
+  const char *stdname = findex + strlen(IO_PREFIX);  /* key without its prefix */
+  LStream *stream;
+  lua_getfield(L, LUA_REGISTRYINDEX, findex);  /* leaves the handle on the stack */
+  stream = (LStream *)lua_touserdata(L, -1);
+  if (isclosed(stream)) luaL_error(L, "standard %s file is closed", stdname);
+  return stream->f;
+}
+
+
+static int g_iofile (lua_State *L, const char *f, const char *mode) {
+  if (!lua_isnoneornil(L, 1)) {  /* caller supplied a new default file? */
+    const char *path = lua_tostring(L, 1);
+    if (path == NULL) {  /* not a name: it must be an open file handle */
+      tofile(L);
+      lua_pushvalue(L, 1);
+    }
+    else
+      opencheck(L, path, mode);  /* leaves the opened handle on the stack */
+    lua_setfield(L, LUA_REGISTRYINDEX, f);  /* store it as the new default */
+  }
+  /* in every case the result is the (possibly updated) default file */
+  lua_getfield(L, LUA_REGISTRYINDEX, f);
+  return 1;
+}
+
+
+static int io_input (lua_State *L) {  /* gets or sets the default input file */
+  return g_iofile(L, IO_INPUT, "r");  /* a file name is opened with mode "r" */
+}
+
+
+static int io_output (lua_State *L) {  /* gets or sets the default output file */
+  return g_iofile(L, IO_OUTPUT, "w");  /* a file name is opened with mode "w" */
+}
+
+
+static int io_readline (lua_State *L);
+
+
+static void aux_lines (lua_State *L, int toclose) {
+  int nfmt = lua_gettop(L) - 1;  /* read formats follow the file at index 1 */
+  int idx;
+  /* formats must fit both on this stack and on the one of 'io_readline' */
+  luaL_argcheck(L, nfmt <= LUA_MINSTACK - 3, LUA_MINSTACK - 3, "too many options");
+  lua_pushvalue(L, 1);  /* upvalue 1: file handle */
+  lua_pushinteger(L, nfmt);  /* upvalue 2: how many formats follow */
+  lua_pushboolean(L, toclose);  /* upvalue 3: close file when finished? */
+  for (idx = 2; idx <= nfmt + 1; idx++) lua_pushvalue(L, idx);  /* the formats */
+  lua_pushcclosure(L, io_readline, 3 + nfmt);
+}
+
+
+static int f_lines (lua_State *L) {
+  (void)tofile(L);  /* only validates that index 1 holds an open file */
+  aux_lines(L, 0);  /* 0: the iterator does not close the file at the end */
+  return 1;  /* the closure built by 'aux_lines' */
+}
+
+
+static int io_lines (lua_State *L) {
+  int ownsfile;  /* true when the iterator must close the file itself */
+  if (lua_isnone(L, 1)) lua_pushnil(L);  /* guarantee that index 1 exists */
+  if (!lua_isnil(L, 1)) {  /* a file name was given: open that file */
+    const char *path = luaL_checkstring(L, 1);
+    opencheck(L, path, "r");
+    lua_replace(L, 1);  /* the new handle takes the place of the name */
+    ownsfile = 1;
+  }
+  else {  /* no name: iterate over the default input file */
+    lua_getfield(L, LUA_REGISTRYINDEX, IO_INPUT);
+    lua_replace(L, 1);  /* put it at index 1 */
+    tofile(L);  /* it must still be open */
+    ownsfile = 0;
+  }
+  aux_lines(L, ownsfile);
+  return 1;
+}
+
+
+/*
+** {======================================================
+** READ
+** =======================================================
+*/
+
+
+static int read_number (lua_State *L, FILE *f) {
+  lua_Number value;
+  /* fscanf returns the number of converted items: exactly one is expected */
+  if (fscanf(f, LUA_NUMBER_SCAN, &value) != 1) {
+    lua_pushnil(L);  /* placeholder result; the caller removes it */
+    return 0;  /* read fails */
+  }
+  /* success: the number itself is the result */
+  lua_pushnumber(L, value);
+  return 1;
+}
+
+
+static int test_eof (lua_State *L, FILE *f) {
+  int peeked = getc(f);  /* look at the next char... */
+  ungetc(peeked, f);  /* ...and hand it back to the stream */
+  lua_pushlstring(L, NULL, 0);  /* the result is always an empty string */
+  return !(peeked == EOF);
+}
+
+
+static int read_line (lua_State *L, FILE *f, int chop) {  /* callers pass 0 or 1 */
+  luaL_Buffer b;
+  luaL_buffinit(L, &b);
+  for (;;) {  /* each turn reads one piece of at most LUAL_BUFFERSIZE chars */
+    size_t l;
+    char *p = luaL_prepbuffer(&b);
+    if (fgets(p, LUAL_BUFFERSIZE, f) == NULL) {  /* eof? */
+      luaL_pushresult(&b);  /* close buffer */
+      return (lua_rawlen(L, -1) > 0);  /* check whether read something */
+    }
+    l = strlen(p);  /* NOTE(review): an embedded '\0' would cut the piece short */
+    if (l == 0 || p[l-1] != '\n')
+      luaL_addsize(&b, l);  /* no end of line yet: keep the piece and go on */
+    else {
+      luaL_addsize(&b, l - chop);  /* chop 'eol' if needed */
+      luaL_pushresult(&b);  /* close buffer */
+      return 1;  /* read at least an `eol' */
+    }
+  }
+}
+
+
+#define MAX_SIZE_T	(~(size_t)0)	/* all bits set: the largest size_t */
+
+static void read_all (lua_State *L, FILE *f) {
+  size_t chunk = LUAL_BUFFERSIZE;  /* bytes requested from each fread call */
+  luaL_Buffer buf;
+  luaL_buffinit(L, &buf);
+  for (;;) {
+    char *area = luaL_prepbuffsize(&buf, chunk);
+    size_t got = fread(area, sizeof(char), chunk, f);
+    luaL_addsize(&buf, got);
+    if (got < chunk) break;  /* short read: nothing more to get */
+    if (chunk <= (MAX_SIZE_T / 4))  /* still safe to grow? */
+      chunk *= 2;  /* full read: ask for twice as much next time */
+  }
+  luaL_pushresult(&buf);  /* everything read becomes one string */
+}
+
+
+static int read_chars (lua_State *L, FILE *f, size_t n) {
+  luaL_Buffer buf;
+  char *area;  /* space inside 'buf' large enough for the whole block */
+  size_t got;  /* how many chars fread really delivered */
+  luaL_buffinit(L, &buf);
+  area = luaL_prepbuffsize(&buf, n);
+  got = fread(area, sizeof(char), n, f);  /* may be fewer than 'n' */
+  luaL_addsize(&buf, got);
+  luaL_pushresult(&buf);  /* the chars read become the result string */
+  return got != 0;  /* success iff at least one char was read */
+}
+
+
+static int g_read (lua_State *L, FILE *f, int first) {  /* 'first': index of 1st format */
+  int nargs = lua_gettop(L) - 1;
+  int success;
+  int n;
+  clearerr(f);  /* so that 'ferror' below reports on this call only */
+  if (nargs == 0) {  /* no arguments? */
+    success = read_line(L, f, 1);
+    n = first+1;  /* to return 1 result */
+  }
+  else {  /* ensure stack space for all results and for auxlib's buffer */
+    luaL_checkstack(L, nargs+LUA_MINSTACK, "too many arguments");
+    success = 1;
+    for (n = first; nargs-- && success; n++) {  /* stops at the first failed read */
+      if (lua_type(L, n) == LUA_TNUMBER) {  /* a number asks for that many chars */
+        size_t l = (size_t)lua_tointeger(L, n);
+        success = (l == 0) ? test_eof(L, f) : read_chars(L, f, l);
+      }
+      else {
+        const char *p = lua_tostring(L, n);
+        luaL_argcheck(L, p && p[0] == '*', n, "invalid option");
+        switch (p[1]) {  /* only the char after the '*' is looked at */
+          case 'n':  /* number */
+            success = read_number(L, f);
+            break;
+          case 'l':  /* line */
+            success = read_line(L, f, 1);
+            break;
+          case 'L':  /* line with end-of-line */
+            success = read_line(L, f, 0);
+            break;
+          case 'a':  /* file */
+            read_all(L, f);  /* read entire file */
+            success = 1; /* always success */
+            break;
+          default:
+            return luaL_argerror(L, n, "invalid format");
+        }
+      }
+    }
+  }
+  if (ferror(f))
+    return luaL_fileresult(L, 0, NULL);
+  if (!success) {
+    lua_pop(L, 1);  /* remove last result */
+    lua_pushnil(L);  /* push nil instead */
+  }
+  return n - first;  /* one result for each format that was processed */
+}
+
+
+static int io_read (lua_State *L) {  /* reads from the default input file */
+  return g_read(L, getiofile(L, IO_INPUT), 1);  /* formats start at index 1 */
+}
+
+
+static int f_read (lua_State *L) {  /* reads from the file handle at index 1 */
+  return g_read(L, tofile(L), 2);  /* formats start at index 2 */
+}
+
+
+static int io_readline (lua_State *L) {  /* body of the closure built by 'aux_lines' */
+  LStream *p = (LStream *)lua_touserdata(L, lua_upvalueindex(1));  /* the file */
+  int i;
+  int n = (int)lua_tointeger(L, lua_upvalueindex(2));  /* number of formats */
+  if (isclosed(p))  /* file is already closed? */
+    return luaL_error(L, "file is already closed");
+  lua_settop(L , 1);  /* leave one slot below the formats, as 'g_read' expects */
+  for (i = 1; i <= n; i++)  /* push arguments to 'g_read' */
+    lua_pushvalue(L, lua_upvalueindex(3 + i));
+  n = g_read(L, p->f, 2);  /* 'n' is number of results */
+  lua_assert(n > 0);  /* should return at least a nil */
+  if (!lua_isnil(L, -n))  /* read at least one value? */
+    return n;  /* return them */
+  else {  /* first result is nil: EOF or error */
+    if (n > 1) {  /* is there error information? */
+      /* 2nd result is error message */
+      return luaL_error(L, "%s", lua_tostring(L, -n + 1));
+    }
+    if (lua_toboolean(L, lua_upvalueindex(3))) {  /* generator created file? */
+      lua_settop(L, 0);
+      lua_pushvalue(L, lua_upvalueindex(1));  /* 'aux_close' wants it at index 1 */
+      aux_close(L);  /* close it */
+    }
+    return 0;  /* no results: the iteration is over */
+  }
+}
+
+/* }====================================================== */
+
+
+static int g_write (lua_State *L, FILE *f, int arg) {
+  int remaining = lua_gettop(L) - arg;  /* values still to be handled */
+  int ok = 1;  /* turns 0 at the first failed write and stays 0 */
+  while (remaining-- != 0) {
+    if (lua_type(L, arg) != LUA_TNUMBER) {
+      size_t len;
+      const char *str = luaL_checklstring(L, arg, &len);  /* checked even if !ok */
+      if (ok) ok = (fwrite(str, sizeof(char), len, f) == len);
+    }
+    else if (ok) {
+      /* numbers are formatted directly instead of going through strings */
+      ok = (fprintf(f, LUA_NUMBER_FMT, lua_tonumber(L, arg)) > 0);
+    }
+    arg++;
+  }
+  if (!ok) return luaL_fileresult(L, ok, NULL);
+  return 1;  /* file handle already on stack top */
+}
+
+
+static int io_write (lua_State *L) {  /* writes to the default output file */
+  return g_write(L, getiofile(L, IO_OUTPUT), 1);  /* values start at index 1 */
+}
+
+
+static int f_write (lua_State *L) {
+  FILE *cfile = tofile(L);  /* validated before the stack is touched */
+  lua_pushvalue(L, 1);  /* copy of the handle on top: it is the result */
+  return g_write(L, cfile, 2);  /* values to write start at index 2 */
+}
+
+
+static int f_seek (lua_State *L) {
+  /* 'whence' and 'whencenames' are parallel: same option, same index */
+  static const int whence[] = {SEEK_SET, SEEK_CUR, SEEK_END};
+  static const char *const whencenames[] = {"set", "cur", "end", NULL};
+  FILE *cfile = tofile(L);
+  int choice = luaL_checkoption(L, 2, "cur", whencenames);
+  lua_Number rawoff = luaL_optnumber(L, 3, 0);
+  l_seeknum offset = (l_seeknum)rawoff;
+  /* the offset must survive the round trip through the C seek type */
+  luaL_argcheck(L, (lua_Number)offset == rawoff, 3,
+                  "not an integer in proper range");
+  if (l_fseek(cfile, offset, whence[choice]) != 0)
+    return luaL_fileresult(L, 0, NULL);  /* error */
+  /* success: the result is the position reported by l_ftell */
+  lua_pushnumber(L, (lua_Number)l_ftell(cfile));
+  return 1;
+}
+
+
+static int f_setvbuf (lua_State *L) {
+  static const char *const kindnames[] = {"no", "full", "line", NULL};
+  static const int kinds[] = {_IONBF, _IOFBF, _IOLBF};  /* same order as names */
+  FILE *cfile = tofile(L);
+  int choice = luaL_checkoption(L, 2, NULL, kindnames);  /* no default: required */
+  lua_Integer size = luaL_optinteger(L, 3, LUAL_BUFFERSIZE);
+  int failed = (setvbuf(cfile, NULL, kinds[choice], size) != 0);
+  return luaL_fileresult(L, !failed, NULL);
+}
+
+
+
+static int io_flush (lua_State *L) {  /* flushes the default output file */
+  return luaL_fileresult(L, fflush(getiofile(L, IO_OUTPUT)) == 0, NULL);
+}
+
+
+static int f_flush (lua_State *L) {  /* flushes the file handle at index 1 */
+  return luaL_fileresult(L, fflush(tofile(L)) == 0, NULL);
+}
+
+
+/*
+** functions for 'io' library
+*/
+static const luaL_Reg iolib[] = {  /* becomes the module table in 'luaopen_io' */
+  {"close", io_close},
+  {"flush", io_flush},
+  {"input", io_input},
+  {"lines", io_lines},
+  {"open", io_open},
+  {"output", io_output},
+  {"popen", io_popen},
+  {"read", io_read},
+  {"tmpfile", io_tmpfile},
+  {"type", io_type},
+  {"write", io_write},
+  {NULL, NULL}  /* end-of-list marker */
+};
+
+
+/*
+** methods for file handles
+*/
+static const luaL_Reg flib[] = {  /* installed in the metatable by 'createmeta' */
+  {"close", io_close},  /* same function as the 'close' entry of 'iolib' */
+  {"flush", f_flush},
+  {"lines", f_lines},
+  {"read", f_read},
+  {"seek", f_seek},
+  {"setvbuf", f_setvbuf},
+  {"write", f_write},
+  {"__gc", f_gc},
+  {"__tostring", f_tostring},
+  {NULL, NULL}  /* end-of-list marker */
+};
+
+
+static void createmeta (lua_State *L) {  /* leaves the stack as it found it */
+  luaL_newmetatable(L, LUA_FILEHANDLE);  /* create metatable for file handles */
+  lua_pushvalue(L, -1);  /* push metatable */
+  lua_setfield(L, -2, "__index");  /* metatable.__index = metatable */
+  luaL_setfuncs(L, flib, 0);  /* add file methods to new metatable */
+  lua_pop(L, 1);  /* pop new metatable */
+}
+
+
+/*
+** function to (not) close the standard files stdin, stdout, and stderr
+*/
+static int io_noclose (lua_State *L) {
+  LStream *stream = tolstream(L);
+  stream->closef = io_noclose;  /* put the close function back: stays open */
+  lua_pushnil(L);  /* failure indicator... */
+  lua_pushliteral(L, "cannot close standard file");  /* ...plus its message */
+  return 2;
+}
+
+
+static void createstdfile (lua_State *L, FILE *f, const char *k,
+                           const char *fname) {
+  LStream *stream = newprefile(L);
+  stream->closef = io_noclose;  /* close requests on it are refused */
+  stream->f = f;
+  if (k) {  /* is the stream to be stored in the registry too? */
+    lua_pushvalue(L, -1);  /* extra copy, consumed by the registry store */
+    lua_setfield(L, LUA_REGISTRYINDEX, k);
+  }
+  lua_setfield(L, -2, fname);  /* module[fname] = handle (pops the handle) */
+}
+
+
+LUAMOD_API int luaopen_io (lua_State *L) {  /* open function of the 'io' library */
+  luaL_newlib(L, iolib);  /* new module */
+  createmeta(L);
+  /* create (and set) default files */
+  createstdfile(L, stdin, IO_INPUT, "stdin");  /* also the default input */
+  createstdfile(L, stdout, IO_OUTPUT, "stdout");  /* also the default output */
+  createstdfile(L, stderr, NULL, "stderr");  /* module field only */
+  return 1;  /* the module table */
+}
+
diff --git a/ext/lua/src/llex.c b/ext/lua/src/llex.c
new file mode 100644
index 0000000..1a32e34
--- /dev/null
+++ b/ext/lua/src/llex.c
@@ -0,0 +1,527 @@
+/*
+** $Id: llex.c,v 2.63 2013/03/16 21:10:18 roberto Exp $
+** Lexical Analyzer
+** See Copyright Notice in lua.h
+*/
+
+
+#include <locale.h>
+#include <string.h>
+
+#define llex_c
+#define LUA_CORE
+
+#include "lua.h"
+
+#include "lctype.h"
+#include "ldo.h"
+#include "llex.h"
+#include "lobject.h"
+#include "lparser.h"
+#include "lstate.h"
+#include "lstring.h"
+#include "ltable.h"
+#include "lzio.h"
+
+
+
+#define next(ls) (ls->current = zgetc(ls->z))  /* fetch next char into 'current' */
+
+
+
+#define currIsNewline(ls)	(ls->current == '\n' || ls->current == '\r')
+
+
+/* ORDER RESERVED; 'luaX_token2str' indexes this by (token - FIRST_RESERVED) */
+static const char *const luaX_tokens [] = {
+    "and", "break", "do", "else", "elseif",
+    "end", "false", "for", "function", "goto", "if",
+    "in", "local", "nil", "not", "or", "repeat",
+    "return", "then", "true", "until", "while",
+    "..", "...", "==", ">=", "<=", "~=", "::", "<eof>",
+    "<number>", "<name>", "<string>"
+};
+
+
+#define save_and_next(ls) (save(ls, ls->current), next(ls))  /* keep char, advance */
+
+
+static l_noret lexerror (LexState *ls, const char *msg, int token);
+
+
+static void save (LexState *ls, int c) {
+  Mbuffer *buf = ls->buff;
+  if (luaZ_bufflen(buf) + 1 > luaZ_sizebuffer(buf)) {  /* no room left? */
+    size_t grown;  /* buffer size after doubling */
+    if (luaZ_sizebuffer(buf) >= MAX_SIZET/2)  /* doubling would overflow */
+      lexerror(ls, "lexical element too long", 0);
+    grown = luaZ_sizebuffer(buf) * 2;
+    luaZ_resizebuffer(ls->L, buf, grown);
+  }
+  buf->buffer[luaZ_bufflen(buf)++] = cast(char, c);  /* append 'c' */
+}
+
+
+void luaX_init (lua_State *L) {
+  int idx;  /* 1-based position in 'luaX_tokens' */
+  for (idx = 1; idx <= NUM_RESERVED; idx++) {
+    TString *word = luaS_new(L, luaX_tokens[idx - 1]);
+    luaS_fix(word);  /* reserved words are never collected */
+    word->tsv.extra = cast_byte(idx);  /* reserved word */
+  }
+}
+
+
+const char *luaX_token2str (LexState *ls, int token) {
+  if (token >= FIRST_RESERVED) {  /* token with an entry in 'luaX_tokens' */
+    const char *text = luaX_tokens[token - FIRST_RESERVED];
+    if (token >= TK_EOS)  /* names, strings, and numerals */
+      return text;
+    /* fixed format (symbols and reserved words): show it quoted */
+    return luaO_pushfstring(ls->L, LUA_QS, text);
+  }
+  /* single-byte symbols */
+  lua_assert(token == cast(unsigned char, token));
+  if (lisprint(token))
+    return luaO_pushfstring(ls->L, LUA_QL("%c"), token);
+  return luaO_pushfstring(ls->L, "char(%d)", token);
+}
+
+
+static const char *txtToken (LexState *ls, int token) {
+  if (token == TK_NAME || token == TK_STRING || token == TK_NUMBER) {
+    /* the text of these tokens is whatever the lexer buffer holds */
+    save(ls, '\0');  /* terminate the buffer so it reads as a C string */
+    return luaO_pushfstring(ls->L, LUA_QS, luaZ_buffer(ls->buff));
+  }
+  else {
+    /* every other token is described by 'luaX_token2str' */
+    return luaX_token2str(ls, token);
+  }
+}
+
+
+static l_noret lexerror (LexState *ls, const char *msg, int token) {
+  char where[LUA_IDSIZE];  /* filled by luaO_chunkid from the source name */
+  luaO_chunkid(where, getstr(ls->source), sizeof(where));
+  msg = luaO_pushfstring(ls->L, "%s:%d: %s", where, ls->linenumber, msg);
+  if (token != 0)  /* is there a token to blame? append its text */
+    luaO_pushfstring(ls->L, "%s near %s", msg, txtToken(ls, token));
+  luaD_throw(ls->L, LUA_ERRSYNTAX);  /* the message was pushed just above */
+}
+
+
+l_noret luaX_syntaxerror (LexState *ls, const char *msg) {  /* does not return */
+  lexerror(ls, msg, ls->t.token);  /* the current token goes into the message */
+}
+
+
+/*
+** creates a new string and anchors it in function's table so that it
+** will not be collected until the end of the function's compilation (by
+** then the prototype anchors it); returns the string that is anchored
+*/
+TString *luaX_newstring (LexState *ls, const char *str, size_t l) {
+  lua_State *L = ls->L;
+  TValue *o;  /* entry for `str' */
+  TString *ts = luaS_newlstr(L, str, l);  /* create new string */
+  setsvalue2s(L, L->top++, ts);  /* temporarily anchor it in stack */
+  o = luaH_set(L, ls->fs->h, L->top - 1);
+  if (ttisnil(o)) {  /* not in use yet? (see 'addK') */
+    setbvalue(o, 1);  /* t[string] = true; needs no barrier, no cache reset */
+    luaC_checkGC(L);
+  }
+  else  /* equal key already there: only that older object is anchored */
+    ts = rawtsvalue(gkey(cast(Node *, cast(char *, o) - offsetof(Node, i_val))));
+  L->top--;  /* remove string from stack */
+  return ts;
+}
+
+
+/*
+** increment line number and skips newline sequence (any of
+** \n, \r, \n\r, or \r\n)
+*/
+static void inclinenumber (LexState *ls) {
+  int first = ls->current;  /* the newline char that brought us here */
+  lua_assert(currIsNewline(ls));
+  next(ls);  /* consume it */
+  if (ls->current != first && currIsNewline(ls))
+    next(ls);  /* a different newline char follows: same line break */
+  if (++ls->linenumber >= MAX_INT)
+    luaX_syntaxerror(ls, "chunk has too many lines");
+}
+
+
+void luaX_setinput (lua_State *L, LexState *ls, ZIO *z, TString *source,
+                    int firstchar) {
+  ls->L = L;
+  ls->z = z;
+  ls->source = source;
+  ls->fs = NULL;
+  ls->current = firstchar;  /* the caller has already read this char */
+  ls->decpoint = '.';
+  ls->linenumber = 1;
+  ls->lastline = 1;
+  ls->lookahead.token = TK_EOS;  /* no look-ahead token */
+  ls->envn = luaS_new(L, LUA_ENV);  /* create env name */
+  luaS_fix(ls->envn);  /* never collect this name */
+  luaZ_resizebuffer(ls->L, ls->buff, LUA_MINBUFFER);  /* initialize buffer */
+}
+
+
+
+/*
+** =======================================================
+** LEXICAL ANALYZER
+** =======================================================
+*/
+
+
+
+static int check_next (LexState *ls, const char *set) {
+  int found = (ls->current != '\0' && strchr(set, ls->current) != NULL);
+  if (found)  /* current char is in 'set': keep it and advance */
+    save_and_next(ls);
+  return found;
+}
+
+
+/* replaces, in the lexer buffer, every character equal to 'from'
+** by 'to' (the direction of the traversal makes no difference) */
+static void buffreplace (LexState *ls, char from, char to) {
+  size_t len = luaZ_bufflen(ls->buff);
+  char *text = luaZ_buffer(ls->buff);
+  size_t i;
+  for (i = 0; i < len; i++)
+    if (text[i] == from) text[i] = to;
+}
+
+
+#if !defined(getlocaledecpoint)
+#define getlocaledecpoint()	(localeconv()->decimal_point[0])
+#endif
+
+
+#define buff2d(b,e)	luaO_str2d(luaZ_buffer(b), luaZ_bufflen(b) - 1, e)
+
+/*
+** after a format error: switch the decimal separator to the current
+** locale's, convert again into seminfo->r; if that fails too, raise an error
+*/
+static void trydecpoint (LexState *ls, SemInfo *seminfo) {
+  char old = ls->decpoint;
+  ls->decpoint = getlocaledecpoint();  /* kept for later numerals too */
+  buffreplace(ls, old, ls->decpoint);  /* try new decimal separator */
+  if (!buff2d(ls->buff, &seminfo->r)) {
+    /* format error with correct decimal point: no more options */
+    buffreplace(ls, ls->decpoint, '.');  /* undo change (for error message) */
+    lexerror(ls, "malformed number", TK_NUMBER);
+  }
+}
+
+
+/* LUA_NUMBER */
+/*
+** this function is quite liberal in what it accepts, as 'luaO_str2d'
+** will reject ill-formed numerals.
+*/
+static void read_numeral (LexState *ls, SemInfo *seminfo) {
+  const char *expo = "Ee";  /* exponent markers for decimal numerals */
+  int first = ls->current;
+  lua_assert(lisdigit(ls->current));
+  save_and_next(ls);
+  if (first == '0' && check_next(ls, "Xx"))  /* hexadecimal? */
+    expo = "Pp";  /* exponent markers for hexadecimal numerals */
+  for (;;) {
+    if (check_next(ls, expo))  /* exponent part? */
+      check_next(ls, "+-");  /* optional exponent sign */
+    if (lisxdigit(ls->current) || ls->current == '.')
+      save_and_next(ls);
+    else  break;  /* current character cannot continue the numeral */
+  }
+  save(ls, '\0');  /* terminator; 'buff2d' leaves it out of the length */
+  buffreplace(ls, '.', ls->decpoint);  /* follow locale for decimal point */
+  if (!buff2d(ls->buff, &seminfo->r))  /* format error? */
+    trydecpoint(ls, seminfo); /* try to update decimal point separator */
+}
+
+
+/*
+** consume a sequence '[=*' or ']=*' and test for the matching second
+** bracket; result is the number of '='s, or -(that number) - 1 if malformed
+*/
+static int skip_sep (LexState *ls) {
+  int level;
+  int bracket = ls->current;
+  lua_assert(bracket == '[' || bracket == ']');
+  save_and_next(ls);
+  for (level = 0; ls->current == '='; level++)
+    save_and_next(ls);
+  if (ls->current != bracket)
+    return -level - 1;
+  return level;
+}
+
+
+static void read_long_string (LexState *ls, SemInfo *seminfo, int sep) {
+  save_and_next(ls);  /* skip 2nd `[' */
+  if (currIsNewline(ls))  /* string starts with a newline? */
+    inclinenumber(ls);  /* skip it */
+  for (;;) {  /* a NULL 'seminfo' means the text is a comment */
+    switch (ls->current) {
+      case EOZ:
+        lexerror(ls, (seminfo) ? "unfinished long string" :
+                                 "unfinished long comment", TK_EOS);
+        break;  /* to avoid warnings */
+      case ']': {
+        if (skip_sep(ls) == sep) {  /* closing bracket of the same level? */
+          save_and_next(ls);  /* skip 2nd `]' */
+          goto endloop;
+        }
+        break;
+      }
+      case '\n': case '\r': {
+        save(ls, '\n');  /* any newline sequence is stored as one '\n' */
+        inclinenumber(ls);
+        if (!seminfo) luaZ_resetbuffer(ls->buff);  /* avoid wasting space */
+        break;
+      }
+      default: {
+        if (seminfo) save_and_next(ls);
+        else next(ls);  /* comment: characters are not kept */
+      }
+    }
+  } endloop:
+  if (seminfo)  /* strip the '[', sep '='s, '[' prefix and matching suffix */
+    seminfo->ts = luaX_newstring(ls, luaZ_buffer(ls->buff) + (2 + sep),
+                                     luaZ_bufflen(ls->buff) - 2*(2 + sep));
+}
+
+
+static void escerror (LexState *ls, int *c, int n, const char *msg) {
+  int i;  /* index into the 'n' characters of the offending escape in 'c' */
+  luaZ_resetbuffer(ls->buff);  /* prepare error message */
+  save(ls, '\\');  /* buffer becomes the backslash plus the escape text */
+  for (i = 0; i < n && c[i] != EOZ; i++)  /* stop early at end of stream */
+    save(ls, c[i]);
+  lexerror(ls, msg, TK_STRING);
+}
+
+
+static int readhexaesc (LexState *ls) {
+  int c[3], i;  /* keep input for error message */
+  int r = 0;  /* result accumulator */
+  c[0] = 'x';  /* for error message */
+  for (i = 1; i < 3; i++) {  /* read two hexadecimal digits */
+    c[i] = next(ls);  /* first call moves past the 'x' itself */
+    if (!lisxdigit(c[i]))
+      escerror(ls, c, i + 1, "hexadecimal digit expected");
+    r = (r << 4) + luaO_hexavalue(c[i]);  /* append one hex digit */
+  }
+  return r;  /* current character is still the last digit read */
+}
+
+
+static int readdecesc (LexState *ls) {
+  int c[3], i;  /* digits read, kept for the error message */
+  int r = 0;  /* result accumulator */
+  for (i = 0; i < 3 && lisdigit(ls->current); i++) {  /* read up to 3 digits */
+    c[i] = ls->current;
+    r = 10*r + c[i] - '0';
+    next(ls);  /* leaves 'current' on the first character after the digits */
+  }
+  if (r > UCHAR_MAX)  /* value does not fit in one character */
+    escerror(ls, c, i, "decimal escape too large");
+  return r;
+}
+
+
+static void read_string (LexState *ls, int del, SemInfo *seminfo) {
+  save_and_next(ls);  /* keep delimiter (for error messages) */
+  while (ls->current != del) {  /* until the matching closing delimiter */
+    switch (ls->current) {
+      case EOZ:
+        lexerror(ls, "unfinished string", TK_EOS);
+        break;  /* to avoid warnings */
+      case '\n':
+      case '\r':
+        lexerror(ls, "unfinished string", TK_STRING);
+        break;  /* to avoid warnings */
+      case '\\': {  /* escape sequences */
+        int c;  /* final character to be saved */
+        next(ls);  /* do not save the `\' */
+        switch (ls->current) {
+          case 'a': c = '\a'; goto read_save;
+          case 'b': c = '\b'; goto read_save;
+          case 'f': c = '\f'; goto read_save;
+          case 'n': c = '\n'; goto read_save;
+          case 'r': c = '\r'; goto read_save;
+          case 't': c = '\t'; goto read_save;
+          case 'v': c = '\v'; goto read_save;
+          case 'x': c = readhexaesc(ls); goto read_save;
+          case '\n': case '\r':
+            inclinenumber(ls); c = '\n'; goto only_save;
+          case '\\': case '\"': case '\'':
+            c = ls->current; goto read_save;
+          case EOZ: goto no_save;  /* will raise an error next loop */
+          case 'z': {  /* zap following span of spaces */
+            next(ls);  /* skip the 'z' */
+            while (lisspace(ls->current)) {
+              if (currIsNewline(ls)) inclinenumber(ls);
+              else next(ls);
+            }
+            goto no_save;
+          }
+          default: {
+            if (!lisdigit(ls->current))
+              escerror(ls, &ls->current, 1, "invalid escape sequence");
+            /* digital escape \ddd */
+            c = readdecesc(ls);
+            goto only_save;  /* 'readdecesc' already advanced the input */
+          }
+        }
+       read_save: next(ls);  /* read next character */
+       only_save: save(ls, c);  /* save 'c' */
+       no_save: break;
+      }
+      default:
+        save_and_next(ls);  /* ordinary character: keep it verbatim */
+    }
+  }
+  save_and_next(ls);  /* skip delimiter */
+  seminfo->ts = luaX_newstring(ls, luaZ_buffer(ls->buff) + 1,
+                                   luaZ_bufflen(ls->buff) - 2);  /* no quotes */
+}
+
+
+static int llex (LexState *ls, SemInfo *seminfo) {
+  luaZ_resetbuffer(ls->buff);  /* each token starts with an empty buffer */
+  for (;;) {  /* loops only over whitespace and comments */
+    switch (ls->current) {
+      case '\n': case '\r': {  /* line breaks */
+        inclinenumber(ls);
+        break;
+      }
+      case ' ': case '\f': case '\t': case '\v': {  /* spaces */
+        next(ls);
+        break;
+      }
+      case '-': {  /* '-' or '--' (comment) */
+        next(ls);
+        if (ls->current != '-') return '-';
+        /* else is a comment */
+        next(ls);
+        if (ls->current == '[') {  /* long comment? */
+          int sep = skip_sep(ls);
+          luaZ_resetbuffer(ls->buff);  /* `skip_sep' may dirty the buffer */
+          if (sep >= 0) {
+            read_long_string(ls, NULL, sep);  /* skip long comment */
+            luaZ_resetbuffer(ls->buff);  /* previous call may dirty the buff. */
+            break;
+          }
+        }
+        /* else short comment */
+        while (!currIsNewline(ls) && ls->current != EOZ)
+          next(ls);  /* skip until end of line (or end of file) */
+        break;
+      }
+      case '[': {  /* long string or simply '[' */
+        int sep = skip_sep(ls);
+        if (sep >= 0) {
+          read_long_string(ls, seminfo, sep);
+          return TK_STRING;
+        }
+        else if (sep == -1) return '[';
+        else lexerror(ls, "invalid long string delimiter", TK_STRING);
+      }  /* NOTE(review): no break; relies on lexerror not returning */
+      case '=': {
+        next(ls);
+        if (ls->current != '=') return '=';
+        else { next(ls); return TK_EQ; }
+      }
+      case '<': {
+        next(ls);
+        if (ls->current != '=') return '<';
+        else { next(ls); return TK_LE; }
+      }
+      case '>': {
+        next(ls);
+        if (ls->current != '=') return '>';
+        else { next(ls); return TK_GE; }
+      }
+      case '~': {
+        next(ls);
+        if (ls->current != '=') return '~';
+        else { next(ls); return TK_NE; }
+      }
+      case ':': {
+        next(ls);
+        if (ls->current != ':') return ':';
+        else { next(ls); return TK_DBCOLON; }
+      }
+      case '"': case '\'': {  /* short literal strings */
+        read_string(ls, ls->current, seminfo);
+        return TK_STRING;
+      }
+      case '.': {  /* '.', '..', '...', or number */
+        save_and_next(ls);
+        if (check_next(ls, ".")) {
+          if (check_next(ls, "."))
+            return TK_DOTS;   /* '...' */
+          else return TK_CONCAT;   /* '..' */
+        }
+        else if (!lisdigit(ls->current)) return '.';
+        /* else go through */
+      }
+      case '0': case '1': case '2': case '3': case '4':
+      case '5': case '6': case '7': case '8': case '9': {
+        read_numeral(ls, seminfo);  /* value is stored in seminfo->r */
+        return TK_NUMBER;
+      }
+      case EOZ: {
+        return TK_EOS;
+      }
+      default: {
+        if (lislalpha(ls->current)) {  /* identifier or reserved word? */
+          TString *ts;
+          do {
+            save_and_next(ls);
+          } while (lislalnum(ls->current));
+          ts = luaX_newstring(ls, luaZ_buffer(ls->buff),
+                                  luaZ_bufflen(ls->buff));
+          seminfo->ts = ts;
+          if (isreserved(ts))  /* reserved word? */
+            return ts->tsv.extra - 1 + FIRST_RESERVED;
+          else {
+            return TK_NAME;
+          }
+        }
+        else {  /* single-char tokens (+ - / ...) */
+          int c = ls->current;
+          next(ls);
+          return c;  /* the character itself is the token code */
+        }
+      }
+    }
+  }
+}
+
+
+void luaX_next (LexState *ls) {
+  ls->lastline = ls->linenumber;  /* line of the token being left behind */
+  if (ls->lookahead.token == TK_EOS)  /* no look-ahead token pending? */
+    ls->t.token = llex(ls, &ls->t.seminfo);  /* scan a fresh token */
+  else {
+    ls->t = ls->lookahead;  /* promote the look-ahead token */
+    ls->lookahead.token = TK_EOS;  /* and mark the look-ahead as empty */
+  }
+}
+
+
+int luaX_lookahead (LexState *ls) {  /* scan one token ahead, keep ls->t */
+  lua_assert(ls->lookahead.token == TK_EOS);  /* only one look-ahead slot */
+  ls->lookahead.token = llex(ls, &ls->lookahead.seminfo);
+  return ls->lookahead.token;  /* 'luaX_next' will later consume it */
+}
+
diff --git a/ext/lua/src/lmathlib.c b/ext/lua/src/lmathlib.c
new file mode 100644
index 0000000..a49f1fd
--- /dev/null
+++ b/ext/lua/src/lmathlib.c
@@ -0,0 +1,279 @@
+/*
+** $Id: lmathlib.c,v 1.83 2013/03/07 18:21:32 roberto Exp $
+** Standard mathematical library
+** See Copyright Notice in lua.h
+*/
+
+
+#include <stdlib.h>
+#include <math.h>
+
+#define lmathlib_c
+#define LUA_LIB
+
+#include "lua.h"
+
+#include "lauxlib.h"
+#include "lualib.h"
+
+
+#undef PI
+#define PI	((lua_Number)(3.1415926535897932384626433832795))
+#define RADIANS_PER_DEGREE	((lua_Number)(PI/180.0))
+
+
+
+static int math_abs (lua_State *L) {  /* "abs": absolute value of arg 1 */
+  lua_pushnumber(L, l_mathop(fabs)(luaL_checknumber(L, 1)));
+  return 1;  /* one result */
+}
+
+static int math_sin (lua_State *L) {  /* "sin": sine of arg 1 */
+  lua_pushnumber(L, l_mathop(sin)(luaL_checknumber(L, 1)));
+  return 1;  /* one result */
+}
+
+static int math_sinh (lua_State *L) {  /* "sinh": hyperbolic sine of arg 1 */
+  lua_pushnumber(L, l_mathop(sinh)(luaL_checknumber(L, 1)));
+  return 1;  /* one result */
+}
+
+static int math_cos (lua_State *L) {  /* "cos": cosine of arg 1 */
+  lua_pushnumber(L, l_mathop(cos)(luaL_checknumber(L, 1)));
+  return 1;  /* one result */
+}
+
+static int math_cosh (lua_State *L) {  /* "cosh": hyperbolic cosine of arg 1 */
+  lua_pushnumber(L, l_mathop(cosh)(luaL_checknumber(L, 1)));
+  return 1;  /* one result */
+}
+
+static int math_tan (lua_State *L) {  /* "tan": tangent of arg 1 */
+  lua_pushnumber(L, l_mathop(tan)(luaL_checknumber(L, 1)));
+  return 1;  /* one result */
+}
+
+static int math_tanh (lua_State *L) {  /* "tanh": hyperbolic tangent of arg 1 */
+  lua_pushnumber(L, l_mathop(tanh)(luaL_checknumber(L, 1)));
+  return 1;  /* one result */
+}
+
+static int math_asin (lua_State *L) {  /* "asin": arc sine of arg 1 */
+  lua_pushnumber(L, l_mathop(asin)(luaL_checknumber(L, 1)));
+  return 1;  /* one result */
+}
+
+static int math_acos (lua_State *L) {  /* "acos": arc cosine of arg 1 */
+  lua_pushnumber(L, l_mathop(acos)(luaL_checknumber(L, 1)));
+  return 1;  /* one result */
+}
+
+static int math_atan (lua_State *L) {  /* "atan": arc tangent of arg 1 */
+  lua_pushnumber(L, l_mathop(atan)(luaL_checknumber(L, 1)));
+  return 1;  /* one result */
+}
+
+static int math_atan2 (lua_State *L) {  /* "atan2": atan2(arg 1, arg 2) */
+  lua_pushnumber(L, l_mathop(atan2)(luaL_checknumber(L, 1),
+                                luaL_checknumber(L, 2)));
+  return 1;  /* one result */
+}
+
+static int math_ceil (lua_State *L) {  /* "ceil": arg 1 rounded up */
+  lua_pushnumber(L, l_mathop(ceil)(luaL_checknumber(L, 1)));
+  return 1;  /* one result */
+}
+
+static int math_floor (lua_State *L) {  /* "floor": arg 1 rounded down */
+  lua_pushnumber(L, l_mathop(floor)(luaL_checknumber(L, 1)));
+  return 1;  /* one result */
+}
+
+static int math_fmod (lua_State *L) {  /* "fmod": fmod(arg 1, arg 2) */
+  lua_pushnumber(L, l_mathop(fmod)(luaL_checknumber(L, 1),
+                               luaL_checknumber(L, 2)));
+  return 1;  /* one result */
+}
+
+static int math_modf (lua_State *L) {  /* "modf": split arg 1 in two parts */
+  lua_Number ip;  /* integral part, filled in by modf */
+  lua_Number fp = l_mathop(modf)(luaL_checknumber(L, 1), &ip);
+  lua_pushnumber(L, ip);  /* first result: integral part */
+  lua_pushnumber(L, fp);  /* second result: fractional part */
+  return 2;
+}
+
+static int math_sqrt (lua_State *L) {  /* "sqrt": square root of arg 1 */
+  lua_pushnumber(L, l_mathop(sqrt)(luaL_checknumber(L, 1)));
+  return 1;  /* one result */
+}
+
+static int math_pow (lua_State *L) {  /* "pow": arg 1 raised to arg 2 */
+  lua_Number x = luaL_checknumber(L, 1);  /* base */
+  lua_Number y = luaL_checknumber(L, 2);  /* exponent */
+  lua_pushnumber(L, l_mathop(pow)(x, y));
+  return 1;  /* one result */
+}
+
+static int math_log (lua_State *L) {  /* "log": log of arg 1, optional base */
+  lua_Number x = luaL_checknumber(L, 1);
+  lua_Number res;
+  if (lua_isnoneornil(L, 2))  /* no base given? */
+    res = l_mathop(log)(x);  /* natural logarithm */
+  else {
+    lua_Number base = luaL_checknumber(L, 2);
+    if (base == (lua_Number)10.0) res = l_mathop(log10)(x);  /* direct call */
+    else res = l_mathop(log)(x)/l_mathop(log)(base);  /* change of base */
+  }
+  lua_pushnumber(L, res);
+  return 1;  /* one result */
+}
+
+#if defined(LUA_COMPAT_LOG10)
+static int math_log10 (lua_State *L) {  /* only with LUA_COMPAT_LOG10 */
+  lua_pushnumber(L, l_mathop(log10)(luaL_checknumber(L, 1)));
+  return 1;  /* one result */
+}
+#endif
+
+static int math_exp (lua_State *L) {  /* "exp": exp of arg 1 */
+  lua_pushnumber(L, l_mathop(exp)(luaL_checknumber(L, 1)));
+  return 1;  /* one result */
+}
+
+static int math_deg (lua_State *L) {  /* "deg": radians (arg 1) to degrees */
+  lua_pushnumber(L, luaL_checknumber(L, 1)/RADIANS_PER_DEGREE);
+  return 1;  /* one result */
+}
+
+static int math_rad (lua_State *L) {  /* "rad": degrees (arg 1) to radians */
+  lua_pushnumber(L, luaL_checknumber(L, 1)*RADIANS_PER_DEGREE);
+  return 1;  /* one result */
+}
+
+static int math_frexp (lua_State *L) {  /* "frexp": split arg 1 via frexp */
+  int e;  /* exponent, filled in by frexp */
+  lua_pushnumber(L, l_mathop(frexp)(luaL_checknumber(L, 1), &e));
+  lua_pushinteger(L, e);  /* second result: the exponent */
+  return 2;
+}
+
+static int math_ldexp (lua_State *L) {  /* "ldexp": ldexp(arg 1, arg 2) */
+  lua_Number x = luaL_checknumber(L, 1);
+  int ep = luaL_checkint(L, 2);  /* exponent must be an integer */
+  lua_pushnumber(L, l_mathop(ldexp)(x, ep));
+  return 1;  /* one result */
+}
+
+
+
+static int math_min (lua_State *L) {
+  int top = lua_gettop(L);  /* how many arguments were passed */
+  lua_Number best = luaL_checknumber(L, 1);  /* at least one is required */
+  int arg = 2;
+  while (arg <= top) {  /* scan the remaining arguments in order */
+    lua_Number cand = luaL_checknumber(L, arg++);
+    if (cand < best)
+      best = cand;
+  }
+  lua_pushnumber(L, best);
+  return 1;
+}
+
+
+static int math_max (lua_State *L) {
+  int top = lua_gettop(L);  /* how many arguments were passed */
+  lua_Number best = luaL_checknumber(L, 1);  /* at least one is required */
+  int arg = 2;
+  while (arg <= top) {  /* scan the remaining arguments in order */
+    lua_Number cand = luaL_checknumber(L, arg++);
+    if (cand > best)
+      best = cand;
+  }
+  lua_pushnumber(L, best);
+  return 1;
+}
+
+
+static int math_random (lua_State *L) {
+  /* the `%' avoids the (rare) case of r==1, and is needed also because on
+     some systems (SunOS!) `rand()' may return a value larger than RAND_MAX */
+  lua_Number r = (lua_Number)(rand()%RAND_MAX) / (lua_Number)RAND_MAX;
+  switch (lua_gettop(L)) {  /* check number of arguments */
+    case 0: {  /* no arguments */
+      lua_pushnumber(L, r);  /* Number between 0 and 1 */
+      break;
+    }
+    case 1: {  /* only upper limit */
+      lua_Number u = luaL_checknumber(L, 1);
+      luaL_argcheck(L, (lua_Number)1.0 <= u, 1, "interval is empty");
+      lua_pushnumber(L, l_mathop(floor)(r*u) + (lua_Number)(1.0));  /* [1, u] */
+      break;
+    }
+    case 2: {  /* lower and upper limits */
+      lua_Number l = luaL_checknumber(L, 1);
+      lua_Number u = luaL_checknumber(L, 2);
+      luaL_argcheck(L, l <= u, 2, "interval is empty");
+      lua_pushnumber(L, l_mathop(floor)(r*(u-l+1)) + l);  /* [l, u] */
+      break;
+    }
+    default: return luaL_error(L, "wrong number of arguments");  /* > 2 */
+  }
+  return 1;  /* one result in every non-error case */
+}
+
+
+static int math_randomseed (lua_State *L) {  /* "randomseed": seed rand() */
+  srand(luaL_checkunsigned(L, 1));  /* seed is arg 1 as an unsigned value */
+  (void)rand(); /* discard first value to avoid undesirable correlations */
+  return 0;  /* no results */
+}
+
+
+static const luaL_Reg mathlib[] = {  /* name -> C function; see luaopen_math */
+  {"abs",   math_abs},
+  {"acos",  math_acos},
+  {"asin",  math_asin},
+  {"atan2", math_atan2},
+  {"atan",  math_atan},
+  {"ceil",  math_ceil},
+  {"cosh",   math_cosh},
+  {"cos",   math_cos},
+  {"deg",   math_deg},
+  {"exp",   math_exp},
+  {"floor", math_floor},
+  {"fmod",   math_fmod},
+  {"frexp", math_frexp},
+  {"ldexp", math_ldexp},
+#if defined(LUA_COMPAT_LOG10)
+  {"log10", math_log10},
+#endif
+  {"log",   math_log},
+  {"max",   math_max},
+  {"min",   math_min},
+  {"modf",   math_modf},
+  {"pow",   math_pow},
+  {"rad",   math_rad},
+  {"random",     math_random},
+  {"randomseed", math_randomseed},
+  {"sinh",   math_sinh},
+  {"sin",   math_sin},
+  {"sqrt",  math_sqrt},
+  {"tanh",   math_tanh},
+  {"tan",   math_tan},
+  {NULL, NULL}  /* end-of-list marker */
+};
+
+
+/*
+** Open math library: new table with the 'mathlib' functions plus 'pi', 'huge'
+*/
+LUAMOD_API int luaopen_math (lua_State *L) {
+  luaL_newlib(L, mathlib);
+  lua_pushnumber(L, PI);
+  lua_setfield(L, -2, "pi");  /* lib.pi = PI */
+  lua_pushnumber(L, HUGE_VAL);
+  lua_setfield(L, -2, "huge");  /* lib.huge = HUGE_VAL */
+  return 1;  /* the library table */
+}
+
diff --git a/ext/lua/src/lmem.c b/ext/lua/src/lmem.c
new file mode 100644
index 0000000..3f88496
--- /dev/null
+++ b/ext/lua/src/lmem.c
@@ -0,0 +1,99 @@
+/*
+** $Id: lmem.c,v 1.84 2012/05/23 15:41:53 roberto Exp $
+** Interface to Memory Manager
+** See Copyright Notice in lua.h
+*/
+
+
+#include <stddef.h>
+
+#define lmem_c
+#define LUA_CORE
+
+#include "lua.h"
+
+#include "ldebug.h"
+#include "ldo.h"
+#include "lgc.h"
+#include "lmem.h"
+#include "lobject.h"
+#include "lstate.h"
+
+
+
+/*
+** About the realloc function:
+** void * frealloc (void *ud, void *ptr, size_t osize, size_t nsize);
+** (`osize' is the old size, `nsize' is the new size)
+**
+** * frealloc(ud, NULL, x, s) creates a new block of size `s' (no
+** matter 'x').
+**
+** * frealloc(ud, p, x, 0) frees the block `p'
+** (in this specific case, frealloc must return NULL);
+** particularly, frealloc(ud, NULL, 0, 0) does nothing
+** (which is equivalent to free(NULL) in ANSI C)
+**
+** frealloc returns NULL if it cannot create or reallocate the area
+** (any reallocation to an equal or smaller size cannot fail!)
+*/
+
+
+
+#define MINSIZEARRAY	4
+
+
+void *luaM_growaux_ (lua_State *L, void *block, int *size, size_t size_elems,
+                     int limit, const char *what) {  /* 'what' names the items */
+  void *newblock;
+  int newsize;
+  if (*size >= limit/2) {  /* cannot double it? */
+    if (*size >= limit)  /* cannot grow even a little? */
+      luaG_runerror(L, "too many %s (limit is %d)", what, limit);
+    newsize = limit;  /* still have at least one free place */
+  }
+  else {
+    newsize = (*size)*2;  /* usual case: double the element count */
+    if (newsize < MINSIZEARRAY)
+      newsize = MINSIZEARRAY;  /* minimum size */
+  }
+  newblock = luaM_reallocv(L, block, *size, newsize, size_elems);
+  *size = newsize;  /* update only when everything else is OK */
+  return newblock;
+}
+
+
+l_noret luaM_toobig (lua_State *L) {  /* raise the "block too big" error */
+  luaG_runerror(L, "memory allocation error: block too big");
+}
+
+
+
+/*
+** generic allocation routine: all sizes are in bytes; updates g->GCdebt.
+*/
+void *luaM_realloc_ (lua_State *L, void *block, size_t osize, size_t nsize) {
+  void *newblock;
+  global_State *g = G(L);
+  size_t realosize = (block) ? osize : 0;  /* 'osize' ignored for NULL block */
+  lua_assert((realosize == 0) == (block == NULL));
+#if defined(HARDMEMTESTS)
+  if (nsize > realosize && g->gcrunning)
+    luaC_fullgc(L, 1);  /* force a GC whenever possible */
+#endif
+  newblock = (*g->frealloc)(g->ud, block, osize, nsize);
+  if (newblock == NULL && nsize > 0) {  /* allocation (not a free) failed? */
+    api_check(L, nsize > realosize,
+                 "realloc cannot fail when shrinking a block");
+    if (g->gcrunning) {
+      luaC_fullgc(L, 1);  /* try to free some memory... */
+      newblock = (*g->frealloc)(g->ud, block, osize, nsize);  /* try again */
+    }
+    if (newblock == NULL)
+      luaD_throw(L, LUA_ERRMEM);  /* still no memory: raise memory error */
+  }
+  lua_assert((nsize == 0) == (newblock == NULL));
+  g->GCdebt = (g->GCdebt + nsize) - realosize;  /* account for size change */
+  return newblock;
+}
+
diff --git a/ext/lua/src/loadlib.c b/ext/lua/src/loadlib.c
new file mode 100644
index 0000000..a995927
--- /dev/null
+++ b/ext/lua/src/loadlib.c
@@ -0,0 +1,725 @@
+/*
+** $Id: loadlib.c,v 1.111 2012/05/30 12:33:44 roberto Exp $
+** Dynamic library loader for Lua
+** See Copyright Notice in lua.h
+**
+** This module contains an implementation of loadlib for Unix systems
+** that have dlfcn, an implementation for Windows, and a stub for other
+** systems.
+*/
+
+
+/*
+** if needed, includes windows header before everything else
+*/
+#if defined(_WIN32)
+#include <windows.h>
+#endif
+
+
+#include <stdlib.h>
+#include <string.h>
+
+
+#define loadlib_c
+#define LUA_LIB
+
+#include "lua.h"
+
+#include "lauxlib.h"
+#include "lualib.h"
+
+
+/*
+** LUA_PATH and LUA_CPATH are the names of the environment
+** variables that Lua checks to set its paths.
+*/
+#if !defined(LUA_PATH)
+#define LUA_PATH	"LUA_PATH"
+#endif
+
+#if !defined(LUA_CPATH)
+#define LUA_CPATH	"LUA_CPATH"
+#endif
+
+#define LUA_PATHSUFFIX		"_" LUA_VERSION_MAJOR "_" LUA_VERSION_MINOR
+
+#define LUA_PATHVERSION		LUA_PATH LUA_PATHSUFFIX
+#define LUA_CPATHVERSION	LUA_CPATH LUA_PATHSUFFIX
+
+/*
+** LUA_PATH_SEP is the character that separates templates in a path.
+** LUA_PATH_MARK is the string that marks the substitution points in a
+** template.
+** LUA_EXEC_DIR in a Windows path is replaced by the executable's
+** directory.
+** LUA_IGMARK is a mark to ignore all before it when building the
+** luaopen_ function name.
+*/
+#if !defined (LUA_PATH_SEP)
+#define LUA_PATH_SEP		";"
+#endif
+#if !defined (LUA_PATH_MARK)
+#define LUA_PATH_MARK		"?"
+#endif
+#if !defined (LUA_EXEC_DIR)
+#define LUA_EXEC_DIR		"!"
+#endif
+#if !defined (LUA_IGMARK)
+#define LUA_IGMARK		"-"
+#endif
+
+
+/*
+** LUA_CSUBSEP is the character that replaces dots in submodule names
+** when searching for a C loader.
+** LUA_LSUBSEP is the character that replaces dots in submodule names
+** when searching for a Lua loader.
+*/
+#if !defined(LUA_CSUBSEP)
+#define LUA_CSUBSEP		LUA_DIRSEP
+#endif
+
+#if !defined(LUA_LSUBSEP)
+#define LUA_LSUBSEP		LUA_DIRSEP
+#endif
+
+
+/* prefix for open functions in C libraries */
+#define LUA_POF		"luaopen_"
+
+/* separator for open functions in C libraries */
+#define LUA_OFSEP	"_"
+
+
+/* table (in the registry) that keeps handles for all loaded C libraries */
+#define CLIBS		"_CLIBS"
+
+#define LIB_FAIL	"open"
+
+
+/* error codes for ll_loadfunc */
+#define ERRLIB		1
+#define ERRFUNC		2
+
+#define setprogdir(L)		((void)0)
+
+
+/*
+** system-dependent functions
+*/
+static void ll_unloadlib (void *lib);
+static void *ll_load (lua_State *L, const char *path, int seeglb);
+static lua_CFunction ll_sym (lua_State *L, void *lib, const char *sym);
+
+
+
+#if defined(LUA_USE_DLOPEN)
+/*
+** {========================================================================
+** This is an implementation of loadlib based on the dlfcn interface.
+** The dlfcn interface is available in Linux, SunOS, Solaris, IRIX, FreeBSD,
+** NetBSD, AIX 4.2, HPUX 11, and  probably most other Unix flavors, at least
+** as an emulation layer on top of native functions.
+** =========================================================================
+*/
+
+#include <dlfcn.h>
+
+static void ll_unloadlib (void *lib) {  /* 'lib': handle from 'll_load' */
+  dlclose(lib);
+}
+
+
+static void *ll_load (lua_State *L, const char *path, int seeglb) {
+  void *lib = dlopen(path, RTLD_NOW | (seeglb ? RTLD_GLOBAL : RTLD_LOCAL));
+  if (lib == NULL) lua_pushstring(L, dlerror());  /* leave message on stack */
+  return lib;  /* NULL on failure */
+}
+
+
+static lua_CFunction ll_sym (lua_State *L, void *lib, const char *sym) {
+  lua_CFunction f = (lua_CFunction)dlsym(lib, sym);
+  if (f == NULL) lua_pushstring(L, dlerror());  /* leave message on stack */
+  return f;  /* NULL on failure */
+}
+
+/* }====================================================== */
+
+
+
+#elif defined(LUA_DL_DLL)
+/*
+** {======================================================================
+** This is an implementation of loadlib for Windows using native functions.
+** =======================================================================
+*/
+
+#undef setprogdir
+
+/*
+** optional flags for LoadLibraryEx
+*/
+#if !defined(LUA_LLE_FLAGS)
+#define LUA_LLE_FLAGS	0
+#endif
+
+
+static void setprogdir (lua_State *L) {  /* rewrites the string on stack top */
+  char buff[MAX_PATH + 1];
+  char *lb;  /* last backslash in the module file name */
+  DWORD nsize = sizeof(buff)/sizeof(char);
+  DWORD n = GetModuleFileNameA(NULL, buff, nsize);
+  if (n == 0 || n == nsize || (lb = strrchr(buff, '\\')) == NULL)
+    luaL_error(L, "unable to get ModuleFileName");
+  else {
+    *lb = '\0';  /* cut the file name, keeping only the directory */
+    luaL_gsub(L, lua_tostring(L, -1), LUA_EXEC_DIR, buff);
+    lua_remove(L, -2);  /* remove original string */
+  }
+}
+
+
+static void pusherror (lua_State *L) {  /* push text for GetLastError() */
+  int error = GetLastError();
+  char buffer[128];
+  if (FormatMessageA(FORMAT_MESSAGE_IGNORE_INSERTS | FORMAT_MESSAGE_FROM_SYSTEM,
+      NULL, error, 0, buffer, sizeof(buffer)/sizeof(char), NULL))
+    lua_pushstring(L, buffer);
+  else  /* no system message available: report the raw code */
+    lua_pushfstring(L, "system error %d\n", error);
+}
+
+static void ll_unloadlib (void *lib) {  /* 'lib': handle from 'll_load' */
+  FreeLibrary((HMODULE)lib);
+}
+
+
+static void *ll_load (lua_State *L, const char *path, int seeglb) {
+  HMODULE lib = LoadLibraryExA(path, NULL, LUA_LLE_FLAGS);
+  (void)(seeglb);  /* not used: symbols are 'global' by default */
+  if (lib == NULL) pusherror(L);  /* leave message on stack */
+  return lib;  /* NULL on failure */
+}
+
+
+static lua_CFunction ll_sym (lua_State *L, void *lib, const char *sym) {
+  lua_CFunction f = (lua_CFunction)GetProcAddress((HMODULE)lib, sym);
+  if (f == NULL) pusherror(L);  /* leave message on stack */
+  return f;  /* NULL on failure */
+}
+
+/* }====================================================== */
+
+
+#else
+/*
+** {======================================================
+** Fallback for other systems
+** =======================================================
+*/
+
+#undef LIB_FAIL
+#define LIB_FAIL	"absent"
+
+
+#define DLMSG	"dynamic libraries not enabled; check your Lua installation"
+
+
+static void ll_unloadlib (void *lib) {  /* stub: nothing can be loaded here */
+  (void)(lib);  /* not used */
+}
+
+
+static void *ll_load (lua_State *L, const char *path, int seeglb) {
+  (void)(path); (void)(seeglb);  /* not used */
+  lua_pushliteral(L, DLMSG);  /* error message for the caller */
+  return NULL;  /* stub: always fails */
+}
+
+
+static lua_CFunction ll_sym (lua_State *L, void *lib, const char *sym) {
+  (void)(lib); (void)(sym);  /* not used */
+  lua_pushliteral(L, DLMSG);  /* error message for the caller */
+  return NULL;  /* stub: always fails */
+}
+
+/* }====================================================== */
+#endif
+
+
+static void *ll_checkclib (lua_State *L, const char *path) {
+  void *plib;  /* handle recorded for 'path', if any */
+  lua_getfield(L, LUA_REGISTRYINDEX, CLIBS);
+  lua_getfield(L, -1, path);
+  plib = lua_touserdata(L, -1);  /* plib = CLIBS[path] */
+  lua_pop(L, 2);  /* pop CLIBS table and 'plib' */
+  return plib;  /* stack is left as it was found */
+}
+
+
+static void ll_addtoclib (lua_State *L, const char *path, void *plib) {
+  lua_getfield(L, LUA_REGISTRYINDEX, CLIBS);
+  lua_pushlightuserdata(L, plib);
+  lua_pushvalue(L, -1);  /* second copy: the handle is stored twice */
+  lua_setfield(L, -3, path);  /* CLIBS[path] = plib */
+  lua_rawseti(L, -2, luaL_len(L, -2) + 1);  /* CLIBS[#CLIBS + 1] = plib */
+  lua_pop(L, 1);  /* pop CLIBS table */
+}
+
+
+/*
+** __gc tag method for CLIBS table: calls 'll_unloadlib' for all lib
+** handles in the list part of CLIBS (argument 1), last one first
+*/
+static int gctm (lua_State *L) {
+  int n = luaL_len(L, 1);  /* number of handles in the list part */
+  for (; n >= 1; n--) {  /* for each handle, in reverse order */
+    lua_rawgeti(L, 1, n);  /* get handle CLIBS[n] */
+    ll_unloadlib(lua_touserdata(L, -1));
+    lua_pop(L, 1);  /* pop handle */
+  }
+  return 0;  /* no results */
+}
+
+
+static int ll_loadfunc (lua_State *L, const char *path, const char *sym) {
+  void *reg = ll_checkclib(L, path);  /* check loaded C libraries */
+  if (reg == NULL) {  /* must load library? */
+    reg = ll_load(L, path, *sym == '*');  /* '*': load with global symbols */
+    if (reg == NULL) return ERRLIB;  /* unable to load library */
+    ll_addtoclib(L, path, reg);  /* remember the handle for later calls */
+  }
+  if (*sym == '*') {  /* loading only library (no function)? */
+    lua_pushboolean(L, 1);  /* return 'true' */
+    return 0;  /* no errors */
+  }
+  else {
+    lua_CFunction f = ll_sym(L, reg, sym);
+    if (f == NULL)
+      return ERRFUNC;  /* unable to find function */
+    lua_pushcfunction(L, f);  /* else create new function */
+    return 0;  /* no errors */
+  }
+}
+
+
+static int ll_loadlib (lua_State *L) {
+  const char *path = luaL_checkstring(L, 1);
+  const char *init = luaL_checkstring(L, 2);
+  int stat = ll_loadfunc(L, path, init);
+  if (stat != 0) {  /* failed; error message is on stack top */
+    lua_pushnil(L);
+    lua_insert(L, -2);  /* move the nil below the message */
+    lua_pushstring(L, (stat == ERRLIB) ?  LIB_FAIL : "init");
+    return 3;  /* nil, error message, and where it failed */
+  }
+  /* success: 'll_loadfunc' left its single result on the stack */
+  return 1;
+}
+
+
+
+/*
+** {======================================================
+** 'require' function
+** =======================================================
+*/
+
+
+static int readable (const char *filename) {
+  FILE *fp = fopen(filename, "r");  /* probe by opening for reading */
+  int ok = (fp != NULL);
+  if (ok) fclose(fp);  /* only the ability to open matters */
+  return ok;
+}
+
+
+static const char *pushnexttemplate (lua_State *L, const char *path) {
+  const char *l;  /* end of the current template */
+  while (*path == *LUA_PATH_SEP) path++;  /* skip separators */
+  if (*path == '\0') return NULL;  /* no more templates */
+  l = strchr(path, *LUA_PATH_SEP);  /* find next separator */
+  if (l == NULL) l = path + strlen(path);  /* last template: up to the end */
+  lua_pushlstring(L, path, l - path);  /* template */
+  return l;  /* where the next call should resume */
+}
+
+
+static const char *searchpath (lua_State *L, const char *name,
+                                             const char *path,
+                                             const char *sep,
+                                             const char *dirsep) {
+  luaL_Buffer msg;  /* to build error message */
+  luaL_buffinit(L, &msg);
+  if (*sep != '\0')  /* non-empty separator? */
+    name = luaL_gsub(L, name, sep, dirsep);  /* replace it by 'dirsep' */
+  while ((path = pushnexttemplate(L, path)) != NULL) {
+    const char *filename = luaL_gsub(L, lua_tostring(L, -1),
+                                     LUA_PATH_MARK, name);
+    lua_remove(L, -2);  /* remove path template */
+    if (readable(filename))  /* does file exist and is readable? */
+      return filename;  /* return that file name */
+    lua_pushfstring(L, "\n\tno file " LUA_QS, filename);
+    lua_remove(L, -2);  /* remove file name */
+    luaL_addvalue(&msg);  /* concatenate error msg. entry */
+  }
+  luaL_pushresult(&msg);  /* create error message */
+  return NULL;  /* not found */
+}
+
+
+static int ll_searchpath (lua_State *L) {  /* args: name, path [, sep, rep] */
+  const char *f = searchpath(L, luaL_checkstring(L, 1),
+                                luaL_checkstring(L, 2),
+                                luaL_optstring(L, 3, "."),
+                                luaL_optstring(L, 4, LUA_DIRSEP));
+  if (f != NULL) return 1;  /* found: file name is on top of the stack */
+  else {  /* error message is on top of the stack */
+    lua_pushnil(L);
+    lua_insert(L, -2);  /* move the nil below the message */
+    return 2;  /* return nil + error message */
+  }
+}
+
+
+static const char *findfile (lua_State *L, const char *name,
+                                           const char *pname,
+                                           const char *dirsep) {
+  const char *path;
+  lua_getfield(L, lua_upvalueindex(1), pname);  /* field of first upvalue */
+  path = lua_tostring(L, -1);
+  if (path == NULL)  /* field is not a string (nor a number)? */
+    luaL_error(L, LUA_QL("package.%s") " must be a string", pname);
+  return searchpath(L, name, path, ".", dirsep);  /* NULL if not found */
+}
+
+
+static int checkload (lua_State *L, int stat, const char *filename) {
+  if (stat) {  /* module loaded successfully? */
+    lua_pushstring(L, filename);  /* will be 2nd argument to module */
+    return 2;  /* return open function and file name */
+  }
+  else  /* failure: the loader's message is on top of the stack */
+    return luaL_error(L, "error loading module " LUA_QS
+                         " from file " LUA_QS ":\n\t%s",
+                          lua_tostring(L, 1), filename, lua_tostring(L, -1));
+}
+
+/* searcher for Lua files: looks for 'name' in field "path" and loads the file */
+static int searcher_Lua (lua_State *L) {
+  const char *filename;
+  const char *name = luaL_checkstring(L, 1);
+  filename = findfile(L, name, "path", LUA_LSUBSEP);
+  if (filename == NULL) return 1;  /* module not found in this path */
+  return checkload(L, (luaL_loadfile(L, filename) == LUA_OK), filename);
+}
+
+/* looks in 'filename' for the function named LUA_POF + (adjusted) 'modname' */
+static int loadfunc (lua_State *L, const char *filename, const char *modname) {
+  const char *funcname;
+  const char *mark;
+  modname = luaL_gsub(L, modname, ".", LUA_OFSEP);  /* replace the dots */
+  mark = strchr(modname, *LUA_IGMARK);  /* first char of LUA_IGMARK present? */
+  if (mark) {
+    int stat;
+    funcname = lua_pushlstring(L, modname, mark - modname);  /* before mark */
+    funcname = lua_pushfstring(L, LUA_POF"%s", funcname);
+    stat = ll_loadfunc(L, filename, funcname);
+    if (stat != ERRFUNC) return stat;  /* anything but 'function not found' */
+    modname = mark + 1;  /* else go ahead and try old-style name */
+  }
+  funcname = lua_pushfstring(L, LUA_POF"%s", modname);
+  return ll_loadfunc(L, filename, funcname);
+}
+
+/* searcher for C libraries: looks for 'name' in field "cpath" and loads it */
+static int searcher_C (lua_State *L) {
+  const char *name = luaL_checkstring(L, 1);
+  const char *filename = findfile(L, name, "cpath", LUA_CSUBSEP);
+  if (filename == NULL) return 1;  /* module not found in this path */
+  return checkload(L, (loadfunc(L, filename, name) == 0), filename);
+}
+
+/* searcher that looks for 'name' inside the C library found for its root name */
+static int searcher_Croot (lua_State *L) {
+  const char *filename;
+  const char *name = luaL_checkstring(L, 1);
+  const char *p = strchr(name, '.');  /* first dot ends the root name */
+  int stat;
+  if (p == NULL) return 0;  /* is root */
+  lua_pushlstring(L, name, p - name);  /* root name (part before the dot) */
+  filename = findfile(L, lua_tostring(L, -1), "cpath", LUA_CSUBSEP);
+  if (filename == NULL) return 1;  /* root not found */
+  if ((stat = loadfunc(L, filename, name)) != 0) {
+    if (stat != ERRFUNC)
+      return checkload(L, 0, filename);  /* real error */
+    else {  /* open function not found */
+      lua_pushfstring(L, "\n\tno module " LUA_QS " in file " LUA_QS,
+                         name, filename);
+      return 1;  /* return the message just pushed */
+    }
+  }
+  lua_pushstring(L, filename);  /* will be 2nd argument to module */
+  return 2;
+}
+
+/* searcher that looks for 'name' in the registry table "_PRELOAD" */
+static int searcher_preload (lua_State *L) {
+  const char *name = luaL_checkstring(L, 1);
+  lua_getfield(L, LUA_REGISTRYINDEX, "_PRELOAD");
+  lua_getfield(L, -1, name);
+  if (lua_isnil(L, -1))  /* not found? */
+    lua_pushfstring(L, "\n\tno field package.preload['%s']", name);
+  return 1;  /* the field value or, if it was nil, the message */
+}
+
+/* calls the searchers in order until one gives a loader; error if none does */
+static void findloader (lua_State *L, const char *name) {
+  int i;
+  luaL_Buffer msg;  /* to build error message */
+  luaL_buffinit(L, &msg);
+  lua_getfield(L, lua_upvalueindex(1), "searchers");  /* will be at index 3 */
+  if (!lua_istable(L, 3))
+    luaL_error(L, LUA_QL("package.searchers") " must be a table");
+  /*  iterate over available searchers to find a loader */
+  for (i = 1; ; i++) {
+    lua_rawgeti(L, 3, i);  /* get a searcher */
+    if (lua_isnil(L, -1)) {  /* no more searchers? */
+      lua_pop(L, 1);  /* remove nil */
+      luaL_pushresult(&msg);  /* create error message */
+      luaL_error(L, "module " LUA_QS " not found:%s",
+                    name, lua_tostring(L, -1));
+    }
+    lua_pushstring(L, name);
+    lua_call(L, 1, 2);  /* call it */
+    if (lua_isfunction(L, -2))  /* did it find a loader? */
+      return;  /* module loader found */
+    else if (lua_isstring(L, -2)) {  /* searcher returned error message? */
+      lua_pop(L, 1);  /* remove extra return */
+      luaL_addvalue(&msg);  /* concatenate error message */
+    }
+    else
+      lua_pop(L, 2);  /* remove both returns */
+  }
+}
+
+/* 'require': returns _LOADED[name], running a loader first if it is false */
+static int ll_require (lua_State *L) {
+  const char *name = luaL_checkstring(L, 1);
+  lua_settop(L, 1);  /* _LOADED table will be at index 2 */
+  lua_getfield(L, LUA_REGISTRYINDEX, "_LOADED");
+  lua_getfield(L, 2, name);  /* _LOADED[name] */
+  if (lua_toboolean(L, -1))  /* is it there? */
+    return 1;  /* package is already loaded */
+  /* else must load package */
+  lua_pop(L, 1);  /* remove 'getfield' result */
+  findloader(L, name);  /* leaves loader and search data on the stack */
+  lua_pushstring(L, name);  /* pass name as argument to module loader */
+  lua_insert(L, -2);  /* name is 1st argument (before search data) */
+  lua_call(L, 2, 1);  /* run loader to load module */
+  if (!lua_isnil(L, -1))  /* non-nil return? */
+    lua_setfield(L, 2, name);  /* _LOADED[name] = returned value */
+  lua_getfield(L, 2, name);  /* re-read it: the loader may have set it */
+  if (lua_isnil(L, -1)) {   /* module did not set a value? */
+    lua_pushboolean(L, 1);  /* use true as result */
+    lua_pushvalue(L, -1);  /* extra copy to be returned */
+    lua_setfield(L, 2, name);  /* _LOADED[name] = true */
+  }
+  return 1;
+}
+
+/* }====================================================== */
+
+
+
+/*
+** {======================================================
+** 'module' function
+** =======================================================
+*/
+#if defined(LUA_COMPAT_MODULE)
+
+/* changes the environment of the calling function: the table on top of
+** the stack becomes its upvalue 1 (error if the caller is a C function)
+*/
+static void set_env (lua_State *L) {
+  lua_Debug ar;
+  if (lua_getstack(L, 1, &ar) == 0 ||
+      lua_getinfo(L, "f", &ar) == 0 ||  /* get calling function */
+      lua_iscfunction(L, -1))
+    luaL_error(L, LUA_QL("module") " not called from a Lua function");
+  lua_pushvalue(L, -2);  /* copy new environment table to top */
+  lua_setupvalue(L, -2, 1);  /* pops the copy */
+  lua_pop(L, 1);  /* remove function */
+}
+
+/* calls every function among arguments 2..n, passing it the module table */
+static void dooptions (lua_State *L, int n) {
+  int i;
+  for (i = 2; i <= n; i++) {
+    if (lua_isfunction(L, i)) {  /* avoid 'calling' extra info. */
+      lua_pushvalue(L, i);  /* get option (a function) */
+      lua_pushvalue(L, -2);  /* module */
+      lua_call(L, 1, 0);  /* results are discarded */
+    }
+  }
+}
+
+/* sets fields _M, _NAME, and _PACKAGE in the table on top of the stack */
+static void modinit (lua_State *L, const char *modname) {
+  const char *dot;
+  lua_pushvalue(L, -1);
+  lua_setfield(L, -2, "_M");  /* module._M = module */
+  lua_pushstring(L, modname);
+  lua_setfield(L, -2, "_NAME");
+  dot = strrchr(modname, '.');  /* look for last dot in module name */
+  if (dot == NULL) dot = modname;  /* no dot: package name will be empty */
+  else dot++;  /* keep the last dot in the package name */
+  /* set _PACKAGE as package name (full module name minus last part) */
+  lua_pushlstring(L, modname, dot - modname);
+  lua_setfield(L, -2, "_PACKAGE");
+}
+
+/* 'module': gets/creates the module table and makes it the caller's environment */
+static int ll_module (lua_State *L) {
+  const char *modname = luaL_checkstring(L, 1);
+  int lastarg = lua_gettop(L);  /* last parameter */
+  luaL_pushmodule(L, modname, 1);  /* get/create module table */
+  /* check whether table already has a _NAME field */
+  lua_getfield(L, -1, "_NAME");
+  if (!lua_isnil(L, -1))  /* is table an initialized module? */
+    lua_pop(L, 1);
+  else {  /* no; initialize it */
+    lua_pop(L, 1);
+    modinit(L, modname);
+  }
+  lua_pushvalue(L, -1);  /* copy of module table, kept for 'dooptions' */
+  set_env(L);
+  dooptions(L, lastarg);
+  return 1;
+}
+
+/* 'package.seeall': makes the global table the '__index' of arg. 1's metatable */
+static int ll_seeall (lua_State *L) {
+  luaL_checktype(L, 1, LUA_TTABLE);
+  if (!lua_getmetatable(L, 1)) {  /* no metatable yet? */
+    lua_createtable(L, 0, 1); /* create new metatable */
+    lua_pushvalue(L, -1);
+    lua_setmetatable(L, 1);
+  }
+  lua_pushglobaltable(L);
+  lua_setfield(L, -2, "__index");  /* mt.__index = _G */
+  return 0;
+}
+
+#endif
+/* }====================================================== */
+
+
+
+/* auxiliary mark (for internal use) */
+#define AUXMARK		"\1"
+
+
+/*
+** returns registry field "LUA_NOENV" as a boolean (stack is left unchanged)
+*/
+static int noenv (lua_State *L) {
+  int b;
+  lua_getfield(L, LUA_REGISTRYINDEX, "LUA_NOENV");
+  b = lua_toboolean(L, -1);
+  lua_pop(L, 1);  /* remove value */
+  return b;
+}
+
+/* sets field 'fieldname' from env. variable 'envname1'/'envname2', else 'def' */
+static void setpath (lua_State *L, const char *fieldname, const char *envname1,
+                                   const char *envname2, const char *def) {
+  const char *path = getenv(envname1);
+  if (path == NULL)  /* no environment variable? */
+    path = getenv(envname2);  /* try alternative name */
+  if (path == NULL || noenv(L))  /* no variable, or LUA_NOENV is set? */
+    lua_pushstring(L, def);  /* use default */
+  else {
+    /* replace ";;" by ";AUXMARK;" and then AUXMARK by default path */
+    path = luaL_gsub(L, path, LUA_PATH_SEP LUA_PATH_SEP,
+                              LUA_PATH_SEP AUXMARK LUA_PATH_SEP);
+    luaL_gsub(L, path, AUXMARK, def);
+    lua_remove(L, -2);  /* remove result of the first 'gsub' */
+  }
+  setprogdir(L);
+  lua_setfield(L, -2, fieldname);
+}
+
+/* functions stored in the 'package' table (see 'luaopen_package') */
+static const luaL_Reg pk_funcs[] = {
+  {"loadlib", ll_loadlib},
+  {"searchpath", ll_searchpath},
+#if defined(LUA_COMPAT_MODULE)
+  {"seeall", ll_seeall},
+#endif
+  {NULL, NULL}
+};
+
+/* functions opened into the global table, with 'package' as their upvalue */
+static const luaL_Reg ll_funcs[] = {
+#if defined(LUA_COMPAT_MODULE)
+  {"module", ll_module},
+#endif
+  {"require", ll_require},
+  {NULL, NULL}
+};
+
+/* pushes a table with the pre-defined searchers ('package' must be on top) */
+static void createsearcherstable (lua_State *L) {
+  static const lua_CFunction searchers[] =
+    {searcher_preload, searcher_Lua, searcher_C, searcher_Croot, NULL};
+  int i;
+  /* create 'searchers' table */
+  lua_createtable(L, sizeof(searchers)/sizeof(searchers[0]) - 1, 0);
+  /* fill it with pre-defined searchers */
+  for (i=0; searchers[i] != NULL; i++) {
+    lua_pushvalue(L, -2);  /* set 'package' as upvalue for all searchers */
+    lua_pushcclosure(L, searchers[i], 1);
+    lua_rawseti(L, -2, i+1);  /* table indices start at 1 */
+  }
+}
+
+/* opens the library: builds the 'package' table and sets the global functions */
+LUAMOD_API int luaopen_package (lua_State *L) {
+  /* create table CLIBS to keep track of loaded C libraries */
+  luaL_getsubtable(L, LUA_REGISTRYINDEX, CLIBS);
+  lua_createtable(L, 0, 1);  /* metatable for CLIBS */
+  lua_pushcfunction(L, gctm);
+  lua_setfield(L, -2, "__gc");  /* set finalizer for CLIBS table */
+  lua_setmetatable(L, -2);
+  /* create `package' table */
+  luaL_newlib(L, pk_funcs);
+  createsearcherstable(L);
+#if defined(LUA_COMPAT_LOADERS)
+  lua_pushvalue(L, -1);  /* make a copy of 'searchers' table */
+  lua_setfield(L, -3, "loaders");  /* put it in field `loaders' */
+#endif
+  lua_setfield(L, -2, "searchers");  /* put it in field 'searchers' */
+  /* set field 'path' */
+  setpath(L, "path", LUA_PATHVERSION, LUA_PATH, LUA_PATH_DEFAULT);
+  /* set field 'cpath' */
+  setpath(L, "cpath", LUA_CPATHVERSION, LUA_CPATH, LUA_CPATH_DEFAULT);
+  /* store config information */
+  lua_pushliteral(L, LUA_DIRSEP "\n" LUA_PATH_SEP "\n" LUA_PATH_MARK "\n"
+                     LUA_EXEC_DIR "\n" LUA_IGMARK "\n");
+  lua_setfield(L, -2, "config");
+  /* set field `loaded' */
+  luaL_getsubtable(L, LUA_REGISTRYINDEX, "_LOADED");
+  lua_setfield(L, -2, "loaded");
+  /* set field `preload' */
+  luaL_getsubtable(L, LUA_REGISTRYINDEX, "_PRELOAD");
+  lua_setfield(L, -2, "preload");
+  lua_pushglobaltable(L);
+  lua_pushvalue(L, -2);  /* set 'package' as upvalue for next lib */
+  luaL_setfuncs(L, ll_funcs, 1);  /* open lib into global table */
+  lua_pop(L, 1);  /* pop global table */
+  return 1;  /* return 'package' table */
+}
+
diff --git a/ext/lua/src/lobject.c b/ext/lua/src/lobject.c
new file mode 100644
index 0000000..c152785
--- /dev/null
+++ b/ext/lua/src/lobject.c
@@ -0,0 +1,287 @@
+/*
+** $Id: lobject.c,v 2.58 2013/02/20 14:08:56 roberto Exp $
+** Some generic functions over Lua objects
+** See Copyright Notice in lua.h
+*/
+
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define lobject_c
+#define LUA_CORE
+
+#include "lua.h"
+
+#include "lctype.h"
+#include "ldebug.h"
+#include "ldo.h"
+#include "lmem.h"
+#include "lobject.h"
+#include "lstate.h"
+#include "lstring.h"
+#include "lvm.h"
+
+
+
+LUAI_DDEF const TValue luaO_nilobject_ = {NILCONSTANT};
+
+
+/*
+** converts an integer to a "floating point byte", represented as
+** (eeeeexxx), where the real value is (1xxx) * 2^(eeeee - 1) if
+** eeeee != 0 and (xxx) otherwise.
+*/
+int luaO_int2fb (unsigned int x) {
+  int e = 0;  /* exponent */
+  if (x < 8) return x;  /* fits in (xxx) with eeeee == 0 */
+  while (x >= 0x10) {  /* reduce 'x' to the range [8, 15] */
+    x = (x+1) >> 1;  /* halve, rounding up */
+    e++;
+  }
+  return ((e+1) << 3) | (cast_int(x) - 8);  /* '- 8' drops the leading 1 */
+}
+
+
+/* converts a "floating point byte" (eeeeexxx) back to an integer */
+int luaO_fb2int (int x) {
+  int e = (x >> 3) & 0x1f;  /* exponent field (eeeee) */
+  if (e == 0) return x;  /* value is (xxx) itself */
+  else return ((x & 7) + 8) << (e - 1);  /* (1xxx) * 2^(e - 1) */
+}
+
+/* computes ceil(log2(x)); 'log_2[i]' holds the result for x == i + 1 */
+int luaO_ceillog2 (unsigned int x) {
+  static const lu_byte log_2[256] = {
+    0,1,2,2,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,
+    6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,
+    7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+    7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
+    8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+    8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+    8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,
+    8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
+  };
+  int l = 0;
+  x--;  /* table is indexed by x - 1 */
+  while (x >= 256) { l += 8; x >>= 8; }  /* each byte dropped adds 8 bits */
+  return l + log_2[x];
+}
+
+/* applies arithmetic operation 'op' to 'v1' and 'v2' (unary minus ignores 'v2') */
+lua_Number luaO_arith (int op, lua_Number v1, lua_Number v2) {
+  switch (op) {
+    case LUA_OPADD: return luai_numadd(NULL, v1, v2);
+    case LUA_OPSUB: return luai_numsub(NULL, v1, v2);
+    case LUA_OPMUL: return luai_nummul(NULL, v1, v2);
+    case LUA_OPDIV: return luai_numdiv(NULL, v1, v2);
+    case LUA_OPMOD: return luai_nummod(NULL, v1, v2);
+    case LUA_OPPOW: return luai_numpow(NULL, v1, v2);
+    case LUA_OPUNM: return luai_numunm(NULL, v1);
+    default: lua_assert(0); return 0;  /* unknown operation */
+  }
+}
+
+/* value of hexa digit 'c'; no validation (assumes 'c' is a hexadecimal digit) */
+int luaO_hexavalue (int c) {
+  if (lisdigit(c)) return c - '0';
+  else return ltolower(c) - 'a' + 10;  /* letters 'a'.. count from 10 */
+}
+
+
+#if !defined(lua_strx2number)
+
+#include <math.h>
+
+/* skips an optional sign at '*s'; returns 1 if it was '-', 0 otherwise */
+static int isneg (const char **s) {
+  if (**s == '-') { (*s)++; return 1; }
+  else if (**s == '+') (*s)++;
+  return 0;
+}
+
+/* accumulates the hexa digits at '*s' into 'r', counting them in '*count' */
+static lua_Number readhexa (const char **s, lua_Number r, int *count) {
+  for (; lisxdigit(cast_uchar(**s)); (*s)++) {  /* read a run of digits */
+    r = (r * cast_num(16.0)) + cast_num(luaO_hexavalue(cast_uchar(**s)));
+    (*count)++;
+  }
+  return r;
+}
+
+
+/*
+** convert an hexadecimal numeric string to a number, following
+** C99 specification for 'strtod'; '*endptr' stays at 's' if nothing is valid
+*/
+static lua_Number lua_strx2number (const char *s, char **endptr) {
+  lua_Number r = 0.0;
+  int e = 0, i = 0;  /* number of fractional digits / of integer digits */
+  int neg = 0;  /* 1 if number is negative */
+  *endptr = cast(char *, s);  /* nothing is valid yet */
+  while (lisspace(cast_uchar(*s))) s++;  /* skip initial spaces */
+  neg = isneg(&s);  /* check sign */
+  if (!(*s == '0' && (*(s + 1) == 'x' || *(s + 1) == 'X')))  /* check '0x' */
+    return 0.0;  /* invalid format (no '0x') */
+  s += 2;  /* skip '0x' */
+  r = readhexa(&s, r, &i);  /* read integer part */
+  if (*s == '.') {
+    s++;  /* skip dot */
+    r = readhexa(&s, r, &e);  /* read fractional part */
+  }
+  if (i == 0 && e == 0)
+    return 0.0;  /* invalid format (no digit) */
+  e *= -4;  /* each fractional digit divides value by 2^4 */
+  *endptr = cast(char *, s);  /* valid up to here */
+  if (*s == 'p' || *s == 'P') {  /* exponent part? */
+    int exp1 = 0;
+    int neg1;
+    s++;  /* skip 'p' */
+    neg1 = isneg(&s);  /* sign of the exponent */
+    if (!lisdigit(cast_uchar(*s)))
+      goto ret;  /* must have at least one digit */
+    while (lisdigit(cast_uchar(*s)))  /* read exponent */
+      exp1 = exp1 * 10 + *(s++) - '0';
+    if (neg1) exp1 = -exp1;
+    e += exp1;
+  }
+  *endptr = cast(char *, s);  /* valid up to here */
+ ret:
+  if (neg) r = -r;
+  return l_mathop(ldexp)(r, e);  /* r * 2^e */
+}
+
+#endif
+
+/* converts string 's' (length 'len') into '*result'; returns 1 iff it succeeds */
+int luaO_str2d (const char *s, size_t len, lua_Number *result) {
+  char *endptr;
+  if (strpbrk(s, "nN"))  /* reject 'inf' and 'nan' */
+    return 0;
+  else if (strpbrk(s, "xX"))  /* hexa? */
+    *result = lua_strx2number(s, &endptr);
+  else
+    *result = lua_str2number(s, &endptr);
+  if (endptr == s) return 0;  /* nothing recognized */
+  while (lisspace(cast_uchar(*endptr))) endptr++;  /* skip trailing spaces */
+  return (endptr == s + len);  /* OK if no trailing characters */
+}
+
+
+/* pushes on the stack a new string with the first 'l' chars of 'str' */
+static void pushstr (lua_State *L, const char *str, size_t l) {
+  setsvalue2s(L, L->top++, luaS_newlstr(L, str, l));
+}
+
+
+/* this function handles only `%d', `%c', `%f', `%p', `%s', and `%%' formats */
+const char *luaO_pushvfstring (lua_State *L, const char *fmt, va_list argp) {
+  int n = 0;  /* number of values pushed inside the loop */
+  for (;;) {
+    const char *e = strchr(fmt, '%');
+    if (e == NULL) break;
+    luaD_checkstack(L, 2);  /* fmt + item */
+    pushstr(L, fmt, e - fmt);  /* literal text before the '%' */
+    switch (*(e+1)) {
+      case 's': {
+        const char *s = va_arg(argp, char *);
+        if (s == NULL) s = "(null)";
+        pushstr(L, s, strlen(s));
+        break;
+      }
+      case 'c': {
+        char buff;
+        buff = cast(char, va_arg(argp, int));
+        pushstr(L, &buff, 1);
+        break;
+      }
+      case 'd': {
+        setnvalue(L->top++, cast_num(va_arg(argp, int)));
+        break;
+      }
+      case 'f': {
+        setnvalue(L->top++, cast_num(va_arg(argp, l_uacNumber)));
+        break;
+      }
+      case 'p': {
+        char buff[4*sizeof(void *) + 8]; /* should be enough space for a `%p' */
+        int l = sprintf(buff, "%p", va_arg(argp, void *));
+        pushstr(L, buff, l);
+        break;
+      }
+      case '%': {
+        pushstr(L, "%", 1);
+        break;
+      }
+      default: {
+        luaG_runerror(L,
+            "invalid option " LUA_QL("%%%c") " to " LUA_QL("lua_pushfstring"),
+            *(e + 1));
+      }
+    }
+    n += 2;  /* text piece + converted item */
+    fmt = e+2;  /* continue after the two-char specifier */
+  }
+  luaD_checkstack(L, 1);
+  pushstr(L, fmt, strlen(fmt));  /* rest of the format */
+  if (n > 0) luaV_concat(L, n + 1);
+  return svalue(L->top - 1);
+}
+
+/* variadic front end: collects its arguments and calls 'luaO_pushvfstring' */
+const char *luaO_pushfstring (lua_State *L, const char *fmt, ...) {
+  const char *msg;
+  va_list argp;
+  va_start(argp, fmt);
+  msg = luaO_pushvfstring(L, fmt, argp);
+  va_end(argp);
+  return msg;
+}
+
+
+/* number of chars of a literal string without the ending \0 */
+#define LL(x)	(sizeof(x)/sizeof(char) - 1)
+
+#define RETS	"..."
+#define PRE	"[string \""
+#define POS	"\"]"
+
+#define addstr(a,b,l)	( memcpy(a,b,(l) * sizeof(char)), a += (l) )
+/* writes into 'out' (of size 'bufflen') a printable version of 'source' */
+void luaO_chunkid (char *out, const char *source, size_t bufflen) {
+  size_t l = strlen(source);
+  if (*source == '=') {  /* 'literal' source */
+    if (l <= bufflen)  /* small enough? */
+      memcpy(out, source + 1, l * sizeof(char));  /* includes the '\0' */
+    else {  /* truncate it */
+      addstr(out, source + 1, bufflen - 1);
+      *out = '\0';
+    }
+  }
+  else if (*source == '@') {  /* file name */
+    if (l <= bufflen)  /* small enough? */
+      memcpy(out, source + 1, l * sizeof(char));  /* includes the '\0' */
+    else {  /* add '...' before rest of name */
+      addstr(out, RETS, LL(RETS));
+      bufflen -= LL(RETS);
+      memcpy(out, source + 1 + l - bufflen, bufflen * sizeof(char));
+    }
+  }
+  else {  /* string; format as [string "source"] */
+    const char *nl = strchr(source, '\n');  /* find first new line (if any) */
+    addstr(out, PRE, LL(PRE));  /* add prefix */
+    bufflen -= LL(PRE RETS POS) + 1;  /* save space for prefix+suffix+'\0' */
+    if (l < bufflen && nl == NULL) {  /* small one-line source? */
+      addstr(out, source, l);  /* keep it */
+    }
+    else {
+      if (nl != NULL) l = nl - source;  /* stop at first newline */
+      if (l > bufflen) l = bufflen;
+      addstr(out, source, l);
+      addstr(out, RETS, LL(RETS));  /* mark the cut with '...' */
+    }
+    memcpy(out, POS, (LL(POS) + 1) * sizeof(char));  /* suffix and '\0' */
+  }
+}
+
diff --git a/ext/lua/src/lopcodes.c b/ext/lua/src/lopcodes.c
new file mode 100644
index 0000000..ef73692
--- /dev/null
+++ b/ext/lua/src/lopcodes.c
@@ -0,0 +1,107 @@
+/*
+** $Id: lopcodes.c,v 1.49 2012/05/14 13:34:18 roberto Exp $
+** Opcodes for Lua virtual machine
+** See Copyright Notice in lua.h
+*/
+
+
+#define lopcodes_c
+#define LUA_CORE
+
+
+#include "lopcodes.h"
+
+
+/* ORDER OP */
+/* names of the opcodes; the extra (last) entry is a terminating NULL */
+LUAI_DDEF const char *const luaP_opnames[NUM_OPCODES+1] = {
+  "MOVE",
+  "LOADK",
+  "LOADKX",
+  "LOADBOOL",
+  "LOADNIL",
+  "GETUPVAL",
+  "GETTABUP",
+  "GETTABLE",
+  "SETTABUP",
+  "SETUPVAL",
+  "SETTABLE",
+  "NEWTABLE",
+  "SELF",
+  "ADD",
+  "SUB",
+  "MUL",
+  "DIV",
+  "MOD",
+  "POW",
+  "UNM",
+  "NOT",
+  "LEN",
+  "CONCAT",
+  "JMP",
+  "EQ",
+  "LT",
+  "LE",
+  "TEST",
+  "TESTSET",
+  "CALL",
+  "TAILCALL",
+  "RETURN",
+  "FORLOOP",
+  "FORPREP",
+  "TFORCALL",
+  "TFORLOOP",
+  "SETLIST",
+  "CLOSURE",
+  "VARARG",
+  "EXTRAARG",
+  NULL
+};
+
+/* packs the properties of an opcode: t<<7 | a<<6 | b<<4 | c<<2 | mode */
+#define opmode(t,a,b,c,m) (((t)<<7) | ((a)<<6) | ((b)<<4) | ((c)<<2) | (m))
+
+LUAI_DDEF const lu_byte luaP_opmodes[NUM_OPCODES] = {
+/*       T  A    B       C     mode		   opcode	*/
+  opmode(0, 1, OpArgR, OpArgN, iABC)		/* OP_MOVE */
+ ,opmode(0, 1, OpArgK, OpArgN, iABx)		/* OP_LOADK */
+ ,opmode(0, 1, OpArgN, OpArgN, iABx)		/* OP_LOADKX */
+ ,opmode(0, 1, OpArgU, OpArgU, iABC)		/* OP_LOADBOOL */
+ ,opmode(0, 1, OpArgU, OpArgN, iABC)		/* OP_LOADNIL */
+ ,opmode(0, 1, OpArgU, OpArgN, iABC)		/* OP_GETUPVAL */
+ ,opmode(0, 1, OpArgU, OpArgK, iABC)		/* OP_GETTABUP */
+ ,opmode(0, 1, OpArgR, OpArgK, iABC)		/* OP_GETTABLE */
+ ,opmode(0, 0, OpArgK, OpArgK, iABC)		/* OP_SETTABUP */
+ ,opmode(0, 0, OpArgU, OpArgN, iABC)		/* OP_SETUPVAL */
+ ,opmode(0, 0, OpArgK, OpArgK, iABC)		/* OP_SETTABLE */
+ ,opmode(0, 1, OpArgU, OpArgU, iABC)		/* OP_NEWTABLE */
+ ,opmode(0, 1, OpArgR, OpArgK, iABC)		/* OP_SELF */
+ ,opmode(0, 1, OpArgK, OpArgK, iABC)		/* OP_ADD */
+ ,opmode(0, 1, OpArgK, OpArgK, iABC)		/* OP_SUB */
+ ,opmode(0, 1, OpArgK, OpArgK, iABC)		/* OP_MUL */
+ ,opmode(0, 1, OpArgK, OpArgK, iABC)		/* OP_DIV */
+ ,opmode(0, 1, OpArgK, OpArgK, iABC)		/* OP_MOD */
+ ,opmode(0, 1, OpArgK, OpArgK, iABC)		/* OP_POW */
+ ,opmode(0, 1, OpArgR, OpArgN, iABC)		/* OP_UNM */
+ ,opmode(0, 1, OpArgR, OpArgN, iABC)		/* OP_NOT */
+ ,opmode(0, 1, OpArgR, OpArgN, iABC)		/* OP_LEN */
+ ,opmode(0, 1, OpArgR, OpArgR, iABC)		/* OP_CONCAT */
+ ,opmode(0, 0, OpArgR, OpArgN, iAsBx)		/* OP_JMP */
+ ,opmode(1, 0, OpArgK, OpArgK, iABC)		/* OP_EQ */
+ ,opmode(1, 0, OpArgK, OpArgK, iABC)		/* OP_LT */
+ ,opmode(1, 0, OpArgK, OpArgK, iABC)		/* OP_LE */
+ ,opmode(1, 0, OpArgN, OpArgU, iABC)		/* OP_TEST */
+ ,opmode(1, 1, OpArgR, OpArgU, iABC)		/* OP_TESTSET */
+ ,opmode(0, 1, OpArgU, OpArgU, iABC)		/* OP_CALL */
+ ,opmode(0, 1, OpArgU, OpArgU, iABC)		/* OP_TAILCALL */
+ ,opmode(0, 0, OpArgU, OpArgN, iABC)		/* OP_RETURN */
+ ,opmode(0, 1, OpArgR, OpArgN, iAsBx)		/* OP_FORLOOP */
+ ,opmode(0, 1, OpArgR, OpArgN, iAsBx)		/* OP_FORPREP */
+ ,opmode(0, 0, OpArgN, OpArgU, iABC)		/* OP_TFORCALL */
+ ,opmode(0, 1, OpArgR, OpArgN, iAsBx)		/* OP_TFORLOOP */
+ ,opmode(0, 0, OpArgU, OpArgU, iABC)		/* OP_SETLIST */
+ ,opmode(0, 1, OpArgU, OpArgN, iABx)		/* OP_CLOSURE */
+ ,opmode(0, 1, OpArgU, OpArgN, iABC)		/* OP_VARARG */
+ ,opmode(0, 0, OpArgU, OpArgU, iAx)		/* OP_EXTRAARG */
+};
+
diff --git a/ext/lua/src/loslib.c b/ext/lua/src/loslib.c
new file mode 100644
index 0000000..5170fd0
--- /dev/null
+++ b/ext/lua/src/loslib.c
@@ -0,0 +1,323 @@
+/*
+** $Id: loslib.c,v 1.40 2012/10/19 15:54:02 roberto Exp $
+** Standard Operating System library
+** See Copyright Notice in lua.h
+*/
+
+
+#include <errno.h>
+#include <locale.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+#define loslib_c
+#define LUA_LIB
+
+#include "lua.h"
+
+#include "lauxlib.h"
+#include "lualib.h"
+
+
+/* valid 'strftime' conversion specifiers, as pairs of strings (allowed first
+** chars, allowed second chars); an empty second string means one-char options.
+** NOTE: in the POSIX list, the adjacent "" "" concatenate into a single "". */
+#if !defined(LUA_STRFTIMEOPTIONS)
+
+#if !defined(LUA_USE_POSIX)
+#define LUA_STRFTIMEOPTIONS	{ "aAbBcdHIjmMpSUwWxXyYz%", "" }
+#else
+#define LUA_STRFTIMEOPTIONS \
+	{ "aAbBcCdDeFgGhHIjmMnprRStTuUVwWxXyYzZ%", "" \
+	  "", "E", "cCxXyY",  \
+	  "O", "deHImMSuUVwWy" }
+#endif
+
+#endif
+
+
+
+/*
+** By default, Lua uses tmpnam; when LUA_USE_MKSTEMP is defined (POSIX), it
+** uses mkstemp instead. In both versions, 'e' gets true iff the call failed.
+*/
+#if defined(LUA_USE_MKSTEMP)
+#include <unistd.h>
+#define LUA_TMPNAMBUFSIZE	32
+#define lua_tmpnam(b,e) { \
+        strcpy(b, "/tmp/lua_XXXXXX"); \
+        e = mkstemp(b); \
+        if (e != -1) close(e); \
+        e = (e == -1); }
+
+#elif !defined(lua_tmpnam)
+
+#define LUA_TMPNAMBUFSIZE	L_tmpnam
+#define lua_tmpnam(b,e)		{ e = (tmpnam(b) == NULL); }
+
+#endif
+
+
+/*
+** By default, Lua uses gmtime/localtime (ignoring the buffer 'r'); when
+** LUA_USE_GMTIME_R is defined (POSIX), it uses gmtime_r/localtime_r
+*/
+#if defined(LUA_USE_GMTIME_R)
+
+#define l_gmtime(t,r)		gmtime_r(t,r)
+#define l_localtime(t,r)	localtime_r(t,r)
+
+#elif !defined(l_gmtime)
+
+#define l_gmtime(t,r)		((void)r, gmtime(t))
+#define l_localtime(t,r)  	((void)r, localtime(t))
+
+#endif
+
+
+/* 'os.execute': passes 'cmd' to 'system'; without 'cmd', checks for a shell */
+static int os_execute (lua_State *L) {
+  const char *cmd = luaL_optstring(L, 1, NULL);
+  int stat = system(cmd);  /* 'cmd' may be NULL here */
+  if (cmd != NULL)
+    return luaL_execresult(L, stat);
+  else {
+    lua_pushboolean(L, stat);  /* true if there is a shell */
+    return 1;
+  }
+}
+
+/* 'os.remove': calls 'remove' on the given name; result via 'luaL_fileresult' */
+static int os_remove (lua_State *L) {
+  const char *filename = luaL_checkstring(L, 1);
+  return luaL_fileresult(L, remove(filename) == 0, filename);
+}
+
+/* 'os.rename': calls 'rename' on the two names; result via 'luaL_fileresult' */
+static int os_rename (lua_State *L) {
+  const char *fromname = luaL_checkstring(L, 1);
+  const char *toname = luaL_checkstring(L, 2);
+  return luaL_fileresult(L, rename(fromname, toname) == 0, NULL);
+}
+
+/* 'os.tmpname': returns the name produced by 'lua_tmpnam' (error if it fails) */
+static int os_tmpname (lua_State *L) {
+  char buff[LUA_TMPNAMBUFSIZE];
+  int err;
+  lua_tmpnam(buff, err);  /* macro: fills 'buff' and sets 'err' */
+  if (err)
+    return luaL_error(L, "unable to generate a unique filename");
+  lua_pushstring(L, buff);
+  return 1;
+}
+
+/* 'os.getenv': returns the value of the given environment variable, or nil */
+static int os_getenv (lua_State *L) {
+  lua_pushstring(L, getenv(luaL_checkstring(L, 1)));  /* if NULL push nil */
+  return 1;
+}
+
+/* 'os.clock': returns the result of 'clock()' converted to seconds */
+static int os_clock (lua_State *L) {
+  lua_pushnumber(L, ((lua_Number)clock())/(lua_Number)CLOCKS_PER_SEC);
+  return 1;
+}
+
+
+/*
+** {======================================================
+** Time/Date operations
+** { year=%Y, month=%m, day=%d, hour=%H, min=%M, sec=%S,
+**   wday=%w+1, yday=%j, isdst=? }
+** =======================================================
+*/
+/* sets field 'key' of the table on top of the stack to integer 'value' */
+static void setfield (lua_State *L, const char *key, int value) {
+  lua_pushinteger(L, value);
+  lua_setfield(L, -2, key);
+}
+/* sets field 'key' of the table on top to boolean 'value', unless it is < 0 */
+static void setboolfield (lua_State *L, const char *key, int value) {
+  if (value < 0)  /* undefined? */
+    return;  /* does not set field */
+  lua_pushboolean(L, value);
+  lua_setfield(L, -2, key);
+}
+/* returns field 'key' of the table on top as a boolean, or -1 if it is nil */
+static int getboolfield (lua_State *L, const char *key) {
+  int res;
+  lua_getfield(L, -1, key);
+  res = lua_isnil(L, -1) ? -1 : lua_toboolean(L, -1);
+  lua_pop(L, 1);  /* remove field value */
+  return res;
+}
+
+/* returns field 'key' of the table on top as an int; default 'd' (error if < 0) */
+static int getfield (lua_State *L, const char *key, int d) {
+  int res, isnum;
+  lua_getfield(L, -1, key);
+  res = (int)lua_tointegerx(L, -1, &isnum);
+  if (!isnum) {  /* field is not a number? */
+    if (d < 0)  /* no default for this field? */
+      return luaL_error(L, "field " LUA_QS " missing in date table", key);
+    res = d;
+  }
+  lua_pop(L, 1);  /* remove field value */
+  return res;
+}
+
+/* validates the specifier at 'conv', copying it to 'buff' after its first char */
+static const char *checkoption (lua_State *L, const char *conv, char *buff) {
+  static const char *const options[] = LUA_STRFTIMEOPTIONS;
+  unsigned int i;
+  for (i = 0; i < sizeof(options)/sizeof(options[0]); i += 2) {  /* by pairs */
+    if (*conv != '\0' && strchr(options[i], *conv) != NULL) {
+      buff[1] = *conv;
+      if (*options[i + 1] == '\0') {  /* one-char conversion specifier? */
+        buff[2] = '\0';  /* end buffer */
+        return conv + 1;  /* position after the specifier */
+      }
+      else if (*(conv + 1) != '\0' &&
+               strchr(options[i + 1], *(conv + 1)) != NULL) {
+        buff[2] = *(conv + 1);  /* valid two-char conversion specifier */
+        buff[3] = '\0';  /* end buffer */
+        return conv + 2;  /* position after the specifier */
+      }
+    }
+  }
+  luaL_argerror(L, 1,
+    lua_pushfstring(L, "invalid conversion specifier '%%%s'", conv));
+  return conv;  /* to avoid warnings */
+}
+
+/* 'os.date': formats a time (arg. 2, default now) by 's', or as a table if "*t" */
+static int os_date (lua_State *L) {
+  const char *s = luaL_optstring(L, 1, "%c");
+  time_t t = luaL_opt(L, (time_t)luaL_checknumber, 2, time(NULL));
+  struct tm tmr, *stm;
+  if (*s == '!') {  /* UTC? */
+    stm = l_gmtime(&t, &tmr);
+    s++;  /* skip `!' */
+  }
+  else
+    stm = l_localtime(&t, &tmr);
+  if (stm == NULL)  /* invalid date? */
+    lua_pushnil(L);
+  else if (strcmp(s, "*t") == 0) {
+    lua_createtable(L, 0, 9);  /* 9 = number of fields */
+    setfield(L, "sec", stm->tm_sec);
+    setfield(L, "min", stm->tm_min);
+    setfield(L, "hour", stm->tm_hour);
+    setfield(L, "day", stm->tm_mday);
+    setfield(L, "month", stm->tm_mon+1);
+    setfield(L, "year", stm->tm_year+1900);
+    setfield(L, "wday", stm->tm_wday+1);
+    setfield(L, "yday", stm->tm_yday+1);
+    setboolfield(L, "isdst", stm->tm_isdst);
+  }
+  else {
+    char cc[4];  /* '%' + specifier (1 or 2 chars) + '\0' */
+    luaL_Buffer b;
+    cc[0] = '%';
+    luaL_buffinit(L, &b);
+    while (*s) {
+      if (*s != '%')  /* no conversion specifier? */
+        luaL_addchar(&b, *s++);
+      else {
+        size_t reslen;
+        char buff[200];  /* should be big enough for any conversion result */
+        s = checkoption(L, s + 1, cc);  /* fills 'cc' from position 1 on */
+        reslen = strftime(buff, sizeof(buff), cc, stm);
+        luaL_addlstring(&b, buff, reslen);
+      }
+    }
+    luaL_pushresult(&b);
+  }
+  return 1;
+}
+
+/* 'os.time': current time, or the 'mktime' of a date table; nil if that is -1 */
+static int os_time (lua_State *L) {
+  time_t t;
+  if (lua_isnoneornil(L, 1))  /* called without args? */
+    t = time(NULL);  /* get current time */
+  else {
+    struct tm ts;
+    luaL_checktype(L, 1, LUA_TTABLE);
+    lua_settop(L, 1);  /* make sure table is at the top */
+    ts.tm_sec = getfield(L, "sec", 0);
+    ts.tm_min = getfield(L, "min", 0);
+    ts.tm_hour = getfield(L, "hour", 12);
+    ts.tm_mday = getfield(L, "day", -1);  /* -1: field is mandatory */
+    ts.tm_mon = getfield(L, "month", -1) - 1;
+    ts.tm_year = getfield(L, "year", -1) - 1900;
+    ts.tm_isdst = getboolfield(L, "isdst");  /* -1 if field is absent */
+    t = mktime(&ts);
+  }
+  if (t == (time_t)(-1))
+    lua_pushnil(L);
+  else
+    lua_pushnumber(L, (lua_Number)t);
+  return 1;
+}
+
+/* 'os.difftime': returns 'difftime' of arg. 1 and arg. 2 (arg. 2 defaults to 0) */
+static int os_difftime (lua_State *L) {
+  lua_pushnumber(L, difftime((time_t)(luaL_checknumber(L, 1)),
+                             (time_t)(luaL_optnumber(L, 2, 0))));
+  return 1;
+}
+
+/* }====================================================== */
+
+/* 'os.setlocale': calls 'setlocale' for the category named by arg. 2 ("all") */
+static int os_setlocale (lua_State *L) {
+  static const int cat[] = {LC_ALL, LC_COLLATE, LC_CTYPE, LC_MONETARY,
+                      LC_NUMERIC, LC_TIME};
+  static const char *const catnames[] = {"all", "collate", "ctype", "monetary",
+     "numeric", "time", NULL};  /* same order as 'cat' */
+  const char *l = luaL_optstring(L, 1, NULL);
+  int op = luaL_checkoption(L, 2, "all", catnames);
+  lua_pushstring(L, setlocale(cat[op], l));
+  return 1;
+}
+
+/* 'os.exit': calls 'exit' with a status from arg. 1 (a boolean or an integer) */
+static int os_exit (lua_State *L) {
+  int status;
+  if (lua_isboolean(L, 1))
+    status = (lua_toboolean(L, 1) ? EXIT_SUCCESS : EXIT_FAILURE);
+  else
+    status = luaL_optint(L, 1, EXIT_SUCCESS);
+  if (lua_toboolean(L, 2))  /* asked to close the state first? */
+    lua_close(L);
+  if (L) exit(status);  /* 'if' to avoid warnings for unreachable 'return' */
+  return 0;
+}
+
+/* functions of the library (used by 'luaopen_os' to fill its table) */
+static const luaL_Reg syslib[] = {
+  {"clock",     os_clock},
+  {"date",      os_date},
+  {"difftime",  os_difftime},
+  {"execute",   os_execute},
+  {"exit",      os_exit},
+  {"getenv",    os_getenv},
+  {"remove",    os_remove},
+  {"rename",    os_rename},
+  {"setlocale", os_setlocale},
+  {"time",      os_time},
+  {"tmpname",   os_tmpname},
+  {NULL, NULL}
+};
+
+/* }====================================================== */
+
+
+/* opens the library: returns a new table with the functions in 'syslib' */
+LUAMOD_API int luaopen_os (lua_State *L) {
+  luaL_newlib(L, syslib);
+  return 1;
+}
+
diff --git a/ext/lua/src/lparser.c b/ext/lua/src/lparser.c
new file mode 100644
index 0000000..d8f5b4f
--- /dev/null
+++ b/ext/lua/src/lparser.c
@@ -0,0 +1,1638 @@
+/*
+** $Id: lparser.c,v 2.130 2013/02/06 13:37:39 roberto Exp $
+** Lua Parser
+** See Copyright Notice in lua.h
+*/
+
+
+#include <string.h>
+
+#define lparser_c
+#define LUA_CORE
+
+#include "lua.h"
+
+#include "lcode.h"
+#include "ldebug.h"
+#include "ldo.h"
+#include "lfunc.h"
+#include "llex.h"
+#include "lmem.h"
+#include "lobject.h"
+#include "lopcodes.h"
+#include "lparser.h"
+#include "lstate.h"
+#include "lstring.h"
+#include "ltable.h"
+
+
+
+/* maximum number of local variables per function (must be smaller
+   than 250, due to the bytecode format) */
+#define MAXVARS		200
+
+
+#define hasmultret(k)		((k) == VCALL || (k) == VVARARG)
+
+
+
+/*
+** nodes for block list (list of active blocks)
+*/
+typedef struct BlockCnt {  /* filled in by enterblock, unlinked again by leaveblock */
+  struct BlockCnt *previous;  /* chain */
+  short firstlabel;  /* index of first label in this block */
+  short firstgoto;  /* index of first pending goto in this block */
+  lu_byte nactvar;  /* # active locals outside the block */
+  lu_byte upval;  /* true if some variable in the block is an upvalue */
+  lu_byte isloop;  /* true if `block' is a loop */
+} BlockCnt;
+
+
+
+/*
+** prototypes for recursive non-terminal functions
+*/
+static void statement (LexState *ls);
+static void expr (LexState *ls, expdesc *v);
+
+
+static void anchor_token (LexState *ls) {  /* pass the current NAME/STRING token's text through luaX_newstring again */
+  /* last token from outer function must be EOS */
+  lua_assert(ls->fs != NULL || ls->t.token == TK_EOS);
+  if (ls->t.token == TK_NAME || ls->t.token == TK_STRING) {  /* other tokens are left alone */
+    TString *ts = ls->t.seminfo.ts;
+    luaX_newstring(ls, getstr(ts), ts->tsv.len);  /* return value unused; called for its effect */
+  }
+}
+
+
+/* semantic error */
+static l_noret semerror (LexState *ls, const char *msg) {  /* raise 'msg' through luaX_syntaxerror; does not return */
+  ls->t.token = 0;  /* remove 'near to' from final message */
+  luaX_syntaxerror(ls, msg);
+}
+
+
+static l_noret error_expected (LexState *ls, int token) {  /* raise the syntax error "<token> expected" */
+  luaX_syntaxerror(ls,
+      luaO_pushfstring(ls->L, "%s expected", luaX_token2str(ls, token)));  /* token is rendered by luaX_token2str */
+}
+
+
+static l_noret errorlimit (FuncState *fs, int limit, const char *what) {
+  lua_State *L = fs->ls->L;
+  int line = fs->f->linedefined;
+  const char *where;  /* names the function in which the limit was hit */
+  if (line != 0)
+    where = luaO_pushfstring(L, "function at line %d", line);
+  else
+    where = "main function";
+  luaX_syntaxerror(fs->ls, luaO_pushfstring(L,
+      "too many %s (limit is %d) in %s", what, limit, where));
+}
+
+
+static void checklimit (FuncState *fs, int v, int l, const char *what) {  /* error out when value 'v' exceeds limit 'l' */
+  if (v > l) errorlimit(fs, l, what);  /* 'what' names the counted thing in the message */
+}
+
+
+static int testnext (LexState *ls, int c) {
+  /* consume the current token only when it is 'c'; report whether it was */
+  if (ls->t.token != c)
+    return 0;  /* no match: token stays where it is */
+  luaX_next(ls);
+  return 1;
+}
+
+
+static void check (LexState *ls, int c) {
+  if (ls->t.token == c) return;  /* current token is the expected one */
+  error_expected(ls, c);  /* otherwise raise "<c> expected" */
+}
+
+
+static void checknext (LexState *ls, int c) {
+  if (ls->t.token != c) error_expected(ls, c);  /* same test check() makes */
+  luaX_next(ls);  /* token matched: move past it */
+}
+
+
+#define check_condition(ls,c,msg)	{ if (!(c)) luaX_syntaxerror(ls, msg); }
+
+
+
+static void check_match (LexState *ls, int what, int who, int where) {
+  const char *msg;
+  if (testnext(ls, what))
+    return;  /* closing token found and consumed */
+  if (where == ls->linenumber)  /* opener is on the current line */
+    error_expected(ls, what);  /* short message; does not return */
+  msg = luaO_pushfstring(ls->L,
+         "%s expected (to close %s at line %d)",
+          luaX_token2str(ls, what), luaX_token2str(ls, who), where);
+  luaX_syntaxerror(ls, msg);
+}
+
+
+static TString *str_checkname (LexState *ls) {
+  TString *name;
+  check(ls, TK_NAME);  /* error unless the current token is a name */
+  name = ls->t.seminfo.ts;  /* read the string before advancing */
+  luaX_next(ls);
+  return name;
+}
+
+
+static void init_exp (expdesc *e, expkind k, int i) {
+  e->k = k;
+  e->u.info = i;
+  e->t = NO_JUMP;  /* 't' and 'f' both start out as NO_JUMP */
+  e->f = NO_JUMP;
+}
+
+
+static void codestring (LexState *ls, expdesc *e, TString *s) {  /* make 'e' a VK expression for string 's' */
+  init_exp(e, VK, luaK_stringK(ls->fs, s));  /* info is whatever luaK_stringK returns for 's' */
+}
+
+
+static void checkname (LexState *ls, expdesc *e) {  /* read a NAME token and store it in 'e' as a string expression */
+  codestring(ls, e, str_checkname(ls));
+}
+
+
+static int registerlocalvar (LexState *ls, TString *varname) {  /* append 'varname' to the prototype's locvars; returns its index */
+  FuncState *fs = ls->fs;
+  Proto *f = fs->f;
+  int oldsize = f->sizelocvars;  /* size before any growth below */
+  luaM_growvector(ls->L, f->locvars, fs->nlocvars, f->sizelocvars,
+                  LocVar, SHRT_MAX, "local variables");
+  while (oldsize < f->sizelocvars) f->locvars[oldsize++].varname = NULL;  /* clear names in any newly added slots */
+  f->locvars[fs->nlocvars].varname = varname;
+  luaC_objbarrier(ls->L, f, varname);  /* 'f' now refers to 'varname' */
+  return fs->nlocvars++;  /* index just used; count is bumped afterwards */
+}
+
+
+static void new_localvar (LexState *ls, TString *name) {  /* declare a local: register it, then record it in dyd->actvar */
+  FuncState *fs = ls->fs;
+  Dyndata *dyd = ls->dyd;
+  int reg = registerlocalvar(ls, name);  /* index into the prototype's locvars */
+  checklimit(fs, dyd->actvar.n + 1 - fs->firstlocal,  /* entries belonging to this function, plus the new one */
+                  MAXVARS, "local variables");
+  luaM_growvector(ls->L, dyd->actvar.arr, dyd->actvar.n + 1,
+                  dyd->actvar.size, Vardesc, MAX_INT, "local variables");
+  dyd->actvar.arr[dyd->actvar.n++].idx = cast(short, reg);  /* not yet active; see adjustlocalvars */
+}
+
+
+static void new_localvarliteral_ (LexState *ls, const char *name, size_t sz) {  /* declare a local named by the 'sz' bytes at 'name' */
+  new_localvar(ls, luaX_newstring(ls, name, sz));
+}
+
+#define new_localvarliteral(ls,v) \
+	new_localvarliteral_(ls, "" v, (sizeof(v)/sizeof(char))-1)
+
+
+static LocVar *getlocvar (FuncState *fs, int i) {  /* LocVar record of the i-th local of 'fs', counted from fs->firstlocal */
+  int idx = fs->ls->dyd->actvar.arr[fs->firstlocal + i].idx;  /* index into f->locvars */
+  lua_assert(idx < fs->nlocvars);
+  return &fs->f->locvars[idx];
+}
+
+
+static void adjustlocalvars (LexState *ls, int nvars) {  /* activate the 'nvars' most recently declared locals */
+  FuncState *fs = ls->fs;
+  fs->nactvar = cast_byte(fs->nactvar + nvars);
+  for (; nvars; nvars--) {  /* visits exactly the variables just counted in */
+    getlocvar(fs, fs->nactvar - nvars)->startpc = fs->pc;  /* their range starts at the current pc */
+  }
+}
+
+
+static void removevars (FuncState *fs, int tolevel) {  /* deactivate locals until only 'tolevel' remain active */
+  fs->ls->dyd->actvar.n -= (fs->nactvar - tolevel);  /* drop their entries from dyd->actvar */
+  while (fs->nactvar > tolevel)
+    getlocvar(fs, --fs->nactvar)->endpc = fs->pc;  /* their range ends at the current pc */
+}
+
+
+static int searchupvalue (FuncState *fs, TString *name) {
+  Upvaldesc *uv = fs->f->upvalues;
+  int pos = 0;
+  /* linear scan over the upvalues this function has recorded so far */
+  for (; pos < fs->nups; pos++)
+    if (luaS_eqstr(uv[pos].name, name)) return pos;
+  return -1;  /* not found */
+}
+
+
+static int newupvalue (FuncState *fs, TString *name, expdesc *v) {  /* add upvalue 'name' described by 'v'; returns its index */
+  Proto *f = fs->f;
+  int oldsize = f->sizeupvalues;  /* size before any growth below */
+  checklimit(fs, fs->nups + 1, MAXUPVAL, "upvalues");
+  luaM_growvector(fs->ls->L, f->upvalues, fs->nups, f->sizeupvalues,
+                  Upvaldesc, MAXUPVAL, "upvalues");
+  while (oldsize < f->sizeupvalues) f->upvalues[oldsize++].name = NULL;  /* clear names in any newly added slots */
+  f->upvalues[fs->nups].instack = (v->k == VLOCAL);  /* 1 when 'v' is a local, 0 otherwise */
+  f->upvalues[fs->nups].idx = cast_byte(v->u.info);
+  f->upvalues[fs->nups].name = name;
+  luaC_objbarrier(fs->ls->L, f, name);  /* 'f' now refers to 'name' */
+  return fs->nups++;  /* index just used; count is bumped afterwards */
+}
+
+
+static int searchvar (FuncState *fs, TString *n) {
+  int level = cast_int(fs->nactvar);
+  while (--level >= 0) {  /* newest active local first */
+    if (luaS_eqstr(n, getlocvar(fs, level)->varname))
+      return level;
+  }
+  return -1;  /* not found */
+}
+
+
+/*
+  Mark block where variable at given level was defined
+  (to emit close instructions later).
+*/
+static void markupval (FuncState *fs, int level) {
+  BlockCnt *bl = fs->bl;  /* start at the innermost block */
+  while (bl->nactvar > level) bl = bl->previous;  /* skip blocks entered with more than 'level' locals active */
+  bl->upval = 1;  /* read later by leaveblock, findlabel and movegotosout */
+}
+
+
+/*
+  Find variable with given name 'n'. If it is an upvalue, add this
+  upvalue into all intermediate functions.
+*/
+static int singlevaraux (FuncState *fs, TString *n, expdesc *var, int base) {
+  int local, up;
+  if (fs == NULL)  /* no more levels? */
+    return VVOID;  /* default is global */
+  local = searchvar(fs, n);  /* look up locals at current level */
+  if (local >= 0) {  /* found? */
+    init_exp(var, VLOCAL, local);  /* variable is local */
+    if (!base)
+      markupval(fs, local);  /* local will be used as an upval */
+    return VLOCAL;
+  }
+  /* not found as local at current level; try upvalues */
+  up = searchupvalue(fs, n);  /* try existing upvalues */
+  if (up < 0) {  /* not found? */
+    /* try upper levels; 'base' is 0 there, so a local found in an
+       enclosing function gets marked as used by an upvalue */
+    if (singlevaraux(fs->prev, n, var, 0) == VVOID)
+      return VVOID;  /* not found; is a global */
+    /* else was LOCAL or UPVAL */
+    up = newupvalue(fs, n, var);  /* will be a new upvalue */
+  }
+  init_exp(var, VUPVAL, up);
+  return VUPVAL;
+}
+
+
+static void singlevar (LexState *ls, expdesc *var) {  /* parse a NAME and resolve it to a local, an upvalue, or env[name] */
+  TString *varname = str_checkname(ls);
+  FuncState *fs = ls->fs;
+  if (singlevaraux(fs, varname, var, 1) == VVOID) {  /* global name? */
+    expdesc key;
+    singlevaraux(fs, ls->envn, var, 1);  /* get environment variable */
+    lua_assert(var->k == VLOCAL || var->k == VUPVAL);  /* the environment name must always resolve */
+    codestring(ls, &key, varname);  /* key is variable name */
+    luaK_indexed(fs, var, &key);  /* env[varname] */
+  }
+}
+
+
+static void adjust_assign (LexState *ls, int nvars, int nexps, expdesc *e) {  /* reconcile 'nexps' values with 'nvars' targets; 'e' is the last expression */
+  FuncState *fs = ls->fs;
+  int extra = nvars - nexps;  /* values still missing (negative when there are too many) */
+  if (hasmultret(e->k)) {  /* last expression is a call or a vararg */
+    extra++;  /* includes call itself */
+    if (extra < 0) extra = 0;
+    luaK_setreturns(fs, e, extra);  /* last exp. provides the difference */
+    if (extra > 1) luaK_reserveregs(fs, extra-1);
+  }
+  else {
+    if (e->k != VVOID) luaK_exp2nextreg(fs, e);  /* close last expression */
+    if (extra > 0) {  /* fewer values than targets: fill the rest with nil */
+      int reg = fs->freereg;
+      luaK_reserveregs(fs, extra);
+      luaK_nil(fs, reg, extra);
+    }
+  }
+}
+
+
+static void enterlevel (LexState *ls) {  /* count one more nesting level; error once past LUAI_MAXCCALLS */
+  lua_State *L = ls->L;
+  ++L->nCcalls;  /* undone by the leavelevel macro */
+  checklimit(ls->fs, L->nCcalls, LUAI_MAXCCALLS, "C levels");
+}
+
+
+#define leavelevel(ls)	((ls)->L->nCcalls--)
+
+
+static void closegoto (LexState *ls, int g, Labeldesc *label) {  /* resolve pending goto number 'g' to 'label' and drop it from the list */
+  int i;
+  FuncState *fs = ls->fs;
+  Labellist *gl = &ls->dyd->gt;
+  Labeldesc *gt = &gl->arr[g];
+  lua_assert(luaS_eqstr(gt->name, label->name));
+  if (gt->nactvar < label->nactvar) {  /* label has more active locals than the goto had */
+    TString *vname = getlocvar(fs, gt->nactvar)->varname;  /* first local the jump would enter */
+    const char *msg = luaO_pushfstring(ls->L,
+      "<goto %s> at line %d jumps into the scope of local " LUA_QS,
+      getstr(gt->name), gt->line, getstr(vname));
+    semerror(ls, msg);
+  }
+  luaK_patchlist(fs, gt->pc, label->pc);
+  /* remove goto from pending list */
+  for (i = g; i < gl->n - 1; i++)
+    gl->arr[i] = gl->arr[i + 1];  /* later entries move down one slot */
+  gl->n--;
+}
+
+
+/*
+** try to close a goto with existing labels; this solves backward jumps
+*/
+static int findlabel (LexState *ls, int g) {  /* returns 1 if goto 'g' was closed, 0 if it stays pending */
+  int i;
+  BlockCnt *bl = ls->fs->bl;
+  Dyndata *dyd = ls->dyd;
+  Labeldesc *gt = &dyd->gt.arr[g];
+  /* check labels in current block for a match */
+  for (i = bl->firstlabel; i < dyd->label.n; i++) {
+    Labeldesc *lb = &dyd->label.arr[i];
+    if (luaS_eqstr(lb->name, gt->name)) {  /* correct label? */
+      if (gt->nactvar > lb->nactvar &&  /* goto has more active locals than the label */
+          (bl->upval || dyd->label.n > bl->firstlabel))
+        luaK_patchclose(ls->fs, gt->pc, lb->nactvar);
+      closegoto(ls, g, lb);  /* close it */
+      return 1;
+    }
+  }
+  return 0;  /* label not found; cannot close goto */
+}
+
+
+static int newlabelentry (LexState *ls, Labellist *l, TString *name,
+                          int line, int pc) {  /* append an entry to list 'l'; returns its index */
+  int n = l->n;
+  luaM_growvector(ls->L, l->arr, n, l->size,
+                  Labeldesc, SHRT_MAX, "labels/gotos");
+  l->arr[n].name = name;
+  l->arr[n].line = line;
+  l->arr[n].nactvar = ls->fs->nactvar;  /* locals active where the label/goto appears */
+  l->arr[n].pc = pc;
+  l->n++;
+  return n;
+}
+
+
+/*
+** check whether new label 'lb' matches any pending gotos in current
+** block; solves forward jumps
+*/
+static void findgotos (LexState *ls, Labeldesc *lb) {
+  Labellist *pending = &ls->dyd->gt;
+  int pos;
+  for (pos = ls->fs->bl->firstgoto; pos < pending->n; ) {
+    if (!luaS_eqstr(pending->arr[pos].name, lb->name))
+      pos++;  /* different name: leave this goto pending */
+    else
+      closegoto(ls, pos, lb);  /* removal moves the next goto into 'pos' */
+  }
+}
+
+
+/*
+** "export" pending gotos to outer level, to check them against
+** outer labels; if the block being exited has upvalues, and
+** the goto exits the scope of any variable (which can be the
+** upvalue), close those variables being exited.
+*/
+static void movegotosout (FuncState *fs, BlockCnt *bl) {
+  int i = bl->firstgoto;  /* first goto recorded since 'bl' was entered */
+  Labellist *gl = &fs->ls->dyd->gt;
+  /* correct pending gotos to current block and try to close it
+     with visible labels */
+  while (i < gl->n) {
+    Labeldesc *gt = &gl->arr[i];
+    if (gt->nactvar > bl->nactvar) {  /* goto was issued with locals of 'bl' active */
+      if (bl->upval)
+        luaK_patchclose(fs, gt->pc, bl->nactvar);
+      gt->nactvar = bl->nactvar;  /* from now on it counts as issued at the level outside 'bl' */
+    }
+    if (!findlabel(fs->ls, i))  /* a closed goto is removed, so 'i' already names the next one */
+      i++;  /* move to next one */
+  }
+}
+
+
+static void enterblock (FuncState *fs, BlockCnt *bl, lu_byte isloop) {  /* initialise 'bl' and make it the innermost block of 'fs' */
+  bl->isloop = isloop;
+  bl->nactvar = fs->nactvar;  /* locals active on entry */
+  bl->firstlabel = fs->ls->dyd->label.n;  /* labels added from here on belong to this block */
+  bl->firstgoto = fs->ls->dyd->gt.n;  /* likewise for pending gotos */
+  bl->upval = 0;
+  bl->previous = fs->bl;
+  fs->bl = bl;
+  lua_assert(fs->freereg == fs->nactvar);
+}
+
+
+/*
+** create a label named "break" to resolve break statements
+*/
+static void breaklabel (LexState *ls) {
+  TString *n = luaS_new(ls->L, "break");
+  int l = newlabelentry(ls, &ls->dyd->label, n, 0, ls->fs->pc);  /* label at the current pc, recorded with line 0 */
+  findgotos(ls, &ls->dyd->label.arr[l]);  /* close pending gotos of the current block named "break" */
+}
+
+/*
+** generates an error for an undefined 'goto'; choose appropriate
+** message when label name is a reserved word (which can only be 'break')
+*/
+static l_noret undefgoto (LexState *ls, Labeldesc *gt) {
+  const char *fmt = "no visible label " LUA_QS " for <goto> at line %d";
+  if (isreserved(gt->name))  /* reserved-word label: the 'break' case */
+    fmt = "<%s> at line %d not inside a loop";
+  fmt = luaO_pushfstring(ls->L, fmt, getstr(gt->name), gt->line);
+  semerror(ls, fmt);
+}
+
+
+static void leaveblock (FuncState *fs) {  /* pop the innermost block: settle upvalues, breaks, locals, labels and gotos */
+  BlockCnt *bl = fs->bl;
+  LexState *ls = fs->ls;
+  if (bl->previous && bl->upval) {
+    /* create a 'jump to here' to close upvalues */
+    int j = luaK_jump(fs);
+    luaK_patchclose(fs, j, bl->nactvar);
+    luaK_patchtohere(fs, j);
+  }
+  if (bl->isloop)
+    breaklabel(ls);  /* close pending breaks */
+  fs->bl = bl->previous;  /* enclosing block becomes current again */
+  removevars(fs, bl->nactvar);  /* back to the locals that were active on entry */
+  lua_assert(bl->nactvar == fs->nactvar);
+  fs->freereg = fs->nactvar;  /* free registers */
+  ls->dyd->label.n = bl->firstlabel;  /* remove local labels */
+  if (bl->previous)  /* inner block? */
+    movegotosout(fs, bl);  /* update pending gotos to outer block */
+  else if (bl->firstgoto < ls->dyd->gt.n)  /* pending gotos in outer block? */
+    undefgoto(ls, &ls->dyd->gt.arr[bl->firstgoto]);  /* error */
+}
+
+
+/*
+** adds a new prototype into list of prototypes
+*/
+static Proto *addprototype (LexState *ls) {  /* returns the newly created child prototype */
+  Proto *clp;
+  lua_State *L = ls->L;
+  FuncState *fs = ls->fs;
+  Proto *f = fs->f;  /* prototype of current function */
+  if (fs->np >= f->sizep) {  /* child array is full? */
+    int oldsize = f->sizep;
+    luaM_growvector(L, f->p, fs->np, f->sizep, Proto *, MAXARG_Bx, "functions");
+    while (oldsize < f->sizep) f->p[oldsize++] = NULL;  /* clear any newly added slots */
+  }
+  f->p[fs->np++] = clp = luaF_newproto(L);  /* store the new child in the next free slot */
+  luaC_objbarrier(L, f, clp);  /* 'f' now refers to 'clp' */
+  return clp;
+}
+
+
+/*
+** codes instruction to create new closure in parent function.
+** The OP_CLOSURE instruction must use the last available register,
+** so that, if it invokes the GC, the GC knows which registers
+** are in use at that time.
+*/
+static void codeclosure (LexState *ls, expdesc *v) {
+  FuncState *fs = ls->fs->prev;  /* state of the function enclosing the current one */
+  init_exp(v, VRELOCABLE, luaK_codeABx(fs, OP_CLOSURE, 0, fs->np - 1));  /* Bx is the index of the parent's newest child prototype */
+  luaK_exp2nextreg(fs, v);  /* fix it at the last register */
+}
+
+
+static void open_func (LexState *ls, FuncState *fs, BlockCnt *bl) {  /* make 'fs' the current function state; fs->f must already be set */
+  lua_State *L = ls->L;
+  Proto *f;
+  fs->prev = ls->fs;  /* linked list of funcstates */
+  fs->ls = ls;
+  ls->fs = fs;
+  fs->pc = 0;
+  fs->lasttarget = 0;
+  fs->jpc = NO_JUMP;
+  fs->freereg = 0;
+  fs->nk = 0;
+  fs->np = 0;
+  fs->nups = 0;
+  fs->nlocvars = 0;
+  fs->nactvar = 0;
+  fs->firstlocal = ls->dyd->actvar.n;  /* this function's locals start after those already recorded */
+  fs->bl = NULL;
+  f = fs->f;  /* read, not assigned: the caller provides the prototype */
+  f->source = ls->source;
+  f->maxstacksize = 2;  /* registers 0/1 are always valid */
+  fs->h = luaH_new(L);
+  /* anchor table of constants (to avoid being collected) */
+  sethvalue2s(L, L->top, fs->h);
+  incr_top(L);  /* matched by 'L->top--' in close_func */
+  enterblock(fs, bl, 0);  /* outermost block of the function; not a loop */
+}
+
+
+static void close_func (LexState *ls) {  /* finish the current function and make its parent current again */
+  lua_State *L = ls->L;
+  FuncState *fs = ls->fs;
+  Proto *f = fs->f;
+  luaK_ret(fs, 0, 0);  /* final return */
+  leaveblock(fs);  /* closes the block opened by open_func */
+  luaM_reallocvector(L, f->code, f->sizecode, fs->pc, Instruction);  /* each array below is resized to the count actually used */
+  f->sizecode = fs->pc;
+  luaM_reallocvector(L, f->lineinfo, f->sizelineinfo, fs->pc, int);
+  f->sizelineinfo = fs->pc;
+  luaM_reallocvector(L, f->k, f->sizek, fs->nk, TValue);
+  f->sizek = fs->nk;
+  luaM_reallocvector(L, f->p, f->sizep, fs->np, Proto *);
+  f->sizep = fs->np;
+  luaM_reallocvector(L, f->locvars, f->sizelocvars, fs->nlocvars, LocVar);
+  f->sizelocvars = fs->nlocvars;
+  luaM_reallocvector(L, f->upvalues, f->sizeupvalues, fs->nups, Upvaldesc);
+  f->sizeupvalues = fs->nups;
+  lua_assert(fs->bl == NULL);
+  ls->fs = fs->prev;  /* parent (or NULL) is current from here on */
+  /* last token read was anchored in defunct function; must re-anchor it */
+  anchor_token(ls);
+  L->top--;  /* pop table of constants */
+  luaC_checkGC(L);
+}
+
+
+
+/*============================================================*/
+/* GRAMMAR RULES */
+/*============================================================*/
+
+
+/*
+** check whether current token is in the follow set of a block.
+** 'until' closes syntactical blocks, but does not close scope,
+** so it is handled separately.
+*/
+static int block_follow (LexState *ls, int withuntil) {
+  const int tok = ls->t.token;
+  if (tok == TK_UNTIL)
+    return withuntil;  /* caller decides whether 'until' ends the block */
+  if (tok == TK_ELSE || tok == TK_ELSEIF || tok == TK_END || tok == TK_EOS)
+    return 1;
+  else
+    return 0;
+}
+
+
+static void statlist (LexState *ls) {
+  /* statlist -> { stat [`;'] } */
+  int sawreturn = 0;
+  while (!sawreturn && !block_follow(ls, 1)) {
+    /* 'return' must be last statement: note it before parsing so that
+       the loop stops right after that statement */
+    sawreturn = (ls->t.token == TK_RETURN);
+    statement(ls);
+  }
+}
+
+
+static void fieldsel (LexState *ls, expdesc *v) {
+  /* fieldsel -> ['.' | ':'] NAME */
+  FuncState *fs = ls->fs;
+  expdesc key;
+  luaK_exp2anyregup(fs, v);
+  luaX_next(ls);  /* skip the dot or colon */
+  checkname(ls, &key);  /* field name becomes a string key */
+  luaK_indexed(fs, v, &key);  /* 'v' is now indexed by that key */
+}
+
+
+static void yindex (LexState *ls, expdesc *v) {
+  /* index -> '[' expr ']' */
+  luaX_next(ls);  /* skip the '[' */
+  expr(ls, v);  /* the key expression ends up in 'v' */
+  luaK_exp2val(ls->fs, v);
+  checknext(ls, ']');  /* closing bracket is mandatory */
+}
+
+
+/*
+** {======================================================================
+** Rules for Constructors
+** =======================================================================
+*/
+
+
+struct ConsControl {  /* state shared by the functions that parse one table constructor */
+  expdesc v;  /* last list item read */
+  expdesc *t;  /* table descriptor */
+  int nh;  /* total number of `record' elements */
+  int na;  /* total number of array elements */
+  int tostore;  /* number of array elements pending to be stored */
+};
+
+
+static void recfield (LexState *ls, struct ConsControl *cc) {
+  /* recfield -> (NAME | `['exp1`]') = exp1 */
+  FuncState *fs = ls->fs;
+  int reg = ls->fs->freereg;  /* remembered so that freereg can be restored at the end */
+  expdesc key, val;
+  int rkkey;
+  if (ls->t.token == TK_NAME) {
+    checklimit(fs, cc->nh, MAX_INT, "items in a constructor");
+    checkname(ls, &key);
+  }
+  else  /* ls->t.token == '[' */
+    yindex(ls, &key);
+  cc->nh++;
+  checknext(ls, '=');
+  rkkey = luaK_exp2RK(fs, &key);  /* key operand is fixed before the value is parsed */
+  expr(ls, &val);
+  luaK_codeABC(fs, OP_SETTABLE, cc->t->u.info, rkkey, luaK_exp2RK(fs, &val));
+  fs->freereg = reg;  /* free registers */
+}
+
+
+static void closelistfield (FuncState *fs, struct ConsControl *cc) {  /* settle the previous list item before the next field is read */
+  if (cc->v.k == VVOID) return;  /* there is no list item */
+  luaK_exp2nextreg(fs, &cc->v);
+  cc->v.k = VVOID;  /* item has been dealt with */
+  if (cc->tostore == LFIELDS_PER_FLUSH) {  /* a full batch is pending */
+    luaK_setlist(fs, cc->t->u.info, cc->na, cc->tostore);  /* flush */
+    cc->tostore = 0;  /* no more items pending */
+  }
+}
+
+
+static void lastlistfield (FuncState *fs, struct ConsControl *cc) {  /* store whatever list items are still pending at the closing brace */
+  if (cc->tostore == 0) return;  /* nothing pending */
+  if (hasmultret(cc->v.k)) {  /* last item is a call or a vararg */
+    luaK_setmultret(fs, &cc->v);
+    luaK_setlist(fs, cc->t->u.info, cc->na, LUA_MULTRET);
+    cc->na--;  /* do not count last expression (unknown number of elements) */
+  }
+  else {
+    if (cc->v.k != VVOID)
+      luaK_exp2nextreg(fs, &cc->v);
+    luaK_setlist(fs, cc->t->u.info, cc->na, cc->tostore);
+  }
+}
+
+
+static void listfield (LexState *ls, struct ConsControl *cc) {
+  /* listfield -> exp */
+  expr(ls, &cc->v);  /* left in cc->v; settled later by closelistfield or lastlistfield */
+  checklimit(ls->fs, cc->na, MAX_INT, "items in a constructor");
+  cc->na++;
+  cc->tostore++;
+}
+
+
+static void field (LexState *ls, struct ConsControl *cc) {
+  /* field -> listfield | recfield */
+  int isrecord;
+  if (ls->t.token == '[') {
+    /* a bracketed key always starts a record field */
+    isrecord = 1;
+  }
+  else if (ls->t.token == TK_NAME) {
+    /* may be 'listfield' or 'recfield': only NAME '=' is a record field */
+    isrecord = (luaX_lookahead(ls) == '=');
+  }
+  else {
+    /* anything else is read as an expression */
+    isrecord = 0;
+  }
+  if (isrecord)
+    recfield(ls, cc);
+  else
+    listfield(ls, cc);
+}
+
+
+static void constructor (LexState *ls, expdesc *t) {
+  /* constructor -> '{' [ field { sep field } [sep] ] '}'
+     sep -> ',' | ';' */
+  FuncState *fs = ls->fs;
+  int line = ls->linenumber;  /* where the constructor starts, for the "to close" message */
+  int pc = luaK_codeABC(fs, OP_NEWTABLE, 0, 0, 0);  /* B and C are filled in at the end */
+  struct ConsControl cc;
+  cc.na = cc.nh = cc.tostore = 0;
+  cc.t = t;
+  init_exp(t, VRELOCABLE, pc);
+  init_exp(&cc.v, VVOID, 0);  /* no value (yet) */
+  luaK_exp2nextreg(ls->fs, t);  /* fix it at stack top */
+  checknext(ls, '{');
+  do {
+    lua_assert(cc.v.k == VVOID || cc.tostore > 0);
+    if (ls->t.token == '}') break;  /* empty constructor, or a separator right before '}' */
+    closelistfield(fs, &cc);
+    field(ls, &cc);
+  } while (testnext(ls, ',') || testnext(ls, ';'));
+  check_match(ls, '}', '{', line);
+  lastlistfield(fs, &cc);
+  SETARG_B(fs->f->code[pc], luaO_int2fb(cc.na)); /* set initial array size */
+  SETARG_C(fs->f->code[pc], luaO_int2fb(cc.nh));  /* set initial table size */
+}
+
+/* }====================================================================== */
+
+
+
+static void parlist (LexState *ls) {
+  /* parlist -> [ param { `,' param } ] */
+  FuncState *fs = ls->fs;
+  Proto *f = fs->f;
+  int nparams = 0;  /* named parameters seen in this list */
+  f->is_vararg = 0;
+  if (ls->t.token != ')') {  /* is `parlist' not empty? */
+    do {
+      switch (ls->t.token) {
+        case TK_NAME: {  /* param -> NAME */
+          new_localvar(ls, str_checkname(ls));
+          nparams++;
+          break;
+        }
+        case TK_DOTS: {  /* param -> `...' */
+          luaX_next(ls);
+          f->is_vararg = 1;
+          break;
+        }
+        default: luaX_syntaxerror(ls, "<name> or " LUA_QL("...") " expected");
+      }
+    } while (!f->is_vararg && testnext(ls, ','));  /* nothing may follow `...' */
+  }
+  adjustlocalvars(ls, nparams);
+  f->numparams = cast_byte(fs->nactvar);  /* counts every active local, e.g. a 'self' activated by body() */
+  luaK_reserveregs(fs, fs->nactvar);  /* reserve register for parameters */
+}
+
+
+static void body (LexState *ls, expdesc *e, int ismethod, int line) {
+  /* body ->  `(' parlist `)' block END */
+  FuncState new_fs;
+  BlockCnt bl;
+  new_fs.f = addprototype(ls);  /* prototype is added to the enclosing function */
+  new_fs.f->linedefined = line;
+  open_func(ls, &new_fs, &bl);  /* from here on ls->fs is &new_fs */
+  checknext(ls, '(');
+  if (ismethod) {
+    new_localvarliteral(ls, "self");  /* create 'self' parameter */
+    adjustlocalvars(ls, 1);
+  }
+  parlist(ls);
+  checknext(ls, ')');
+  statlist(ls);
+  new_fs.f->lastlinedefined = ls->linenumber;
+  check_match(ls, TK_END, TK_FUNCTION, line);
+  codeclosure(ls, e);  /* uses ls->fs->prev, so it has to run before close_func */
+  close_func(ls);
+}
+
+
+static int explist (LexState *ls, expdesc *v) {
+  /* explist -> expr { `,' expr } */
+  int count = 0;
+  for (;;) {
+    expr(ls, v);
+    count++;  /* at least one expression */
+    if (!testnext(ls, ',')) break;  /* 'v' keeps the last expression */
+    luaK_exp2nextreg(ls->fs, v);  /* another one follows: settle this one */
+  }
+  return count;
+}
+
+
+static void funcargs (LexState *ls, expdesc *f, int line) {  /* parse call arguments and emit OP_CALL; 'f' becomes the call expression */
+  FuncState *fs = ls->fs;
+  expdesc args;
+  int base, nparams;
+  switch (ls->t.token) {
+    case '(': {  /* funcargs -> `(' [ explist ] `)' */
+      luaX_next(ls);
+      if (ls->t.token == ')')  /* arg list is empty? */
+        args.k = VVOID;
+      else {
+        explist(ls, &args);
+        luaK_setmultret(fs, &args);
+      }
+      check_match(ls, ')', '(', line);
+      break;
+    }
+    case '{': {  /* funcargs -> constructor */
+      constructor(ls, &args);
+      break;
+    }
+    case TK_STRING: {  /* funcargs -> STRING */
+      codestring(ls, &args, ls->t.seminfo.ts);
+      luaX_next(ls);  /* must use `seminfo' before `next' */
+      break;
+    }
+    default: {
+      luaX_syntaxerror(ls, "function arguments expected");
+    }
+  }
+  lua_assert(f->k == VNONRELOC);
+  base = f->u.info;  /* base register for call */
+  if (hasmultret(args.k))
+    nparams = LUA_MULTRET;  /* open call */
+  else {
+    if (args.k != VVOID)
+      luaK_exp2nextreg(fs, &args);  /* close last argument */
+    nparams = fs->freereg - (base+1);  /* registers in use above the function's own */
+  }
+  init_exp(f, VCALL, luaK_codeABC(fs, OP_CALL, base, nparams+1, 2));
+  luaK_fixline(fs, line);
+  fs->freereg = base+1;  /* call removes function and arguments and leaves
+                            (unless changed) one result */
+}
+
+
+
+
+/*
+** {======================================================================
+** Expression parsing
+** =======================================================================
+*/
+
+
+static void primaryexp (LexState *ls, expdesc *v) {
+  /* primaryexp -> NAME | '(' expr ')' */
+  const int tok = ls->t.token;
+  if (tok == TK_NAME) {
+    /* plain variable; singlevar resolves it */
+    singlevar(ls, v);
+  }
+  else if (tok == '(') {
+    /* parenthesised expression */
+    int openline = ls->linenumber;  /* for the "to close" error message */
+    luaX_next(ls);
+    expr(ls, v);
+    check_match(ls, ')', '(', openline);
+    luaK_dischargevars(ls->fs, v);
+  }
+  else {
+    /* no other token can start a primary expression */
+    luaX_syntaxerror(ls, "unexpected symbol");
+  }
+}
+
+
+static void suffixedexp (LexState *ls, expdesc *v) {
+  /* suffixedexp ->
+       primaryexp { '.' NAME | '[' exp ']' | ':' NAME funcargs | funcargs } */
+  FuncState *fs = ls->fs;
+  int line = ls->linenumber;  /* handed to funcargs for every call in the chain */
+  primaryexp(ls, v);
+  for (;;) {  /* apply suffixes one after another until none is left */
+    switch (ls->t.token) {
+      case '.': {  /* fieldsel */
+        fieldsel(ls, v);
+        break;
+      }
+      case '[': {  /* `[' exp1 `]' */
+        expdesc key;
+        luaK_exp2anyregup(fs, v);
+        yindex(ls, &key);
+        luaK_indexed(fs, v, &key);
+        break;
+      }
+      case ':': {  /* `:' NAME funcargs */
+        expdesc key;
+        luaX_next(ls);  /* skip the colon */
+        checkname(ls, &key);
+        luaK_self(fs, v, &key);
+        funcargs(ls, v, line);
+        break;
+      }
+      case '(': case TK_STRING: case '{': {  /* funcargs */
+        luaK_exp2nextreg(fs, v);
+        funcargs(ls, v, line);
+        break;
+      }
+      default: return;  /* token is not a suffix: expression is complete */
+    }
+  }
+}
+
+
+static void simpleexp (LexState *ls, expdesc *v) {
+  /* simpleexp -> NUMBER | STRING | NIL | TRUE | FALSE | ... |
+                  constructor | FUNCTION body | suffixedexp */
+  switch (ls->t.token) {
+    case TK_NUMBER: {
+      init_exp(v, VKNUM, 0);
+      v->u.nval = ls->t.seminfo.r;  /* numeric value is stored in the descriptor itself */
+      break;
+    }
+    case TK_STRING: {
+      codestring(ls, v, ls->t.seminfo.ts);
+      break;
+    }
+    case TK_NIL: {
+      init_exp(v, VNIL, 0);
+      break;
+    }
+    case TK_TRUE: {
+      init_exp(v, VTRUE, 0);
+      break;
+    }
+    case TK_FALSE: {
+      init_exp(v, VFALSE, 0);
+      break;
+    }
+    case TK_DOTS: {  /* vararg */
+      FuncState *fs = ls->fs;
+      check_condition(ls, fs->f->is_vararg,
+                      "cannot use " LUA_QL("...") " outside a vararg function");
+      init_exp(v, VVARARG, luaK_codeABC(fs, OP_VARARG, 0, 1, 0));
+      break;
+    }
+    case '{': {  /* constructor */
+      constructor(ls, v);
+      return;  /* constructor() consumed its own tokens */
+    }
+    case TK_FUNCTION: {
+      luaX_next(ls);  /* skip 'function' */
+      body(ls, v, 0, ls->linenumber);
+      return;
+    }
+    default: {
+      suffixedexp(ls, v);
+      return;
+    }
+  }
+  luaX_next(ls);  /* reached only by the 'break' cases: consume their single token */
+}
+
+
+static UnOpr getunopr (int op) {
+  /* map a token to its unary operator, if it is one */
+  if (op == TK_NOT) return OPR_NOT;
+  if (op == '-') return OPR_MINUS;
+  if (op == '#') return OPR_LEN;
+  /* any other token is not a unary operator */
+  return OPR_NOUNOPR;
+}
+
+
+static BinOpr getbinopr (int op) {
+  if (op == '+') return OPR_ADD;
+  if (op == '-') return OPR_SUB;
+  if (op == '*') return OPR_MUL;
+  if (op == '/') return OPR_DIV;
+  if (op == '%') return OPR_MOD;
+  if (op == '^') return OPR_POW;
+  if (op == TK_CONCAT) return OPR_CONCAT;
+  /* comparisons */
+  if (op == TK_NE) return OPR_NE;
+  if (op == TK_EQ) return OPR_EQ;
+  if (op == '<') return OPR_LT;
+  if (op == TK_LE) return OPR_LE;
+  if (op == '>') return OPR_GT;
+  if (op == TK_GE) return OPR_GE;
+  /* logical */
+  if (op == TK_AND) return OPR_AND;
+  if (op == TK_OR) return OPR_OR;
+  return OPR_NOBINOPR;  /* token is not a binary operator */
+}
+
+
+static const struct {  /* table indexed by BinOpr; read by subexpr */
+  lu_byte left;  /* left priority for each binary operator */
+  lu_byte right; /* right priority */
+} priority[] = {  /* ORDER OPR */
+   {6, 6}, {6, 6}, {7, 7}, {7, 7}, {7, 7},  /* `+' `-' `*' `/' `%' */
+   {10, 9}, {5, 4},                 /* ^, .. (right associative) */
+   {3, 3}, {3, 3}, {3, 3},          /* ==, <, <= */
+   {3, 3}, {3, 3}, {3, 3},          /* ~=, >, >= */
+   {2, 2}, {1, 1}                   /* and, or */
+};
+
+#define UNARY_PRIORITY	8  /* priority for unary operators */
+
+
+/*
+** subexpr -> (simpleexp | unop subexpr) { binop subexpr }
+** where `binop' is any binary operator with a priority higher than `limit'
+*/
+static BinOpr subexpr (LexState *ls, expdesc *v, int limit) {
+  BinOpr op;
+  UnOpr uop;
+  enterlevel(ls);  /* recursion depth is counted and limited */
+  uop = getunopr(ls->t.token);
+  if (uop != OPR_NOUNOPR) {  /* expression starts with a unary operator */
+    int line = ls->linenumber;
+    luaX_next(ls);
+    subexpr(ls, v, UNARY_PRIORITY);  /* operand takes only binary operators with left priority above UNARY_PRIORITY */
+    luaK_prefix(ls->fs, uop, v, line);
+  }
+  else simpleexp(ls, v);
+  /* expand while operators have priorities higher than `limit' */
+  op = getbinopr(ls->t.token);
+  while (op != OPR_NOBINOPR && priority[op].left > limit) {
+    expdesc v2;
+    BinOpr nextop;
+    int line = ls->linenumber;
+    luaX_next(ls);  /* skip the operator */
+    luaK_infix(ls->fs, op, v);
+    /* read sub-expression with higher priority */
+    nextop = subexpr(ls, &v2, priority[op].right);
+    luaK_posfix(ls->fs, op, v, &v2, line);
+    op = nextop;  /* the operator the recursive call stopped at */
+  }
+  leavelevel(ls);
+  return op;  /* return first untreated operator */
+}
+
+
+static void expr (LexState *ls, expdesc *v) {  /* parse a full expression into 'v' */
+  subexpr(ls, v, 0);  /* limit 0: every entry in 'priority' has a left value above it */
+}
+
+/* }==================================================================== */
+
+
+
+/*
+** {======================================================================
+** Rules for Statements
+** =======================================================================
+*/
+
+
+static void block (LexState *ls) {
+  /* block -> statlist */
+  FuncState *fs = ls->fs;
+  BlockCnt bl;  /* lives on the C stack for the duration of the block */
+  enterblock(fs, &bl, 0);  /* 0: not a loop */
+  statlist(ls);
+  leaveblock(fs);
+}
+
+
+/*
+** structure to chain all variables in the left-hand side of an
+** assignment
+*/
+struct LHS_assign {  /* one node per assignment target; nodes are locals of assignment() */
+  struct LHS_assign *prev;  /* target parsed just before this one */
+  expdesc v;  /* variable (global, local, upvalue, or indexed) */
+};
+
+
+/*
+** check whether, in an assignment to an upvalue/local variable, the
+** upvalue/local variable is being used in a previous assignment to a
+** table. If so, save original upvalue/local value in a safe place and
+** use this safe copy in the previous assignment.
+*/
+static void check_conflict (LexState *ls, struct LHS_assign *lh, expdesc *v) {  /* 'v' is the newest target; 'lh' chains the earlier ones */
+  FuncState *fs = ls->fs;
+  int extra = fs->freereg;  /* eventual position to save local variable */
+  int conflict = 0;
+  for (; lh; lh = lh->prev) {  /* check all previous assignments */
+    if (lh->v.k == VINDEXED) {  /* assigning to a table? */
+      /* table is the upvalue/local being assigned now? */
+      if (lh->v.u.ind.vt == v->k && lh->v.u.ind.t == v->u.info) {
+        conflict = 1;
+        lh->v.u.ind.vt = VLOCAL;  /* the copy is held in a register */
+        lh->v.u.ind.t = extra;  /* previous assignment will use safe copy */
+      }
+      /* index is the local being assigned? (index cannot be upvalue) */
+      if (v->k == VLOCAL && lh->v.u.ind.idx == v->u.info) {
+        conflict = 1;
+        lh->v.u.ind.idx = extra;  /* previous assignment will use safe copy */
+      }
+    }
+  }
+  if (conflict) {
+    /* copy upvalue/local value to a temporary (in position 'extra') */
+    OpCode op = (v->k == VLOCAL) ? OP_MOVE : OP_GETUPVAL;
+    luaK_codeABC(fs, op, extra, v->u.info, 0);
+    luaK_reserveregs(fs, 1);  /* claim the register holding the copy */
+  }
+}
+
+
+static void assignment (LexState *ls, struct LHS_assign *lh, int nvars) {  /* 'lh' is the newest target; 'nvars' counts targets so far */
+  expdesc e;
+  check_condition(ls, vkisvar(lh->v.k), "syntax error");  /* target must pass vkisvar */
+  if (testnext(ls, ',')) {  /* assignment -> ',' suffixedexp assignment */
+    struct LHS_assign nv;
+    nv.prev = lh;
+    suffixedexp(ls, &nv.v);
+    if (nv.v.k != VINDEXED)
+      check_conflict(ls, lh, &nv.v);
+    checklimit(ls->fs, nvars + ls->L->nCcalls, LUAI_MAXCCALLS,
+                    "C levels");
+    assignment(ls, &nv, nvars+1);  /* recursion: one C call per additional target */
+  }
+  else {  /* assignment -> `=' explist */
+    int nexps;
+    checknext(ls, '=');
+    nexps = explist(ls, &e);
+    if (nexps != nvars) {
+      adjust_assign(ls, nvars, nexps, &e);
+      if (nexps > nvars)
+        ls->fs->freereg -= nexps - nvars;  /* remove extra values */
+    }
+    else {
+      luaK_setoneret(ls->fs, &e);  /* close last expression */
+      luaK_storevar(ls->fs, &lh->v, &e);
+      return;  /* avoid default */
+    }
+  }
+  init_exp(&e, VNONRELOC, ls->fs->freereg-1);  /* default assignment */
+  luaK_storevar(ls->fs, &lh->v, &e);  /* store the value in the last used register into this target */
+}
+
+/* Parses a condition and returns its 'f' jump list (callers patch it as the exit for a false condition). */
+static int cond (LexState *ls) {
+  /* cond -> exp */
+  expdesc v;
+  expr(ls, &v);  /* read condition */
+  if (v.k == VNIL) v.k = VFALSE;  /* `falses' are all equal here */
+  luaK_goiftrue(ls->fs, &v);
+  return v.f;
+}
+
+/* Handles 'goto NAME' and 'break' (recorded as a goto to the label "break"); 'pc' is stored with the entry. */
+static void gotostat (LexState *ls, int pc) {
+  int line = ls->linenumber;
+  TString *label;
+  int g;  /* index of the new entry in the pending-goto list */
+  if (testnext(ls, TK_GOTO))
+    label = str_checkname(ls);
+  else {
+    luaX_next(ls);  /* skip break */
+    label = luaS_new(ls->L, "break");
+  }
+  g = newlabelentry(ls, &ls->dyd->gt, label, line, pc);
+  findlabel(ls, g);  /* close it if label already defined */
+}
+
+
+/* check for repeated labels on the same block; a duplicate is reported through semerror */
+static void checkrepeated (FuncState *fs, Labellist *ll, TString *label) {
+  int i;
+  for (i = fs->bl->firstlabel; i < ll->n; i++) {  /* scan from the current block's first label */
+    if (luaS_eqstr(label, ll->arr[i].name)) {
+      const char *msg = luaO_pushfstring(fs->ls->L,
+                          "label " LUA_QS " already defined on line %d",
+                          getstr(label), ll->arr[i].line);
+      semerror(fs->ls, msg);
+    }
+  }
+}
+
+
+/* skip no-op statements: while the next token is ';' or '::', parse it with statement() */
+static void skipnoopstat (LexState *ls) {
+  while (ls->t.token == ';' || ls->t.token == TK_DBCOLON)
+    statement(ls);
+}
+
+/* Registers label 'label' (seen at 'line') at the current pc, then resolves pending gotos via findgotos. */
+static void labelstat (LexState *ls, TString *label, int line) {
+  /* label -> '::' NAME '::' */
+  FuncState *fs = ls->fs;
+  Labellist *ll = &ls->dyd->label;
+  int l;  /* index of new label being created */
+  checkrepeated(fs, ll, label);  /* check for repeated labels */
+  checknext(ls, TK_DBCOLON);  /* skip double colon */
+  /* create new entry for this label */
+  l = newlabelentry(ls, ll, label, line, fs->pc);
+  skipnoopstat(ls);  /* skip other no-op statements */
+  if (block_follow(ls, 0)) {  /* label is last no-op statement in the block? */
+    /* assume that locals are already out of scope */
+    ll->arr[l].nactvar = fs->bl->nactvar;
+  }
+  findgotos(ls, &ll->arr[l]);
+}
+
+/* Parses a 'while' loop; 'line' is the line of the WHILE token, used by check_match for the END check. */
+static void whilestat (LexState *ls, int line) {
+  /* whilestat -> WHILE cond DO block END */
+  FuncState *fs = ls->fs;
+  int whileinit;  /* label at the start of the condition */
+  int condexit;  /* jump list returned by cond() */
+  BlockCnt bl;
+  luaX_next(ls);  /* skip WHILE */
+  whileinit = luaK_getlabel(fs);
+  condexit = cond(ls);
+  enterblock(fs, &bl, 1);
+  checknext(ls, TK_DO);
+  block(ls);
+  luaK_jumpto(fs, whileinit);  /* jump back to re-evaluate the condition */
+  check_match(ls, TK_END, TK_WHILE, line);
+  leaveblock(fs);
+  luaK_patchtohere(fs, condexit);  /* false conditions finish the loop */
+}
+
+/* Parses 'repeat ... until cond'; the condition is read while the body's scope block is still open. */
+static void repeatstat (LexState *ls, int line) {
+  /* repeatstat -> REPEAT block UNTIL cond */
+  int condexit;
+  FuncState *fs = ls->fs;
+  int repeat_init = luaK_getlabel(fs);  /* label the loop is closed against below */
+  BlockCnt bl1, bl2;
+  enterblock(fs, &bl1, 1);  /* loop block */
+  enterblock(fs, &bl2, 0);  /* scope block */
+  luaX_next(ls);  /* skip REPEAT */
+  statlist(ls);
+  check_match(ls, TK_UNTIL, TK_REPEAT, line);
+  condexit = cond(ls);  /* read condition (inside scope block) */
+  if (bl2.upval)  /* upvalues? */
+    luaK_patchclose(fs, condexit, bl2.nactvar);
+  leaveblock(fs);  /* finish scope */
+  luaK_patchlist(fs, condexit, repeat_init);  /* close the loop */
+  leaveblock(fs);  /* finish loop */
+}
+
+/* Parses one expression, places it in the next register (luaK_exp2nextreg) and returns that register. */
+static int exp1 (LexState *ls) {
+  expdesc e;
+  int reg;
+  expr(ls, &e);
+  luaK_exp2nextreg(ls->fs, &e);
+  lua_assert(e.k == VNONRELOC);  /* expression is expected to be in a fixed register now */
+  reg = e.u.info;
+  return reg;
+}
+
+/* Shared tail of both 'for' forms: 'base' is the first control register, 'nvars' the declared variables, 'isnum' selects numeric. */
+static void forbody (LexState *ls, int base, int line, int nvars, int isnum) {
+  /* forbody -> DO block */
+  BlockCnt bl;
+  FuncState *fs = ls->fs;
+  int prep, endfor;  /* pcs of the loop-entry jump and of the loop-back instruction */
+  adjustlocalvars(ls, 3);  /* control variables */
+  checknext(ls, TK_DO);
+  prep = isnum ? luaK_codeAsBx(fs, OP_FORPREP, base, NO_JUMP) : luaK_jump(fs);
+  enterblock(fs, &bl, 0);  /* scope for declared variables */
+  adjustlocalvars(ls, nvars);
+  luaK_reserveregs(fs, nvars);
+  block(ls);
+  leaveblock(fs);  /* end of scope for declared variables */
+  luaK_patchtohere(fs, prep);  /* entry jump lands on the loop test emitted next */
+  if (isnum)  /* numeric for? */
+    endfor = luaK_codeAsBx(fs, OP_FORLOOP, base, NO_JUMP);
+  else {  /* generic for */
+    luaK_codeABC(fs, OP_TFORCALL, base, 0, nvars);
+    luaK_fixline(fs, line);
+    endfor = luaK_codeAsBx(fs, OP_TFORLOOP, base + 2, NO_JUMP);
+  }
+  luaK_patchlist(fs, endfor, prep + 1);  /* loop back to the instruction right after 'prep' */
+  luaK_fixline(fs, line);
+}
+
+/* Parses a numeric 'for': three hidden control locals plus 'varname'; a missing step defaults to constant 1. */
+static void fornum (LexState *ls, TString *varname, int line) {
+  /* fornum -> NAME = exp1,exp1[,exp1] forbody */
+  FuncState *fs = ls->fs;
+  int base = fs->freereg;  /* first register used by the control values */
+  new_localvarliteral(ls, "(for index)");
+  new_localvarliteral(ls, "(for limit)");
+  new_localvarliteral(ls, "(for step)");
+  new_localvar(ls, varname);
+  checknext(ls, '=');
+  exp1(ls);  /* initial value */
+  checknext(ls, ',');
+  exp1(ls);  /* limit */
+  if (testnext(ls, ','))
+    exp1(ls);  /* optional step */
+  else {  /* default step = 1 */
+    luaK_codek(fs, fs->freereg, luaK_numberK(fs, 1));
+    luaK_reserveregs(fs, 1);
+  }
+  forbody(ls, base, line, 1, 1);  /* one declared variable, numeric loop */
+}
+
+/* Parses a generic 'for': three hidden control locals, the declared names, then the explist adjusted to 3 values. */
+static void forlist (LexState *ls, TString *indexname) {
+  /* forlist -> NAME {,NAME} IN explist forbody */
+  FuncState *fs = ls->fs;
+  expdesc e;
+  int nvars = 4;  /* gen, state, control, plus at least one declared var */
+  int line;
+  int base = fs->freereg;  /* first register used by the control values */
+  /* create control variables */
+  new_localvarliteral(ls, "(for generator)");
+  new_localvarliteral(ls, "(for state)");
+  new_localvarliteral(ls, "(for control)");
+  /* create declared variables */
+  new_localvar(ls, indexname);
+  while (testnext(ls, ',')) {
+    new_localvar(ls, str_checkname(ls));
+    nvars++;
+  }
+  checknext(ls, TK_IN);
+  line = ls->linenumber;  /* line passed on to forbody */
+  adjust_assign(ls, 3, explist(ls, &e), &e);
+  luaK_checkstack(fs, 3);  /* extra space to call generator */
+  forbody(ls, base, line, nvars - 3, 0);  /* nvars - 3 == number of declared variables */
+}
+
+/* Parses a 'for' statement, dispatching on the token after the first name ('=' numeric; ',' or 'in' generic). */
+static void forstat (LexState *ls, int line) {
+  /* forstat -> FOR (fornum | forlist) END */
+  FuncState *fs = ls->fs;
+  TString *varname;
+  BlockCnt bl;
+  enterblock(fs, &bl, 1);  /* scope for loop and control variables */
+  luaX_next(ls);  /* skip `for' */
+  varname = str_checkname(ls);  /* first variable name */
+  switch (ls->t.token) {
+    case '=': fornum(ls, varname, line); break;
+    case ',': case TK_IN: forlist(ls, varname); break;
+    default: luaX_syntaxerror(ls, LUA_QL("=") " or " LUA_QL("in") " expected");
+  }
+  check_match(ls, TK_END, TK_FOR, line);
+  leaveblock(fs);  /* loop scope (`break' jumps to this point) */
+}
+
+/* Parses one IF/ELSEIF arm; when ELSE/ELSEIF follows, a jump over the remaining arms is added to '*escapelist'. */
+static void test_then_block (LexState *ls, int *escapelist) {
+  /* test_then_block -> [IF | ELSEIF] cond THEN block */
+  BlockCnt bl;
+  FuncState *fs = ls->fs;
+  expdesc v;
+  int jf;  /* instruction to skip 'then' code (if condition is false) */
+  luaX_next(ls);  /* skip IF or ELSEIF */
+  expr(ls, &v);  /* read condition */
+  checknext(ls, TK_THEN);
+  if (ls->t.token == TK_GOTO || ls->t.token == TK_BREAK) {
+    luaK_goiffalse(ls->fs, &v);  /* will jump to label if condition is true */
+    enterblock(fs, &bl, 0);  /* must enter block before 'goto' */
+    gotostat(ls, v.t);  /* handle goto/break */
+    skipnoopstat(ls);  /* skip other no-op statements */
+    if (block_follow(ls, 0)) {  /* 'goto' is the entire block? */
+      leaveblock(fs);
+      return;  /* and that is it */
+    }
+    else  /* must skip over 'then' part if condition is false */
+      jf = luaK_jump(fs);
+  }
+  else {  /* regular case (not goto/break) */
+    luaK_goiftrue(ls->fs, &v);  /* skip over block if condition is false */
+    enterblock(fs, &bl, 0);
+    jf = v.f;
+  }
+  statlist(ls);  /* `then' part */
+  leaveblock(fs);
+  if (ls->t.token == TK_ELSE ||
+      ls->t.token == TK_ELSEIF)  /* followed by 'else'/'elseif'? */
+    luaK_concat(fs, escapelist, luaK_jump(fs));  /* must jump over it */
+  luaK_patchtohere(fs, jf);  /* false condition continues here, after this arm */
+}
+
+/* Parses a whole 'if' statement; the arms' escape jumps are all patched to the point after END. */
+static void ifstat (LexState *ls, int line) {
+  /* ifstat -> IF cond THEN block {ELSEIF cond THEN block} [ELSE block] END */
+  FuncState *fs = ls->fs;
+  int escapelist = NO_JUMP;  /* exit list for finished parts */
+  test_then_block(ls, &escapelist);  /* IF cond THEN block */
+  while (ls->t.token == TK_ELSEIF)
+    test_then_block(ls, &escapelist);  /* ELSEIF cond THEN block */
+  if (testnext(ls, TK_ELSE))
+    block(ls);  /* `else' part */
+  check_match(ls, TK_END, TK_IF, line);
+  luaK_patchtohere(fs, escapelist);  /* patch escape list to 'if' end */
+}
+
+/* Parses 'local function NAME body'; the local is activated before the body is parsed. */
+static void localfunc (LexState *ls) {
+  expdesc b;
+  FuncState *fs = ls->fs;
+  new_localvar(ls, str_checkname(ls));  /* new local variable */
+  adjustlocalvars(ls, 1);  /* enter its scope */
+  body(ls, &b, 0, ls->linenumber);  /* function created in next register */
+  /* debug information will only see the variable after this point! */
+  getlocvar(fs, b.u.info)->startpc = fs->pc;
+}
+
+/* Parses a 'local' declaration; the new names are activated only after the initializers are handled. */
+static void localstat (LexState *ls) {
+  /* stat -> LOCAL NAME {`,' NAME} [`=' explist] */
+  int nvars = 0;  /* number of names declared */
+  int nexps;  /* number of initializer expressions */
+  expdesc e;
+  do {
+    new_localvar(ls, str_checkname(ls));
+    nvars++;
+  } while (testnext(ls, ','));
+  if (testnext(ls, '='))
+    nexps = explist(ls, &e);
+  else {
+    e.k = VVOID;  /* no initializer list at all */
+    nexps = 0;
+  }
+  adjust_assign(ls, nvars, nexps, &e);
+  adjustlocalvars(ls, nvars);
+}
+
+/* Parses a function name into '*v'; returns 1 when it ends with a ':' selector (method), else 0. */
+static int funcname (LexState *ls, expdesc *v) {
+  /* funcname -> NAME {fieldsel} [`:' NAME] */
+  int ismethod = 0;
+  singlevar(ls, v);
+  while (ls->t.token == '.')
+    fieldsel(ls, v);
+  if (ls->t.token == ':') {
+    ismethod = 1;
+    fieldsel(ls, v);
+  }
+  return ismethod;
+}
+
+/* Parses 'function funcname body' and stores the resulting function into the named variable. */
+static void funcstat (LexState *ls, int line) {
+  /* funcstat -> FUNCTION funcname body */
+  int ismethod;
+  expdesc v, b;  /* v: target variable, b: the function body expression */
+  luaX_next(ls);  /* skip FUNCTION */
+  ismethod = funcname(ls, &v);
+  body(ls, &b, ismethod, line);
+  luaK_storevar(ls->fs, &v, &b);
+  luaK_fixline(ls->fs, line);  /* definition `happens' in the first line */
+}
+
+/* Parses a statement starting with a suffixed expression: an assignment or a call (anything else is a syntax error). */
+static void exprstat (LexState *ls) {
+  /* stat -> func | assignment */
+  FuncState *fs = ls->fs;
+  struct LHS_assign v;
+  suffixedexp(ls, &v.v);
+  if (ls->t.token == '=' || ls->t.token == ',') { /* stat -> assignment ? */
+    v.prev = NULL;  /* first (and so far only) target in the chain */
+    assignment(ls, &v, 1);
+  }
+  else {  /* stat -> func */
+    check_condition(ls, v.v.k == VCALL, "syntax error");
+    SETARG_C(getcode(fs, &v.v), 1);  /* call statement uses no results */
+  }
+}
+
+/* Parses the operands of 'return' (keyword already skipped by the caller) and emits the return via luaK_ret. */
+static void retstat (LexState *ls) {
+  /* stat -> RETURN [explist] [';'] */
+  FuncState *fs = ls->fs;
+  expdesc e;
+  int first, nret;  /* registers with returned values */
+  if (block_follow(ls, 1) || ls->t.token == ';')
+    first = nret = 0;  /* return no values */
+  else {
+    nret = explist(ls, &e);  /* optional return values */
+    if (hasmultret(e.k)) {
+      luaK_setmultret(fs, &e);
+      if (e.k == VCALL && nret == 1) {  /* tail call? */
+        SET_OPCODE(getcode(fs,&e), OP_TAILCALL);
+        lua_assert(GETARG_A(getcode(fs,&e)) == fs->nactvar);
+      }
+      first = fs->nactvar;
+      nret = LUA_MULTRET;  /* return all values */
+    }
+    else {
+      if (nret == 1)  /* only one single value? */
+        first = luaK_exp2anyreg(fs, &e);
+      else {
+        luaK_exp2nextreg(fs, &e);  /* values must go to the `stack' */
+        first = fs->nactvar;  /* return all `active' values */
+        lua_assert(nret == fs->freereg - first);
+      }
+    }
+  }
+  luaK_ret(fs, first, nret);
+  testnext(ls, ';');  /* skip optional semicolon */
+}
+
+/* Parses one statement, dispatching on the current token; afterwards freereg is reset to nactvar. */
+static void statement (LexState *ls) {
+  int line = ls->linenumber;  /* may be needed for error messages */
+  enterlevel(ls);  /* paired with leavelevel() at the end of this function */
+  switch (ls->t.token) {
+    case ';': {  /* stat -> ';' (empty statement) */
+      luaX_next(ls);  /* skip ';' */
+      break;
+    }
+    case TK_IF: {  /* stat -> ifstat */
+      ifstat(ls, line);
+      break;
+    }
+    case TK_WHILE: {  /* stat -> whilestat */
+      whilestat(ls, line);
+      break;
+    }
+    case TK_DO: {  /* stat -> DO block END */
+      luaX_next(ls);  /* skip DO */
+      block(ls);
+      check_match(ls, TK_END, TK_DO, line);
+      break;
+    }
+    case TK_FOR: {  /* stat -> forstat */
+      forstat(ls, line);
+      break;
+    }
+    case TK_REPEAT: {  /* stat -> repeatstat */
+      repeatstat(ls, line);
+      break;
+    }
+    case TK_FUNCTION: {  /* stat -> funcstat */
+      funcstat(ls, line);
+      break;
+    }
+    case TK_LOCAL: {  /* stat -> localstat */
+      luaX_next(ls);  /* skip LOCAL */
+      if (testnext(ls, TK_FUNCTION))  /* local function? */
+        localfunc(ls);
+      else
+        localstat(ls);
+      break;
+    }
+    case TK_DBCOLON: {  /* stat -> label */
+      luaX_next(ls);  /* skip double colon */
+      labelstat(ls, str_checkname(ls), line);
+      break;
+    }
+    case TK_RETURN: {  /* stat -> retstat */
+      luaX_next(ls);  /* skip RETURN */
+      retstat(ls);
+      break;
+    }
+    case TK_BREAK:   /* stat -> breakstat */
+    case TK_GOTO: {  /* stat -> 'goto' NAME */
+      gotostat(ls, luaK_jump(ls->fs));  /* the keyword itself is consumed inside gotostat */
+      break;
+    }
+    default: {  /* stat -> func | assignment */
+      exprstat(ls);
+      break;
+    }
+  }
+  lua_assert(ls->fs->f->maxstacksize >= ls->fs->freereg &&
+             ls->fs->freereg >= ls->fs->nactvar);
+  ls->fs->freereg = ls->fs->nactvar;  /* free registers */
+  leavelevel(ls);
+}
+
+/* }====================================================================== */
+
+
+/*
+** compiles the main function, which is a regular vararg function with an
+** upvalue named LUA_ENV; the whole input must be consumed (TK_EOS check)
+*/
+static void mainfunc (LexState *ls, FuncState *fs) {
+  BlockCnt bl;
+  expdesc v;
+  open_func(ls, fs, &bl);
+  fs->f->is_vararg = 1;  /* main function is always vararg */
+  init_exp(&v, VLOCAL, 0);  /* create and... */
+  newupvalue(fs, ls->envn, &v);  /* ...set environment upvalue */
+  luaX_next(ls);  /* read first token */
+  statlist(ls);  /* parse main body */
+  check(ls, TK_EOS);
+  close_func(ls);
+}
+
+/* Parser entry point: builds the main closure for input 'z' (chunk name 'name'); the closure is also left on L's stack. */
+Closure *luaY_parser (lua_State *L, ZIO *z, Mbuffer *buff,
+                      Dyndata *dyd, const char *name, int firstchar) {
+  LexState lexstate;
+  FuncState funcstate;
+  Closure *cl = luaF_newLclosure(L, 1);  /* create main closure */
+  /* anchor closure (to avoid being collected) */
+  setclLvalue(L, L->top, cl);
+  incr_top(L);
+  funcstate.f = cl->l.p = luaF_newproto(L);
+  funcstate.f->source = luaS_new(L, name);  /* create and anchor TString */
+  lexstate.buff = buff;
+  lexstate.dyd = dyd;
+  dyd->actvar.n = dyd->gt.n = dyd->label.n = 0;  /* start with empty variable/goto/label lists */
+  luaX_setinput(L, &lexstate, z, funcstate.f->source, firstchar);
+  mainfunc(&lexstate, &funcstate);
+  lua_assert(!funcstate.prev && funcstate.nups == 1 && !lexstate.fs);
+  /* all scopes should be correctly finished */
+  lua_assert(dyd->actvar.n == 0 && dyd->gt.n == 0 && dyd->label.n == 0);
+  return cl;  /* it's on the stack too */
+}
+
diff --git a/ext/lua/src/lstate.c b/ext/lua/src/lstate.c
new file mode 100644
index 0000000..207a106
--- /dev/null
+++ b/ext/lua/src/lstate.c
@@ -0,0 +1,322 @@
+/*
+** $Id: lstate.c,v 2.99 2012/10/02 17:40:53 roberto Exp $
+** Global State
+** See Copyright Notice in lua.h
+*/
+
+
+#include <stddef.h>
+#include <string.h>
+
+#define lstate_c
+#define LUA_CORE
+
+#include "lua.h"
+
+#include "lapi.h"
+#include "ldebug.h"
+#include "ldo.h"
+#include "lfunc.h"
+#include "lgc.h"
+#include "llex.h"
+#include "lmem.h"
+#include "lstate.h"
+#include "lstring.h"
+#include "ltable.h"
+#include "ltm.h"
+
+/* Overridable defaults that lua_newstate copies into g->gcpause, g->gcmajorinc and g->gcstepmul. */
+#if !defined(LUAI_GCPAUSE)
+#define LUAI_GCPAUSE	200  /* 200% */
+#endif
+
+#if !defined(LUAI_GCMAJOR)
+#define LUAI_GCMAJOR	200  /* 200% */
+#endif
+
+#if !defined(LUAI_GCMUL)
+#define LUAI_GCMUL	200 /* GC runs 'twice the speed' of memory allocation */
+#endif
+
+/* Text of the memory-error message that f_luaopen pre-creates and stores in g->memerrmsg. */
+#define MEMERRMSG	"not enough memory"
+
+
+/*
+** a macro to help the creation of a unique random seed when a state is
+** created; the seed is used to randomize hashes. Default: time(NULL).
+*/
+#if !defined(luai_makeseed)
+#include <time.h>
+#define luai_makeseed()		cast(unsigned int, time(NULL))
+#endif
+
+
+
+/*
+** thread state + extra space (the extra bytes, if configured, come first)
+*/
+typedef struct LX {
+#if defined(LUAI_EXTRASPACE)
+  char buff[LUAI_EXTRASPACE];
+#endif
+  lua_State l;  /* the thread state itself; see fromstate() for the reverse mapping */
+} LX;
+
+
+/*
+** Main thread combines a thread state and the global state (one allocation)
+*/
+typedef struct LG {
+  LX l;  /* main thread (with its extra space) */
+  global_State g;  /* global state shared by all threads */
+} LG;
+
+
+/* Maps a lua_State pointer back to its enclosing LX by subtracting the offset of member 'l'. */
+#define fromstate(L)	(cast(LX *, cast(lu_byte *, (L)) - offsetof(LX, l)))
+
+
+/*
+** Compute an initial seed as random as possible. In ANSI, rely on
+** Address Space Layout Randomization (if present) to increase
+** randomness.
+*/
+#define addbuff(b,p,e) \
+  { size_t t = cast(size_t, e); \
+    memcpy(buff + p, &t, sizeof(t)); p += sizeof(t); }
+
+static unsigned int makeseed (lua_State *L) {
+  char buff[4 * sizeof(size_t)];  /* room for the four addresses appended below */
+  unsigned int h = luai_makeseed();
+  int p = 0;  /* current write offset into 'buff' */
+  addbuff(buff, p, L);  /* heap variable */
+  addbuff(buff, p, &h);  /* local variable */
+  addbuff(buff, p, luaO_nilobject);  /* global variable */
+  addbuff(buff, p, &lua_newstate);  /* public function */
+  lua_assert(p == sizeof(buff));
+  return luaS_hash(buff, p, h);  /* hash the addresses, seeded with luai_makeseed() */
+}
+
+
+/*
+** set GCdebt to a new value keeping the value (totalbytes + GCdebt)
+** invariant
+*/
+void luaE_setdebt (global_State *g, l_mem debt) {
+  g->totalbytes -= (debt - g->GCdebt);  /* compensate totalbytes by the change in debt */
+  g->GCdebt = debt;
+}
+
+/* Allocates a new CallInfo, links it after L->ci and returns it (L->ci itself is not advanced here). */
+CallInfo *luaE_extendCI (lua_State *L) {
+  CallInfo *ci = luaM_new(L, CallInfo);
+  lua_assert(L->ci->next == NULL);  /* only extend at the end of the list */
+  L->ci->next = ci;
+  ci->previous = L->ci;
+  ci->next = NULL;
+  return ci;
+}
+
+/* Frees every CallInfo that follows L->ci in the list; L->ci itself is kept and becomes the last node. */
+void luaE_freeCI (lua_State *L) {
+  CallInfo *ci = L->ci;
+  CallInfo *next = ci->next;
+  ci->next = NULL;  /* detach the tail before freeing it */
+  while ((ci = next) != NULL) {
+    next = ci->next;  /* read the link before the node is freed */
+    luaM_free(L, ci);
+  }
+}
+
+/* Allocates and clears the stack of thread L1 (memory is requested through L) and sets up its base CallInfo. */
+static void stack_init (lua_State *L1, lua_State *L) {
+  int i; CallInfo *ci;
+  /* initialize stack array */
+  L1->stack = luaM_newvector(L, BASIC_STACK_SIZE, TValue);
+  L1->stacksize = BASIC_STACK_SIZE;
+  for (i = 0; i < BASIC_STACK_SIZE; i++)
+    setnilvalue(L1->stack + i);  /* erase new stack */
+  L1->top = L1->stack;
+  L1->stack_last = L1->stack + L1->stacksize - EXTRA_STACK;  /* EXTRA_STACK slots kept beyond stack_last */
+  /* initialize first ci */
+  ci = &L1->base_ci;
+  ci->next = ci->previous = NULL;
+  ci->callstatus = 0;
+  ci->func = L1->top;
+  setnilvalue(L1->top++);  /* 'function' entry for this 'ci' */
+  ci->top = L1->top + LUA_MINSTACK;
+  L1->ci = ci;
+}
+
+/* Releases a thread's CallInfo list and stack array; does nothing if the stack was never allocated. */
+static void freestack (lua_State *L) {
+  if (L->stack == NULL)
+    return;  /* stack not completely built yet */
+  L->ci = &L->base_ci;  /* free the entire 'ci' list */
+  luaE_freeCI(L);
+  luaM_freearray(L, L->stack, L->stacksize);  /* free stack array */
+}
+
+
+/*
+** Create registry table and its predefined values (main thread, globals)
+*/
+static void init_registry (lua_State *L, global_State *g) {
+  TValue mt;  /* scratch value reused for both entries */
+  /* create registry */
+  Table *registry = luaH_new(L);
+  sethvalue(L, &g->l_registry, registry);
+  luaH_resize(L, registry, LUA_RIDX_LAST, 0);
+  /* registry[LUA_RIDX_MAINTHREAD] = L */
+  setthvalue(L, &mt, L);
+  luaH_setint(L, registry, LUA_RIDX_MAINTHREAD, &mt);
+  /* registry[LUA_RIDX_GLOBALS] = table of globals */
+  sethvalue(L, &mt, luaH_new(L));
+  luaH_setint(L, registry, LUA_RIDX_GLOBALS, &mt);
+}
+
+
+/*
+** open parts of the state that may cause memory-allocation errors
+** (lua_newstate runs this through luaD_rawrunprotected)
+*/
+static void f_luaopen (lua_State *L, void *ud) {
+  global_State *g = G(L);
+  UNUSED(ud);
+  stack_init(L, L);  /* init stack */
+  init_registry(L, g);
+  luaS_resize(L, MINSTRTABSIZE);  /* initial size of string table */
+  luaT_init(L);
+  luaX_init(L);
+  /* pre-create memory-error message */
+  g->memerrmsg = luaS_newliteral(L, MEMERRMSG);
+  luaS_fix(g->memerrmsg);  /* it should never be collected */
+  g->gcrunning = 1;  /* allow gc */
+}
+
+
+/*
+** preinitialize a state with consistent values without allocating
+** any memory (to avoid errors); the stack is created later by stack_init
+*/
+static void preinit_state (lua_State *L, global_State *g) {
+  G(L) = g;
+  L->stack = NULL;  /* freestack() treats NULL as "stack not built yet" */
+  L->ci = NULL;
+  L->stacksize = 0;
+  L->errorJmp = NULL;
+  L->nCcalls = 0;
+  L->hook = NULL;
+  L->hookmask = 0;
+  L->basehookcount = 0;
+  L->allowhook = 1;
+  resethookcount(L);
+  L->openupval = NULL;
+  L->nny = 1;
+  L->status = LUA_OK;
+  L->errfunc = 0;
+}
+
+/* Tears down a whole state: upvalues, all objects, string table, buffer, stack, then the LG block itself. */
+static void close_state (lua_State *L) {
+  global_State *g = G(L);
+  luaF_close(L, L->stack);  /* close all upvalues for this thread */
+  luaC_freeallobjects(L);  /* collect all objects */
+  luaM_freearray(L, G(L)->strt.hash, G(L)->strt.size);
+  luaZ_freebuffer(L, &g->buff);
+  freestack(L);
+  lua_assert(gettotalbytes(g) == sizeof(LG));  /* only the main block should remain accounted */
+  (*g->frealloc)(g->ud, fromstate(L), sizeof(LG), 0);  /* free main block */
+}
+
+/* Creates a new thread sharing L's global state, pushes it on L's stack and returns it; hook settings are copied from L. */
+LUA_API lua_State *lua_newthread (lua_State *L) {
+  lua_State *L1;
+  lua_lock(L);
+  luaC_checkGC(L);
+  L1 = &luaC_newobj(L, LUA_TTHREAD, sizeof(LX), NULL, offsetof(LX, l))->th;
+  setthvalue(L, L->top, L1);  /* place the new thread on L's stack */
+  api_incr_top(L);
+  preinit_state(L1, G(L));
+  L1->hookmask = L->hookmask;
+  L1->basehookcount = L->basehookcount;
+  L1->hook = L->hook;
+  resethookcount(L1);
+  luai_userstatethread(L, L1);
+  stack_init(L1, L);  /* init stack */
+  lua_unlock(L);
+  return L1;
+}
+
+/* Frees thread L1 (memory is released through L): closes its upvalues, frees its stack, then its LX block. */
+void luaE_freethread (lua_State *L, lua_State *L1) {
+  LX *l = fromstate(L1);  /* start of the allocation that contains L1 */
+  luaF_close(L1, L1->stack);  /* close all upvalues for this thread */
+  lua_assert(L1->openupval == NULL);
+  luai_userstatefree(L, L1);
+  freestack(L1);
+  luaM_free(L, l);
+}
+
+/* Creates a new state using allocator 'f' (user data 'ud'); returns NULL if the block or the protected open step fails. */
+LUA_API lua_State *lua_newstate (lua_Alloc f, void *ud) {
+  int i;
+  lua_State *L;
+  global_State *g;
+  LG *l = cast(LG *, (*f)(ud, NULL, LUA_TTHREAD, sizeof(LG)));
+  if (l == NULL) return NULL;
+  L = &l->l.l;
+  g = &l->g;
+  L->next = NULL;
+  L->tt = LUA_TTHREAD;
+  g->currentwhite = bit2mask(WHITE0BIT, FIXEDBIT);
+  L->marked = luaC_white(g);
+  g->gckind = KGC_NORMAL;
+  preinit_state(L, g);
+  g->frealloc = f;
+  g->ud = ud;
+  g->mainthread = L;
+  g->seed = makeseed(L);
+  g->uvhead.u.l.prev = &g->uvhead;  /* uvhead starts as a list linked only to itself */
+  g->uvhead.u.l.next = &g->uvhead;
+  g->gcrunning = 0;  /* no GC while building state */
+  g->GCestimate = 0;
+  g->strt.size = 0;
+  g->strt.nuse = 0;
+  g->strt.hash = NULL;
+  setnilvalue(&g->l_registry);
+  luaZ_initbuffer(L, &g->buff);
+  g->panic = NULL;
+  g->version = lua_version(NULL);
+  g->gcstate = GCSpause;
+  g->allgc = NULL;
+  g->finobj = NULL;
+  g->tobefnz = NULL;
+  g->sweepgc = g->sweepfin = NULL;
+  g->gray = g->grayagain = NULL;
+  g->weak = g->ephemeron = g->allweak = NULL;
+  g->totalbytes = sizeof(LG);  /* the LG block just allocated is the only memory so far */
+  g->GCdebt = 0;
+  g->gcpause = LUAI_GCPAUSE;
+  g->gcmajorinc = LUAI_GCMAJOR;
+  g->gcstepmul = LUAI_GCMUL;
+  for (i=0; i < LUA_NUMTAGS; i++) g->mt[i] = NULL;
+  if (luaD_rawrunprotected(L, f_luaopen, NULL) != LUA_OK) {
+    /* memory allocation error: free partial state */
+    close_state(L);
+    L = NULL;
+  }
+  else
+    luai_userstateopen(L);
+  return L;
+}
+
+/* Closes the whole state; any thread may be passed, the main thread is the one actually closed. */
+LUA_API void lua_close (lua_State *L) {
+  L = G(L)->mainthread;  /* only the main thread can be closed */
+  lua_lock(L);
+  luai_userstateclose(L);  /* NOTE(review): runs before close_state() frees all objects -- confirm the user hook tolerates that order */
+  close_state(L);
+}
+
+
diff --git a/ext/lua/src/lstring.c b/ext/lua/src/lstring.c
new file mode 100644
index 0000000..8b5af0b
--- /dev/null
+++ b/ext/lua/src/lstring.c
@@ -0,0 +1,185 @@
+/*
+** $Id: lstring.c,v 2.26 2013/01/08 13:50:10 roberto Exp $
+** String table (keeps all strings handled by Lua)
+** See Copyright Notice in lua.h
+*/
+
+
+#include <string.h>
+
+#define lstring_c
+#define LUA_CORE
+
+#include "lua.h"
+
+#include "lmem.h"
+#include "lobject.h"
+#include "lstate.h"
+#include "lstring.h"
+
+
+/*
+** Lua will use at most ~(2^LUAI_HASHLIMIT) bytes from a string to
+** compute its hash (luaS_hash derives its sampling step from it)
+*/
+#if !defined(LUAI_HASHLIMIT)
+#define LUAI_HASHLIMIT		5
+#endif
+
+
+/*
+** equality for long strings (pointer identity, else length + memcmp)
+*/
+int luaS_eqlngstr (TString *a, TString *b) {
+  size_t len = a->tsv.len;
+  lua_assert(a->tsv.tt == LUA_TLNGSTR && b->tsv.tt == LUA_TLNGSTR);
+  return (a == b) ||  /* same instance or... */
+    ((len == b->tsv.len) &&  /* equal length and ... */
+     (memcmp(getstr(a), getstr(b), len) == 0));  /* equal contents */
+}
+
+
+/*
+** equality for strings: tags must match, then the short or long comparison is used
+*/
+int luaS_eqstr (TString *a, TString *b) {
+  return (a->tsv.tt == b->tsv.tt) &&
+         (a->tsv.tt == LUA_TSHRSTR ? eqshrstr(a, b) : luaS_eqlngstr(a, b));
+}
+
+/* Hashes 'l' bytes of 'str' with 'seed', walking backwards from the end and sampling every 'step'-th byte. */
+unsigned int luaS_hash (const char *str, size_t l, unsigned int seed) {
+  unsigned int h = seed ^ cast(unsigned int, l);
+  size_t l1;
+  size_t step = (l >> LUAI_HASHLIMIT) + 1;  /* step is 1 while l < 2^LUAI_HASHLIMIT */
+  for (l1 = l; l1 >= step; l1 -= step)
+    h = h ^ ((h<<5) + (h>>2) + cast_byte(str[l1 - 1]));
+  return h;
+}
+
+
+/*
+** resizes the string table to 'newsize' buckets and rehashes every entry
+*/
+void luaS_resize (lua_State *L, int newsize) {
+  int i;
+  stringtable *tb = &G(L)->strt;
+  /* cannot resize while GC is traversing strings */
+  luaC_runtilstate(L, ~bitmask(GCSsweepstring));
+  if (newsize > tb->size) {  /* growing: enlarge the array before rehashing */
+    luaM_reallocvector(L, tb->hash, tb->size, newsize, GCObject *);
+    for (i = tb->size; i < newsize; i++) tb->hash[i] = NULL;
+  }
+  /* rehash */
+  for (i=0; i<tb->size; i++) {
+    GCObject *p = tb->hash[i];
+    tb->hash[i] = NULL;
+    while (p) {  /* for each node in the list */
+      GCObject *next = gch(p)->next;  /* save next */
+      unsigned int h = lmod(gco2ts(p)->hash, newsize);  /* new position */
+      gch(p)->next = tb->hash[h];  /* chain it */
+      tb->hash[h] = p;
+      resetoldbit(p);  /* see MOVE OLD rule */
+      p = next;
+    }
+  }
+  if (newsize < tb->size) {  /* shrinking: cut the array only after rehashing */
+    /* shrinking slice must be empty */
+    lua_assert(tb->hash[newsize] == NULL && tb->hash[tb->size - 1] == NULL);
+    luaM_reallocvector(L, tb->hash, tb->size, newsize, GCObject *);
+  }
+  tb->size = newsize;
+}
+
+
+/*
+** creates a new string object holding a copy of 'l' bytes of 'str' plus a final '\0'
+*/
+static TString *createstrobj (lua_State *L, const char *str, size_t l,
+                              int tag, unsigned int h, GCObject **list) {
+  TString *ts;
+  size_t totalsize;  /* total size of TString object */
+  totalsize = sizeof(TString) + ((l + 1) * sizeof(char));
+  ts = &luaC_newobj(L, tag, totalsize, list, 0)->ts;
+  ts->tsv.len = l;
+  ts->tsv.hash = h;
+  ts->tsv.extra = 0;
+  memcpy(ts+1, str, l*sizeof(char));  /* character data is stored right after the header */
+  ((char *)(ts+1))[l] = '\0';  /* ending 0 */
+  return ts;
+}
+
+
+/*
+** creates a new short string, inserting it into string table (table doubles when full)
+*/
+static TString *newshrstr (lua_State *L, const char *str, size_t l,
+                                       unsigned int h) {
+  GCObject **list;  /* (pointer to) list where it will be inserted */
+  stringtable *tb = &G(L)->strt;
+  TString *s;
+  if (tb->nuse >= cast(lu_int32, tb->size) && tb->size <= MAX_INT/2)
+    luaS_resize(L, tb->size*2);  /* too crowded */
+  list = &tb->hash[lmod(h, tb->size)];  /* bucket chosen after any resize */
+  s = createstrobj(L, str, l, LUA_TSHRSTR, h, list);
+  tb->nuse++;
+  return s;
+}
+
+
+/*
+** checks whether short string exists and reuses it or creates a new one
+*/
+static TString *internshrstr (lua_State *L, const char *str, size_t l) {
+  GCObject *o;
+  global_State *g = G(L);
+  unsigned int h = luaS_hash(str, l, g->seed);
+  for (o = g->strt.hash[lmod(h, g->strt.size)];
+       o != NULL;
+       o = gch(o)->next) {  /* walk the bucket's chain */
+    TString *ts = rawgco2ts(o);
+    if (h == ts->tsv.hash &&
+        l == ts->tsv.len &&
+        (memcmp(str, getstr(ts), l * sizeof(char)) == 0)) {  /* hash and length are compared before contents */
+      if (isdead(G(L), o))  /* string is dead (but was not collected yet)? */
+        changewhite(o);  /* resurrect it */
+      return ts;
+    }
+  }
+  return newshrstr(L, str, l, h);  /* not found; create a new string */
+}
+
+
+/*
+** new string (with explicit length); short ones are interned, long ones are not
+*/
+TString *luaS_newlstr (lua_State *L, const char *str, size_t l) {
+  if (l <= LUAI_MAXSHORTLEN)  /* short string? */
+    return internshrstr(L, str, l);
+  else {
+    if (l + 1 > (MAX_SIZET - sizeof(TString))/sizeof(char))
+      luaM_toobig(L);  /* header + data + '\0' would exceed MAX_SIZET */
+    return createstrobj(L, str, l, LUA_TLNGSTR, G(L)->seed, NULL);  /* the seed is stored in the hash field */
+  }
+}
+
+
+/*
+** new zero-terminated string (length taken with strlen)
+*/
+TString *luaS_new (lua_State *L, const char *str) {
+  return luaS_newlstr(L, str, strlen(str));
+}
+
+/* Creates a userdata with 's' payload bytes, no metatable and environment 'e'; oversized requests go to luaM_toobig. */
+Udata *luaS_newudata (lua_State *L, size_t s, Table *e) {
+  Udata *u;
+  if (s > MAX_SIZET - sizeof(Udata))
+    luaM_toobig(L);  /* sizeof(Udata) + s would exceed MAX_SIZET */
+  u = &luaC_newobj(L, LUA_TUSERDATA, sizeof(Udata) + s, NULL, 0)->u;
+  u->uv.len = s;
+  u->uv.metatable = NULL;
+  u->uv.env = e;
+  return u;
+}
+
diff --git a/ext/lua/src/lstrlib.c b/ext/lua/src/lstrlib.c
new file mode 100644
index 0000000..fcc61c9
--- /dev/null
+++ b/ext/lua/src/lstrlib.c
@@ -0,0 +1,1019 @@
+/*
+** $Id: lstrlib.c,v 1.178 2012/08/14 18:12:34 roberto Exp $
+** Standard library for string operations and pattern-matching
+** See Copyright Notice in lua.h
+*/
+
+
+#include <ctype.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define lstrlib_c
+#define LUA_LIB
+
+#include "lua.h"
+
+#include "lauxlib.h"
+#include "lualib.h"
+
+
+/*
+** maximum number of captures that a pattern can do during
+** pattern-matching. This limit is arbitrary.
+*/
+#if !defined(LUA_MAXCAPTURES)
+#define LUA_MAXCAPTURES		32
+#endif
+
+
+/* macro to `unsign' a character */
+#define uchar(c)	((unsigned char)(c))
+
+
+
+static int str_len (lua_State *L) {  /* registered as "len" in strlib */
+  size_t l;
+  luaL_checklstring(L, 1, &l);  /* only the length of argument 1 is used */
+  lua_pushinteger(L, (lua_Integer)l);
+  return 1;  /* one result on the stack */
+}
+
+
+/* turn a possibly negative (counted back from the end) position absolute */
+static size_t posrelat (ptrdiff_t pos, size_t len) {
+  if (pos >= 0) return (size_t)pos;  /* already absolute */
+  if (0u - (size_t)pos > len) return 0;  /* falls before the string start */
+  return len - ((size_t)-pos) + 1;  /* count back from the end */
+}
+
+
+static int str_sub (lua_State *L) {
+  size_t len;
+  const char *str = luaL_checklstring(L, 1, &len);
+  size_t first = posrelat(luaL_checkinteger(L, 2), len);
+  size_t last = posrelat(luaL_optinteger(L, 3, -1), len);  /* default: end */
+  if (first < 1) first = 1;  /* clamp the interval to the string */
+  if (last > len) last = len;
+  if (first > last)  /* empty interval */
+    lua_pushliteral(L, "");
+  else lua_pushlstring(L, str + first - 1, last - first + 1);
+  return 1;
+}
+
+
+static int str_reverse (lua_State *L) {
+  size_t len, rest;
+  luaL_Buffer buf;
+  const char *src = luaL_checklstring(L, 1, &len);
+  char *dst = luaL_buffinitsize(L, &buf, len);
+  for (rest = len; rest > 0; rest--)  /* copy bytes from last to first */
+    *dst++ = src[rest - 1];
+  luaL_pushresultsize(&buf, len);
+  return 1;
+}
+
+
+static int str_lower (lua_State *L) {
+  size_t len;
+  luaL_Buffer buf;
+  const char *src = luaL_checklstring(L, 1, &len);
+  char *out = luaL_buffinitsize(L, &buf, len);
+  const char *stop = src + len;  /* one past the last source byte */
+  for (; src < stop; src++, out++)  /* all 'len' bytes, one at a time */
+    *out = tolower(uchar(*src));
+  luaL_pushresultsize(&buf, len);
+  return 1;
+}
+
+
+static int str_upper (lua_State *L) {
+  size_t len;
+  luaL_Buffer buf;
+  const char *src = luaL_checklstring(L, 1, &len);
+  char *out = luaL_buffinitsize(L, &buf, len);
+  const char *stop = src + len;  /* one past the last source byte */
+  for (; src < stop; src++, out++)  /* all 'len' bytes, one at a time */
+    *out = toupper(uchar(*src));
+  luaL_pushresultsize(&buf, len);
+  return 1;
+}
+
+
+/* reasonable limit to avoid arithmetic overflow */
+#define MAXSIZE		((~(size_t)0) >> 1)
+
+static int str_rep (lua_State *L) {  /* registered as "rep" in strlib */
+  size_t l, lsep;
+  const char *s = luaL_checklstring(L, 1, &l);
+  int n = luaL_checkint(L, 2);  /* repetition count */
+  const char *sep = luaL_optlstring(L, 3, "", &lsep);  /* optional separator */
+  if (n <= 0) lua_pushliteral(L, "");  /* no copies: result is empty */
+  else if (l + lsep < l || l + lsep >= MAXSIZE / n)  /* may overflow? */
+    return luaL_error(L, "resulting string too large");
+  else {
+    size_t totallen = n * l + (n - 1) * lsep;  /* n copies, n-1 separators */
+    luaL_Buffer b;
+    char *p = luaL_buffinitsize(L, &b, totallen);
+    while (n-- > 1) {  /* first n-1 copies (followed by separator) */
+      memcpy(p, s, l * sizeof(char)); p += l;
+      if (lsep > 0) {  /* avoid empty 'memcpy' (may be expensive) */
+        memcpy(p, sep, lsep * sizeof(char)); p += lsep;
+      }
+    }
+    memcpy(p, s, l * sizeof(char));  /* last copy (not followed by separator) */
+    luaL_pushresultsize(&b, totallen);
+  }
+  return 1;
+}
+
+
+static int str_byte (lua_State *L) {  /* registered as "byte" in strlib */
+  size_t l;
+  const char *s = luaL_checklstring(L, 1, &l);
+  size_t posi = posrelat(luaL_optinteger(L, 2, 1), l);  /* default: 1 */
+  size_t pose = posrelat(luaL_optinteger(L, 3, posi), l);  /* default: posi */
+  int n, i;
+  if (posi < 1) posi = 1;  /* clamp the interval to the string */
+  if (pose > l) pose = l;
+  if (posi > pose) return 0;  /* empty interval; return no values */
+  n = (int)(pose -  posi + 1);
+  if (posi + n <= pose)  /* (size_t -> int) overflow? */
+    return luaL_error(L, "string slice too long");
+  luaL_checkstack(L, n, "string slice too long");
+  for (i=0; i<n; i++)
+    lua_pushinteger(L, uchar(s[posi+i-1]));  /* one integer per byte */
+  return n;
+}
+
+
+static int str_char (lua_State *L) {
+  int nargs = lua_gettop(L);  /* one output byte per argument */
+  int arg;
+  luaL_Buffer buf;
+  char *out = luaL_buffinitsize(L, &buf, nargs);
+  for (arg = 1; arg <= nargs; arg++, out++) {
+    int code = luaL_checkint(L, arg);
+    luaL_argcheck(L, uchar(code) == code, arg, "value out of range");
+    *out = uchar(code);
+  }
+  luaL_pushresultsize(&buf, nargs);
+  return 1;
+}
+
+
+static int writer (lua_State *L, const void* b, size_t size, void* B) {
+  (void)L;  /* unused */
+  luaL_addlstring((luaL_Buffer*) B, (const char *)b, size);  /* append piece */
+  return 0;  /* always reports 0 */
+}
+
+
+static int str_dump (lua_State *L) {  /* registered as "dump" in strlib */
+  luaL_Buffer b;
+  luaL_checktype(L, 1, LUA_TFUNCTION);
+  lua_settop(L, 1);  /* discard any extra arguments */
+  luaL_buffinit(L,&b);
+  if (lua_dump(L, writer, &b) != 0)  /* 'writer' appends each piece to b */
+    return luaL_error(L, "unable to dump given function");
+  luaL_pushresult(&b);
+  return 1;
+}
+
+
+
+/*
+** {======================================================
+** PATTERN MATCHING
+** =======================================================
+*/
+
+
+#define CAP_UNFINISHED	(-1)
+#define CAP_POSITION	(-2)
+
+
+typedef struct MatchState {
+  int matchdepth;  /* control for recursive depth (to avoid C stack overflow) */
+  const char *src_init;  /* init of source string */
+  const char *src_end;  /* end ('\0') of source string */
+  const char *p_end;  /* end ('\0') of pattern */
+  lua_State *L;  /* state used to raise errors and push results */
+  int level;  /* total number of captures (finished or unfinished) */
+  struct {
+    const char *init;  /* where the capture starts in the source */
+    ptrdiff_t len;  /* its length, or CAP_UNFINISHED / CAP_POSITION */
+  } capture[LUA_MAXCAPTURES];
+} MatchState;
+
+
+/* recursive function */
+static const char *match (MatchState *ms, const char *s, const char *p);
+
+
+/* maximum recursion depth for 'match' */
+#if !defined(MAXCCALLS)
+#define MAXCCALLS	200
+#endif
+
+
+#define L_ESC		'%'
+#define SPECIALS	"^$*+?.([%-"
+
+
+static int check_capture (MatchState *ms, int l) {
+  int idx = l - '1';  /* digit character -> zero-based capture index */
+  if (idx >= 0 && idx < ms->level && ms->capture[idx].len != CAP_UNFINISHED)
+    return idx;  /* refers to a capture that is not unfinished */
+  return luaL_error(ms->L, "invalid capture index %%%d", idx + 1);
+}
+
+
+static int capture_to_close (MatchState *ms) {
+  int i = ms->level;
+  while (--i >= 0)  /* scan back from the most recently opened capture */
+    if (ms->capture[i].len == CAP_UNFINISHED) return i;
+  return luaL_error(ms->L, "invalid pattern capture");
+}
+
+
+static const char *classend (MatchState *ms, const char *p) {
+  switch (*p++) {  /* p now points just after the first char of the item */
+    case L_ESC: {
+      if (p == ms->p_end)
+        luaL_error(ms->L, "malformed pattern (ends with " LUA_QL("%%") ")");
+      return p+1;  /* skip the escaped character */
+    }
+    case '[': {
+      if (*p == '^') p++;  /* skip the `^' that may open a set */
+      do {  /* look for a `]' */
+        if (p == ms->p_end)
+          luaL_error(ms->L, "malformed pattern (missing " LUA_QL("]") ")");
+        if (*(p++) == L_ESC && p < ms->p_end)
+          p++;  /* skip escapes (e.g. `%]') */
+      } while (*p != ']');
+      return p+1;  /* position after the closing `]' */
+    }
+    default: {
+      return p;  /* single character item */
+    }
+  }
+}
+
+
+static int match_class (int c, int cl) {
+  int hit;  /* outcome of the test named by the lower-case class letter */
+  switch (tolower(cl)) {
+    case 'a': hit = isalpha(c); break;
+    case 'c': hit = iscntrl(c); break;
+    case 'd': hit = isdigit(c); break;
+    case 'g': hit = isgraph(c); break;
+    case 'l': hit = islower(c); break;
+    case 'p': hit = ispunct(c); break;
+    case 's': hit = isspace(c); break;
+    case 'u': hit = isupper(c); break;
+    case 'w': hit = isalnum(c); break;
+    case 'x': hit = isxdigit(c); break;
+    case 'z': hit = (c == 0); break;  /* deprecated option */
+    default: return (cl == c);  /* not a class letter: compare literally */
+  }
+  return islower(cl) ? hit : !hit;  /* upper-case letter negates the class */
+}
+
+
+static int matchbracketclass (int c, const char *p, const char *ec) {
+  int sig = 1;  /* value returned on a hit (0 when the set starts with `^') */
+  if (*(p+1) == '^') {
+    sig = 0;
+    p++;  /* skip the `^' */
+  }
+  while (++p < ec) {  /* callers pass ec pointing at the closing `]' */
+    if (*p == L_ESC) {
+      p++;
+      if (match_class(c, uchar(*p)))
+        return sig;
+    }
+    else if ((*(p+1) == '-') && (p+2 < ec)) {  /* range `x-y' */
+      p+=2;
+      if (uchar(*(p-2)) <= c && c <= uchar(*p))
+        return sig;
+    }
+    else if (uchar(*p) == c) return sig;
+  }
+  return !sig;  /* no element of the set matched */
+}
+
+
+static int singlematch (MatchState *ms, const char *s, const char *p,
+                        const char *ep) {
+  int ch;  /* subject character, as an unsigned value */
+  if (s >= ms->src_end)
+    return 0;  /* nothing left in the subject to match */
+  ch = uchar(*s);
+  if (*p == '.')
+    return 1;  /* matches any char */
+  else if (*p == L_ESC)
+    return match_class(ch, uchar(*(p+1)));
+  else if (*p == '[')
+    return matchbracketclass(ch, p, ep-1);
+  return (uchar(*p) == ch);  /* plain character */
+}
+
+
+static const char *matchbalance (MatchState *ms, const char *s,
+                                   const char *p) {
+  int opener;  /* first delimiter taken from the pattern */
+  int closer;  /* second delimiter taken from the pattern */
+  int depth = 1;  /* nesting level; the initial opener counts as one */
+  if (p >= ms->p_end - 1)
+    luaL_error(ms->L, "malformed pattern "
+                      "(missing arguments to " LUA_QL("%%b") ")");
+  if (*s != *p) return NULL;  /* subject does not start with the opener */
+  opener = *p;
+  closer = *(p+1);
+  for (s++; s < ms->src_end; s++) {
+    if (*s == closer) {
+      if (--depth == 0) return s+1;  /* balanced: position after closer */
+    }
+    else if (*s == opener) depth++;
+  }
+  return NULL;  /* string ends out of balance */
+}
+
+
+static const char *max_expand (MatchState *ms, const char *s,
+                                 const char *p, const char *ep) {
+  ptrdiff_t reps;  /* number of repetitions currently being tried */
+  const char *res;
+  for (reps = 0; singlematch(ms, s + reps, p, ep); reps++)
+    ;  /* count how many times in a row the item matches */
+  /* try the rest of the pattern, giving back one repetition at a time */
+  for (; reps >= 0; reps--) {
+    res = match(ms, (s+reps), ep+1);
+    if (res != NULL) return res;
+  }
+  return NULL;
+}
+
+
+static const char *min_expand (MatchState *ms, const char *s,
+                                 const char *p, const char *ep) {
+  const char *res;
+  /* rest of the pattern goes first; take one more item only when it fails */
+  while ((res = match(ms, s, ep+1)) == NULL) {
+    if (!singlematch(ms, s, p, ep))
+      return NULL;  /* item cannot repeat any further */
+    s++;  /* try with one more repetition */
+  }
+  return res;
+}
+
+
+static const char *start_capture (MatchState *ms, const char *s,
+                                    const char *p, int what) {
+  const char *res;
+  int level = ms->level;
+  if (level >= LUA_MAXCAPTURES) luaL_error(ms->L, "too many captures");
+  ms->capture[level].init = s;
+  ms->capture[level].len = what;  /* CAP_UNFINISHED or CAP_POSITION */
+  ms->level = level+1;
+  if ((res=match(ms, s, p)) == NULL)  /* match failed? */
+    ms->level--;  /* undo capture */
+  return res;
+}
+
+
+static const char *end_capture (MatchState *ms, const char *s,
+                                  const char *p) {
+  int l = capture_to_close(ms);  /* innermost capture still unfinished */
+  const char *res;
+  ms->capture[l].len = s - ms->capture[l].init;  /* close capture */
+  if ((res = match(ms, s, p)) == NULL)  /* match failed? */
+    ms->capture[l].len = CAP_UNFINISHED;  /* undo capture */
+  return res;
+}
+
+
+static const char *match_capture (MatchState *ms, const char *s, int l) {
+  size_t len;
+  l = check_capture(ms, l);  /* digit character -> capture index */
+  len = ms->capture[l].len;
+  if ((size_t)(ms->src_end-s) >= len &&  /* enough subject left? */
+      memcmp(ms->capture[l].init, s, len) == 0)
+    return s+len;  /* subject repeats the captured text here */
+  else return NULL;
+}
+
+
+static const char *match (MatchState *ms, const char *s, const char *p) {
+  if (ms->matchdepth-- == 0)  /* recursion budget used up? */
+    luaL_error(ms->L, "pattern too complex");
+  init: /* using goto's to optimize tail recursion */
+  if (p != ms->p_end) {  /* end of pattern? */
+    switch (*p) {
+      case '(': {  /* start capture */
+        if (*(p + 1) == ')')  /* position capture? */
+          s = start_capture(ms, s, p + 2, CAP_POSITION);
+        else
+          s = start_capture(ms, s, p + 1, CAP_UNFINISHED);
+        break;
+      }
+      case ')': {  /* end capture */
+        s = end_capture(ms, s, p + 1);
+        break;
+      }
+      case '$': {
+        if ((p + 1) != ms->p_end)  /* is the `$' the last char in pattern? */
+          goto dflt;  /* no; go to default */
+        s = (s == ms->src_end) ? s : NULL;  /* check end of string */
+        break;
+      }
+      case L_ESC: {  /* escaped sequences not in the format class[*+?-]? */
+        switch (*(p + 1)) {
+          case 'b': {  /* balanced string? */
+            s = matchbalance(ms, s, p + 2);
+            if (s != NULL) {
+              p += 4; goto init;  /* return match(ms, s, p + 4); */
+            }  /* else fail (s == NULL) */
+            break;
+          }
+          case 'f': {  /* frontier? */
+            const char *ep; char previous;
+            p += 2;
+            if (*p != '[')
+              luaL_error(ms->L, "missing " LUA_QL("[") " after "
+                                 LUA_QL("%%f") " in pattern");
+            ep = classend(ms, p);  /* points to what is next */
+            previous = (s == ms->src_init) ? '\0' : *(s - 1);
+            if (!matchbracketclass(uchar(previous), p, ep - 1) &&
+               matchbracketclass(uchar(*s), p, ep - 1)) {
+              p = ep; goto init;  /* return match(ms, s, ep); */
+            }
+            s = NULL;  /* match failed */
+            break;
+          }
+          case '0': case '1': case '2': case '3':
+          case '4': case '5': case '6': case '7':
+          case '8': case '9': {  /* capture results (%0-%9)? */
+            s = match_capture(ms, s, uchar(*(p + 1)));
+            if (s != NULL) {
+              p += 2; goto init;  /* return match(ms, s, p + 2) */
+            }
+            break;
+          }
+          default: goto dflt;
+        }
+        break;
+      }
+      default: dflt: {  /* pattern class plus optional suffix */
+        const char *ep = classend(ms, p);  /* points to optional suffix */
+        /* does not match at least once? */
+        if (!singlematch(ms, s, p, ep)) {
+          if (*ep == '*' || *ep == '?' || *ep == '-') {  /* accept empty? */
+            p = ep + 1; goto init;  /* return match(ms, s, ep + 1); */
+          }
+          else  /* '+' or no suffix */
+            s = NULL;  /* fail */
+        }
+        else {  /* matched once */
+          switch (*ep) {  /* handle optional suffix */
+            case '?': {  /* optional */
+              const char *res;
+              if ((res = match(ms, s + 1, ep + 1)) != NULL)
+                s = res;
+              else {
+                p = ep + 1; goto init;  /* else return match(ms, s, ep + 1); */
+              }
+              break;
+            }
+            case '+':  /* 1 or more repetitions */
+              s++;  /* 1 match already done */
+              /* go through */
+            case '*':  /* 0 or more repetitions */
+              s = max_expand(ms, s, p, ep);
+              break;
+            case '-':  /* 0 or more repetitions (minimum) */
+              s = min_expand(ms, s, p, ep);
+              break;
+            default:  /* no suffix */
+              s++; p = ep; goto init;  /* return match(ms, s + 1, ep); */
+          }
+        }
+        break;
+      }
+    }
+  }
+  ms->matchdepth++;  /* give the level back on the way out */
+  return s;  /* where the match ended, or NULL when it failed */
+}
+
+
+
+static const char *lmemfind (const char *s1, size_t l1,
+                               const char *s2, size_t l2) {
+  const char *hit;  /* where the first byte of `s2' was found in `s1' */
+  size_t tail;  /* number of bytes of `s2' after its first one */
+  if (l2 == 0) return s1;  /* empty strings are everywhere */
+  if (l2 > l1) return NULL;  /* avoids a negative `l1' */
+  tail = l2 - 1;  /* 1st char will be checked by `memchr' */
+  l1 -= tail;  /* `s2' cannot be found after that */
+  /* find the first byte, then compare the remaining ones */
+  while (l1 > 0) {
+    hit = (const char *)memchr(s1, *s2, l1);
+    if (hit == NULL)
+      break;  /* first byte does not occur any more */
+    if (memcmp(hit + 1, s2 + 1, tail) == 0)
+      return hit;
+    l1 -= (hit + 1) - s1;  /* resume right after the failed candidate */
+    s1 = hit + 1;
+  }
+  return NULL;  /* not found */
+}
+
+
+static void push_onecapture (MatchState *ms, int i, const char *s,
+                                                    const char *e) {
+  ptrdiff_t len;
+  if (i >= ms->level) {  /* no capture with this index */
+    if (i != 0)
+      luaL_error(ms->L, "invalid capture index");
+    else  /* ms->level == 0, too */
+      lua_pushlstring(ms->L, s, e - s);  /* add whole match */
+    return;
+  }
+  len = ms->capture[i].len;
+  if (len == CAP_UNFINISHED) luaL_error(ms->L, "unfinished capture");
+  if (len == CAP_POSITION)  /* position capture: push an index, not text */
+    lua_pushinteger(ms->L, ms->capture[i].init - ms->src_init + 1);
+  else
+    lua_pushlstring(ms->L, ms->capture[i].init, len);
+}
+
+
+static int push_captures (MatchState *ms, const char *s, const char *e) {
+  int count = (ms->level == 0 && s) ? 1 : ms->level;  /* whole match if none */
+  int idx = 0;
+  luaL_checkstack(ms->L, count, "too many captures");
+  while (idx < count)
+    push_onecapture(ms, idx++, s, e);
+  return count;  /* number of values pushed */
+}
+
+
+/* true when none of the SPECIALS characters occurs in the 'l' bytes of 'p' */
+static int nospecials (const char *p, size_t l) {
+  size_t done = 0;  /* bytes of the pattern already examined */
+  for (;;) {
+    if (strpbrk(p + done, SPECIALS) != NULL)
+      return 0;  /* pattern has a special character */
+    done += strlen(p + done) + 1;  /* may have more after \0 */
+    if (done > l) return 1;  /* no special chars found */
+  }
+}
+
+
+static int str_find_aux (lua_State *L, int find) {  /* find: 1 from str_find, 0 from str_match */
+  size_t ls, lp;
+  const char *s = luaL_checklstring(L, 1, &ls);
+  const char *p = luaL_checklstring(L, 2, &lp);
+  size_t init = posrelat(luaL_optinteger(L, 3, 1), ls);  /* start position (default 1) */
+  if (init < 1) init = 1;
+  else if (init > ls + 1) {  /* start after string's end? */
+    lua_pushnil(L);  /* cannot find anything */
+    return 1;
+  }
+  /* explicit request or no special characters? */
+  if (find && (lua_toboolean(L, 4) || nospecials(p, lp))) {
+    /* do a plain search */
+    const char *s2 = lmemfind(s + init - 1, ls - init + 1, p, lp);
+    if (s2) {
+      lua_pushinteger(L, s2 - s + 1);  /* start */
+      lua_pushinteger(L, s2 - s + lp);  /* end */
+      return 2;
+    }
+  }
+  else {
+    MatchState ms;
+    const char *s1 = s + init - 1;
+    int anchor = (*p == '^');  /* anchored pattern: only one attempt, at s1 */
+    if (anchor) {
+      p++; lp--;  /* skip anchor character */
+    }
+    ms.L = L;
+    ms.matchdepth = MAXCCALLS;
+    ms.src_init = s;
+    ms.src_end = s + ls;
+    ms.p_end = p + lp;
+    do {
+      const char *res;
+      ms.level = 0;  /* forget captures of the previous attempt */
+      lua_assert(ms.matchdepth == MAXCCALLS);
+      if ((res=match(&ms, s1, p)) != NULL) {
+        if (find) {
+          lua_pushinteger(L, s1 - s + 1);  /* start */
+          lua_pushinteger(L, res - s);   /* end */
+          return push_captures(&ms, NULL, 0) + 2;
+        }
+        else
+          return push_captures(&ms, s1, res);
+      }
+    } while (s1++ < ms.src_end && !anchor);
+  }
+  lua_pushnil(L);  /* not found */
+  return 1;
+}
+
+
+static int str_find (lua_State *L) {  /* registered as "find" in strlib */
+  return str_find_aux(L, 1);  /* find = 1: push start and end before captures */
+}
+
+
+static int str_match (lua_State *L) {  /* registered as "match" in strlib */
+  return str_find_aux(L, 0);  /* find = 0: push only the captures */
+}
+
+
+static int gmatch_aux (lua_State *L) {  /* the closure created by gmatch */
+  MatchState ms;
+  size_t ls, lp;
+  const char *s = lua_tolstring(L, lua_upvalueindex(1), &ls);  /* subject */
+  const char *p = lua_tolstring(L, lua_upvalueindex(2), &lp);  /* pattern */
+  const char *src;
+  ms.L = L;
+  ms.matchdepth = MAXCCALLS;
+  ms.src_init = s;
+  ms.src_end = s+ls;
+  ms.p_end = p + lp;
+  for (src = s + (size_t)lua_tointeger(L, lua_upvalueindex(3));  /* resume offset */
+       src <= ms.src_end;
+       src++) {
+    const char *e;
+    ms.level = 0;  /* forget captures of the previous attempt */
+    lua_assert(ms.matchdepth == MAXCCALLS);
+    if ((e = match(&ms, src, p)) != NULL) {
+      lua_Integer newstart = e-s;
+      if (e == src) newstart++;  /* empty match? go at least one position */
+      lua_pushinteger(L, newstart);
+      lua_replace(L, lua_upvalueindex(3));  /* remember where to resume */
+      return push_captures(&ms, src, e);
+    }
+  }
+  return 0;  /* not found */
+}
+
+
+static int gmatch (lua_State *L) {  /* registered as "gmatch" in strlib */
+  luaL_checkstring(L, 1);
+  luaL_checkstring(L, 2);
+  lua_settop(L, 2);  /* keep only the two string arguments */
+  lua_pushinteger(L, 0);  /* third upvalue: offset where gmatch_aux starts */
+  lua_pushcclosure(L, gmatch_aux, 3);
+  return 1;
+}
+
+
+static void add_s (MatchState *ms, luaL_Buffer *b, const char *s,
+                                                   const char *e) {
+  size_t l, i;
+  const char *news = lua_tolstring(ms->L, 3, &l);  /* replacement (argument 3) */
+  for (i = 0; i < l; i++) {
+    if (news[i] != L_ESC)
+      luaL_addchar(b, news[i]);  /* ordinary character: copy it */
+    else {
+      i++;  /* skip ESC */
+      if (!isdigit(uchar(news[i]))) {
+        if (news[i] != L_ESC)
+          luaL_error(ms->L, "invalid use of " LUA_QL("%c")
+                           " in replacement string", L_ESC);
+        luaL_addchar(b, news[i]);  /* doubled ESC adds a single one */
+      }
+      else if (news[i] == '0')
+          luaL_addlstring(b, s, e - s);  /* `%0': the whole match s..e */
+      else {
+        push_onecapture(ms, news[i] - '1', s, e);
+        luaL_addvalue(b);  /* add capture to accumulated result */
+      }
+    }
+  }
+}
+
+
+static void add_value (MatchState *ms, luaL_Buffer *b, const char *s,
+                                       const char *e, int tr) {
+  lua_State *L = ms->L;
+  switch (tr) {  /* tr: type of the replacement value (argument 3) */
+    case LUA_TFUNCTION: {
+      int n;
+      lua_pushvalue(L, 3);
+      n = push_captures(ms, s, e);
+      lua_call(L, n, 1);  /* call it with the captures; keep one result */
+      break;
+    }
+    case LUA_TTABLE: {
+      push_onecapture(ms, 0, s, e);
+      lua_gettable(L, 3);  /* index the table with the first capture */
+      break;
+    }
+    default: {  /* LUA_TNUMBER or LUA_TSTRING */
+      add_s(ms, b, s, e);
+      return;
+    }
+  }
+  if (!lua_toboolean(L, -1)) {  /* nil or false? */
+    lua_pop(L, 1);
+    lua_pushlstring(L, s, e - s);  /* keep original text */
+  }
+  else if (!lua_isstring(L, -1))
+    luaL_error(L, "invalid replacement value (a %s)", luaL_typename(L, -1));
+  luaL_addvalue(b);  /* add result to accumulator */
+}
+
+
+static int str_gsub (lua_State *L) {  /* registered as "gsub" in strlib */
+  size_t srcl, lp;
+  const char *src = luaL_checklstring(L, 1, &srcl);
+  const char *p = luaL_checklstring(L, 2, &lp);
+  int tr = lua_type(L, 3);  /* type of the replacement value */
+  size_t max_s = luaL_optinteger(L, 4, srcl+1);  /* limit on substitutions */
+  int anchor = (*p == '^');
+  size_t n = 0;  /* substitutions made so far */
+  MatchState ms;
+  luaL_Buffer b;
+  luaL_argcheck(L, tr == LUA_TNUMBER || tr == LUA_TSTRING ||
+                   tr == LUA_TFUNCTION || tr == LUA_TTABLE, 3,
+                      "string/function/table expected");
+  luaL_buffinit(L, &b);
+  if (anchor) {
+    p++; lp--;  /* skip anchor character */
+  }
+  ms.L = L;
+  ms.matchdepth = MAXCCALLS;
+  ms.src_init = src;
+  ms.src_end = src+srcl;
+  ms.p_end = p + lp;
+  while (n < max_s) {
+    const char *e;
+    ms.level = 0;  /* forget captures of the previous attempt */
+    lua_assert(ms.matchdepth == MAXCCALLS);
+    e = match(&ms, src, p);
+    if (e) {
+      n++;
+      add_value(&ms, &b, src, e, tr);
+    }
+    if (e && e>src) /* non empty match? */
+      src = e;  /* skip it */
+    else if (src < ms.src_end)
+      luaL_addchar(&b, *src++);  /* copy one byte and move on */
+    else break;
+    if (anchor) break;  /* anchored pattern: a single attempt */
+  }
+  luaL_addlstring(&b, src, ms.src_end-src);  /* rest of the subject */
+  luaL_pushresult(&b);
+  lua_pushinteger(L, n);  /* number of substitutions */
+  return 2;
+}
+
+/* }====================================================== */
+
+
+
+/*
+** {======================================================
+** STRING FORMAT
+** =======================================================
+*/
+
+/*
+** LUA_INTFRMLEN is the length modifier for integer conversions in
+** 'string.format'; LUA_INTFRM_T is the integer type corresponding to
+** the previous length
+*/
+#if !defined(LUA_INTFRMLEN)	/* { */
+#if defined(LUA_USE_LONGLONG)
+
+#define LUA_INTFRMLEN		"ll"
+#define LUA_INTFRM_T		long long
+
+#else
+
+#define LUA_INTFRMLEN		"l"
+#define LUA_INTFRM_T		long
+
+#endif
+#endif				/* } */
+
+
+/*
+** LUA_FLTFRMLEN is the length modifier for float conversions in
+** 'string.format'; LUA_FLTFRM_T is the float type corresponding to
+** the previous length
+*/
+#if !defined(LUA_FLTFRMLEN)
+
+#define LUA_FLTFRMLEN		""
+#define LUA_FLTFRM_T		double
+
+#endif
+
+
+/* maximum size of each formatted item (> len(format('%99.99f', -1e308))) */
+#define MAX_ITEM	512
+/* valid flags in a format specification */
+#define FLAGS	"-+ #0"
+/*
+** maximum size of each format specification (such as '%-099.99d')
+** (+10 accounts for %99.99x plus margin of error)
+*/
+#define MAX_FORMAT	(sizeof(FLAGS) + sizeof(LUA_INTFRMLEN) + 10)
+
+
+static void addquoted (lua_State *L, luaL_Buffer *b, int arg) {  /* used for the 'q' option of str_format */
+  size_t l;
+  const char *s = luaL_checklstring(L, arg, &l);
+  luaL_addchar(b, '"');  /* opening quote */
+  while (l--) {
+    if (*s == '"' || *s == '\\' || *s == '\n') {
+      luaL_addchar(b, '\\');  /* these three get a backslash in front */
+      luaL_addchar(b, *s);
+    }
+    else if (*s == '\0' || iscntrl(uchar(*s))) {
+      char buff[10];
+      if (!isdigit(uchar(*(s+1))))
+        sprintf(buff, "\\%d", (int)uchar(*s));
+      else
+        sprintf(buff, "\\%03d", (int)uchar(*s));  /* a digit follows: use 3 digits */
+      luaL_addstring(b, buff);
+    }
+    else
+      luaL_addchar(b, *s);
+    s++;
+  }
+  luaL_addchar(b, '"');  /* closing quote */
+}
+
+static const char *scanformat (lua_State *L, const char *strfrmt, char *form) {  /* strfrmt: just after the '%' */
+  const char *p = strfrmt;
+  while (*p != '\0' && strchr(FLAGS, *p) != NULL) p++;  /* skip flags */
+  if ((size_t)(p - strfrmt) >= sizeof(FLAGS)/sizeof(char))
+    luaL_error(L, "invalid format (repeated flags)");
+  if (isdigit(uchar(*p))) p++;  /* skip width */
+  if (isdigit(uchar(*p))) p++;  /* (2 digits at most) */
+  if (*p == '.') {
+    p++;
+    if (isdigit(uchar(*p))) p++;  /* skip precision */
+    if (isdigit(uchar(*p))) p++;  /* (2 digits at most) */
+  }
+  if (isdigit(uchar(*p)))
+    luaL_error(L, "invalid format (width or precision too long)");
+  *(form++) = '%';
+  memcpy(form, strfrmt, (p - strfrmt + 1) * sizeof(char));  /* +1: the char at p too */
+  form += p - strfrmt + 1;
+  *form = '\0';
+  return p;  /* points at the conversion character */
+}
+
+
+/*
+** insert 'lenmod' just before the conversion character that ends 'form'
+*/
+static void addlenmod (char *form, const char *lenmod) {
+  size_t modlen = strlen(lenmod);
+  char *last = form + strlen(form) - 1;  /* the conversion character */
+  char conv = *last;
+  memcpy(last, lenmod, modlen * sizeof(char));  /* modifier goes over it */
+  last[modlen] = conv;  /* conversion character goes back after it */
+  last[modlen + 1] = '\0';
+}
+
+
+static int str_format (lua_State *L) {  /* registered as "format" in strlib */
+  int top = lua_gettop(L);
+  int arg = 1;  /* index of the argument consumed last (1 is the format) */
+  size_t sfl;
+  const char *strfrmt = luaL_checklstring(L, arg, &sfl);
+  const char *strfrmt_end = strfrmt+sfl;
+  luaL_Buffer b;
+  luaL_buffinit(L, &b);
+  while (strfrmt < strfrmt_end) {
+    if (*strfrmt != L_ESC)
+      luaL_addchar(&b, *strfrmt++);  /* ordinary character: copy it */
+    else if (*++strfrmt == L_ESC)
+      luaL_addchar(&b, *strfrmt++);  /* %% */
+    else { /* format item */
+      char form[MAX_FORMAT];  /* to store the format (`%...') */
+      char *buff = luaL_prepbuffsize(&b, MAX_ITEM);  /* to put formatted item */
+      int nb = 0;  /* number of bytes in added item */
+      if (++arg > top)
+        luaL_argerror(L, arg, "no value");
+      strfrmt = scanformat(L, strfrmt, form);
+      switch (*strfrmt++) {
+        case 'c': {
+          nb = sprintf(buff, form, luaL_checkint(L, arg));
+          break;
+        }
+        case 'd': case 'i': {
+          lua_Number n = luaL_checknumber(L, arg);
+          LUA_INTFRM_T ni = (LUA_INTFRM_T)n;
+          lua_Number diff = n - (lua_Number)ni;  /* what the conversion lost */
+          luaL_argcheck(L, -1 < diff && diff < 1, arg,
+                        "not a number in proper range");
+          addlenmod(form, LUA_INTFRMLEN);
+          nb = sprintf(buff, form, ni);
+          break;
+        }
+        case 'o': case 'u': case 'x': case 'X': {
+          lua_Number n = luaL_checknumber(L, arg);
+          unsigned LUA_INTFRM_T ni = (unsigned LUA_INTFRM_T)n;
+          lua_Number diff = n - (lua_Number)ni;  /* what the conversion lost */
+          luaL_argcheck(L, -1 < diff && diff < 1, arg,
+                        "not a non-negative number in proper range");
+          addlenmod(form, LUA_INTFRMLEN);
+          nb = sprintf(buff, form, ni);
+          break;
+        }
+        case 'e': case 'E': case 'f':
+#if defined(LUA_USE_AFORMAT)
+        case 'a': case 'A':
+#endif
+        case 'g': case 'G': {
+          addlenmod(form, LUA_FLTFRMLEN);
+          nb = sprintf(buff, form, (LUA_FLTFRM_T)luaL_checknumber(L, arg));
+          break;
+        }
+        case 'q': {
+          addquoted(L, &b, arg);  /* writes to b itself; nb stays 0 */
+          break;
+        }
+        case 's': {
+          size_t l;
+          const char *s = luaL_tolstring(L, arg, &l);
+          if (!strchr(form, '.') && l >= 100) {
+            /* no precision and string is too long to be formatted;
+               keep original string */
+            luaL_addvalue(&b);
+            break;
+          }
+          else {
+            nb = sprintf(buff, form, s);
+            lua_pop(L, 1);  /* remove result from 'luaL_tolstring' */
+            break;
+          }
+        }
+        default: {  /* also treat cases `pnLlh' */
+          return luaL_error(L, "invalid option " LUA_QL("%%%c") " to "
+                               LUA_QL("format"), *(strfrmt - 1));
+        }
+      }
+      luaL_addsize(&b, nb);  /* account for the bytes sprintf put in buff */
+    }
+  }
+  luaL_pushresult(&b);
+  return 1;
+}
+
+/* }====================================================== */
+
+
+static const luaL_Reg strlib[] = {  /* name -> C function, for luaL_newlib */
+  {"byte", str_byte},
+  {"char", str_char},
+  {"dump", str_dump},
+  {"find", str_find},
+  {"format", str_format},
+  {"gmatch", gmatch},
+  {"gsub", str_gsub},
+  {"len", str_len},
+  {"lower", str_lower},
+  {"match", str_match},
+  {"rep", str_rep},
+  {"reverse", str_reverse},
+  {"sub", str_sub},
+  {"upper", str_upper},
+  {NULL, NULL}  /* end of the list */
+};
+
+
+static void createmetatable (lua_State *L) {  /* expects the library table on top of the stack */
+  lua_createtable(L, 0, 1);  /* table to be metatable for strings */
+  lua_pushliteral(L, "");  /* dummy string */
+  lua_pushvalue(L, -2);  /* copy table */
+  lua_setmetatable(L, -2);  /* set table as metatable for strings */
+  lua_pop(L, 1);  /* pop dummy string */
+  lua_pushvalue(L, -2);  /* get string library */
+  lua_setfield(L, -2, "__index");  /* metatable.__index = string */
+  lua_pop(L, 1);  /* pop metatable */
+}
+
+
+/*
+** Open string library
+*/
+LUAMOD_API int luaopen_string (lua_State *L) {
+  luaL_newlib(L, strlib);  /* new table with the functions listed in strlib */
+  createmetatable(L);  /* make that table the __index of the string metatable */
+  return 1;  /* one result: the library table */
+}
+
diff --git a/ext/lua/src/ltable.c b/ext/lua/src/ltable.c
new file mode 100644
index 0000000..420391f
--- /dev/null
+++ b/ext/lua/src/ltable.c
@@ -0,0 +1,588 @@
+/*
+** $Id: ltable.c,v 2.72 2012/09/11 19:37:16 roberto Exp $
+** Lua tables (hash)
+** See Copyright Notice in lua.h
+*/
+
+
+/*
+** Implementation of tables (aka arrays, objects, or hash tables).
+** Tables keep their elements in two parts: an array part and a hash part.
+** Non-negative integer keys are all candidates to be kept in the array
+** part. The actual size of the array is the largest `n' such that at
+** least half the slots between 0 and n are in use.
+** Hash uses a mix of chained scatter table with Brent's variation.
+** A main invariant of these tables is that, if an element is not
+** in its main position (i.e. the `original' position that its hash gives
+** to it), then the colliding element is in its own main position.
+** Hence even when the load factor reaches 100%, performance remains good.
+*/
+
+#include <string.h>
+
+#define ltable_c
+#define LUA_CORE
+
+#include "lua.h"
+
+#include "ldebug.h"
+#include "ldo.h"
+#include "lgc.h"
+#include "lmem.h"
+#include "lobject.h"
+#include "lstate.h"
+#include "lstring.h"
+#include "ltable.h"
+#include "lvm.h"
+
+
+/*
+** max size of array part is 2^MAXBITS (30 bits at most, fewer for small ints)
+*/
+#if LUAI_BITSINT >= 32
+#define MAXBITS		30
+#else
+#define MAXBITS		(LUAI_BITSINT-2)
+#endif
+
+#define MAXASIZE	(1 << MAXBITS)  /* largest key counted as an array index */
+
+
+#define hashpow2(t,n)		(gnode(t, lmod((n), sizenode(t))))
+/* strings hash on their stored hash field, booleans on their value */
+#define hashstr(t,str)		hashpow2(t, (str)->tsv.hash)
+#define hashboolean(t,p)	hashpow2(t, p)
+
+/*
+** for some types, it is better to avoid modulus by power of 2, as
+** they tend to have many 2 factors. The divisor used below,
+** (sizenode(t)-1)|1, is always odd.
+*/
+#define hashmod(t,n)	(gnode(t, ((n) % ((sizenode(t)-1)|1))))
+
+/* pointers go through IntPoint before being reduced with hashmod */
+#define hashpointer(t,p)	hashmod(t, IntPoint(p))
+
+/* single constant node that tables without a hash part point to */
+#define dummynode		(&dummynode_)
+
+#define isdummy(n)		((n) == dummynode)
+
+static const Node dummynode_ = {
+  {NILCONSTANT},  /* value */
+  {{NILCONSTANT, NULL}}  /* key */
+};
+
+
+/*
+** Map a lua_Number to its main node: fold it into an int, force that
+** int to be non-negative, then reduce it with 'hashmod'.
+*/
+static Node *hashnum (const Table *t, lua_Number n) {
+  int i;
+  luai_hashnum(i, n);
+  if (i < 0) {  /* must be a positive value */
+    /* only INT_MIN equals its own unsigned negation; '-i' would overflow */
+    i = (cast(unsigned int, i) == 0u - i) ? 0 : -i;
+  }
+  return hashmod(t, i);
+}
+
+
+
+/*
+** Return the node where 'key' would live if nothing collided with it
+** (its hash reduced to the current size of the node vector).
+*/
+static Node *mainposition (const Table *t, const TValue *key) {
+  Node *mp;
+  switch (ttype(key)) {
+    case LUA_TNUMBER: mp = hashnum(t, nvalue(key)); break;
+    case LUA_TSHRSTR: mp = hashstr(t, rawtsvalue(key)); break;
+    case LUA_TBOOLEAN: mp = hashboolean(t, bvalue(key)); break;
+    case LUA_TLIGHTUSERDATA: mp = hashpointer(t, pvalue(key)); break;
+    case LUA_TLCF: mp = hashpointer(t, fvalue(key)); break;
+    case LUA_TLNGSTR: {
+      TString *ls = rawtsvalue(key);
+      if (ls->tsv.extra == 0) {  /* hash not computed yet? */
+        /* the current 'hash' field is handed back to luaS_hash */
+        ls->tsv.hash = luaS_hash(getstr(ls), ls->tsv.len, ls->tsv.hash);
+        ls->tsv.extra = 1;  /* remember that the hash is now valid */
+      }
+      mp = hashstr(t, ls);
+      break;
+    }
+    default:  /* every other type hashes the pointer given by gcvalue */
+      mp = hashpointer(t, gcvalue(key));
+      break;
+  }
+  return mp;
+}
+
+
+/*
+** If 'key' is a number with an exact int value, return that int;
+** otherwise return -1. Callers decide whether the value is in range
+** for the array part.
+*/
+static int arrayindex (const TValue *key) {
+  lua_Number n;
+  int k;
+  if (!ttisnumber(key))
+    return -1;  /* not a number at all */
+  n = nvalue(key);
+  lua_number2int(k, n);
+  return luai_numeq(cast_num(k), n) ? k : -1;  /* -1 if not an exact int */
+}
+
+
+/*
+** Translate 'key' into a traversal position for 'luaH_next'. Array
+** slots come first (0-based); hash nodes follow, offset by the size of
+** the array part. A nil key yields -1, meaning that the traversal has
+** not started yet. A key that is not in the table raises an error.
+*/
+static int findindex (lua_State *L, Table *t, StkId key) {
+  int i;
+  Node *n;
+  if (ttisnil(key)) return -1;  /* first iteration */
+  i = arrayindex(key);
+  if (0 < i && i <= t->sizearray)  /* key lives in the array part? */
+    return i-1;  /* convert the 1-based Lua index to a C index */
+  /* otherwise walk the collision chain that starts at the main position */
+  for (n = mainposition(t, key); n != NULL; n = gnext(n)) {
+    /* key may be dead already, but it is ok to use it in `next' */
+    int found = luaV_rawequalobj(gkey(n), key) ||
+                (ttisdeadkey(gkey(n)) && iscollectable(key) &&
+                 deadvalue(gkey(n)) == gcvalue(key));
+    if (found) {
+      i = cast_int(n - gnode(t, 0));  /* position inside the node vector */
+      return i + t->sizearray;  /* hash positions come after array ones */
+    }
+  }
+  /* chain exhausted without a match */
+  luaG_runerror(L, "invalid key to " LUA_QL("next"));  /* key not found */
+  return 0;  /* not reached: the error call above does not return */
+}
+
+/* traversal step: replace 'key' by the next key and put its value at key+1 */
+int luaH_next (lua_State *L, Table *t, StkId key) {
+  int i = findindex(L, t, key) + 1;  /* first position still to visit */
+  for (; i < t->sizearray; i++) {  /* remaining array slots */
+    TValue *slot = &t->array[i];
+    if (ttisnil(slot)) continue;  /* skip empty slots */
+    setnvalue(key, cast_num(i+1));  /* Lua index is the C index plus 1 */
+    setobj2s(L, key+1, slot);
+    return 1;
+  }
+  for (i -= t->sizearray; i < sizenode(t); i++) {  /* then hash nodes */
+    Node *nd = gnode(t, i);
+    if (ttisnil(gval(nd))) continue;  /* skip empty nodes */
+    setobj2s(L, key, gkey(nd));
+    setobj2s(L, key+1, gval(nd));
+    return 1;
+  }
+  return 0;  /* no more elements */
+}
+
+
+/*
+** {=============================================================
+** Rehash
+** ==============================================================
+*/
+
+/* in: *narray = count of integer keys; out: *narray = array size; returns keys placed there */
+static int computesizes (int nums[], int *narray) {
+  int i;
+  int twotoi;  /* 2^i */
+  int a = 0;  /* running total of nums[0..i], i.e. keys not above 2^i */
+  int na = 0;  /* number of elements to go to array part */
+  int n = 0;  /* optimal size for array part */
+  for (i = 0, twotoi = 1; twotoi/2 < *narray; i++, twotoi *= 2) {
+    if (nums[i] > 0) {
+      a += nums[i];
+      if (a > twotoi/2) {  /* more than half elements present? */
+        n = twotoi;  /* optimal size (till now) */
+        na = a;  /* all elements smaller than n will go to array part */
+      }
+    }
+    if (a == *narray) break;  /* all elements already counted */
+  }
+  *narray = n;  /* report the chosen size through the in/out parameter */
+  lua_assert(*narray/2 <= na && na <= *narray);
+  return na;
+}
+
+/*
+** If 'key' can be an array index, count it in its slice of 'nums';
+** returns 1 when it was counted, 0 otherwise
+*/
+static int countint (const TValue *key, int *nums) {
+  int k = arrayindex(key);
+  if (k <= 0 || k > MAXASIZE)
+    return 0;  /* not a candidate for the array part */
+  nums[luaO_ceillog2(k)]++;  /* count as such */
+  return 1;
+}
+
+/* count non-nil array slots, adding per-slice counts to 'nums'; returns the total */
+static int numusearray (const Table *t, int *nums) {
+  int lg;
+  unsigned int ttlg;  /* 2^lg; unsigned so the final doubling cannot overflow */
+  int ause = 0;  /* summation of `nums' */
+  int i = 1;  /* count to traverse all array keys */
+  for (lg=0, ttlg=1; lg<=MAXBITS; lg++, ttlg*=2) {  /* for each slice */
+    int lc = 0;  /* counter */
+    int lim = cast_int(ttlg);  /* fits: ttlg <= 2^MAXBITS inside the loop */
+    if (lim > t->sizearray) {
+      lim = t->sizearray;  /* adjust upper limit */
+      if (i > lim)
+        break;  /* no more elements to count */
+    }
+    /* count elements in range (2^(lg-1), 2^lg] */
+    for (; i <= lim; i++) {
+      if (!ttisnil(&t->array[i-1]))
+        lc++;
+    }
+    nums[lg] += lc;
+    ause += lc;
+  }
+  return ause;
+}
+
+/* count used hash nodes; array-candidate keys also go to 'nums' and '*pnasize' */
+static int numusehash (const Table *t, int *nums, int *pnasize) {
+  int used = 0;  /* nodes holding a non-nil value */
+  int intkeys = 0;  /* how many of those have array-candidate keys */
+  int i;
+  for (i = sizenode(t) - 1; i >= 0; i--) {  /* every node, last to first */
+    Node *nd = &t->node[i];
+    if (ttisnil(gval(nd)))
+      continue;  /* empty node */
+    intkeys += countint(gkey(nd), nums);
+    used++;
+  }
+  *pnasize += intkeys;
+  return used;
+}
+
+/* resize the array part to 'size' slots; slots past the old size become nil */
+static void setarrayvector (lua_State *L, Table *t, int size) {
+  int slot = t->sizearray;  /* first slot that did not exist before */
+  luaM_reallocvector(L, t->array, t->sizearray, size, TValue);
+  for (; slot < size; slot++)
+    setnilvalue(&t->array[slot]);
+  t->sizearray = size;
+}
+
+/* install a new, empty hash part for 'size' nodes; the old one is not freed here */
+static void setnodevector (lua_State *L, Table *t, int size) {
+  int lsize;  /* log2 of the node count */
+  if (size == 0) {  /* no elements to hash part? */
+    t->node = cast(Node *, dummynode);  /* use common `dummynode' */
+    lsize = 0;
+  }
+  else {
+    int i;
+    lsize = luaO_ceillog2(size);
+    if (lsize > MAXBITS)
+      luaG_runerror(L, "table overflow");
+    size = twoto(lsize);  /* round the request up to a power of 2 */
+    t->node = luaM_newvector(L, size, Node);
+    for (i=0; i<size; i++) {  /* every node starts unchained and nil */
+      Node *n = gnode(t, i);
+      gnext(n) = NULL;
+      setnilvalue(gkey(n));
+      setnilvalue(gval(n));
+    }
+  }
+  t->lsizenode = cast_byte(lsize);
+  t->lastfree = gnode(t, size);  /* all positions are free */
+}
+
+/* rebuild the table with 'nasize' array slots and a hash part for 'nhsize' nodes */
+void luaH_resize (lua_State *L, Table *t, int nasize, int nhsize) {
+  int i;
+  int oldasize = t->sizearray;
+  int oldhsize = t->lsizenode;  /* log2 of the old node count */
+  Node *nold = t->node;  /* save old hash ... */
+  if (nasize > oldasize)  /* array part must grow? */
+    setarrayvector(L, t, nasize);
+  /* create new hash part with appropriate size */
+  setnodevector(L, t, nhsize);
+  if (nasize < oldasize) {  /* array part must shrink? */
+    t->sizearray = nasize;  /* set first, so the inserts below miss the array */
+    /* re-insert elements from vanishing slice */
+    for (i=nasize; i<oldasize; i++) {
+      if (!ttisnil(&t->array[i]))
+        luaH_setint(L, t, i + 1, &t->array[i]);
+    }
+    /* shrink array */
+    luaM_reallocvector(L, t->array, oldasize, nasize, TValue);
+  }
+  /* re-insert elements from hash part */
+  for (i = twoto(oldhsize) - 1; i >= 0; i--) {
+    Node *old = nold+i;
+    if (!ttisnil(gval(old))) {
+      /* doesn't need barrier/invalidate cache, as entry was
+         already present in the table */
+      setobjt2t(L, luaH_set(L, t, gkey(old)), gval(old));
+    }
+  }
+  if (!isdummy(nold))  /* the shared dummy node must never be freed */
+    luaM_freearray(L, nold, cast(size_t, twoto(oldhsize))); /* free old array */
+}
+
+/* resize only the array part; the hash part keeps its current node count */
+void luaH_resizearray (lua_State *L, Table *t, int nasize) {
+  if (isdummy(t->node)) luaH_resize(L, t, nasize, 0);  /* no hash part */
+  else luaH_resize(L, t, nasize, sizenode(t));
+}
+
+/* recount all keys (plus the pending key 'ek') and resize both parts to fit them */
+static void rehash (lua_State *L, Table *t, const TValue *ek) {
+  int nasize, na;
+  int nums[MAXBITS+1];  /* nums[i] = number of keys with 2^(i-1) < k <= 2^i */
+  int i;
+  int totaluse;
+  for (i=0; i<=MAXBITS; i++) nums[i] = 0;  /* reset counts */
+  nasize = numusearray(t, nums);  /* count keys in array part */
+  totaluse = nasize;  /* all those keys are integer keys */
+  totaluse += numusehash(t, nums, &nasize);  /* count keys in hash part */
+  /* count extra key */
+  nasize += countint(ek, nums);
+  totaluse++;
+  /* compute new size for array part */
+  na = computesizes(nums, &nasize);  /* 'nasize' becomes the chosen array size */
+  /* resize the table to new computed sizes */
+  luaH_resize(L, t, nasize, totaluse - na);  /* keys outside the array go to the hash */
+}
+
+
+
+/*
+** }=============================================================
+*/
+
+/* create an empty table: no metatable, no array part, dummy hash part */
+Table *luaH_new (lua_State *L) {
+  Table *t = &luaC_newobj(L, LUA_TTABLE, sizeof(Table), NULL, 0)->h;
+  t->metatable = NULL;
+  t->flags = cast_byte(~0);  /* start with every bit of 'flags' set */
+  t->array = NULL;
+  t->sizearray = 0;
+  setnodevector(L, t, 0);  /* size 0 selects the shared dummy node */
+  return t;
+}
+
+/* release the hash part (unless it is the dummy node), the array part and the table */
+void luaH_free (lua_State *L, Table *t) {
+  if (!isdummy(t->node))  /* the dummy node is static, never allocated */
+    luaM_freearray(L, t->node, cast(size_t, sizenode(t)));
+  luaM_freearray(L, t->array, t->sizearray);
+  luaM_free(L, t);
+}
+
+/* move 'lastfree' down to a node with a nil key; NULL when none is left */
+static Node *getfreepos (Table *t) {
+  for (;;) {
+    if (t->lastfree <= t->node)
+      return NULL;  /* could not find a free place */
+    t->lastfree--;
+    if (ttisnil(gkey(t->lastfree))) return t->lastfree;
+  }
+}
+
+
+
+/*
+** inserts a new key into a hash table; first, check whether key's main
+** position is free. If not, check whether colliding node is in its main
+** position or not: if it is not, move colliding node to an empty place and
+** put new key in its main position; otherwise (colliding node is in its main
+** position), new key goes to an empty position.
+*/
+TValue *luaH_newkey (lua_State *L, Table *t, const TValue *key) {
+  Node *mp;
+  if (ttisnil(key)) luaG_runerror(L, "table index is nil");
+  else if (ttisnumber(key) && luai_numisnan(L, nvalue(key)))
+    luaG_runerror(L, "table index is NaN");
+  mp = mainposition(t, key);
+  if (!ttisnil(gval(mp)) || isdummy(mp)) {  /* main position is taken? */
+    Node *othern;
+    Node *n = getfreepos(t);  /* get a free place */
+    if (n == NULL) {  /* cannot find a free place? */
+      rehash(L, t, key);  /* grow table */
+      /* whatever called 'newkey' take care of TM cache and GC barrier */
+      return luaH_set(L, t, key);  /* insert key into grown table */
+    }
+    lua_assert(!isdummy(n));
+    othern = mainposition(t, gkey(mp));  /* where the colliding key belongs */
+    if (othern != mp) {  /* is colliding node out of its main position? */
+      /* yes; move colliding node into free position */
+      while (gnext(othern) != mp) othern = gnext(othern);  /* find previous */
+      gnext(othern) = n;  /* redo the chain with `n' in place of `mp' */
+      *n = *mp;  /* copy colliding node into free pos. (mp->next also goes) */
+      gnext(mp) = NULL;  /* now `mp' is free */
+      setnilvalue(gval(mp));
+    }
+    else {  /* colliding node is in its own main position */
+      /* new node will go into free position */
+      gnext(n) = gnext(mp);  /* chain new position */
+      gnext(mp) = n;
+      mp = n;  /* from here on 'mp' is the node that receives the key */
+    }
+  }
+  setobj2t(L, gkey(mp), key);
+  luaC_barrierback(L, obj2gco(t), key);
+  lua_assert(ttisnil(gval(mp)));
+  return gval(mp);  /* still nil; the caller stores the value */
+}
+
+
+/*
+** Look up an integer key: directly in the array part when it is in
+** range, otherwise through the hash chain of its numeric value.
+*/
+const TValue *luaH_getint (Table *t, int key) {
+  lua_Number nk;
+  Node *n;
+  /* one unsigned comparison tests (1 <= key && key <= t->sizearray) */
+  if (cast(unsigned int, key-1) < cast(unsigned int, t->sizearray))
+    return &t->array[key-1];
+  /* not in the array part: search the hash part */
+  nk = cast_num(key);
+  for (n = hashnum(t, nk); n != NULL; n = gnext(n)) {  /* walk the chain */
+    if (ttisnumber(gkey(n)) && luai_numeq(nvalue(gkey(n)), nk))
+      return gval(n);  /* that's it */
+  }
+  return luaO_nilobject;  /* key is not in the table */
+}
+
+
+/*
+** Look up a short string key by walking the chain of its main position
+*/
+const TValue *luaH_getstr (Table *t, TString *key) {
+  Node *n;
+  lua_assert(key->tsv.tt == LUA_TSHRSTR);
+  for (n = hashstr(t, key); n != NULL; n = gnext(n)) {
+    /* 'eqshrstr' runs only after the node key is known to be a short string */
+    if (ttisshrstring(gkey(n)) && eqshrstr(rawtsvalue(gkey(n)), key))
+      return gval(n);  /* that's it */
+  }
+  return luaO_nilobject;  /* key is not in the table */
+}
+
+
+/*
+** main search function: returns the value slot of 'key', or luaO_nilobject
+*/
+const TValue *luaH_get (Table *t, const TValue *key) {
+  switch (ttype(key)) {
+    case LUA_TSHRSTR: return luaH_getstr(t, rawtsvalue(key));
+    case LUA_TNIL: return luaO_nilobject;
+    case LUA_TNUMBER: {
+      int k;
+      lua_Number n = nvalue(key);
+      lua_number2int(k, n);
+      if (luai_numeq(cast_num(k), n)) /* index is int? */
+        return luaH_getint(t, k);  /* use specialized version */
+      /* else go through */
+    }  /* no 'break': numbers without an exact int value use the generic search */
+    default: {
+      Node *n = mainposition(t, key);
+      do {  /* check whether `key' is somewhere in the chain */
+        if (luaV_rawequalobj(gkey(n), key))
+          return gval(n);  /* that's it */
+        else n = gnext(n);
+      } while (n);
+      return luaO_nilobject;
+    }
+  }
+}
+
+
+/*
+** Return the value slot for 'key', creating the key when it is absent.
+** Beware: callers probably need a GC barrier and a TM cache invalidation.
+*/
+TValue *luaH_set (lua_State *L, Table *t, const TValue *key) {
+  const TValue *slot = luaH_get(t, key);
+  if (slot == luaO_nilobject)  /* key has no slot yet */
+    return luaH_newkey(L, t, key);
+  return cast(TValue *, slot);  /* existing slot, made writable */
+}
+
+/*
+** Store 'value' under integer 'key', creating the key when it is absent
+*/
+void luaH_setint (lua_State *L, Table *t, int key, TValue *value) {
+  const TValue *found = luaH_getint(t, key);
+  TValue *cell = cast(TValue *, found);
+  if (found == luaO_nilobject) {  /* key has no slot yet */
+    TValue nk;  /* 'key' as a Lua number */
+    setnvalue(&nk, cast_num(key));
+    cell = luaH_newkey(L, t, &nk);
+  }
+  setobj2t(L, cell, value);
+}
+
+/* find a boundary above index 'j' by doubling, then by binary search */
+static int unbound_search (Table *t, unsigned int j) {
+  unsigned int i = j;  /* i is zero or a present index */
+  j++;
+  /* find `i' and `j' such that i is present and j is not */
+  while (!ttisnil(luaH_getint(t, j))) {
+    i = j;
+    j *= 2;  /* double the probe distance each round */
+    if (j > cast(unsigned int, MAX_INT)) {  /* overflow? */
+      /* table was built with bad purposes: resort to linear search */
+      i = 1;
+      while (!ttisnil(luaH_getint(t, i))) i++;
+      return i - 1;
+    }
+  }
+  /* now do a binary search between them */
+  while (j - i > 1) {
+    unsigned int m = (i+j)/2;
+    if (ttisnil(luaH_getint(t, m))) j = m;  /* boundary is below m */
+    else i = m;  /* m is present: boundary is at or above m */
+  }
+  return i;
+}
+
+
+/*
+** Try to find a boundary in table `t'. A `boundary' is an integer index
+** such that t[i] is non-nil and t[i+1] is nil (and 0 if t[1] is nil).
+*/
+int luaH_getn (Table *t) {
+  unsigned int j = t->sizearray;
+  if (j > 0 && ttisnil(&t->array[j - 1])) {  /* last array slot is empty? */
+    /* there is a boundary in the array part: (binary) search for it */
+    unsigned int i = 0;
+    while (j - i > 1) {
+      unsigned int m = (i+j)/2;
+      if (ttisnil(&t->array[m - 1])) j = m;  /* boundary is below m */
+      else i = m;  /* m is present: boundary is at or above m */
+    }
+    return i;
+  }
+  /* else must find a boundary in hash part */
+  else if (isdummy(t->node))  /* hash part is empty? */
+    return j;  /* that is easy... */
+  else return unbound_search(t, j);  /* keys may continue in the hash part */
+}
+
+
+
+#if defined(LUA_DEBUG)
+/* non-static wrappers around file-local helpers; compiled only with LUA_DEBUG */
+Node *luaH_mainposition (const Table *t, const TValue *key) {
+  return mainposition(t, key);
+}
+
+int luaH_isdummy (Node *n) { return isdummy(n); }
+
+#endif
diff --git a/ext/lua/src/ltablib.c b/ext/lua/src/ltablib.c
new file mode 100644
index 0000000..ad798b4
--- /dev/null
+++ b/ext/lua/src/ltablib.c
@@ -0,0 +1,283 @@
+/*
+** $Id: ltablib.c,v 1.65 2013/03/07 18:17:24 roberto Exp $
+** Library for Table Manipulation
+** See Copyright Notice in lua.h
+*/
+
+#include <limits.h>
+#include <stddef.h>
+
+#define ltablib_c
+#define LUA_LIB
+
+#include "lua.h"
+
+#include "lauxlib.h"
+#include "lualib.h"
+
+/* check that argument 'n' is a table, then yield its length */
+#define aux_getn(L,n)	(luaL_checktype(L, n, LUA_TTABLE), luaL_len(L, n))
+
+
+
+#if defined(LUA_COMPAT_MAXN)
+/* table.maxn: largest positive numeric key of the table (0 if there is none) */
+static int maxn (lua_State *L) {
+  lua_Number largest = 0;
+  luaL_checktype(L, 1, LUA_TTABLE);
+  for (lua_pushnil(L); lua_next(L, 1); ) {  /* nil key starts the traversal */
+    lua_pop(L, 1);  /* remove value; the key stays for the next step */
+    if (lua_type(L, -1) == LUA_TNUMBER) {  /* only numeric keys count */
+      lua_Number k = lua_tonumber(L, -1);
+      largest = (k > largest) ? k : largest;
+    }
+  }
+  lua_pushnumber(L, largest);
+  return 1;
+}
+#endif
+
+/*
+** table.insert(t, [pos,] v): with two arguments append 'v' after the
+** last element; with three, shift t[pos..#t] one slot up and store 'v'
+** at 'pos'. Any other argument count is an error.
+*/
+static int tinsert (lua_State *L) {
+  int e = aux_getn(L, 1) + 1;  /* first empty element */
+  int nargs = lua_gettop(L);
+  int pos;  /* where to insert new element */
+  if (nargs == 2)  /* called with only 2 arguments */
+    pos = e;  /* insert new element at the end */
+  else if (nargs == 3) {
+    int i;
+    pos = luaL_checkint(L, 2);  /* 2nd argument is the position */
+    /* 'pos' may be equal to 'e', which is the same as appending */
+    luaL_argcheck(L, 1 <= pos && pos <= e, 2, "position out of bounds");
+    for (i = e; i > pos; i--) {  /* move up elements */
+      lua_rawgeti(L, 1, i-1);
+      lua_rawseti(L, 1, i);  /* t[i] = t[i-1] */
+    }
+  }
+  else
+    return luaL_error(L, "wrong number of arguments to " LUA_QL("insert"));
+  lua_rawseti(L, 1, pos);  /* t[pos] = v */
+  return 0;
+}
+
+/* table.remove(t [, pos]): remove t[pos] (default: last element) and return it */
+static int tremove (lua_State *L) {
+  int size = aux_getn(L, 1);
+  int pos = luaL_optint(L, 2, size);
+  if (pos != size)  /* validate 'pos' if given; it is argument #2, so blame #2 */
+    luaL_argcheck(L, 1 <= pos && pos <= size + 1, 2, "position out of bounds");
+  lua_rawgeti(L, 1, pos);  /* result = t[pos] */
+  for ( ; pos < size; pos++) {  /* close the gap left by the removed element */
+    lua_rawgeti(L, 1, pos+1);
+    lua_rawseti(L, 1, pos);  /* t[pos] = t[pos+1] */
+  }
+  lua_pushnil(L);
+  lua_rawseti(L, 1, pos);  /* t[pos] = nil */
+  return 1;
+}
+
+/* append t[i] (table at stack index 1) to buffer 'b'; error unless lua_isstring accepts it */
+static void addfield (lua_State *L, luaL_Buffer *b, int i) {
+  lua_rawgeti(L, 1, i);
+  if (!lua_isstring(L, -1))
+    luaL_error(L, "invalid value (%s) at index %d in table for "
+                  LUA_QL("concat"), luaL_typename(L, -1), i);
+  luaL_addvalue(b);  /* adds the value on top of the stack to the buffer */
+}
+
+/* table.concat(t [, sep [, i [, j]]]): join t[i..j], with 'sep' between elements */
+static int tconcat (lua_State *L) {
+  luaL_Buffer buf;
+  size_t seplen;
+  int pos, end;
+  const char *sep = luaL_optlstring(L, 2, "", &seplen);
+  luaL_checktype(L, 1, LUA_TTABLE);
+  pos = luaL_optint(L, 3, 1);
+  end = luaL_opt(L, luaL_checkint, 4, luaL_len(L, 1));
+  luaL_buffinit(L, &buf);
+  while (pos < end) {  /* every element but the last is followed by 'sep' */
+    addfield(L, &buf, pos++);
+    luaL_addlstring(&buf, sep, seplen);
+  }
+  if (pos == end)  /* add last value (if interval was not empty) */
+    addfield(L, &buf, pos);
+  luaL_pushresult(&buf);
+  return 1;
+}
+
+
+/*
+** {======================================================
+** Pack/unpack
+** =======================================================
+*/
+/* table.pack(...): return a table holding all arguments, with field 'n' = their count */
+static int pack (lua_State *L) {
+  int n = lua_gettop(L);  /* number of elements to pack */
+  lua_createtable(L, n, 1);  /* create result table */
+  lua_pushinteger(L, n);
+  lua_setfield(L, -2, "n");  /* t.n = number of elements */
+  if (n > 0) {  /* at least one element? */
+    int i;
+    lua_pushvalue(L, 1);
+    lua_rawseti(L, -2, 1);  /* insert first element */
+    lua_replace(L, 1);  /* move table into index 1 */
+    for (i = n; i >= 2; i--)  /* assign other elements */
+      lua_rawseti(L, 1, i);  /* pops the top value, which is argument i */
+  }
+  return 1;  /* return table */
+}
+
+/* table.unpack(t [, i [, j]]): push t[i], ..., t[j] and return them all */
+static int unpack (lua_State *L) {
+  int i, e;
+  unsigned int n;  /* unsigned, so that e - i cannot overflow */
+  luaL_checktype(L, 1, LUA_TTABLE);
+  i = luaL_optint(L, 2, 1);
+  e = luaL_opt(L, luaL_checkint, 3, luaL_len(L, 1));
+  if (i > e) return 0;  /* empty range */
+  n = (unsigned int)e - (unsigned int)i;  /* number of elements minus 1 */
+  if (n > (INT_MAX - 10) || !lua_checkstack(L, ++n))  /* too many elements? */
+    return luaL_error(L, "too many results to unpack");
+  for (; i < e; i++) lua_rawgeti(L, 1, i);  /* push arg[i...e-1] */
+  lua_rawgeti(L, 1, e);  /* push arg[e] apart, so 'i' never goes past 'e' */
+  return (int)n;
+}
+
+/* }====================================================== */
+
+
+
+/*
+** {======================================================
+** Quicksort
+** (based on `Algorithms in MODULA-3', Robert Sedgewick;
+**  Addison-Wesley, 1993.)
+** =======================================================
+*/
+
+/* pop two values: the top one goes to t[i], the one below it to t[j] */
+static void set2 (lua_State *L, int i, int j) {
+  lua_rawseti(L, 1, i);
+  lua_rawseti(L, 1, j);
+}
+/*
+** "Less than" for the values at stack indices 'a' and 'b': calls the
+** order function at index 2 when there is one, else compares with '<'
+*/
+static int sort_comp (lua_State *L, int a, int b) {
+  int less;
+  if (lua_isnil(L, 2))  /* no order function: a < b? */
+    return lua_compare(L, a, b, LUA_OPLT);
+  lua_pushvalue(L, 2);  /* the order function */
+  lua_pushvalue(L, a-1);  /* -1 to compensate function */
+  lua_pushvalue(L, b-2);  /* -2 to compensate function and `a' */
+  lua_call(L, 2, 1);
+  less = lua_toboolean(L, -1);
+  lua_pop(L, 1);  /* remove the call result */
+  return less;
+}
+/* quicksort t[l..u] in place (table at stack index 1), pivot chosen among 3 elements */
+static void auxsort (lua_State *L, int l, int u) {
+  while (l < u) {  /* for tail recursion */
+    int i, j;
+    /* sort elements a[l], a[(l+u)/2] and a[u] */
+    lua_rawgeti(L, 1, l);
+    lua_rawgeti(L, 1, u);
+    if (sort_comp(L, -1, -2))  /* a[u] < a[l]? */
+      set2(L, l, u);  /* swap a[l] - a[u] */
+    else
+      lua_pop(L, 2);
+    if (u-l == 1) break;  /* only 2 elements */
+    i = (l+u)/2;
+    lua_rawgeti(L, 1, i);
+    lua_rawgeti(L, 1, l);
+    if (sort_comp(L, -2, -1))  /* a[i]<a[l]? */
+      set2(L, i, l);
+    else {
+      lua_pop(L, 1);  /* remove a[l] */
+      lua_rawgeti(L, 1, u);
+      if (sort_comp(L, -1, -2))  /* a[u]<a[i]? */
+        set2(L, i, u);
+      else
+        lua_pop(L, 2);
+    }
+    if (u-l == 2) break;  /* only 3 elements */
+    lua_rawgeti(L, 1, i);  /* Pivot */
+    lua_pushvalue(L, -1);  /* second copy: one stays on the stack as P */
+    lua_rawgeti(L, 1, u-1);
+    set2(L, i, u-1);  /* swap a[i] with a[u-1]: the pivot now sits at u-1 */
+    /* a[l] <= P == a[u-1] <= a[u], only need to sort from l+1 to u-2 */
+    i = l; j = u-1;
+    for (;;) {  /* invariant: a[l..i] <= P <= a[j..u] */
+      /* repeat ++i until a[i] >= P */
+      while (lua_rawgeti(L, 1, ++i), sort_comp(L, -1, -2)) {
+        if (i>=u) luaL_error(L, "invalid order function for sorting");
+        lua_pop(L, 1);  /* remove a[i] */
+      }
+      /* repeat --j until a[j] <= P */
+      while (lua_rawgeti(L, 1, --j), sort_comp(L, -3, -1)) {
+        if (j<=l) luaL_error(L, "invalid order function for sorting");
+        lua_pop(L, 1);  /* remove a[j] */
+      }
+      if (j<i) {  /* scans crossed: partition is done */
+        lua_pop(L, 3);  /* pop pivot, a[i], a[j] */
+        break;
+      }
+      set2(L, i, j);  /* swap a[i] and a[j]; both are popped */
+    }
+    lua_rawgeti(L, 1, u-1);
+    lua_rawgeti(L, 1, i);
+    set2(L, u-1, i);  /* swap pivot (a[u-1]) with a[i] */
+    /* a[l..i-1] <= a[i] == P <= a[i+1..u] */
+    /* adjust so that smaller half is in [j..i] and larger one in [l..u] */
+    if (i-l < u-i) {
+      j=l; i=i-1; l=i+2;
+    }
+    else {
+      j=i+1; i=u; u=j-2;
+    }
+    auxsort(L, j, i);  /* call recursively the smaller one */
+  }  /* repeat the routine for the larger one */
+}
+/* table.sort(t [, comp]): sort t[1..#t] in place; returns nothing */
+static int sort (lua_State *L) {
+  int n = aux_getn(L, 1);
+  luaL_checkstack(L, 40, "");  /* assume array is smaller than 2^40 */
+  if (!lua_isnoneornil(L, 2))  /* is there a 2nd argument? */
+    luaL_checktype(L, 2, LUA_TFUNCTION);
+  lua_settop(L, 2);  /* make sure there is two arguments */
+  auxsort(L, 1, n);  /* index 2 is now the order function or nil */
+  return 0;
+}
+
+/* }====================================================== */
+
+
+static const luaL_Reg tab_funcs[] = {  /* functions installed by luaopen_table */
+  {"concat", tconcat},
+#if defined(LUA_COMPAT_MAXN)
+  {"maxn", maxn},
+#endif
+  {"insert", tinsert},
+  {"pack", pack},
+  {"unpack", unpack},
+  {"remove", tremove},
+  {"sort", sort},
+  {NULL, NULL}  /* end-of-list marker */
+};
+
+/* open the table library; with LUA_COMPAT_UNPACK also export 'unpack' as a global */
+LUAMOD_API int luaopen_table (lua_State *L) {
+  luaL_newlib(L, tab_funcs);  /* new table filled with the 'tab_funcs' functions */
+#if defined(LUA_COMPAT_UNPACK)
+  /* _G.unpack = table.unpack */
+  lua_getfield(L, -1, "unpack");
+  lua_setglobal(L, "unpack");
+#endif
+  return 1;  /* one result: the library table */
+}
+
diff --git a/ext/lua/src/ltm.c b/ext/lua/src/ltm.c
new file mode 100644
index 0000000..e70006d
--- /dev/null
+++ b/ext/lua/src/ltm.c
@@ -0,0 +1,77 @@
+/*
+** $Id: ltm.c,v 2.14 2011/06/02 19:31:40 roberto Exp $
+** Tag methods
+** See Copyright Notice in lua.h
+*/
+
+
+#include <string.h>
+
+#define ltm_c
+#define LUA_CORE
+
+#include "lua.h"
+
+#include "lobject.h"
+#include "lstate.h"
+#include "lstring.h"
+#include "ltable.h"
+#include "ltm.h"
+
+
+static const char udatatypename[] = "userdata";
+/* type names; the two userdata entries share the same string */
+LUAI_DDEF const char *const luaT_typenames_[LUA_TOTALTAGS] = {
+  "no value",
+  "nil", "boolean", udatatypename, "number",
+  "string", "table", "function", udatatypename, "thread",
+  "proto", "upval"  /* these last two cases are used for tests only */
+};
+
+/* create the strings that name the metamethod events and keep them from collection */
+void luaT_init (lua_State *L) {
+  static const char *const luaT_eventname[] = {  /* ORDER TM */
+    "__index", "__newindex", "__gc", "__mode", "__len", "__eq",
+    "__add", "__sub", "__mul", "__div", "__mod", "__pow", "__unm",
+    "__lt", "__le", "__concat", "__call"
+  };
+  int ev;
+  /* one name per event, kept in the global state */
+  for (ev = 0; ev < TM_N; ev++) {
+    TString *name = luaS_new(L, luaT_eventname[ev]);
+    G(L)->tmname[ev] = name;
+    luaS_fix(name);  /* never collect these names */
+  }
+}
+
+
+/*
+** function to be used with macro "fasttm": looks 'ename' up in 'events'
+** and, when it is absent, records that fact in the 'flags' cache
+*/
+const TValue *luaT_gettm (Table *events, TMS event, TString *ename) {
+  const TValue *tm = luaH_getstr(events, ename);
+  lua_assert(event <= TM_EQ);
+  if (!ttisnil(tm))
+    return tm;  /* tag method present */
+  /* no tag method: set the bit of this event in 'flags' */
+  events->flags |= cast_byte(1u<<event);  /* cache this fact */
+  return NULL;
+}
+
+/* get the handler for 'event' from the metatable of 'o' (nil object if none) */
+const TValue *luaT_gettmbyobj (lua_State *L, const TValue *o, TMS event) {
+  Table *mt;
+  int tt = ttypenv(o);
+  /* pick the metatable according to the type of 'o' */
+  if (tt == LUA_TTABLE)
+    mt = hvalue(o)->metatable;  /* tables carry their own metatable */
+  else if (tt == LUA_TUSERDATA)
+    mt = uvalue(o)->metatable;  /* so do userdata */
+  else
+    mt = G(L)->mt[tt];  /* other types: per-type entry in the global state */
+  if (mt == NULL)
+    return luaO_nilobject;  /* no metatable at all */
+  return luaH_getstr(mt, G(L)->tmname[event]);
+}
+
diff --git a/ext/lua/src/lua.c b/ext/lua/src/lua.c
new file mode 100644
index 0000000..6a00712
--- /dev/null
+++ b/ext/lua/src/lua.c
@@ -0,0 +1,497 @@
+/*
+** $Id: lua.c,v 1.206 2012/09/29 20:07:06 roberto Exp $
+** Lua stand-alone interpreter
+** See Copyright Notice in lua.h
+*/
+
+
+#include <signal.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define lua_c
+
+#include "lua.h"
+
+#include "lauxlib.h"
+#include "lualib.h"
+
+/* each default below applies only when the build has not defined the macro */
+#if !defined(LUA_PROMPT)
+#define LUA_PROMPT		"> "
+#define LUA_PROMPT2		">> "
+#endif
+
+#if !defined(LUA_PROGNAME)
+#define LUA_PROGNAME		"lua"
+#endif
+
+#if !defined(LUA_MAXINPUT)
+#define LUA_MAXINPUT		512  /* size of the line buffer in 'pushline' */
+#endif
+
+#if !defined(LUA_INIT)
+#define LUA_INIT		"LUA_INIT"
+#endif
+/* version-specific variable name, tried before LUA_INIT in 'handle_luainit' */
+#define LUA_INITVERSION  \
+	LUA_INIT "_" LUA_VERSION_MAJOR "_" LUA_VERSION_MINOR
+
+
+/*
+** lua_stdin_is_tty detects whether the standard input is a 'tty' (that
+** is, whether lua runs interactively); with no detection API it yields 1.
+*/
+#if defined(LUA_USE_ISATTY)
+#include <unistd.h>
+#define lua_stdin_is_tty()	isatty(0)
+#elif defined(LUA_WIN)
+#include <io.h>
+#include <stdio.h>
+#define lua_stdin_is_tty()	_isatty(_fileno(stdin))
+#else
+#define lua_stdin_is_tty()	1  /* assume stdin is a tty */
+#endif
+
+
+/*
+** lua_readline defines how to show a prompt and then read a line from
+** the standard input; it evaluates to zero when no line could be read.
+** lua_saveline defines how to "save" a read line in a "history".
+** lua_freeline defines how to free a line read by lua_readline.
+*/
+#if defined(LUA_USE_READLINE)
+/* readline library: 'b' is pointed at the buffer returned by readline() */
+#include <stdio.h>
+#include <readline/readline.h>
+#include <readline/history.h>
+#define lua_readline(L,b,p)	((void)L, ((b)=readline(p)) != NULL)
+#define lua_saveline(L,idx) \
+        if (lua_rawlen(L,idx) > 0)  /* non-empty line? */ \
+          add_history(lua_tostring(L, idx));  /* add it to history */
+#define lua_freeline(L,b)	((void)L, free(b))
+
+#elif !defined(lua_readline)
+/* fallback: fgets into the caller's buffer 'b' (LUA_MAXINPUT is its size) */
+#define lua_readline(L,b,p) \
+        ((void)L, fputs(p, stdout), fflush(stdout),  /* show prompt */ \
+        fgets(b, LUA_MAXINPUT, stdin) != NULL)  /* get line */
+#define lua_saveline(L,idx)	{ (void)L; (void)idx; }
+#define lua_freeline(L,b)	{ (void)L; (void)b; }
+
+#endif
+
+
+
+/* state seen by the SIGINT handler 'laction'; assigned in 'docall' */
+static lua_State *globalL = NULL;
+/* prefix for messages; 'pmain' replaces it by argv[0] when that is non-empty */
+static const char *progname = LUA_PROGNAME;
+
+
+/* hook installed by 'laction': removes itself, then raises "interrupted!" */
+static void lstop (lua_State *L, lua_Debug *ar) {
+  (void)ar;  /* unused arg. */
+  lua_sethook(L, NULL, 0, 0);
+  luaL_error(L, "interrupted!");
+}
+
+/* signal handler set by 'docall': arms 'lstop' as a hook on 'globalL' */
+static void laction (int i) {
+  signal(i, SIG_DFL); /* if another SIGINT happens before lstop,
+                              terminate process (default action) */
+  lua_sethook(globalL, lstop, LUA_MASKCALL | LUA_MASKRET | LUA_MASKCOUNT, 1);
+}
+
+/* report why 'badoption' was rejected, then write the usage text */
+static void print_usage (const char *badoption) {
+  luai_writestringerror("%s: ", progname);
+  if (badoption[1] == 'e' || badoption[1] == 'l')  /* options that take an argument */
+    luai_writestringerror("'%s' needs argument\n", badoption);
+  else
+    luai_writestringerror("unrecognized option '%s'\n", badoption);
+  luai_writestringerror(
+  "usage: %s [options] [script [args]]\n"
+  "Available options are:\n"
+  "  -e stat  execute string " LUA_QL("stat") "\n"
+  "  -i       enter interactive mode after executing " LUA_QL("script") "\n"
+  "  -l name  require library " LUA_QL("name") "\n"
+  "  -v       show version information\n"
+  "  -E       ignore environment variables\n"
+  "  --       stop handling options\n"
+  "  -        stop handling options and execute stdin\n"
+  ,
+  progname);
+}
+
+/* write "pname: msg\n" through luai_writestringerror; no prefix if 'pname' is NULL */
+static void l_message (const char *pname, const char *msg) {
+  if (pname) luai_writestringerror("%s: ", pname);
+  luai_writestringerror("%s\n", msg);
+}
+
+/* on error with a non-nil value on top: print it, pop it and run a full GC */
+static int report (lua_State *L, int status) {
+  if (status != LUA_OK && !lua_isnil(L, -1)) {
+    const char *msg = lua_tostring(L, -1);
+    if (msg == NULL) msg = "(error object is not a string)";
+    l_message(progname, msg);
+    lua_pop(L, 1);
+    /* force a complete garbage collection in case of errors */
+    lua_gc(L, LUA_GCCOLLECT, 0);
+  }
+  return status;  /* handed back unchanged so callers can chain on it */
+}
+
+
+/* called unprotected, so it must avoid errors: only real strings are converted */
+static void finalreport (lua_State *L, int status) {
+  if (status != LUA_OK) {
+    const char *msg = (lua_type(L, -1) == LUA_TSTRING) ? lua_tostring(L, -1)
+                                                       : NULL;
+    if (msg == NULL) msg = "(error object is not a string)";
+    l_message(progname, msg);
+    lua_pop(L, 1);  /* remove the error object */
+  }
+}
+
+/* message handler for 'docall': adds a traceback when lua_tostring yields a message */
+static int traceback (lua_State *L) {
+  const char *msg = lua_tostring(L, 1);
+  if (msg)
+    luaL_traceback(L, L, msg, 1);
+  else if (!lua_isnoneornil(L, 1)) {  /* is there an error object? */
+    if (!luaL_callmeta(L, 1, "__tostring"))  /* try its 'tostring' metamethod */
+      lua_pushliteral(L, "(no error message)");
+  }
+  return 1;  /* one result: whatever is on top of the stack now */
+}
+
+/* lua_pcall with 'traceback' as message handler and SIGINT routed to 'laction' */
+static int docall (lua_State *L, int narg, int nres) {
+  int status;
+  int base = lua_gettop(L) - narg;  /* function index */
+  lua_pushcfunction(L, traceback);  /* push traceback function */
+  lua_insert(L, base);  /* put it under chunk and args */
+  globalL = L;  /* to be available to 'laction' */
+  signal(SIGINT, laction);
+  status = lua_pcall(L, narg, nres, base);
+  signal(SIGINT, SIG_DFL);  /* default handling again once the call is over */
+  lua_remove(L, base);  /* remove traceback function */
+  return status;
+}
+
+/* write LUA_COPYRIGHT, then call luai_writeline() */
+static void print_version (void) {
+  luai_writestring(LUA_COPYRIGHT, strlen(LUA_COPYRIGHT));
+  luai_writeline();
+}
+
+/* push argv[n+1..] as strings, then a table t with t[i-n] = argv[i]; returns narg */
+static int getargs (lua_State *L, char **argv, int n) {
+  int narg;
+  int i;
+  int argc = 0;
+  while (argv[argc]) argc++;  /* count total number of arguments */
+  narg = argc - (n + 1);  /* number of arguments to the script */
+  luaL_checkstack(L, narg + 3, "too many arguments to script");
+  for (i=n+1; i < argc; i++)
+    lua_pushstring(L, argv[i]);
+  lua_createtable(L, narg, n + 1);
+  for (i=0; i < argc; i++) {
+    lua_pushstring(L, argv[i]);
+    lua_rawseti(L, -2, i - n);  /* argv[n] gets index 0, earlier entries negative ones */
+  }
+  return narg;
+}
+
+/* load file 'name' and run it with no arguments/results; errors go through 'report' */
+static int dofile (lua_State *L, const char *name) {
+  int status = luaL_loadfile(L, name);
+  if (status == LUA_OK) status = docall(L, 0, 0);
+  return report(L, status);
+}
+
+/* like 'dofile', but the chunk is the C string 's', loaded under chunk name 'name' */
+static int dostring (lua_State *L, const char *s, const char *name) {
+  int status = luaL_loadbuffer(L, s, strlen(s), name);
+  if (status == LUA_OK) status = docall(L, 0, 0);
+  return report(L, status);
+}
+
+/* call global 'require' with 'name' and store its result in global 'name' */
+static int dolibrary (lua_State *L, const char *name) {
+  int status;
+  lua_getglobal(L, "require");
+  lua_pushstring(L, name);
+  status = docall(L, 1, 1);  /* call 'require(name)' */
+  if (status == LUA_OK)
+    lua_setglobal(L, name);  /* global[name] = require return */
+  return report(L, status);
+}
+
+/* prompt text from global _PROMPT/_PROMPT2 (left on the stack) or the default */
+static const char *get_prompt (lua_State *L, int firstline) {
+  const char *p;
+  lua_getglobal(L, firstline ? "_PROMPT" : "_PROMPT2");
+  p = lua_tostring(L, -1);
+  if (p == NULL) p = (firstline ? LUA_PROMPT : LUA_PROMPT2);
+  return p;
+}
+
+/* mark in error messages for incomplete statements */
+#define EOFMARK		"<eof>"
+#define marklen		(sizeof(EOFMARK)/sizeof(char) - 1)
+/* 1 (message popped) iff 'status' is a syntax error whose message ends in EOFMARK */
+static int incomplete (lua_State *L, int status) {
+  if (status == LUA_ERRSYNTAX) {
+    size_t lmsg;
+    const char *msg = lua_tolstring(L, -1, &lmsg);
+    if (lmsg >= marklen && strcmp(msg + lmsg - marklen, EOFMARK) == 0) {
+      lua_pop(L, 1);
+      return 1;
+    }
+  }
+  return 0;  /* else... */
+}
+
+/* read one line and push it as a string; returns 0 when nothing could be read */
+static int pushline (lua_State *L, int firstline) {
+  char buffer[LUA_MAXINPUT];
+  char *b = buffer;
+  size_t l;
+  const char *prmt = get_prompt(L, firstline);
+  int readstatus = lua_readline(L, b, prmt);
+  lua_pop(L, 1);  /* remove result from 'get_prompt' */
+  if (readstatus == 0)
+    return 0;  /* no input */
+  l = strlen(b);
+  if (l > 0 && b[l-1] == '\n')  /* line ends with newline? */
+    b[l-1] = '\0';  /* remove it */
+  if (firstline && b[0] == '=')  /* first line starts with `=' ? */
+    lua_pushfstring(L, "return %s", b+1);  /* change it to `return' */
+  else
+    lua_pushstring(L, b);
+  lua_freeline(L, b);  /* the pushed string is what callers use from here on */
+  return 1;
+}
+
+/* read lines until they load without an "incomplete" error; -1 means no input */
+static int loadline (lua_State *L) {
+  int status;
+  lua_settop(L, 0);
+  if (!pushline(L, 1))
+    return -1;  /* no input */
+  for (;;) {  /* repeat until gets a complete line */
+    size_t l;
+    const char *line = lua_tolstring(L, 1, &l);
+    status = luaL_loadbuffer(L, line, l, "=stdin");
+    if (!incomplete(L, status)) break;  /* cannot try to add lines? */
+    if (!pushline(L, 0))  /* no more input? */
+      return -1;
+    lua_pushliteral(L, "\n");  /* add a new line... */
+    lua_insert(L, -2);  /* ...between the two lines */
+    lua_concat(L, 3);  /* join them */
+  }
+  lua_saveline(L, 1);
+  lua_remove(L, 1);  /* remove line */
+  return status;  /* status of the last luaL_loadbuffer */
+}
+
+/* interactive loop: load, run and print results until 'loadline' returns -1 */
+static void dotty (lua_State *L) {
+  int status;
+  const char *oldprogname = progname;
+  progname = NULL;  /* messages inside the loop carry no program-name prefix */
+  while ((status = loadline(L)) != -1) {
+    if (status == LUA_OK) status = docall(L, 0, LUA_MULTRET);
+    report(L, status);
+    if (status == LUA_OK && lua_gettop(L) > 0) {  /* any result to print? */
+      luaL_checkstack(L, LUA_MINSTACK, "too many results to print");
+      lua_getglobal(L, "print");
+      lua_insert(L, 1);
+      if (lua_pcall(L, lua_gettop(L)-1, 0, 0) != LUA_OK)
+        l_message(progname, lua_pushfstring(L,
+                               "error calling " LUA_QL("print") " (%s)",
+                               lua_tostring(L, -1)));
+    }
+  }
+  lua_settop(L, 0);  /* clear stack */
+  luai_writeline();
+  progname = oldprogname;
+}
+
+/* set global 'arg', load script argv[n] and call it with the remaining arguments */
+static int handle_script (lua_State *L, char **argv, int n) {
+  int status;
+  const char *fname;
+  int narg = getargs(L, argv, n);  /* collect arguments */
+  lua_setglobal(L, "arg");
+  fname = argv[n];
+  if (strcmp(fname, "-") == 0 && strcmp(argv[n-1], "--") != 0)
+    fname = NULL;  /* stdin */
+  status = luaL_loadfile(L, fname);
+  lua_insert(L, -(narg+1));  /* move the load result below the 'narg' arguments */
+  if (status == LUA_OK)
+    status = docall(L, narg, LUA_MULTRET);
+  else
+    lua_pop(L, narg);  /* drop the arguments; the load result stays for 'report' */
+  return report(L, status);
+}
+
+
+/* reject option 'x' (found at index 'i') if it has characters after its letter */
+#define noextrachars(x,i)	{if ((x)[2] != '\0') return -(i);}
+
+
+/* indices of various argument indicators in array args */
+#define has_i		0	/* -i */
+#define has_v		1	/* -v */
+#define has_e		2	/* -e */
+#define has_E		3	/* -E */
+
+#define num_has		4	/* number of 'has_*' */
+
+/* scan options; returns script index (0 if none) or -(index of the bad option) */
+static int collectargs (char **argv, int *args) {
+  int i;
+  for (i = 1; argv[i] != NULL; i++) {
+    if (argv[i][0] != '-')  /* not an option? */
+        return i;
+    switch (argv[i][1]) {  /* option */
+      case '-':
+        noextrachars(argv[i], i);
+        return (argv[i+1] != NULL ? i+1 : 0);
+      case '\0':
+        return i;
+      case 'E':
+        args[has_E] = 1;
+        break;
+      case 'i':
+        noextrachars(argv[i], i);
+        args[has_i] = 1;  /* go through */
+      case 'v':
+        noextrachars(argv[i], i);
+        args[has_v] = 1;
+        break;
+      case 'e':
+        args[has_e] = 1;  /* go through */
+      case 'l':  /* both options need an argument */
+        if (argv[i][2] == '\0') {  /* no concatenated argument? */
+          i++;  /* try next 'argv' */
+          if (argv[i] == NULL || argv[i][0] == '-')
+            return -(i - 1);  /* no next argument or it is another option */
+        }
+        break;
+      default:  /* invalid option; return its index... */
+        return -i;  /* ...as a negative value */
+    }
+  }
+  return 0;
+}
+
+/* run each -e chunk and -l library in argv[1..n-1]; returns 0 on the first failure */
+static int runargs (lua_State *L, char **argv, int n) {
+  int i;
+  for (i = 1; i < n; i++) {
+    lua_assert(argv[i][0] == '-');
+    switch (argv[i][1]) {  /* option */
+      case 'e': {
+        const char *chunk = argv[i] + 2;
+        if (*chunk == '\0') chunk = argv[++i];  /* argument is the next argv entry */
+        lua_assert(chunk != NULL);
+        if (dostring(L, chunk, "=(command line)") != LUA_OK)
+          return 0;
+        break;
+      }
+      case 'l': {
+        const char *filename = argv[i] + 2;
+        if (*filename == '\0') filename = argv[++i];  /* same convention as -e */
+        lua_assert(filename != NULL);
+        if (dolibrary(L, filename) != LUA_OK)
+          return 0;  /* stop if file fails */
+        break;
+      }
+      default: break;
+    }
+  }
+  return 1;
+}
+
+/* run env. var LUA_INITVERSION (else LUA_INIT): "@file" runs a file, else a chunk */
+static int handle_luainit (lua_State *L) {
+  const char *name = "=" LUA_INITVERSION;
+  const char *init = getenv(name + 1);  /* +1 skips the '=' used for the chunk name */
+  if (init == NULL) {
+    name = "=" LUA_INIT;
+    init = getenv(name + 1);  /* try alternative name */
+  }
+  if (init == NULL) return LUA_OK;
+  else if (init[0] == '@')
+    return dofile(L, init+1);
+  else
+    return dostring(L, init, name);
+}
+
+/* body of 'main' run under lua_pcall; returns boolean true only if nothing failed */
+static int pmain (lua_State *L) {
+  int argc = (int)lua_tointeger(L, 1);
+  char **argv = (char **)lua_touserdata(L, 2);
+  int script;
+  int args[num_has];
+  args[has_i] = args[has_v] = args[has_e] = args[has_E] = 0;
+  if (argv[0] && argv[0][0]) progname = argv[0];
+  script = collectargs(argv, args);
+  if (script < 0) {  /* invalid arg? */
+    print_usage(argv[-script]);
+    return 0;
+  }
+  if (args[has_v]) print_version();
+  if (args[has_E]) {  /* option '-E'? */
+    lua_pushboolean(L, 1);  /* signal for libraries to ignore env. vars. */
+    lua_setfield(L, LUA_REGISTRYINDEX, "LUA_NOENV");
+  }
+  /* open standard libraries */
+  luaL_checkversion(L);
+  lua_gc(L, LUA_GCSTOP, 0);  /* stop collector during initialization */
+  luaL_openlibs(L);  /* open libraries */
+  lua_gc(L, LUA_GCRESTART, 0);
+  if (!args[has_E] && handle_luainit(L) != LUA_OK)
+    return 0;  /* error running LUA_INIT */
+  /* execute arguments -e and -l */
+  if (!runargs(L, argv, (script > 0) ? script : argc)) return 0;
+  /* execute main script (if there is one) */
+  if (script && handle_script(L, argv, script) != LUA_OK) return 0;
+  if (args[has_i])  /* -i option? */
+    dotty(L);
+  else if (script == 0 && !args[has_e] && !args[has_v]) {  /* no arguments? */
+    if (lua_stdin_is_tty()) {
+      print_version();
+      dotty(L);
+    }
+    else dofile(L, NULL);  /* executes stdin as a file */
+  }
+  lua_pushboolean(L, 1);  /* signal no errors */
+  return 1;
+}
+
+/* create a state, run 'pmain' in protected mode and map the outcome to an exit code */
+int main (int argc, char **argv) {
+  int status, result;
+  lua_State *L = luaL_newstate();  /* create state */
+  if (L == NULL) {
+    l_message(argv[0], "cannot create state: not enough memory");
+    return EXIT_FAILURE;
+  }
+  /* call 'pmain' in protected mode */
+  lua_pushcfunction(L, &pmain);
+  lua_pushinteger(L, argc);  /* 1st argument */
+  lua_pushlightuserdata(L, argv); /* 2nd argument */
+  status = lua_pcall(L, 2, 1, 0);
+  result = lua_toboolean(L, -1);  /* get result */
+  finalreport(L, status);
+  lua_close(L);
+  return (result && status == LUA_OK) ? EXIT_SUCCESS : EXIT_FAILURE;
+}
+
diff --git a/ext/lua/src/lundump.c b/ext/lua/src/lundump.c
new file mode 100644
index 0000000..54de011
--- /dev/null
+++ b/ext/lua/src/lundump.c
@@ -0,0 +1,258 @@
+/*
+** $Id: lundump.c,v 2.22 2012/05/08 13:53:33 roberto Exp $
+** load precompiled Lua chunks
+** See Copyright Notice in lua.h
+*/
+
+#include <string.h>
+
+#define lundump_c
+#define LUA_CORE
+
+#include "lua.h"
+
+#include "ldebug.h"
+#include "ldo.h"
+#include "lfunc.h"
+#include "lmem.h"
+#include "lobject.h"
+#include "lstring.h"
+#include "lundump.h"
+#include "lzio.h"
+
+typedef struct {	/* everything the Load* helpers need while reading one chunk */
+ lua_State* L;
+ ZIO* Z;		/* input stream, read through luaZ_read */
+ Mbuffer* b;		/* scratch buffer used by LoadString */
+ const char* name;	/* chunk name shown in error messages */
+} LoadState;
+
+static l_noret error(LoadState* S, const char* why)
+{	/* raise LUA_ERRSYNTAX with message "<name>: <why> precompiled chunk" */
+ luaO_pushfstring(S->L,"%s: %s precompiled chunk",S->name,why);
+ luaD_throw(S->L,LUA_ERRSYNTAX);
+}
+
+#define LoadMem(S,b,n,size)	LoadBlock(S,b,(n)*(size))	/* n items of 'size' bytes */
+#define LoadByte(S)		(lu_byte)LoadChar(S)
+#define LoadVar(S,x)		LoadMem(S,&x,1,sizeof(x))	/* raw bytes of one variable */
+#define LoadVector(S,b,n,size)	LoadMem(S,b,n,size)
+
+#if !defined(luai_verifycode)
+#define luai_verifycode(L,b,f)	/* empty */
+#endif
+
+static void LoadBlock(LoadState* S, void* b, size_t size)
+{	/* read 'size' bytes into 'b'; a nonzero luaZ_read result raises "truncated" */
+ if (luaZ_read(S->Z,b,size)!=0) error(S,"truncated");
+}
+
+static int LoadChar(LoadState* S)
+{	/* read one byte */
+ char x;
+ LoadVar(S,x);
+ return x;	/* converted from plain char, whose signedness is platform-defined */
+}
+
+static int LoadInt(LoadState* S)
+{	/* read a raw 'int'; negative values raise "corrupted" */
+ int x;
+ LoadVar(S,x);
+ if (x<0) error(S,"corrupted");
+ return x;
+}
+
+static lua_Number LoadNumber(LoadState* S)
+{	/* read the raw bytes of one lua_Number */
+ lua_Number x;
+ LoadVar(S,x);
+ return x;
+}
+
+static TString* LoadString(LoadState* S)
+{	/* read a size_t length followed by that many bytes */
+ size_t size;
+ LoadVar(S,size);
+ if (size==0)
+  return NULL;	/* zero length: no string */
+ else
+ {
+  char* s=luaZ_openspace(S->L,S->b,size);	/* scratch space from S->b */
+  LoadBlock(S,s,size*sizeof(char));
+  return luaS_newlstr(S->L,s,size-1);		/* remove trailing '\0' */
+ }
+}
+
+static void LoadCode(LoadState* S, Proto* f)
+{	/* read the instruction count, then the instruction array into f->code */
+ int n=LoadInt(S);
+ f->code=luaM_newvector(S->L,n,Instruction);
+ f->sizecode=n;
+ LoadVector(S,f->code,n,sizeof(Instruction));
+}
+
+static void LoadFunction(LoadState* S, Proto* f);	/* needed below for nested prototypes */
+
+static void LoadConstants(LoadState* S, Proto* f)
+{	/* read the constant array f->k, then the nested prototypes f->p */
+ int i,n;
+ n=LoadInt(S);
+ f->k=luaM_newvector(S->L,n,TValue);
+ f->sizek=n;
+ for (i=0; i<n; i++) setnilvalue(&f->k[i]);	/* all slots hold nil before any read */
+ for (i=0; i<n; i++)
+ {
+  TValue* o=&f->k[i];
+  int t=LoadChar(S);	/* type tag of this constant */
+  switch (t)
+  {
+   case LUA_TNIL:
+	setnilvalue(o);
+	break;
+   case LUA_TBOOLEAN:
+	setbvalue(o,LoadChar(S));
+	break;
+   case LUA_TNUMBER:
+	setnvalue(o,LoadNumber(S));
+	break;
+   case LUA_TSTRING:
+	setsvalue2n(S->L,o,LoadString(S));
+	break;
+    default: lua_assert(0);	/* NOTE(review): other tags only assert; the slot keeps nil */
+  }
+ }
+ n=LoadInt(S);
+ f->p=luaM_newvector(S->L,n,Proto*);
+ f->sizep=n;
+ for (i=0; i<n; i++) f->p[i]=NULL;	/* all slots NULL before any nested load */
+ for (i=0; i<n; i++)
+ {
+  f->p[i]=luaF_newproto(S->L);
+  LoadFunction(S,f->p[i]);
+ }
+}
+
+static void LoadUpvalues(LoadState* S, Proto* f)
+{	/* read (instack, idx) for each upvalue; names are filled in by LoadDebug */
+ int i,n;
+ n=LoadInt(S);
+ f->upvalues=luaM_newvector(S->L,n,Upvaldesc);
+ f->sizeupvalues=n;
+ for (i=0; i<n; i++) f->upvalues[i].name=NULL;
+ for (i=0; i<n; i++)
+ {
+  f->upvalues[i].instack=LoadByte(S);
+  f->upvalues[i].idx=LoadByte(S);
+ }
+}
+
+static void LoadDebug(LoadState* S, Proto* f)
+{	/* read source name, line table, local-variable table and upvalue names */
+ int i,n;
+ f->source=LoadString(S);
+ n=LoadInt(S);
+ f->lineinfo=luaM_newvector(S->L,n,int);
+ f->sizelineinfo=n;
+ LoadVector(S,f->lineinfo,n,sizeof(int));
+ n=LoadInt(S);
+ f->locvars=luaM_newvector(S->L,n,LocVar);
+ f->sizelocvars=n;
+ for (i=0; i<n; i++) f->locvars[i].varname=NULL;	/* all names NULL before any read */
+ for (i=0; i<n; i++)
+ {
+  f->locvars[i].varname=LoadString(S);
+  f->locvars[i].startpc=LoadInt(S);
+  f->locvars[i].endpc=LoadInt(S);
+ }
+ n=LoadInt(S); if (n>f->sizeupvalues) error(S,"corrupted");	/* names must fit f->upvalues */
+ for (i=0; i<n; i++) f->upvalues[i].name=LoadString(S);
+}
+
+static void LoadFunction(LoadState* S, Proto* f)
+{	/* read one function prototype; the read order below is the chunk layout */
+ f->linedefined=LoadInt(S);
+ f->lastlinedefined=LoadInt(S);
+ f->numparams=LoadByte(S);
+ f->is_vararg=LoadByte(S);
+ f->maxstacksize=LoadByte(S);
+ LoadCode(S,f);
+ LoadConstants(S,f);
+ LoadUpvalues(S,f);	/* must precede LoadDebug, which fills f->upvalues[i].name */
+ LoadDebug(S,f);
+}
+
+/* the code below must be consistent with the code in luaU_header */
+#define N0	LUAC_HEADERSIZE
+#define N1	(sizeof(LUA_SIGNATURE)-sizeof(char))	/* signature bytes */
+#define N2	N1+2	/* ... plus the VERSION and FORMAT bytes */
+#define N3	N2+6	/* ... plus the six bytes luaU_header writes before LUAC_TAIL */
+
+static void LoadHeader(LoadState* S)
+{	/* compare the chunk header with the expected one; raise on the first difference */
+ lu_byte h[LUAC_HEADERSIZE];
+ lu_byte s[LUAC_HEADERSIZE];
+ luaU_header(h);
+ memcpy(s,h,sizeof(char));			/* first char already read */
+ LoadBlock(S,s+sizeof(char),LUAC_HEADERSIZE-sizeof(char));
+ if (memcmp(h,s,N0)==0) return;
+ if (memcmp(h,s,N1)!=0) error(S,"not a");
+ if (memcmp(h,s,N2)!=0) error(S,"version mismatch in");
+ if (memcmp(h,s,N3)!=0) error(S,"incompatible"); else error(S,"corrupted");
+}
+
+/*
+** load precompiled chunk; the new closure is also pushed on L's stack
+*/
+Closure* luaU_undump (lua_State* L, ZIO* Z, Mbuffer* buff, const char* name)
+{
+ LoadState S;
+ Closure* cl;
+ if (*name=='@' || *name=='=')
+  S.name=name+1;	/* drop the one-character prefix */
+ else if (*name==LUA_SIGNATURE[0])
+  S.name="binary string";
+ else
+  S.name=name;
+ S.L=L;
+ S.Z=Z;
+ S.b=buff;
+ LoadHeader(&S);
+ cl=luaF_newLclosure(L,1);
+ setclLvalue(L,L->top,cl); incr_top(L);
+ cl->l.p=luaF_newproto(L);
+ LoadFunction(&S,cl->l.p);
+ if (cl->l.p->sizeupvalues != 1)
+ {	/* closure was created for 1 upvalue: make one of the right size */
+  Proto* p=cl->l.p;
+  cl=luaF_newLclosure(L,cl->l.p->sizeupvalues);
+  cl->l.p=p;
+  setclLvalue(L,L->top-1,cl);	/* replace the closure pushed above */
+ }
+ luai_verifycode(L,buff,cl->l.p);
+ return cl;
+}
+
+#define MYINT(s)	(s[0]-'0')	/* numeric value of the first character of s */
+#define VERSION		MYINT(LUA_VERSION_MAJOR)*16+MYINT(LUA_VERSION_MINOR)
+#define FORMAT		0		/* this is the official format */
+
+/*
+* make header for precompiled chunks
+* if you change the code below be sure to update LoadHeader and FORMAT above
+* and LUAC_HEADERSIZE in lundump.h
+*/
+void luaU_header (lu_byte* h)
+{
+ int x=1;	/* its first byte in memory is the endianness marker written below */
+ memcpy(h,LUA_SIGNATURE,sizeof(LUA_SIGNATURE)-sizeof(char));
+ h+=sizeof(LUA_SIGNATURE)-sizeof(char);
+ *h++=cast_byte(VERSION);
+ *h++=cast_byte(FORMAT);
+ *h++=cast_byte(*(char*)&x);			/* endianness */
+ *h++=cast_byte(sizeof(int));
+ *h++=cast_byte(sizeof(size_t));
+ *h++=cast_byte(sizeof(Instruction));
+ *h++=cast_byte(sizeof(lua_Number));
+ *h++=cast_byte(((lua_Number)0.5)==0);		/* is lua_Number integral? */
+ memcpy(h,LUAC_TAIL,sizeof(LUAC_TAIL)-sizeof(char));
+}
diff --git a/ext/lua/src/lvm.c b/ext/lua/src/lvm.c
new file mode 100644
index 0000000..657d5c4
--- /dev/null
+++ b/ext/lua/src/lvm.c
@@ -0,0 +1,867 @@
+/*
+** $Id: lvm.c,v 2.155 2013/03/16 21:10:18 roberto Exp $
+** Lua virtual machine
+** See Copyright Notice in lua.h
+*/
+
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define lvm_c
+#define LUA_CORE
+
+#include "lua.h"
+
+#include "ldebug.h"
+#include "ldo.h"
+#include "lfunc.h"
+#include "lgc.h"
+#include "lobject.h"
+#include "lopcodes.h"
+#include "lstate.h"
+#include "lstring.h"
+#include "ltable.h"
+#include "ltm.h"
+#include "lvm.h"
+
+
+
+/* max. steps along a tag-method chain before "loop in gettable/settable" is raised */
+#define MAXTAGLOOP	100
+
+/* returns 'obj' if it is a number, 'n' set from a convertible string, else NULL */
+const TValue *luaV_tonumber (const TValue *obj, TValue *n) {
+  lua_Number num;
+  if (ttisnumber(obj)) return obj;
+  if (ttisstring(obj) && luaO_str2d(svalue(obj), tsvalue(obj)->len, &num)) {
+    setnvalue(n, num);
+    return n;
+  }
+  else
+    return NULL;
+}
+
+/* replace a number at 'obj' by its string form; returns 0 if 'obj' is not a number */
+int luaV_tostring (lua_State *L, StkId obj) {
+  if (!ttisnumber(obj))
+    return 0;
+  else {
+    char s[LUAI_MAXNUMBER2STR];
+    lua_Number n = nvalue(obj);
+    int l = lua_number2str(s, n);
+    setsvalue2s(L, obj, luaS_newlstr(L, s, l));
+    return 1;
+  }
+}
+
+/* call the count and/or line hooks as needed, and handle a hook that yielded */
+static void traceexec (lua_State *L) {
+  CallInfo *ci = L->ci;
+  lu_byte mask = L->hookmask;
+  int counthook = ((mask & LUA_MASKCOUNT) && L->hookcount == 0);
+  if (counthook)
+    resethookcount(L);  /* reset count */
+  if (ci->callstatus & CIST_HOOKYIELD) {  /* called hook last time? */
+    ci->callstatus &= ~CIST_HOOKYIELD;  /* erase mark */
+    return;  /* do not call hook again (VM yielded, so it did not move) */
+  }
+  if (counthook)
+    luaD_hook(L, LUA_HOOKCOUNT, -1);  /* call count hook */
+  if (mask & LUA_MASKLINE) {
+    Proto *p = ci_func(ci)->p;
+    int npc = pcRel(ci->u.l.savedpc, p);
+    int newline = getfuncline(p, npc);
+    if (npc == 0 ||  /* call linehook when enter a new function, */
+        ci->u.l.savedpc <= L->oldpc ||  /* when jump back (loop), or when */
+        newline != getfuncline(p, pcRel(L->oldpc, p)))  /* enter a new line */
+      luaD_hook(L, LUA_HOOKLINE, newline);  /* call line hook */
+  }
+  L->oldpc = ci->u.l.savedpc;  /* remembered for the comparisons on the next call */
+  if (L->status == LUA_YIELD) {  /* did hook yield? */
+    if (counthook)
+      L->hookcount = 1;  /* undo decrement to zero */
+    ci->u.l.savedpc--;  /* undo increment (resume will increment it again) */
+    ci->callstatus |= CIST_HOOKYIELD;  /* mark that it yielded */
+    ci->func = L->top - 1;  /* protect stack below results */
+    luaD_throw(L, LUA_YIELD);
+  }
+}
+
+/* call f(p1, p2) storing its result in 'p3' (hasres == 1), or f(p1, p2, p3) without result */
+static void callTM (lua_State *L, const TValue *f, const TValue *p1,
+                    const TValue *p2, TValue *p3, int hasres) {
+  ptrdiff_t result = savestack(L, p3);  /* saved form of 'p3', restored after the call */
+  setobj2s(L, L->top++, f);  /* push function */
+  setobj2s(L, L->top++, p1);  /* 1st argument */
+  setobj2s(L, L->top++, p2);  /* 2nd argument */
+  if (!hasres)  /* no result? 'p3' is third argument */
+    setobj2s(L, L->top++, p3);  /* 3rd argument */
+  /* metamethod may yield only when called from Lua code */
+  luaD_call(L, L->top - (4 - hasres), hasres, isLua(L->ci));
+  if (hasres) {  /* if has result, move it to its place */
+    p3 = restorestack(L, result);
+    setobjs2s(L, p3, --L->top);
+  }
+}
+
+/* val = t[key], following TM_INDEX metamethods for at most MAXTAGLOOP steps */
+void luaV_gettable (lua_State *L, const TValue *t, TValue *key, StkId val) {
+  int loop;
+  for (loop = 0; loop < MAXTAGLOOP; loop++) {
+    const TValue *tm;
+    if (ttistable(t)) {  /* `t' is a table? */
+      Table *h = hvalue(t);
+      const TValue *res = luaH_get(h, key); /* do a primitive get */
+      if (!ttisnil(res) ||  /* result is not nil? */
+          (tm = fasttm(L, h->metatable, TM_INDEX)) == NULL) { /* or no TM? */
+        setobj2s(L, val, res);
+        return;
+      }
+      /* else will try the tag method */
+    }
+    else if (ttisnil(tm = luaT_gettmbyobj(L, t, TM_INDEX)))
+      luaG_typeerror(L, t, "index");
+    if (ttisfunction(tm)) {  /* function metamethod: call it and stop */
+      callTM(L, tm, t, key, val, 1);
+      return;
+    }
+    t = tm;  /* else repeat with 'tm' */
+  }
+  luaG_runerror(L, "loop in gettable");
+}
+
+/* t[key] = val, following TM_NEWINDEX metamethods for at most MAXTAGLOOP steps */
+void luaV_settable (lua_State *L, const TValue *t, TValue *key, StkId val) {
+  int loop;
+  for (loop = 0; loop < MAXTAGLOOP; loop++) {
+    const TValue *tm;
+    if (ttistable(t)) {  /* `t' is a table? */
+      Table *h = hvalue(t);
+      TValue *oldval = cast(TValue *, luaH_get(h, key));
+      /* if previous value is not nil, there must be a previous entry
+         in the table; moreover, a metamethod has no relevance */
+      if (!ttisnil(oldval) ||
+         /* previous value is nil; must check the metamethod */
+         ((tm = fasttm(L, h->metatable, TM_NEWINDEX)) == NULL &&
+         /* no metamethod; is there a previous entry in the table? */
+         (oldval != luaO_nilobject ||
+         /* no previous entry; must create one. (The next test is
+            always true; we only need the assignment.) */
+         (oldval = luaH_newkey(L, h, key), 1)))) {
+        /* no metamethod and (now) there is an entry with given key */
+        setobj2t(L, oldval, val);  /* assign new value to that entry */
+        invalidateTMcache(h);
+        luaC_barrierback(L, obj2gco(h), val);
+        return;
+      }
+      /* else will try the metamethod */
+    }
+    else  /* not a table; check metamethod */
+      if (ttisnil(tm = luaT_gettmbyobj(L, t, TM_NEWINDEX)))
+        luaG_typeerror(L, t, "index");
+    /* there is a metamethod */
+    if (ttisfunction(tm)) {
+      callTM(L, tm, t, key, val, 0);  /* three arguments, no result */
+      return;
+    }
+    t = tm;  /* else repeat with 'tm' */
+  }
+  luaG_runerror(L, "loop in settable");
+}
+
+/* call metamethod 'event' of p1 (else of p2) into 'res'; returns 0 if neither has one */
+static int call_binTM (lua_State *L, const TValue *p1, const TValue *p2,
+                       StkId res, TMS event) {
+  const TValue *tm = luaT_gettmbyobj(L, p1, event);  /* try first operand */
+  if (ttisnil(tm))
+    tm = luaT_gettmbyobj(L, p2, event);  /* try second operand */
+  if (ttisnil(tm)) return 0;
+  callTM(L, tm, p1, p2, res, 1);
+  return 1;
+}
+
+/* metamethod 'event' usable for both metatables, or NULL if missing or different */
+static const TValue *get_equalTM (lua_State *L, Table *mt1, Table *mt2,
+                                  TMS event) {
+  const TValue *tm1 = fasttm(L, mt1, event);
+  const TValue *tm2;
+  if (tm1 == NULL) return NULL;  /* no metamethod */
+  if (mt1 == mt2) return tm1;  /* same metatables => same metamethods */
+  tm2 = fasttm(L, mt2, event);
+  if (tm2 == NULL) return NULL;  /* no metamethod */
+  if (luaV_rawequalobj(tm1, tm2))  /* same metamethods? */
+    return tm1;
+  return NULL;
+}
+
+/* -1 if no metamethod 'event'; else the truth value of its result (left at L->top) */
+static int call_orderTM (lua_State *L, const TValue *p1, const TValue *p2,
+                         TMS event) {
+  if (!call_binTM(L, p1, p2, L->top, event))
+    return -1;  /* no metamethod */
+  else
+    return !l_isfalse(L->top);
+}
+
+/* compare strings that may contain '\0': strcoll on each '\0'-delimited piece in turn */
+static int l_strcmp (const TString *ls, const TString *rs) {
+  const char *l = getstr(ls);
+  size_t ll = ls->tsv.len;
+  const char *r = getstr(rs);
+  size_t lr = rs->tsv.len;
+  for (;;) {
+    int temp = strcoll(l, r);
+    if (temp != 0) return temp;
+    else {  /* strings are equal up to a `\0' */
+      size_t len = strlen(l);  /* index of first `\0' in both strings */
+      if (len == lr)  /* r is finished? */
+        return (len == ll) ? 0 : 1;
+      else if (len == ll)  /* l is finished? */
+        return -1;  /* l is smaller than r (because r is not finished) */
+      /* both strings longer than `len'; go on comparing (after the `\0') */
+      len++;
+      l += len; ll -= len; r += len; lr -= len;
+    }
+  }
+}
+
+/* l < r for two numbers or two strings; otherwise via TM_LT, else an order error */
+int luaV_lessthan (lua_State *L, const TValue *l, const TValue *r) {
+  int res;
+  if (ttisnumber(l) && ttisnumber(r))
+    return luai_numlt(L, nvalue(l), nvalue(r));
+  else if (ttisstring(l) && ttisstring(r))
+    return l_strcmp(rawtsvalue(l), rawtsvalue(r)) < 0;
+  else if ((res = call_orderTM(L, l, r, TM_LT)) < 0)
+    luaG_ordererror(L, l, r);
+  return res;
+}
+
+/* l <= r; without TM_LE it falls back to "not (r < l)" through TM_LT */
+int luaV_lessequal (lua_State *L, const TValue *l, const TValue *r) {
+  int res;
+  if (ttisnumber(l) && ttisnumber(r))
+    return luai_numle(L, nvalue(l), nvalue(r));
+  else if (ttisstring(l) && ttisstring(r))
+    return l_strcmp(rawtsvalue(l), rawtsvalue(r)) <= 0;
+  else if ((res = call_orderTM(L, l, r, TM_LE)) >= 0)  /* first try `le' */
+    return res;
+  else if ((res = call_orderTM(L, r, l, TM_LT)) < 0)  /* else try `lt' */
+    luaG_ordererror(L, l, r);
+  return !res;
+}
+
+
+/*
+** equality of two same-typed Lua values; L == NULL means raw equality (no metamethods)
+*/
+int luaV_equalobj_ (lua_State *L, const TValue *t1, const TValue *t2) {
+  const TValue *tm;
+  lua_assert(ttisequal(t1, t2));
+  switch (ttype(t1)) {
+    case LUA_TNIL: return 1;
+    case LUA_TNUMBER: return luai_numeq(nvalue(t1), nvalue(t2));
+    case LUA_TBOOLEAN: return bvalue(t1) == bvalue(t2);  /* true must be 1 !! */
+    case LUA_TLIGHTUSERDATA: return pvalue(t1) == pvalue(t2);
+    case LUA_TLCF: return fvalue(t1) == fvalue(t2);
+    case LUA_TSHRSTR: return eqshrstr(rawtsvalue(t1), rawtsvalue(t2));
+    case LUA_TLNGSTR: return luaS_eqlngstr(rawtsvalue(t1), rawtsvalue(t2));
+    case LUA_TUSERDATA: {
+      if (uvalue(t1) == uvalue(t2)) return 1;
+      else if (L == NULL) return 0;
+      tm = get_equalTM(L, uvalue(t1)->metatable, uvalue(t2)->metatable, TM_EQ);
+      break;  /* will try TM */
+    }
+    case LUA_TTABLE: {
+      if (hvalue(t1) == hvalue(t2)) return 1;
+      else if (L == NULL) return 0;
+      tm = get_equalTM(L, hvalue(t1)->metatable, hvalue(t2)->metatable, TM_EQ);
+      break;  /* will try TM */
+    }
+    default:
+      lua_assert(iscollectable(t1));
+      return gcvalue(t1) == gcvalue(t2);  /* identity of the collectable object */
+  }
+  if (tm == NULL) return 0;  /* no TM? */
+  callTM(L, tm, t1, t2, L->top, 1);  /* call TM */
+  return !l_isfalse(L->top);
+}
+
+/* concatenate the 'total' values on top of the stack, leaving a single result there */
+void luaV_concat (lua_State *L, int total) {
+  lua_assert(total >= 2);
+  do {
+    StkId top = L->top;
+    int n = 2;  /* number of elements handled in this pass (at least 2) */
+    if (!(ttisstring(top-2) || ttisnumber(top-2)) || !tostring(L, top-1)) {
+      if (!call_binTM(L, top-2, top-1, top-2, TM_CONCAT))
+        luaG_concaterror(L, top-2, top-1);
+    }
+    else if (tsvalue(top-1)->len == 0)  /* second operand is empty? */
+      (void)tostring(L, top - 2);  /* result is first operand */
+    else if (ttisstring(top-2) && tsvalue(top-2)->len == 0) {
+      setobjs2s(L, top - 2, top - 1);  /* result is second op. */
+    }
+    else {
+      /* at least two non-empty string values; get as many as possible */
+      size_t tl = tsvalue(top-1)->len;
+      char *buffer;
+      int i;
+      /* collect total length */
+      for (i = 1; i < total && tostring(L, top-i-1); i++) {
+        size_t l = tsvalue(top-i-1)->len;
+        if (l >= (MAX_SIZET/sizeof(char)) - tl)
+          luaG_runerror(L, "string length overflow");
+        tl += l;
+      }
+      buffer = luaZ_openspace(L, &G(L)->buff, tl);
+      tl = 0;
+      n = i;
+      do {  /* concat all strings */
+        size_t l = tsvalue(top-i)->len;
+        memcpy(buffer+tl, svalue(top-i), l * sizeof(char));
+        tl += l;
+      } while (--i > 0);
+      setsvalue2s(L, top-n, luaS_newlstr(L, buffer, tl));
+    }
+    total -= n-1;  /* got 'n' strings to create 1 new */
+    L->top -= n-1;  /* popped 'n' strings and pushed one */
+  } while (total > 1);  /* repeat until only 1 result left */
+}
+
+/* ra = length of rb: primitive for strings and tables without TM_LEN, else metamethod */
+void luaV_objlen (lua_State *L, StkId ra, const TValue *rb) {
+  const TValue *tm;
+  switch (ttypenv(rb)) {
+    case LUA_TTABLE: {
+      Table *h = hvalue(rb);
+      tm = fasttm(L, h->metatable, TM_LEN);
+      if (tm) break;  /* metamethod? break switch to call it */
+      setnvalue(ra, cast_num(luaH_getn(h)));  /* else primitive len */
+      return;
+    }
+    case LUA_TSTRING: {
+      setnvalue(ra, cast_num(tsvalue(rb)->len));
+      return;
+    }
+    default: {  /* try metamethod */
+      tm = luaT_gettmbyobj(L, rb, TM_LEN);
+      if (ttisnil(tm))  /* no metamethod? */
+        luaG_typeerror(L, rb, "get length of");
+      break;
+    }
+  }
+  callTM(L, tm, rb, rb, ra, 1);  /* 'rb' is passed as both arguments */
+}
+
+
+void luaV_arith (lua_State *L, StkId ra, const TValue *rb,
+                 const TValue *rc, TMS op) {
+  TValue tempb, tempc;
+  const TValue *b, *c;
+  if ((b = luaV_tonumber(rb, &tempb)) != NULL &&
+      (c = luaV_tonumber(rc, &tempc)) != NULL) {
+    lua_Number res = luaO_arith(op - TM_ADD + LUA_OPADD, nvalue(b), nvalue(c));
+    setnvalue(ra, res);
+  }
+  else if (!call_binTM(L, rb, rc, ra, op))
+    luaG_aritherror(L, rb, rc);
+}
+
+
+/*
+** check whether cached closure in prototype 'p' may be reused, that is,
+** whether there is a cached closure with the same upvalues needed by
+** new closure to be created.
+*/
+static Closure *getcached (Proto *p, UpVal **encup, StkId base) {
+  Closure *c = p->cache;
+  if (c != NULL) {  /* is there a cached closure? */
+    int nup = p->sizeupvalues;
+    Upvaldesc *uv = p->upvalues;
+    int i;
+    for (i = 0; i < nup; i++) {  /* check whether it has right upvalues */
+      TValue *v = uv[i].instack ? base + uv[i].idx : encup[uv[i].idx]->v;
+      if (c->l.upvals[i]->v != v)
+        return NULL;  /* wrong upvalue; cannot reuse closure */
+    }
+  }
+  return c;  /* return cached closure (or NULL if no cached closure) */
+}
+
+
+/*
+** create a new Lua closure, push it in the stack, and initialize
+** its upvalues. Note that the call to 'luaC_barrierproto' must come
+** before the assignment to 'p->cache', as the function needs the
+** original value of that field.
+*/
+static void pushclosure (lua_State *L, Proto *p, UpVal **encup, StkId base,
+                         StkId ra) {
+  int nup = p->sizeupvalues;
+  Upvaldesc *uv = p->upvalues;
+  int i;
+  Closure *ncl = luaF_newLclosure(L, nup);
+  ncl->l.p = p;
+  setclLvalue(L, ra, ncl);  /* anchor new closure in stack */
+  for (i = 0; i < nup; i++) {  /* fill in its upvalues */
+    if (uv[i].instack)  /* upvalue refers to local variable? */
+      ncl->l.upvals[i] = luaF_findupval(L, base + uv[i].idx);
+    else  /* get upvalue from enclosing function */
+      ncl->l.upvals[i] = encup[uv[i].idx];
+  }
+  luaC_barrierproto(L, p, ncl);
+  p->cache = ncl;  /* save it on cache for reuse */
+}
+
+
+/*
+** finish execution of an opcode interrupted by a yield
+*/
+void luaV_finishOp (lua_State *L) {
+  CallInfo *ci = L->ci;
+  StkId base = ci->u.l.base;
+  Instruction inst = *(ci->u.l.savedpc - 1);  /* interrupted instruction */
+  OpCode op = GET_OPCODE(inst);
+  switch (op) {  /* finish its execution */
+    case OP_ADD: case OP_SUB: case OP_MUL: case OP_DIV:
+    case OP_MOD: case OP_POW: case OP_UNM: case OP_LEN:
+    case OP_GETTABUP: case OP_GETTABLE: case OP_SELF: {
+      setobjs2s(L, base + GETARG_A(inst), --L->top);
+      break;
+    }
+    case OP_LE: case OP_LT: case OP_EQ: {
+      int res = !l_isfalse(L->top - 1);
+      L->top--;
+      /* metamethod should not be called when operand is K */
+      lua_assert(!ISK(GETARG_B(inst)));
+      if (op == OP_LE &&  /* "<=" using "<" instead? */
+          ttisnil(luaT_gettmbyobj(L, base + GETARG_B(inst), TM_LE)))
+        res = !res;  /* invert result */
+      lua_assert(GET_OPCODE(*ci->u.l.savedpc) == OP_JMP);
+      if (res != GETARG_A(inst))  /* condition failed? */
+        ci->u.l.savedpc++;  /* skip jump instruction */
+      break;
+    }
+    case OP_CONCAT: {
+      StkId top = L->top - 1;  /* top when 'call_binTM' was called */
+      int b = GETARG_B(inst);      /* first element to concatenate */
+      int total = cast_int(top - 1 - (base + b));  /* yet to concatenate */
+      setobj2s(L, top - 2, top);  /* put TM result in proper position */
+      if (total > 1) {  /* are there elements to concat? */
+        L->top = top - 1;  /* top is one after last element (at top-2) */
+        luaV_concat(L, total);  /* concat them (may yield again) */
+      }
+      /* move final result to final position */
+      setobj2s(L, ci->u.l.base + GETARG_A(inst), L->top - 1);
+      L->top = ci->top;  /* restore top */
+      break;
+    }
+    case OP_TFORCALL: {
+      lua_assert(GET_OPCODE(*ci->u.l.savedpc) == OP_TFORLOOP);
+      L->top = ci->top;  /* correct top */
+      break;
+    }
+    case OP_CALL: {
+      if (GETARG_C(inst) - 1 >= 0)  /* nresults >= 0? */
+        L->top = ci->top;  /* adjust results */
+      break;
+    }
+    case OP_TAILCALL: case OP_SETTABUP: case OP_SETTABLE:
+      break;
+    default: lua_assert(0);
+  }
+}
+
+
+
+/*
+** some macros for common tasks in `luaV_execute'
+*/
+
+#if !defined luai_runtimecheck
+#define luai_runtimecheck(L, c)		/* void */
+#endif
+
+
+#define RA(i)	(base+GETARG_A(i))
+/* to be used after possible stack reallocation */
+#define RB(i)	check_exp(getBMode(GET_OPCODE(i)) == OpArgR, base+GETARG_B(i))
+#define RC(i)	check_exp(getCMode(GET_OPCODE(i)) == OpArgR, base+GETARG_C(i))
+#define RKB(i)	check_exp(getBMode(GET_OPCODE(i)) == OpArgK, \
+	ISK(GETARG_B(i)) ? k+INDEXK(GETARG_B(i)) : base+GETARG_B(i))
+#define RKC(i)	check_exp(getCMode(GET_OPCODE(i)) == OpArgK, \
+	ISK(GETARG_C(i)) ? k+INDEXK(GETARG_C(i)) : base+GETARG_C(i))
+#define KBx(i)  \
+  (k + (GETARG_Bx(i) != 0 ? GETARG_Bx(i) - 1 : GETARG_Ax(*ci->u.l.savedpc++)))
+
+
+/* execute a jump instruction */
+#define dojump(ci,i,e) \
+  { int a = GETARG_A(i); \
+    if (a > 0) luaF_close(L, ci->u.l.base + a - 1); \
+    ci->u.l.savedpc += GETARG_sBx(i) + e; }
+
+/* for test instructions, execute the jump instruction that follows it */
+#define donextjump(ci)	{ i = *ci->u.l.savedpc; dojump(ci, i, 1); }
+
+
+#define Protect(x)	{ {x;}; base = ci->u.l.base; }
+
+#define checkGC(L,c)  \
+  Protect( luaC_condGC(L,{L->top = (c);  /* limit of live values */ \
+                          luaC_step(L); \
+                          L->top = ci->top;})  /* restore top */ \
+           luai_threadyield(L); )
+
+
+#define arith_op(op,tm) { \
+        TValue *rb = RKB(i); \
+        TValue *rc = RKC(i); \
+        if (ttisnumber(rb) && ttisnumber(rc)) { \
+          lua_Number nb = nvalue(rb), nc = nvalue(rc); \
+          setnvalue(ra, op(L, nb, nc)); \
+        } \
+        else { Protect(luaV_arith(L, ra, rb, rc, tm)); } }
+
+
+#define vmdispatch(o)	switch(o)
+#define vmcase(l,b)	case l: {b}  break;
+#define vmcasenb(l,b)	case l: {b}		/* nb = no break */
+
+void luaV_execute (lua_State *L) {
+  CallInfo *ci = L->ci;
+  LClosure *cl;
+  TValue *k;
+  StkId base;
+ newframe:  /* reentry point when frame changes (call/return) */
+  lua_assert(ci == L->ci);
+  cl = clLvalue(ci->func);
+  k = cl->p->k;
+  base = ci->u.l.base;
+  /* main loop of interpreter */
+  for (;;) {
+    Instruction i = *(ci->u.l.savedpc++);
+    StkId ra;
+    if ((L->hookmask & (LUA_MASKLINE | LUA_MASKCOUNT)) &&
+        (--L->hookcount == 0 || L->hookmask & LUA_MASKLINE)) {
+      Protect(traceexec(L));
+    }
+    /* WARNING: several calls may realloc the stack and invalidate `ra' */
+    ra = RA(i);
+    lua_assert(base == ci->u.l.base);
+    lua_assert(base <= L->top && L->top < L->stack + L->stacksize);
+    vmdispatch (GET_OPCODE(i)) {
+      vmcase(OP_MOVE,
+        setobjs2s(L, ra, RB(i));
+      )
+      vmcase(OP_LOADK,
+        TValue *rb = k + GETARG_Bx(i);
+        setobj2s(L, ra, rb);
+      )
+      vmcase(OP_LOADKX,
+        TValue *rb;
+        lua_assert(GET_OPCODE(*ci->u.l.savedpc) == OP_EXTRAARG);
+        rb = k + GETARG_Ax(*ci->u.l.savedpc++);
+        setobj2s(L, ra, rb);
+      )
+      vmcase(OP_LOADBOOL,
+        setbvalue(ra, GETARG_B(i));
+        if (GETARG_C(i)) ci->u.l.savedpc++;  /* skip next instruction (if C) */
+      )
+      vmcase(OP_LOADNIL,
+        int b = GETARG_B(i);
+        do {
+          setnilvalue(ra++);
+        } while (b--);
+      )
+      vmcase(OP_GETUPVAL,
+        int b = GETARG_B(i);
+        setobj2s(L, ra, cl->upvals[b]->v);
+      )
+      vmcase(OP_GETTABUP,
+        int b = GETARG_B(i);
+        Protect(luaV_gettable(L, cl->upvals[b]->v, RKC(i), ra));
+      )
+      vmcase(OP_GETTABLE,
+        Protect(luaV_gettable(L, RB(i), RKC(i), ra));
+      )
+      vmcase(OP_SETTABUP,
+        int a = GETARG_A(i);
+        Protect(luaV_settable(L, cl->upvals[a]->v, RKB(i), RKC(i)));
+      )
+      vmcase(OP_SETUPVAL,
+        UpVal *uv = cl->upvals[GETARG_B(i)];
+        setobj(L, uv->v, ra);
+        luaC_barrier(L, uv, ra);
+      )
+      vmcase(OP_SETTABLE,
+        Protect(luaV_settable(L, ra, RKB(i), RKC(i)));
+      )
+      vmcase(OP_NEWTABLE,
+        int b = GETARG_B(i);
+        int c = GETARG_C(i);
+        Table *t = luaH_new(L);
+        sethvalue(L, ra, t);
+        if (b != 0 || c != 0)
+          luaH_resize(L, t, luaO_fb2int(b), luaO_fb2int(c));
+        checkGC(L, ra + 1);
+      )
+      vmcase(OP_SELF,
+        StkId rb = RB(i);
+        setobjs2s(L, ra+1, rb);
+        Protect(luaV_gettable(L, rb, RKC(i), ra));
+      )
+      vmcase(OP_ADD,
+        arith_op(luai_numadd, TM_ADD);
+      )
+      vmcase(OP_SUB,
+        arith_op(luai_numsub, TM_SUB);
+      )
+      vmcase(OP_MUL,
+        arith_op(luai_nummul, TM_MUL);
+      )
+      vmcase(OP_DIV,
+        arith_op(luai_numdiv, TM_DIV);
+      )
+      vmcase(OP_MOD,
+        arith_op(luai_nummod, TM_MOD);
+      )
+      vmcase(OP_POW,
+        arith_op(luai_numpow, TM_POW);
+      )
+      vmcase(OP_UNM,
+        TValue *rb = RB(i);
+        if (ttisnumber(rb)) {
+          lua_Number nb = nvalue(rb);
+          setnvalue(ra, luai_numunm(L, nb));
+        }
+        else {
+          Protect(luaV_arith(L, ra, rb, rb, TM_UNM));
+        }
+      )
+      vmcase(OP_NOT,
+        TValue *rb = RB(i);
+        int res = l_isfalse(rb);  /* next assignment may change this value */
+        setbvalue(ra, res);
+      )
+      vmcase(OP_LEN,
+        Protect(luaV_objlen(L, ra, RB(i)));
+      )
+      vmcase(OP_CONCAT,
+        int b = GETARG_B(i);
+        int c = GETARG_C(i);
+        StkId rb;
+        L->top = base + c + 1;  /* mark the end of concat operands */
+        Protect(luaV_concat(L, c - b + 1));
+        ra = RA(i);  /* 'luaV_concat' may invoke TMs and move the stack */
+        rb = b + base;
+        setobjs2s(L, ra, rb);
+        checkGC(L, (ra >= rb ? ra + 1 : rb));
+        L->top = ci->top;  /* restore top */
+      )
+      vmcase(OP_JMP,
+        dojump(ci, i, 0);
+      )
+      vmcase(OP_EQ,
+        TValue *rb = RKB(i);
+        TValue *rc = RKC(i);
+        Protect(
+          if (cast_int(equalobj(L, rb, rc)) != GETARG_A(i))
+            ci->u.l.savedpc++;
+          else
+            donextjump(ci);
+        )
+      )
+      vmcase(OP_LT,
+        Protect(
+          if (luaV_lessthan(L, RKB(i), RKC(i)) != GETARG_A(i))
+            ci->u.l.savedpc++;
+          else
+            donextjump(ci);
+        )
+      )
+      vmcase(OP_LE,
+        Protect(
+          if (luaV_lessequal(L, RKB(i), RKC(i)) != GETARG_A(i))
+            ci->u.l.savedpc++;
+          else
+            donextjump(ci);
+        )
+      )
+      vmcase(OP_TEST,
+        if (GETARG_C(i) ? l_isfalse(ra) : !l_isfalse(ra))
+            ci->u.l.savedpc++;
+          else
+          donextjump(ci);
+      )
+      vmcase(OP_TESTSET,
+        TValue *rb = RB(i);
+        if (GETARG_C(i) ? l_isfalse(rb) : !l_isfalse(rb))
+          ci->u.l.savedpc++;
+        else {
+          setobjs2s(L, ra, rb);
+          donextjump(ci);
+        }
+      )
+      vmcase(OP_CALL,
+        int b = GETARG_B(i);
+        int nresults = GETARG_C(i) - 1;
+        if (b != 0) L->top = ra+b;  /* else previous instruction set top */
+        if (luaD_precall(L, ra, nresults)) {  /* C function? */
+          if (nresults >= 0) L->top = ci->top;  /* adjust results */
+          base = ci->u.l.base;
+        }
+        else {  /* Lua function */
+          ci = L->ci;
+          ci->callstatus |= CIST_REENTRY;
+          goto newframe;  /* restart luaV_execute over new Lua function */
+        }
+      )
+      vmcase(OP_TAILCALL,
+        int b = GETARG_B(i);
+        if (b != 0) L->top = ra+b;  /* else previous instruction set top */
+        lua_assert(GETARG_C(i) - 1 == LUA_MULTRET);
+        if (luaD_precall(L, ra, LUA_MULTRET))  /* C function? */
+          base = ci->u.l.base;
+        else {
+          /* tail call: put called frame (n) in place of caller one (o) */
+          CallInfo *nci = L->ci;  /* called frame */
+          CallInfo *oci = nci->previous;  /* caller frame */
+          StkId nfunc = nci->func;  /* called function */
+          StkId ofunc = oci->func;  /* caller function */
+          /* last stack slot filled by 'precall' */
+          StkId lim = nci->u.l.base + getproto(nfunc)->numparams;
+          int aux;
+          /* close all upvalues from previous call */
+          if (cl->p->sizep > 0) luaF_close(L, oci->u.l.base);
+          /* move new frame into old one */
+          for (aux = 0; nfunc + aux < lim; aux++)
+            setobjs2s(L, ofunc + aux, nfunc + aux);
+          oci->u.l.base = ofunc + (nci->u.l.base - nfunc);  /* correct base */
+          oci->top = L->top = ofunc + (L->top - nfunc);  /* correct top */
+          oci->u.l.savedpc = nci->u.l.savedpc;
+          oci->callstatus |= CIST_TAIL;  /* function was tail called */
+          ci = L->ci = oci;  /* remove new frame */
+          lua_assert(L->top == oci->u.l.base + getproto(ofunc)->maxstacksize);
+          goto newframe;  /* restart luaV_execute over new Lua function */
+        }
+      )
+      vmcasenb(OP_RETURN,
+        int b = GETARG_B(i);
+        if (b != 0) L->top = ra+b-1;
+        if (cl->p->sizep > 0) luaF_close(L, base);
+        b = luaD_poscall(L, ra);
+        if (!(ci->callstatus & CIST_REENTRY))  /* 'ci' still the called one */
+          return;  /* external invocation: return */
+        else {  /* invocation via reentry: continue execution */
+          ci = L->ci;
+          if (b) L->top = ci->top;
+          lua_assert(isLua(ci));
+          lua_assert(GET_OPCODE(*((ci)->u.l.savedpc - 1)) == OP_CALL);
+          goto newframe;  /* restart luaV_execute over new Lua function */
+        }
+      )
+      vmcase(OP_FORLOOP,
+        lua_Number step = nvalue(ra+2);
+        lua_Number idx = luai_numadd(L, nvalue(ra), step); /* increment index */
+        lua_Number limit = nvalue(ra+1);
+        if (luai_numlt(L, 0, step) ? luai_numle(L, idx, limit)
+                                   : luai_numle(L, limit, idx)) {
+          ci->u.l.savedpc += GETARG_sBx(i);  /* jump back */
+          setnvalue(ra, idx);  /* update internal index... */
+          setnvalue(ra+3, idx);  /* ...and external index */
+        }
+      )
+      vmcase(OP_FORPREP,
+        const TValue *init = ra;
+        const TValue *plimit = ra+1;
+        const TValue *pstep = ra+2;
+        if (!tonumber(init, ra))
+          luaG_runerror(L, LUA_QL("for") " initial value must be a number");
+        else if (!tonumber(plimit, ra+1))
+          luaG_runerror(L, LUA_QL("for") " limit must be a number");
+        else if (!tonumber(pstep, ra+2))
+          luaG_runerror(L, LUA_QL("for") " step must be a number");
+        setnvalue(ra, luai_numsub(L, nvalue(ra), nvalue(pstep)));
+        ci->u.l.savedpc += GETARG_sBx(i);
+      )
+      vmcasenb(OP_TFORCALL,
+        StkId cb = ra + 3;  /* call base */
+        setobjs2s(L, cb+2, ra+2);
+        setobjs2s(L, cb+1, ra+1);
+        setobjs2s(L, cb, ra);
+        L->top = cb + 3;  /* func. + 2 args (state and index) */
+        Protect(luaD_call(L, cb, GETARG_C(i), 1));
+        L->top = ci->top;
+        i = *(ci->u.l.savedpc++);  /* go to next instruction */
+        ra = RA(i);
+        lua_assert(GET_OPCODE(i) == OP_TFORLOOP);
+        goto l_tforloop;
+      )
+      vmcase(OP_TFORLOOP,
+        l_tforloop:
+        if (!ttisnil(ra + 1)) {  /* continue loop? */
+          setobjs2s(L, ra, ra + 1);  /* save control variable */
+           ci->u.l.savedpc += GETARG_sBx(i);  /* jump back */
+        }
+      )
+      vmcase(OP_SETLIST,
+        int n = GETARG_B(i);
+        int c = GETARG_C(i);
+        int last;
+        Table *h;
+        if (n == 0) n = cast_int(L->top - ra) - 1;
+        if (c == 0) {
+          lua_assert(GET_OPCODE(*ci->u.l.savedpc) == OP_EXTRAARG);
+          c = GETARG_Ax(*ci->u.l.savedpc++);
+        }
+        luai_runtimecheck(L, ttistable(ra));
+        h = hvalue(ra);
+        last = ((c-1)*LFIELDS_PER_FLUSH) + n;
+        if (last > h->sizearray)  /* needs more space? */
+          luaH_resizearray(L, h, last);  /* pre-allocate it at once */
+        for (; n > 0; n--) {
+          TValue *val = ra+n;
+          luaH_setint(L, h, last--, val);
+          luaC_barrierback(L, obj2gco(h), val);
+        }
+        L->top = ci->top;  /* correct top (in case of previous open call) */
+      )
+      vmcase(OP_CLOSURE,
+        Proto *p = cl->p->p[GETARG_Bx(i)];
+        Closure *ncl = getcached(p, cl->upvals, base);  /* cached closure */
+        if (ncl == NULL)  /* no match? */
+          pushclosure(L, p, cl->upvals, base, ra);  /* create a new one */
+        else
+          setclLvalue(L, ra, ncl);  /* push cached closure */
+        checkGC(L, ra + 1);
+      )
+      vmcase(OP_VARARG,
+        int b = GETARG_B(i) - 1;
+        int j;
+        int n = cast_int(base - ci->func) - cl->p->numparams - 1;
+        if (b < 0) {  /* B == 0? */
+          b = n;  /* get all var. arguments */
+          Protect(luaD_checkstack(L, n));
+          ra = RA(i);  /* previous call may change the stack */
+          L->top = ra + n;
+        }
+        for (j = 0; j < b; j++) {
+          if (j < n) {
+            setobjs2s(L, ra + j, base - n + j);
+          }
+          else {
+            setnilvalue(ra + j);
+          }
+        }
+      )
+      vmcase(OP_EXTRAARG,
+        lua_assert(0);
+      )
+    }
+  }
+}
+
diff --git a/ext/lua/src/lzio.c b/ext/lua/src/lzio.c
new file mode 100644
index 0000000..8b77054
--- /dev/null
+++ b/ext/lua/src/lzio.c
@@ -0,0 +1,76 @@
+/*
+** $Id: lzio.c,v 1.35 2012/05/14 13:34:18 roberto Exp $
+** Buffered streams
+** See Copyright Notice in lua.h
+*/
+
+
+#include <string.h>
+
+#define lzio_c
+#define LUA_CORE
+
+#include "lua.h"
+
+#include "llimits.h"
+#include "lmem.h"
+#include "lstate.h"
+#include "lzio.h"
+
+
+int luaZ_fill (ZIO *z) {
+  size_t size;
+  lua_State *L = z->L;
+  const char *buff;
+  lua_unlock(L);
+  buff = z->reader(L, z->data, &size);
+  lua_lock(L);
+  if (buff == NULL || size == 0)
+    return EOZ;
+  z->n = size - 1;  /* discount char being returned */
+  z->p = buff;
+  return cast_uchar(*(z->p++));
+}
+
+
+void luaZ_init (lua_State *L, ZIO *z, lua_Reader reader, void *data) {
+  z->L = L;
+  z->reader = reader;
+  z->data = data;
+  z->n = 0;
+  z->p = NULL;
+}
+
+
+/* --------------------------------------------------------------- read --- */
+size_t luaZ_read (ZIO *z, void *b, size_t n) {
+  while (n) {
+    size_t m;
+    if (z->n == 0) {  /* no bytes in buffer? */
+      if (luaZ_fill(z) == EOZ)  /* try to read more */
+        return n;  /* no more input; return number of missing bytes */
+      else {
+        z->n++;  /* luaZ_fill consumed first byte; put it back */
+        z->p--;
+      }
+    }
+    m = (n <= z->n) ? n : z->n;  /* min. between n and z->n */
+    memcpy(b, z->p, m);
+    z->n -= m;
+    z->p += m;
+    b = (char *)b + m;
+    n -= m;
+  }
+  return 0;
+}
+
+/* ------------------------------------------------------------------------ */
+char *luaZ_openspace (lua_State *L, Mbuffer *buff, size_t n) {
+  if (n > buff->buffsize) {
+    if (n < LUA_MINBUFFER) n = LUA_MINBUFFER;
+    luaZ_resizebuffer(L, buff, n);
+  }
+  return buff->buffer;
+}
+
+
diff --git a/filters/csv b/filters/csv
deleted file mode 100755
index 654f204..0000000
--- a/filters/csv
+++ /dev/null
@@ -1,114 +0,0 @@
-#!/usr/bin/perl -w
-
-use strict;
-use warnings;
-
-my $FILTERTYPE = 'csv';
-
-my $SEP = ',';
-my $NL = "\n";
-
-if ($#ARGV < 1) {
-    die "Filter failed! Please report bug.\n";
-}
-
-my $filename = $ARGV[0];
-my $fileType  = $ARGV[1];
-my $infile = $filename;
-
-open INFILE,"< $filename";
-$filename =~ s/\.tmp/\.$FILTERTYPE/;
-open OUTFILE,"> $filename";
-
-if ($fileType eq 'topology') {
-    my $region = 'topo';
-    print OUTFILE 'THREADS'.$NL;
-
-    while (<INFILE>) {
-
-        if (/Cache Topology/) {
-            $region = 'cache';
-            print OUTFILE 'CACHES'.$NL;
-        } elsif (/NUMA Topology/) {
-            $region = 'numa';
-            print OUTFILE 'NUMA'.$NL;
-        }
-
-        if ($region eq 'topo') {
-            if (/(CPU type):\t(.*)/) {
-                print OUTFILE $1.$SEP.$2.$NL;
-            }
-            elsif (/([A-Za-z ]*):\t([0-9]*)/) {
-                print OUTFILE $1.$SEP.$2.$NL;
-            } elsif (/(HWThread)\t(Thread)\t\t(Core)\t\t(Socket)/) {
-                print OUTFILE $1.$SEP.$2.$SEP.$3.$SEP.$4.$NL;
-            } elsif (/([0-9]*)\t\t([0-9]*)\t\t([0-9]*)\t\t([0-9]*)/) {
-                print OUTFILE $1.$SEP.$2.$SEP.$3.$SEP.$4.$NL;
-            }
-        } elsif ($region eq 'cache') {
-            if (/(Size):\t([0-9]*) ([kMB]*)/) {
-                my $size = $2;
-                if ($3 eq 'MB') {
-                    $size *= 1024;
-                }
-                print OUTFILE $1.'[kB]'.$SEP.$size.$NL;
-            } elsif (/(Cache groups):\t*(.*)/) {
-                my @groups = split('\) \(',$2);
-
-                my $grpId = 0;
-                foreach (@groups) {
-                    /([0-9 ]+)/;
-                    print OUTFILE 'Cache group '.$grpId.$SEP.$1.$NL;
-                    $grpId++;
-                }
-            } elsif (/(.*):\t*(.*)/) {
-                print OUTFILE $1.$SEP.$2.$NL;
-            }
-        } elsif ($region eq 'numa') {
-            if (/Domain ([0-9]*)/) {
-                print OUTFILE 'Domain ID'.$SEP.$1.$NL;
-            } elsif (/Memory: ([0-9.]+) MB free of total ([0-9.]+) MB/) {
-                print OUTFILE 'Free Memory [MB]'.$SEP.$1.$NL;
-                print OUTFILE 'Total Memory [MB]'.$SEP.$2.$NL;
-            } elsif (/(.*):\t*[ ]*(.*)/) {
-                print OUTFILE $1.$SEP.$2.$NL;
-            }
-        }
-    }
-} elsif ($fileType eq 'perfctr') {
-    my $header = 0;
-    while (<INFILE>) {
-        if (/Event[ ]*\|[ ]*(core.*)\|/) {
-            if (not $header) {
-                my @col = split('\|',$1);
-                my $numcol = $#col+1;
-                print OUTFILE 'NumColumns'.$SEP.$numcol.$NL;
-                print OUTFILE 'Event/Metric';
-                foreach (@col) {
-                    s/[ ]//g;
-                    print OUTFILE $SEP.$_;
-                }
-                print OUTFILE $NL;
-                $header = 1;
-            }
-        }elsif (/STAT/) {
-
-        }elsif (/\|[ ]+([A-Z0-9_]+)[ ]+\|[ ]*(.*)\|/) {
-            my @col = split('\|',$2);
-            print OUTFILE $1;
-            foreach (@col) {
-                s/[ ]//g;
-                print OUTFILE $SEP.$_;
-            }
-            print OUTFILE $NL;
-        } 
-    }
-} else {
-    die "Filter failed! Unknown application type $fileType!\n";
-}
-
-unlink($infile);
-close INFILE;
-close OUTFILE;
-
-
diff --git a/filters/xml b/filters/xml
index b72c430..fa24a9d 100755
--- a/filters/xml
+++ b/filters/xml
@@ -15,62 +15,91 @@ my $filename = $ARGV[0];
 my $fileType  = $ARGV[1];
 my $infile = $filename;
 
-open INFILE,"< $filename";
+if (! -e $filename)
+{
+    die "Input file does not exist!\n";
+}
+
+open INFILE,"<$filename";
 $filename =~ s/\.tmp/\.$FILTERTYPE/;
-open OUTFILE,"> $filename";
+open OUTFILE,">$filename";
 
 
 if ($fileType eq 'topology') {
     my $region = 'topo';
     my $indomain = 0;
     print OUTFILE '<node>'.$NL;
+    print OUTFILE '<info>'.$NL;
 
     while (<INFILE>) {
-
-        if (/Cache Topology/) {
+        if (/STRUCT,Cache Topology L1/) {
             $region = 'cache';
             print OUTFILE '<caches>'.$NL;
-        } elsif (/NUMA Topology/) {
+        } elsif (/STRUCT,NUMA Topology/) {
             print OUTFILE '</caches>'.$NL;
             print OUTFILE '<numa>'.$NL;
             $region = 'numa';
         }
 
         if ($region eq 'topo') {
-            if (/(CPU type):\t([\w ]*)/) {
+            if (/(CPU type):,([\w ]*),/) {
                 print OUTFILE '<cpu>'.$2.'</cpu>'.$NL;
-            } elsif (/CPU clock:\t([\d.]) GHz/) {
+            } elsif (/CPU name:,([^,]+),/) {
+                print OUTFILE '<name>'.$1.'</name>'.$NL;
+            } elsif (/CPU stepping:,(\d+),/) {
+                print OUTFILE '<stepping>'.$1.'</stepping>'.$NL;
+            } elsif (/CPU clock:,([\d.]+) GHz/) {
                 print OUTFILE '<clock>'.$1.'</clock>'.$NL;
-            } elsif (/(Sockets):\t(\d*)/) {
+            } elsif (/(Sockets):,(\d+),/) {
                 print OUTFILE '<socketsPerNode>'.$2.'</socketsPerNode>'.$NL;
-            } elsif (/(Cores per socket):\t(\d*)/) {
+            } elsif (/(Cores per socket):,(\d+),/) {
                 print OUTFILE '<coresPerSocket>'.$2.'</coresPerSocket>'.$NL;
-            } elsif (/(Threads per core):\t(\d*)/) {
+            } elsif (/(Threads per core):,(\d+),/) {
                 print OUTFILE '<threadsPerCore>'.$2.'</threadsPerCore>'.$NL;
-            } elsif (/([0-9]*)\t\t([0-9]*)\t\t([0-9]*)\t\t([0-9]*)/) {
+            } elsif (/HWThread,Thread,Core,Socket,Available/) {
+                print OUTFILE '</info>'.$NL;
+                print OUTFILE '<threads>'.$NL;
+            } elsif (/(\d+),(\d+),(\d+),(\d+),/) {
                 #TODO Build tree for XML output from table!
+                print OUTFILE '<thread>'.$NL;
+                print OUTFILE '<id>'.$1.'</id>'.$NL;
+                print OUTFILE '<threadid>'.$2.'</threadid>'.$NL;
+                print OUTFILE '<coreid>'.$3.'</coreid>'.$NL;
+                print OUTFILE '<socketid>'.$4.'</socketid>'.$NL;
+                print OUTFILE '</thread>'.$NL;
+            } elsif (/STRUCT,Sockets,/) {
+                print OUTFILE '</threads>'.$NL;
+                $region = 'cache';
             }
         } elsif ($region eq 'cache') {
-            if (/(Size):\t([0-9]*) ([kMB]*)/) {
+            if (/(Size):,(\d+) ([kMB]*)/) {
                 my $size = $2;
                 if ($3 eq 'MB') {
                     $size *= 1024;
                 }
                 print OUTFILE '<size>'.$size.'</size>'.$NL;
-            } elsif (/(Cache groups):\t*(.*)/) {
+            } elsif (/(Cache groups):,([\d ]+),/) {
                 print OUTFILE '</cache>'.$NL;
-            } elsif (/(Associativity):\t*(.*)/) {
+            } elsif (/Type:,(\w+) cache,/) {
+                print OUTFILE '<type>'.lc $1.'</type>'.$NL;
+            } elsif (/(Associativity):,(\d+)/) {
                 print OUTFILE '<associativity>'.$2.'</associativity>'.$NL;
-            } elsif (/(Number of sets):\t*(.*)/) {
+            } elsif (/(Number of sets):,(\d+)/) {
                 print OUTFILE '<sets>'.$2.'</sets>'.$NL;
-            } elsif (/(Cache line size):\t*(.*)/) {
+            } elsif (/(Cache line size):,(\d+)/) {
                 print OUTFILE '<linesize>'.$2.'</linesize>'.$NL;
-            } elsif (/(Level):\t*(.*)/) {
+            } elsif (/Shared by threads:,(\d+),/) {
+                print OUTFILE '<sharedby>'.$1.'</sharedby>'.$NL;
+            } elsif (/Cache type:,Inclusive/) {
+                print OUTFILE '<inclusive>true</inclusive>'.$NL;
+            } elsif (/Cache type:,Non Inclusive/) {
+                print OUTFILE '<inclusive>false</inclusive>'.$NL;
+            } elsif (/(Level):,(\d+)/) {
                 print OUTFILE '<cache>'.$NL;
                 print OUTFILE '<level>'.$2.'</level>'.$NL;
             }
         } elsif ($region eq 'numa') {
-            if (/Domain ([0-9]*)/) {
+            if (/Domain:,(\d+),/) {
                 if ($indomain )
                 {
                     print OUTFILE '</domain>'.$NL;
@@ -78,10 +107,11 @@ if ($fileType eq 'topology') {
                 print OUTFILE '<domain>'.$NL;
                 print OUTFILE '<id>'.$1.'</id>'.$NL;
                 $indomain = 1
-            } elsif (/Memory: ([0-9.]+) MB free of total ([0-9.]+) MB/) {
+            } elsif (/Free memory:,([\d.]+) MB,/) {
                 print OUTFILE '<freememory>'.$1.'</freememory>'.$NL;
-                print OUTFILE '<totalmemory>'.$2.'</totalmemory>'.$NL;
-            } elsif (/Processors:[ ]+([0-9. ]+)/) {
+            } elsif (/Total memory:,([\d.]+) MB,/) {
+                print OUTFILE '<totalmemory>'.$1.'</totalmemory>'.$NL;
+            } elsif (/Processors:,([\d, ]+)/) {
                 print OUTFILE '<processors>'.$1.'</processors>'.$NL;
             }
         }
@@ -96,41 +126,105 @@ if ($fileType eq 'topology') {
 } elsif ($fileType eq 'perfctr') {
     my $header = 0;
     my @col;
+    my @cpus;
+    my $region = 'info';
+    my $group = "1";
     print OUTFILE '<perfctr>'.$NL;
     while (<INFILE>) {
-        if (/Event[ ]*\|[ ]*(core.*)\|/) {
-            if (not $header) {
-                @col = split('\|',$1);
-                foreach (@col) {
-                    s/core //g;
-                    s/[ ]//g;
+        if (/TABLE,Info/) {
+            $region = 'info';
+            print OUTFILE '<info>'.$NL;
+        } elsif (/TABLE,Group (\d+) Raw/) {
+            $group = $1;
+            if (/Stat/) {
+                $region = '';
+            } else {
+                $region = 'raw';
+                if ($region eq 'info') {
+                    print OUTFILE '</info>'.$NL;
                 }
-                $header = 1;
+                print OUTFILE '<group'.$group.'>'.$NL;
+                print OUTFILE '<rawvalues>'.$NL;
             }
-        }elsif (/STAT/) {
-
-        }elsif (/\|[ ]+([A-Z0-9_]+)[ ]+\|[ ]*(.*)\|/) {
-            my @rescol = split('\|',$2);
-            my $id = 0;
-            print OUTFILE '<result>'.$NL;
-            print OUTFILE '<event>'.$1.'</event>'.$NL;
-            foreach (@rescol) {
-                s/[ ]//g;
-                print OUTFILE '<core>'.$NL;
-                print OUTFILE '<id>'.$col[$id].'</id>'.$NL;
-                print OUTFILE '<value>'.$_.'</value>'.$NL;
-                print OUTFILE '</core>'.$NL;
-                $id++;
+        } elsif (/TABLE,Group (\d+) Metric/) {
+            $group = $1;
+            if (/Stat/) {
+                if ($region eq 'metric')
+                {
+                    print OUTFILE '</metrics>'.$NL;
+                    print OUTFILE '</group'.$group.'>'.$NL;
+                }
+                $region = '';
+            } else {
+                $region = 'metric';
+                print OUTFILE '</rawvalues>'.$NL;
+                print OUTFILE '<metrics>'.$NL;
             }
-            print OUTFILE '</result>'.$NL;
-        } 
+        }
+        if ($region eq 'info') {
+            if (/(CPU type):,([\w ]*),/) {
+                print OUTFILE '<cpu>'.$2.'</cpu>'.$NL;
+            } elsif (/CPU name:,([^,]+),/) {
+                print OUTFILE '<name>'.$1.'</name>'.$NL;
+            } elsif (/CPU clock:,([\d.]+) GHz/) {
+                print OUTFILE '<clock>'.$1.'</clock>'.$NL;
+            }
+        } elsif ($region eq 'raw') {
+            if (/Event,Counter,(.*)/) {
+                if (not $header) {
+                    @cpus = split(',',$1);
+                    foreach (@cpus) {
+                        s/Core //g;
+                        s/[ ]//g;
+                    }
+                    $header = 1;
+                }
+            } elsif (!/TABLE/) {
+                @col = split(',',$_);
+                print OUTFILE '<event>'.$NL;
+                print OUTFILE '<name>'.$col[0].'</name>'.$NL;
+                print OUTFILE '<counter>'.$col[1].'</counter>'.$NL;
+
+                
+                for (my $i=0; $i<@cpus; $i++) {
+                    
+                    print OUTFILE '<cpu'.$cpus[$i].'>'.$col[2+$i].'</cpu'.$cpus[$i].'>'.$NL;
+                }
+                print OUTFILE '</event>'.$NL;
+            }
+        } elsif ($region eq 'metric') {
+            if ((!/Metric,Core/) and (!/TABLE/)) {
+                @col = split(',',$_);
+                print OUTFILE '<metric>'.$NL;
+                my $name = "";
+                my $unit = "";
+                if ($col[0] =~ /\[.*\]/) {
+                    $col[0] =~ m/(.*)\s\[(.*)\]/;
+                    $name = $1;
+                    $unit = $2
+                } else {
+                    $name = $col[0]
+                }
+                print OUTFILE '<name>'.$name.'</name>'.$NL;
+                if ($unit ne "")
+                {
+                    print OUTFILE '<unit>'.$unit.'</unit>'.$NL;
+                }
+                for (my $i=0; $i<@cpus; $i++) {
+                    print OUTFILE '<cpu'.$cpus[$i].'>'.$col[1+$i].'</cpu'.$cpus[$i].'>'.$NL;
+                }
+                print OUTFILE '</metric>'.$NL;
+            }
+        } elsif (/STAT/) {
+
+        }
     }
     print OUTFILE '</perfctr>'.$NL;
 } else {
     die "Filter failed! Unknown application type $fileType!\n";
 }
 
-#unlink($infile);
+unlink($infile);
 close INFILE;
 close OUTFILE;
 
diff --git a/groups/atom/BRANCH.txt b/groups/atom/BRANCH.txt
index 51d2ddd..ad43a3f 100644
--- a/groups/atom/BRANCH.txt
+++ b/groups/atom/BRANCH.txt
@@ -3,11 +3,14 @@ SHORT Branch prediction miss rate/ratio
 EVENTSET
 FIXC0 INSTR_RETIRED_ANY
 FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
 PMC0  BR_INST_RETIRED_ANY
 PMC1  BR_INST_RETIRED_MISPRED
 
 METRICS
-Runtime [s] FIXC1*inverseClock
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
 Branch rate   PMC0/FIXC0
 Branch misprediction rate  PMC1/FIXC0
@@ -15,5 +18,14 @@ Branch misprediction ratio  PMC1/PMC0
 Instructions per branch  FIXC0/PMC0
 
 LONG
-Bla Bla
+Formulas:
+Branch rate = BR_INST_RETIRED_ANY/INSTR_RETIRED_ANY
+Branch misprediction rate = BR_INST_RETIRED_MISPRED/INSTR_RETIRED_ANY
+Branch misprediction ratio = BR_INST_RETIRED_MISPRED/BR_INST_RETIRED_ANY
+Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ANY
+-
+The rates state how often on average a branch or a mispredicted branch occurred
+per instruction retired in total. The Branch misprediction ratio directly
+states what fraction of all branch instructions were mispredicted.
+Instructions per branch is 1/Branch rate.
 
diff --git a/groups/atom/DATA.txt b/groups/atom/DATA.txt
index 1c0f4ae..9349354 100644
--- a/groups/atom/DATA.txt
+++ b/groups/atom/DATA.txt
@@ -3,14 +3,20 @@ SHORT Load to store ratio
 EVENTSET
 FIXC0 INSTR_RETIRED_ANY
 FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
 PMC0  L1D_CACHE_LD
 PMC1  L1D_CACHE_ST
 
 METRICS
-Runtime [s] FIXC1*inverseClock
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-Load to Store ratio PMC0/PMC1
+Load to store ratio PMC0/PMC1
 
 LONG
-Bla Bla
+Formulas:
+Load to store ratio = L1D_CACHE_LD/L1D_CACHE_ST
+-
+This is a simple metric to determine your load to store ratio.
 
diff --git a/groups/atom/MEM.txt b/groups/atom/MEM.txt
index faf9a0a..db580e5 100644
--- a/groups/atom/MEM.txt
+++ b/groups/atom/MEM.txt
@@ -3,13 +3,21 @@ SHORT Main memory bandwidth in MBytes/s
 EVENTSET
 FIXC0 INSTR_RETIRED_ANY
 FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
 PMC0  BUS_TRANS_MEM_THIS_CORE_THIS_A
 
 METRICS
-Runtime [s] FIXC1*inverseClock
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
 Memory bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+Memory data volume [GBytes] 1.0E-09*PMC0*64.0
 
 LONG
-Bla Bla
+Formulas:
+Memory bandwidth [MBytes/s] = 1.0E-06*BUS_TRANS_MEM_THIS_CORE_THIS_A*64/time
+Memory data volume [GBytes] = 1.0E-09*BUS_TRANS_MEM_THIS_CORE_THIS_A*64.0
+-
+Profiling group to measure memory bandwidth drawn by this core.
 
diff --git a/groups/broadwell/BRANCH.txt b/groups/broadwell/BRANCH.txt
new file mode 100644
index 0000000..eb33846
--- /dev/null
+++ b/groups/broadwell/BRANCH.txt
@@ -0,0 +1,31 @@
+SHORT Branch prediction miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  BR_INST_RETIRED_ALL_BRANCHES
+PMC1  BR_MISP_RETIRED_ALL_BRANCHES_1
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Branch rate   PMC0/FIXC0
+Branch misprediction rate  PMC1/FIXC0
+Branch misprediction ratio  PMC1/PMC0
+Instructions per branch  FIXC0/PMC0
+
+LONG
+Formulas:
+Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction rate =  BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES
+Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES
+-
+The rates state how often on average a branch or a mispredicted branch occurred
+per instruction retired in total. The Branch misprediction ratio directly
+states what fraction of all branch instructions were mispredicted.
+Instructions per branch is 1/Branch rate.
+
diff --git a/groups/broadwell/CLOCK.txt b/groups/broadwell/CLOCK.txt
new file mode 100644
index 0000000..0b934e9
--- /dev/null
+++ b/groups/broadwell/CLOCK.txt
@@ -0,0 +1,23 @@
+SHORT Power and Energy consumption
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0  PWR_PKG_ENERGY
+
+METRICS
+Runtime (RDTSC) [s] time 
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Energy [J]  PWR0
+Power [W] PWR0/time
+
+LONG
+Formula:
+Power =  PWR_PKG_ENERGY / time
+-
+Broadwell implements the new RAPL interface. This interface makes it possible to
+monitor the consumed energy at the package (socket) level.
+
diff --git a/groups/broadwell/DATA.txt b/groups/broadwell/DATA.txt
new file mode 100644
index 0000000..1220980
--- /dev/null
+++ b/groups/broadwell/DATA.txt
@@ -0,0 +1,22 @@
+SHORT Load to store ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  MEM_UOP_RETIRED_LOADS
+PMC1  MEM_UOP_RETIRED_STORES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Load to store ratio PMC0/PMC1
+
+LONG
+Formulas:
+Load to store ratio = MEM_UOP_RETIRED_LOADS/MEM_UOP_RETIRED_STORES
+-
+This is a metric to determine your load to store ratio.
+
diff --git a/groups/broadwell/ENERGY.txt b/groups/broadwell/ENERGY.txt
new file mode 100644
index 0000000..ae1756f
--- /dev/null
+++ b/groups/broadwell/ENERGY.txt
@@ -0,0 +1,39 @@
+SHORT Power and Energy consumption
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+TMP0  TEMP_CORE
+PWR0  PWR_PKG_ENERGY
+PWR1  PWR_PP0_ENERGY
+PWR2  PWR_PP1_ENERGY
+PWR3  PWR_DRAM_ENERGY
+
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Temperature [C]  TMP0
+Energy [J]  PWR0
+Power [W] PWR0/time
+Energy PP0 [J]  PWR1
+Power PP0 [W] PWR1/time
+Energy PP1 [J]  PWR2
+Power PP1 [W] PWR2/time
+Energy DRAM [J]  PWR3
+Power DRAM [W] PWR3/time
+
+LONG
+Formula:
+Power = PWR_PKG_ENERGY / time
+Power PP0 = PWR_PP0_ENERGY / time
+Power PP1 = PWR_PP1_ENERGY / time
+Power DRAM = PWR_DRAM_ENERGY / time
+-
+Broadwell implements the new RAPL interface. This interface makes it possible to
+monitor the consumed energy at the package (socket) and DRAM level.
+
diff --git a/groups/broadwell/FLOPS_AVX.txt b/groups/broadwell/FLOPS_AVX.txt
new file mode 100644
index 0000000..769b8a0
--- /dev/null
+++ b/groups/broadwell/FLOPS_AVX.txt
@@ -0,0 +1,24 @@
+SHORT Packed AVX MFlops/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE
+PMC1  FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Packed SP MFlops/s  1.0E-06*(PMC0*8.0)/time
+Packed DP MFlops/s  1.0E-06*(PMC1*4.0)/time
+
+LONG
+Formula:
+Packed SP MFlops/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
+Packed DP MFlops/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
+-
+Flop rates of 256 bit packed floating-point instructions
+
diff --git a/groups/broadwell/FLOPS_DP.txt b/groups/broadwell/FLOPS_DP.txt
new file mode 100644
index 0000000..45fca0c
--- /dev/null
+++ b/groups/broadwell/FLOPS_DP.txt
@@ -0,0 +1,29 @@
+SHORT Double Precision MFlops/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE
+PMC1  FP_ARITH_INST_RETIRED_SCALAR_DOUBLE
+PMC2  FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+MFlops/s  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
+AVX MFlops/s  1.0E-06*(PMC2*4.0)/time
+Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+
+LONG
+Formula:
+MFlops/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
+AVX MFlops/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime
+-
+AVX/SSE scalar and packed double precision flop rates.
+
diff --git a/groups/broadwell/FLOPS_SP.txt b/groups/broadwell/FLOPS_SP.txt
new file mode 100644
index 0000000..4bc10e5
--- /dev/null
+++ b/groups/broadwell/FLOPS_SP.txt
@@ -0,0 +1,29 @@
+SHORT Single Precision MFlops/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE
+PMC1  FP_ARITH_INST_RETIRED_SCALAR_SINGLE
+PMC2  FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+MFlops/s  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
+AVX MFlops/s  1.0E-06*(PMC2*8.0)/time
+Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+
+LONG
+Formula:
+MFlops/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
+AVX MFlops/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime
+-
+AVX/SSE scalar and packed single precision flop rates.
+
diff --git a/groups/broadwell/ICACHE.txt b/groups/broadwell/ICACHE.txt
new file mode 100644
index 0000000..5f11ad6
--- /dev/null
+++ b/groups/broadwell/ICACHE.txt
@@ -0,0 +1,25 @@
+SHORT  Instruction cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  ICACHE_ACCESSES
+PMC1  ICACHE_MISSES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1I request rate PMC0/FIXC0
+L1I miss rate PMC1/FIXC0
+L1I miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L1I request rate = ICACHE_ACCESSES / INSTR_RETIRED_ANY
+L1I miss rate = ICACHE_MISSES / INSTR_RETIRED_ANY
+L1I miss ratio = ICACHE_MISSES / ICACHE_ACCESSES
+-
+This group measures some L1 instruction cache metrics.
diff --git a/groups/broadwell/L2.txt b/groups/broadwell/L2.txt
new file mode 100644
index 0000000..cd7474d
--- /dev/null
+++ b/groups/broadwell/L2.txt
@@ -0,0 +1,37 @@
+SHORT  L2 cache bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  L1D_REPLACEMENT
+PMC1  L2_TRANS_L1D_WB
+PMC2  ICACHE_MISSES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L2D load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time
+L2D load data volume [GBytes]  1.0E-09*PMC0*64.0
+L2D evict bandwidth [MBytes/s]  1.0E-06*PMC1*64.0/time
+L2D evict data volume [GBytes]  1.0E-09*PMC1*64.0
+L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time
+L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0
+
+LONG
+Formulas:
+L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64.0/time
+L2D load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64.0
+L2D evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L1D_WB*64.0/time
+L2D evict data volume [GBytes] = 1.0E-09*L2_TRANS_L1D_WB*64.0
+L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L2_TRANS_L1D_WB+ICACHE_MISSES)*64.0/time
+L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L2_TRANS_L1D_WB+ICACHE_MISSES)*64.0
+-
+Profiling group to measure L2 cache bandwidth. The bandwidth is computed by the
+number of cachelines loaded from the L2 to the L1 data cache and the writebacks from
+the L1 data cache to the L2 cache. The group also outputs total data volume transferred between
+L2 and L1. Note that this bandwidth also includes data transfers due to a write
+allocate load on a store miss in L1 and cachelines transferred to the instruction
+cache.
diff --git a/groups/broadwell/L2CACHE.txt b/groups/broadwell/L2CACHE.txt
new file mode 100644
index 0000000..8aa6522
--- /dev/null
+++ b/groups/broadwell/L2CACHE.txt
@@ -0,0 +1,34 @@
+SHORT L2 cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  L2_TRANS_ALL_REQUESTS
+PMC1  L2_RQSTS_MISS
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L2 request rate PMC0/FIXC0
+L2 miss rate PMC1/FIXC0
+L2 miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L2 request rate = L2_TRANS_ALL_REQUESTS/INSTR_RETIRED_ANY
+L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY
+L2 miss ratio = L2_RQSTS_MISS/L2_TRANS_ALL_REQUESTS
+-
+This group measures the locality of your data accesses with regard to the
+L2 Cache. L2 request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
+The L2 miss rate gives a measure of how often it was necessary to get
+cachelines from memory. And finally L2 miss ratio tells you how many of your
+memory references required a cacheline to be loaded from a higher level.
+While the Data cache miss rate might be given by your algorithm you should
+try to get Data cache miss ratio as low as possible by increasing your cache reuse.
+
+
diff --git a/groups/broadwell/L3.txt b/groups/broadwell/L3.txt
new file mode 100644
index 0000000..622fa25
--- /dev/null
+++ b/groups/broadwell/L3.txt
@@ -0,0 +1,36 @@
+SHORT  L3 cache bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  L2_LINES_IN_ALL
+PMC1  L2_LINES_OUT_DEMAND_DIRTY
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L3 load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time
+L3 load data volume [GBytes]  1.0E-09*PMC0*64.0
+L3 evict bandwidth [MBytes/s]  1.0E-06*PMC1*64.0/time
+L3 evict data volume [GBytes]  1.0E-09*PMC1*64.0
+L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+
+LONG
+Formulas:
+L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64.0/time
+L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64.0
+L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_LINES_OUT_DEMAND_DIRTY*64.0/time
+L3 evict data volume [GBytes] = 1.0E-09*L2_LINES_OUT_DEMAND_DIRTY*64.0
+L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_LINES_OUT_DEMAND_DIRTY)*64/time
+L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_LINES_OUT_DEMAND_DIRTY)*64
+-
+Profiling group to measure L3 cache bandwidth. The bandwidth is computed by the
+number of cachelines allocated in the L2 and the number of modified cachelines
+evicted from the L2. This group also outputs the data volume transferred between the
+L3 and the measured cores' L2 caches. Note that this bandwidth also includes data
+transfers due to a write allocate load on a store miss in L2.
+
diff --git a/groups/broadwell/L3CACHE.txt b/groups/broadwell/L3CACHE.txt
new file mode 100644
index 0000000..30e71ee
--- /dev/null
+++ b/groups/broadwell/L3CACHE.txt
@@ -0,0 +1,35 @@
+SHORT L3 cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  MEM_LOAD_UOPS_RETIRED_L3_ALL
+PMC1  MEM_LOAD_UOPS_RETIRED_L3_MISS
+PMC2  UOPS_RETIRED_ALL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L3 request rate PMC0/PMC2
+L3 miss rate PMC1/PMC2
+L3 miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L3 request rate = MEM_LOAD_UOPS_RETIRED_L3_ALL/UOPS_RETIRED_ALL
+L3 miss rate = MEM_LOAD_UOPS_RETIRED_L3_MISS/UOPS_RETIRED_ALL
+L3 miss ratio = MEM_LOAD_UOPS_RETIRED_L3_MISS/MEM_LOAD_UOPS_RETIRED_L3_ALL
+-
+This group measures the locality of your data accesses with regard to the
+L3 Cache. L3 request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
+The L3 miss rate gives a measure of how often it was necessary to get
+cachelines from memory. And finally L3 miss ratio tells you how many of your
+memory references required a cacheline to be loaded from a higher level.
+While the Data cache miss rate might be given by your algorithm you should
+try to get Data cache miss ratio as low as possible by increasing your cache reuse.
+
+
diff --git a/groups/broadwell/TLB_DATA.txt b/groups/broadwell/TLB_DATA.txt
new file mode 100644
index 0000000..5e54147
--- /dev/null
+++ b/groups/broadwell/TLB_DATA.txt
@@ -0,0 +1,35 @@
+SHORT  L1 Data TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  DTLB_LOAD_MISSES_CAUSES_A_WALK
+PMC1  DTLB_STORE_MISSES_CAUSES_A_WALK
+PMC2  DTLB_LOAD_MISSES_WALK_DURATION
+PMC3  DTLB_STORE_MISSES_WALK_DURATION
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1 DTLB load misses     PMC0
+L1 DTLB load miss rate  PMC0/FIXC0
+L1 DTLB load miss duration [Cyc] PMC2/PMC0
+L1 DTLB store misses     PMC1
+L1 DTLB store miss rate  PMC1/FIXC0
+L1 DTLB store miss duration [Cyc] PMC3/PMC1
+
+LONG
+Formulas:
+L1 DTLB load misses = DTLB_LOAD_MISSES_CAUSES_A_WALK
+L1 DTLB load miss rate = DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB load miss duration [Cyc] = DTLB_LOAD_MISSES_WALK_DURATION / DTLB_LOAD_MISSES_CAUSES_A_WALK
+L1 DTLB store misses = DTLB_STORE_MISSES_CAUSES_A_WALK
+L1 DTLB store miss rate = DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB store miss duration [Cyc] = DTLB_STORE_MISSES_WALK_DURATION / DTLB_STORE_MISSES_CAUSES_A_WALK
+-
+The DTLB load and store miss rates give a measure of how often a TLB miss occurred
+per instruction. The duration measures how long a walk took, in cycles.
+
diff --git a/groups/broadwell/TLB_INSTR.txt b/groups/broadwell/TLB_INSTR.txt
new file mode 100644
index 0000000..8faaebe
--- /dev/null
+++ b/groups/broadwell/TLB_INSTR.txt
@@ -0,0 +1,28 @@
+SHORT  L1 Instruction TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  ITLB_MISSES_CAUSES_A_WALK
+PMC1  ITLB_MISSES_WALK_DURATION
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1 ITLB misses     PMC0
+L1 ITLB miss rate  PMC0/FIXC0
+L1 ITLB miss duration [Cyc] PMC1/PMC0
+
+
+LONG
+Formulas:
+L1 ITLB misses = ITLB_MISSES_CAUSES_A_WALK
+L1 ITLB miss rate = ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 ITLB miss duration [Cyc] = ITLB_MISSES_WALK_DURATION / ITLB_MISSES_CAUSES_A_WALK
+-
+The ITLB miss rate gives a measure of how often a TLB miss occurred
+per instruction. The duration measures how long a walk took, in cycles.
+
diff --git a/groups/broadwellEP/BRANCH.txt b/groups/broadwellEP/BRANCH.txt
new file mode 100644
index 0000000..eb33846
--- /dev/null
+++ b/groups/broadwellEP/BRANCH.txt
@@ -0,0 +1,31 @@
+SHORT Branch prediction miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  BR_INST_RETIRED_ALL_BRANCHES
+PMC1  BR_MISP_RETIRED_ALL_BRANCHES_1
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Branch rate   PMC0/FIXC0
+Branch misprediction rate  PMC1/FIXC0
+Branch misprediction ratio  PMC1/PMC0
+Instructions per branch  FIXC0/PMC0
+
+LONG
+Formulas:
+Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction rate =  BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES
+Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES
+-
+The rates state how often on average a branch or a mispredicted branch occurred
+per instruction retired in total. The Branch misprediction ratio directly
+states what fraction of all branch instructions were mispredicted.
+Instructions per branch is 1/Branch rate.
+
diff --git a/groups/broadwellEP/CLOCK.txt b/groups/broadwellEP/CLOCK.txt
new file mode 100644
index 0000000..0b934e9
--- /dev/null
+++ b/groups/broadwellEP/CLOCK.txt
@@ -0,0 +1,23 @@
+SHORT Power and Energy consumption
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0  PWR_PKG_ENERGY
+
+METRICS
+Runtime (RDTSC) [s] time 
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Energy [J]  PWR0
+Power [W] PWR0/time
+
+LONG
+Formula:
+Power =  PWR_PKG_ENERGY / time
+-
+Broadwell implements the new RAPL interface. This interface makes it possible to
+monitor the consumed energy at the package (socket) level.
+
diff --git a/groups/broadwellEP/DATA.txt b/groups/broadwellEP/DATA.txt
new file mode 100644
index 0000000..1220980
--- /dev/null
+++ b/groups/broadwellEP/DATA.txt
@@ -0,0 +1,22 @@
+SHORT Load to store ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  MEM_UOP_RETIRED_LOADS
+PMC1  MEM_UOP_RETIRED_STORES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Load to store ratio PMC0/PMC1
+
+LONG
+Formulas:
+Load to store ratio = MEM_UOP_RETIRED_LOADS/MEM_UOP_RETIRED_STORES
+-
+This is a metric to determine your load to store ratio.
+
diff --git a/groups/broadwellEP/ENERGY.txt b/groups/broadwellEP/ENERGY.txt
new file mode 100644
index 0000000..28f0256
--- /dev/null
+++ b/groups/broadwellEP/ENERGY.txt
@@ -0,0 +1,35 @@
+SHORT Power and Energy consumption
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+TMP0  TEMP_CORE
+PWR0  PWR_PKG_ENERGY
+PWR1  PWR_PP0_ENERGY
+PWR3  PWR_DRAM_ENERGY
+
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Temperature [C]  TMP0
+Energy [J]  PWR0
+Power [W] PWR0/time
+Energy PP0 [J]  PWR1
+Power PP0 [W] PWR1/time
+Energy DRAM [J]  PWR3
+Power DRAM [W] PWR3/time
+
+LONG
+Formula:
+Power = PWR_PKG_ENERGY / time
+Power PP0 = PWR_PP0_ENERGY / time
+Power DRAM = PWR_DRAM_ENERGY / time
+-
+Broadwell implements the new RAPL interface. This interface makes it possible to
+monitor the consumed energy at the package (socket) and DRAM level.
+
diff --git a/groups/broadwellEP/FLOPS_AVX.txt b/groups/broadwellEP/FLOPS_AVX.txt
new file mode 100644
index 0000000..769b8a0
--- /dev/null
+++ b/groups/broadwellEP/FLOPS_AVX.txt
@@ -0,0 +1,24 @@
+SHORT Packed AVX MFlops/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE
+PMC1  FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Packed SP MFlops/s  1.0E-06*(PMC0*8.0)/time
+Packed DP MFlops/s  1.0E-06*(PMC1*4.0)/time
+
+LONG
+Formula:
+Packed SP MFlops/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
+Packed DP MFlops/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
+-
+Flop rates of 256 bit packed floating-point instructions
+
diff --git a/groups/broadwellEP/FLOPS_DP.txt b/groups/broadwellEP/FLOPS_DP.txt
new file mode 100644
index 0000000..45fca0c
--- /dev/null
+++ b/groups/broadwellEP/FLOPS_DP.txt
@@ -0,0 +1,29 @@
+SHORT Double Precision MFlops/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE
+PMC1  FP_ARITH_INST_RETIRED_SCALAR_DOUBLE
+PMC2  FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+MFlops/s  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
+AVX MFlops/s  1.0E-06*(PMC2*4.0)/time
+Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+
+LONG
+Formula:
+MFlops/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
+AVX MFlops/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime
+-
+AVX/SSE scalar and packed double precision flop rates.
+
diff --git a/groups/broadwellEP/FLOPS_SP.txt b/groups/broadwellEP/FLOPS_SP.txt
new file mode 100644
index 0000000..4bc10e5
--- /dev/null
+++ b/groups/broadwellEP/FLOPS_SP.txt
@@ -0,0 +1,29 @@
+SHORT Single Precision MFlops/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE
+PMC1  FP_ARITH_INST_RETIRED_SCALAR_SINGLE
+PMC2  FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+MFlops/s  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
+AVX MFlops/s  1.0E-06*(PMC2*8.0)/time
+Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+
+LONG
+Formula:
+MFlops/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
+AVX MFlops/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime
+-
+AVX/SSE scalar and packed single precision flop rates.
+
diff --git a/groups/broadwellEP/ICACHE.txt b/groups/broadwellEP/ICACHE.txt
new file mode 100644
index 0000000..5f11ad6
--- /dev/null
+++ b/groups/broadwellEP/ICACHE.txt
@@ -0,0 +1,25 @@
+SHORT  Instruction cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  ICACHE_ACCESSES
+PMC1  ICACHE_MISSES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1I request rate PMC0/FIXC0
+L1I miss rate PMC1/FIXC0
+L1I miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L1I request rate = ICACHE_ACCESSES / INSTR_RETIRED_ANY
+L1I miss rate = ICACHE_MISSES / INSTR_RETIRED_ANY
+L1I miss ratio = ICACHE_MISSES / ICACHE_ACCESSES
+-
+This group measures some L1 instruction cache metrics.
diff --git a/groups/broadwellEP/L2.txt b/groups/broadwellEP/L2.txt
new file mode 100644
index 0000000..eb150d9
--- /dev/null
+++ b/groups/broadwellEP/L2.txt
@@ -0,0 +1,37 @@
+SHORT  L2 cache bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  L1D_REPLACEMENT
+PMC1  L2_TRANS_L1D_WB
+PMC2  ICACHE_MISSES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L2D load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time
+L2D load data volume [GBytes]  1.0E-09*PMC0*64.0
+L2D evict bandwidth [MBytes/s]  1.0E-06*PMC1*64.0/time
+L2D evict data volume [GBytes]  1.0E-09*PMC1*64.0
+L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time
+L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0
+
+LONG
+Formulas:
+L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64.0/time
+L2D load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64.0
+L2D evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L1D_WB*64.0/time
+L2D evict data volume [GBytes] = 1.0E-09*L2_TRANS_L1D_WB*64.0
+L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L2_TRANS_L1D_WB+ICACHE_MISSES)*64/time
+L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L2_TRANS_L1D_WB+ICACHE_MISSES)*64
+-
+Profiling group to measure L2 cache bandwidth. The bandwidth is computed by the
+number of cachelines allocated in the L1 and the number of modified cachelines
+evicted from the L1. The group also outputs the total data volume transferred between
+L2 and L1. Note that this bandwidth also includes data transfers due to a write
+allocate load on a store miss in L1 and traffic caused by misses in the
+L1 instruction cache.
diff --git a/groups/broadwellEP/L2CACHE.txt b/groups/broadwellEP/L2CACHE.txt
new file mode 100644
index 0000000..8aa6522
--- /dev/null
+++ b/groups/broadwellEP/L2CACHE.txt
@@ -0,0 +1,34 @@
+SHORT L2 cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  L2_TRANS_ALL_REQUESTS
+PMC1  L2_RQSTS_MISS
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L2 request rate PMC0/FIXC0
+L2 miss rate PMC1/FIXC0
+L2 miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L2 request rate = L2_TRANS_ALL_REQUESTS/INSTR_RETIRED_ANY
+L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY
+L2 miss ratio = L2_RQSTS_MISS/L2_TRANS_ALL_REQUESTS
+-
+This group measures the locality of your data accesses with regard to the
+L2 Cache. L2 request rate tells you how data intensive your code is
+or how many data accesses you have on average per instruction.
+The L2 miss rate gives a measure of how often it was necessary to get
+cachelines from memory. And finally L2 miss ratio tells you how many of your
+memory references required a cacheline to be loaded from a higher level.
+While the Data cache miss rate might be given by your algorithm you should
+try to get Data cache miss ratio as low as possible by increasing your cache reuse.
+
+
diff --git a/groups/broadwellEP/L3.txt b/groups/broadwellEP/L3.txt
new file mode 100644
index 0000000..622fa25
--- /dev/null
+++ b/groups/broadwellEP/L3.txt
@@ -0,0 +1,36 @@
+SHORT  L3 cache bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  L2_LINES_IN_ALL
+PMC1  L2_LINES_OUT_DEMAND_DIRTY
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L3 load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time
+L3 load data volume [GBytes]  1.0E-09*PMC0*64.0
+L3 evict bandwidth [MBytes/s]  1.0E-06*PMC1*64.0/time
+L3 evict data volume [GBytes]  1.0E-09*PMC1*64.0
+L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+
+LONG
+Formulas:
+L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64.0/time
+L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64.0
+L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_LINES_OUT_DEMAND_DIRTY*64.0/time
+L3 evict data volume [GBytes] = 1.0E-09*L2_LINES_OUT_DEMAND_DIRTY*64.0
+L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_LINES_OUT_DEMAND_DIRTY)*64/time
+L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_LINES_OUT_DEMAND_DIRTY)*64
+-
+Profiling group to measure L3 cache bandwidth. The bandwidth is computed by the
+number of cachelines allocated in the L2 and the number of modified cachelines
+evicted from the L2. This group also outputs the data volume transferred between the
+L3 and the measured cores' L2 caches. Note that this bandwidth also includes data
+transfers due to a write allocate load on a store miss in L2.
+
diff --git a/groups/broadwellEP/L3CACHE.txt b/groups/broadwellEP/L3CACHE.txt
new file mode 100644
index 0000000..30e71ee
--- /dev/null
+++ b/groups/broadwellEP/L3CACHE.txt
@@ -0,0 +1,35 @@
+SHORT L3 cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  MEM_LOAD_UOPS_RETIRED_L3_ALL
+PMC1  MEM_LOAD_UOPS_RETIRED_L3_MISS
+PMC2  UOPS_RETIRED_ALL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L3 request rate PMC0/PMC2
+L3 miss rate PMC1/PMC2
+L3 miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L3 request rate = MEM_LOAD_UOPS_RETIRED_L3_ALL/UOPS_RETIRED_ALL
+L3 miss rate = MEM_LOAD_UOPS_RETIRED_L3_MISS/UOPS_RETIRED_ALL
+L3 miss ratio = MEM_LOAD_UOPS_RETIRED_L3_MISS/MEM_LOAD_UOPS_RETIRED_L3_ALL
+-
+This group measures the locality of your data accesses with regard to the
+L3 cache. The L3 request rate tells you how data intensive your code is,
+i.e. how many L3 load accesses you have on average per retired uop.
+The L3 miss rate gives a measure of how often it was necessary to get
+cachelines from memory. Finally, the L3 miss ratio tells you how many of your
+memory references required a cacheline to be loaded from a higher level.
+While the L3 miss rate might be dictated by your algorithm, you should
+try to get the L3 miss ratio as low as possible by increasing your cache reuse.
+
+
diff --git a/groups/haswell/TLB_DATA.txt b/groups/broadwellEP/TLB_DATA.txt
similarity index 100%
copy from groups/haswell/TLB_DATA.txt
copy to groups/broadwellEP/TLB_DATA.txt
diff --git a/groups/haswell/TLB_INSTR.txt b/groups/broadwellEP/TLB_INSTR.txt
similarity index 100%
copy from groups/haswell/TLB_INSTR.txt
copy to groups/broadwellEP/TLB_INSTR.txt
diff --git a/groups/core2/BRANCH.txt b/groups/core2/BRANCH.txt
index 2515d6c..7049891 100644
--- a/groups/core2/BRANCH.txt
+++ b/groups/core2/BRANCH.txt
@@ -19,12 +19,12 @@ Instructions per branch  FIXC0/PMC0
 
 LONG
 Formulas:
-Branch rate = BR_INST_RETIRED_ANY / INSTR_RETIRED_ANY
-Branch misprediction rate = BR_INST_RETIRED_MISPRED / INSTR_RETIRED_ANY
-Branch misprediction ratio = BR_INST_RETIRED_MISPRED / BR_INST_RETIRED_ANY
-Instructions per branch = INSTR_RETIRED_ANY / BR_INST_RETIRED_ANY
+Branch rate = BR_INST_RETIRED_ANY/INSTR_RETIRED_ANY
+Branch misprediction rate = BR_INST_RETIRED_MISPRED/INSTR_RETIRED_ANY
+Branch misprediction ratio = BR_INST_RETIRED_MISPRED/BR_INST_RETIRED_ANY
+Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ANY
 -
 The rates state how often in average a branch or a mispredicted branch occured
 per instruction retired in total. The Branch misprediction ratio sets directly
-into relation what ration of all branch instruction where mispredicted.
+into relation what ratio of all branch instructions were mispredicted.
 Instructions per branch is 1/Branch rate.
diff --git a/groups/core2/CACHE.txt b/groups/core2/CACHE.txt
index fd2af0c..3989982 100644
--- a/groups/core2/CACHE.txt
+++ b/groups/core2/CACHE.txt
@@ -10,7 +10,6 @@ PMC1  L1D_ALL_CACHE_REF
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
-Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
 Data cache misses PMC0
 Data cache request rate PMC1/FIXC0
diff --git a/groups/core2/DATA.txt b/groups/core2/DATA.txt
index c48ad99..0f5bca5 100644
--- a/groups/core2/DATA.txt
+++ b/groups/core2/DATA.txt
@@ -12,11 +12,11 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-Load to Store ratio PMC0/PMC1
+Load to store ratio PMC0/PMC1
 
 LONG
 Formulas:
-Load to Store ratio = INST_RETIRED_LOADS / INST_RETIRED_STORES
+Load to store ratio = INST_RETIRED_LOADS/INST_RETIRED_STORES
 -
-This is a simple metric to determine your Load to store ratio.
+This is a simple metric to determine your load to store ratio.
 
diff --git a/groups/core2/FLOPS_DP.txt b/groups/core2/FLOPS_DP.txt
index 8e72f07..59b167a 100644
--- a/groups/core2/FLOPS_DP.txt
+++ b/groups/core2/FLOPS_DP.txt
@@ -10,7 +10,6 @@ PMC1  SIMD_COMP_INST_RETIRED_SCALAR_DOUBLE
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
-Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
 DP MFlops/s    1.0E-06*(PMC0*2.0+PMC1)/time
 
diff --git a/groups/core2/FLOPS_SP.txt b/groups/core2/FLOPS_SP.txt
index acd2df7..9cca892 100644
--- a/groups/core2/FLOPS_SP.txt
+++ b/groups/core2/FLOPS_SP.txt
@@ -10,7 +10,6 @@ PMC1  SIMD_COMP_INST_RETIRED_SCALAR_SINGLE
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
-Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
 SP MFlops/s 1.0E-06*(PMC0*4.0+PMC1)/time
 
diff --git a/groups/core2/FLOPS_X87.txt b/groups/core2/FLOPS_X87.txt
index 052356e..1226493 100644
--- a/groups/core2/FLOPS_X87.txt
+++ b/groups/core2/FLOPS_X87.txt
@@ -9,7 +9,6 @@ PMC0  X87_OPS_RETIRED_ANY
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
-Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
 X87 MFlops/s  1.0E-06*PMC0/time
 
diff --git a/groups/core2/L2.txt b/groups/core2/L2.txt
index 88c75c5..1517650 100644
--- a/groups/core2/L2.txt
+++ b/groups/core2/L2.txt
@@ -10,19 +10,22 @@ PMC1  L1D_M_EVICT
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
-Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-L2 load [MBytes/s] 1.0E-06*PMC0*64.0/time
-L2 evict [MBytes/s] 1.0E-06*PMC1*64.0/time
+L2D load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time
+L2D load data volume [GBytes]  1.0E-09*PMC0*64.0
+L2D evict bandwidth [MBytes/s]  1.0E-06*PMC1*64.0/time
+L2D evict data volume [GBytes]  1.0E-09*PMC1*64.0
 L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
 L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
 
 LONG
 Formulas:
-L2 Load [MBytes/s] = 1.0E-06*L1D_REPL*64/time
-L2 Evict [MBytes/s] = 1.0E-06*L1D_M_EVICT*64/time
+L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPL*64.0/time
+L2D load data volume [GBytes] = 1.0E-09*L1D_REPL*64.0
+L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64.0/time
+L2D evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64.0
 L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPL+L1D_M_EVICT)*64/time
-L2 data volume [GBytes] 1.0E-09*(L1D_REPL+L1D_M_EVICT)*64.0
+L2 data volume [GBytes] = 1.0E-09*(L1D_REPL+L1D_M_EVICT)*64.0
 -
 Profiling group to measure L2 cache bandwidth. The bandwidth is
 computed by the number of cacheline allocated in the L1 and the 
diff --git a/groups/core2/MEM.txt b/groups/core2/MEM.txt
index b205dc4..6b43c22 100644
--- a/groups/core2/MEM.txt
+++ b/groups/core2/MEM.txt
@@ -17,6 +17,6 @@ Memory data volume [GBytes] 1.0E-09*PMC0*64.0
 LONG
 Formulas:
 Memory bandwidth [MBytes/s] = 1.0E-06*BUS_TRANS_MEM_THIS_CORE_THIS_A*64/time
-Memory data volume [GBytes] 1.0E-09*BUS_TRANS_MEM_THIS_CORE_THIS_A*64.0
+Memory data volume [GBytes] = 1.0E-09*BUS_TRANS_MEM_THIS_CORE_THIS_A*64.0
 -
-Profiling group to measure memory bandwidth drawn by this core. 
+Profiling group to measure memory bandwidth drawn by this core.
diff --git a/groups/core2/TLB.txt b/groups/core2/TLB.txt
index d536d88..eeaca9a 100644
--- a/groups/core2/TLB.txt
+++ b/groups/core2/TLB.txt
@@ -10,7 +10,6 @@ PMC1  L1D_ALL_CACHE_REF
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
-Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
 L1 DTLB request rate    PMC1/FIXC0
 DTLB miss rate    PMC0/FIXC0
diff --git a/groups/haswell/BRANCH.txt b/groups/haswell/BRANCH.txt
index cbaf834..eb33846 100644
--- a/groups/haswell/BRANCH.txt
+++ b/groups/haswell/BRANCH.txt
@@ -5,7 +5,7 @@ FIXC0 INSTR_RETIRED_ANY
 FIXC1 CPU_CLK_UNHALTED_CORE
 FIXC2 CPU_CLK_UNHALTED_REF
 PMC0  BR_INST_RETIRED_ALL_BRANCHES
-PMC1  BR_MISP_RETIRED_ALL_BRANCHES
+PMC1  BR_MISP_RETIRED_ALL_BRANCHES_1
 
 METRICS
 Runtime (RDTSC) [s] time
@@ -19,10 +19,10 @@ Instructions per branch  FIXC0/PMC0
 
 LONG
 Formulas:
-Branch rate = BR_INST_RETIRED_ALL_BRANCHES / INSTR_RETIRED_ANY
-Branch misprediction rate =  BR_MISP_RETIRED_ALL_BRANCHES / INSTR_RETIRED_ANY
-Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES / BR_INST_RETIRED_ALL_BRANCHES
-Instructions per branch = INSTR_RETIRED_ANY / BR_INST_RETIRED_ALL_BRANCHES
+Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction rate =  BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES
+Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES
 -
 The rates state how often in average a branch or a mispredicted branch occured
 per instruction retired in total. The Branch misprediction ratio sets directly
diff --git a/groups/haswell/DATA.txt b/groups/haswell/DATA.txt
index 5f04a23..1220980 100644
--- a/groups/haswell/DATA.txt
+++ b/groups/haswell/DATA.txt
@@ -12,11 +12,11 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-Load to Store ratio PMC0/PMC1
+Load to store ratio PMC0/PMC1
 
 LONG
 Formulas:
-Load to Store ratio = MEM_UOP_RETIRED_LOADS / MEM_UOP_RETIRED_STORES
+Load to store ratio = MEM_UOP_RETIRED_LOADS/MEM_UOP_RETIRED_STORES
 -
 This is a metric to determine your load to store ratio.
 
diff --git a/groups/haswell/ENERGY.txt b/groups/haswell/ENERGY.txt
index 15b1c45..d6757cb 100644
--- a/groups/haswell/ENERGY.txt
+++ b/groups/haswell/ENERGY.txt
@@ -7,10 +7,13 @@ FIXC2 CPU_CLK_UNHALTED_REF
 TMP0  TEMP_CORE
 PWR0  PWR_PKG_ENERGY
 PWR1  PWR_PP0_ENERGY
+PWR2  PWR_PP1_ENERGY
 PWR3  PWR_DRAM_ENERGY
 
+
+
 METRICS
-Runtime (RDTSC) [s] time 
+Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
@@ -19,16 +22,18 @@ Energy [J]  PWR0
 Power [W] PWR0/time
 Energy PP0 [J]  PWR1
 Power PP0 [W] PWR1/time
+Energy PP1 [J]  PWR2
+Power PP1 [W] PWR2/time
 Energy DRAM [J]  PWR3
 Power DRAM [W] PWR3/time
 
 LONG
 Formula:
-Power =  PWR_PKG_ENERGY / time
+Power = PWR_PKG_ENERGY / time
 Power PP0 = PWR_PP0_ENERGY / time
+Power PP1 = PWR_PP1_ENERGY / time
 Power DRAM = PWR_DRAM_ENERGY / time
 -
 Haswell implements the new RAPL interface. This interface enables to
-monitor the consumed energy on the package (socket) and DRAM level.
-The PP0 energy domain is often refered to an integrated GPU.
+monitor the consumed energy on the package (socket) and DRAM level.
 
diff --git a/groups/haswell/ICACHE.txt b/groups/haswell/ICACHE.txt
index 6ce3ce8..5f11ad6 100644
--- a/groups/haswell/ICACHE.txt
+++ b/groups/haswell/ICACHE.txt
@@ -18,8 +18,8 @@ L1I miss ratio PMC1/PMC0
 
 LONG
 Formulas:
-L2 request rate = ICACHE_ACCESSES / INSTR_RETIRED_ANY
-L2 miss rate  = ICACHE_MISSES / INSTR_RETIRED_ANY
-L2 miss ratio = ICACHE_MISSES / ICACHE_ACCESSES
+L1I request rate = ICACHE_ACCESSES / INSTR_RETIRED_ANY
+L1I miss rate = ICACHE_MISSES / INSTR_RETIRED_ANY
+L1I miss ratio = ICACHE_MISSES / ICACHE_ACCESSES
 -
 This group measures some L1 instruction cache metrics.
diff --git a/groups/haswell/L2.txt b/groups/haswell/L2.txt
index 47d8ec7..cd7474d 100644
--- a/groups/haswell/L2.txt
+++ b/groups/haswell/L2.txt
@@ -6,6 +6,7 @@ FIXC1 CPU_CLK_UNHALTED_CORE
 FIXC2 CPU_CLK_UNHALTED_REF
 PMC0  L1D_REPLACEMENT
 PMC1  L2_TRANS_L1D_WB
+PMC2  ICACHE_MISSES
 
 METRICS
 Runtime (RDTSC) [s] time
@@ -21,13 +22,16 @@ L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0
 
 LONG
 Formulas:
-L2D load bandwidth [MBytes/s]  1.0E-06*L1D_REPLACEMENT*64.0/time
-L2D load data volume [GBytes]  1.0E-09*L1D_REPLACEMENT*64.0
-L2D evict bandwidth [MBytes/s]  1.0E-06*L2_TRANS_L1D_WB*64.0/time
-L2D evict data volume [GBytes]  1.0E-09*L2_TRANS_L1D_WB*64.0
-L2 bandwidth [MBytes/s] 1.0E-06*(L1D_REPLACEMENT+L2_TRANS_L1D_WB+ICACHE_MISSES)*64.0/time
-L2 data volume [GBytes] 1.0E-09*(L1D_REPLACEMENT+L2_TRANS_L1D_WB+ICACHE_MISSES)*64.0
+L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64.0/time
+L2D load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64.0
+L2D evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L1D_WB*64.0/time
+L2D evict data volume [GBytes] = 1.0E-09*L2_TRANS_L1D_WB*64.0
+L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L2_TRANS_L1D_WB+ICACHE_MISSES)*64.0/time
+L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L2_TRANS_L1D_WB+ICACHE_MISSES)*64.0
 -
 Profiling group to measure L2 cache bandwidth. The bandwidth is computed by the
 number of cacheline loaded from the L2 to the L1 data cache and the writebacks from
-the L1 data cache to the L2 cache.
+the L1 data cache to the L2 cache. The group also outputs the total data volume transferred between
+L2 and L1. Note that this bandwidth also includes data transfers due to a write
+allocate load on a store miss in L1 and cachelines transferred to the instruction
+cache.
diff --git a/groups/haswell/L2CACHE.txt b/groups/haswell/L2CACHE.txt
index 8186f69..8aa6522 100644
--- a/groups/haswell/L2CACHE.txt
+++ b/groups/haswell/L2CACHE.txt
@@ -4,7 +4,7 @@ EVENTSET
 FIXC0 INSTR_RETIRED_ANY
 FIXC1 CPU_CLK_UNHALTED_CORE
 FIXC2 CPU_CLK_UNHALTED_REF
-PMC0  L2_RQSTS_REFERENCES
+PMC0  L2_TRANS_ALL_REQUESTS
 PMC1  L2_RQSTS_MISS
 
 METRICS
@@ -18,9 +18,9 @@ L2 miss ratio PMC1/PMC0
 
 LONG
 Formulas:
-L2 request rate = L2_RQSTS_REFERENCES / INSTR_RETIRED_ANY
-L2 miss rate  = L2_RQSTS_MISS / INSTR_RETIRED_ANY
-L2 miss ratio = L2_RQSTS_MISS / L2_RQSTS_REFERENCES
+L2 request rate = L2_TRANS_ALL_REQUESTS/INSTR_RETIRED_ANY
+L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY
+L2 miss ratio = L2_RQSTS_MISS/L2_TRANS_ALL_REQUESTS
 -
 This group measures the locality of your data accesses with regard to the
 L2 Cache. L2 request rate tells you how data intensive your code is
@@ -30,6 +30,5 @@ cachelines from memory. And finally L2 miss ratio tells you how many of your
 memory references required a cacheline to be loaded from a higher level.
 While the Data cache miss rate might be given by your algorithm you should
 try to get Data cache miss ratio as low as possible by increasing your cache reuse.
-Note: This group might need to be revised!
 
 
diff --git a/groups/haswell/L3.txt b/groups/haswell/L3.txt
index 42d6e4a..622fa25 100644
--- a/groups/haswell/L3.txt
+++ b/groups/haswell/L3.txt
@@ -12,17 +12,21 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-L3 Load [MBytes/s]  1.0E-06*PMC0*64.0/time
-L3 Evict [MBytes/s]  1.0E-06*PMC1*64.0/time
+L3 load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time
+L3 load data volume [GBytes]  1.0E-09*PMC0*64.0
+L3 evict bandwidth [MBytes/s]  1.0E-06*PMC1*64.0/time
+L3 evict data volume [GBytes]  1.0E-09*PMC1*64.0
 L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
 L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
 
 LONG
 Formulas:
-L3 Load [MBytes/s]  1.0E-06*L2_LINES_IN_ALL*64/time
-L3 Evict [MBytes/s]  1.0E-06*L2_LINES_OUT_DEMAND_DIRTY*64/time
-L3 bandwidth [MBytes/s] 1.0E-06*(L2_LINES_IN_ALL+L2_LINES_OUT_DEMAND_DIRTY)*64/time
-L3 data volume [GBytes] 1.0E-09*(L2_LINES_IN_ALL+L2_LINES_OUT_DEMAND_DIRTY)*64
+L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64.0/time
+L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64.0
+L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_LINES_OUT_DEMAND_DIRTY*64.0/time
+L3 evict data volume [GBytes] = 1.0E-09*L2_LINES_OUT_DEMAND_DIRTY*64.0
+L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_LINES_OUT_DEMAND_DIRTY)*64/time
+L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_LINES_OUT_DEMAND_DIRTY)*64
 -
 Profiling group to measure L3 cache bandwidth. The bandwidth is computed by the
 number of cacheline allocated in the L2 and the number of modified cachelines
diff --git a/groups/haswell/L3CACHE.txt b/groups/haswell/L3CACHE.txt
index d4fd89e..30e71ee 100644
--- a/groups/haswell/L3CACHE.txt
+++ b/groups/haswell/L3CACHE.txt
@@ -6,21 +6,22 @@ FIXC1 CPU_CLK_UNHALTED_CORE
 FIXC2 CPU_CLK_UNHALTED_REF
 PMC0  MEM_LOAD_UOPS_RETIRED_L3_ALL
 PMC1  MEM_LOAD_UOPS_RETIRED_L3_MISS
+PMC2  UOPS_RETIRED_ALL
 
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-L3 request rate (PMC0)/FIXC0
-L3 miss rate PMC1/FIXC0
+L3 request rate PMC0/PMC2
+L3 miss rate PMC1/PMC2
 L3 miss ratio PMC1/PMC0
 
 LONG
 Formulas:
-L3 request rate = MEM_LOAD_UOPS_RETIRED_L3_ALL / INSTR_RETIRED_ANY
-L3 miss rate  = MEM_LOAD_UOPS_RETIRED_L3_MISS / INSTR_RETIRED_ANY
-L3 miss ratio = MEM_LOAD_UOPS_RETIRED_L3_MISS / MEM_LOAD_UOPS_RETIRED_L3_ALL
+L3 request rate = MEM_LOAD_UOPS_RETIRED_L3_ALL/UOPS_RETIRED_ALL
+L3 miss rate = MEM_LOAD_UOPS_RETIRED_L3_MISS/UOPS_RETIRED_ALL
+L3 miss ratio = MEM_LOAD_UOPS_RETIRED_L3_MISS/MEM_LOAD_UOPS_RETIRED_L3_ALL
 -
 This group measures the locality of your data accesses with regard to the
 L3 Cache. L3 request rate tells you how data intensive your code is
@@ -30,6 +31,5 @@ cachelines from memory. And finally L3 miss ratio tells you how many of your
 memory references required a cacheline to be loaded from a higher level.
 While the Data cache miss rate might be given by your algorithm you should
 try to get Data cache miss ratio as low as possible by increasing your cache reuse.
-Note: This group might need to be revised!
 
 
diff --git a/groups/haswell/TLB_DATA.txt b/groups/haswell/TLB_DATA.txt
index 2f59772..5e54147 100644
--- a/groups/haswell/TLB_DATA.txt
+++ b/groups/haswell/TLB_DATA.txt
@@ -16,19 +16,19 @@ Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
 L1 DTLB load misses     PMC0
 L1 DTLB load miss rate  PMC0/FIXC0
-L1 DTLB load miss duration PMC2
+L1 DTLB load miss duration [Cyc] PMC2/PMC0
 L1 DTLB store misses     PMC1
 L1 DTLB store miss rate  PMC1/FIXC0
-L1 DTLB store miss duration PMC3
+L1 DTLB store miss duration [Cyc] PMC3/PMC1
 
 LONG
 Formulas:
-L1 DTLB load misses     DTLB_LOAD_MISSES_CAUSES_A_WALK
-L1 DTLB load miss rate  DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
-L1 DTLB load miss duration DTLB_LOAD_MISSES_WALK_DURATION
-L1 DTLB store misses     DTLB_STORE_MISSES_CAUSES_A_WALK
-L1 DTLB store miss rate  DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
-L1 DTLB store miss duration DTLB_STORE_MISSES_WALK_DURATION
+L1 DTLB load misses = DTLB_LOAD_MISSES_CAUSES_A_WALK
+L1 DTLB load miss rate = DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB load miss duration [Cyc] = DTLB_LOAD_MISSES_WALK_DURATION / DTLB_LOAD_MISSES_CAUSES_A_WALK
+L1 DTLB store misses = DTLB_STORE_MISSES_CAUSES_A_WALK
+L1 DTLB store miss rate = DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB store miss duration [Cyc] = DTLB_STORE_MISSES_WALK_DURATION / DTLB_STORE_MISSES_CAUSES_A_WALK
 -
 The DTLB load and store miss rates gives a measure how often a TLB miss occured
 per instruction. The duration measures the time in cycles how long a walk did take.
diff --git a/groups/haswell/TLB_INSTR.txt b/groups/haswell/TLB_INSTR.txt
index f95f78a..8faaebe 100644
--- a/groups/haswell/TLB_INSTR.txt
+++ b/groups/haswell/TLB_INSTR.txt
@@ -14,14 +14,14 @@ Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
 L1 ITLB misses     PMC0
 L1 ITLB miss rate  PMC0/FIXC0
-L1 ITLB miss duration PMC1
+L1 ITLB miss duration [Cyc] PMC1/PMC0
 
 
 LONG
 Formulas:
-L1 ITLB misses     ITLB_MISSES_CAUSES_A_WALK
-L1 ITLB miss rate  ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
-L1 ITLB miss duration ITLB_MISSES_WALK_DURATION
+L1 ITLB misses = ITLB_MISSES_CAUSES_A_WALK
+L1 ITLB miss rate = ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 ITLB miss duration [Cyc] = ITLB_MISSES_WALK_DURATION / ITLB_MISSES_CAUSES_A_WALK
 -
 The ITLB miss rates gives a measure how often a TLB miss occured
 per instruction. The duration measures the time in cycles how long a walk did take.
diff --git a/groups/haswellEP/BRANCH.txt b/groups/haswellEP/BRANCH.txt
new file mode 100644
index 0000000..eb33846
--- /dev/null
+++ b/groups/haswellEP/BRANCH.txt
@@ -0,0 +1,31 @@
+SHORT Branch prediction miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  BR_INST_RETIRED_ALL_BRANCHES
+PMC1  BR_MISP_RETIRED_ALL_BRANCHES_1
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Branch rate   PMC0/FIXC0
+Branch misprediction rate  PMC1/FIXC0
+Branch misprediction ratio  PMC1/PMC0
+Instructions per branch  FIXC0/PMC0
+
+LONG
+Formulas:
+Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction rate =  BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES
+Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES
+-
+The rates state how often on average a branch or a mispredicted branch occurred
+per instruction retired in total. The branch misprediction ratio directly
+expresses what ratio of all branch instructions were mispredicted.
+Instructions per branch is 1/Branch rate.
+
diff --git a/groups/haswellEP/CBOX.txt b/groups/haswellEP/CBOX.txt
new file mode 100644
index 0000000..7ef769b
--- /dev/null
+++ b/groups/haswellEP/CBOX.txt
@@ -0,0 +1,60 @@
+SHORT CBOX related data and metrics
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+CBOX0C0 LLC_VICTIMS_M
+CBOX1C0 LLC_VICTIMS_M
+CBOX2C0 LLC_VICTIMS_M
+CBOX3C0 LLC_VICTIMS_M
+CBOX4C0 LLC_VICTIMS_M
+CBOX5C0 LLC_VICTIMS_M
+CBOX6C0 LLC_VICTIMS_M
+CBOX7C0 LLC_VICTIMS_M
+CBOX8C0 LLC_VICTIMS_M
+CBOX9C0 LLC_VICTIMS_M
+CBOX10C0 LLC_VICTIMS_M
+CBOX11C0 LLC_VICTIMS_M
+CBOX12C0 LLC_VICTIMS_M
+CBOX13C0 LLC_VICTIMS_M
+CBOX14C0 LLC_VICTIMS_M
+CBOX15C0 LLC_VICTIMS_M
+CBOX16C0 LLC_VICTIMS_M
+CBOX17C0 LLC_VICTIMS_M
+CBOX0C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX1C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX2C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX3C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX4C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX5C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX6C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX7C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX8C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX9C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX10C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX11C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX12C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX13C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX14C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX15C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX16C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX17C1:STATE=0x1 LLC_LOOKUP_ANY
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+LLC Misses Per Instruction (CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX14C0+CBOX15C0+CBOX16C0+CBOX17C0)/FIXC0
+LLC data written to MEM [MBytes] 1E-6*(CBOX0C1:STATE=0x1+CBOX1C1:STATE=0x1+CBOX2C1:STATE=0x1+CBOX3C1:STATE=0x1+CBOX4C1:STATE=0x1+CBOX5C1:STATE=0x1+CBOX6C1:STATE=0x1+CBOX7C1:STATE=0x1+CBOX8C1:STATE=0x1+CBOX9C1:STATE=0x1+CBOX10C1:STATE=0x1+CBOX11C1:STATE=0x1+CBOX12C1:STATE=0x1+CBOX13C1:STATE=0x1+CBOX14C1:STATE=0x1+CBOX15C1:STATE=0x1+CBOX16C1:STATE=0x1+CBOX17C1:STATE=0x1)*64
+
+
+LONG
+Formulas:
+LLC Misses Per Instruction = sum(LLC_VICTIMS_M)/INSTR_RETIRED_ANY
+LLC data written to MEM [MBytes] = sum(LLC_LOOKUP_ANY)*64*1E-6
+The CBOXes mediate the traffic from the L2 cache to the segmented L3 cache. Each
+CBOX is responsible for one segment (2.5 MByte). The boxes maintain the coherence between all
+CPU cores of the socket. Depending on the CPU core count, some CBOXes are not attached
+to a 2.5 MByte slice but are still active and track the traffic.
diff --git a/groups/haswellEP/CLOCK.txt b/groups/haswellEP/CLOCK.txt
new file mode 100644
index 0000000..276cf16
--- /dev/null
+++ b/groups/haswellEP/CLOCK.txt
@@ -0,0 +1,23 @@
+SHORT Power and Energy consumption
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0  PWR_PKG_ENERGY
+
+METRICS
+Runtime (RDTSC) [s] time 
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Energy [J]  PWR0
+Power [W] PWR0/time
+
+LONG
+Formula:
+Power =  PWR_PKG_ENERGY / time
+-
+Haswell implements the new RAPL interface. This interface makes it possible to
+monitor the consumed energy on the package (socket) level.
+
diff --git a/groups/haswellEP/DATA.txt b/groups/haswellEP/DATA.txt
new file mode 100644
index 0000000..1220980
--- /dev/null
+++ b/groups/haswellEP/DATA.txt
@@ -0,0 +1,22 @@
+SHORT Load to store ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  MEM_UOP_RETIRED_LOADS
+PMC1  MEM_UOP_RETIRED_STORES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Load to store ratio PMC0/PMC1
+
+LONG
+Formulas:
+Load to store ratio = MEM_UOP_RETIRED_LOADS/MEM_UOP_RETIRED_STORES
+-
+This is a metric to determine your load to store ratio.
+
diff --git a/groups/haswellEP/ENERGY.txt b/groups/haswellEP/ENERGY.txt
new file mode 100644
index 0000000..6c26b30
--- /dev/null
+++ b/groups/haswellEP/ENERGY.txt
@@ -0,0 +1,35 @@
+SHORT Power and Energy consumption
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+TMP0  TEMP_CORE
+PWR0  PWR_PKG_ENERGY
+PWR1  PWR_PP0_ENERGY
+PWR3  PWR_DRAM_ENERGY
+
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Temperature [C]  TMP0
+Energy [J]  PWR0
+Power [W] PWR0/time
+Energy PP0 [J]  PWR1
+Power PP0 [W] PWR1/time
+Energy DRAM [J]  PWR3
+Power DRAM [W] PWR3/time
+
+LONG
+Formula:
+Power = PWR_PKG_ENERGY / time
+Power PP0 = PWR_PP0_ENERGY / time
+Power DRAM = PWR_DRAM_ENERGY / time
+-
+Haswell implements the new RAPL interface. This interface makes it possible to
+monitor the consumed energy on the package (socket) and DRAM level.
+
diff --git a/groups/haswellEP/ICACHE.txt b/groups/haswellEP/ICACHE.txt
new file mode 100644
index 0000000..5f11ad6
--- /dev/null
+++ b/groups/haswellEP/ICACHE.txt
@@ -0,0 +1,25 @@
+SHORT  Instruction cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  ICACHE_ACCESSES
+PMC1  ICACHE_MISSES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1I request rate PMC0/FIXC0
+L1I miss rate PMC1/FIXC0
+L1I miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L1I request rate = ICACHE_ACCESSES / INSTR_RETIRED_ANY
+L1I miss rate = ICACHE_MISSES / INSTR_RETIRED_ANY
+L1I miss ratio = ICACHE_MISSES / ICACHE_ACCESSES
+-
+This group measures some L1 instruction cache metrics.
diff --git a/groups/haswellEP/L2.txt b/groups/haswellEP/L2.txt
new file mode 100644
index 0000000..eb150d9
--- /dev/null
+++ b/groups/haswellEP/L2.txt
@@ -0,0 +1,37 @@
+SHORT  L2 cache bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  L1D_REPLACEMENT
+PMC1  L2_TRANS_L1D_WB
+PMC2  ICACHE_MISSES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L2D load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time
+L2D load data volume [GBytes]  1.0E-09*PMC0*64.0
+L2D evict bandwidth [MBytes/s]  1.0E-06*PMC1*64.0/time
+L2D evict data volume [GBytes]  1.0E-09*PMC1*64.0
+L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time
+L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0
+
+LONG
+Formulas:
+L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64.0/time
+L2D load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64.0
+L2D evict bandwidth [MBytes/s] = 1.0E-06*L2_TRANS_L1D_WB*64.0/time
+L2D evict data volume [GBytes] = 1.0E-09*L2_TRANS_L1D_WB*64.0
+L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L2_TRANS_L1D_WB+ICACHE_MISSES)*64/time
+L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L2_TRANS_L1D_WB+ICACHE_MISSES)*64
+-
+Profiling group to measure L2 cache bandwidth. The bandwidth is computed from the
+number of cachelines allocated in the L1 and the number of modified cachelines
+evicted from the L1. The group also outputs the total data volume transferred between
+L2 and L1. Note that this bandwidth also includes data transfers due to a write
+allocate load on a store miss in L1 and traffic caused by misses in the
+L1 instruction cache.
diff --git a/groups/haswellEP/L2CACHE.txt b/groups/haswellEP/L2CACHE.txt
new file mode 100644
index 0000000..8aa6522
--- /dev/null
+++ b/groups/haswellEP/L2CACHE.txt
@@ -0,0 +1,34 @@
+SHORT L2 cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  L2_TRANS_ALL_REQUESTS
+PMC1  L2_RQSTS_MISS
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L2 request rate PMC0/FIXC0
+L2 miss rate PMC1/FIXC0
+L2 miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L2 request rate = L2_TRANS_ALL_REQUESTS/INSTR_RETIRED_ANY
+L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY
+L2 miss ratio = L2_RQSTS_MISS/L2_TRANS_ALL_REQUESTS
+-
+This group measures the locality of your data accesses with regard to the
+L2 cache. The L2 request rate tells you how data intensive your code is,
+i.e. how many data accesses you have on average per instruction.
+The L2 miss rate gives a measure of how often it was necessary to get
+cachelines from memory. Finally, the L2 miss ratio tells you how many of your
+memory references required a cacheline to be loaded from a higher level.
+While the L2 miss rate might be dictated by your algorithm, you should
+try to get the L2 miss ratio as low as possible by increasing your cache reuse.
+
+
diff --git a/groups/haswellEP/L3.txt b/groups/haswellEP/L3.txt
new file mode 100644
index 0000000..622fa25
--- /dev/null
+++ b/groups/haswellEP/L3.txt
@@ -0,0 +1,36 @@
+SHORT  L3 cache bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  L2_LINES_IN_ALL
+PMC1  L2_LINES_OUT_DEMAND_DIRTY
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L3 load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time
+L3 load data volume [GBytes]  1.0E-09*PMC0*64.0
+L3 evict bandwidth [MBytes/s]  1.0E-06*PMC1*64.0/time
+L3 evict data volume [GBytes]  1.0E-09*PMC1*64.0
+L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+
+LONG
+Formulas:
+L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64.0/time
+L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64.0
+L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_LINES_OUT_DEMAND_DIRTY*64.0/time
+L3 evict data volume [GBytes] = 1.0E-09*L2_LINES_OUT_DEMAND_DIRTY*64.0
+L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_LINES_OUT_DEMAND_DIRTY)*64/time
+L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_LINES_OUT_DEMAND_DIRTY)*64
+-
+Profiling group to measure L3 cache bandwidth. The bandwidth is computed from the
+number of cachelines allocated in the L2 and the number of modified cachelines
+evicted from the L2. This group also outputs the data volume transferred between the
+L3 and the measured cores' L2 caches. Note that this bandwidth also includes data
+transfers due to a write allocate load on a store miss in L2.
+
diff --git a/groups/haswellEP/L3CACHE.txt b/groups/haswellEP/L3CACHE.txt
new file mode 100644
index 0000000..30e71ee
--- /dev/null
+++ b/groups/haswellEP/L3CACHE.txt
@@ -0,0 +1,35 @@
+SHORT L3 cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  MEM_LOAD_UOPS_RETIRED_L3_ALL
+PMC1  MEM_LOAD_UOPS_RETIRED_L3_MISS
+PMC2  UOPS_RETIRED_ALL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L3 request rate PMC0/PMC2
+L3 miss rate PMC1/PMC2
+L3 miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L3 request rate = MEM_LOAD_UOPS_RETIRED_L3_ALL/UOPS_RETIRED_ALL
+L3 miss rate = MEM_LOAD_UOPS_RETIRED_L3_MISS/UOPS_RETIRED_ALL
+L3 miss ratio = MEM_LOAD_UOPS_RETIRED_L3_MISS/MEM_LOAD_UOPS_RETIRED_L3_ALL
+-
+This group measures the locality of your data accesses with regard to the
+L3 cache. The L3 request rate tells you how data intensive your code is,
+i.e. how many L3 load accesses you have on average per retired uop.
+The L3 miss rate gives a measure of how often it was necessary to get
+cachelines from memory. Finally, the L3 miss ratio tells you how many of your
+memory references required a cacheline to be loaded from a higher level.
+While the L3 miss rate might be dictated by your algorithm, you should
+try to get the L3 miss ratio as low as possible by increasing your cache reuse.
+
+
diff --git a/groups/haswellEP/MEM.txt b/groups/haswellEP/MEM.txt
new file mode 100644
index 0000000..e38c584
--- /dev/null
+++ b/groups/haswellEP/MEM.txt
@@ -0,0 +1,51 @@
+SHORT Main memory bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+MBOX0C0 CAS_COUNT_RD
+MBOX0C1 CAS_COUNT_WR
+MBOX1C0 CAS_COUNT_RD
+MBOX1C1 CAS_COUNT_WR
+MBOX2C0 CAS_COUNT_RD
+MBOX2C1 CAS_COUNT_WR
+MBOX3C0 CAS_COUNT_RD
+MBOX3C1 CAS_COUNT_WR
+MBOX4C0 CAS_COUNT_RD
+MBOX4C1 CAS_COUNT_WR
+MBOX5C0 CAS_COUNT_RD
+MBOX5C1 CAS_COUNT_WR
+MBOX6C0 CAS_COUNT_RD
+MBOX6C1 CAS_COUNT_WR
+MBOX7C0 CAS_COUNT_RD
+MBOX7C1 CAS_COUNT_WR
+
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+
+LONG
+Formulas:
+Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/runtime
+Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/runtime
+Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/runtime
+Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0
+-
+Profiling group to measure memory bandwidth drawn by all cores of a socket.
+Since this group is based on uncore events, it is only possible to measure on a
+per-socket basis. Some of the counters may not be available on your system.
+Also outputs the total data volume transferred from and to main memory.
+
diff --git a/groups/haswellEP/NUMA.txt b/groups/haswellEP/NUMA.txt
new file mode 100644
index 0000000..0c1b8fb
--- /dev/null
+++ b/groups/haswellEP/NUMA.txt
@@ -0,0 +1,33 @@
+SHORT Local and remote memory accesses
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 OFFCORE_RESPONSE_0_LOCAL_DRAM
+PMC1 OFFCORE_RESPONSE_1_REMOTE_DRAM
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Local DRAM data volume [GByte]  1.E-09*PMC0*64
+Local DRAM bandwidth [MByte/s]  1.E-06*(PMC0*64)/time
+Remote DRAM data volume [GByte]  1.E-09*PMC1*64
+Remote DRAM bandwidth [MByte/s]  1.E-06*(PMC1*64)/time
+Memory data volume [GByte]  1.E-09*(PMC0+PMC1)*64
+Memory bandwidth [MByte/s]  1.E-06*((PMC0+PMC1)*64)/time
+
+LONG
+Formula:
+CPI = CPU_CLK_UNHALTED_CORE/INSTR_RETIRED_ANY
+Local DRAM data volume [GByte] = 1.E-09*OFFCORE_RESPONSE_0_LOCAL_DRAM*64
+Local DRAM bandwidth [MByte/s] = 1.E-06*(OFFCORE_RESPONSE_0_LOCAL_DRAM*64)/time
+Remote DRAM data volume [GByte] = 1.E-09*OFFCORE_RESPONSE_1_REMOTE_DRAM*64
+Remote DRAM bandwidth [MByte/s] = 1.E-06*(OFFCORE_RESPONSE_1_REMOTE_DRAM*64)/time
+Memory data volume [GByte] = 1.E-09*(OFFCORE_RESPONSE_0_LOCAL_DRAM+OFFCORE_RESPONSE_1_REMOTE_DRAM)*64
+Memory bandwidth [MByte/s] = 1.E-06*((OFFCORE_RESPONSE_0_LOCAL_DRAM+OFFCORE_RESPONSE_1_REMOTE_DRAM)*64)/time
+--
+This performance group measures the data traffic of CPU cores to local and remote
+memory.
diff --git a/groups/haswellEP/QPI.txt b/groups/haswellEP/QPI.txt
new file mode 100644
index 0000000..462459f
--- /dev/null
+++ b/groups/haswellEP/QPI.txt
@@ -0,0 +1,37 @@
+SHORT QPI Link Layer data
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+QBOX0C0 QBOX_CLOCKTICKS
+QBOX1C0 QBOX_CLOCKTICKS
+QBOX0C1 DIRECT2CORE_SUCCESS_RBT_HIT
+QBOX1C1 DIRECT2CORE_SUCCESS_RBT_HIT
+QBOX0C2 TXL_FLITS_G0_DATA
+QBOX1C2 TXL_FLITS_G0_DATA
+QBOX0C3 TXL_FLITS_G0_NON_DATA
+QBOX1C3 TXL_FLITS_G0_NON_DATA
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+QPI to LLC data volume [GByte] 1.E-09*(QBOX0C1+QBOX1C1)*64
+QPI data volume [GByte] 1.E-09*(QBOX0C2+QBOX1C2)*8
+QPI data bandwidth [MByte/s] 1.E-06*(QBOX0C2+QBOX1C2)*8/time
+QPI link volume [GByte] 1.E-09*(QBOX0C2+QBOX1C2+QBOX0C3+QBOX1C3)*8
+QPI link bandwidth [MByte/s] 1.E-06*(QBOX0C2+QBOX1C2+QBOX0C3+QBOX1C3)*8/time
+
+LONG
+Formula:
+QPI to LLC data volume [GByte] = 1.E-09*(sum(DIRECT2CORE_SUCCESS_RBT_HIT)*64)
+QPI data volume [GByte] = 1.E-09*(sum(TXL_FLITS_G0_DATA)*8)
+QPI data bandwidth [MByte/s] = 1.E-06*(sum(TXL_FLITS_G0_DATA)*8)/runtime
+QPI link volume [GByte] = 1.E-09*((sum(TXL_FLITS_G0_DATA)+sum(TXL_FLITS_G0_NON_DATA))*8)
+QPI link bandwidth [MByte/s] = 1.E-06*((sum(TXL_FLITS_G0_DATA)+sum(TXL_FLITS_G0_NON_DATA))*8)/runtime
+--
+The Intel QPI Link Layer is responsible for packetizing requests from the caching agent (CBOXes)
+on the way out to the system interface. For Haswell EP systems, the Link Layer and the
+Ring interface are separated.
diff --git a/groups/haswellEP/SBOX.txt b/groups/haswellEP/SBOX.txt
new file mode 100644
index 0000000..bea3b90
--- /dev/null
+++ b/groups/haswellEP/SBOX.txt
@@ -0,0 +1,28 @@
+SHORT Ring Transfer bandwidth
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+SBOX0C0 RING_BL_USED_ANY
+SBOX1C0 RING_BL_USED_ANY
+SBOX2C0 RING_BL_USED_ANY
+SBOX3C0 RING_BL_USED_ANY
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Ring transfer bandwidth [MByte/s] 1.E-06*(SBOX0C0+SBOX1C0+SBOX2C0+SBOX3C0)*32/time
+Ring transfer data volume [GByte] 1.E-09*(SBOX0C0+SBOX1C0+SBOX2C0+SBOX3C0)*32
+
+LONG
+Formula:
+Ring transfer bandwidth [MByte/s] = 1.E-06*(SUM(SBOXxC0)*32)/time
+Ring transfer data volume [GByte] = 1.E-09*(SUM(SBOXxC0)*32)
+--
+The SBOXes manage the transfer between the socket-local ring(s). For microarchitectures
+prior to Haswell, the SBOX and QBOX were similar as only a single ring was used.
+Haswell systems with a high core count assemble two rings that are connected through
+the SBOXes; the traffic between the sockets is handled by the QBOXes.
diff --git a/groups/haswellEP/TLB_DATA.txt b/groups/haswellEP/TLB_DATA.txt
new file mode 100644
index 0000000..5e54147
--- /dev/null
+++ b/groups/haswellEP/TLB_DATA.txt
@@ -0,0 +1,35 @@
+SHORT  L1 Data TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  DTLB_LOAD_MISSES_CAUSES_A_WALK
+PMC1  DTLB_STORE_MISSES_CAUSES_A_WALK
+PMC2  DTLB_LOAD_MISSES_WALK_DURATION
+PMC3  DTLB_STORE_MISSES_WALK_DURATION
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1 DTLB load misses     PMC0
+L1 DTLB load miss rate  PMC0/FIXC0
+L1 DTLB load miss duration [Cyc] PMC2/PMC0
+L1 DTLB store misses     PMC1
+L1 DTLB store miss rate  PMC1/FIXC0
+L1 DTLB store miss duration [Cyc] PMC3/PMC1
+
+LONG
+Formulas:
+L1 DTLB load misses = DTLB_LOAD_MISSES_CAUSES_A_WALK
+L1 DTLB load miss rate = DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB load miss duration [Cyc] = DTLB_LOAD_MISSES_WALK_DURATION / DTLB_LOAD_MISSES_CAUSES_A_WALK
+L1 DTLB store misses = DTLB_STORE_MISSES_CAUSES_A_WALK
+L1 DTLB store miss rate = DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB store miss duration [Cyc] = DTLB_STORE_MISSES_WALK_DURATION / DTLB_STORE_MISSES_CAUSES_A_WALK
+-
+The DTLB load and store miss rates give a measure of how often a TLB miss occurred
+per instruction. The duration measures how long a page walk took, in cycles.
+
diff --git a/groups/haswellEP/TLB_INSTR.txt b/groups/haswellEP/TLB_INSTR.txt
new file mode 100644
index 0000000..8faaebe
--- /dev/null
+++ b/groups/haswellEP/TLB_INSTR.txt
@@ -0,0 +1,28 @@
+SHORT  L1 Instruction TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  ITLB_MISSES_CAUSES_A_WALK
+PMC1  ITLB_MISSES_WALK_DURATION
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1 ITLB misses     PMC0
+L1 ITLB miss rate  PMC0/FIXC0
+L1 ITLB miss duration [Cyc] PMC1/PMC0
+
+
+LONG
+Formulas:
+L1 ITLB misses = ITLB_MISSES_CAUSES_A_WALK
+L1 ITLB miss rate = ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 ITLB miss duration [Cyc] = ITLB_MISSES_WALK_DURATION / ITLB_MISSES_CAUSES_A_WALK
+-
+The ITLB miss rate gives a measure of how often a TLB miss occurred
+per instruction. The duration measures how long a page walk took, in cycles.
+
diff --git a/groups/interlagos/BRANCH.txt b/groups/interlagos/BRANCH.txt
index 1ae9f36..3d99f0f 100644
--- a/groups/interlagos/BRANCH.txt
+++ b/groups/interlagos/BRANCH.txt
@@ -4,29 +4,23 @@ EVENTSET
 PMC0  RETIRED_INSTRUCTIONS
 PMC1  RETIRED_BRANCH_INSTR
 PMC2  RETIRED_MISPREDICTED_BRANCH_INSTR
-PMC3  RETIRED_TAKEN_BRANCH_INSTR
 
 METRICS
 Runtime (RDTSC) [s] time
 Branch rate   PMC1/PMC0
 Branch misprediction rate  PMC2/PMC0
 Branch misprediction ratio  PMC2/PMC1
-Branch taken rate  PMC3/PMC0
-Branch taken ratio  PMC3/PMC1
 Instructions per branch  PMC0/PMC1
 
 LONG
 Formulas:
-Branch rate = RETIRED_BRANCH_INSTR / RETIRED_INSTRUCTIONS
-Branch misprediction rate = RETIRED_MISPREDICTED_BRANCH_INSTR / RETIRED_INSTRUCTIONS
-Branch misprediction ratio = RETIRED_MISPREDICTED_BRANCH_INSTR / RETIRED_BRANCH_INSTR
-Branch taken rate = RETIRED_TAKEN_BRANCH_INSTR / RETIRED_INSTRUCTIONS
-Branch taken ratio = RETIRED_TAKEN_BRANCH_INSTR / RETIRED_BRANCH_INSTR
-Instructions per branch = RETIRED_INSTRUCTIONS / RETIRED_BRANCH_INSTR
+Branch rate = RETIRED_BRANCH_INSTR/RETIRED_INSTRUCTIONS
+Branch misprediction rate = RETIRED_MISPREDICTED_BRANCH_INSTR/RETIRED_INSTRUCTIONS
+Branch misprediction ratio = RETIRED_MISPREDICTED_BRANCH_INSTR/RETIRED_BRANCH_INSTR
+Instructions per branch = RETIRED_INSTRUCTIONS/RETIRED_BRANCH_INSTR
 -
 The rates state how often in average a branch or a mispredicted branch occured
 per instruction retired in total. The branch misprediction ratio sets directly
 into relation what ratio of all branch instruction where mispredicted.
-Instructions per branch is 1/Branch rate. The same applies for the branches
-taken metrics.
+Instructions per branch is 1/Branch rate.
 
diff --git a/groups/interlagos/DATA.txt b/groups/interlagos/DATA.txt
index 78e4c3c..75f1f60 100644
--- a/groups/interlagos/DATA.txt
+++ b/groups/interlagos/DATA.txt
@@ -6,11 +6,11 @@ PMC1  LS_DISPATCH_STORES
 
 METRICS
 Runtime (RDTSC) [s] time
-Load to Store ratio PMC0/PMC1
+Load to store ratio PMC0/PMC1
 
 LONG
 Formulas:
-Load to Store ratio = LS_DISPATCH_LOADS / LS_DISPATCH_STORES
+Load to store ratio = LS_DISPATCH_LOADS/LS_DISPATCH_STORES
 -
 This is a simple metric to determine your load to store ratio.
 
diff --git a/groups/interlagos/FLOPS_DP.txt b/groups/interlagos/FLOPS_DP.txt
index d7f5f57..18b8295 100644
--- a/groups/interlagos/FLOPS_DP.txt
+++ b/groups/interlagos/FLOPS_DP.txt
@@ -9,7 +9,7 @@ PMC3  RETIRED_FLOPS_DOUBLE_ALL
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s]  PMC1*inverseClock
-MFlops/s    1.0E-06*(PMC3)/time
+DP MFlops/s    1.0E-06*(PMC3)/time
 CPI   PMC1/PMC0
 CPI (based on uops)   PMC1/PMC2
 IPC   PMC0/PMC1
diff --git a/groups/interlagos/FLOPS_SP.txt b/groups/interlagos/FLOPS_SP.txt
index 1c4dcc3..d0de0ea 100644
--- a/groups/interlagos/FLOPS_SP.txt
+++ b/groups/interlagos/FLOPS_SP.txt
@@ -9,7 +9,7 @@ PMC3  RETIRED_FLOPS_SINGLE_ALL
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s]  PMC1*inverseClock
-MFlops/s    1.0E-06*(PMC3)/time
+SP MFlops/s    1.0E-06*(PMC3)/time
 CPI   PMC1/PMC0
 CPI (based on uops)   PMC1/PMC2
 IPC   PMC0/PMC1
diff --git a/groups/interlagos/ICACHE.txt b/groups/interlagos/ICACHE.txt
index be5e5f5..62b91d6 100644
--- a/groups/interlagos/ICACHE.txt
+++ b/groups/interlagos/ICACHE.txt
@@ -8,18 +8,16 @@ PMC3  RETIRED_INSTRUCTIONS
 
 METRICS
 Runtime (RDTSC) [s] time
-Instruction cache misses  PMC1+PMC2
-Instruction cache request rate   PMC0/PMC3
-Instruction cache miss rate    (PMC1+PMC2)/PMC3
-Instruction cache miss ratio   (PMC1+PMC2)/PMC0
+L1I request rate   PMC0/PMC3
+L1I miss rate    (PMC1+PMC2)/PMC3
+L1I miss ratio   (PMC1+PMC2)/PMC0
 
 LONG
 Formulas:
-Instruction cache misses INSTRUCTION_CACHE_L2_REFILLS + INSTRUCTION_CACHE_SYSTEM_REFILLS
-Instruction cache request rate INSTRUCTION_CACHE_FETCHES / RETIRED_INSTRUCTIONS
-Instruction cache miss rate  (INSTRUCTION_CACHE_L2_REFILLS + INSTRUCTION_CACHE_SYSTEM_REFILLS)/RETIRED_INSTRUCTIONS
-Instruction cache miss ratio (INSTRUCTION_CACHE_L2_REFILLS + INSTRUCTION_CACHE_SYSTEM_REFILLS)/INSTRUCTION_CACHE_FETCHES
+L1I request rate = INSTRUCTION_CACHE_FETCHES / RETIRED_INSTRUCTIONS
+L1I miss rate = (INSTRUCTION_CACHE_L2_REFILLS + INSTRUCTION_CACHE_SYSTEM_REFILLS)/RETIRED_INSTRUCTIONS
+L1I miss ratio = (INSTRUCTION_CACHE_L2_REFILLS + INSTRUCTION_CACHE_SYSTEM_REFILLS)/INSTRUCTION_CACHE_FETCHES
 -
 This group measures the locality of your instruction code with regard to the
-L1 I-Cache. 
+L1 I-Cache.
 
diff --git a/groups/interlagos/L2CACHE.txt b/groups/interlagos/L2CACHE.txt
index 17209e8..3d264ee 100644
--- a/groups/interlagos/L2CACHE.txt
+++ b/groups/interlagos/L2CACHE.txt
@@ -7,15 +7,15 @@ PMC2  L2_CACHE_MISS_DC_FILL
 
 METRICS
 Runtime (RDTSC) [s] time
-L2 request rate   (PMC1)/PMC0
+L2 request rate   PMC1/PMC0
 L2 miss rate   PMC2/PMC0
-L2 miss ratio   PMC2/(PMC1)
+L2 miss ratio   PMC2/PMC1
 
 LONG
 Formulas:
-L2 request rate = (L2_REQUESTS_ALL)/INSTRUCTIONS_RETIRED
+L2 request rate = L2_REQUESTS_ALL/INSTRUCTIONS_RETIRED
 L2 miss rate  = L2_MISSES_ALL/INSTRUCTIONS_RETIRED
-L2 miss ratio = L2_MISSES_ALL/(L2_REQUESTS_ALL)
+L2 miss ratio = L2_MISSES_ALL/L2_REQUESTS_ALL
 -
 This group measures the locality of your data accesses with regard to the L2
 Cache. L2 request rate tells you how data intensive your code is or how many
diff --git a/groups/interlagos/L3.txt b/groups/interlagos/L3.txt
index c1a6f17..f47c771 100644
--- a/groups/interlagos/L3.txt
+++ b/groups/interlagos/L3.txt
@@ -7,16 +7,21 @@ PMC2  CPU_CLOCKS_UNHALTED
 
 METRICS
 Runtime (RDTSC) [s] time
-L3 bandwidth [MBytes/s]   1.0E-06*(PMC0+PMC1)*64.0/time
-L3 data volume [GBytes]    1.0E-09*(PMC0+PMC1)*64.0
-L3 refill bandwidth [MBytes/s]   1.0E-06*PMC0*64.0/time
-L3 evict [MBytes/s]    1.0E-06*PMC1*64.0/time
+L3 load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time
+L3 load data volume [GBytes]  1.0E-09*PMC0*64.0
+L3 evict bandwidth [MBytes/s]  1.0E-06*PMC1*64.0/time
+L3 evict data volume [GBytes]  1.0E-09*PMC1*64.0
+L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
 
 LONG
 Formulas:
-L3 bandwidth [MBytes/s]   1.0E-06*(L2_FILL_WB_FILL+L2_FILL_WB_WB)*64/time
-L3 data volume [GBytes]   1.0E-09*(L2_FILL_WB_FILL+L2_FILL_WB_WB)*64
-L3 refill bandwidth [MBytes/s]   1.0E-06*L2_FILL_WB_FILL*64/time
+L3 load bandwidth [MBytes/s] = 1.0E-06*L2_FILL_WB_FILL*64.0/time
+L3 load data volume [GBytes] = 1.0E-09*L2_FILL_WB_FILL*64.0
+L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_FILL_WB_WB*64.0/time
+L3 evict data volume [GBytes] = 1.0E-09*L2_FILL_WB_WB*64.0
+L3 bandwidth [MBytes/s] = 1.0E-06*(L2_FILL_WB_FILL+L2_FILL_WB_WB)*64/time
+L3 data volume [GBytes] = 1.0E-09*(L2_FILL_WB_FILL+L2_FILL_WB_WB)*64
 -
 Profiling group to measure L3 cache bandwidth. The bandwidth is
 computed by the number of cacheline loaded from L3 to L2 and the
diff --git a/groups/interlagos/L3CACHE.txt b/groups/interlagos/L3CACHE.txt
index 4bef1a7..a06962a 100644
--- a/groups/interlagos/L3CACHE.txt
+++ b/groups/interlagos/L3CACHE.txt
@@ -16,8 +16,8 @@ L3 average access latency [cycles]  UPMC2/UPMC3
 
 LONG
 Formulas:
-L3 request rate = (UNC_READ_REQ_TO_L3_ALL)/INSTRUCTIONS_RETIRED
-L3 miss rate  = UNC_L3_CACHE_MISS_ALL/INSTRUCTIONS_RETIRED
+L3 request rate = UNC_READ_REQ_TO_L3_ALL/INSTRUCTIONS_RETIRED
+L3 miss rate = UNC_L3_CACHE_MISS_ALL/INSTRUCTIONS_RETIRED
 L3 miss ratio = UNC_L3_CACHE_MISS_ALL/UNC_READ_REQ_TO_L3_ALL
 L3 average access latency =  UNC_L3_LATENCY_CYCLE_COUNT/UNC_L3_LATENCY_REQUEST_COUNT
 -
@@ -29,7 +29,7 @@ L3 miss ratio tells you how many of your memory references required a cacheline
 to be loaded from a higher level.  While the Data cache miss rate might be
 given by your algorithm you should try to get Data cache miss ratio as low as
 possible by increasing your cache reuse.  This group was inspired from the
-whitepaper -Basic Performance Measurements for AMD Athlon 64, AMD Opteron and
-AMD Phenom Processors- from Paul J. Drongowski.
+whitepaper - Basic Performance Measurements for AMD Athlon 64, AMD Opteron and
+AMD Phenom Processors - from Paul J. Drongowski.
 
 
diff --git a/groups/ivybridge/BRANCH.txt b/groups/ivybridge/BRANCH.txt
index cbaf834..09699d9 100644
--- a/groups/ivybridge/BRANCH.txt
+++ b/groups/ivybridge/BRANCH.txt
@@ -19,10 +19,10 @@ Instructions per branch  FIXC0/PMC0
 
 LONG
 Formulas:
-Branch rate = BR_INST_RETIRED_ALL_BRANCHES / INSTR_RETIRED_ANY
-Branch misprediction rate =  BR_MISP_RETIRED_ALL_BRANCHES / INSTR_RETIRED_ANY
-Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES / BR_INST_RETIRED_ALL_BRANCHES
-Instructions per branch = INSTR_RETIRED_ANY / BR_INST_RETIRED_ALL_BRANCHES
+Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction rate =  BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES
+Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES
 -
 The rates state how often in average a branch or a mispredicted branch occured
 per instruction retired in total. The Branch misprediction ratio sets directly
diff --git a/groups/ivybridge/DATA.txt b/groups/ivybridge/DATA.txt
index 5f04a23..1220980 100644
--- a/groups/ivybridge/DATA.txt
+++ b/groups/ivybridge/DATA.txt
@@ -12,11 +12,11 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-Load to Store ratio PMC0/PMC1
+Load to store ratio PMC0/PMC1
 
 LONG
 Formulas:
-Load to Store ratio = MEM_UOP_RETIRED_LOADS / MEM_UOP_RETIRED_STORES
+Load to store ratio = MEM_UOP_RETIRED_LOADS/MEM_UOP_RETIRED_STORES
 -
 This is a metric to determine your load to store ratio.
 
diff --git a/groups/ivybridge/ENERGY.txt b/groups/ivybridge/ENERGY.txt
index 3f70077..541c3ad 100644
--- a/groups/ivybridge/ENERGY.txt
+++ b/groups/ivybridge/ENERGY.txt
@@ -7,6 +7,7 @@ FIXC2 CPU_CLK_UNHALTED_REF
 TMP0  TEMP_CORE
 PWR0  PWR_PKG_ENERGY
 PWR1  PWR_PP0_ENERGY
+PWR2  PWR_PP1_ENERGY
 PWR3  PWR_DRAM_ENERGY
 
 METRICS
@@ -19,15 +20,18 @@ Energy [J]  PWR0
 Power [W] PWR0/time
 Energy PP0 [J]  PWR1
 Power PP0 [W] PWR1/time
+Energy PP1 [J]  PWR2
+Power PP1 [W] PWR2/time
 Energy DRAM [J]  PWR3
 Power DRAM [W] PWR3/time
 
 LONG
 Formula:
-Power =  PWR_PKG_ENERGY / time
-Power PP0 [W] PWR1/time
+Power = PWR_PKG_ENERGY / time
+Power PP0 = PWR_PP0_ENERGY / time
+Power PP1 = PWR_PP1_ENERGY / time
 Power DRAM = PWR_DRAM_ENERGY / time
 -
 IvyBridge implements the new RAPL interface. This interface enables to
-monitor the consumed energy on the package (socket) and DRAM level.
-
+monitor the consumed energy on the package (socket), the PP0 and PP1 domains
+and DRAM level. The PP0 domain often refers to only the CPU cores.
diff --git a/groups/ivybridge/FLOPS_AVX.txt b/groups/ivybridge/FLOPS_AVX.txt
index e8074c1..ce7a043 100644
--- a/groups/ivybridge/FLOPS_AVX.txt
+++ b/groups/ivybridge/FLOPS_AVX.txt
@@ -12,14 +12,14 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-32b packed SP MFlops/s  1.0E-06*(PMC0*8.0)/time
-32b packed DP MFlops/s  1.0E-06*(PMC1*4.0)/time
+Packed SP MFlops/s  1.0E-06*(PMC0*8.0)/time
+Packed DP MFlops/s  1.0E-06*(PMC1*4.0)/time
 
 LONG
 Formula:
-SP MFlops/s =  (SIMD_FP_256_PACKED_SINGLE*8)/ runtime
-DP MFlops/s =  (SIMD_FP_256_PACKED_DOUBLE*4)/ runtime
+Packed SP MFlops/s = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
+Packed DP MFlops/s = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
 -
-AVX flops rates. Please note that the current flop measurements on IvyBridge are
+Packed 32b AVX flops rates. Please note that the current flop measurements on IvyBridge are
 potentially wrong. So you cannot trust these counters at the moment!
 
diff --git a/groups/ivybridge/FLOPS_DP.txt b/groups/ivybridge/FLOPS_DP.txt
index 1e47b50..efbca6e 100644
--- a/groups/ivybridge/FLOPS_DP.txt
+++ b/groups/ivybridge/FLOPS_DP.txt
@@ -20,8 +20,10 @@ Scalar MUOPS/s 1.0E-06*PMC1/time
 
 LONG
 Formula:
-MFlops/s =  (FP_COMP_OPS_EXE_SSE_FP_PACKED*2 +  FP_COMP_OPS_EXE_SSE_FP_SCALAR)/ runtime
-AVX MFlops/s =  (SIMD_FP_256_PACKED_DOUBLE*4)/ runtime
+MFlops/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE+SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+AVX MFlops/s = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE/runtime
 -
 SSE scalar and packed double precision flop rates. Please note that the current flop measurements on SandyBridge are
 potentially wrong. So you cannot trust these counters at the moment!
diff --git a/groups/ivybridge/FLOPS_SP.txt b/groups/ivybridge/FLOPS_SP.txt
index 0be0721..fc18b95 100644
--- a/groups/ivybridge/FLOPS_SP.txt
+++ b/groups/ivybridge/FLOPS_SP.txt
@@ -13,15 +13,17 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-MFlops/s  1.0E-06*(PMC0*4.0+PMC1)/time
-32b AVX MFlops/s  1.0E-06*(PMC2*8.0)/time
+MFlops/s  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
+AVX MFlops/s  1.0E-06*(PMC2*8.0)/time
 Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
 Scalar MUOPS/s 1.0E-06*PMC1/time
 
 LONG
 Formula:
-MFlops/s =  (FP_COMP_OPS_EXE_SSE_FP_PACKED*4 +  FP_COMP_OPS_EXE_SSE_FP_SCALAR)/ runtime
-AVX MFlops/s =  (FP_256_PACKED_SINGLE*8)/ runtime
+MFlops/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE+SIMD_FP_256_PACKED_SINGLE*8)/runtime
+AVX MFlops/s = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE/runtime
 -
 SSE scalar and packed single precision flop rates. Please note that the current
 flop measurements on IvyBridge are potentially wrong. So you cannot trust
diff --git a/groups/ivybridge/ICACHE.txt b/groups/ivybridge/ICACHE.txt
index 6ce3ce8..5f11ad6 100644
--- a/groups/ivybridge/ICACHE.txt
+++ b/groups/ivybridge/ICACHE.txt
@@ -18,8 +18,8 @@ L1I miss ratio PMC1/PMC0
 
 LONG
 Formulas:
-L2 request rate = ICACHE_ACCESSES / INSTR_RETIRED_ANY
-L2 miss rate  = ICACHE_MISSES / INSTR_RETIRED_ANY
-L2 miss ratio = ICACHE_MISSES / ICACHE_ACCESSES
+L1I request rate = ICACHE_ACCESSES / INSTR_RETIRED_ANY
+L1I miss rate = ICACHE_MISSES / INSTR_RETIRED_ANY
+L1I miss ratio = ICACHE_MISSES / ICACHE_ACCESSES
 -
 This group measures some L1 instruction cache metrics.
diff --git a/groups/ivybridge/L2.txt b/groups/ivybridge/L2.txt
index 5345b7a..a97ca77 100644
--- a/groups/ivybridge/L2.txt
+++ b/groups/ivybridge/L2.txt
@@ -6,27 +6,33 @@ FIXC1 CPU_CLK_UNHALTED_CORE
 FIXC2 CPU_CLK_UNHALTED_REF
 PMC0  L1D_REPLACEMENT
 PMC1  L1D_M_EVICT
+PMC2  ICACHE_MISSES
 
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-L2 Load [MBytes/s] 1.0E-06*PMC0*64.0/time
-L2 Evict [MBytes/s] 1.0E-06*PMC1*64.0/time
-L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
-L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+L2D load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time
+L2D load data volume [GBytes]  1.0E-09*PMC0*64.0
+L2D evict bandwidth [MBytes/s]  1.0E-06*PMC1*64.0/time
+L2D evict data volume [GBytes]  1.0E-09*PMC1*64.0
+L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time
+L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0
 
 LONG
 Formulas:
-L2 Load [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64/time
-L2 Evict [MBytes/s] = 1.0E-06*L1D_M_EVICT*64/time
-L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT)*64/time
-L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT)*64
+L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64.0/time
+L2D load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64.0
+L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64.0/time
+L2D evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64.0
+L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT+ICACHE_MISSES)*64/time
+L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT+ICACHE_MISSES)*64
 -
 Profiling group to measure L2 cache bandwidth. The bandwidth is computed by the
 number of cacheline allocated in the L1 and the number of modified cachelines
-evicted from the L1. The group also output total data volume transfered between
+evicted from the L1. The group also outputs total data volume transferred between
 L2 and L1. Note that this bandwidth also includes data transfers due to a write
-allocate load on a store miss in L1.
+allocate load on a store miss in L1 and cachelines transferred to the instruction
+cache.
 
diff --git a/groups/ivybridge/L2CACHE.txt b/groups/ivybridge/L2CACHE.txt
index 3d7c36e..8aa6522 100644
--- a/groups/ivybridge/L2CACHE.txt
+++ b/groups/ivybridge/L2CACHE.txt
@@ -18,9 +18,9 @@ L2 miss ratio PMC1/PMC0
 
 LONG
 Formulas:
-L2 request rate = L2_TRANS_ALL_REQUESTS / INSTR_RETIRED_ANY
-L2 miss rate  = L2_RQSTS_MISS / INSTR_RETIRED_ANY
-L2 miss ratio = L2_RQSTS_MISS / L2_TRANS_ALL_REQUESTS
+L2 request rate = L2_TRANS_ALL_REQUESTS/INSTR_RETIRED_ANY
+L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY
+L2 miss ratio = L2_RQSTS_MISS/L2_TRANS_ALL_REQUESTS
 -
 This group measures the locality of your data accesses with regard to the
 L2 Cache. L2 request rate tells you how data intensive your code is
@@ -30,6 +30,5 @@ cachelines from memory. And finally L2 miss ratio tells you how many of your
 memory references required a cacheline to be loaded from a higher level.
 While the Data cache miss rate might be given by your algorithm you should
 try to get Data cache miss ratio as low as possible by increasing your cache reuse.
-Note: This group might need to be revised!
 
 
diff --git a/groups/ivybridge/L3.txt b/groups/ivybridge/L3.txt
index 9a7c914..fcb3d73 100644
--- a/groups/ivybridge/L3.txt
+++ b/groups/ivybridge/L3.txt
@@ -12,17 +12,21 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-L3 Load [MBytes/s]  1.0E-06*PMC0*64.0/time
-L3 Evict [MBytes/s]  1.0E-06*PMC1*64.0/time
+L3 load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time
+L3 load data volume [GBytes]  1.0E-09*PMC0*64.0
+L3 evict bandwidth [MBytes/s]  1.0E-06*PMC1*64.0/time
+L3 evict data volume [GBytes]  1.0E-09*PMC1*64.0
 L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
 L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
 
 LONG
 Formulas:
-L3 Load [MBytes/s]  1.0E-06*L2_LINES_IN_ALL*64/time
-L3 Evict [MBytes/s]  1.0E-06*L2_LINES_OUT_DIRTY_ALL*64/time
-L3 bandwidth [MBytes/s] 1.0E-06*(L2_LINES_IN_ALL+L2_LINES_OUT_DIRTY_ALL)*64/time
-L3 data volume [GBytes] 1.0E-09*(L2_LINES_IN_ALL+L2_LINES_OUT_DIRTY_ALL)*64
+L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64.0/time
+L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64.0
+L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_LINES_OUT_DIRTY_ALL*64.0/time
+L3 evict data volume [GBytes] = 1.0E-09*L2_LINES_OUT_DIRTY_ALL*64.0
+L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_LINES_OUT_DIRTY_ALL)*64/time
+L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_LINES_OUT_DIRTY_ALL)*64
 -
 Profiling group to measure L3 cache bandwidth. The bandwidth is computed by the
 number of cacheline allocated in the L2 and the number of modified cachelines
diff --git a/groups/ivybridge/L3CACHE.txt b/groups/ivybridge/L3CACHE.txt
index d4fd89e..9b05004 100644
--- a/groups/ivybridge/L3CACHE.txt
+++ b/groups/ivybridge/L3CACHE.txt
@@ -6,21 +6,23 @@ FIXC1 CPU_CLK_UNHALTED_CORE
 FIXC2 CPU_CLK_UNHALTED_REF
 PMC0  MEM_LOAD_UOPS_RETIRED_L3_ALL
 PMC1  MEM_LOAD_UOPS_RETIRED_L3_MISS
+PMC2  UOPS_RETIRED_ALL
+
 
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-L3 request rate (PMC0)/FIXC0
-L3 miss rate PMC1/FIXC0
+L3 request rate (PMC0)/PMC2
+L3 miss rate PMC1/PMC2
 L3 miss ratio PMC1/PMC0
 
 LONG
 Formulas:
-L3 request rate = MEM_LOAD_UOPS_RETIRED_L3_ALL / INSTR_RETIRED_ANY
-L3 miss rate  = MEM_LOAD_UOPS_RETIRED_L3_MISS / INSTR_RETIRED_ANY
-L3 miss ratio = MEM_LOAD_UOPS_RETIRED_L3_MISS / MEM_LOAD_UOPS_RETIRED_L3_ALL
+L3 request rate = MEM_LOAD_UOPS_RETIRED_L3_ALL/UOPS_RETIRED_ALL
+L3 miss rate = MEM_LOAD_UOPS_RETIRED_L3_MISS/UOPS_RETIRED_ALL
+L3 miss ratio = MEM_LOAD_UOPS_RETIRED_L3_MISS/MEM_LOAD_UOPS_RETIRED_L3_ALL
 -
 This group measures the locality of your data accesses with regard to the
 L3 Cache. L3 request rate tells you how data intensive your code is
@@ -30,6 +32,5 @@ cachelines from memory. And finally L3 miss ratio tells you how many of your
 memory references required a cacheline to be loaded from a higher level.
 While the Data cache miss rate might be given by your algorithm you should
 try to get Data cache miss ratio as low as possible by increasing your cache reuse.
-Note: This group might need to be revised!
 
 
diff --git a/groups/ivybridge/MEM.txt b/groups/ivybridge/MEM.txt
deleted file mode 100644
index 1f9ff4a..0000000
--- a/groups/ivybridge/MEM.txt
+++ /dev/null
@@ -1,32 +0,0 @@
-SHORT Main memory bandwidth in MBytes/s
-
-EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
-MBOX0C0 CAS_COUNT_RD
-MBOX0C1 CAS_COUNT_WR
-MBOX1C0 CAS_COUNT_RD
-MBOX1C1 CAS_COUNT_WR
-MBOX2C0 CAS_COUNT_RD
-MBOX2C1 CAS_COUNT_WR
-MBOX3C0 CAS_COUNT_RD
-MBOX3C1 CAS_COUNT_WR
-
-METRICS
-Runtime (RDTSC) [s] time
-Runtime unhalted [s] FIXC1*inverseClock
-Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
-CPI  FIXC1/FIXC0
-Memory Read BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0/time
-Memory Write BW [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time
-Memory BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX0C1+MBOX1C0+MBOX1C1+MBOX2C0+MBOX2C1+MBOX3C0+MBOX3C1)*64.0/time
-Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX0C1+MBOX1C0+MBOX1C1+MBOX2C0+MBOX2C1+MBOX3C0+MBOX3C1)*64.0
-
-LONG
-Profiling group to measure main memory bandwidth drawn by all cores of
-a socket.  Since this group is based on uncore events it is only possible to
-measure on the granularity of a socket.  If a thread group contains multiple
-threads only one thread per socket will show the results.  Also outputs total
-data volume transfered from main memory.
-
diff --git a/groups/ivybridge/MEM_DP.txt b/groups/ivybridge/MEM_DP.txt
deleted file mode 100644
index 7bc76cd..0000000
--- a/groups/ivybridge/MEM_DP.txt
+++ /dev/null
@@ -1,57 +0,0 @@
-SHORT Power and Energy consumption
-
-EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
-TMP0  TEMP_CORE
-PWR0  PWR_PKG_ENERGY
-PWR3  PWR_DRAM_ENERGY
-PMC0  FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE
-PMC1  FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE
-PMC2  SIMD_FP_256_PACKED_DOUBLE
-MBOX0C0 CAS_COUNT_RD
-MBOX0C1 CAS_COUNT_WR
-MBOX1C0 CAS_COUNT_RD
-MBOX1C1 CAS_COUNT_WR
-MBOX2C0 CAS_COUNT_RD
-MBOX2C1 CAS_COUNT_WR
-MBOX3C0 CAS_COUNT_RD
-MBOX3C1 CAS_COUNT_WR
-
-METRICS
-Runtime (RDTSC) [s] time
-Runtime unhalted [s] FIXC1*inverseClock
-Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
-CPI  FIXC1/FIXC0
-Temperature TMP0
-Energy [J]  PWR0
-Power [W] PWR0/time
-Energy DRAM [J]  PWR3
-Power DRAM [W] PWR3/time
-AVX MFlops/s  1.0E-06*(4.0*PMC2)/time
-MFlops/s  1.0E-06*(PMC0*2.0+PMC1)/time
-Packed MUOPS/s   1.0E-06*PMC0/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
-Memory Read BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0/time
-Memory Write BW [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time
-Memory BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX0C1+MBOX1C0+MBOX1C1+MBOX2C0+MBOX2C1+MBOX3C0+MBOX3C1)*64.0/time
-Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX0C1+MBOX1C0+MBOX1C1+MBOX2C0+MBOX2C1+MBOX3C0+MBOX3C1)*64.0
-
-LONG
-Formula:
-Power =  PWR_PKG_ENERGY / runtime
-Power DRAM = PWR_DRAM_ENERGY / runtime
-MFlops/s =  (FP_COMP_OPS_EXE_SSE_FP_PACKED*2 +  FP_COMP_OPS_EXE_SSE_FP_SCALAR)/ runtime
-AVX MFlops/s =  (SIMD_FP_256_PACKED_DOUBLE*4)/ runtime
-Memory Read BW [MBytes/s] 1.0E-06*(CAS_COUNT_RD+CAS_COUNT_RD+CAS_COUNT_RD+CAS_COUNT_RD)*64.0/time
-Memory Write BW [MBytes/s] 1.0E-06*(CAS_COUNT_WR+CAS_COUNT_WR+CAS_COUNT_WR+CAS_COUNT_WR)*64.0/time
---
-Profiling group to measure memory bandwidth drawn by all cores of a socket.
-Since this group is based on uncore events it is only possible to measure on
-a per socket base. Also outputs total data volume transfered from main memory,
-SSE scalar and packed double precision flop rates as well as consumed energy and 
-temperature. Also reports on packed AVX 32b instructions.  Please note that the 
-current flop measurements on IvyBridge are potentially wrong. So you cannot trust 
-these counters at the moment!
-
diff --git a/groups/ivybridge/MEM_SP.txt b/groups/ivybridge/MEM_SP.txt
deleted file mode 100644
index 4388cc4..0000000
--- a/groups/ivybridge/MEM_SP.txt
+++ /dev/null
@@ -1,57 +0,0 @@
-SHORT Power and Energy consumption
-
-EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
-TMP0  TEMP_CORE
-PWR0  PWR_PKG_ENERGY
-PWR3  PWR_DRAM_ENERGY
-PMC0  FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE
-PMC1  FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE
-PMC2  SIMD_FP_256_PACKED_SINGLE
-MBOX0C0 CAS_COUNT_RD
-MBOX0C1 CAS_COUNT_WR
-MBOX1C0 CAS_COUNT_RD
-MBOX1C1 CAS_COUNT_WR
-MBOX2C0 CAS_COUNT_RD
-MBOX2C1 CAS_COUNT_WR
-MBOX3C0 CAS_COUNT_RD
-MBOX3C1 CAS_COUNT_WR
-
-METRICS
-Runtime (RDTSC) [s] time
-Runtime unhalted [s] FIXC1*inverseClock
-Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
-CPI  FIXC1/FIXC0
-Temperature TMP0
-Energy [J]  PWR0
-Power [W] PWR0/time
-Energy DRAM [J]  PWR3
-Power DRAM [W] PWR3/time
-AVX MFlops/s  1.0E-06*(8.0*PMC2)/time
-MFlops/s  1.0E-06*(PMC0*4.0+PMC1)/time
-Packed MUOPS/s   1.0E-06*PMC0/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
-Memory Read BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0/time
-Memory Write BW [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time
-Memory BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX0C1+MBOX1C0+MBOX1C1+MBOX2C0+MBOX2C1+MBOX3C0+MBOX3C1)*64.0/time
-Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX0C1+MBOX1C0+MBOX1C1+MBOX2C0+MBOX2C1+MBOX3C0+MBOX3C1)*64.0
-
-LONG
-Formula:
-Power =  PWR_PKG_ENERGY / runtime
-Power DRAM = PWR_DRAM_ENERGY / runtime
-MFlops/s =  (FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE * 4 +  FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE) / runtime
-AVX MFlops/s =  (SIMD_FP_256_PACKED_SINGLE * 8) / runtime
-Memory Read BW [MBytes/s] 1.0E-06*(CAS_COUNT_RD+CAS_COUNT_RD+CAS_COUNT_RD+CAS_COUNT_RD)*64.0/time
-Memory Write BW [MBytes/s] 1.0E-06*(CAS_COUNT_WR+CAS_COUNT_WR+CAS_COUNT_WR+CAS_COUNT_WR)*64.0/time
---
-Profiling group to measure memory bandwidth drawn by all cores of a socket.
-Since this group is based on uncore events it is only possible to measure on
-a per socket base. Also outputs total data volume transfered from main memory.
-SSE scalar and packed single precision flop rates as well as consumed energy and 
-temperature. Also reports on packed AVX 32b instructions. Please note that the 
-current flop measurements on SandyBridge are potentially wrong. So you cannot 
-trust these counters at the moment!
-
diff --git a/groups/ivybridge/TLB_DATA.txt b/groups/ivybridge/TLB_DATA.txt
index 2f59772..5e54147 100644
--- a/groups/ivybridge/TLB_DATA.txt
+++ b/groups/ivybridge/TLB_DATA.txt
@@ -16,19 +16,19 @@ Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
 L1 DTLB load misses     PMC0
 L1 DTLB load miss rate  PMC0/FIXC0
-L1 DTLB load miss duration PMC2
+L1 DTLB load miss duration [Cyc] PMC2/PMC0
 L1 DTLB store misses     PMC1
 L1 DTLB store miss rate  PMC1/FIXC0
-L1 DTLB store miss duration PMC3
+L1 DTLB store miss duration [Cyc] PMC3/PMC1
 
 LONG
 Formulas:
-L1 DTLB load misses     DTLB_LOAD_MISSES_CAUSES_A_WALK
-L1 DTLB load miss rate  DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
-L1 DTLB load miss duration DTLB_LOAD_MISSES_WALK_DURATION
-L1 DTLB store misses     DTLB_STORE_MISSES_CAUSES_A_WALK
-L1 DTLB store miss rate  DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
-L1 DTLB store miss duration DTLB_STORE_MISSES_WALK_DURATION
+L1 DTLB load misses = DTLB_LOAD_MISSES_CAUSES_A_WALK
+L1 DTLB load miss rate = DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB load miss duration [Cyc] = DTLB_LOAD_MISSES_WALK_DURATION / DTLB_LOAD_MISSES_CAUSES_A_WALK
+L1 DTLB store misses = DTLB_STORE_MISSES_CAUSES_A_WALK
+L1 DTLB store miss rate = DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB store miss duration [Cyc] = DTLB_STORE_MISSES_WALK_DURATION / DTLB_STORE_MISSES_CAUSES_A_WALK
 -
 The DTLB load and store miss rates gives a measure how often a TLB miss occured
 per instruction. The duration measures the time in cycles how long a walk did take.
diff --git a/groups/ivybridge/TLB_INSTR.txt b/groups/ivybridge/TLB_INSTR.txt
index f95f78a..8faaebe 100644
--- a/groups/ivybridge/TLB_INSTR.txt
+++ b/groups/ivybridge/TLB_INSTR.txt
@@ -14,14 +14,14 @@ Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
 L1 ITLB misses     PMC0
 L1 ITLB miss rate  PMC0/FIXC0
-L1 ITLB miss duration PMC1
+L1 ITLB miss duration [Cyc] PMC1/PMC0
 
 
 LONG
 Formulas:
-L1 ITLB misses     ITLB_MISSES_CAUSES_A_WALK
-L1 ITLB miss rate  ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
-L1 ITLB miss duration ITLB_MISSES_WALK_DURATION
+L1 ITLB misses = ITLB_MISSES_CAUSES_A_WALK
+L1 ITLB miss rate = ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 ITLB miss duration [Cyc] = ITLB_MISSES_WALK_DURATION / ITLB_MISSES_CAUSES_A_WALK
 -
 The ITLB miss rates gives a measure how often a TLB miss occured
 per instruction. The duration measures the time in cycles how long a walk did take.
diff --git a/groups/ivybridgeEP/BRANCH.txt b/groups/ivybridgeEP/BRANCH.txt
new file mode 100644
index 0000000..09699d9
--- /dev/null
+++ b/groups/ivybridgeEP/BRANCH.txt
@@ -0,0 +1,31 @@
+SHORT Branch prediction miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  BR_INST_RETIRED_ALL_BRANCHES
+PMC1  BR_MISP_RETIRED_ALL_BRANCHES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Branch rate   PMC0/FIXC0
+Branch misprediction rate  PMC1/FIXC0
+Branch misprediction ratio  PMC1/PMC0
+Instructions per branch  FIXC0/PMC0
+
+LONG
+Formulas:
+Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction rate =  BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES
+Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES
+-
+The rates state how often on average a branch or a mispredicted branch occurred
+per instruction retired in total. The Branch misprediction ratio directly
+expresses what fraction of all branch instructions were mispredicted.
+Instructions per branch is 1/Branch rate.
+
diff --git a/groups/ivybridgeEP/CACHES.txt b/groups/ivybridgeEP/CACHES.txt
new file mode 100644
index 0000000..4ac3b1a
--- /dev/null
+++ b/groups/ivybridgeEP/CACHES.txt
@@ -0,0 +1,55 @@
+SHORT Cache bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  L1D_REPLACEMENT
+PMC1  L1D_M_EVICT
+PMC2  L2_LINES_IN_ALL
+PMC3  L2_LINES_OUT_DIRTY_ALL
+CBOX0C0 LLC_VICTIMS_M_STATE
+CBOX1C0 LLC_VICTIMS_M_STATE
+CBOX2C0 LLC_VICTIMS_M_STATE
+CBOX3C0 LLC_VICTIMS_M_STATE
+CBOX4C0 LLC_VICTIMS_M_STATE
+CBOX5C0 LLC_VICTIMS_M_STATE
+CBOX6C0 LLC_VICTIMS_M_STATE
+CBOX7C0 LLC_VICTIMS_M_STATE
+CBOX8C0 LLC_VICTIMS_M_STATE
+CBOX9C0 LLC_VICTIMS_M_STATE
+CBOX10C0 LLC_VICTIMS_M_STATE
+CBOX11C0 LLC_VICTIMS_M_STATE
+CBOX12C0 LLC_VICTIMS_M_STATE
+CBOX13C0 LLC_VICTIMS_M_STATE
+CBOX14C0 LLC_VICTIMS_M_STATE
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1 to L2 Load [MBytes/s] 1.0E-06*PMC0*64.0/time
+L1 to L2 Evict [MBytes/s] 1.0E-06*PMC1*64.0/time
+L1 to L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+L1 to L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+L2 to L3 Load [MBytes/s] 1.0E-06*PMC2*64.0/time
+L2 to L3 Evict [MBytes/s] 1.0E-06*PMC3*64.0/time
+L2 to L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
+L2 to L3 data volume [GBytes] 1.0E-09*(PMC2+PMC3)*64.0
+L3 to Memory data volume [MBytes] 1.0E-06*(CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX14C0)*64
+
+LONG
+Formulas:
+L1 to L2 Load [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64/time
+L1 to L2 Evict [MBytes/s] = 1.0E-06*L1D_M_EVICT*64/time
+L1 to L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT)*64/time
+L1 to L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT)*64
+-
+Profiling group to measure L2 cache bandwidth. The bandwidth is computed by the
+number of cachelines allocated in the L1 and the number of modified cachelines
+evicted from the L1. The group also outputs total data volume transferred between
+L2 and L1. Note that this bandwidth also includes data transfers due to a write
+allocate load on a store miss in L1.
+
diff --git a/groups/ivybridgeEP/CBOX.txt b/groups/ivybridgeEP/CBOX.txt
new file mode 100644
index 0000000..9201768
--- /dev/null
+++ b/groups/ivybridgeEP/CBOX.txt
@@ -0,0 +1,55 @@
+SHORT CBOX related data and metrics
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+CBOX0C0 LLC_VICTIMS_M_STATE
+CBOX1C0 LLC_VICTIMS_M_STATE
+CBOX2C0 LLC_VICTIMS_M_STATE
+CBOX3C0 LLC_VICTIMS_M_STATE
+CBOX4C0 LLC_VICTIMS_M_STATE
+CBOX5C0 LLC_VICTIMS_M_STATE
+CBOX6C0 LLC_VICTIMS_M_STATE
+CBOX7C0 LLC_VICTIMS_M_STATE
+CBOX8C0 LLC_VICTIMS_M_STATE
+CBOX9C0 LLC_VICTIMS_M_STATE
+CBOX10C0 LLC_VICTIMS_M_STATE
+CBOX11C0 LLC_VICTIMS_M_STATE
+CBOX12C0 LLC_VICTIMS_M_STATE
+CBOX13C0 LLC_VICTIMS_M_STATE
+CBOX14C0 LLC_VICTIMS_M_STATE
+CBOX0C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX1C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX2C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX3C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX4C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX5C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX6C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX7C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX8C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX9C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX10C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX11C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX12C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX13C1:STATE=0x1 LLC_LOOKUP_ANY
+CBOX14C1:STATE=0x1 LLC_LOOKUP_ANY
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+LLC Misses Per Instruction (CBOX0C0+CBOX1C0+CBOX2C0+CBOX3C0+CBOX4C0+CBOX5C0+CBOX6C0+CBOX7C0+CBOX8C0+CBOX9C0+CBOX10C0+CBOX11C0+CBOX12C0+CBOX13C0+CBOX14C0)/FIXC0
+LLC data written to MEM [MBytes] 1E-6*(CBOX0C1:STATE=0x1+CBOX1C1:STATE=0x1+CBOX2C1:STATE=0x1+CBOX3C1:STATE=0x1+CBOX4C1:STATE=0x1+CBOX5C1:STATE=0x1+CBOX6C1:STATE=0x1+CBOX7C1:STATE=0x1+CBOX8C1:STATE=0x1+CBOX9C1:STATE=0x1+CBOX10C1:STATE=0x1+CBOX11C1:STATE=0x1+CBOX12C1:STATE=0x1+CBOX13C1:STATE=0x1+CBOX14C1:STATE=0x1)*64
+
+
+LONG
+Formulas:
+LLC Misses Per Instruction = sum(LLC_VICTIMS_M_STATE)/INSTR_RETIRED_ANY
+LLC data written to MEM [MBytes] = sum(LLC_LOOKUP_ANY)*64*1E-6
+--
+The CBOXes mediate the traffic from the L2 cache to the segmented L3 cache. Each
+CBOX is responsible for one segment (2.5 MByte). The boxes maintain the coherence between all
+CPU cores of the socket. Depending on the CPU core count, some CBOXes are not attached
+to a 2.5 MByte slice but are still active and track the traffic.
diff --git a/groups/ivybridgeEP/CLOCK.txt b/groups/ivybridgeEP/CLOCK.txt
new file mode 100644
index 0000000..80891d4
--- /dev/null
+++ b/groups/ivybridgeEP/CLOCK.txt
@@ -0,0 +1,23 @@
+SHORT Power and Energy consumption
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0  PWR_PKG_ENERGY
+
+METRICS
+Runtime (RDTSC) [s] time 
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Energy [J]  PWR0
+Power [W] PWR0/time
+
+LONG
+Formula:
+Power =  PWR_PKG_ENERGY / time
+-
+IvyBridge implements the new RAPL interface. This interface enables to
+monitor the consumed energy on the package (socket) level.
+
diff --git a/groups/ivybridgeEP/DATA.txt b/groups/ivybridgeEP/DATA.txt
new file mode 100644
index 0000000..1220980
--- /dev/null
+++ b/groups/ivybridgeEP/DATA.txt
@@ -0,0 +1,22 @@
+SHORT Load to store ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  MEM_UOP_RETIRED_LOADS
+PMC1  MEM_UOP_RETIRED_STORES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Load to store ratio PMC0/PMC1
+
+LONG
+Formulas:
+Load to store ratio = MEM_UOP_RETIRED_LOADS/MEM_UOP_RETIRED_STORES
+-
+This is a metric to determine your load to store ratio.
+
diff --git a/groups/ivybridgeEP/ENERGY.txt b/groups/ivybridgeEP/ENERGY.txt
new file mode 100644
index 0000000..07bc59c
--- /dev/null
+++ b/groups/ivybridgeEP/ENERGY.txt
@@ -0,0 +1,33 @@
+SHORT Power and Energy consumption
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+TMP0  TEMP_CORE
+PWR0  PWR_PKG_ENERGY
+PWR1  PWR_PP0_ENERGY
+PWR3  PWR_DRAM_ENERGY
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Temperature [C]  TMP0
+Energy [J]  PWR0
+Power [W] PWR0/time
+Energy PP0 [J]  PWR1
+Power PP0 [W] PWR1/time
+Energy DRAM [J]  PWR3
+Power DRAM [W] PWR3/time
+
+LONG
+Formula:
+Power = PWR_PKG_ENERGY / time
+Power PP0 = PWR_PP0_ENERGY / time
+Power DRAM = PWR_DRAM_ENERGY / time
+-
+IvyBridge implements the new RAPL interface. This interface enables to
+monitor the consumed energy on the package (socket), the PP0 domain
+and DRAM level. The PP0 domain often refers to only the CPU cores.
diff --git a/groups/ivybridgeEP/FLOPS_AVX.txt b/groups/ivybridgeEP/FLOPS_AVX.txt
new file mode 100644
index 0000000..ce7a043
--- /dev/null
+++ b/groups/ivybridgeEP/FLOPS_AVX.txt
@@ -0,0 +1,25 @@
+SHORT Packed AVX MFlops/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  SIMD_FP_256_PACKED_SINGLE
+PMC1  SIMD_FP_256_PACKED_DOUBLE
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Packed SP MFlops/s  1.0E-06*(PMC0*8.0)/time
+Packed DP MFlops/s  1.0E-06*(PMC1*4.0)/time
+
+LONG
+Formula:
+Packed SP MFlops/s = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
+Packed DP MFlops/s = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+-
+Packed 32b AVX flops rates. Please note that the current flop measurements on IvyBridge are
+potentially wrong. So you cannot trust these counters at the moment!
+
diff --git a/groups/ivybridgeEP/FLOPS_DP.txt b/groups/ivybridgeEP/FLOPS_DP.txt
new file mode 100644
index 0000000..efbca6e
--- /dev/null
+++ b/groups/ivybridgeEP/FLOPS_DP.txt
@@ -0,0 +1,30 @@
+SHORT Double Precision MFlops/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE
+PMC1  FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE
+PMC2  SIMD_FP_256_PACKED_DOUBLE
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+MFlops/s  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
+AVX MFlops/s  1.0E-06*(PMC2*4.0)/time
+Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+
+LONG
+Formula:
+MFlops/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+AVX MFlops/s = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE/runtime
+-
+SSE scalar and packed double precision flop rates. Please note that the current flop measurements on IvyBridge are
+potentially wrong. So you cannot trust these counters at the moment!
+
diff --git a/groups/ivybridgeEP/FLOPS_SP.txt b/groups/ivybridgeEP/FLOPS_SP.txt
new file mode 100644
index 0000000..fc18b95
--- /dev/null
+++ b/groups/ivybridgeEP/FLOPS_SP.txt
@@ -0,0 +1,31 @@
+SHORT Single Precision MFlops/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE
+PMC1  FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE
+PMC2  SIMD_FP_256_PACKED_SINGLE
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+MFlops/s  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
+AVX MFlops/s  1.0E-06*(PMC2*8.0)/time
+Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+
+LONG
+Formula:
+MFlops/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_SINGLE*8)/runtime
+AVX MFlops/s = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE/runtime
+-
+SSE scalar and packed single precision flop rates. Please note that the current
+flop measurements on IvyBridge are potentially wrong. So you cannot trust
+these counters at the moment!
+
diff --git a/groups/ivybridgeEP/ICACHE.txt b/groups/ivybridgeEP/ICACHE.txt
new file mode 100644
index 0000000..5f11ad6
--- /dev/null
+++ b/groups/ivybridgeEP/ICACHE.txt
@@ -0,0 +1,25 @@
+SHORT  Instruction cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  ICACHE_ACCESSES
+PMC1  ICACHE_MISSES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1I request rate PMC0/FIXC0
+L1I miss rate PMC1/FIXC0
+L1I miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L1I request rate = ICACHE_ACCESSES / INSTR_RETIRED_ANY
+L1I miss rate = ICACHE_MISSES / INSTR_RETIRED_ANY
+L1I miss ratio = ICACHE_MISSES / ICACHE_ACCESSES
+-
+This group measures some L1 instruction cache metrics.
diff --git a/groups/ivybridgeEP/L2.txt b/groups/ivybridgeEP/L2.txt
new file mode 100644
index 0000000..a97ca77
--- /dev/null
+++ b/groups/ivybridgeEP/L2.txt
@@ -0,0 +1,38 @@
+SHORT L2 cache bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  L1D_REPLACEMENT
+PMC1  L1D_M_EVICT
+PMC2  ICACHE_MISSES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L2D load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time
+L2D load data volume [GBytes]  1.0E-09*PMC0*64.0
+L2D evict bandwidth [MBytes/s]  1.0E-06*PMC1*64.0/time
+L2D evict data volume [GBytes]  1.0E-09*PMC1*64.0
+L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time
+L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0
+
+LONG
+Formulas:
+L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64.0/time
+L2D load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64.0
+L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64.0/time
+L2D evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64.0
+L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT+ICACHE_MISSES)*64/time
+L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT+ICACHE_MISSES)*64
+-
+Profiling group to measure L2 cache bandwidth. The bandwidth is computed by the
+number of cachelines allocated in the L1 and the number of modified cachelines
+evicted from the L1. The group also outputs total data volume transferred between
+L2 and L1. Note that this bandwidth also includes data transfers due to a write
+allocate load on a store miss in L1 and cachelines transferred to the instruction
+cache.
+
diff --git a/groups/ivybridgeEP/L2CACHE.txt b/groups/ivybridgeEP/L2CACHE.txt
new file mode 100644
index 0000000..8aa6522
--- /dev/null
+++ b/groups/ivybridgeEP/L2CACHE.txt
@@ -0,0 +1,34 @@
+SHORT L2 cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  L2_TRANS_ALL_REQUESTS
+PMC1  L2_RQSTS_MISS
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L2 request rate PMC0/FIXC0
+L2 miss rate PMC1/FIXC0
+L2 miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L2 request rate = L2_TRANS_ALL_REQUESTS/INSTR_RETIRED_ANY
+L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY
+L2 miss ratio = L2_RQSTS_MISS/L2_TRANS_ALL_REQUESTS
+-
+This group measures the locality of your data accesses with regard to the
+L2 Cache. L2 request rate tells you how data intensive your code is
+or how many Data accesses you have on average per instruction.
+The L2 miss rate gives a measure how often it was necessary to get
+cachelines from memory. And finally L2 miss ratio tells you how many of your
+memory references required a cacheline to be loaded from a higher level.
+While the Data cache miss rate might be given by your algorithm you should
+try to get Data cache miss ratio as low as possible by increasing your cache reuse.
+
+
diff --git a/groups/ivybridgeEP/L3.txt b/groups/ivybridgeEP/L3.txt
new file mode 100644
index 0000000..fcb3d73
--- /dev/null
+++ b/groups/ivybridgeEP/L3.txt
@@ -0,0 +1,36 @@
+SHORT  L3 cache bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  L2_LINES_IN_ALL
+PMC1  L2_LINES_OUT_DIRTY_ALL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L3 load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time
+L3 load data volume [GBytes]  1.0E-09*PMC0*64.0
+L3 evict bandwidth [MBytes/s]  1.0E-06*PMC1*64.0/time
+L3 evict data volume [GBytes]  1.0E-09*PMC1*64.0
+L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+
+LONG
+Formulas:
+L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64.0/time
+L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64.0
+L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_LINES_OUT_DIRTY_ALL*64.0/time
+L3 evict data volume [GBytes] = 1.0E-09*L2_LINES_OUT_DIRTY_ALL*64.0
+L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_LINES_OUT_DIRTY_ALL)*64/time
+L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_LINES_OUT_DIRTY_ALL)*64
+-
+Profiling group to measure L3 cache bandwidth. The bandwidth is computed by the
+number of cachelines allocated in the L2 and the number of modified cachelines
+evicted from the L2. This group also outputs data volume transferred between the
+L3 and the measured cores' L2 caches. Note that this bandwidth also includes data
+transfers due to a write allocate load on a store miss in L2.
+
diff --git a/groups/ivybridgeEP/L3CACHE.txt b/groups/ivybridgeEP/L3CACHE.txt
new file mode 100644
index 0000000..9b05004
--- /dev/null
+++ b/groups/ivybridgeEP/L3CACHE.txt
@@ -0,0 +1,36 @@
+SHORT L3 cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  MEM_LOAD_UOPS_RETIRED_L3_ALL
+PMC1  MEM_LOAD_UOPS_RETIRED_L3_MISS
+PMC2  UOPS_RETIRED_ALL
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L3 request rate (PMC0)/PMC2
+L3 miss rate PMC1/PMC2
+L3 miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L3 request rate = MEM_LOAD_UOPS_RETIRED_L3_ALL/UOPS_RETIRED_ALL
+L3 miss rate = MEM_LOAD_UOPS_RETIRED_L3_MISS/UOPS_RETIRED_ALL
+L3 miss ratio = MEM_LOAD_UOPS_RETIRED_L3_MISS/MEM_LOAD_UOPS_RETIRED_L3_ALL
+-
+This group measures the locality of your data accesses with regard to the
+L3 Cache. L3 request rate tells you how data intensive your code is
+or how many Data accesses you have on average per instruction.
+The L3 miss rate gives a measure how often it was necessary to get
+cachelines from memory. And finally L3 miss ratio tells you how many of your
+memory references required a cacheline to be loaded from a higher level.
+While the Data cache miss rate might be given by your algorithm you should
+try to get Data cache miss ratio as low as possible by increasing your cache reuse.
+
+
diff --git a/groups/ivybridgeEP/MEM.txt b/groups/ivybridgeEP/MEM.txt
new file mode 100644
index 0000000..0e5c56c
--- /dev/null
+++ b/groups/ivybridgeEP/MEM.txt
@@ -0,0 +1,49 @@
+SHORT Main memory bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+MBOX0C0 CAS_COUNT_RD
+MBOX0C1 CAS_COUNT_WR
+MBOX1C0 CAS_COUNT_RD
+MBOX1C1 CAS_COUNT_WR
+MBOX2C0 CAS_COUNT_RD
+MBOX2C1 CAS_COUNT_WR
+MBOX3C0 CAS_COUNT_RD
+MBOX3C1 CAS_COUNT_WR
+MBOX4C0 CAS_COUNT_RD
+MBOX4C1 CAS_COUNT_WR
+MBOX5C0 CAS_COUNT_RD
+MBOX5C1 CAS_COUNT_WR
+MBOX6C0 CAS_COUNT_RD
+MBOX6C1 CAS_COUNT_WR
+MBOX7C0 CAS_COUNT_RD
+MBOX7C1 CAS_COUNT_WR
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+
+LONG
+Formulas:
+Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/time
+Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/time
+Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/time
+Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0
+-
+Profiling group to measure memory bandwidth drawn by all cores of a socket.
+Since this group is based on uncore events it is only possible to measure on a
+per-socket basis. Some of the counters may not be available on your system.
+Also outputs total data volume transferred from main memory.
+
diff --git a/groups/ivybridgeEP/MEM_DP.txt b/groups/ivybridgeEP/MEM_DP.txt
new file mode 100644
index 0000000..7d2cfdd
--- /dev/null
+++ b/groups/ivybridgeEP/MEM_DP.txt
@@ -0,0 +1,68 @@
+SHORT Power and Energy consumption
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0  PWR_PKG_ENERGY
+PWR3  PWR_DRAM_ENERGY
+PMC0  FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE
+PMC1  FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE
+PMC2  SIMD_FP_256_PACKED_DOUBLE
+MBOX0C0 CAS_COUNT_RD
+MBOX0C1 CAS_COUNT_WR
+MBOX1C0 CAS_COUNT_RD
+MBOX1C1 CAS_COUNT_WR
+MBOX2C0 CAS_COUNT_RD
+MBOX2C1 CAS_COUNT_WR
+MBOX3C0 CAS_COUNT_RD
+MBOX3C1 CAS_COUNT_WR
+MBOX4C0 CAS_COUNT_RD
+MBOX4C1 CAS_COUNT_WR
+MBOX5C0 CAS_COUNT_RD
+MBOX5C1 CAS_COUNT_WR
+MBOX6C0 CAS_COUNT_RD
+MBOX6C1 CAS_COUNT_WR
+MBOX7C0 CAS_COUNT_RD
+MBOX7C1 CAS_COUNT_WR
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Energy [J]  PWR0
+Power [W] PWR0/time
+Energy DRAM [J]  PWR3
+Power DRAM [W] PWR3/time
+MFlops/s  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
+AVX MFlops/s  1.0E-06*(PMC2*4.0)/time
+Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+
+LONG
+Formula:
+MFlops/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR+SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+AVX MFlops/s = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE/runtime
+Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/time
+Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/time
+Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/time
+Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0
+--
+Profiling group to measure memory bandwidth drawn by all cores of a socket.
+Since this group is based on uncore events it is only possible to measure on
+a per-socket basis. Also outputs total data volume transferred from main memory,
+SSE scalar and packed double precision flop rates. Also reports on packed AVX
+32b instructions.  Please note that the current flop measurements on IvyBridge
+are potentially wrong. So you cannot trust these counters at the moment!
+
diff --git a/groups/ivybridgeEP/MEM_SP.txt b/groups/ivybridgeEP/MEM_SP.txt
new file mode 100644
index 0000000..52db55d
--- /dev/null
+++ b/groups/ivybridgeEP/MEM_SP.txt
@@ -0,0 +1,70 @@
+SHORT Overview of arithmetic and main memory performance
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0  PWR_PKG_ENERGY
+PWR3  PWR_DRAM_ENERGY
+PMC0  FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE
+PMC1  FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE
+PMC2  SIMD_FP_256_PACKED_SINGLE
+MBOX0C0 CAS_COUNT_RD
+MBOX0C1 CAS_COUNT_WR
+MBOX1C0 CAS_COUNT_RD
+MBOX1C1 CAS_COUNT_WR
+MBOX2C0 CAS_COUNT_RD
+MBOX2C1 CAS_COUNT_WR
+MBOX3C0 CAS_COUNT_RD
+MBOX3C1 CAS_COUNT_WR
+MBOX4C0 CAS_COUNT_RD
+MBOX4C1 CAS_COUNT_WR
+MBOX5C0 CAS_COUNT_RD
+MBOX5C1 CAS_COUNT_WR
+MBOX6C0 CAS_COUNT_RD
+MBOX6C1 CAS_COUNT_WR
+MBOX7C0 CAS_COUNT_RD
+MBOX7C1 CAS_COUNT_WR
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Energy [J]  PWR0
+Power [W] PWR0/time
+Energy DRAM [J]  PWR3
+Power DRAM [W] PWR3/time
+MFlops/s  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
+AVX MFlops/s  1.0E-06*(PMC2*8.0)/time
+Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0
+
+LONG
+Formula:
+Power [W] = PWR_PKG_ENERGY/runtime
+Power DRAM [W] = PWR_DRAM_ENERGY/runtime
+MFlops/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE+SIMD_FP_256_PACKED_SINGLE*8)/runtime
+AVX MFlops/s = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE/runtime
+Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/time
+Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/time
+Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/time
+Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0
+--
+Profiling group to measure memory bandwidth drawn by all cores of a socket.
+Since this group is based on uncore events it is only possible to measure on
+a per-socket basis. Also outputs total data volume transferred from main memory.
+SSE scalar and packed single precision flop rates. Also reports on packed AVX
+32b instructions. Please note that the current flop measurements on IvyBridge
+are potentially wrong. So you cannot trust these counters at the moment!
+
diff --git a/groups/ivybridgeEP/NUMA.txt b/groups/ivybridgeEP/NUMA.txt
new file mode 100644
index 0000000..0c1b8fb
--- /dev/null
+++ b/groups/ivybridgeEP/NUMA.txt
@@ -0,0 +1,33 @@
+SHORT Local and remote memory accesses
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 OFFCORE_RESPONSE_0_LOCAL_DRAM
+PMC1 OFFCORE_RESPONSE_1_REMOTE_DRAM
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Local DRAM data volume [GByte]  1.E-09*PMC0*64
+Local DRAM bandwidth [MByte/s]  1.E-06*(PMC0*64)/time
+Remote DRAM data volume [GByte]  1.E-09*PMC1*64
+Remote DRAM bandwidth [MByte/s]  1.E-06*(PMC1*64)/time
+Memory data volume [GByte]  1.E-09*(PMC0+PMC1)*64
+Memory bandwidth [MByte/s]  1.E-06*((PMC0+PMC1)*64)/time
+
+LONG
+Formula:
+CPI = CPU_CLK_UNHALTED_CORE/INSTR_RETIRED_ANY
+Local DRAM data volume [GByte] = 1.E-09*OFFCORE_RESPONSE_0_LOCAL_DRAM*64
+Local DRAM bandwidth [MByte/s] = 1.E-06*(OFFCORE_RESPONSE_0_LOCAL_DRAM*64)/time
+Remote DRAM data volume [GByte] = 1.E-09*OFFCORE_RESPONSE_1_REMOTE_DRAM*64
+Remote DRAM bandwidth [MByte/s] = 1.E-06*(OFFCORE_RESPONSE_1_REMOTE_DRAM*64)/time
+Memory data volume [GByte] = 1.E-09*(OFFCORE_RESPONSE_0_LOCAL_DRAM+OFFCORE_RESPONSE_1_REMOTE_DRAM)*64
+Memory bandwidth [MByte/s] = 1.E-06*((OFFCORE_RESPONSE_0_LOCAL_DRAM+OFFCORE_RESPONSE_1_REMOTE_DRAM)*64)/time
+--
+This performance group measures the data traffic of CPU cores to local and remote
+memory.
diff --git a/groups/ivybridgeEP/QPI.txt b/groups/ivybridgeEP/QPI.txt
new file mode 100644
index 0000000..39c93fd
--- /dev/null
+++ b/groups/ivybridgeEP/QPI.txt
@@ -0,0 +1,52 @@
+SHORT QPI Link Layer data
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+SBOX0C0 DIRECT2CORE_SUCCESS_RBT_HIT
+SBOX1C0 DIRECT2CORE_SUCCESS_RBT_HIT
+SBOX2C0 DIRECT2CORE_SUCCESS_RBT_HIT
+SBOX0C1 TXL_FLITS_G0_DATA
+SBOX1C1 TXL_FLITS_G0_DATA
+SBOX2C1 TXL_FLITS_G0_DATA
+SBOX0C2 TXL_FLITS_G0_NON_DATA
+SBOX1C2 TXL_FLITS_G0_NON_DATA
+SBOX2C2 TXL_FLITS_G0_NON_DATA
+SBOX0C3 SBOX_CLOCKTICKS
+SBOX1C3 SBOX_CLOCKTICKS
+SBOX2C3 SBOX_CLOCKTICKS
+SBOX0FIX QPI_RATE
+SBOX1FIX QPI_RATE
+SBOX2FIX QPI_RATE
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+QPI Speed Link 0 [GT/s] ((SBOX0C3)/time)*inverseClock*(8/1000)
+QPI Speed Link 1 [GT/s] ((SBOX1C3)/time)*inverseClock*(8/1000)
+QPI Speed Link 2 [GT/s] ((SBOX2C3)/time)*inverseClock*(8/1000)
+QPI Rate Link 0 [GT/s] 1.E-09*SBOX0FIX
+QPI Rate Link 1 [GT/s] 1.E-09*SBOX1FIX
+QPI Rate Link 2 [GT/s] 1.E-09*SBOX2FIX
+Data from QPI to LLC [MByte] 1.E-06*(SBOX0C0+SBOX1C0+SBOX2C0)*8
+QPI data volume [MByte] 1.E-06*(SBOX0C1+SBOX1C1+SBOX2C1)*8
+QPI data bandwidth [MByte/s] 1.E-06*(SBOX0C1+SBOX1C1+SBOX2C1)*8/time
+QPI link volume [MByte] 1.E-06*(SBOX0C1+SBOX1C1+SBOX2C1+SBOX0C2+SBOX1C2+SBOX2C2)*8
+QPI link bandwidth [MByte/s] 1.E-06*(SBOX0C1+SBOX1C1+SBOX2C1+SBOX0C2+SBOX1C2+SBOX2C2)*8/time
+
+LONG
+Formula:
+QPI Speed Link 0/1/2 [GT/s] = ((SBOX_CLOCKTICKS)/time)*inverseClock*(8/1000)
+QPI Rate Link 0/1/2 [GT/s] = 1.E-09*(QPI_RATE)
+Data from QPI to LLC [MByte] = 1.E-06*(sum(DIRECT2CORE_SUCCESS_RBT_HIT)*8)
+QPI data volume [MByte] = 1.E-06*(sum(TXL_FLITS_G0_DATA)*8)
+QPI data bandwidth [MByte/s] = 1.E-06*(sum(TXL_FLITS_G0_DATA)*8)/runtime
+QPI link volume [MByte] = 1.E-06*((sum(TXL_FLITS_G0_DATA)+sum(TXL_FLITS_G0_NON_DATA))*8)
+QPI link bandwidth [MByte/s] = 1.E-06*((sum(TXL_FLITS_G0_DATA)+sum(TXL_FLITS_G0_NON_DATA))*8)/runtime
+--
+The Intel QPI Link Layer is responsible for packetizing requests from the caching agent (CBOXes)
+on the way out to the system interface.
+
diff --git a/groups/ivybridgeEP/TLB_DATA.txt b/groups/ivybridgeEP/TLB_DATA.txt
new file mode 100644
index 0000000..5e54147
--- /dev/null
+++ b/groups/ivybridgeEP/TLB_DATA.txt
@@ -0,0 +1,35 @@
+SHORT  L1 Data TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  DTLB_LOAD_MISSES_CAUSES_A_WALK
+PMC1  DTLB_STORE_MISSES_CAUSES_A_WALK
+PMC2  DTLB_LOAD_MISSES_WALK_DURATION
+PMC3  DTLB_STORE_MISSES_WALK_DURATION
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1 DTLB load misses     PMC0
+L1 DTLB load miss rate  PMC0/FIXC0
+L1 DTLB load miss duration [Cyc] PMC2/PMC0
+L1 DTLB store misses     PMC1
+L1 DTLB store miss rate  PMC1/FIXC0
+L1 DTLB store miss duration [Cyc] PMC3/PMC1
+
+LONG
+Formulas:
+L1 DTLB load misses = DTLB_LOAD_MISSES_CAUSES_A_WALK
+L1 DTLB load miss rate = DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB load miss duration [Cyc] = DTLB_LOAD_MISSES_WALK_DURATION / DTLB_LOAD_MISSES_CAUSES_A_WALK
+L1 DTLB store misses = DTLB_STORE_MISSES_CAUSES_A_WALK
+L1 DTLB store miss rate = DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB store miss duration [Cyc] = DTLB_STORE_MISSES_WALK_DURATION / DTLB_STORE_MISSES_CAUSES_A_WALK
+-
+The DTLB load and store miss rates give a measure of how often a TLB miss occurred
+per instruction. The duration measures how long a page walk took, in cycles.
+
diff --git a/groups/ivybridgeEP/TLB_INSTR.txt b/groups/ivybridgeEP/TLB_INSTR.txt
new file mode 100644
index 0000000..8faaebe
--- /dev/null
+++ b/groups/ivybridgeEP/TLB_INSTR.txt
@@ -0,0 +1,28 @@
+SHORT  L1 Instruction TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  ITLB_MISSES_CAUSES_A_WALK
+PMC1  ITLB_MISSES_WALK_DURATION
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1 ITLB misses     PMC0
+L1 ITLB miss rate  PMC0/FIXC0
+L1 ITLB miss duration [Cyc] PMC1/PMC0
+
+
+LONG
+Formulas:
+L1 ITLB misses = ITLB_MISSES_CAUSES_A_WALK
+L1 ITLB miss rate = ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 ITLB miss duration [Cyc] = ITLB_MISSES_WALK_DURATION / ITLB_MISSES_CAUSES_A_WALK
+-
+The ITLB miss rate gives a measure of how often a TLB miss occurred
+per instruction. The duration measures how long a page walk took, in cycles.
+
diff --git a/groups/ivybridgeEP/UNCORECLOCK.txt b/groups/ivybridgeEP/UNCORECLOCK.txt
new file mode 100644
index 0000000..fef0d36
--- /dev/null
+++ b/groups/ivybridgeEP/UNCORECLOCK.txt
@@ -0,0 +1,84 @@
+SHORT All Clocks
+
+EVENTSET
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+CBOX0C0 CBOX_CLOCKTICKS
+CBOX1C0 CBOX_CLOCKTICKS
+CBOX2C0 CBOX_CLOCKTICKS
+CBOX3C0 CBOX_CLOCKTICKS
+CBOX4C0 CBOX_CLOCKTICKS
+CBOX5C0 CBOX_CLOCKTICKS
+CBOX6C0 CBOX_CLOCKTICKS
+CBOX7C0 CBOX_CLOCKTICKS
+CBOX8C0 CBOX_CLOCKTICKS
+CBOX9C0 CBOX_CLOCKTICKS
+CBOX10C0 CBOX_CLOCKTICKS
+CBOX11C0 CBOX_CLOCKTICKS
+CBOX12C0 CBOX_CLOCKTICKS
+CBOX13C0 CBOX_CLOCKTICKS
+CBOX14C0 CBOX_CLOCKTICKS
+MBOX0C0 DRAM_CLOCKTICKS
+MBOX1C0 DRAM_CLOCKTICKS
+MBOX2C0 DRAM_CLOCKTICKS
+MBOX3C0 DRAM_CLOCKTICKS
+MBOX0FIX DRAM_CLOCKTICKS
+MBOX1FIX DRAM_CLOCKTICKS
+MBOX2FIX DRAM_CLOCKTICKS
+MBOX3FIX DRAM_CLOCKTICKS
+SBOX0C0 SBOX_CLOCKTICKS
+SBOX1C0 SBOX_CLOCKTICKS
+SBOX2C0 SBOX_CLOCKTICKS
+UBOXFIX UBOX_CLOCKTICKS
+BBOX0C0 BBOX_CLOCKTICKS
+BBOX1C0 BBOX_CLOCKTICKS
+WBOX0 WBOX_CLOCKTICKS
+PBOX0 PBOX_CLOCKTICKS
+RBOX0C0 RBOX_CLOCKTICKS
+RBOX1C0 RBOX_CLOCKTICKS
+RBOX2C0 RBOX_CLOCKTICKS
+IBOX0 IBOX_CLOCKTICKS
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CBOX0 Frequency [GHz] 1.E-09*CBOX0C0/(FIXC1*inverseClock)
+CBOX1 Frequency [GHz] 1.E-09*CBOX1C0/(FIXC1*inverseClock)
+CBOX2 Frequency [GHz] 1.E-09*CBOX2C0/(FIXC1*inverseClock)
+CBOX3 Frequency [GHz] 1.E-09*CBOX3C0/(FIXC1*inverseClock)
+CBOX4 Frequency [GHz] 1.E-09*CBOX4C0/(FIXC1*inverseClock)
+CBOX5 Frequency [GHz] 1.E-09*CBOX5C0/(FIXC1*inverseClock)
+CBOX6 Frequency [GHz] 1.E-09*CBOX6C0/(FIXC1*inverseClock)
+CBOX7 Frequency [GHz] 1.E-09*CBOX7C0/(FIXC1*inverseClock)
+CBOX8 Frequency [GHz] 1.E-09*CBOX8C0/(FIXC1*inverseClock)
+CBOX9 Frequency [GHz] 1.E-09*CBOX9C0/(FIXC1*inverseClock)
+CBOX10 Frequency [GHz] 1.E-09*CBOX10C0/(FIXC1*inverseClock)
+CBOX11 Frequency [GHz] 1.E-09*CBOX11C0/(FIXC1*inverseClock)
+CBOX12 Frequency [GHz] 1.E-09*CBOX12C0/(FIXC1*inverseClock)
+CBOX13 Frequency [GHz] 1.E-09*CBOX13C0/(FIXC1*inverseClock)
+CBOX14 Frequency [GHz] 1.E-09*CBOX14C0/(FIXC1*inverseClock)
+MBOX0 Frequency [GHz] 1.E-09*MBOX0C0/(FIXC1*inverseClock)
+MBOX0FIX Frequency [GHz] 1.E-09*MBOX0FIX/(FIXC1*inverseClock)
+MBOX1 Frequency [GHz] 1.E-09*MBOX1C0/(FIXC1*inverseClock)
+MBOX1FIX Frequency [GHz] 1.E-09*MBOX1FIX/(FIXC1*inverseClock)
+MBOX2 Frequency [GHz] 1.E-09*MBOX2C0/(FIXC1*inverseClock)
+MBOX2FIX Frequency [GHz] 1.E-09*MBOX2FIX/(FIXC1*inverseClock)
+MBOX3 Frequency [GHz] 1.E-09*MBOX3C0/(FIXC1*inverseClock)
+MBOX3FIX Frequency [GHz] 1.E-09*MBOX3FIX/(FIXC1*inverseClock)
+SBOX0 Frequency [GHz] 1.E-09*SBOX0C0/(FIXC1*inverseClock)
+SBOX1 Frequency [GHz] 1.E-09*SBOX1C0/(FIXC1*inverseClock)
+SBOX2 Frequency [GHz] 1.E-09*SBOX2C0/(FIXC1*inverseClock)
+UBOX Frequency [GHz] 1.E-09*UBOXFIX/(FIXC1*inverseClock)
+BBOX0 Frequency [GHz] 1.E-09*BBOX0C0/(FIXC1*inverseClock)
+BBOX1 Frequency [GHz] 1.E-09*BBOX1C0/(FIXC1*inverseClock)
+WBOX Frequency [GHz] 1.E-09*WBOX0/(FIXC1*inverseClock)
+PBOX Frequency [GHz] 1.E-09*PBOX0/(FIXC1*inverseClock)
+RBOX0 Frequency [GHz] 1.E-09*RBOX0C0/(FIXC1*inverseClock)
+RBOX1 Frequency [GHz] 1.E-09*RBOX1C0/(FIXC1*inverseClock)
+RBOX2 Frequency [GHz] 1.E-09*RBOX2C0/(FIXC1*inverseClock)
+IBOX Frequency [GHz] 1.E-09*IBOX0/(FIXC1*inverseClock)
+
+
+LONG
+Formulas:
diff --git a/groups/k10/BRANCH.txt b/groups/k10/BRANCH.txt
index cbc6da6..40089d5 100644
--- a/groups/k10/BRANCH.txt
+++ b/groups/k10/BRANCH.txt
@@ -4,29 +4,23 @@ EVENTSET
 PMC0  INSTRUCTIONS_RETIRED
 PMC1  BRANCH_RETIRED
 PMC2  BRANCH_MISPREDICT_RETIRED
-PMC3  BRANCH_TAKEN_RETIRED
 
 METRICS
 Runtime (RDTSC) [s] time
 Branch rate   PMC1/PMC0
 Branch misprediction rate  PMC2/PMC0
 Branch misprediction ratio  PMC2/PMC1
-Branch taken rate  PMC3/PMC0
-Branch taken ratio  PMC3/PMC1
 Instructions per branch  PMC0/PMC1
 
 LONG
 Formulas:
-Branch rate = BRANCH_RETIRED / INSTRUCTIONS_RETIRED
-Branch misprediction rate = BRANCH_MISPREDICT_RETIRED / INSTRUCTIONS_RETIRED
-Branch misprediction ratio = BRANCH_MISPREDICT_RETIRED / BRANCH_RETIRED
-Branch taken rate = BRANCH_TAKEN_RETIRED / INSTRUCTIONS_RETIRED
-Branch taken ratio = BRANCH_TAKEN_RETIRED / BRANCH_RETIRED
-Instructions per branch = INSTRUCTIONS_RETIRED / BRANCH_RETIRED
+Branch rate = BRANCH_RETIRED/INSTRUCTIONS_RETIRED
+Branch misprediction rate = BRANCH_MISPREDICT_RETIRED/INSTRUCTIONS_RETIRED
+Branch misprediction ratio = BRANCH_MISPREDICT_RETIRED/BRANCH_RETIRED
+Instructions per branch = INSTRUCTIONS_RETIRED/BRANCH_RETIRED
 -
 The rates state how often in average a branch or a mispredicted branch occured
 per instruction retired in total. The Branch misprediction ratio sets directly
 into relation what ration of all branch instruction where mispredicted.
-Instructions per branch is 1/Branch rate. The same applies for the branches
-taken metrics.
+Instructions per branch is 1/Branch rate.
 
diff --git a/groups/k10/ICACHE.txt b/groups/k10/ICACHE.txt
index 222ea5d..5150496 100644
--- a/groups/k10/ICACHE.txt
+++ b/groups/k10/ICACHE.txt
@@ -8,18 +8,16 @@ PMC3  ICACHE_REFILLS_MEM
 
 METRICS
 Runtime (RDTSC) [s] time
-Instruction cache misses  PMC2+PMC3
-Instruction cache request rate   PMC1/PMC0
-Instruction cache miss rate    (PMC2+PMC3)/PMC0
-Instruction cache miss ratio   (PMC2+PMC3)/PMC1
+L1I request rate   PMC1/PMC0
+L1I miss rate    (PMC2+PMC3)/PMC0
+L1I miss ratio   (PMC2+PMC3)/PMC1
 
 LONG
 Formulas:
-Instruction cache misses ICACHE_REFILLS_L2 + ICACHE_REFILLS_MEM
-Instruction cache request rate ICACHE_FETCHES / INSTRUCTIONS_RETIRED
-Instruction cache miss rate  (ICACHE_REFILLS_L2+ICACHE_REFILLS_MEM)/INSTRUCTIONS_RETIRED
-Instruction cache miss ratio (ICACHE_REFILLS_L2+ICACHE_REFILLS_MEM)/ICACHE_FETCHES
+L1I request rate = ICACHE_FETCHES / INSTRUCTIONS_RETIRED
+L1I miss rate = (ICACHE_REFILLS_L2+ICACHE_REFILLS_MEM)/INSTRUCTIONS_RETIRED
+L1I miss ratio = (ICACHE_REFILLS_L2+ICACHE_REFILLS_MEM)/ICACHE_FETCHES
 -
 This group measures the locality of your instruction code with regard to the
-L1 I-Cache. 
+L1 I-Cache.
 
diff --git a/groups/k10/L2.txt b/groups/k10/L2.txt
index 8b61bcc..e777be8 100644
--- a/groups/k10/L2.txt
+++ b/groups/k10/L2.txt
@@ -8,17 +8,21 @@ PMC2  CPU_CLOCKS_UNHALTED
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s]   PMC2*inverseClock
+L2D load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time
+L2D load data volume [GBytes]  1.0E-09*PMC0*64.0
+L2D evict bandwidth [MBytes/s]  1.0E-06*PMC1*64.0/time
+L2D evict data volume [GBytes]  1.0E-09*PMC1*64.0
 L2 bandwidth [MBytes/s]   1.0E-06*(PMC0+PMC1)*64.0/time
 L2 data volume [GBytes]   1.0E-09*(PMC0+PMC1)*64.0
-L2 refill bandwidth [MBytes/s]   1.0E-06*PMC0*64.0/time
-L2 evict [MBytes/s]    1.0E-06*PMC1*64.0/time
 
 LONG
 Formulas:
-L2 bandwidth [MBytes/s]   1.0E-06*(DATA_CACHE_REFILLS_L2_ALL+DATA_CACHE_EVICTED_ALL)*64/time
-L2 data volume [GBytes]   1.0E-09*(DATA_CACHE_REFILLS_L2_ALL+DATA_CACHE_EVICTED_ALL)*64
-L2 refill bandwidth [MBytes/s]   1.0E-06*DATA_CACHE_REFILLS_L2_ALL*64/time
-L2 evict [MBytes/s]    1.0E-06*DATA_CACHE_EVICTED_ALL*64/time
+L2D load bandwidth [MBytes/s] = 1.0E-06*DATA_CACHE_REFILLS_L2_ALL*64.0/time
+L2D load data volume [GBytes] = 1.0E-09*DATA_CACHE_REFILLS_L2_ALL*64.0
+L2D evict bandwidth [MBytes/s] = 1.0E-06*DATA_CACHE_EVICTED_ALL*64.0/time
+L2D evict data volume [GBytes] = 1.0E-09*DATA_CACHE_EVICTED_ALL*64.0
+L2 bandwidth [MBytes/s] = 1.0E-06*(DATA_CACHE_REFILLS_L2_ALL+DATA_CACHE_EVICTED_ALL)*64/time
+L2 data volume [GBytes] = 1.0E-09*(DATA_CACHE_REFILLS_L2_ALL+DATA_CACHE_EVICTED_ALL)*64
 -
 Profiling group to measure L2 cache bandwidth. The bandwidth is
 computed by the number of cacheline loaded from L2 to L1 and the
diff --git a/groups/k10/L3CACHE.txt b/groups/k10/L3CACHE.txt
index 85b4522..90293b7 100644
--- a/groups/k10/L3CACHE.txt
+++ b/groups/k10/L3CACHE.txt
@@ -13,9 +13,9 @@ L3 miss ratio  PMC2/PMC1
 
 LONG
 Formulas:
-L3 request rate =  L3_READ_REQUEST_ALL_ALL_CORES / INSTRUCTIONS_RETIRED
-L3 miss rate  = L3_MISSES_ALL_ALL_CORES / INSTRUCTIONS_RETIRED
-L3 miss ratio =  L3_MISSES_ALL_ALL_CORES / L3_READ_REQUEST_ALL_ALL_CORES
+L3 request rate =  L3_READ_REQUEST_ALL_ALL_CORES/INSTRUCTIONS_RETIRED
+L3 miss rate = L3_MISSES_ALL_ALL_CORES/INSTRUCTIONS_RETIRED
+L3 miss ratio =  L3_MISSES_ALL_ALL_CORES/L3_READ_REQUEST_ALL_ALL_CORES
 -
 This group measures the locality of your data accesses with regard to the
 L3 Cache. L3 request rate tells you how data intensive your code is
diff --git a/groups/k10/MEM.txt b/groups/k10/MEM.txt
index b6c9f33..57f3623 100644
--- a/groups/k10/MEM.txt
+++ b/groups/k10/MEM.txt
@@ -8,19 +8,26 @@ PMC3  DRAM_ACCESSES_DCT1_ALL
 
 METRICS
 Runtime (RDTSC) [s] time
-Read data bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time
-Write data bandwidth [MBytes/s]  1.0E-06*PMC1*8.0/time
+Memory read bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time
+Memory read data volume [GBytes]  1.0E-09*PMC0*64.0
+Memory write bandwidth [MBytes/s]  1.0E-06*PMC1*8.0/time
+Memory write data volume [GBytes]  1.0E-09*PMC1*8.0
 Memory bandwidth [MBytes/s]   1.0E-06*(PMC2+PMC3)*64.0/time
 Memory data volume [GBytes]   1.0E-09*(PMC2+PMC3)*64.0
 
 LONG
 Formulas:
-Read data bandwidth (MBytes/s)  1.0E-06*NORTHBRIDGE_READ_RESPONSE_ALL*64/time
-Write data bandwidth (MBytes/s)  1.0E-06*OCTWORDS_WRITE_TRANSFERS*8/time
+Memory read bandwidth [MBytes/s] = 1.0E-06*NORTHBRIDGE_READ_RESPONSE_ALL*64/time
+Memory write bandwidth [MBytes/s] = 1.0E-06*OCTWORDS_WRITE_TRANSFERS*8/time
 Memory bandwidth [MBytes/s] = 1.0E-06*(DRAM_ACCESSES_DCTO_ALL+DRAM_ACCESSES_DCT1_ALL)*64/time
 Memory data volume [GBytes] = 1.0E-09*(DRAM_ACCESSES_DCTO_ALL+DRAM_ACCESSES_DCT1_ALL)*64
 -
 Profiling group to measure memory bandwidth drawn by all cores of a socket.
 Note: As this group measures the accesses from all cores it only makes sense
 to measure with one core per socket, similiar as with the Intel Nehalem Uncore events.
+The memory read bandwidth contains all data from DRAM, L3, or another cache,
+including another core on the same node. The event OCTWORDS_WRITE_TRANSFERS counts
+16 Byte transfers, not 64 Byte.
+
+
 
diff --git a/groups/k8/BRANCH.txt b/groups/k8/BRANCH.txt
index 64e10cd..7983e88 100644
--- a/groups/k8/BRANCH.txt
+++ b/groups/k8/BRANCH.txt
@@ -4,28 +4,22 @@ EVENTSET
 PMC0  INSTRUCTIONS_RETIRED
 PMC1  BRANCH_RETIRED
 PMC2  BRANCH_MISPREDICT_RETIRED
-PMC3  BRANCH_TAKEN_RETIRED
 
 METRICS
 Runtime (RDTSC) [s] time
 Branch rate   PMC1/PMC0
 Branch misprediction rate  PMC2/PMC0
 Branch misprediction ratio  PMC2/PMC1
-Branch taken rate  PMC3/PMC0
-Branch taken ratio  PMC3/PMC1
 Instructions per branch  PMC0/PMC1
 
 LONG
 Formulas:
-Branch rate = BRANCH_RETIRED / INSTRUCTIONS_RETIRED
-Branch misprediction rate = BRANCH_MISPREDICT_RETIRED / INSTRUCTIONS_RETIRED
-Branch misprediction ratio = BRANCH_MISPREDICT_RETIRED / BRANCH_RETIRED
-Branch taken rate = BRANCH_TAKEN_RETIRED / INSTRUCTIONS_RETIRED
-Branch taken ratio = BRANCH_TAKEN_RETIRED / BRANCH_RETIRED
-Instructions per branch = INSTRUCTIONS_RETIRED / BRANCH_RETIRED
+Branch rate = BRANCH_RETIRED/INSTRUCTIONS_RETIRED
+Branch misprediction rate = BRANCH_MISPREDICT_RETIRED/INSTRUCTIONS_RETIRED
+Branch misprediction ratio = BRANCH_MISPREDICT_RETIRED/BRANCH_RETIRED
+Instructions per branch = INSTRUCTIONS_RETIRED/BRANCH_RETIRED
 -
 The rates state how often in average a branch or a mispredicted branch occured
 per instruction retired in total. The Branch misprediction ratio sets directly
 into relation what ration of all branch instruction where mispredicted.
-Instructions per branch is 1/Branch rate. The same applies for the branches
-taken metrics.
+Instructions per branch is 1/Branch rate.
diff --git a/groups/k8/ICACHE.txt b/groups/k8/ICACHE.txt
index 222ea5d..5150496 100644
--- a/groups/k8/ICACHE.txt
+++ b/groups/k8/ICACHE.txt
@@ -8,18 +8,16 @@ PMC3  ICACHE_REFILLS_MEM
 
 METRICS
 Runtime (RDTSC) [s] time
-Instruction cache misses  PMC2+PMC3
-Instruction cache request rate   PMC1/PMC0
-Instruction cache miss rate    (PMC2+PMC3)/PMC0
-Instruction cache miss ratio   (PMC2+PMC3)/PMC1
+L1I request rate   PMC1/PMC0
+L1I miss rate    (PMC2+PMC3)/PMC0
+L1I miss ratio   (PMC2+PMC3)/PMC1
 
 LONG
 Formulas:
-Instruction cache misses ICACHE_REFILLS_L2 + ICACHE_REFILLS_MEM
-Instruction cache request rate ICACHE_FETCHES / INSTRUCTIONS_RETIRED
-Instruction cache miss rate  (ICACHE_REFILLS_L2+ICACHE_REFILLS_MEM)/INSTRUCTIONS_RETIRED
-Instruction cache miss ratio (ICACHE_REFILLS_L2+ICACHE_REFILLS_MEM)/ICACHE_FETCHES
+L1I request rate = ICACHE_FETCHES / INSTRUCTIONS_RETIRED
+L1I miss rate = (ICACHE_REFILLS_L2+ICACHE_REFILLS_MEM)/INSTRUCTIONS_RETIRED
+L1I miss ratio = (ICACHE_REFILLS_L2+ICACHE_REFILLS_MEM)/ICACHE_FETCHES
 -
 This group measures the locality of your instruction code with regard to the
-L1 I-Cache. 
+L1 I-Cache.
 
diff --git a/groups/kabini/BRANCH.txt b/groups/kabini/BRANCH.txt
index 1ae9f36..3d99f0f 100644
--- a/groups/kabini/BRANCH.txt
+++ b/groups/kabini/BRANCH.txt
@@ -4,29 +4,23 @@ EVENTSET
 PMC0  RETIRED_INSTRUCTIONS
 PMC1  RETIRED_BRANCH_INSTR
 PMC2  RETIRED_MISPREDICTED_BRANCH_INSTR
-PMC3  RETIRED_TAKEN_BRANCH_INSTR
 
 METRICS
 Runtime (RDTSC) [s] time
 Branch rate   PMC1/PMC0
 Branch misprediction rate  PMC2/PMC0
 Branch misprediction ratio  PMC2/PMC1
-Branch taken rate  PMC3/PMC0
-Branch taken ratio  PMC3/PMC1
 Instructions per branch  PMC0/PMC1
 
 LONG
 Formulas:
-Branch rate = RETIRED_BRANCH_INSTR / RETIRED_INSTRUCTIONS
-Branch misprediction rate = RETIRED_MISPREDICTED_BRANCH_INSTR / RETIRED_INSTRUCTIONS
-Branch misprediction ratio = RETIRED_MISPREDICTED_BRANCH_INSTR / RETIRED_BRANCH_INSTR
-Branch taken rate = RETIRED_TAKEN_BRANCH_INSTR / RETIRED_INSTRUCTIONS
-Branch taken ratio = RETIRED_TAKEN_BRANCH_INSTR / RETIRED_BRANCH_INSTR
-Instructions per branch = RETIRED_INSTRUCTIONS / RETIRED_BRANCH_INSTR
+Branch rate = RETIRED_BRANCH_INSTR/RETIRED_INSTRUCTIONS
+Branch misprediction rate = RETIRED_MISPREDICTED_BRANCH_INSTR/RETIRED_INSTRUCTIONS
+Branch misprediction ratio = RETIRED_MISPREDICTED_BRANCH_INSTR/RETIRED_BRANCH_INSTR
+Instructions per branch = RETIRED_INSTRUCTIONS/RETIRED_BRANCH_INSTR
 -
 The rates state how often in average a branch or a mispredicted branch occured
 per instruction retired in total. The branch misprediction ratio sets directly
 into relation what ratio of all branch instruction where mispredicted.
-Instructions per branch is 1/Branch rate. The same applies for the branches
-taken metrics.
+Instructions per branch is 1/Branch rate.
 
diff --git a/groups/kabini/DATA.txt b/groups/kabini/DATA.txt
index 78e4c3c..75f1f60 100644
--- a/groups/kabini/DATA.txt
+++ b/groups/kabini/DATA.txt
@@ -6,11 +6,11 @@ PMC1  LS_DISPATCH_STORES
 
 METRICS
 Runtime (RDTSC) [s] time
-Load to Store ratio PMC0/PMC1
+Load to store ratio PMC0/PMC1
 
 LONG
 Formulas:
-Load to Store ratio = LS_DISPATCH_LOADS / LS_DISPATCH_STORES
+Load to store ratio = LS_DISPATCH_LOADS/LS_DISPATCH_STORES
 -
 This is a simple metric to determine your load to store ratio.
 
diff --git a/groups/kabini/ICACHE.txt b/groups/kabini/ICACHE.txt
index be5e5f5..62b91d6 100644
--- a/groups/kabini/ICACHE.txt
+++ b/groups/kabini/ICACHE.txt
@@ -8,18 +8,16 @@ PMC3  RETIRED_INSTRUCTIONS
 
 METRICS
 Runtime (RDTSC) [s] time
-Instruction cache misses  PMC1+PMC2
-Instruction cache request rate   PMC0/PMC3
-Instruction cache miss rate    (PMC1+PMC2)/PMC3
-Instruction cache miss ratio   (PMC1+PMC2)/PMC0
+L1I request rate   PMC0/PMC3
+L1I miss rate    (PMC1+PMC2)/PMC3
+L1I miss ratio   (PMC1+PMC2)/PMC0
 
 LONG
 Formulas:
-Instruction cache misses INSTRUCTION_CACHE_L2_REFILLS + INSTRUCTION_CACHE_SYSTEM_REFILLS
-Instruction cache request rate INSTRUCTION_CACHE_FETCHES / RETIRED_INSTRUCTIONS
-Instruction cache miss rate  (INSTRUCTION_CACHE_L2_REFILLS + INSTRUCTION_CACHE_SYSTEM_REFILLS)/RETIRED_INSTRUCTIONS
-Instruction cache miss ratio (INSTRUCTION_CACHE_L2_REFILLS + INSTRUCTION_CACHE_SYSTEM_REFILLS)/INSTRUCTION_CACHE_FETCHES
+L1I request rate = INSTRUCTION_CACHE_FETCHES / RETIRED_INSTRUCTIONS
+L1I miss rate = (INSTRUCTION_CACHE_L2_REFILLS + INSTRUCTION_CACHE_SYSTEM_REFILLS)/RETIRED_INSTRUCTIONS
+L1I miss ratio = (INSTRUCTION_CACHE_L2_REFILLS + INSTRUCTION_CACHE_SYSTEM_REFILLS)/INSTRUCTION_CACHE_FETCHES
 -
 This group measures the locality of your instruction code with regard to the
-L1 I-Cache. 
+L1 I-Cache.
 
diff --git a/groups/kabini/L2.txt b/groups/kabini/L2.txt
index d06d809..92baca1 100644
--- a/groups/kabini/L2.txt
+++ b/groups/kabini/L2.txt
@@ -8,17 +8,21 @@ PMC2  CPU_CLOCKS_UNHALTED
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s]   PMC2*inverseClock
+L2D load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time
+L2D load data volume [GBytes]  1.0E-09*PMC0*64.0
+L2D evict bandwidth [MBytes/s]  1.0E-06*PMC1*64.0/time
+L2D evict data volume [GBytes]  1.0E-09*PMC1*64.0
 L2 bandwidth [MBytes/s]   1.0E-06*(PMC0+PMC1)*64.0/time
 L2 data volume [GBytes]   1.0E-09*(PMC0+PMC1)*64.0
-L2 refill bandwidth [MBytes/s]   1.0E-06*PMC0*64.0/time
-L2 evict [MBytes/s]    1.0E-06*PMC1*64.0/time
 
 LONG
 Formulas:
-L2 bandwidth [MBytes/s]   1.0E-06*(DATA_CACHE_REFILLS_ALL+DATA_CACHE_EVICTED_ALL)*64/time
-L2 data volume [GBytes]   1.0E-09*(DATA_CACHE_REFILLS_ALL+DATA_CACHE_EVICTED_ALL)*64
-L2 refill bandwidth [MBytes/s]   1.0E-06*DATA_CACHE_REFILLS_ALL*64/time
-L2 evict [MBytes/s]    1.0E-06*DATA_CACHE_EVICTED_ALL*64/time
+L2D load bandwidth [MBytes/s] = 1.0E-06*DATA_CACHE_REFILLS_ALL*64.0/time
+L2D load data volume [GBytes] = 1.0E-09*DATA_CACHE_REFILLS_ALL*64.0
+L2D evict bandwidth [MBytes/s] = 1.0E-06*DATA_CACHE_EVICTED_ALL*64.0/time
+L2D evict data volume [GBytes] = 1.0E-09*DATA_CACHE_EVICTED_ALL*64.0
+L2 bandwidth [MBytes/s] = 1.0E-06*(DATA_CACHE_REFILLS_ALL+DATA_CACHE_EVICTED_ALL)*64/time
+L2 data volume [GBytes] = 1.0E-09*(DATA_CACHE_REFILLS_ALL+DATA_CACHE_EVICTED_ALL)*64
 -
 Profiling group to measure L2 cache bandwidth. The bandwidth is
 computed by the number of cacheline loaded from L2 to L1 and the
diff --git a/groups/nehalem/BRANCH.txt b/groups/nehalem/BRANCH.txt
index 3d81416..62ac18f 100644
--- a/groups/nehalem/BRANCH.txt
+++ b/groups/nehalem/BRANCH.txt
@@ -19,10 +19,10 @@ Instructions per branch  FIXC0/PMC0
 
 LONG
 Formulas:
-Branch rate = BR_INST_RETIRED_ALL_BRANCHES / INSTR_RETIRED_ANY
-Branch misprediction rate =  BR_MISP_RETIRED_ALL_BRANCHES / INSTR_RETIRED_ANY
-Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES / BR_INST_RETIRED_ALL_BRANCHES
-Instructions per branch = INSTR_RETIRED_ANY / BR_INST_RETIRED_ALL_BRANCHES
+Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction rate =  BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES
+Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES
 -
 The rates state how often in average a branch or a mispredicted branch occured
 per instruction retired in total. The Branch misprediction ratio sets directly
diff --git a/groups/nehalem/DATA.txt b/groups/nehalem/DATA.txt
index a5611bc..08d6d76 100644
--- a/groups/nehalem/DATA.txt
+++ b/groups/nehalem/DATA.txt
@@ -12,11 +12,11 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-Load to Store ratio PMC0/PMC1
+Load to store ratio PMC0/PMC1
 
 LONG
 Formulas:
-Load to Store ratio = MEM_INST_RETIRED_LOADS / MEM_INST_RETIRED_STORES
+Load to store ratio = MEM_INST_RETIRED_LOADS/MEM_INST_RETIRED_STORES
 -
 This is a simple metric to determine your Load to store ratio.
 
diff --git a/groups/nehalem/FLOPS_DP.txt b/groups/nehalem/FLOPS_DP.txt
index c5ba91c..658b8ff 100644
--- a/groups/nehalem/FLOPS_DP.txt
+++ b/groups/nehalem/FLOPS_DP.txt
@@ -14,7 +14,7 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-DP MFlops/s  1.0E-06*(PMC0*2.0+PMC1)/time
+MFlops/s  1.0E-06*(PMC0*2.0+PMC1)/time
 Packed MUOPS/s   1.0E-06*PMC0/time
 Scalar MUOPS/s 1.0E-06*PMC1/time
 SP MUOPS/s 1.0E-06*PMC2/time
@@ -22,7 +22,11 @@ DP MUOPS/s 1.0E-06*PMC3/time
 
 LONG
 Formula:
-DP MFlops/s =  (FP_COMP_OPS_EXE_SSE_FP_PACKED*2 +  FP_COMP_OPS_EXE_SSE_FP_SCALAR)/ runtime
+MFlops/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime
+Packed MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime
+SP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime
+DP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime
 -
 The Nehalem has not possibility to measure MFlops if mixed precision calculations are done.
 Therefore both Single as well as Double precision are measured to ensure the correctness
diff --git a/groups/nehalem/FLOPS_SP.txt b/groups/nehalem/FLOPS_SP.txt
index 4478c8f..a954670 100644
--- a/groups/nehalem/FLOPS_SP.txt
+++ b/groups/nehalem/FLOPS_SP.txt
@@ -14,7 +14,7 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-SP MFlops/s (SP assumed) 1.0E-06*(PMC0*4.0+PMC1)/time
+MFlops/s 1.0E-06*(PMC0*4.0+PMC1)/time
 Packed MUOPS/s   1.0E-06*PMC0/time
 Scalar MUOPS/s 1.0E-06*PMC1/time
 SP MUOPS/s 1.0E-06*PMC2/time
@@ -22,7 +22,11 @@ DP MUOPS/s 1.0E-06*PMC3/time
 
 LONG
 Formula:
-SP MFlops/s =  (FP_COMP_OPS_EXE_SSE_FP_PACKED*4 +  FP_COMP_OPS_EXE_SSE_FP_SCALAR)/ runtime
+MFlops/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime
+Packed MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime
+SP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime
+DP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime
 -
 The Nehalem has not possibility to measure MFlops if mixed precision calculations are done.
 Therefore both Single as well as Double precision are measured to ensure the correctness
diff --git a/groups/nehalem/ICACHE.txt b/groups/nehalem/ICACHE.txt
new file mode 100644
index 0000000..49943ff
--- /dev/null
+++ b/groups/nehalem/ICACHE.txt
@@ -0,0 +1,25 @@
+SHORT  Instruction cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  L1I_READS
+PMC1  L1I_MISSES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1I request rate PMC0/FIXC0
+L1I miss rate PMC1/FIXC0
+L1I miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L1I request rate = L1I_READS / INSTR_RETIRED_ANY
+L1I miss rate = L1I_MISSES / INSTR_RETIRED_ANY
+L1I miss ratio = L1I_MISSES / L1I_READS
+-
+This group measures some L1 instruction cache metrics.
diff --git a/groups/nehalem/L2.txt b/groups/nehalem/L2.txt
index d193047..25c5604 100644
--- a/groups/nehalem/L2.txt
+++ b/groups/nehalem/L2.txt
@@ -6,27 +6,35 @@ FIXC1 CPU_CLK_UNHALTED_CORE
 FIXC2 CPU_CLK_UNHALTED_REF
 PMC0  L1D_REPL
 PMC1  L1D_M_EVICT
+PMC2  L1I_MISSES
 
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-L2 Load [MBytes/s] 1.0E-06*PMC0*64.0/time
-L2 Evict [MBytes/s] 1.0E-06*PMC1*64.0/time
-L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
-L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+L2D load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time
+L2D load data volume [GBytes]  1.0E-09*PMC0*64.0
+L2D evict bandwidth [MBytes/s]  1.0E-06*PMC1*64.0/time
+L2D evict data volume [GBytes]  1.0E-09*PMC1*64.0
+L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time
+L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0
 
 LONG
 Formulas:
-L2 Load [MBytes/s] = 1.0E-06*L1D_REPL*64/time
-L2 Evict [MBytes/s] = 1.0E-06*L1D_M_EVICT*64/time
-L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPL+L1D_M_EVICT)*64/time
-L2 data volume [GBytes] = 1.0E-09*(L1D_REPL+L1D_M_EVICT)*64
+L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPL*64.0/time
+L2D load data volume [GBytes] = 1.0E-09*L1D_REPL*64.0
+L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64.0/time
+L2D evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64.0
+L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPL+L1D_M_EVICT+L1I_MISSES)*64/time
+L2 data volume [GBytes] = 1.0E-09*(L1D_REPL+L1D_M_EVICT+L1I_MISSES)*64
 -
 Profiling group to measure L2 cache bandwidth. The bandwidth is
 computed by the number of cacheline allocated in the L1 and the 
-number of modified cachelines evicted from the L1. 
+number of modified cachelines evicted from the L1.  Also reports on
+total data volume transferred between L2 and L1 cache.
 Note that this bandwidth also includes data transfers due to a
-write allocate load on a store miss in L1.
+write allocate load on a store miss in L1 and traffic caused by misses in the
+L1 instruction cache.
+
 
diff --git a/groups/nehalem/L2CACHE.txt b/groups/nehalem/L2CACHE.txt
index 0fd60da..edc8d9c 100644
--- a/groups/nehalem/L2CACHE.txt
+++ b/groups/nehalem/L2CACHE.txt
@@ -4,7 +4,7 @@ EVENTSET
 FIXC0 INSTR_RETIRED_ANY
 FIXC1 CPU_CLK_UNHALTED_CORE
 FIXC2 CPU_CLK_UNHALTED_REF
-PMC0  L2_DATA_RQSTS_DEMAND_ANY
+PMC0  L2_RQSTS_REFERENCES
 PMC1  L2_RQSTS_MISS
 
 METRICS
@@ -18,9 +18,9 @@ L2 miss ratio PMC1/PMC0
 
 LONG
 Formulas:
-L2 request rate = L2_DATA_RQSTS_DEMAND_ANY / INSTR_RETIRED_ANY
-L2 miss rate  = L2_RQSTS_MISS / INSTR_RETIRED_ANY
-L2 miss ratio = L2_RQSTS_MISS / L2_DATA_RQSTS_DEMAND_MESI
+L2 request rate = L2_RQSTS_REFERENCES/INSTR_RETIRED_ANY
+L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY
+L2 miss ratio = L2_RQSTS_MISS/L2_RQSTS_REFERENCES
 -
 This group measures the locality of your data accesses with regard to the
 L2 Cache. L2 request rate tells you how data intensive your code is
diff --git a/groups/nehalem/L3.txt b/groups/nehalem/L3.txt
index 446afee..26ac969 100644
--- a/groups/nehalem/L3.txt
+++ b/groups/nehalem/L3.txt
@@ -12,17 +12,21 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-L3 Load [MBytes/s]  1.0E-06*PMC0*64.0/time
-L3 Evict [MBytes/s]  1.0E-06*PMC1*64.0/time
+L3 load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time
+L3 load data volume [GBytes]  1.0E-09*PMC0*64.0
+L3 evict bandwidth [MBytes/s]  1.0E-06*PMC1*64.0/time
+L3 evict data volume [GBytes]  1.0E-09*PMC1*64.0
 L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
 L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
 
 LONG
 Formulas:
-L3 Load [MBytes/s]  1.0E-06*L2_LINES_IN_ANY*64/time
-L3 Evict [MBytes/s]  1.0E-06*L2_LINES_OUT_DEMAND_DIRTY*64/time
-L3 bandwidth [MBytes/s] 1.0E-06*(L2_LINES_IN_ANY+L2_LINES_OUT_DEMAND_DIRTY)*64/time
-L3 data volume [GBytes] 1.0E-09*(L2_LINES_IN_ANY+L2_LINES_OUT_DEMAND_DIRTY)*64
+L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ANY*64.0/time
+L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ANY*64.0
+L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_LINES_OUT_DEMAND_DIRTY*64.0/time
+L3 evict data volume [GBytes] = 1.0E-09*L2_LINES_OUT_DEMAND_DIRTY*64.0
+L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ANY+L2_LINES_OUT_DEMAND_DIRTY)*64/time
+L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ANY+L2_LINES_OUT_DEMAND_DIRTY)*64
 -
 Profiling group to measure L3 cache bandwidth. The bandwidth is computed by the
 number of cacheline allocated in the L2 and the number of modified cachelines
diff --git a/groups/nehalem/L3CACHE.txt b/groups/nehalem/L3CACHE.txt
index b6ec110..e8e0023 100644
--- a/groups/nehalem/L3CACHE.txt
+++ b/groups/nehalem/L3CACHE.txt
@@ -1,28 +1,26 @@
 SHORT L3 cache miss rate/ratio
 
 EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
+FIXC0  INSTR_RETIRED_ANY
+FIXC1  CPU_CLK_UNHALTED_CORE
+FIXC2  CPU_CLK_UNHALTED_REF
 UPMC0  UNC_L3_HITS_ANY
 UPMC1  UNC_L3_MISS_ANY
-UPMC2  UNC_L3_LINES_IN_ANY
-UPMC3  UNC_L3_LINES_OUT_ANY
 
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-L3 request rate   UPMC0/FIXC0
+L3 request rate   (UPMC0+UPMC1)/FIXC0
 L3 miss rate   UPMC1/FIXC0
-L3 miss ratio  UPMC1/UPMC0
+L3 miss ratio  UPMC1/(UPMC0+UPMC1)
 
 LONG
 Formulas:
-L3 request rate  UNC_L3_HITS_ANY / INSTR_RETIRED_ANY 
-L3 miss rate   UNC_L3_MISS_ANY / INSTR_RETIRED_ANY
-L3 miss ratio  UNC_L3_MISS_ANY / UNC_L3_HITS_ANY
+L3 request rate = (UNC_L3_HITS_ANY+UNC_L3_MISS_ANY)/INSTR_RETIRED_ANY
+L3 miss rate = UNC_L3_MISS_ANY/INSTR_RETIRED_ANY
+L3 miss ratio = UNC_L3_MISS_ANY/(UNC_L3_HITS_ANY+UNC_L3_MISS_ANY)
 -
 This group measures the locality of your data accesses with regard to the
 L3 Cache. L3 request rate tells you how data intensive your code is
diff --git a/groups/nehalem/MEM.txt b/groups/nehalem/MEM.txt
index 087b269..c589123 100644
--- a/groups/nehalem/MEM.txt
+++ b/groups/nehalem/MEM.txt
@@ -1,33 +1,46 @@
 SHORT Main memory bandwidth in MBytes/s
 
 EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
+FIXC0  INSTR_RETIRED_ANY
+FIXC1  CPU_CLK_UNHALTED_CORE
+FIXC2  CPU_CLK_UNHALTED_REF
 UPMC0  UNC_QMC_NORMAL_READS_ANY
 UPMC1  UNC_QMC_WRITES_FULL_ANY
-UPMC2 UNC_QHL_REQUESTS_REMOTE_READS
-UPMC3 UNC_QHL_REQUESTS_LOCAL_READS 
-UPMC4 UNC_QHL_REQUESTS_REMOTE_WRITES 
+UPMC2  UNC_QHL_REQUESTS_REMOTE_READS
+UPMC3  UNC_QHL_REQUESTS_REMOTE_WRITES
 
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-Memory bandwidth [MBytes/s] 1.0E-06*(UPMC0+UPMC1)*64/time
-Memory data volume [GBytes] 1.0E-09*(UPMC0+UPMC1)*64
-Remote Read BW [MBytes/s] 1.0E-06*(UPMC2)*64/time
-Remote Write BW [MBytes/s] 1.0E-06*(UPMC4)*64/time
-Remote BW [MBytes/s] 1.0E-06*(UPMC2+UPMC4)*64/time
+Memory read bandwidth [MBytes/s] 1.0E-06*UPMC0*64.0/time
+Memory read data volume [GBytes] 1.0E-09*UPMC0*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*UPMC1*64.0/time
+Memory write data volume [GBytes] 1.0E-09*UPMC1*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(UPMC0+UPMC1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(UPMC0+UPMC1)*64.0
+Remote memory read bandwidth [MBytes/s] 1.0E-06*UPMC2*64.0/time
+Remote memory read data volume [GBytes] 1.0E-09*UPMC2*64.0
+Remote memory write bandwidth [MBytes/s] 1.0E-06*UPMC3*64.0/time
+Remote memory write data volume [GBytes] 1.0E-09*UPMC3*64.0
+Remote memory bandwidth [MBytes/s] 1.0E-06*(UPMC2+UPMC3)*64.0/time
+Remote memory data volume [GBytes] 1.0E-09*(UPMC2+UPMC3)*64.0
 
 LONG
 Formulas:
-Memory bandwidth [MBytes/s] = 1.0E-06*(UNC_QMC_NORMAL_READS_ANY+UNC_QMC_WRITES_FULL_ANY)*64/time
-Memory data volume [GBytes] = 1.0E-09*(UNC_QMC_NORMAL_READS_ANY+UNC_QMC_WRITES_FULL_ANY)*64
-Remote Read BW [MBytes/s] =  1.0E-06*(UNC_QHL_REQUESTS_REMOTE_READS)*64/time;
-Remote Write BW [MBytes/s] 1.0E-06*(UNC_QHL_REQUESTS_REMOTE_WRITES)*64/time
-Remote BW [MBytes/s] 1.0E-06*(UNC_QHL_REQUESTS_REMOTE_READS+UNC_QHL_REQUESTS_REMOTE_WRITES)*64/time
+Memory read bandwidth [MBytes/s] = 1.0E-06*UNC_QMC_NORMAL_READS_ANY*64.0/time
+Memory read data volume [GBytes] = 1.0E-09*UNC_QMC_NORMAL_READS_ANY*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*UNC_QMC_WRITES_FULL_ANY*64.0/time
+Memory write data volume [GBytes] = 1.0E-09*UNC_QMC_WRITES_FULL_ANY*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(UNC_QMC_NORMAL_READS_ANY+UNC_QMC_WRITES_FULL_ANY)*64.0/time
+Memory data volume [GBytes] = 1.0E-09*(UNC_QMC_NORMAL_READS_ANY+UNC_QMC_WRITES_FULL_ANY)*64.0
+Remote memory read bandwidth [MBytes/s] = 1.0E-06*UNC_QHL_REQUESTS_REMOTE_READS*64.0/time
+Remote memory read data volume [GBytes] = 1.0E-09*UNC_QHL_REQUESTS_REMOTE_READS*64.0
+Remote memory write bandwidth [MBytes/s] = 1.0E-06*UNC_QHL_REQUESTS_REMOTE_WRITES*64.0/time
+Remote memory write data volume [GBytes] = 1.0E-09*UNC_QHL_REQUESTS_REMOTE_WRITES*64.0
+Remote memory bandwidth [MBytes/s] = 1.0E-06*(UNC_QHL_REQUESTS_REMOTE_READS+UNC_QHL_REQUESTS_REMOTE_WRITES)*64.0/time
+Remote memory data volume [GBytes] = 1.0E-09*(UNC_QHL_REQUESTS_REMOTE_READS+UNC_QHL_REQUESTS_REMOTE_WRITES)*64.0
 -
 Profiling group to measure memory bandwidth drawn by all cores of a socket.
 This group will be measured by one core per socket. The Remote  Read BW  tells
diff --git a/groups/nehalem/VIEW.txt b/groups/nehalem/VIEW.txt
deleted file mode 100644
index 98a856f..0000000
--- a/groups/nehalem/VIEW.txt
+++ /dev/null
@@ -1,50 +0,0 @@
-SHORT Main memory bandwidth in MBytes/s
-
-EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
-PMC0  FP_COMP_OPS_EXE_SSE_FP_PACKED
-PMC1  FP_COMP_OPS_EXE_SSE_FP_SCALAR
-PMC2  FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION
-PMC3  FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION
-UPMC0  UNC_QMC_NORMAL_READS_ANY
-UPMC1  UNC_QMC_WRITES_FULL_ANY
-UPMC2 UNC_QHL_REQUESTS_REMOTE_READS
-UPMC3 UNC_QHL_REQUESTS_LOCAL_READS
-UPMC4 UNC_QHL_REQUESTS_REMOTE_WRITES
-
-METRICS
-Runtime (RDTSC) [s] time
-Runtime unhalted [s] FIXC1*inverseClock
-Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
-CPI  FIXC1/FIXC0
-DP MFlops/s (DP assumed) 1.0E-06*(PMC0*2.0+PMC1)/time
-SP MFlops/s (SP assumed) 1.0E-06*(PMC0*4.0+PMC1)/time
-Packed MUOPS/s   1.0E-06*PMC0/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
-SP MUOPS/s 1.0E-06*PMC2/time
-DP MUOPS/s 1.0E-06*PMC3/time
-Memory bandwidth [MBytes/s] 1.0E-06*(UPMC0+UPMC1)*64/time
-Memory data volume [GBytes] 1.0E-09*(UPMC0+UPMC1)*64
-Remote Read BW [MBytes/s] 1.0E-06*(UPMC2)*64/time
-Remote Write BW [MBytes/s] 1.0E-06*(UPMC4)*64/time
-Remote BW [MBytes/s] 1.0E-06*(UPMC2+UPMC4)*64/time
-
-LONG
-Formulas:
-DP MFlops/s =  (FP_COMP_OPS_EXE_SSE_FP_PACKED*2 +  FP_COMP_OPS_EXE_SSE_FP_SCALAR)/ runtime
-SP MFlops/s =  (FP_COMP_OPS_EXE_SSE_FP_PACKED*4 +  FP_COMP_OPS_EXE_SSE_FP_SCALAR)/ runtime
-Packed MUOPS/s   1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/time
-Scalar MUOPS/s 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/time
-SP MUOPS/s 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/time
-DP MUOPS/s 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/time
-Memory bandwidth [MBytes/s] = 1.0E-06*(UNC_QMC_NORMAL_READS_ANY+UNC_QMC_WRITES_FULL_ANY)*64/time
-Memory data volume [GBytes] = 1.0E-09*(UNC_QMC_NORMAL_READS_ANY+UNC_QMC_WRITES_FULL_ANY)*64
-Remote Read BW [MBytes/s] =  1.0E-06*(UNC_QHL_REQUESTS_REMOTE_READS)*64/time;
-Remote Write BW [MBytes/s] 1.0E-06*(UNC_QHL_REQUESTS_REMOTE_WRITES)*64/time
-Remote BW [MBytes/s] 1.0E-06*(UNC_QHL_REQUESTS_REMOTE_READS+UNC_QHL_REQUESTS_REMOTE_WRITES)*64/time
--
-This is a overview group using the capabilities of nehalem to measure multiple events at
-the same time.
-
diff --git a/groups/nehalemEX/BRANCH.txt b/groups/nehalemEX/BRANCH.txt
index 3d81416..62ac18f 100644
--- a/groups/nehalemEX/BRANCH.txt
+++ b/groups/nehalemEX/BRANCH.txt
@@ -19,10 +19,10 @@ Instructions per branch  FIXC0/PMC0
 
 LONG
 Formulas:
-Branch rate = BR_INST_RETIRED_ALL_BRANCHES / INSTR_RETIRED_ANY
-Branch misprediction rate =  BR_MISP_RETIRED_ALL_BRANCHES / INSTR_RETIRED_ANY
-Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES / BR_INST_RETIRED_ALL_BRANCHES
-Instructions per branch = INSTR_RETIRED_ANY / BR_INST_RETIRED_ALL_BRANCHES
+Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction rate =  BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES
+Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES
 -
 The rates state how often in average a branch or a mispredicted branch occured
 per instruction retired in total. The Branch misprediction ratio sets directly
diff --git a/groups/nehalemEX/DATA.txt b/groups/nehalemEX/DATA.txt
index a5611bc..08d6d76 100644
--- a/groups/nehalemEX/DATA.txt
+++ b/groups/nehalemEX/DATA.txt
@@ -12,11 +12,11 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-Load to Store ratio PMC0/PMC1
+Load to store ratio PMC0/PMC1
 
 LONG
 Formulas:
-Load to Store ratio = MEM_INST_RETIRED_LOADS / MEM_INST_RETIRED_STORES
+Load to store ratio = MEM_INST_RETIRED_LOADS/MEM_INST_RETIRED_STORES
 -
 This is a simple metric to determine your Load to store ratio.
 
diff --git a/groups/nehalemEX/FLOPS_DP.txt b/groups/nehalemEX/FLOPS_DP.txt
index c5ba91c..658b8ff 100644
--- a/groups/nehalemEX/FLOPS_DP.txt
+++ b/groups/nehalemEX/FLOPS_DP.txt
@@ -14,7 +14,7 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-DP MFlops/s  1.0E-06*(PMC0*2.0+PMC1)/time
+MFlops/s  1.0E-06*(PMC0*2.0+PMC1)/time
 Packed MUOPS/s   1.0E-06*PMC0/time
 Scalar MUOPS/s 1.0E-06*PMC1/time
 SP MUOPS/s 1.0E-06*PMC2/time
@@ -22,7 +22,11 @@ DP MUOPS/s 1.0E-06*PMC3/time
 
 LONG
 Formula:
-DP MFlops/s =  (FP_COMP_OPS_EXE_SSE_FP_PACKED*2 +  FP_COMP_OPS_EXE_SSE_FP_SCALAR)/ runtime
+MFlops/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime
+Packed MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime
+SP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime
+DP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime
 -
 The Nehalem has not possibility to measure MFlops if mixed precision calculations are done.
 Therefore both Single as well as Double precision are measured to ensure the correctness
diff --git a/groups/nehalemEX/FLOPS_SP.txt b/groups/nehalemEX/FLOPS_SP.txt
index 4478c8f..a954670 100644
--- a/groups/nehalemEX/FLOPS_SP.txt
+++ b/groups/nehalemEX/FLOPS_SP.txt
@@ -14,7 +14,7 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-SP MFlops/s (SP assumed) 1.0E-06*(PMC0*4.0+PMC1)/time
+MFlops/s 1.0E-06*(PMC0*4.0+PMC1)/time
 Packed MUOPS/s   1.0E-06*PMC0/time
 Scalar MUOPS/s 1.0E-06*PMC1/time
 SP MUOPS/s 1.0E-06*PMC2/time
@@ -22,7 +22,11 @@ DP MUOPS/s 1.0E-06*PMC3/time
 
 LONG
 Formula:
-SP MFlops/s =  (FP_COMP_OPS_EXE_SSE_FP_PACKED*4 +  FP_COMP_OPS_EXE_SSE_FP_SCALAR)/ runtime
+MFlops/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime
+Packed MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime
+SP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime
+DP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime
 -
 The Nehalem has not possibility to measure MFlops if mixed precision calculations are done.
 Therefore both Single as well as Double precision are measured to ensure the correctness
diff --git a/groups/nehalemEX/ICACHE.txt b/groups/nehalemEX/ICACHE.txt
new file mode 100644
index 0000000..49943ff
--- /dev/null
+++ b/groups/nehalemEX/ICACHE.txt
@@ -0,0 +1,25 @@
+SHORT  Instruction cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  L1I_READS
+PMC1  L1I_MISSES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1I request rate PMC0/FIXC0
+L1I miss rate PMC1/FIXC0
+L1I miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L1I request rate = L1I_READS / INSTR_RETIRED_ANY
+L1I miss rate = L1I_MISSES / INSTR_RETIRED_ANY
+L1I miss ratio = L1I_MISSES / L1I_READS
+-
+This group measures some L1 instruction cache metrics.
diff --git a/groups/nehalemEX/L2.txt b/groups/nehalemEX/L2.txt
index 2734c5d..25c5604 100644
--- a/groups/nehalemEX/L2.txt
+++ b/groups/nehalemEX/L2.txt
@@ -6,28 +6,35 @@ FIXC1 CPU_CLK_UNHALTED_CORE
 FIXC2 CPU_CLK_UNHALTED_REF
 PMC0  L1D_REPL
 PMC1  L1D_M_EVICT
+PMC2  L1I_MISSES
 
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-L2 Load [MBytes/s] 1.0E-06*PMC0*64.0/time
-L2 Evict [MBytes/s] 1.0E-06*PMC1*64.0/time
-L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
-L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+L2D load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time
+L2D load data volume [GBytes]  1.0E-09*PMC0*64.0
+L2D evict bandwidth [MBytes/s]  1.0E-06*PMC1*64.0/time
+L2D evict data volume [GBytes]  1.0E-09*PMC1*64.0
+L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time
+L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0
 
 LONG
 Formulas:
-L2 Load [MBytes/s] = 1.0E-06*L1D_REPL*64/time
-L2 Evict [MBytes/s] = 1.0E-06*L1D_M_EVICT*64/time
-L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPL+L1D_M_EVICT)*64/time
-L2 data volume [GBytes] = 1.0E-09*(L1D_REPL+L1D_M_EVICT)*64
+L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPL*64.0/time
+L2D load data volume [GBytes] = 1.0E-09*L1D_REPL*64.0
+L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64.0/time
+L2D evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64.0
+L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPL+L1D_M_EVICT+L1I_MISSES)*64/time
+L2 data volume [GBytes] = 1.0E-09*(L1D_REPL+L1D_M_EVICT+L1I_MISSES)*64
 -
 Profiling group to measure L2 cache bandwidth. The bandwidth is
 computed by the number of cacheline allocated in the L1 and the 
 number of modified cachelines evicted from the L1.  Also reports on
 total data volume transfered between L2 and L1 cache.
 Note that this bandwidth also includes data transfers due to a
-write allocate load on a store miss in L1.
+write allocate load on a store miss in L1 and traffic caused by misses in the
+L1 instruction cache.
+
 
diff --git a/groups/nehalemEX/L2CACHE.txt b/groups/nehalemEX/L2CACHE.txt
index 49778be..edc8d9c 100644
--- a/groups/nehalemEX/L2CACHE.txt
+++ b/groups/nehalemEX/L2CACHE.txt
@@ -4,7 +4,7 @@ EVENTSET
 FIXC0 INSTR_RETIRED_ANY
 FIXC1 CPU_CLK_UNHALTED_CORE
 FIXC2 CPU_CLK_UNHALTED_REF
-PMC0  L2_DATA_RQSTS_DEMAND_ANY
+PMC0  L2_RQSTS_REFERENCES
 PMC1  L2_RQSTS_MISS
 
 METRICS
@@ -18,9 +18,9 @@ L2 miss ratio PMC1/PMC0
 
 LONG
 Formulas:
-L2 request rate = L2_DATA_RQSTS_DEMAND_ANY / INSTR_RETIRED_ANY
-L2 miss rate  = L2_RQSTS_MISS / INSTR_RETIRED_ANY
-L2 miss ratio = L2_RQSTS_MISS / L2_DATA_RQSTS_DEMAND_ANY
+L2 request rate = L2_RQSTS_REFERENCES/INSTR_RETIRED_ANY
+L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY
+L2 miss ratio = L2_RQSTS_MISS/L2_RQSTS_REFERENCES
 -
 This group measures the locality of your data accesses with regard to the
 L2 Cache. L2 request rate tells you how data intensive your code is
@@ -30,6 +30,5 @@ cachelines from memory. And finally L2 miss ratio tells you how many of your
 memory references required a cacheline to be loaded from a higher level.
 While the Data cache miss rate might be given by your algorithm you should
 try to get Data cache miss ratio as low as possible by increasing your cache reuse.
-Note: This group might need to be revised!
 
 
diff --git a/groups/nehalemEX/L3.txt b/groups/nehalemEX/L3.txt
new file mode 100644
index 0000000..20348a1
--- /dev/null
+++ b/groups/nehalemEX/L3.txt
@@ -0,0 +1,37 @@
+SHORT  L3 cache bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  L2_LINES_IN_ANY
+PMC1  L2_LINES_OUT_DEMAND_DIRTY
+PMC2  L2_LINES_OUT_PREFETCH_DIRTY
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L3 load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time
+L3 load data volume [GBytes]  1.0E-09*PMC0*64.0
+L3 evict bandwidth [MBytes/s]  1.0E-06*(PMC1+PMC2)*64.0/time
+L3 evict data volume [GBytes]  1.0E-09*(PMC1+PMC2)*64.0
+L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time
+L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0
+
+LONG
+Formulas:
+L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ANY*64.0/time
+L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ANY*64.0
+L3 evict bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_OUT_DEMAND_DIRTY+L2_LINES_OUT_PREFETCH_DIRTY)*64.0/time
+L3 evict data volume [GBytes] = 1.0E-09*(L2_LINES_OUT_DEMAND_DIRTY+L2_LINES_OUT_PREFETCH_DIRTY)*64.0
+L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ANY+L2_LINES_OUT_DEMAND_DIRTY+L2_LINES_OUT_PREFETCH_DIRTY)*64/time
+L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ANY+L2_LINES_OUT_DEMAND_DIRTY+L2_LINES_OUT_PREFETCH_DIRTY)*64
+-
+Profiling group to measure L3 cache bandwidth. The bandwidth is computed by the
+number of cacheline allocated in the L2 and the number of modified cachelines
+evicted from the L2. Also reports total data volume between L3 and L2 caches.
+Note that this bandwidth also includes data transfers due to a write allocate
+load on a store miss in L2.
+
diff --git a/groups/nehalemEX/L3CACHE.txt b/groups/nehalemEX/L3CACHE.txt
new file mode 100644
index 0000000..fa472cc
--- /dev/null
+++ b/groups/nehalemEX/L3CACHE.txt
@@ -0,0 +1,48 @@
+SHORT L3 cache miss rate/ratio
+
+EVENTSET
+FIXC0  INSTR_RETIRED_ANY
+FIXC1  CPU_CLK_UNHALTED_CORE
+FIXC2  CPU_CLK_UNHALTED_REF
+CBOX0C0 LLC_HITS_ALL
+CBOX0C1 LLC_MISSES_ALL
+CBOX1C0 LLC_HITS_ALL
+CBOX1C1 LLC_MISSES_ALL
+CBOX2C0 LLC_HITS_ALL
+CBOX2C1 LLC_MISSES_ALL
+CBOX3C0 LLC_HITS_ALL
+CBOX3C1 LLC_MISSES_ALL
+CBOX4C0 LLC_HITS_ALL
+CBOX4C1 LLC_MISSES_ALL
+CBOX5C0 LLC_HITS_ALL
+CBOX5C1 LLC_MISSES_ALL
+CBOX6C0 LLC_HITS_ALL
+CBOX6C1 LLC_MISSES_ALL
+CBOX7C0 LLC_HITS_ALL
+CBOX7C1 LLC_MISSES_ALL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L3 request rate   (CBOX0C0+CBOX0C1+CBOX1C0+CBOX1C1+CBOX2C0+CBOX2C1+CBOX3C0+CBOX3C1+CBOX4C0+CBOX4C1+CBOX5C0+CBOX5C1+CBOX6C0+CBOX6C1+CBOX7C0+CBOX7C1)/FIXC0
+L3 miss rate   (CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1)/FIXC0
+L3 miss ratio  (CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1)/(CBOX0C0+CBOX0C1+CBOX1C0+CBOX1C1+CBOX2C0+CBOX2C1+CBOX3C0+CBOX3C1+CBOX4C0+CBOX4C1+CBOX5C0+CBOX5C1+CBOX6C0+CBOX6C1+CBOX7C0+CBOX7C1)
+
+LONG
+Formulas:
+L3 request rate = (SUM(LLC_HITS_ALL)+SUM(LLC_MISSES_ALL))/INSTR_RETIRED_ANY
+L3 miss rate = SUM(LLC_MISSES_ALL)/INSTR_RETIRED_ANY
+L3 miss ratio = SUM(LLC_MISSES_ALL)/(SUM(LLC_HITS_ALL)+SUM(LLC_MISSES_ALL))
+-
+This group measures the locality of your data accesses with regard to the
+L3 Cache. L3 request rate tells you how data intensive your code is
+or how many Data accesses you have in average per instruction.
+The L3 miss rate gives a measure how often it was necessary to get
+cachelines from memory. And finally L3 miss ratio tells you how many of your
+memory references required a cacheline to be loaded from a higher level.
+While the Data cache miss rate might be given by your algorithm you should
+try to get Data cache miss ratio as low as possible by increasing your cache reuse.
+
+
diff --git a/groups/nehalemEX/MEM.txt b/groups/nehalemEX/MEM.txt
index 86a2e97..510f27b 100644
--- a/groups/nehalemEX/MEM.txt
+++ b/groups/nehalemEX/MEM.txt
@@ -1,39 +1,42 @@
 SHORT Main memory bandwidth
 
 EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
-WBOX4 UNCORE_CYCLES
-MBOX0C0 FVC_EV0_BBOX_CMDS_READS 
-MBOX0C1 FVC_EV0_BBOX_RSP_ACK 
-MBOX1C0 FVC_EV0_BBOX_CMDS_READS 
-MBOX1C1 FVC_EV0_BBOX_RSP_ACK 
-BBOX0C1 IMT_INSERTS_WR 
-BBOX1C1 IMT_INSERTS_WR 
-RBOX0C0 NEW_PACKETS_RECV_PORT0_IPERF0_ANY_DRS
-RBOX0C1 NEW_PACKETS_RECV_PORT1_IPERF0_ANY_DRS
-RBOX1C0 NEW_PACKETS_RECV_PORT4_IPERF0_ANY_DRS
-RBOX1C1 NEW_PACKETS_RECV_PORT5_IPERF0_ANY_DRS
+FIXC0   INSTR_RETIRED_ANY
+FIXC1   CPU_CLK_UNHALTED_CORE
+FIXC2   CPU_CLK_UNHALTED_REF
+WBOXFIX UNCORE_CLOCKTICKS
+MBOX0C0 FVC_EV0_BBOX_CMDS_READS
+MBOX0C1 DRAM_CMD_CAS_WR_OPN
+MBOX1C0 FVC_EV0_BBOX_CMDS_READS
+MBOX1C1 DRAM_CMD_CAS_WR_OPN
 
 
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
-Uncore Clock [MHz]  1.E-06*(WBOX4)/time
+Uncore Clock [MHz]  1.E-06*(WBOXFIX)/time
 CPI  FIXC1/FIXC0
-Memory Read BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0)*64/time
-Memory Write BW [MBytes/s] 1.0E-06*(BBOX0C1+BBOX1C1)*64/time
-Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+BBOX0C1+BBOX1C1)*64/time
-Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+BBOX0C1+BBOX1C1)*64
-Remote write data traffic Port 0 [MBytes/s] 1.0E-06*(RBOX0C0)*64/time
-Remote write data traffic Port 1 [MBytes/s] 1.0E-06*(RBOX0C1)*64/time
-Remote write data traffic Port 4 [MBytes/s] 1.0E-06*(RBOX1C0)*64/time
-Remote write data traffic Port 5 [MBytes/s] 1.0E-06*(RBOX1C1)*64/time
+Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0)*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1)*64.0/time
+Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX0C1+MBOX1C1)*64/time
+Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX0C1+MBOX1C1)*64
 
 LONG
+Formulas:
+Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/time
+Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/time
+Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/time
+Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0
+-
 Profiling group to measure memory bandwidth drawn by all cores of a socket.
-Addional to the bandwidth it also outputs the data volume and the remote
-traffic over QPI links to other sockets.
+On Nehalem EX it is not possible to measure the write operations with the
+FVC_EV0_BBOX_CMDS_WRITES event at the same time as the FVC_EV0_BBOX_CMDS_READS
+because they set contrary bits. The DRAM_CMD_CAS_WR_OPN is an alternative but
+it only measures write operations to open pages, hence writes to closed pages
+are not included here.
 
diff --git a/groups/pentiumm/BRANCH.txt b/groups/pentiumm/BRANCH.txt
new file mode 100644
index 0000000..084552d
--- /dev/null
+++ b/groups/pentiumm/BRANCH.txt
@@ -0,0 +1,17 @@
+SHORT Branch prediction miss rate/ratio
+
+EVENTSET
+PMC0  BR_INST_EXEC
+PMC1  BR_INST_MISSP_EXEC
+
+METRICS
+Runtime (RDTSC) [s] time
+Branch misprediction ratio  PMC1/PMC0
+
+LONG
+Formulas:
+Branch misprediction ratio = BR_INST_MISSP_EXEC / BR_INST_EXEC
+-
+The rates state how often on average a branch or a mispredicted branch occurred
+per instruction retired in total. The Branch misprediction ratio sets directly
+into relation what ratio of all branch instructions were mispredicted.
diff --git a/groups/pentiumm/CPI.txt b/groups/pentiumm/CPI.txt
new file mode 100644
index 0000000..5b7fa88
--- /dev/null
+++ b/groups/pentiumm/CPI.txt
@@ -0,0 +1,18 @@
+SHORT  Cycles per instruction
+
+EVENTSET
+PMC0  UOPS_RETIRED
+PMC1  CPU_CLK_UNHALTED
+
+METRICS
+Runtime (RDTSC) [s] time
+CPI   PMC1/PMC0
+IPC   PMC0/PMC1
+
+LONG
+This group measures how efficiently the processor works with
+regard to instruction throughput. Also important as a standalone
+metric is UOPS_RETIRED as it tells you how many uops
+you need to execute for a task. An optimization might show very
+low CPI values but execute many more instructions for it.
+
diff --git a/groups/pentiumm/FLOPS_DP.txt b/groups/pentiumm/FLOPS_DP.txt
new file mode 100644
index 0000000..310f4ad
--- /dev/null
+++ b/groups/pentiumm/FLOPS_DP.txt
@@ -0,0 +1,18 @@
+SHORT Double Precision MFlops/s
+
+EVENTSET
+PMC0 EMON_SSE_SSE2_COMP_INST_RETIRED_PACKED_DP
+PMC1 EMON_SSE_SSE2_COMP_INST_RETIRED_SCALAR_DP
+
+METRICS
+Runtime (RDTSC) [s] time
+MFlops/s  1.0E-06*(PMC0*2.0+PMC1)/time
+Packed MUOPS/s   1.0E-06*(PMC0)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+
+LONG
+Formula:
+MFlops/s = 1.0E-06*(EMON_SSE_SSE2_COMP_INST_RETIRED_PACKED_DP*2+EMON_SSE_SSE2_COMP_INST_RETIRED_SCALAR_DP)/runtime
+-
+SSE scalar and packed double precision flop rates.
+
diff --git a/groups/pentiumm/FLOPS_SP.txt b/groups/pentiumm/FLOPS_SP.txt
new file mode 100644
index 0000000..5167a00
--- /dev/null
+++ b/groups/pentiumm/FLOPS_SP.txt
@@ -0,0 +1,18 @@
+SHORT Single Precision MFlops/s
+
+EVENTSET
+PMC0 EMON_SSE_SSE2_COMP_INST_RETIRED_ALL_SP
+PMC1 EMON_SSE_SSE2_COMP_INST_RETIRED_SCALAR_SP
+
+METRICS
+Runtime (RDTSC) [s] time
+MFlops/s  1.0E-06*(PMC0)/time
+Scalar MUOPS/s 1.0E-06*(PMC1)/time
+
+LONG
+Formula:
+MFlops/s = 1.0E-06*EMON_SSE_SSE2_COMP_INST_RETIRED_ALL_SP/runtime
+Scalar MUOPS/s = 1.0E-06*EMON_SSE_SSE2_COMP_INST_RETIRED_SCALAR_SP/runtime
+-
+SSE scalar and packed single precision flop rates.
+
diff --git a/groups/pentiumm/L3.txt b/groups/pentiumm/L3.txt
new file mode 100644
index 0000000..9fe5000
--- /dev/null
+++ b/groups/pentiumm/L3.txt
@@ -0,0 +1,30 @@
+SHORT L3 cache bandwidth in MBytes/s
+
+EVENTSET
+PMC0  L2_LINES_IN_ALL_ALL
+PMC1  L2_LINES_OUT_ALL_ALL
+
+METRICS
+Runtime (RDTSC) [s] time
+L3 load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time
+L3 load data volume [GBytes]  1.0E-09*PMC0*64.0
+L3 evict bandwidth [MBytes/s]  1.0E-06*PMC1*64.0/time
+L3 evict data volume [GBytes]  1.0E-09*PMC1*64.0
+L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+
+LONG
+Formulas:
+L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL_ALL*64.0/time
+L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL_ALL*64.0
+L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_LINES_OUT_ALL_ALL*64.0/time
+L3 evict data volume [GBytes] = 1.0E-09*L2_LINES_OUT_ALL_ALL*64.0
+L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL_ALL+L2_LINES_OUT_ALL_ALL)*64/time
+L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL_ALL+L2_LINES_OUT_ALL_ALL)*64
+-
+Profiling group to measure L3 cache bandwidth. The bandwidth is computed by the
+number of cacheline allocated in the L2 and the number of modified cachelines
+evicted from the L2. The group also outputs total data volume transferred between
+L3 and L2. Note that this bandwidth also includes data transfers due to a write
+allocate load on a store miss in L2.
+
diff --git a/groups/phi/CACHE.txt b/groups/phi/CACHE.txt
index d611965..e899c07 100644
--- a/groups/phi/CACHE.txt
+++ b/groups/phi/CACHE.txt
@@ -1,4 +1,4 @@
-SHORT  Compute to Data Access Ratio
+SHORT L1 Compute to Data Access Ratio
 
 EVENTSET
 PMC0  VPU_ELEMENTS_ACTIVE
@@ -8,10 +8,13 @@ METRICS
 Runtime (RDTSC) [s] time
 L1 compute intensity   PMC0/PMC1
 
-LONG 
+LONG
+Formulas:
+L1 compute intensity = VPU_ELEMENTS_ACTIVE/DATA_READ_OR_WRITE
+-
 These metric is a way to measure the computational density of an
 application, or how many computations it is performing on average for each
-piece of data loaded.  L1 Compute to Data Access Ratio, should be
+piece of data loaded.  L1 Compute to Data Access Ratio should be
 used to judge suitability of an application for running on the Intel MIC
 Architecture. Applications that will perform well on the Intel� MIC
 Architecture should be vectorized, and ideally be able to perform multiple
diff --git a/groups/phi/COMPUTE_TO_DATA_RATIO.txt b/groups/phi/COMPUTE_TO_DATA_RATIO.txt
new file mode 100644
index 0000000..a6c1524
--- /dev/null
+++ b/groups/phi/COMPUTE_TO_DATA_RATIO.txt
@@ -0,0 +1,22 @@
+SHORT L2 Compute to Data Access Ratio
+
+EVENTSET
+PMC0  VPU_ELEMENTS_ACTIVE
+PMC1  DATA_READ_MISS_OR_WRITE_MISS
+
+METRICS
+Runtime (RDTSC) [s] time
+L2 compute intensity   PMC0/PMC1
+
+LONG
+Formulas:
+L2 compute intensity = VPU_ELEMENTS_ACTIVE/DATA_READ_MISS_OR_WRITE_MISS
+--
+This metric is a way to measure the computational density of an
+application, or how many computations it is performing on average for each
+piece of data loaded.  L2 Compute to Data Access Ratio should be
+used to judge suitability of an application for running on the Intel MIC
+Architecture. Applications that will perform well on the Intel® MIC
+Architecture should be vectorized, and ideally be able to perform multiple
+operations on the same pieces of data (or same cachelines).
+
diff --git a/groups/phi/L2CACHE.txt b/groups/phi/L2CACHE.txt
deleted file mode 100644
index 228a5ba..0000000
--- a/groups/phi/L2CACHE.txt
+++ /dev/null
@@ -1,19 +0,0 @@
-SHORT L2 Compute to Data Access Ratio
-
-EVENTSET
-PMC0  VPU_ELEMENTS_ACTIVE
-PMC1  DATA_READ_MISS_OR_WRITE_MISS
-
-METRICS
-Runtime (RDTSC) [s] time
-L2 compute intensity   PMC0/PMC1
-
-LONG
-These metric is a way to measure the computational density of an
-application, or how many computations it is performing on average for each
-piece of data loaded.  L2 Compute to Data Access Ratio, should be
-used to judge suitability of an application for running on the Intel MIC
-Architecture. Applications that will perform well on the Intel® MIC
-Architecture should be vectorized, and ideally be able to perform multiple
-operations on the same pieces of data (or same cachelines).
-
diff --git a/groups/phi/MEM.txt b/groups/phi/MEM.txt
new file mode 100644
index 0000000..643b830
--- /dev/null
+++ b/groups/phi/MEM.txt
@@ -0,0 +1,18 @@
+SHORT Memory bandwidth
+
+EVENTSET
+PMC0  DATA_READ_MISS_OR_WRITE_MISS
+PMC1  DATA_CACHE_LINES_WRITTEN_BACK
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Memory bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+
+LONG
+Formulas:
+Memory bandwidth [MBytes/s] = 1.0E-06*(DATA_READ_MISS_OR_WRITE_MISS+DATA_CACHE_LINES_WRITTEN_BACK)*64.0/time
+Memory data volume [GBytes] = 1.0E-09*(DATA_READ_MISS_OR_WRITE_MISS+DATA_CACHE_LINES_WRITTEN_BACK)*64.0
+--
+Total memory bandwidth and data volume.
diff --git a/groups/phi/MEM1.txt b/groups/phi/MEM1.txt
index 16e44e0..66faa2c 100644
--- a/groups/phi/MEM1.txt
+++ b/groups/phi/MEM1.txt
@@ -1,13 +1,15 @@
-SHORT L2 Write Misses
+SHORT L2 write misses
 
 EVENTSET
 PMC0  L2_DATA_WRITE_MISS_MEM_FILL
 
 METRICS
 Runtime (RDTSC) [s] time
-RFO Data Bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
-RFO Volume [GBytes] 1.0E-09*PMC0*64.0
+L2 RFO bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+L2 RFO data volume [GBytes] 1.0E-09*PMC0*64.0
 
 LONG
-Bla
+Formulas:
+L2 RFO bandwidth [MBytes/s] = 1.0E-06*L2_DATA_WRITE_MISS_MEM_FILL*64.0/time
+L2 RFO data volume [GBytes] = 1.0E-09*L2_DATA_WRITE_MISS_MEM_FILL*64.0
 
diff --git a/groups/phi/MEM2.txt b/groups/phi/MEM2.txt
index 9be1f2a..9de69e0 100644
--- a/groups/phi/MEM2.txt
+++ b/groups/phi/MEM2.txt
@@ -1,13 +1,15 @@
-SHORT L2 Read Misses
+SHORT L2 read misses
 
 EVENTSET
 PMC0  L2_DATA_READ_MISS_MEM_FILL
 
 METRICS
 Runtime (RDTSC) [s] time
-Read Data Bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
-Read Data Volume [GBytes] 1.0E-09*PMC0*64.0
+L2 read data bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+L2 read data volume [GBytes] 1.0E-09*PMC0*64.0
 
 LONG
-Bla
+Formulas:
+L2 read data bandwidth [MBytes/s] = 1.0E-06*L2_DATA_READ_MISS_MEM_FILL*64.0/time
+L2 read data volume [GBytes] = 1.0E-09*L2_DATA_READ_MISS_MEM_FILL*64.0
 
diff --git a/groups/phi/MEM3.txt b/groups/phi/MEM3.txt
index 45ce0de..3ac379b 100644
--- a/groups/phi/MEM3.txt
+++ b/groups/phi/MEM3.txt
@@ -5,9 +5,11 @@ PMC0  HWP_L2MISS
 
 METRICS
 Runtime (RDTSC) [s] time
-Prefetch Data Bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
-Prefetch Data Volume [GBytes] 1.0E-09*PMC0*64.0
+Prefetch bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+Prefetch volume [GBytes] 1.0E-09*PMC0*64.0
 
 LONG
-Bla
+Formulas:
+Prefetch bandwidth [MBytes/s] = 1.0E-06*HWP_L2MISS*64.0/time
+Prefetch volume [GBytes] = 1.0E-09*HWP_L2MISS*64.0
 
diff --git a/groups/phi/MEM4.txt b/groups/phi/MEM4.txt
index a861a8b..436c357 100644
--- a/groups/phi/MEM4.txt
+++ b/groups/phi/MEM4.txt
@@ -1,13 +1,15 @@
-SHORT L2 Victim requests
+SHORT L2 victim requests
 
 EVENTSET
 PMC0  L2_VICTIM_REQ_WITH_DATA
 
 METRICS
 Runtime (RDTSC) [s] time
-Victim Data Bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
-Victim Data Volume [GBytes] 1.0E-09*PMC0*64.0
+Victim data bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+Victim data volume [GBytes] 1.0E-09*PMC0*64.0
 
 LONG
-Bla
+Formulas:
+Victim data bandwidth [MBytes/s] = 1.0E-06*L2_VICTIM_REQ_WITH_DATA*64.0/time
+Victim data volume [GBytes] = 1.0E-09*L2_VICTIM_REQ_WITH_DATA*64.0
 
diff --git a/groups/phi/MEM5.txt b/groups/phi/MEM5.txt
index ade9828..d206eee 100644
--- a/groups/phi/MEM5.txt
+++ b/groups/phi/MEM5.txt
@@ -1,13 +1,15 @@
-SHORT L2 Snoop hits
+SHORT L2 snoop hits
 
 EVENTSET
 PMC0  SNP_HITM_L2
 
 METRICS
 Runtime (RDTSC) [s] time
-Snoop Data Bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
-Snoop Data Volume [GBytes] 1.0E-09*PMC0*64.0
+Snoop data bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+Snoop data volume [GBytes] 1.0E-09*PMC0*64.0
 
 LONG
-Bla
+Formulas:
+Snoop data bandwidth [MBytes/s] = 1.0E-06*SNP_HITM_L2*64.0/time
+Snoop data volume [GBytes] = 1.0E-09*SNP_HITM_L2*64.0
 
diff --git a/groups/phi/MEM6.txt b/groups/phi/MEM6.txt
index 41be52e..4b6fa66 100644
--- a/groups/phi/MEM6.txt
+++ b/groups/phi/MEM6.txt
@@ -1,13 +1,15 @@
-SHORT L2 Read Misses
+SHORT L2 read misses
 
 EVENTSET
 PMC0  L2_READ_MISS
 
 METRICS
 Runtime (RDTSC) [s] time
-L2 Read Data Bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
-L2 Read Data Volume [GBytes] 1.0E-09*PMC0*64.0
+L2 read data bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+L2 read data volume [GBytes] 1.0E-09*PMC0*64.0
 
 LONG
-Bla
+Formulas:
+L2 read data bandwidth [MBytes/s] = 1.0E-06*L2_READ_MISS*64.0/time
+L2 read data volume [GBytes] = 1.0E-09*L2_READ_MISS*64.0
 
diff --git a/groups/phi/MEM_READ.txt b/groups/phi/MEM_READ.txt
new file mode 100644
index 0000000..f7d20e9
--- /dev/null
+++ b/groups/phi/MEM_READ.txt
@@ -0,0 +1,20 @@
+SHORT Memory read bandwidth
+
+EVENTSET
+PMC0  DATA_READ_MISS
+PMC1  HWP_L2MISS
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Memory read bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+
+LONG
+Formulas:
+Memory read bandwidth [MBytes/s] = 1.0E-06*(DATA_READ_MISS+HWP_L2MISS)*64.0/time
+Memory read data volume [GBytes] = 1.0E-09*(DATA_READ_MISS+HWP_L2MISS)*64.0
+--
+Bandwidth and data volume of read operations from the memory to L2 cache. The
+metric is introduced in the book 'Intel Xeon Phi Coprocessor High-Performance
+Programming' by James Jeffers and James Reinders.
diff --git a/groups/phi/MEM_WRITE.txt b/groups/phi/MEM_WRITE.txt
new file mode 100644
index 0000000..900af94
--- /dev/null
+++ b/groups/phi/MEM_WRITE.txt
@@ -0,0 +1,20 @@
+SHORT Memory write bandwidth
+
+EVENTSET
+PMC0  L2_VICTIM_REQ_WITH_DATA
+PMC1  SNP_HITM_L2
+
+
+METRICS
+Runtime (RDTSC) [s] time
+Memory write bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+Memory write data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+
+LONG
+Formulas:
+Memory write bandwidth [MBytes/s] = 1.0E-06*(L2_VICTIM_REQ_WITH_DATA+SNP_HITM_L2)*64.0/time
+Memory write data volume [GBytes] = 1.0E-09*(L2_VICTIM_REQ_WITH_DATA+SNP_HITM_L2)*64.0
+--
+Bandwidth and data volume of write operations from the L2 cache to memory. The
+metric is introduced in the book 'Intel Xeon Phi Coprocessor High-Performance
+Programming' by James Jeffers and James Reinders.
diff --git a/groups/phi/PAIRING.txt b/groups/phi/PAIRING.txt
index 2e93cc8..941d5a5 100644
--- a/groups/phi/PAIRING.txt
+++ b/groups/phi/PAIRING.txt
@@ -6,8 +6,16 @@ PMC1  INSTRUCTIONS_EXECUTED_V_PIPE
 
 METRICS
 Runtime (RDTSC) [s] time
-VPipeRatio   PMC1/PMC0
-PairingRatio PMC1/(PMC0-PMC1)
+V-pipe ratio   PMC1/PMC0
+Pairing ratio PMC1/(PMC0-PMC1)
 
 LONG
-Pairing ratio
+Formulas:
+V-pipe ratio = INSTRUCTIONS_EXECUTED_V_PIPE/INSTRUCTIONS_EXECUTED
+Pairing ratio = INSTRUCTIONS_EXECUTED_V_PIPE/(INSTRUCTIONS_EXECUTED-INSTRUCTIONS_EXECUTED_V_PIPE)
+--
+Each hardware thread on the Xeon Phi can execute two instructions simultaneously,
+one in the U-pipe and one in the V-pipe. But this is only possible if the
+instructions can be paired. The instructions executed in paired fashion are counted
+by the event INSTRUCTIONS_EXECUTED_V_PIPE. The event INSTRUCTIONS_EXECUTED increments
+for each instruction, hence the maximal increase per cycle can be 2.
diff --git a/groups/phi/READ_MISS_RATIO.txt b/groups/phi/READ_MISS_RATIO.txt
index c98f91b..77b8a17 100644
--- a/groups/phi/READ_MISS_RATIO.txt
+++ b/groups/phi/READ_MISS_RATIO.txt
@@ -1,4 +1,4 @@
-SHORT Miss ratio for data read
+SHORT Miss ratio for data reads
 
 EVENTSET
 PMC0  DATA_READ
@@ -6,7 +6,10 @@ PMC1  DATA_READ_MISS
 
 METRICS
 Runtime (RDTSC) [s] time
-Miss ratio PMC1/PMC0
+Read miss ratio PMC1/PMC0
 
 LONG
-Miss ratio for data read
+Formulas:
+Read miss ratio = DATA_READ_MISS/DATA_READ
+--
+Miss ratio for data reads.
diff --git a/groups/phi/TLB.txt b/groups/phi/TLB.txt
new file mode 100644
index 0000000..aac4f68
--- /dev/null
+++ b/groups/phi/TLB.txt
@@ -0,0 +1,23 @@
+SHORT TLB Misses
+
+EVENTSET
+PMC0 LONG_DATA_PAGE_WALK
+PMC1 DATA_PAGE_WALK
+
+METRICS
+Runtime (RDTSC) [s] time
+L1 TLB misses [misses/s] PMC1/time
+L2 TLB misses [misses/s] PMC0/time
+L1 TLB misses per L2 TLB miss PMC1/PMC0
+
+LONG
+Formulas:
+L1 TLB misses [misses/s] = DATA_PAGE_WALK/time
+L2 TLB misses [misses/s] = LONG_DATA_PAGE_WALK/time
+L1 TLB misses per L2 TLB miss = DATA_PAGE_WALK/LONG_DATA_PAGE_WALK
+--
+Analysis of the layered TLB of the Intel Xeon Phi. According to the book
+'Intel Xeon Phi Coprocessor High-Performance Programming' by James Jeffers and
+James Reinders, a high L1 TLB misses per L2 TLB miss ratio suggests that your
+working set fits into the L2 TLB but not in L1 TLB. Using large pages may be
+beneficial.
diff --git a/groups/phi/TLB_L1.txt b/groups/phi/TLB_L1.txt
new file mode 100644
index 0000000..d06044b
--- /dev/null
+++ b/groups/phi/TLB_L1.txt
@@ -0,0 +1,23 @@
+SHORT L1 TLB misses
+
+EVENTSET
+PMC0 DATA_PAGE_WALK
+PMC1 DATA_READ_OR_WRITE
+
+METRICS
+Runtime (RDTSC) [s] time
+L1 TLB misses [misses/s] PMC0/time
+L1 TLB miss ratio PMC0/PMC1
+
+LONG
+Formulas:
+L1 TLB misses [misses/s] = DATA_PAGE_WALK/time
+L1 TLB miss ratio = DATA_PAGE_WALK/DATA_READ_OR_WRITE
+--
+This performance group measures the L1 TLB misses. An L1 TLB miss that hits the
+L2 TLB has a penalty of about 25 cycles for 4kB pages. For 2MB pages, the penalty
+for an L1 TLB miss that hits the L2 TLB is about 8 cycles. The minimal L1 TLB miss ratio
+is about 1/64, so a high ratio indicates bad spatial locality, i.e. the data of a page
+is only partly accessed. It can also indicate thrashing because when multiple pages
+are accessed in a loop iteration, the size and associativity are not sufficient to
+hold all pages.
diff --git a/groups/phi/TLB_L2.txt b/groups/phi/TLB_L2.txt
new file mode 100644
index 0000000..2665fdc
--- /dev/null
+++ b/groups/phi/TLB_L2.txt
@@ -0,0 +1,21 @@
+SHORT L2 TLB misses
+
+EVENTSET
+PMC0 LONG_DATA_PAGE_WALK
+PMC1 DATA_READ_OR_WRITE
+
+METRICS
+Runtime (RDTSC) [s] time
+L2 TLB misses [misses/s] PMC0/time
+L2 TLB miss ratio PMC0/PMC1
+
+LONG
+Formulas:
+L2 TLB misses [misses/s] = LONG_DATA_PAGE_WALK/time
+L2 TLB miss ratio = LONG_DATA_PAGE_WALK/DATA_READ_OR_WRITE
+--
+This performance group measures the L2 TLB misses. An L2 TLB miss has a penalty
+of at least 100 cycles, hence it is important to avoid them. A high ratio can
+indicate thrashing because when multiple pages are accessed in a loop iteration,
+the size and associativity are not sufficient to hold all pages. This would also
+result in a bad ratio for the L1 TLB.
diff --git a/groups/phi/VECTOR.txt b/groups/phi/VECTOR.txt
index 1e91bc4..5155aca 100644
--- a/groups/phi/VECTOR.txt
+++ b/groups/phi/VECTOR.txt
@@ -1,4 +1,4 @@
-SHORT  Vector unit usage
+SHORT  Vectorization intensity
 
 EVENTSET
 PMC0  VPU_INSTRUCTIONS_EXECUTED
@@ -6,10 +6,16 @@ PMC1  VPU_ELEMENTS_ACTIVE
 
 METRICS
 Runtime (RDTSC) [s] time
-Vectorization Intensity PMC1/PMC0
+Vectorization intensity PMC1/PMC0
 
 LONG
+Formula:
+Vectorization intensity = VPU_ELEMENTS_ACTIVE / VPU_INSTRUCTIONS_EXECUTED
+--
 Vector instructions include instructions that perform floating-point
 operations, instructions that load vector registers from memory and store them
 to memory, instructions to manipulate vector mask registers, and other special
 purpose instructions such as vector shuffle.
+According to the book 'Intel Xeon Phi Coprocessor High-Performance Programming'
+by James Jeffers and James Reinders, the vectorization intensity should be >=8
+for double precision and >=16 for single precision.
diff --git a/groups/phi/VECTOR2.txt b/groups/phi/VECTOR2.txt
index 487460c..13985fd 100644
--- a/groups/phi/VECTOR2.txt
+++ b/groups/phi/VECTOR2.txt
@@ -9,9 +9,10 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s]  PMC1*inverseClock
 
 LONG
+No derived metrics.
+--
 This group measures how efficient the processor works with
-regard to instruction throughput. Also important as a standalone
-metric is INSTRUCTIONS_RETIRED as it tells you how many instruction
-you need to execute for a task. An optimization might show very
-low CPI values but execute many more instruction for it.
+regard to vectorization instruction throughput. The event VPU_STALL_REG counts
+the VPU stalls due to data dependencies. Dependencies are read-after-write,
+write-after-write and write-after-read.
 
diff --git a/groups/phi/VPU_FILL_RATIO_DBL.txt b/groups/phi/VPU_FILL_RATIO_DBL.txt
index 50d3835..494de53 100644
--- a/groups/phi/VPU_FILL_RATIO_DBL.txt
+++ b/groups/phi/VPU_FILL_RATIO_DBL.txt
@@ -1,4 +1,4 @@
-SHORT VPU filling for Double
+SHORT VPU filling for double precision data
 
 EVENTSET
 PMC0  VPU_INSTRUCTIONS_EXECUTED
@@ -6,7 +6,13 @@ PMC1  VPU_ELEMENTS_ACTIVE
 
 METRICS
 Runtime (RDTSC) [s] time
-VPUFillRatio PMC0*8/PMC1
+VPU fill ratio PMC0*8/PMC1
 
 LONG
-VPU filling for Double
+Formulas:
+VPU fill ratio = VPU_INSTRUCTIONS_EXECUTED*8/VPU_ELEMENTS_ACTIVE
+--
+This performance group measures the number of vector instructions that are
+performed on each vector loaded to the VPU. It is important to increase the
+ratio to get a high throughput because memory accesses (loading data to the VPU)
+are expensive.
diff --git a/groups/phi/VPU_PAIRING.txt b/groups/phi/VPU_PAIRING.txt
index 998c1d7..024919b 100644
--- a/groups/phi/VPU_PAIRING.txt
+++ b/groups/phi/VPU_PAIRING.txt
@@ -1,4 +1,4 @@
-SHORT VPU Pairing ratio
+SHORT VPU pairing ratio
 
 EVENTSET
 PMC0  VPU_INSTRUCTIONS_EXECUTED
@@ -6,8 +6,15 @@ PMC1  VPU_INSTRUCTIONS_EXECUTED_V_PIPE
 
 METRICS
 Runtime (RDTSC) [s] time
-VPipeRatio   PMC1/PMC0
-PairingRatio PMC1/(PMC0-PMC1)
+V-pipe ratio   PMC1/PMC0
+Pairing ratio PMC1/(PMC0-PMC1)
 
 LONG
-VPU Pairing ratio
+Formulas:
+V-pipe ratio = VPU_INSTRUCTIONS_EXECUTED_V_PIPE/VPU_INSTRUCTIONS_EXECUTED
+Pairing ratio = VPU_INSTRUCTIONS_EXECUTED_V_PIPE/(VPU_INSTRUCTIONS_EXECUTED-VPU_INSTRUCTIONS_EXECUTED_V_PIPE)
+--
+This performance group measures the pairing ratio of vector instructions. The
+V-pipe can only execute a subset of all instructions; the main workload is done
+by the U-pipe. A higher throughput can be achieved if the pairing ratio is
+increased.
diff --git a/groups/phi/VPU_READ_MISS_RATIO.txt b/groups/phi/VPU_READ_MISS_RATIO.txt
index 94ec963..502644a 100644
--- a/groups/phi/VPU_READ_MISS_RATIO.txt
+++ b/groups/phi/VPU_READ_MISS_RATIO.txt
@@ -1,4 +1,4 @@
-SHORT Miss ratio for VPU data read
+SHORT Miss ratio for VPU data reads
 
 EVENTSET
 PMC0  VPU_DATA_READ
@@ -6,7 +6,11 @@ PMC1  VPU_DATA_READ_MISS
 
 METRICS
 Runtime (RDTSC) [s] time
-Miss ratio PMC1/PMC0
+VPU read miss ratio PMC1/PMC0
 
 LONG
-Miss ratio for VPU data read
+Formula:
+VPU read miss ratio = VPU_DATA_READ_MISS/VPU_DATA_READ
+--
+This performance group determines the ratio between reads and reads that miss
+the cache and are issued by the VPU.
diff --git a/groups/phi/VPU_WRITE_MISS_RATIO.txt b/groups/phi/VPU_WRITE_MISS_RATIO.txt
index 429ee6d..b098b6f 100644
--- a/groups/phi/VPU_WRITE_MISS_RATIO.txt
+++ b/groups/phi/VPU_WRITE_MISS_RATIO.txt
@@ -1,4 +1,4 @@
-SHORT Miss ratio for VPU data write
+SHORT Miss ratio for VPU data writes
 
 EVENTSET
 PMC0  VPU_DATA_WRITE
@@ -6,7 +6,11 @@ PMC1  VPU_DATA_WRITE_MISS
 
 METRICS
 Runtime (RDTSC) [s] time
-Miss ratio PMC1/PMC0
+VPU write miss ratio PMC1/PMC0
 
 LONG
-Miss ratio for VPU data write
+Formula:
+VPU write miss ratio = VPU_DATA_WRITE_MISS/VPU_DATA_WRITE
+--
+This performance group determines the ratio between writes and writes that miss
+the cache and are issued by the VPU.
diff --git a/groups/phi/WRITE_MISS_RATIO.txt b/groups/phi/WRITE_MISS_RATIO.txt
index 0544b0e..0b83772 100644
--- a/groups/phi/WRITE_MISS_RATIO.txt
+++ b/groups/phi/WRITE_MISS_RATIO.txt
@@ -1,4 +1,4 @@
-SHORT Miss ratio for data write
+SHORT Miss ratio for data writes
 
 EVENTSET
 PMC0  DATA_WRITE
@@ -6,7 +6,10 @@ PMC1  DATA_WRITE_MISS
 
 METRICS
 Runtime (RDTSC) [s] time
-Miss ratio PMC1/PMC0
+Write miss ratio PMC1/PMC0
 
 LONG
-Miss ratio for data write
+Formulas:
+Write miss ratio = DATA_WRITE_MISS/DATA_WRITE
+--
+Miss ratio for data writes.
diff --git a/groups/sandybridge/BRANCH.txt b/groups/sandybridge/BRANCH.txt
index cbaf834..09699d9 100644
--- a/groups/sandybridge/BRANCH.txt
+++ b/groups/sandybridge/BRANCH.txt
@@ -19,10 +19,10 @@ Instructions per branch  FIXC0/PMC0
 
 LONG
 Formulas:
-Branch rate = BR_INST_RETIRED_ALL_BRANCHES / INSTR_RETIRED_ANY
-Branch misprediction rate =  BR_MISP_RETIRED_ALL_BRANCHES / INSTR_RETIRED_ANY
-Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES / BR_INST_RETIRED_ALL_BRANCHES
-Instructions per branch = INSTR_RETIRED_ANY / BR_INST_RETIRED_ALL_BRANCHES
+Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction rate =  BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES
+Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES
 -
 The rates state how often in average a branch or a mispredicted branch occured
 per instruction retired in total. The Branch misprediction ratio sets directly
diff --git a/groups/sandybridge/DATA.txt b/groups/sandybridge/DATA.txt
index 5f04a23..1220980 100644
--- a/groups/sandybridge/DATA.txt
+++ b/groups/sandybridge/DATA.txt
@@ -12,11 +12,11 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-Load to Store ratio PMC0/PMC1
+Load to store ratio PMC0/PMC1
 
 LONG
 Formulas:
-Load to Store ratio = MEM_UOP_RETIRED_LOADS / MEM_UOP_RETIRED_STORES
+Load to store ratio = MEM_UOP_RETIRED_LOADS/MEM_UOP_RETIRED_STORES
 -
 This is a metric to determine your load to store ratio.
 
diff --git a/groups/sandybridge/ENERGY.txt b/groups/sandybridge/ENERGY.txt
index 9261934..2b466c8 100644
--- a/groups/sandybridge/ENERGY.txt
+++ b/groups/sandybridge/ENERGY.txt
@@ -7,10 +7,11 @@ FIXC2 CPU_CLK_UNHALTED_REF
 TMP0  TEMP_CORE
 PWR0  PWR_PKG_ENERGY
 PWR1  PWR_PP0_ENERGY
+PWR2  PWR_PP1_ENERGY
 PWR3  PWR_DRAM_ENERGY
 
 METRICS
-Runtime (RDTSC) [s] time 
+Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
@@ -18,16 +19,19 @@ Temperature [C]  TMP0
 Energy [J]  PWR0
 Power [W] PWR0/time
 Energy PP0 [J]  PWR1
-Power PP0 [W] PWR1/time
+Power PP0 [W]  PWR1/time
+Energy PP1 [J]  PWR2
+Power PP1 [W] PWR2/time
 Energy DRAM [J]  PWR3
-Power DRAM [W] PWR3/time
+Power DRAM [W]  PWR3/time
 
 LONG
 Formula:
-Power =  PWR_PKG_ENERGY / time
+Power = PWR_PKG_ENERGY / time
 Power PP0 = PWR_PP0_ENERGY / time
+Power PP1 = PWR_PP1_ENERGY / time
 Power DRAM = PWR_DRAM_ENERGY / time
 -
 SandyBridge implements the new RAPL interface. This interface enables to
-monitor the consumed energy on the package (socket) and DRAM level.
+monitor the consumed energy on the package (socket) level.
 
diff --git a/groups/sandybridge/FLOPS_AVX.txt b/groups/sandybridge/FLOPS_AVX.txt
index 6850bca..8a9766f 100644
--- a/groups/sandybridge/FLOPS_AVX.txt
+++ b/groups/sandybridge/FLOPS_AVX.txt
@@ -12,13 +12,13 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-32b packed SP MFlops/s  1.0E-06*(PMC0*8.0)/time
-32b packed DP MFlops/s  1.0E-06*(PMC1*4.0)/time
+Packed SP MFlops/s  1.0E-06*(PMC0*8.0)/time
+Packed DP MFlops/s  1.0E-06*(PMC1*4.0)/time
 
 LONG
 Formula:
-32b packed SP MFlops/s =  (SIMD_FP_256_PACKED_SINGLE*8)/ runtime
-32b packed DP MFlops/s =  (SIMD_FP_256_PACKED_DOUBLE*4)/ runtime
+Packed SP MFlops/s = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
+Packed DP MFlops/s = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
 -
 Packed 32b AVX flops rates. Please note that the current flop measurements on SandyBridge are
 potentially wrong. So you cannot trust these counters at the moment!
diff --git a/groups/sandybridge/FLOPS_DP.txt b/groups/sandybridge/FLOPS_DP.txt
index cda580a..39b26b7 100644
--- a/groups/sandybridge/FLOPS_DP.txt
+++ b/groups/sandybridge/FLOPS_DP.txt
@@ -14,14 +14,16 @@ Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
 MFlops/s  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
-32b AVX MFlops/s  1.0E-06*(PMC2*4.0)/time
+AVX MFlops/s  1.0E-06*(PMC2*4.0)/time
 Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
 Scalar MUOPS/s 1.0E-06*PMC1/time
 
 LONG
 Formula:
-MFlops/s =  (FP_COMP_OPS_EXE_SSE_FP_PACKED*2 +  FP_COMP_OPS_EXE_SSE_FP_SCALAR)/ runtime
-AVX MFlops/s =  (SIMD_FP_256_PACKED_DOUBLE*4)/ runtime
+MFlops/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE+SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+AVX MFlops/s = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE/runtime
 -
 SSE scalar and packed double precision flop rates. Please note that the current
 flop measurements on IvyBridge are potentially wrong. So you cannot trust
diff --git a/groups/sandybridge/FLOPS_SP.txt b/groups/sandybridge/FLOPS_SP.txt
index 753ade7..e92decc 100644
--- a/groups/sandybridge/FLOPS_SP.txt
+++ b/groups/sandybridge/FLOPS_SP.txt
@@ -13,17 +13,19 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-MFlops/s  1.0E-06*(PMC0*4.0+PMC1)/time
+MFlops/s  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
 AVX MFlops/s  1.0E-06*(PMC2*8.0)/time
 Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
 Scalar MUOPS/s 1.0E-06*PMC1/time
 
 LONG
 Formula:
-MFlops/s =  (FP_COMP_OPS_EXE_SSE_FP_PACKED*4 +  FP_COMP_OPS_EXE_SSE_FP_SCALAR)/ runtime
-AVX MFlops/s =  (SIMD_FP_256_PACKED_SINGLE*8)/ runtime
+MFlops/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE+SIMD_FP_256_PACKED_SINGLE*8)/runtime
+AVX MFlops/s = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE/runtime
 -
-SSE scalar and packed single precision flop rates. Also shows packed AVX 32b
-flop rates. Please note that the current flop measurements on SandyBridge are
-potentially wrong. So you cannot trust these counters at the moment!
+SSE scalar and packed single precision flop rates. Please note that the current
+flop measurements on SandyBridge are potentially wrong. So you cannot trust
+these counters at the moment!
 
diff --git a/groups/sandybridge/ICACHE.txt b/groups/sandybridge/ICACHE.txt
new file mode 100644
index 0000000..5f11ad6
--- /dev/null
+++ b/groups/sandybridge/ICACHE.txt
@@ -0,0 +1,25 @@
+SHORT  Instruction cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  ICACHE_ACCESSES
+PMC1  ICACHE_MISSES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1I request rate PMC0/FIXC0
+L1I miss rate PMC1/FIXC0
+L1I miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L1I request rate = ICACHE_ACCESSES / INSTR_RETIRED_ANY
+L1I miss rate = ICACHE_MISSES / INSTR_RETIRED_ANY
+L1I miss ratio = ICACHE_MISSES / ICACHE_ACCESSES
+-
+This group measures some L1 instruction cache metrics.
diff --git a/groups/sandybridge/L2.txt b/groups/sandybridge/L2.txt
index 5345b7a..bf24fbb 100644
--- a/groups/sandybridge/L2.txt
+++ b/groups/sandybridge/L2.txt
@@ -6,27 +6,33 @@ FIXC1 CPU_CLK_UNHALTED_CORE
 FIXC2 CPU_CLK_UNHALTED_REF
 PMC0  L1D_REPLACEMENT
 PMC1  L1D_M_EVICT
+PMC2  ICACHE_MISSES
 
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-L2 Load [MBytes/s] 1.0E-06*PMC0*64.0/time
-L2 Evict [MBytes/s] 1.0E-06*PMC1*64.0/time
-L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
-L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+L2D load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time
+L2D load data volume [GBytes]  1.0E-09*PMC0*64.0
+L2D evict bandwidth [MBytes/s]  1.0E-06*PMC1*64.0/time
+L2D evict data volume [GBytes]  1.0E-09*PMC1*64.0
+L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time
+L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0
 
 LONG
 Formulas:
-L2 Load [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64/time
-L2 Evict [MBytes/s] = 1.0E-06*L1D_M_EVICT*64/time
-L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT)*64/time
-L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT)*64
+L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64.0/time
+L2D load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64.0
+L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64.0/time
+L2D evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64.0
+L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT+ICACHE_MISSES)*64/time
+L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT+ICACHE_MISSES)*64
 -
 Profiling group to measure L2 cache bandwidth. The bandwidth is computed by the
 number of cacheline allocated in the L1 and the number of modified cachelines
 evicted from the L1. The group also output total data volume transfered between
 L2 and L1. Note that this bandwidth also includes data transfers due to a write
-allocate load on a store miss in L1.
+allocate load on a store miss in L1 and traffic caused by misses in the
+L1 instruction cache.
 
diff --git a/groups/sandybridge/L2CACHE.txt b/groups/sandybridge/L2CACHE.txt
index 3d7c36e..8aa6522 100644
--- a/groups/sandybridge/L2CACHE.txt
+++ b/groups/sandybridge/L2CACHE.txt
@@ -18,9 +18,9 @@ L2 miss ratio PMC1/PMC0
 
 LONG
 Formulas:
-L2 request rate = L2_TRANS_ALL_REQUESTS / INSTR_RETIRED_ANY
-L2 miss rate  = L2_RQSTS_MISS / INSTR_RETIRED_ANY
-L2 miss ratio = L2_RQSTS_MISS / L2_TRANS_ALL_REQUESTS
+L2 request rate = L2_TRANS_ALL_REQUESTS/INSTR_RETIRED_ANY
+L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY
+L2 miss ratio = L2_RQSTS_MISS/L2_TRANS_ALL_REQUESTS
 -
 This group measures the locality of your data accesses with regard to the
 L2 Cache. L2 request rate tells you how data intensive your code is
@@ -30,6 +30,5 @@ cachelines from memory. And finally L2 miss ratio tells you how many of your
 memory references required a cacheline to be loaded from a higher level.
 While the Data cache miss rate might be given by your algorithm you should
 try to get Data cache miss ratio as low as possible by increasing your cache reuse.
-Note: This group might need to be revised!
 
 
diff --git a/groups/sandybridge/L3.txt b/groups/sandybridge/L3.txt
index 9a7c914..fcb3d73 100644
--- a/groups/sandybridge/L3.txt
+++ b/groups/sandybridge/L3.txt
@@ -12,17 +12,21 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-L3 Load [MBytes/s]  1.0E-06*PMC0*64.0/time
-L3 Evict [MBytes/s]  1.0E-06*PMC1*64.0/time
+L3 load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time
+L3 load data volume [GBytes]  1.0E-09*PMC0*64.0
+L3 evict bandwidth [MBytes/s]  1.0E-06*PMC1*64.0/time
+L3 evict data volume [GBytes]  1.0E-09*PMC1*64.0
 L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
 L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
 
 LONG
 Formulas:
-L3 Load [MBytes/s]  1.0E-06*L2_LINES_IN_ALL*64/time
-L3 Evict [MBytes/s]  1.0E-06*L2_LINES_OUT_DIRTY_ALL*64/time
-L3 bandwidth [MBytes/s] 1.0E-06*(L2_LINES_IN_ALL+L2_LINES_OUT_DIRTY_ALL)*64/time
-L3 data volume [GBytes] 1.0E-09*(L2_LINES_IN_ALL+L2_LINES_OUT_DIRTY_ALL)*64
+L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64.0/time
+L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64.0
+L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_LINES_OUT_DIRTY_ALL*64.0/time
+L3 evict data volume [GBytes] = 1.0E-09*L2_LINES_OUT_DIRTY_ALL*64.0
+L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_LINES_OUT_DIRTY_ALL)*64/time
+L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_LINES_OUT_DIRTY_ALL)*64
 -
 Profiling group to measure L3 cache bandwidth. The bandwidth is computed by the
 number of cacheline allocated in the L2 and the number of modified cachelines
diff --git a/groups/sandybridge/L3CACHE.txt b/groups/sandybridge/L3CACHE.txt
index d4fd89e..30e71ee 100644
--- a/groups/sandybridge/L3CACHE.txt
+++ b/groups/sandybridge/L3CACHE.txt
@@ -6,21 +6,22 @@ FIXC1 CPU_CLK_UNHALTED_CORE
 FIXC2 CPU_CLK_UNHALTED_REF
 PMC0  MEM_LOAD_UOPS_RETIRED_L3_ALL
 PMC1  MEM_LOAD_UOPS_RETIRED_L3_MISS
+PMC2  UOPS_RETIRED_ALL
 
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-L3 request rate (PMC0)/FIXC0
-L3 miss rate PMC1/FIXC0
+L3 request rate PMC0/PMC2
+L3 miss rate PMC1/PMC2
 L3 miss ratio PMC1/PMC0
 
 LONG
 Formulas:
-L3 request rate = MEM_LOAD_UOPS_RETIRED_L3_ALL / INSTR_RETIRED_ANY
-L3 miss rate  = MEM_LOAD_UOPS_RETIRED_L3_MISS / INSTR_RETIRED_ANY
-L3 miss ratio = MEM_LOAD_UOPS_RETIRED_L3_MISS / MEM_LOAD_UOPS_RETIRED_L3_ALL
+L3 request rate = MEM_LOAD_UOPS_RETIRED_L3_ALL/UOPS_RETIRED_ALL
+L3 miss rate = MEM_LOAD_UOPS_RETIRED_L3_MISS/UOPS_RETIRED_ALL
+L3 miss ratio = MEM_LOAD_UOPS_RETIRED_L3_MISS/MEM_LOAD_UOPS_RETIRED_L3_ALL
 -
 This group measures the locality of your data accesses with regard to the
 L3 Cache. L3 request rate tells you how data intensive your code is
@@ -30,6 +31,5 @@ cachelines from memory. And finally L3 miss ratio tells you how many of your
 memory references required a cacheline to be loaded from a higher level.
 While the Data cache miss rate might be given by your algorithm you should
 try to get Data cache miss ratio as low as possible by increasing your cache reuse.
-Note: This group might need to be revised!
 
 
diff --git a/groups/sandybridge/MEM.txt b/groups/sandybridge/MEM.txt
deleted file mode 100644
index 1f9ff4a..0000000
--- a/groups/sandybridge/MEM.txt
+++ /dev/null
@@ -1,32 +0,0 @@
-SHORT Main memory bandwidth in MBytes/s
-
-EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
-MBOX0C0 CAS_COUNT_RD
-MBOX0C1 CAS_COUNT_WR
-MBOX1C0 CAS_COUNT_RD
-MBOX1C1 CAS_COUNT_WR
-MBOX2C0 CAS_COUNT_RD
-MBOX2C1 CAS_COUNT_WR
-MBOX3C0 CAS_COUNT_RD
-MBOX3C1 CAS_COUNT_WR
-
-METRICS
-Runtime (RDTSC) [s] time
-Runtime unhalted [s] FIXC1*inverseClock
-Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
-CPI  FIXC1/FIXC0
-Memory Read BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0/time
-Memory Write BW [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time
-Memory BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX0C1+MBOX1C0+MBOX1C1+MBOX2C0+MBOX2C1+MBOX3C0+MBOX3C1)*64.0/time
-Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX0C1+MBOX1C0+MBOX1C1+MBOX2C0+MBOX2C1+MBOX3C0+MBOX3C1)*64.0
-
-LONG
-Profiling group to measure main memory bandwidth drawn by all cores of
-a socket.  Since this group is based on uncore events it is only possible to
-measure on the granularity of a socket.  If a thread group contains multiple
-threads only one thread per socket will show the results.  Also outputs total
-data volume transfered from main memory.
-
diff --git a/groups/sandybridge/MEM_DP.txt b/groups/sandybridge/MEM_DP.txt
deleted file mode 100644
index 78fbd18..0000000
--- a/groups/sandybridge/MEM_DP.txt
+++ /dev/null
@@ -1,55 +0,0 @@
-SHORT Overview of arithmetic and main memory performance
-
-EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
-TMP0  TEMP_CORE
-PWR0  PWR_PKG_ENERGY
-PWR3  PWR_DRAM_ENERGY
-PMC0  FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE
-PMC1  FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE
-PMC2  SIMD_FP_256_PACKED_DOUBLE
-MBOX0C0 CAS_COUNT_RD
-MBOX0C1 CAS_COUNT_WR
-MBOX1C0 CAS_COUNT_RD
-MBOX1C1 CAS_COUNT_WR
-MBOX2C0 CAS_COUNT_RD
-MBOX2C1 CAS_COUNT_WR
-MBOX3C0 CAS_COUNT_RD
-MBOX3C1 CAS_COUNT_WR
-
-METRICS
-Runtime (RDTSC) [s] time
-Runtime unhalted [s] FIXC1*inverseClock
-Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
-CPI  FIXC1/FIXC0
-Temperature TMP0
-Energy [J]  PWR0
-Power [W] PWR0/time
-Energy DRAM [J]  PWR3
-Power DRAM [W] PWR3/time
-MFlops/s  1.0E-06*(PMC0*2.0+PMC1)/time
-32b AVX MFlops/s  1.0E-06*(PMC2*4.0)/time
-Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
-Memory Read BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0/time
-Memory Write BW [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time
-Memory BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX0C1+MBOX1C0+MBOX1C1+MBOX2C0+MBOX2C1+MBOX3C0+MBOX3C1)*64.0/time
-Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX0C1+MBOX1C0+MBOX1C1+MBOX2C0+MBOX2C1+MBOX3C0+MBOX3C1)*64.0
-
-LONG
-Formula:
-Power =  PWR_PKG_ENERGY / runtime
-Power DRAM = PWR_DRAM_ENERGY / runtime
-MFlops/s =  (FP_COMP_OPS_EXE_SSE_FP_PACKED*2 +  FP_COMP_OPS_EXE_SSE_FP_SCALAR)/ runtime
-AVX MFlops/s =  (SIMD_FP_256_PACKED_DOUBLE*4)/ runtime
-Memory Read BW [MBytes/s] 1.0E-06*(CAS_COUNT_RD+CAS_COUNT_RD+CAS_COUNT_RD+CAS_COUNT_RD)*64.0/time
-Memory Write BW [MBytes/s] 1.0E-06*(CAS_COUNT_WR+CAS_COUNT_WR+CAS_COUNT_WR+CAS_COUNT_WR)*64.0/time
---
-Profiling group to measure memory bandwidth drawn by all cores of a socket.
-Since this group is based on uncore events it is only possible to measure on
-a per socket base. Also outputs total data volume transfered from main memory,
-SSE scalar and packed double precision flop rates as well as consumed energy and 
-temperature. Also reports on packed AVX 32b instructions.  Please note that the 
-current flop measurements on SandyBridge are potentially wrong. So you cannot trust these counters at the moment!
diff --git a/groups/sandybridge/MEM_SP.txt b/groups/sandybridge/MEM_SP.txt
deleted file mode 100644
index 1ede713..0000000
--- a/groups/sandybridge/MEM_SP.txt
+++ /dev/null
@@ -1,56 +0,0 @@
-SHORT Overview of arithmetic and main memory performance
-
-EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
-TMP0  TEMP_CORE
-PWR0  PWR_PKG_ENERGY
-PWR3  PWR_DRAM_ENERGY
-PMC0  FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE
-PMC1  FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE
-PMC2  SIMD_FP_256_PACKED_DOUBLE
-MBOX0C0 CAS_COUNT_RD
-MBOX0C1 CAS_COUNT_WR
-MBOX1C0 CAS_COUNT_RD
-MBOX1C1 CAS_COUNT_WR
-MBOX2C0 CAS_COUNT_RD
-MBOX2C1 CAS_COUNT_WR
-MBOX3C0 CAS_COUNT_RD
-MBOX3C1 CAS_COUNT_WR
-
-METRICS
-Runtime (RDTSC) [s] time
-Runtime unhalted [s] FIXC1*inverseClock
-Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
-CPI  FIXC1/FIXC0
-Temperature TMP0
-Energy [J]  PWR0
-Power [W] PWR0/time
-Energy DRAM [J]  PWR3
-Power DRAM [W] PWR3/time
-MFlops/s  1.0E-06*(PMC0*4.0+PMC1)/time
-32b AVX MFlops/s  1.0E-06*(PMC2*8.0)/time
-Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
-Scalar MUOPS/s 1.0E-06*PMC1/time
-Memory Read BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0/time
-Memory Write BW [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time
-Memory BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX0C1+MBOX1C0+MBOX1C1+MBOX2C0+MBOX2C1+MBOX3C0+MBOX3C1)*64.0/time
-Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX0C1+MBOX1C0+MBOX1C1+MBOX2C0+MBOX2C1+MBOX3C0+MBOX3C1)*64.0
-
-LONG
-Formula:
-Power =  PWR_PKG_ENERGY / runtime
-Power DRAM = PWR_DRAM_ENERGY / runtime
-MFlops/s =  (FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE * 4 +  FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE) / runtime
-AVX MFlops/s =  (SIMD_FP_256_PACKED_SINGLE * 8) / runtime
-Memory Read BW [MBytes/s] 1.0E-06*(CAS_COUNT_RD+CAS_COUNT_RD+CAS_COUNT_RD+CAS_COUNT_RD)*64.0/time
-Memory Write BW [MBytes/s] 1.0E-06*(CAS_COUNT_WR+CAS_COUNT_WR+CAS_COUNT_WR+CAS_COUNT_WR)*64.0/time
---
-Profiling group to measure memory bandwidth drawn by all cores of a socket.
-Since this group is based on uncore events it is only possible to measure on
-a per socket base. Also outputs total data volume transfered from main memory.
-SSE scalar and packed single precision flop rates as well as consumed energy and 
-temperature. Also reports on packed AVX 32b instructions. Please note that the 
-current flop measurements on SandyBridge are potentially wrong. So you cannot 
-trust these counters at the moment!
diff --git a/groups/sandybridge/TLB_DATA.txt b/groups/sandybridge/TLB_DATA.txt
index 2f59772..5e54147 100644
--- a/groups/sandybridge/TLB_DATA.txt
+++ b/groups/sandybridge/TLB_DATA.txt
@@ -16,19 +16,19 @@ Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
 L1 DTLB load misses     PMC0
 L1 DTLB load miss rate  PMC0/FIXC0
-L1 DTLB load miss duration PMC2
+L1 DTLB load miss duration [Cyc] PMC2/PMC0
 L1 DTLB store misses     PMC1
 L1 DTLB store miss rate  PMC1/FIXC0
-L1 DTLB store miss duration PMC3
+L1 DTLB store miss duration [Cyc] PMC3/PMC1
 
 LONG
 Formulas:
-L1 DTLB load misses     DTLB_LOAD_MISSES_CAUSES_A_WALK
-L1 DTLB load miss rate  DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
-L1 DTLB load miss duration DTLB_LOAD_MISSES_WALK_DURATION
-L1 DTLB store misses     DTLB_STORE_MISSES_CAUSES_A_WALK
-L1 DTLB store miss rate  DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
-L1 DTLB store miss duration DTLB_STORE_MISSES_WALK_DURATION
+L1 DTLB load misses = DTLB_LOAD_MISSES_CAUSES_A_WALK
+L1 DTLB load miss rate = DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB load miss duration [Cyc] = DTLB_LOAD_MISSES_WALK_DURATION / DTLB_LOAD_MISSES_CAUSES_A_WALK
+L1 DTLB store misses = DTLB_STORE_MISSES_CAUSES_A_WALK
+L1 DTLB store miss rate = DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB store miss duration [Cyc] = DTLB_STORE_MISSES_WALK_DURATION / DTLB_STORE_MISSES_CAUSES_A_WALK
 -
 The DTLB load and store miss rates gives a measure how often a TLB miss occured
 per instruction. The duration measures the time in cycles how long a walk did take.
diff --git a/groups/sandybridge/TLB_INSTR.txt b/groups/sandybridge/TLB_INSTR.txt
index f95f78a..8faaebe 100644
--- a/groups/sandybridge/TLB_INSTR.txt
+++ b/groups/sandybridge/TLB_INSTR.txt
@@ -14,14 +14,14 @@ Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
 L1 ITLB misses     PMC0
 L1 ITLB miss rate  PMC0/FIXC0
-L1 ITLB miss duration PMC1
+L1 ITLB miss duration [Cyc] PMC1/PMC0
 
 
 LONG
 Formulas:
-L1 ITLB misses     ITLB_MISSES_CAUSES_A_WALK
-L1 ITLB miss rate  ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
-L1 ITLB miss duration ITLB_MISSES_WALK_DURATION
+L1 ITLB misses = ITLB_MISSES_CAUSES_A_WALK
+L1 ITLB miss rate = ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 ITLB miss duration [Cyc] = ITLB_MISSES_WALK_DURATION / ITLB_MISSES_CAUSES_A_WALK
 -
 The ITLB miss rates gives a measure how often a TLB miss occured
 per instruction. The duration measures the time in cycles how long a walk did take.
diff --git a/groups/sandybridgeEP/BRANCH.txt b/groups/sandybridgeEP/BRANCH.txt
new file mode 100644
index 0000000..09699d9
--- /dev/null
+++ b/groups/sandybridgeEP/BRANCH.txt
@@ -0,0 +1,31 @@
+SHORT Branch prediction miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  BR_INST_RETIRED_ALL_BRANCHES
+PMC1  BR_MISP_RETIRED_ALL_BRANCHES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Branch rate   PMC0/FIXC0
+Branch misprediction rate  PMC1/FIXC0
+Branch misprediction ratio  PMC1/PMC0
+Instructions per branch  FIXC0/PMC0
+
+LONG
+Formulas:
+Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction rate =  BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES
+Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES
+-
+The rates state how often on average a branch or a mispredicted branch occurred
+per instruction retired in total. The Branch misprediction ratio sets directly
+into relation what ratio of all branch instructions were mispredicted.
+Instructions per branch is 1/Branch rate.
+
diff --git a/groups/sandybridgeEP/CACHES.txt b/groups/sandybridgeEP/CACHES.txt
new file mode 100644
index 0000000..54379ba
--- /dev/null
+++ b/groups/sandybridgeEP/CACHES.txt
@@ -0,0 +1,76 @@
+SHORT  Some data from the CBOXes
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  L1D_REPLACEMENT
+PMC1  L1D_M_EVICT
+PMC2  L2_LINES_IN_ALL
+PMC3  L2_LINES_OUT_DIRTY_ALL
+CBOX0C1 LLC_VICTIMS_M_STATE
+CBOX1C1 LLC_VICTIMS_M_STATE
+CBOX2C1 LLC_VICTIMS_M_STATE
+CBOX3C1 LLC_VICTIMS_M_STATE
+CBOX4C1 LLC_VICTIMS_M_STATE
+CBOX5C1 LLC_VICTIMS_M_STATE
+CBOX6C1 LLC_VICTIMS_M_STATE
+CBOX7C1 LLC_VICTIMS_M_STATE
+CBOX0C3 CBOX_CLOCKTICKS
+CBOX1C3 CBOX_CLOCKTICKS
+CBOX2C3 CBOX_CLOCKTICKS
+CBOX3C3 CBOX_CLOCKTICKS
+CBOX4C3 CBOX_CLOCKTICKS
+CBOX5C3 CBOX_CLOCKTICKS
+CBOX6C3 CBOX_CLOCKTICKS
+CBOX7C3 CBOX_CLOCKTICKS
+MBOX0C0 CAS_COUNT_RD
+MBOX1C0 CAS_COUNT_WR
+MBOX0C1 CAS_COUNT_RD
+MBOX1C1 CAS_COUNT_WR
+MBOX0C2 CAS_COUNT_RD
+MBOX1C2 CAS_COUNT_WR
+MBOX0C3 CAS_COUNT_RD
+MBOX1C3 CAS_COUNT_WR
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1 to L2 Load [MBytes/s] 1.0E-06*PMC0*64.0/time
+L1 to L2 Evict [MBytes/s] 1.0E-06*PMC1*64.0/time
+L1 to L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+L1 to L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+L2 to L3 Load [MBytes/s]  1.0E-06*PMC2*64.0/time
+L2 to L3 Evict [MBytes/s]  1.0E-06*PMC3*64.0/time
+L2 to L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
+L2 to L3 data volume [GBytes] 1.0E-09*(PMC2+PMC3)*64.0
+L3 avg clock [GHz] 1.E-09*(CBOX0C3+CBOX1C3+CBOX2C3+CBOX3C3+CBOX4C3+CBOX5C3+CBOX6C3+CBOX7C3)/8
+L3 to Memory bandwidth [MBytes/s] 1.0E-06*(CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1)*64.0/time
+L3 to Memory data volume [MBytes] 1.0E-06*(CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1)*64.0
+Memory Read BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX0C1+MBOX0C2+MBOX0C3)*64.0/time
+Memory Write BW [MBytes/s] 1.0E-06*(MBOX1C0+MBOX1C1+MBOX1C2+MBOX1C3)*64.0/time
+Memory BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX0C1+MBOX0C2+MBOX0C3+MBOX1C0+MBOX1C1+MBOX1C2+MBOX1C3)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX0C1+MBOX0C2+MBOX0C3+MBOX1C0+MBOX1C1+MBOX1C2+MBOX1C3)*64.0
+
+
+LONG
+Formulas:
+L1 to L2 Load [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64/time
+L1 to L2 Evict [MBytes/s] = 1.0E-06*L1D_M_EVICT*64/time
+L1 to L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT)*64/time
+L1 to L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT)*64
+L2 to L3 Load [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64/time
+L2 to L3 Evict [MBytes/s] = 1.0E-06*L2_LINES_OUT_DIRTY_ALL*64/time
+L2 to L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_LINES_OUT_DIRTY_ALL)*64/time
+L2 to L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_LINES_OUT_DIRTY_ALL)*64
+L3 avg clock [GHz] = 1.E-09*(SUM(CBOX*C3))/8
+L3 to Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(CBOX*C1))*64/time
+L3 to Memory data volume [MBytes] = 1.0E-06*(SUM(CBOX*C1))*64
+Memory Read BW [MBytes/s] = 1.0E-06*(CAS_COUNT_RD+CAS_COUNT_RD+CAS_COUNT_RD+CAS_COUNT_RD)*64.0/time
+Memory Write BW [MBytes/s] = 1.0E-06*(CAS_COUNT_WR+CAS_COUNT_WR+CAS_COUNT_WR+CAS_COUNT_WR)*64.0/time
+Memory BW [MBytes/s] = 1.0E-06*(CAS_COUNT_RD+CAS_COUNT_RD+CAS_COUNT_RD+CAS_COUNT_RD+CAS_COUNT_WR+CAS_COUNT_WR+CAS_COUNT_WR+CAS_COUNT_WR)*64.0/time
+Memory data volume [GBytes] = 1.0E-09*(CAS_COUNT_RD+CAS_COUNT_RD+CAS_COUNT_RD+CAS_COUNT_RD+CAS_COUNT_WR+CAS_COUNT_WR+CAS_COUNT_WR+CAS_COUNT_WR)*64.0
+-
+Group to measure cache transfers between L1 and Memory
diff --git a/groups/sandybridgeEP/CLOCK.txt b/groups/sandybridgeEP/CLOCK.txt
new file mode 100644
index 0000000..0cc92d3
--- /dev/null
+++ b/groups/sandybridgeEP/CLOCK.txt
@@ -0,0 +1,27 @@
+SHORT Power and Energy consumption
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0  PWR_PKG_ENERGY
+PWR3  PWR_DRAM_ENERGY
+
+METRICS
+Runtime (RDTSC) [s] time 
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Energy [J]  PWR0
+Power [W] PWR0/time
+Energy DRAM [J]  PWR3
+Power DRAM [W] PWR3/time
+
+LONG
+Formula:
+Power =  PWR_PKG_ENERGY / time
+Power DRAM =  PWR_DRAM_ENERGY / time
+-
+SandyBridge implements the new RAPL interface. This interface enables to
+monitor the consumed energy on the package (socket) and DRAM level.
+
diff --git a/groups/sandybridgeEP/DATA.txt b/groups/sandybridgeEP/DATA.txt
new file mode 100644
index 0000000..1220980
--- /dev/null
+++ b/groups/sandybridgeEP/DATA.txt
@@ -0,0 +1,22 @@
+SHORT Load to store ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  MEM_UOP_RETIRED_LOADS
+PMC1  MEM_UOP_RETIRED_STORES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Load to store ratio PMC0/PMC1
+
+LONG
+Formulas:
+Load to store ratio = MEM_UOP_RETIRED_LOADS/MEM_UOP_RETIRED_STORES
+-
+This is a metric to determine your load to store ratio.
+
diff --git a/groups/sandybridgeEP/ENERGY.txt b/groups/sandybridgeEP/ENERGY.txt
new file mode 100644
index 0000000..e5e2b33
--- /dev/null
+++ b/groups/sandybridgeEP/ENERGY.txt
@@ -0,0 +1,33 @@
+SHORT Power and Energy consumption
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+TMP0  TEMP_CORE
+PWR0  PWR_PKG_ENERGY
+PWR1  PWR_PP0_ENERGY
+PWR3  PWR_DRAM_ENERGY
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Temperature [C]  TMP0
+Energy [J]  PWR0
+Power [W] PWR0/time
+Energy PP0 [J]  PWR1
+Power PP0 [W]  PWR1/time
+Energy DRAM [J]  PWR3
+Power DRAM [W]  PWR3/time
+
+LONG
+Formula:
+Power = PWR_PKG_ENERGY / time
+Power PP0 = PWR_PP0_ENERGY / time
+Power DRAM = PWR_DRAM_ENERGY / time
+-
+SandyBridge implements the new RAPL interface. This interface enables to
+monitor the consumed energy on the package (socket) level.
+
diff --git a/groups/sandybridgeEP/FLOPS_AVX.txt b/groups/sandybridgeEP/FLOPS_AVX.txt
new file mode 100644
index 0000000..8a9766f
--- /dev/null
+++ b/groups/sandybridgeEP/FLOPS_AVX.txt
@@ -0,0 +1,25 @@
+SHORT Packed AVX MFlops/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  SIMD_FP_256_PACKED_SINGLE
+PMC1  SIMD_FP_256_PACKED_DOUBLE
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Packed SP MFlops/s  1.0E-06*(PMC0*8.0)/time
+Packed DP MFlops/s  1.0E-06*(PMC1*4.0)/time
+
+LONG
+Formula:
+Packed SP MFlops/s = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
+Packed DP MFlops/s = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+-
+Packed 32b AVX flops rates. Please note that the current flop measurements on SandyBridge are
+potentially wrong. So you cannot trust these counters at the moment!
+
diff --git a/groups/sandybridgeEP/FLOPS_DP.txt b/groups/sandybridgeEP/FLOPS_DP.txt
new file mode 100644
index 0000000..39b26b7
--- /dev/null
+++ b/groups/sandybridgeEP/FLOPS_DP.txt
@@ -0,0 +1,31 @@
+SHORT Double Precision MFlops/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE
+PMC1  FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE
+PMC2  SIMD_FP_256_PACKED_DOUBLE
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+MFlops/s  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
+AVX MFlops/s  1.0E-06*(PMC2*4.0)/time
+Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+
+LONG
+Formula:
+MFlops/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE+SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+AVX MFlops/s = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE/runtime
+-
+SSE scalar and packed double precision flop rates. Please note that the current
+flop measurements on SandyBridge are potentially wrong. So you cannot trust
+these counters at the moment!
+
diff --git a/groups/sandybridgeEP/FLOPS_SP.txt b/groups/sandybridgeEP/FLOPS_SP.txt
new file mode 100644
index 0000000..e92decc
--- /dev/null
+++ b/groups/sandybridgeEP/FLOPS_SP.txt
@@ -0,0 +1,31 @@
+SHORT Single Precision MFlops/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE
+PMC1  FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE
+PMC2  SIMD_FP_256_PACKED_SINGLE
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+MFlops/s  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
+AVX MFlops/s  1.0E-06*(PMC2*8.0)/time
+Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+
+LONG
+Formula:
+MFlops/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE+SIMD_FP_256_PACKED_SINGLE*8)/runtime
+AVX MFlops/s = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE/runtime
+-
+SSE scalar and packed single precision flop rates. Please note that the current
+flop measurements on SandyBridge are potentially wrong. So you cannot trust
+these counters at the moment!
+
diff --git a/groups/sandybridgeEP/ICACHE.txt b/groups/sandybridgeEP/ICACHE.txt
new file mode 100644
index 0000000..5f11ad6
--- /dev/null
+++ b/groups/sandybridgeEP/ICACHE.txt
@@ -0,0 +1,25 @@
+SHORT  Instruction cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  ICACHE_ACCESSES
+PMC1  ICACHE_MISSES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1I request rate PMC0/FIXC0
+L1I miss rate PMC1/FIXC0
+L1I miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L1I request rate = ICACHE_ACCESSES / INSTR_RETIRED_ANY
+L1I miss rate = ICACHE_MISSES / INSTR_RETIRED_ANY
+L1I miss ratio = ICACHE_MISSES / ICACHE_ACCESSES
+-
+This group measures some L1 instruction cache metrics.
diff --git a/groups/sandybridgeEP/L2.txt b/groups/sandybridgeEP/L2.txt
new file mode 100644
index 0000000..bf24fbb
--- /dev/null
+++ b/groups/sandybridgeEP/L2.txt
@@ -0,0 +1,38 @@
+SHORT L2 cache bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  L1D_REPLACEMENT
+PMC1  L1D_M_EVICT
+PMC2  ICACHE_MISSES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L2D load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time
+L2D load data volume [GBytes]  1.0E-09*PMC0*64.0
+L2D evict bandwidth [MBytes/s]  1.0E-06*PMC1*64.0/time
+L2D evict data volume [GBytes]  1.0E-09*PMC1*64.0
+L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time
+L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0
+
+LONG
+Formulas:
+L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPLACEMENT*64.0/time
+L2D load data volume [GBytes] = 1.0E-09*L1D_REPLACEMENT*64.0
+L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64.0/time
+L2D evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64.0
+L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPLACEMENT+L1D_M_EVICT+ICACHE_MISSES)*64/time
+L2 data volume [GBytes] = 1.0E-09*(L1D_REPLACEMENT+L1D_M_EVICT+ICACHE_MISSES)*64
+-
+Profiling group to measure L2 cache bandwidth. The bandwidth is computed by the
+number of cachelines allocated in the L1 and the number of modified cachelines
+evicted from the L1. The group also outputs total data volume transferred between
+L2 and L1. Note that this bandwidth also includes data transfers due to a write
+allocate load on a store miss in L1 and traffic caused by misses in the
+L1 instruction cache.
+
diff --git a/groups/sandybridgeEP/L2CACHE.txt b/groups/sandybridgeEP/L2CACHE.txt
new file mode 100644
index 0000000..8aa6522
--- /dev/null
+++ b/groups/sandybridgeEP/L2CACHE.txt
@@ -0,0 +1,34 @@
+SHORT L2 cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  L2_TRANS_ALL_REQUESTS
+PMC1  L2_RQSTS_MISS
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L2 request rate PMC0/FIXC0
+L2 miss rate PMC1/FIXC0
+L2 miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L2 request rate = L2_TRANS_ALL_REQUESTS/INSTR_RETIRED_ANY
+L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY
+L2 miss ratio = L2_RQSTS_MISS/L2_TRANS_ALL_REQUESTS
+-
+This group measures the locality of your data accesses with regard to the
+L2 Cache. L2 request rate tells you how data intensive your code is
+or how many Data accesses you have in average per instruction.
+The L2 miss rate gives a measure how often it was necessary to get
+cachelines from memory. And finally L2 miss ratio tells you how many of your
+memory references required a cacheline to be loaded from a higher level.
+While the Data cache miss rate might be given by your algorithm you should
+try to get Data cache miss ratio as low as possible by increasing your cache reuse.
+
+
diff --git a/groups/sandybridgeEP/L3.txt b/groups/sandybridgeEP/L3.txt
new file mode 100644
index 0000000..fcb3d73
--- /dev/null
+++ b/groups/sandybridgeEP/L3.txt
@@ -0,0 +1,36 @@
+SHORT  L3 cache bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  L2_LINES_IN_ALL
+PMC1  L2_LINES_OUT_DIRTY_ALL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L3 load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time
+L3 load data volume [GBytes]  1.0E-09*PMC0*64.0
+L3 evict bandwidth [MBytes/s]  1.0E-06*PMC1*64.0/time
+L3 evict data volume [GBytes]  1.0E-09*PMC1*64.0
+L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+
+LONG
+Formulas:
+L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ALL*64.0/time
+L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ALL*64.0
+L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_LINES_OUT_DIRTY_ALL*64.0/time
+L3 evict data volume [GBytes] = 1.0E-09*L2_LINES_OUT_DIRTY_ALL*64.0
+L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL+L2_LINES_OUT_DIRTY_ALL)*64/time
+L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ALL+L2_LINES_OUT_DIRTY_ALL)*64
+-
+Profiling group to measure L3 cache bandwidth. The bandwidth is computed by the
+number of cachelines allocated in the L2 and the number of modified cachelines
+evicted from the L2. This group also outputs data volume transferred between the
+L3 and the measured cores' L2 caches. Note that this bandwidth also includes data
+transfers due to a write allocate load on a store miss in L2.
+
diff --git a/groups/sandybridgeEP/L3CACHE.txt b/groups/sandybridgeEP/L3CACHE.txt
new file mode 100644
index 0000000..30e71ee
--- /dev/null
+++ b/groups/sandybridgeEP/L3CACHE.txt
@@ -0,0 +1,35 @@
+SHORT L3 cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  MEM_LOAD_UOPS_RETIRED_L3_ALL
+PMC1  MEM_LOAD_UOPS_RETIRED_L3_MISS
+PMC2  UOPS_RETIRED_ALL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L3 request rate PMC0/PMC2
+L3 miss rate PMC1/PMC2
+L3 miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L3 request rate = MEM_LOAD_UOPS_RETIRED_L3_ALL/UOPS_RETIRED_ALL
+L3 miss rate = MEM_LOAD_UOPS_RETIRED_L3_MISS/UOPS_RETIRED_ALL
+L3 miss ratio = MEM_LOAD_UOPS_RETIRED_L3_MISS/MEM_LOAD_UOPS_RETIRED_L3_ALL
+-
+This group measures the locality of your data accesses with regard to the
+L3 Cache. L3 request rate tells you how data intensive your code is
+or how many Data accesses you have in average per instruction.
+The L3 miss rate gives a measure how often it was necessary to get
+cachelines from memory. And finally L3 miss ratio tells you how many of your
+memory references required a cacheline to be loaded from a higher level.
+While the Data cache miss rate might be given by your algorithm you should
+try to get Data cache miss ratio as low as possible by increasing your cache reuse.
+
+
diff --git a/groups/sandybridgeEP/MEM.txt b/groups/sandybridgeEP/MEM.txt
new file mode 100644
index 0000000..c752588
--- /dev/null
+++ b/groups/sandybridgeEP/MEM.txt
@@ -0,0 +1,40 @@
+SHORT Main memory bandwidth in MBytes/s
+
+EVENTSET
+FIXC0   INSTR_RETIRED_ANY
+FIXC1   CPU_CLK_UNHALTED_CORE
+FIXC2   CPU_CLK_UNHALTED_REF
+MBOX0C0 CAS_COUNT_RD
+MBOX0C1 CAS_COUNT_WR
+MBOX1C0 CAS_COUNT_RD
+MBOX1C1 CAS_COUNT_WR
+MBOX2C0 CAS_COUNT_RD
+MBOX2C1 CAS_COUNT_WR
+MBOX3C0 CAS_COUNT_RD
+MBOX3C1 CAS_COUNT_WR
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time
+Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0
+
+LONG
+Formulas:
+Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/time
+Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/time
+Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/time
+Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0
+-
+Profiling group to measure memory bandwidth drawn by all cores of a socket.
+Since this group is based on uncore events it is only possible to measure on a
+per socket base. Also outputs total data volume transferred from main memory.
+
diff --git a/groups/sandybridgeEP/MEM_DP.txt b/groups/sandybridgeEP/MEM_DP.txt
new file mode 100644
index 0000000..6c3e516
--- /dev/null
+++ b/groups/sandybridgeEP/MEM_DP.txt
@@ -0,0 +1,59 @@
+SHORT Overview of arithmetic and main memory performance
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0  PWR_PKG_ENERGY
+PWR3  PWR_DRAM_ENERGY
+PMC0  FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE
+PMC1  FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE
+PMC2  SIMD_FP_256_PACKED_DOUBLE
+MBOX0C0 CAS_COUNT_RD
+MBOX0C1 CAS_COUNT_WR
+MBOX1C0 CAS_COUNT_RD
+MBOX1C1 CAS_COUNT_WR
+MBOX2C0 CAS_COUNT_RD
+MBOX2C1 CAS_COUNT_WR
+MBOX3C0 CAS_COUNT_RD
+MBOX3C1 CAS_COUNT_WR
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Energy [J]  PWR0
+Power [W] PWR0/time
+Energy DRAM [J]  PWR3
+Power DRAM [W] PWR3/time
+MFlops/s  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
+AVX MFlops/s  1.0E-06*(PMC2*4.0)/time
+Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time
+Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0
+
+LONG
+Formula:
+MFlops/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE+SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+AVX MFlops/s = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE/runtime
+Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/time
+Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/time
+Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/time
+Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0
+--
+Profiling group to measure memory bandwidth drawn by all cores of a socket.
+Since this group is based on uncore events it is only possible to measure on
+a per socket base. Also outputs total data volume transferred from main memory.
+SSE scalar and packed double precision flop rates. Also reports on packed AVX
+32b instructions.  Please note that the current flop measurements on SandyBridge
+are potentially wrong. So you cannot trust these counters at the moment!
diff --git a/groups/sandybridgeEP/MEM_SP.txt b/groups/sandybridgeEP/MEM_SP.txt
new file mode 100644
index 0000000..adb4713
--- /dev/null
+++ b/groups/sandybridgeEP/MEM_SP.txt
@@ -0,0 +1,61 @@
+SHORT Overview of arithmetic and main memory performance
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0  PWR_PKG_ENERGY
+PWR3  PWR_DRAM_ENERGY
+PMC0  FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE
+PMC1  FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE
+PMC2  SIMD_FP_256_PACKED_SINGLE
+MBOX0C0 CAS_COUNT_RD
+MBOX0C1 CAS_COUNT_WR
+MBOX1C0 CAS_COUNT_RD
+MBOX1C1 CAS_COUNT_WR
+MBOX2C0 CAS_COUNT_RD
+MBOX2C1 CAS_COUNT_WR
+MBOX3C0 CAS_COUNT_RD
+MBOX3C1 CAS_COUNT_WR
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Energy [J]  PWR0
+Power [W] PWR0/time
+Energy DRAM [J]  PWR3
+Power DRAM [W] PWR3/time
+MFlops/s  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
+AVX MFlops/s  1.0E-06*(PMC2*8.0)/time
+Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time
+Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0
+
+LONG
+Formula:
+Power [W] = PWR_PKG_ENERGY/runtime
+Power DRAM [W] = PWR_DRAM_ENERGY/runtime
+MFlops/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE+SIMD_FP_256_PACKED_SINGLE*8)/runtime
+AVX MFlops/s = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE/runtime
+Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/time
+Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1))*64.0/time
+Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1))*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0/time
+Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1))*64.0
+--
+Profiling group to measure memory bandwidth drawn by all cores of a socket.
+Since this group is based on uncore events it is only possible to measure on
+a per socket base. Also outputs total data volume transferred from main memory.
+SSE scalar and packed single precision flop rates. Also reports on packed AVX
+32b instructions. Please note that the current flop measurements on SandyBridge
+are potentially wrong. So you cannot trust these counters at the moment!
diff --git a/groups/sandybridgeEP/NUMA.txt b/groups/sandybridgeEP/NUMA.txt
new file mode 100644
index 0000000..0c1b8fb
--- /dev/null
+++ b/groups/sandybridgeEP/NUMA.txt
@@ -0,0 +1,33 @@
+SHORT Local and remote memory accesses
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 OFFCORE_RESPONSE_0_LOCAL_DRAM
+PMC1 OFFCORE_RESPONSE_1_REMOTE_DRAM
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Local DRAM data volume [GByte]  1.E-09*PMC0*64
+Local DRAM bandwidth [MByte/s]  1.E-06*(PMC0*64)/time
+Remote DRAM data volume [GByte]  1.E-09*PMC1*64
+Remote DRAM bandwidth [MByte/s]  1.E-06*(PMC1*64)/time
+Memory data volume [GByte]  1.E-09*(PMC0+PMC1)*64
+Memory bandwidth [MByte/s]  1.E-06*((PMC0+PMC1)*64)/time
+
+LONG
+Formula:
+CPI = CPU_CLK_UNHALTED_CORE/INSTR_RETIRED_ANY
+Local DRAM data volume [GByte] = 1.E-09*OFFCORE_RESPONSE_0_LOCAL_DRAM*64
+Local DRAM bandwidth [MByte/s] = 1.E-06*(OFFCORE_RESPONSE_0_LOCAL_DRAM*64)/time
+Remote DRAM data volume [GByte] = 1.E-09*OFFCORE_RESPONSE_1_REMOTE_DRAM*64
+Remote DRAM bandwidth [MByte/s] = 1.E-06*(OFFCORE_RESPONSE_1_REMOTE_DRAM*64)/time
+Memory data volume [GByte] = 1.E-09*(OFFCORE_RESPONSE_0_LOCAL_DRAM+OFFCORE_RESPONSE_1_REMOTE_DRAM)*64
+Memory bandwidth [MByte/s] = 1.E-06*((OFFCORE_RESPONSE_0_LOCAL_DRAM+OFFCORE_RESPONSE_1_REMOTE_DRAM)*64)/time
+--
+This performance group measures the data traffic of CPU cores to local and remote
+memory.
diff --git a/groups/sandybridgeEP/QPI.txt b/groups/sandybridgeEP/QPI.txt
new file mode 100644
index 0000000..bfbd4f4
--- /dev/null
+++ b/groups/sandybridgeEP/QPI.txt
@@ -0,0 +1,27 @@
+SHORT QPI traffic between sockets
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+SBOX0C0 DIRECT2CORE_SUCCESS
+SBOX0C1 RXL_FLITS_G1_DRS_DATA
+SBOX0C2 RXL_FLITS_G2_NCB_DATA
+SBOX1C0 DIRECT2CORE_SUCCESS
+SBOX1C1 RXL_FLITS_G1_DRS_DATA
+SBOX1C2 RXL_FLITS_G2_NCB_DATA
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Received bandwidth from QPI [MBytes/s] 1.0E-06*(SBOX0C1+SBOX0C2+SBOX1C1+SBOX1C2)*8/time
+Received data volume from QPI [GBytes] 1.0E-09*(SBOX0C1+SBOX0C2+SBOX1C1+SBOX1C2)*8
+Bandwidth QPI to LLC [MBytes/s] 1.0E-06*(SBOX0C0+SBOX1C0)*64/time
+Data volume QPI to LLC [GBytes] 1.0E-09*(SBOX0C0+SBOX1C0)*64
+Bandwidth QPI to HA or IIO [MBytes/s] 1.0E-06*(((SBOX0C1+SBOX0C2+SBOX1C1+SBOX1C2)*8)-((SBOX0C0+SBOX1C0)*64))/time
+Data volume QPI to HA or IIO [GBytes] 1.0E-09*(((SBOX0C1+SBOX0C2+SBOX1C1+SBOX1C2)*8)-((SBOX0C0+SBOX1C0)*64))
+
+LONG
+Profiling group to measure traffic on the QPI.
diff --git a/groups/sandybridgeEP/TLB_DATA.txt b/groups/sandybridgeEP/TLB_DATA.txt
new file mode 100644
index 0000000..5e54147
--- /dev/null
+++ b/groups/sandybridgeEP/TLB_DATA.txt
@@ -0,0 +1,35 @@
+SHORT  L1 Data TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  DTLB_LOAD_MISSES_CAUSES_A_WALK
+PMC1  DTLB_STORE_MISSES_CAUSES_A_WALK
+PMC2  DTLB_LOAD_MISSES_WALK_DURATION
+PMC3  DTLB_STORE_MISSES_WALK_DURATION
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1 DTLB load misses     PMC0
+L1 DTLB load miss rate  PMC0/FIXC0
+L1 DTLB load miss duration [Cyc] PMC2/PMC0
+L1 DTLB store misses     PMC1
+L1 DTLB store miss rate  PMC1/FIXC0
+L1 DTLB store miss duration [Cyc] PMC3/PMC1
+
+LONG
+Formulas:
+L1 DTLB load misses = DTLB_LOAD_MISSES_CAUSES_A_WALK
+L1 DTLB load miss rate = DTLB_LOAD_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB load miss duration [Cyc] = DTLB_LOAD_MISSES_WALK_DURATION / DTLB_LOAD_MISSES_CAUSES_A_WALK
+L1 DTLB store misses = DTLB_STORE_MISSES_CAUSES_A_WALK
+L1 DTLB store miss rate = DTLB_STORE_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 DTLB store miss duration [Cyc] = DTLB_STORE_MISSES_WALK_DURATION / DTLB_STORE_MISSES_CAUSES_A_WALK
+-
+The DTLB load and store miss rates give a measure of how often a TLB miss occurred
+per instruction. The duration measures the time in cycles that a page walk took.
+
diff --git a/groups/sandybridgeEP/TLB_INSTR.txt b/groups/sandybridgeEP/TLB_INSTR.txt
new file mode 100644
index 0000000..8faaebe
--- /dev/null
+++ b/groups/sandybridgeEP/TLB_INSTR.txt
@@ -0,0 +1,28 @@
+SHORT  L1 Instruction TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  ITLB_MISSES_CAUSES_A_WALK
+PMC1  ITLB_MISSES_WALK_DURATION
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1 ITLB misses     PMC0
+L1 ITLB miss rate  PMC0/FIXC0
+L1 ITLB miss duration [Cyc] PMC1/PMC0
+
+
+LONG
+Formulas:
+L1 ITLB misses = ITLB_MISSES_CAUSES_A_WALK
+L1 ITLB miss rate = ITLB_MISSES_CAUSES_A_WALK / INSTR_RETIRED_ANY
+L1 ITLB miss duration [Cyc] = ITLB_MISSES_WALK_DURATION / ITLB_MISSES_CAUSES_A_WALK
+-
+The ITLB miss rate gives a measure of how often a TLB miss occurred
+per instruction. The duration measures the time in cycles that a page walk took.
+
diff --git a/groups/silvermont/BRANCH.txt b/groups/silvermont/BRANCH.txt
index cbaf834..09699d9 100644
--- a/groups/silvermont/BRANCH.txt
+++ b/groups/silvermont/BRANCH.txt
@@ -19,10 +19,10 @@ Instructions per branch  FIXC0/PMC0
 
 LONG
 Formulas:
-Branch rate = BR_INST_RETIRED_ALL_BRANCHES / INSTR_RETIRED_ANY
-Branch misprediction rate =  BR_MISP_RETIRED_ALL_BRANCHES / INSTR_RETIRED_ANY
-Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES / BR_INST_RETIRED_ALL_BRANCHES
-Instructions per branch = INSTR_RETIRED_ANY / BR_INST_RETIRED_ALL_BRANCHES
+Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction rate =  BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES
+Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES
 -
 The rates state how often in average a branch or a mispredicted branch occured
 per instruction retired in total. The Branch misprediction ratio sets directly
diff --git a/groups/silvermont/CLOCK.txt b/groups/silvermont/CLOCK.txt
new file mode 100644
index 0000000..dd1823f
--- /dev/null
+++ b/groups/silvermont/CLOCK.txt
@@ -0,0 +1,23 @@
+SHORT Power and Energy consumption
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0  PWR_PKG_ENERGY
+
+METRICS
+Runtime (RDTSC) [s] time 
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Energy [J]  PWR0
+Power [W] PWR0/time
+
+LONG
+Formula:
+Power =  PWR_PKG_ENERGY / time
+-
+Silvermont implements the new RAPL interface. This interface enables to
+monitor the consumed energy on the package (socket) level.
+
diff --git a/groups/silvermont/DATA.txt b/groups/silvermont/DATA.txt
new file mode 100644
index 0000000..61a915b
--- /dev/null
+++ b/groups/silvermont/DATA.txt
@@ -0,0 +1,22 @@
+SHORT Load to store ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  MEM_UOPS_RETIRED_ALL_LOADS
+PMC1  MEM_UOPS_RETIRED_ALL_STORES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Load to store ratio PMC0/PMC1
+
+LONG
+Formulas:
+Load to store ratio = MEM_UOPS_RETIRED_ALL_LOADS/MEM_UOPS_RETIRED_ALL_STORES
+-
+This is a metric to determine your load to store ratio.
+
diff --git a/groups/silvermont/ENERGY.txt b/groups/silvermont/ENERGY.txt
index 5646a9a..d0996b3 100644
--- a/groups/silvermont/ENERGY.txt
+++ b/groups/silvermont/ENERGY.txt
@@ -6,6 +6,7 @@ FIXC1 CPU_CLK_UNHALTED_CORE
 FIXC2 CPU_CLK_UNHALTED_REF
 TMP0  TEMP_CORE
 PWR0  PWR_PKG_ENERGY
+PWR1  PWR_PP0_ENERGY
 
 METRICS
 Runtime (RDTSC) [s] time
@@ -15,10 +16,13 @@ CPI  FIXC1/FIXC0
 Temperature [C]  TMP0
 Energy [J]  PWR0
 Power [W] PWR0/time
+Energy PP0 [J]  PWR1
+Power PP0 [W] PWR1/time
 
 LONG
 Formula:
-Power =  PWR_PKG_ENERGY / time
+Power = PWR_PKG_ENERGY / time
+Power PP0 = PWR_PP0_ENERGY / time
 -
 Silvermont implements the new RAPL interface. This interface enables to
 monitor the consumed energy on the package (socket) level.
diff --git a/groups/silvermont/ICACHE.txt b/groups/silvermont/ICACHE.txt
index 6ce3ce8..5f11ad6 100644
--- a/groups/silvermont/ICACHE.txt
+++ b/groups/silvermont/ICACHE.txt
@@ -18,8 +18,8 @@ L1I miss ratio PMC1/PMC0
 
 LONG
 Formulas:
-L2 request rate = ICACHE_ACCESSES / INSTR_RETIRED_ANY
-L2 miss rate  = ICACHE_MISSES / INSTR_RETIRED_ANY
-L2 miss ratio = ICACHE_MISSES / ICACHE_ACCESSES
+L1I request rate = ICACHE_ACCESSES / INSTR_RETIRED_ANY
+L1I miss rate = ICACHE_MISSES / INSTR_RETIRED_ANY
+L1I miss ratio = ICACHE_MISSES / ICACHE_ACCESSES
 -
 This group measures some L1 instruction cache metrics.
diff --git a/groups/silvermont/L1TOL2.txt b/groups/silvermont/L1TOL2.txt
deleted file mode 100644
index 225533d..0000000
--- a/groups/silvermont/L1TOL2.txt
+++ /dev/null
@@ -1,28 +0,0 @@
-SHORT L2 load cache bandwidth in MBytes/s
-
-EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
-PMC0  MEM_UOPS_RETIRED_L1_MISS_LOADS 
-
-METRICS
-Runtime (RDTSC) [s] time
-Runtime unhalted [s] FIXC1*inverseClock
-Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
-CPI  FIXC1/FIXC0
-L2 Load [MBytes/s] 1.0E-06*PMC0*64.0/time
-L2 bandwidth [MBytes/s] 1.0E-06*(PMC0)*64.0/time
-L2 data volume [GBytes] 1.0E-09*(PMC0)*64.0
-
-LONG
-Formulas:
-L2 Load [MBytes/s] = 1.0E-06*MEM_UOPS_RETIRED_L1_MISS_LOADS*64/time
-L2 bandwidth [MBytes/s] = 1.0E-06*(MEM_UOPS_RETIRED_L1_MISS_LOADS)*64/time
-L2 data volume [GBytes] = 1.0E-09*(MEM_UOPS_RETIRED_L1_MISS_LOADS)*64
--
-Profiling group to measure L2 load cache bandwidth. The bandwidth is computed by the
-number of cacheline allocated in the L1 cache. Since there is no possibility to retrieve
-the evicted cache lines, this group measures only the load cache bandwidth.
-The group also output totally loaded data volume transfered between L2 and L1.
-
diff --git a/groups/silvermont/L2CACHE.txt b/groups/silvermont/L2CACHE.txt
new file mode 100644
index 0000000..02e5d93
--- /dev/null
+++ b/groups/silvermont/L2CACHE.txt
@@ -0,0 +1,34 @@
+SHORT L2 cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  LONGEST_LAT_CACHE_REFERENCE
+PMC1  LONGEST_LAT_CACHE_MISS
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L2 request rate PMC0/FIXC0
+L2 miss rate PMC1/FIXC0
+L2 miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L2 request rate = LONGEST_LAT_CACHE_REFERENCE/INSTR_RETIRED_ANY
+L2 miss rate = LONGEST_LAT_CACHE_MISS/INSTR_RETIRED_ANY
+L2 miss ratio = LONGEST_LAT_CACHE_MISS/LONGEST_LAT_CACHE_REFERENCE
+-
+This group measures the locality of your data accesses with regard to the
+L2 Cache. L2 request rate tells you how data intensive your code is
+or how many Data accesses you have in average per instruction.
+The L2 miss rate gives a measure how often it was necessary to get
+cachelines from memory. And finally L2 miss ratio tells you how many of your
+memory references required a cacheline to be loaded from a higher level.
+While the Data cache miss rate might be given by your algorithm you should
+try to get Data cache miss ratio as low as possible by increasing your cache
+reuse.
+
diff --git a/groups/silvermont/L2TOMEM.txt b/groups/silvermont/L2TOMEM.txt
deleted file mode 100644
index bc4cbed..0000000
--- a/groups/silvermont/L2TOMEM.txt
+++ /dev/null
@@ -1,26 +0,0 @@
-SHORT L2 to Mem load cache bandwidth in MBytes/s
-
-EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
-PMC0  MEM_UOPS_RETIRED_L2_MISS_LOADS 
-
-METRICS
-Runtime (RDTSC) [s] time
-Runtime unhalted [s] FIXC1*inverseClock
-Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
-CPI  FIXC1/FIXC0
-L2 to MEM load bandwidth [MBytes/s] 1.0E-06*(PMC0)*64.0/time
-L2 to MEM load data volume [GBytes] 1.0E-09*(PMC0)*64.0
-
-LONG
-Formulas:
-L2 to MEM load bandwidth [MBytes/s] = 1.0E-06*(MEM_UOPS_RETIRED_L2_MISS_LOADS)*64/time
-L2 to MEM load data volume [GBytes] = 1.0E-09*(MEM_UOPS_RETIRED_L2_MISS_LOADS)*64
--
-Profiling group to measure L2 to MEM load cache bandwidth. The bandwidth is computed by the
-number of cacheline allocated in the L2 cache. Since there is no possibility to retrieve
-the evicted cache lines, this group measures only the load cache bandwidth.
-The group also output totally loaded data volume transfered between memory and L2.
-
diff --git a/groups/silvermont/MEM.txt b/groups/silvermont/MEM.txt
new file mode 100644
index 0000000..85b017f
--- /dev/null
+++ b/groups/silvermont/MEM.txt
@@ -0,0 +1,37 @@
+SHORT Memory load bandwidth in MBytes/s
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  LONGEST_LAT_CACHE_MISS
+PMC1  OFFCORE_RESPONSE_1_WB_ANY
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Memory read bandwidth [MBytes/s] 1.0E-06*(PMC0)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(PMC0)*64.0
+Memory writeback bandwidth [MBytes/s] 1.0E-06*(PMC1)*64.0/time
+Memory writeback data volume [GBytes] 1.0E-09*(PMC1)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+
+LONG
+Formulas:
+Memory read bandwidth [MBytes/s] = 1.0E-06*(LONGEST_LAT_CACHE_MISS)*64/time
+Memory read data volume [GBytes] = 1.0E-09*(LONGEST_LAT_CACHE_MISS)*64
+Memory writeback bandwidth [MBytes/s] = 1.0E-06*(OFFCORE_RESPONSE_1_WB_ANY)*64/time
+Memory writeback data volume [GBytes] = 1.0E-09*(OFFCORE_RESPONSE_1_WB_ANY)*64
+Memory bandwidth [MBytes/s] = 1.0E-06*(LONGEST_LAT_CACHE_MISS+OFFCORE_RESPONSE_1_WB_ANY)*64/time
+Memory data volume [GBytes] = 1.0E-09*(LONGEST_LAT_CACHE_MISS+OFFCORE_RESPONSE_1_WB_ANY)*64
+-
+Profiling group to measure L2 to MEM load cache bandwidth. The bandwidth is computed by the
+number of cachelines allocated in the L2 cache. Since there is no possibility to retrieve
+the evicted cache lines, this group measures only the load cache bandwidth. The
+writeback metrics count only modified cache lines that are written back to go to
+exclusive state.
+The group also outputs the total load and writeback data volume transferred between memory and L2.
+
diff --git a/groups/silvermont/MEM_LAT.txt b/groups/silvermont/MEM_LAT.txt
new file mode 100644
index 0000000..516b135
--- /dev/null
+++ b/groups/silvermont/MEM_LAT.txt
@@ -0,0 +1,23 @@
+SHORT Average data read latency
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  OFFCORE_RESPONSE_0_DMND_DATA_RD_AVG_LAT
+PMC1  OFFCORE_RESPONSE_1_DMND_DATA_RD_ANY
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Average data read latency [cyc/read] PMC0/PMC1
+
+LONG
+Formulas:
+Average data read latency [cyc/read] = OFFCORE_RESPONSE_0_DMND_DATA_RD_AVG_LAT/OFFCORE_RESPONSE_1_DMND_DATA_RD_ANY
+-
+The Offcore request facility of Intel Silvermont processors can be used to determine
+the average data read latency. It includes all operations done to read data like
+snoops and hits in upper cache levels.
diff --git a/groups/silvermont/TLB_DATA.txt b/groups/silvermont/TLB_DATA.txt
new file mode 100644
index 0000000..4a09a85
--- /dev/null
+++ b/groups/silvermont/TLB_DATA.txt
@@ -0,0 +1,27 @@
+SHORT  L1 Data TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  PAGE_WALKS_DTLB_COUNT
+PMC1  PAGE_WALKS_DTLB_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1 DTLB misses     PMC0
+L1 DTLB miss rate  PMC0/FIXC0
+L1 DTLB miss duration [Cyc] PMC1/PMC0
+
+LONG
+Formulas:
+L1 DTLB misses = PAGE_WALKS_DTLB_COUNT
+L1 DTLB miss rate = PAGE_WALKS_DTLB_COUNT / INSTR_RETIRED_ANY
+L1 DTLB miss duration [Cyc] = PAGE_WALKS_DTLB_CYCLES / PAGE_WALKS_DTLB_COUNT
+-
+The DTLB miss rate gives a measure of how often a TLB miss occurred
+per instruction. The duration measures the time in cycles that a page walk took.
+
diff --git a/groups/silvermont/TLB_INSTR.txt b/groups/silvermont/TLB_INSTR.txt
new file mode 100644
index 0000000..a8f7ace
--- /dev/null
+++ b/groups/silvermont/TLB_INSTR.txt
@@ -0,0 +1,27 @@
+SHORT  L1 Instruction TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  PAGE_WALKS_ITLB_COUNT
+PMC1  PAGE_WALKS_ITLB_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1 ITLB misses     PMC0
+L1 ITLB miss rate  PMC0/FIXC0
+L1 ITLB miss duration [Cyc] PMC1/PMC0
+
+
+LONG
+Formulas:
+L1 ITLB misses = PAGE_WALKS_ITLB_COUNT
+L1 ITLB miss rate = PAGE_WALKS_ITLB_COUNT / INSTR_RETIRED_ANY
+L1 ITLB miss duration [Cyc] = PAGE_WALKS_ITLB_CYCLES / PAGE_WALKS_ITLB_COUNT
+-
+The ITLB miss rate gives a measure of how often a TLB miss occurred
+per instruction. The duration measures the time in cycles that a page walk took.
diff --git a/groups/westmere/BRANCH.txt b/groups/westmere/BRANCH.txt
index 3d81416..09699d9 100644
--- a/groups/westmere/BRANCH.txt
+++ b/groups/westmere/BRANCH.txt
@@ -19,13 +19,13 @@ Instructions per branch  FIXC0/PMC0
 
 LONG
 Formulas:
-Branch rate = BR_INST_RETIRED_ALL_BRANCHES / INSTR_RETIRED_ANY
-Branch misprediction rate =  BR_MISP_RETIRED_ALL_BRANCHES / INSTR_RETIRED_ANY
-Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES / BR_INST_RETIRED_ALL_BRANCHES
-Instructions per branch = INSTR_RETIRED_ANY / BR_INST_RETIRED_ALL_BRANCHES
+Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction rate =  BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES
+Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES
 -
 The rates state how often in average a branch or a mispredicted branch occured
 per instruction retired in total. The Branch misprediction ratio sets directly
-into relation what ration of all branch instruction where mispredicted.
+into relation what ratio of all branch instruction where mispredicted.
 Instructions per branch is 1/Branch rate.
 
diff --git a/groups/westmere/DATA.txt b/groups/westmere/DATA.txt
index a5611bc..08d6d76 100644
--- a/groups/westmere/DATA.txt
+++ b/groups/westmere/DATA.txt
@@ -12,11 +12,11 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-Load to Store ratio PMC0/PMC1
+Load to store ratio PMC0/PMC1
 
 LONG
 Formulas:
-Load to Store ratio = MEM_INST_RETIRED_LOADS / MEM_INST_RETIRED_STORES
+Load to store ratio = MEM_INST_RETIRED_LOADS/MEM_INST_RETIRED_STORES
 -
 This is a simple metric to determine your Load to store ratio.
 
diff --git a/groups/westmere/FLOPS_DP.txt b/groups/westmere/FLOPS_DP.txt
index c5ba91c..658b8ff 100644
--- a/groups/westmere/FLOPS_DP.txt
+++ b/groups/westmere/FLOPS_DP.txt
@@ -14,7 +14,7 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-DP MFlops/s  1.0E-06*(PMC0*2.0+PMC1)/time
+MFlops/s  1.0E-06*(PMC0*2.0+PMC1)/time
 Packed MUOPS/s   1.0E-06*PMC0/time
 Scalar MUOPS/s 1.0E-06*PMC1/time
 SP MUOPS/s 1.0E-06*PMC2/time
@@ -22,7 +22,11 @@ DP MUOPS/s 1.0E-06*PMC3/time
 
 LONG
 Formula:
-DP MFlops/s =  (FP_COMP_OPS_EXE_SSE_FP_PACKED*2 +  FP_COMP_OPS_EXE_SSE_FP_SCALAR)/ runtime
+MFlops/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime
+Packed MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime
+SP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime
+DP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime
 -
 The Nehalem has not possibility to measure MFlops if mixed precision calculations are done.
 Therefore both Single as well as Double precision are measured to ensure the correctness
diff --git a/groups/westmere/FLOPS_SP.txt b/groups/westmere/FLOPS_SP.txt
index 4478c8f..a954670 100644
--- a/groups/westmere/FLOPS_SP.txt
+++ b/groups/westmere/FLOPS_SP.txt
@@ -14,7 +14,7 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-SP MFlops/s (SP assumed) 1.0E-06*(PMC0*4.0+PMC1)/time
+MFlops/s 1.0E-06*(PMC0*4.0+PMC1)/time
 Packed MUOPS/s   1.0E-06*PMC0/time
 Scalar MUOPS/s 1.0E-06*PMC1/time
 SP MUOPS/s 1.0E-06*PMC2/time
@@ -22,7 +22,11 @@ DP MUOPS/s 1.0E-06*PMC3/time
 
 LONG
 Formula:
-SP MFlops/s =  (FP_COMP_OPS_EXE_SSE_FP_PACKED*4 +  FP_COMP_OPS_EXE_SSE_FP_SCALAR)/ runtime
+MFlops/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime
+Packed MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime
+SP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime
+DP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime
 -
 The Nehalem has not possibility to measure MFlops if mixed precision calculations are done.
 Therefore both Single as well as Double precision are measured to ensure the correctness
diff --git a/groups/westmere/ICACHE.txt b/groups/westmere/ICACHE.txt
new file mode 100644
index 0000000..49943ff
--- /dev/null
+++ b/groups/westmere/ICACHE.txt
@@ -0,0 +1,25 @@
+SHORT  Instruction cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  L1I_READS
+PMC1  L1I_MISSES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1I request rate PMC0/FIXC0
+L1I miss rate PMC1/FIXC0
+L1I miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L1I request rate = L1I_READS / INSTR_RETIRED_ANY
+L1I miss rate = L1I_MISSES / INSTR_RETIRED_ANY
+L1I miss ratio = L1I_MISSES / L1I_READS
+-
+This group measures some L1 instruction cache metrics.
diff --git a/groups/westmere/L2.txt b/groups/westmere/L2.txt
index 5506f1f..a69aa97 100644
--- a/groups/westmere/L2.txt
+++ b/groups/westmere/L2.txt
@@ -6,27 +6,33 @@ FIXC1 CPU_CLK_UNHALTED_CORE
 FIXC2 CPU_CLK_UNHALTED_REF
 PMC0  L1D_REPL
 PMC1  L1D_M_EVICT
+PMC2  L1I_MISSES
 
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-L2 Load [MBytes/s] 1.0E-06*PMC0*64.0/time
-L2 Evict [MBytes/s] 1.0E-06*PMC1*64.0/time
-L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
-L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+L2D load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time
+L2D load data volume [GBytes]  1.0E-09*PMC0*64.0
+L2D evict bandwidth [MBytes/s]  1.0E-06*PMC1*64.0/time
+L2D evict data volume [GBytes]  1.0E-09*PMC1*64.0
+L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time
+L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0
 
 LONG
 Formulas:
-L2 Load [MBytes/s] = 1.0E-06*L1D_REPL*64/time
-L2 Evict [MBytes/s] = 1.0E-06*L1D_M_EVICT*64/time
-L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPL+L1D_M_EVICT)*64/time
-L2 data volume [GBytes] = 1.0E-09*(L1D_REPL+L1D_M_EVICT)*64
+L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPL*64.0/time
+L2D load data volume [GBytes] = 1.0E-09*L1D_REPL*64.0
+L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64.0/time
+L2D evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64.0
+L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPL+L1D_M_EVICT+L1I_MISSES)*64/time
+L2 data volume [GBytes] = 1.0E-09*(L1D_REPL+L1D_M_EVICT+L1I_MISSES)*64
 -
 Profiling group to measure L2 cache bandwidth. The bandwidth is computed by the
 number of cacheline allocated in the L1 and the number of modified cachelines
 evicted from the L1. The group also reports on data volume transfered between
 L2 and L1 cache. Note that this bandwidth also includes data transfers due to a
-write allocate load on a store miss in L1.
+write allocate load on a store miss in L1 and traffic caused by misses in the
+L1 instruction cache.
 
diff --git a/groups/westmere/L2CACHE.txt b/groups/westmere/L2CACHE.txt
index 49778be..edc8d9c 100644
--- a/groups/westmere/L2CACHE.txt
+++ b/groups/westmere/L2CACHE.txt
@@ -4,7 +4,7 @@ EVENTSET
 FIXC0 INSTR_RETIRED_ANY
 FIXC1 CPU_CLK_UNHALTED_CORE
 FIXC2 CPU_CLK_UNHALTED_REF
-PMC0  L2_DATA_RQSTS_DEMAND_ANY
+PMC0  L2_RQSTS_REFERENCES
 PMC1  L2_RQSTS_MISS
 
 METRICS
@@ -18,9 +18,9 @@ L2 miss ratio PMC1/PMC0
 
 LONG
 Formulas:
-L2 request rate = L2_DATA_RQSTS_DEMAND_ANY / INSTR_RETIRED_ANY
-L2 miss rate  = L2_RQSTS_MISS / INSTR_RETIRED_ANY
-L2 miss ratio = L2_RQSTS_MISS / L2_DATA_RQSTS_DEMAND_ANY
+L2 request rate = L2_RQSTS_REFERENCES/INSTR_RETIRED_ANY
+L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY
+L2 miss ratio = L2_RQSTS_MISS/L2_RQSTS_REFERENCES
 -
 This group measures the locality of your data accesses with regard to the
 L2 Cache. L2 request rate tells you how data intensive your code is
@@ -30,6 +30,5 @@ cachelines from memory. And finally L2 miss ratio tells you how many of your
 memory references required a cacheline to be loaded from a higher level.
 While the Data cache miss rate might be given by your algorithm you should
 try to get Data cache miss ratio as low as possible by increasing your cache reuse.
-Note: This group might need to be revised!
 
 
diff --git a/groups/westmere/L3.txt b/groups/westmere/L3.txt
index 6a58f78..ef68c70 100644
--- a/groups/westmere/L3.txt
+++ b/groups/westmere/L3.txt
@@ -12,17 +12,21 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-L3 Load [MBytes/s]  1.0E-06*PMC0*64.0/time
-L3 Evict [MBytes/s]  1.0E-06*PMC1*64.0/time
+L3 load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time
+L3 load data volume [GBytes]  1.0E-09*PMC0*64.0
+L3 evict bandwidth [MBytes/s]  1.0E-06*PMC1*64.0/time
+L3 evict data volume [GBytes]  1.0E-09*PMC1*64.0
 L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
 L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
 
 LONG
 Formulas:
-L3 Load [MBytes/s]  1.0E-06*L2_LINES_IN_ANY*64/time
-L3 Evict [MBytes/s]  1.0E-06*L2_LINES_OUT_DEMAND_DIRTY*64/time
-L3 bandwidth [MBytes/s] 1.0E-06*(L2_LINES_IN_ANY+L2_LINES_OUT_DEMAND_DIRTY)*64/time
-L3 data volume [GBytes] 1.0E-09*(L2_LINES_IN_ANY+L2_LINES_OUT_DEMAND_DIRTY)*64
+L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ANY*64.0/time
+L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ANY*64.0
+L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_LINES_OUT_ANY*64.0/time
+L3 evict data volume [GBytes] = 1.0E-09*L2_LINES_OUT_ANY*64.0
+L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ANY+L2_LINES_OUT_ANY)*64/time
+L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ANY+L2_LINES_OUT_ANY)*64
 -
 Profiling group to measure L3 cache bandwidth. The bandwidth is computed by the
 number of cacheline allocated in the L2 and the number of modified cachelines
diff --git a/groups/westmere/L3CACHE.txt b/groups/westmere/L3CACHE.txt
index 944bc97..547d818 100644
--- a/groups/westmere/L3CACHE.txt
+++ b/groups/westmere/L3CACHE.txt
@@ -1,28 +1,26 @@
 SHORT L3 cache miss rate/ratio
 
 EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
+FIXC0  INSTR_RETIRED_ANY
+FIXC1  CPU_CLK_UNHALTED_CORE
+FIXC2  CPU_CLK_UNHALTED_REF
 UPMC0  UNC_L3_HITS_ANY
 UPMC1  UNC_L3_MISS_ANY
-UPMC2  UNC_L3_LINES_IN_ANY
-UPMC3  UNC_L3_LINES_OUT_ANY
 
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-L3 request rate   UPMC0/FIXC0
+L3 request rate   (UPMC0+UPMC1)/FIXC0
 L3 miss rate   UPMC1/FIXC0
 L3 miss ratio  UPMC1/(UPMC0+UPMC1)
 
 LONG
 Formulas:
-L3 request rate  UNC_L3_HITS_ANY / INSTR_RETIRED_ANY 
-L3 miss rate   UNC_L3_MISS_ANY / INSTR_RETIRED_ANY
-L3 miss ratio  UNC_L3_MISS_ANY / (UNC_L3_HITS_ANY + UNC_L3_MISS_ANY)
+L3 request rate = (UNC_L3_HITS_ANY+UNC_L3_MISS_ANY)/INSTR_RETIRED_ANY
+L3 miss rate = UNC_L3_MISS_ANY/INSTR_RETIRED_ANY
+L3 miss ratio = UNC_L3_MISS_ANY/(UNC_L3_HITS_ANY+UNC_L3_MISS_ANY)
 -
 This group measures the locality of your data accesses with regard to the L3
 Cache. L3 request rate tells you how data intensive your code is or how many
diff --git a/groups/westmere/MEM.txt b/groups/westmere/MEM.txt
index f9e19ad..2288571 100644
--- a/groups/westmere/MEM.txt
+++ b/groups/westmere/MEM.txt
@@ -1,33 +1,46 @@
-SHORT Main memory bandwidth
+SHORT Main memory bandwidth in MBytes/s
 
 EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
+FIXC0  INSTR_RETIRED_ANY
+FIXC1  CPU_CLK_UNHALTED_CORE
+FIXC2  CPU_CLK_UNHALTED_REF
 UPMC0  UNC_QMC_NORMAL_READS_ANY
 UPMC1  UNC_QMC_WRITES_FULL_ANY
-UPMC2 UNC_QHL_REQUESTS_REMOTE_READS
-UPMC3 UNC_QHL_REQUESTS_LOCAL_READS 
-UPMC4 UNC_QHL_REQUESTS_REMOTE_WRITES 
+UPMC2  UNC_QHL_REQUESTS_REMOTE_READS
+UPMC3  UNC_QHL_REQUESTS_REMOTE_WRITES
 
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-Memory bandwidth [MBytes/s] 1.0E-06*(UPMC0+UPMC1)*64/time
-Memory data volume [GBytes] 1.0E-09*(UPMC0+UPMC1)*64
-Remote Read BW [MBytes/s] 1.0E-06*(UPMC2)*64/time
-Remote Write BW [MBytes/s] 1.0E-06*(UPMC4)*64/time
-Remote BW [MBytes/s] 1.0E-06*(UPMC2+UPMC4)*64/time
+Memory read bandwidth [MBytes/s] 1.0E-06*UPMC0*64.0/time
+Memory read data volume [GBytes] 1.0E-09*UPMC0*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*UPMC1*64.0/time
+Memory write data volume [GBytes] 1.0E-09*UPMC1*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(UPMC0+UPMC1)*64.0/time
+Memory data volume [GBytes] 1.0E-09*(UPMC0+UPMC1)*64.0
+Remote memory read bandwidth [MBytes/s] 1.0E-06*UPMC2*64.0/time
+Remote memory read data volume [GBytes] 1.0E-09*UPMC2*64.0
+Remote memory write bandwidth [MBytes/s] 1.0E-06*UPMC3*64.0/time
+Remote memory write data volume [GBytes] 1.0E-09*UPMC3*64.0
+Remote memory bandwidth [MBytes/s] 1.0E-06*(UPMC2+UPMC3)*64.0/time
+Remote memory data volume [GBytes] 1.0E-09*(UPMC2+UPMC3)*64.0
 
 LONG
 Formulas:
-Memory bandwidth [MBytes/s] = 1.0E-06*(UNC_QMC_NORMAL_READS_ANY+UNC_QMC_WRITES_FULL_ANY)*64/time
-Memory data volume [GBytes] = 1.0E-09*(UNC_QMC_NORMAL_READS_ANY+UNC_QMC_WRITES_FULL_ANY)*64
-Remote Read BW [MBytes/s] =  1.0E-06*(UNC_QHL_REQUESTS_REMOTE_READS)*64/time;
-Remote Write BW [MBytes/s] 1.0E-06*(UNC_QHL_REQUESTS_REMOTE_WRITES)*64/time
-Remote BW [MBytes/s] 1.0E-06*(UNC_QHL_REQUESTS_REMOTE_READS+UNC_QHL_REQUESTS_REMOTE_WRITES)*64/time
+Memory read bandwidth [MBytes/s] = 1.0E-06*UNC_QMC_NORMAL_READS_ANY*64.0/time
+Memory read data volume [GBytes] = 1.0E-09*UNC_QMC_NORMAL_READS_ANY*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*UNC_QMC_WRITES_FULL_ANY*64.0/time
+Memory write data volume [GBytes] = 1.0E-09*UNC_QMC_WRITES_FULL_ANY*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(UNC_QMC_NORMAL_READS_ANY+UNC_QMC_WRITES_FULL_ANY)*64.0/time
+Memory data volume [GBytes] = 1.0E-09*(UNC_QMC_NORMAL_READS_ANY+UNC_QMC_WRITES_FULL_ANY)*64.0
+Remote memory read bandwidth [MBytes/s] = 1.0E-06*UNC_QHL_REQUESTS_REMOTE_READS*64.0/time
+Remote memory read data volume [GBytes] = 1.0E-09*UNC_QHL_REQUESTS_REMOTE_READS*64.0
+Remote memory write bandwidth [MBytes/s] = 1.0E-06*UNC_QHL_REQUESTS_REMOTE_WRITES*64.0/time
+Remote memory write data volume [GBytes] = 1.0E-09*UNC_QHL_REQUESTS_REMOTE_WRITES*64.0
+Remote memory bandwidth [MBytes/s] = 1.0E-06*(UNC_QHL_REQUESTS_REMOTE_READS+UNC_QHL_REQUESTS_REMOTE_WRITES)*64.0/time
+Remote memory data volume [GBytes] = 1.0E-09*(UNC_QHL_REQUESTS_REMOTE_READS+UNC_QHL_REQUESTS_REMOTE_WRITES)*64.0
 -
 Profiling group to measure memory bandwidth drawn by all cores of a socket.
 This group will be measured by one core per socket. The Remote  Read BW  tells
diff --git a/groups/westmere/TLB.txt b/groups/westmere/TLB.txt
deleted file mode 100644
index 0077350..0000000
--- a/groups/westmere/TLB.txt
+++ /dev/null
@@ -1,22 +0,0 @@
-SHORT  TLB miss rate/ratio
-
-EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
-PMC0  DTLB_MISSES_ANY
-
-METRICS
-Runtime (RDTSC) [s] time
-Runtime unhalted [s] FIXC1*inverseClock
-Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
-CPI  FIXC1/FIXC0
-L1 DTLB miss rate  PMC0/FIXC0
-
-LONG
-Formulas:
-DTLB miss rate  = DTLB_MISSES_ANY / INSTR_RETIRED_ANY
--
-The DTLB miss  rate gives a measure how often a TLB miss occured
-per instruction. 
-
diff --git a/groups/westmere/TLB_DATA.txt b/groups/westmere/TLB_DATA.txt
new file mode 100644
index 0000000..8168806
--- /dev/null
+++ b/groups/westmere/TLB_DATA.txt
@@ -0,0 +1,35 @@
+SHORT  L1 Data TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  DTLB_LOAD_MISSES_ANY
+PMC1  DTLB_MISSES_ANY
+PMC2  DTLB_LOAD_MISSES_WALK_CYCLES
+PMC3  DTLB_MISSES_WALK_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1 DTLB load misses     PMC0
+L1 DTLB load miss rate  PMC0/FIXC0
+L1 DTLB load miss duration [Cyc] PMC2/PMC0
+L1 DTLB store misses     (PMC1-PMC0)
+L1 DTLB store miss rate  (PMC1-PMC0)/FIXC0
+L1 DTLB store miss duration [Cyc] (PMC3-PMC2)/(PMC1-PMC0)
+
+LONG
+Formulas:
+L1 DTLB load misses = DTLB_LOAD_MISSES_ANY
+L1 DTLB load miss rate = DTLB_LOAD_MISSES_ANY / INSTR_RETIRED_ANY
+L1 DTLB load miss duration [Cyc] = DTLB_LOAD_MISSES_WALK_CYCLES / DTLB_LOAD_MISSES_ANY
+L1 DTLB store misses = DTLB_MISSES_ANY-DTLB_LOAD_MISSES_ANY
+L1 DTLB store miss rate = (DTLB_MISSES_ANY-DTLB_LOAD_MISSES_ANY) / INSTR_RETIRED_ANY
+L1 DTLB store miss duration [Cyc] = (DTLB_MISSES_WALK_CYCLES-DTLB_LOAD_MISSES_WALK_CYCLES) / (DTLB_MISSES_ANY-DTLB_LOAD_MISSES_ANY)
+-
+The DTLB miss rate gives a measure of how often a TLB miss occurred
+per instruction. The store miss values are derived by subtracting the LOAD events from the ANY events.
+
diff --git a/groups/westmere/TLB_INSTR.txt b/groups/westmere/TLB_INSTR.txt
new file mode 100644
index 0000000..854f382
--- /dev/null
+++ b/groups/westmere/TLB_INSTR.txt
@@ -0,0 +1,27 @@
+SHORT  L1 Instruction TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  ITLB_MISSES_ANY
+PMC1  ITLB_MISSES_WALK_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1 ITLB misses     PMC0
+L1 ITLB miss rate  PMC0/FIXC0
+L1 ITLB miss duration [Cyc] PMC1/PMC0
+
+LONG
+Formulas:
+L1 ITLB misses = ITLB_MISSES_ANY
+L1 ITLB miss rate = ITLB_MISSES_ANY / INSTR_RETIRED_ANY
+L1 ITLB miss duration [Cyc] = ITLB_MISSES_WALK_CYCLES / ITLB_MISSES_ANY
+-
+The ITLB miss rate gives a measure of how often a TLB miss occurred
+per instruction. The duration measures how long a walk took in cycles.
+
diff --git a/groups/westmereEX/BRANCH.txt b/groups/westmereEX/BRANCH.txt
index 3d81416..09699d9 100644
--- a/groups/westmereEX/BRANCH.txt
+++ b/groups/westmereEX/BRANCH.txt
@@ -19,13 +19,13 @@ Instructions per branch  FIXC0/PMC0
 
 LONG
 Formulas:
-Branch rate = BR_INST_RETIRED_ALL_BRANCHES / INSTR_RETIRED_ANY
-Branch misprediction rate =  BR_MISP_RETIRED_ALL_BRANCHES / INSTR_RETIRED_ANY
-Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES / BR_INST_RETIRED_ALL_BRANCHES
-Instructions per branch = INSTR_RETIRED_ANY / BR_INST_RETIRED_ALL_BRANCHES
+Branch rate = BR_INST_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction rate =  BR_MISP_RETIRED_ALL_BRANCHES/INSTR_RETIRED_ANY
+Branch misprediction ratio = BR_MISP_RETIRED_ALL_BRANCHES/BR_INST_RETIRED_ALL_BRANCHES
+Instructions per branch = INSTR_RETIRED_ANY/BR_INST_RETIRED_ALL_BRANCHES
 -
 The rates state how often in average a branch or a mispredicted branch occured
 per instruction retired in total. The Branch misprediction ratio sets directly
-into relation what ration of all branch instruction where mispredicted.
+into relation what ratio of all branch instructions were mispredicted.
 Instructions per branch is 1/Branch rate.
 
diff --git a/groups/westmereEX/DATA.txt b/groups/westmereEX/DATA.txt
index a5611bc..08d6d76 100644
--- a/groups/westmereEX/DATA.txt
+++ b/groups/westmereEX/DATA.txt
@@ -12,11 +12,11 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-Load to Store ratio PMC0/PMC1
+Load to store ratio PMC0/PMC1
 
 LONG
 Formulas:
-Load to Store ratio = MEM_INST_RETIRED_LOADS / MEM_INST_RETIRED_STORES
+Load to store ratio = MEM_INST_RETIRED_LOADS/MEM_INST_RETIRED_STORES
 -
 This is a simple metric to determine your Load to store ratio.
 
diff --git a/groups/westmereEX/FLOPS_DP.txt b/groups/westmereEX/FLOPS_DP.txt
index a62cbe3..658b8ff 100644
--- a/groups/westmereEX/FLOPS_DP.txt
+++ b/groups/westmereEX/FLOPS_DP.txt
@@ -22,7 +22,11 @@ DP MUOPS/s 1.0E-06*PMC3/time
 
 LONG
 Formula:
-DP MFlops/s =  (FP_COMP_OPS_EXE_SSE_FP_PACKED*2 +  FP_COMP_OPS_EXE_SSE_FP_SCALAR)/ runtime
+MFlops/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime
+Packed MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime
+SP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime
+DP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime
 -
 The Nehalem has not possibility to measure MFlops if mixed precision calculations are done.
 Therefore both Single as well as Double precision are measured to ensure the correctness
diff --git a/groups/westmereEX/FLOPS_SP.txt b/groups/westmereEX/FLOPS_SP.txt
index 1485615..a954670 100644
--- a/groups/westmereEX/FLOPS_SP.txt
+++ b/groups/westmereEX/FLOPS_SP.txt
@@ -22,7 +22,11 @@ DP MUOPS/s 1.0E-06*PMC3/time
 
 LONG
 Formula:
-SP MFlops/s =  (FP_COMP_OPS_EXE_SSE_FP_PACKED*4 +  FP_COMP_OPS_EXE_SSE_FP_SCALAR)/ runtime
+MFlops/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR)/runtime
+Packed MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime
+SP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime
+DP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime
 -
 The Nehalem has not possibility to measure MFlops if mixed precision calculations are done.
 Therefore both Single as well as Double precision are measured to ensure the correctness
diff --git a/groups/westmereEX/ICACHE.txt b/groups/westmereEX/ICACHE.txt
new file mode 100644
index 0000000..49943ff
--- /dev/null
+++ b/groups/westmereEX/ICACHE.txt
@@ -0,0 +1,25 @@
+SHORT  Instruction cache miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  L1I_READS
+PMC1  L1I_MISSES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1I request rate PMC0/FIXC0
+L1I miss rate PMC1/FIXC0
+L1I miss ratio PMC1/PMC0
+
+LONG
+Formulas:
+L1I request rate = L1I_READS / INSTR_RETIRED_ANY
+L1I miss rate = L1I_MISSES / INSTR_RETIRED_ANY
+L1I miss ratio = L1I_MISSES / L1I_READS
+-
+This group measures some L1 instruction cache metrics.
diff --git a/groups/westmereEX/L2.txt b/groups/westmereEX/L2.txt
index 9201cd0..ffa65bc 100644
--- a/groups/westmereEX/L2.txt
+++ b/groups/westmereEX/L2.txt
@@ -6,27 +6,33 @@ FIXC1 CPU_CLK_UNHALTED_CORE
 FIXC2 CPU_CLK_UNHALTED_REF
 PMC0  L1D_REPL
 PMC1  L1D_M_EVICT
+PMC2  L1I_MISSES
 
 METRICS
 Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-L2 Load [MBytes/s] 1.0E-06*PMC0*64.0/time
-L2 Evict [MBytes/s] 1.0E-06*PMC1*64.0/time
-L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
-L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+L2D load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time
+L2D load data volume [GBytes]  1.0E-09*PMC0*64.0
+L2D evict bandwidth [MBytes/s]  1.0E-06*PMC1*64.0/time
+L2D evict data volume [GBytes]  1.0E-09*PMC1*64.0
+L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1+PMC2)*64.0/time
+L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1+PMC2)*64.0
 
 LONG
 Formulas:
-L2 Load [MBytes/s] = 1.0E-06*L1D_REPL*64/time
-L2 Evict [MBytes/s] = 1.0E-06*L1D_M_EVICT*64/time
-L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPL+L1D_M_EVICT)*64/time
-L2 data volume [GBytes] = 1.0E-09*(L1D_REPL+L1D_M_EVICT)*64
+L2D load bandwidth [MBytes/s] = 1.0E-06*L1D_REPL*64.0/time
+L2D load data volume [GBytes] = 1.0E-09*L1D_REPL*64.0
+L2D evict bandwidth [MBytes/s] = 1.0E-06*L1D_M_EVICT*64.0/time
+L2D evict data volume [GBytes] = 1.0E-09*L1D_M_EVICT*64.0
+L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPL+L1D_M_EVICT+L1I_MISSES)*64/time
+L2 data volume [GBytes] = 1.0E-09*(L1D_REPL+L1D_M_EVICT+L1I_MISSES)*64
 -
 Profiling group to measure L2 cache bandwidth. The bandwidth is computed by the
 number of cacheline allocated in the L1 and the number of modified cachelines
 evicted from the L1. Also reports on total data volume transfered between L2
 and L1 cache. Note that this bandwidth also includes data transfers due to a
-write allocate load on a store miss in L1.
+write allocate load on a store miss in L1 and traffic caused by misses in the 
+instruction cache.
 
diff --git a/groups/westmereEX/L2CACHE.txt b/groups/westmereEX/L2CACHE.txt
index 49778be..edc8d9c 100644
--- a/groups/westmereEX/L2CACHE.txt
+++ b/groups/westmereEX/L2CACHE.txt
@@ -4,7 +4,7 @@ EVENTSET
 FIXC0 INSTR_RETIRED_ANY
 FIXC1 CPU_CLK_UNHALTED_CORE
 FIXC2 CPU_CLK_UNHALTED_REF
-PMC0  L2_DATA_RQSTS_DEMAND_ANY
+PMC0  L2_RQSTS_REFERENCES
 PMC1  L2_RQSTS_MISS
 
 METRICS
@@ -18,9 +18,9 @@ L2 miss ratio PMC1/PMC0
 
 LONG
 Formulas:
-L2 request rate = L2_DATA_RQSTS_DEMAND_ANY / INSTR_RETIRED_ANY
-L2 miss rate  = L2_RQSTS_MISS / INSTR_RETIRED_ANY
-L2 miss ratio = L2_RQSTS_MISS / L2_DATA_RQSTS_DEMAND_ANY
+L2 request rate = L2_RQSTS_REFERENCES/INSTR_RETIRED_ANY
+L2 miss rate = L2_RQSTS_MISS/INSTR_RETIRED_ANY
+L2 miss ratio = L2_RQSTS_MISS/L2_RQSTS_REFERENCES
 -
 This group measures the locality of your data accesses with regard to the
 L2 Cache. L2 request rate tells you how data intensive your code is
@@ -30,6 +30,5 @@ cachelines from memory. And finally L2 miss ratio tells you how many of your
 memory references required a cacheline to be loaded from a higher level.
 While the Data cache miss rate might be given by your algorithm you should
 try to get Data cache miss ratio as low as possible by increasing your cache reuse.
-Note: This group might need to be revised!
 
 
diff --git a/groups/westmereEX/L3.txt b/groups/westmereEX/L3.txt
index f80761a..17ce431 100644
--- a/groups/westmereEX/L3.txt
+++ b/groups/westmereEX/L3.txt
@@ -12,17 +12,21 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-L3 Load [MBytes/s]  1.0E-06*PMC0*64.0/time
-L3 Evict [MBytes/s]  1.0E-06*PMC1*64.0/time
+L3 load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time
+L3 load data volume [GBytes]  1.0E-09*PMC0*64.0
+L3 evict bandwidth [MBytes/s]  1.0E-06*PMC1*64.0/time
+L3 evict data volume [GBytes]  1.0E-09*PMC1*64.0
 L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
 L3 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
 
 LONG
 Formulas:
-L3 Load [MBytes/s]  1.0E-06*L2_LINES_IN_ANY*64/time
-L3 Evict [MBytes/s]  1.0E-06*L2_LINES_OUT_DEMAND_DIRTY*64/time
-L3 bandwidth [MBytes/s] 1.0E-06*(L2_LINES_IN_ANY+L2_LINES_OUT_DEMAND_DIRTY)*64/time
-L3 data volume [GBytes] 1.0E-09*(L2_LINES_IN_ANY+L2_LINES_OUT_DEMAND_DIRTY)*64
+L3 load bandwidth [MBytes/s] = 1.0E-06*L2_LINES_IN_ANY*64.0/time
+L3 load data volume [GBytes] = 1.0E-09*L2_LINES_IN_ANY*64.0
+L3 evict bandwidth [MBytes/s] = 1.0E-06*L2_LINES_OUT_DEMAND_DIRTY*64.0/time
+L3 evict data volume [GBytes] = 1.0E-09*L2_LINES_OUT_DEMAND_DIRTY*64.0
+L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ANY+L2_LINES_OUT_DEMAND_DIRTY)*64/time
+L3 data volume [GBytes] = 1.0E-09*(L2_LINES_IN_ANY+L2_LINES_OUT_DEMAND_DIRTY)*64
 -
 Profiling group to measure L3 cache bandwidth. The bandwidth is
 computed by the number of cacheline allocated in the L2 and the number of
diff --git a/groups/westmereEX/L3CACHE.txt b/groups/westmereEX/L3CACHE.txt
new file mode 100644
index 0000000..ddf63e6
--- /dev/null
+++ b/groups/westmereEX/L3CACHE.txt
@@ -0,0 +1,52 @@
+SHORT L3 cache miss rate/ratio
+
+EVENTSET
+FIXC0  INSTR_RETIRED_ANY
+FIXC1  CPU_CLK_UNHALTED_CORE
+FIXC2  CPU_CLK_UNHALTED_REF
+CBOX0C0 LLC_HITS_ALL
+CBOX0C1 LLC_MISSES_ALL
+CBOX1C0 LLC_HITS_ALL
+CBOX1C1 LLC_MISSES_ALL
+CBOX2C0 LLC_HITS_ALL
+CBOX2C1 LLC_MISSES_ALL
+CBOX3C0 LLC_HITS_ALL
+CBOX3C1 LLC_MISSES_ALL
+CBOX4C0 LLC_HITS_ALL
+CBOX4C1 LLC_MISSES_ALL
+CBOX5C0 LLC_HITS_ALL
+CBOX5C1 LLC_MISSES_ALL
+CBOX6C0 LLC_HITS_ALL
+CBOX6C1 LLC_MISSES_ALL
+CBOX7C0 LLC_HITS_ALL
+CBOX7C1 LLC_MISSES_ALL
+CBOX8C0 LLC_HITS_ALL
+CBOX8C1 LLC_MISSES_ALL
+CBOX9C0 LLC_HITS_ALL
+CBOX9C1 LLC_MISSES_ALL
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L3 request rate   (CBOX0C0+CBOX0C1+CBOX1C0+CBOX1C1+CBOX2C0+CBOX2C1+CBOX3C0+CBOX3C1+CBOX4C0+CBOX4C1+CBOX5C0+CBOX5C1+CBOX6C0+CBOX6C1+CBOX7C0+CBOX7C1+CBOX8C0+CBOX8C1+CBOX9C0+CBOX9C1)/FIXC0
+L3 miss rate   (CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1)/FIXC0
+L3 miss ratio  (CBOX0C1+CBOX1C1+CBOX2C1+CBOX3C1+CBOX4C1+CBOX5C1+CBOX6C1+CBOX7C1+CBOX8C1+CBOX9C1)/(CBOX0C0+CBOX0C1+CBOX1C0+CBOX1C1+CBOX2C0+CBOX2C1+CBOX3C0+CBOX3C1+CBOX4C0+CBOX4C1+CBOX5C0+CBOX5C1+CBOX6C0+CBOX6C1+CBOX7C0+CBOX7C1+CBOX8C0+CBOX8C1+CBOX9C0+CBOX9C1)
+
+LONG
+Formulas:
+L3 request rate = (SUM(LLC_HITS_ALL)+SUM(LLC_MISSES_ALL))/INSTR_RETIRED_ANY
+L3 miss rate = SUM(LLC_MISSES_ALL)/INSTR_RETIRED_ANY
+L3 miss ratio = SUM(LLC_MISSES_ALL)/(SUM(LLC_HITS_ALL)+SUM(LLC_MISSES_ALL))
+-
+This group measures the locality of your data accesses with regard to the
+L3 cache. The L3 request rate tells you how data intensive your code is,
+i.e. how many data accesses you have on average per instruction.
+The L3 miss rate gives a measure of how often it was necessary to get
+cachelines from memory. Finally, the L3 miss ratio tells you how many of your
+memory references required a cacheline to be loaded from a higher level.
+While the data cache miss rate might be given by your algorithm, you should
+try to get the data cache miss ratio as low as possible by increasing your cache reuse.
+
+
diff --git a/groups/westmereEX/MEM.txt b/groups/westmereEX/MEM.txt
index defa391..5d4fc62 100644
--- a/groups/westmereEX/MEM.txt
+++ b/groups/westmereEX/MEM.txt
@@ -1,19 +1,15 @@
-SHORT Main memory bandwidth
+SHORT Main memory bandwidth in MBytes/s
 
 EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
-MBOX0C0 FVC_EV0_BBOX_CMDS_READS 
-MBOX0C1 FVC_EV0_BBOX_RSP_ACK 
-MBOX1C0 FVC_EV0_BBOX_CMDS_READS 
-MBOX1C1 FVC_EV0_BBOX_RSP_ACK 
-BBOX0C1 IMT_INSERTS_WR 
-BBOX1C1 IMT_INSERTS_WR 
-RBOX0C0 NEW_PACKETS_RECV_PORT0_IPERF0_ANY_DRS
-RBOX0C1 NEW_PACKETS_RECV_PORT1_IPERF0_ANY_DRS
-RBOX1C0 NEW_PACKETS_RECV_PORT4_IPERF0_ANY_DRS
-RBOX1C1 NEW_PACKETS_RECV_PORT5_IPERF0_ANY_DRS
+FIXC0   INSTR_RETIRED_ANY
+FIXC1   CPU_CLK_UNHALTED_CORE
+FIXC2   CPU_CLK_UNHALTED_REF
+MBOX0C0 FVC_EV0_BBOX_CMDS_READS
+MBOX0C1 DRAM_CMD_CAS_WR_OPN
+MBOX0C2 DRAM_MISC_CAS_WR_CLS
+MBOX1C0 FVC_EV0_BBOX_CMDS_READS
+MBOX1C1 DRAM_CMD_CAS_WR_OPN
+MBOX1C2 DRAM_MISC_CAS_WR_CLS
 
 
 METRICS
@@ -21,17 +17,22 @@ Runtime (RDTSC) [s] time
 Runtime unhalted [s] FIXC1*inverseClock
 Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
 CPI  FIXC1/FIXC0
-Memory Read BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0)*64/time
-Memory Write BW [MBytes/s] 1.0E-06*(BBOX0C1+BBOX1C1)*64/time
-Memory BW [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+BBOX0C1+BBOX1C1)*64/time
-Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+BBOX0C1+BBOX1C1)*64
-Remote write data traffic Port 0 [MBytes/s] 1.0E-06*(RBOX0C0)*64/time
-Remote write data traffic Port 1 [MBytes/s] 1.0E-06*(RBOX0C1)*64/time
-Remote write data traffic Port 4 [MBytes/s] 1.0E-06*(RBOX1C0)*64/time
-Remote write data traffic Port 5 [MBytes/s] 1.0E-06*(RBOX1C1)*64/time
+Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0)*64.0/time
+Memory read data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0)*64.0
+Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX0C2+MBOX1C1+MBOX1C2)*64.0/time
+Memory write data volume [GBytes] 1.0E-09*(MBOX0C1+MBOX0C2+MBOX1C1+MBOX1C2)*64.0
+Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX0C1+MBOX1C1+MBOX0C2+MBOX1C2)*64/time
+Memory data volume [GBytes] 1.0E-09*(MBOX0C0+MBOX1C0+MBOX0C1+MBOX1C1+MBOX0C2+MBOX1C2)*64
 
 LONG
+Formulas:
+Memory read bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0))*64.0/time
+Memory read data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0))*64.0
+Memory write bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC1)+SUM(MBOXxC2))*64.0/time
+Memory write data volume [GBytes] = 1.0E-09*(SUM(MBOXxC1)+SUM(MBOXxC2))*64.0
+Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(MBOXxC0)+SUM(MBOXxC1)+SUM(MBOXxC2))*64.0/time
+Memory data volume [GBytes] = 1.0E-09*(SUM(MBOXxC0)+SUM(MBOXxC1)+SUM(MBOXxC2))*64.0
+-
 Profiling group to measure memory bandwidth drawn by all cores of a socket.
-Addional to the bandwidth it also outputs the data volume and the remote
-traffic over QPI links to other sockets.
+In addition to the bandwidth, it also outputs the data volume.
 
diff --git a/groups/westmereEX/NUMA.txt b/groups/westmereEX/NUMA.txt
new file mode 100644
index 0000000..0c1b8fb
--- /dev/null
+++ b/groups/westmereEX/NUMA.txt
@@ -0,0 +1,33 @@
+SHORT Local and remote memory accesses
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0 OFFCORE_RESPONSE_0_LOCAL_DRAM
+PMC1 OFFCORE_RESPONSE_1_REMOTE_DRAM
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+Local DRAM data volume [GByte]  1.E-09*PMC0*64
+Local DRAM bandwidth [MByte/s]  1.E-06*(PMC0*64)/time
+Remote DRAM data volume [GByte]  1.E-09*PMC1*64
+Remote DRAM bandwidth [MByte/s]  1.E-06*(PMC1*64)/time
+Memory data volume [GByte]  1.E-09*(PMC0+PMC1)*64
+Memory bandwidth [MByte/s]  1.E-06*((PMC0+PMC1)*64)/time
+
+LONG
+Formula:
+CPI = CPU_CLK_UNHALTED_CORE/INSTR_RETIRED_ANY
+Local DRAM data volume [GByte] = 1.E-09*OFFCORE_RESPONSE_0_LOCAL_DRAM*64
+Local DRAM bandwidth [MByte/s] = 1.E-06*(OFFCORE_RESPONSE_0_LOCAL_DRAM*64)/time
+Remote DRAM data volume [GByte] = 1.E-09*OFFCORE_RESPONSE_1_REMOTE_DRAM*64
+Remote DRAM bandwidth [MByte/s] = 1.E-06*(OFFCORE_RESPONSE_1_REMOTE_DRAM*64)/time
+Memory data volume [GByte] = 1.E-09*(OFFCORE_RESPONSE_0_LOCAL_DRAM+OFFCORE_RESPONSE_1_REMOTE_DRAM)*64
+Memory bandwidth [MByte/s] = 1.E-06*((OFFCORE_RESPONSE_0_LOCAL_DRAM+OFFCORE_RESPONSE_1_REMOTE_DRAM)*64)/time
+--
+This performance group measures the data traffic of CPU cores to local and remote
+memory.
diff --git a/groups/westmereEX/TLB.txt b/groups/westmereEX/TLB.txt
deleted file mode 100644
index 0077350..0000000
--- a/groups/westmereEX/TLB.txt
+++ /dev/null
@@ -1,22 +0,0 @@
-SHORT  TLB miss rate/ratio
-
-EVENTSET
-FIXC0 INSTR_RETIRED_ANY
-FIXC1 CPU_CLK_UNHALTED_CORE
-FIXC2 CPU_CLK_UNHALTED_REF
-PMC0  DTLB_MISSES_ANY
-
-METRICS
-Runtime (RDTSC) [s] time
-Runtime unhalted [s] FIXC1*inverseClock
-Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
-CPI  FIXC1/FIXC0
-L1 DTLB miss rate  PMC0/FIXC0
-
-LONG
-Formulas:
-DTLB miss rate  = DTLB_MISSES_ANY / INSTR_RETIRED_ANY
--
-The DTLB miss  rate gives a measure how often a TLB miss occured
-per instruction. 
-
diff --git a/groups/westmereEX/TLB_DATA.txt b/groups/westmereEX/TLB_DATA.txt
new file mode 100644
index 0000000..8168806
--- /dev/null
+++ b/groups/westmereEX/TLB_DATA.txt
@@ -0,0 +1,35 @@
+SHORT  L1 Data TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  DTLB_LOAD_MISSES_ANY
+PMC1  DTLB_MISSES_ANY
+PMC2  DTLB_LOAD_MISSES_WALK_CYCLES
+PMC3  DTLB_MISSES_WALK_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1 DTLB load misses     PMC0
+L1 DTLB load miss rate  PMC0/FIXC0
+L1 DTLB load miss duration [Cyc] PMC2/PMC0
+L1 DTLB store misses     (PMC1-PMC0)
+L1 DTLB store miss rate  (PMC1-PMC0)/FIXC0
+L1 DTLB store miss duration [Cyc] (PMC3-PMC2)/(PMC1-PMC0)
+
+LONG
+Formulas:
+L1 DTLB load misses = DTLB_LOAD_MISSES_ANY
+L1 DTLB load miss rate = DTLB_LOAD_MISSES_ANY / INSTR_RETIRED_ANY
+L1 DTLB load miss duration [Cyc] = DTLB_LOAD_MISSES_WALK_CYCLES / DTLB_LOAD_MISSES_ANY
+L1 DTLB store misses = DTLB_MISSES_ANY-DTLB_LOAD_MISSES_ANY
+L1 DTLB store miss rate = (DTLB_MISSES_ANY-DTLB_LOAD_MISSES_ANY) / INSTR_RETIRED_ANY
+L1 DTLB store miss duration [Cyc] = (DTLB_MISSES_WALK_CYCLES-DTLB_LOAD_MISSES_WALK_CYCLES) / (DTLB_MISSES_ANY-DTLB_LOAD_MISSES_ANY)
+-
+The DTLB miss rate gives a measure of how often a TLB miss occurred
+per instruction. The store miss values are derived by subtracting the LOAD events from the ANY events.
+
diff --git a/groups/westmereEX/TLB_INSTR.txt b/groups/westmereEX/TLB_INSTR.txt
new file mode 100644
index 0000000..854f382
--- /dev/null
+++ b/groups/westmereEX/TLB_INSTR.txt
@@ -0,0 +1,27 @@
+SHORT  L1 Instruction TLB miss rate/ratio
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  ITLB_MISSES_ANY
+PMC1  ITLB_MISSES_WALK_CYCLES
+
+METRICS
+Runtime (RDTSC) [s] time
+Runtime unhalted [s] FIXC1*inverseClock
+Clock [MHz]  1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI  FIXC1/FIXC0
+L1 ITLB misses     PMC0
+L1 ITLB miss rate  PMC0/FIXC0
+L1 ITLB miss duration [Cyc] PMC1/PMC0
+
+LONG
+Formulas:
+L1 ITLB misses = ITLB_MISSES_ANY
+L1 ITLB miss rate = ITLB_MISSES_ANY / INSTR_RETIRED_ANY
+L1 ITLB miss duration [Cyc] = ITLB_MISSES_WALK_CYCLES / ITLB_MISSES_ANY
+-
+The ITLB miss rate gives a measure of how often a TLB miss occurred
+per instruction. The duration measures how long a walk took in cycles.
+
diff --git a/kernel/Makefile b/kernel/Makefile
index fd0ffdf..b9b814a 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -9,4 +9,5 @@ modules_install:
 	install -m 666 enable_rdpmc.ko /lib/modules/$(shell uname -r)/extra/
 
 clean:
-	rm -f *.ko *.o modules.order Module.symvers enable_rdpmc.mod.c
+	rm -f *.ko *.o modules.order Module.symvers enable_rdpmc.mod.c .enable_rdpmc*.cmd
+	rm -rf .tmp_versions
diff --git a/make/config_checks.mk b/make/config_checks.mk
new file mode 100644
index 0000000..523869f
--- /dev/null
+++ b/make/config_checks.mk
@@ -0,0 +1,38 @@
+
+ifneq ($(MAKECMDGOALS),docs)
+# determine kernel Version
+KERNEL_VERSION_MAJOR := $(shell uname -r | awk '{split($$1,a,"."); print a[1]}' | cut -d '-' -f1)
+KERNEL_VERSION := $(shell uname -r | awk  '{split($$1,a,"."); print a[2]}' | cut -d '-' -f1)
+KERNEL_VERSION_MINOR := $(shell uname -r | awk '{split($$1,a,"."); print a[3]}' | cut -d '-' -f1)
+
+HAS_MEMPOLICY = $(shell if [ $(KERNEL_VERSION) -lt 7 -a $(KERNEL_VERSION_MAJOR) -lt 3 -a $(KERNEL_VERSION_MINOR) -lt 8 ]; then \
+               echo 0;  else echo 1; \
+			   fi; )
+
+# determine glibc Version
+GLIBC_VERSION := $(shell ldd --version | grep ldd |  awk '{ print $$NF }' | awk -F. '{ print $$2 }')
+
+HAS_SCHEDAFFINITY = $(shell if [ $(GLIBC_VERSION) -lt 4 ]; then \
+               echo 0;  else echo 1; \
+			   fi; )
+
+FORTRAN_IF_NAME := likwid.mod
+ifneq ($(FORTRAN_INTERFACE),false)
+HAS_FORTRAN_COMPILER = $(shell $(FC) --version 2>/dev/null || echo 'NOFORTRAN' )
+ifeq ($(HAS_FORTRAN_COMPILER),NOFORTRAN)
+FORTRAN_IF=
+$(info Warning: You have selected the fortran interface in config.mk, but there seems to be no fortran compiler $(FC) - not compiling it!)
+FORTRAN_INSTALL =
+else
+FORTRAN_IF := $(FORTRAN_IF_NAME)
+FORTRAN_INSTALL = @echo "===> INSTALL fortran interface to $(PREFIX)/include/"; \
+                  cp -f likwid.mod  $(PREFIX)/include/
+FORTRAN_REMOVE = @echo "===> REMOVING fortran interface from $(PREFIX)/include/"; \
+                 rm -f $(PREFIX)/include/likwid.mod
+endif
+else
+FORTRAN_IF =
+FORTRAN_INSTALL =
+FORTRAN_REMOVE =
+endif
+endif
diff --git a/make/config_defines.mk b/make/config_defines.mk
new file mode 100644
index 0000000..b1356b0
--- /dev/null
+++ b/make/config_defines.mk
@@ -0,0 +1,99 @@
+DEFINES   += -DVERSION=$(VERSION)         \
+		 -DRELEASE=$(RELEASE)                 \
+		 -DCFGFILE=$(CFG_FILE_PATH)           \
+		 -DINSTALL_PREFIX=$(PREFIX)           \
+		 -DMAX_NUM_THREADS=$(MAX_NUM_THREADS) \
+		 -DMAX_NUM_NODES=$(MAX_NUM_NODES)     \
+		 -DLIBLIKWIDPIN=$(LIBLIKWIDPIN)       \
+		 -DLIKWIDFILTERPATH=$(LIKWIDFILTERPATH) \
+		 -DLIKWIDPATH=$(PREFIX) \
+		 -D_GNU_SOURCE
+
+ifneq ($(COLOR),NONE)
+DEFINES += -DCOLOR=$(COLOR)
+endif
+
+ifeq ($(BUILDDAEMON),true)
+ifneq ($(COMPILER),MIC)
+    DAEMON_TARGET = likwid-accessD
+else
+    $(info Info: Compiling for Xeon Phi. Disabling build of likwid-accessD.);
+    DAEMON_TARGET =
+endif
+endif
+
+ifeq ($(BUILDFREQ),true)
+ifneq ($(COMPILER),MIC)
+    FREQ_TARGET = likwid-setFreq
+else
+    $(info Info: Compiling for Xeon Phi. Disabling build of likwid-setFreq.);
+endif
+endif
+
+ifeq ($(INSTRUMENT_BENCH),true)
+DEFINES += -DPERFMON
+endif
+
+ifeq ($(HAS_MEMPOLICY),1)
+DEFINES += -DHAS_MEMPOLICY
+else
+$(info Kernel 2.6.$(KERNEL_VERSION) has no mempolicy support!);
+endif
+
+
+ifeq ($(SHARED_LIBRARY),true)
+CFLAGS += $(SHARED_CFLAGS)
+LIBS += -L. -pthread -lm
+TARGET_LIB := $(DYNAMIC_TARGET_LIB)
+else
+TARGET_LIB := $(STATIC_TARGET_LIB)
+endif
+
+ifeq ($(HAS_SCHEDAFFINITY),1)
+DEFINES += -DHAS_SCHEDAFFINITY
+PINLIB  = liblikwidpin.so
+else
+$(info GLIBC version 2.$(GLIBC_VERSION) has no pthread_setaffinity_np support!);
+PINLIB  =
+endif
+
+FILTER_HWLOC_OBJ = yes
+LIBHWLOC =
+ifeq ($(USE_HWLOC),true)
+DEFINES += -DLIKWID_USE_HWLOC
+LIBHWLOC = ext/hwloc/libhwloc.a
+LIBS += -Lext/hwloc
+EXT_TARGETS += ./ext/hwloc
+FILTER_HWLOC_OBJ =
+endif
+
+DEFINES += -DACCESSDAEMON=$(ACCESSDAEMON)
+
+ifeq ($(ACCESSMODE),sysdaemon)
+ifneq ($(COMPILER),MIC)
+DEFINES += -DACCESSMODE=2
+else
+$(info Info: Compiling for Xeon Phi. Set accessmode to direct.);
+ACCESSMODE = direct
+DEFINES += -DACCESSMODE=0
+endif
+else
+ifeq ($(ACCESSMODE),accessdaemon)
+ifneq ($(COMPILER),MIC)
+DEFINES += -DACCESSMODE=1
+else
+$(info Info: Compiling for Xeon Phi. Set accessmode to direct.);
+DEFINES += -DACCESSMODE=0
+ACCESSMODE = direct
+endif
+else
+DEFINES += -DACCESSMODE=0
+endif
+endif
+
+ifeq ($(DEBUG),true)
+DEBUG_FLAGS = -g
+DEFINES += -DDEBUG_LIKWID
+else
+DEBUG_FLAGS =
+endif
diff --git a/make/include_CLANG.mk b/make/include_CLANG.mk
new file mode 100644
index 0000000..8dbbd02
--- /dev/null
+++ b/make/include_CLANG.mk
@@ -0,0 +1,28 @@
+CC  = clang
+FC  = ifort
+AS  = as
+AR  = ar
+PAS = ./perl/AsmGen.pl
+GEN_PAS = ./perl/generatePas.pl 
+GEN_GROUPS = ./perl/generateGroups.pl
+GEN_PMHEADER = ./perl/gen_events.pl
+
+ANSI_CFLAGS   =
+
+CFLAGS   =  -O2 -std=c99 -Wno-format -fPIC
+FCFLAGS  = -module ./  # ifort
+#FCFLAGS  = -J ./  -fsyntax-only  #gfortran
+PASFLAGS  = x86-64
+ASFLAGS  = 
+CPPFLAGS =
+LFLAGS   =  -pthread
+
+SHARED_CFLAGS = -fPIC
+SHARED_LFLAGS = -shared
+
+DEFINES  = -DPAGE_ALIGNMENT=4096
+DEFINES  += -DLIKWID_MONITOR_LOCK
+DEFINES  += -DDEBUGLEV=0
+
+INCLUDES =
+LIBS     = -lm
diff --git a/make/include_GCC.mk b/make/include_GCC.mk
index 1ccfd88..3682ffb 100644
--- a/make/include_GCC.mk
+++ b/make/include_GCC.mk
@@ -2,29 +2,28 @@ CC  = gcc
 FC  = ifort
 AS  = as
 AR  = ar
-PAS = ./perl/AsmGen.pl
-GEN_PAS = ./perl/generatePas.pl
-GEN_GROUPS = ./perl/generateGroups.pl
-GEN_PMHEADER = ./perl/gen_events.pl
+PAS = ./perl/AsmGen.pl 
+GEN_PAS = ./perl/generatePas.pl 
+GEN_GROUPS = ./perl/generateGroups.pl 
+GEN_PMHEADER = ./perl/gen_events.pl 
 
-#ANSI_CFLAGS   = -std=c99
+ANSI_CFLAGS   =
 #ANSI_CFLAGS += -pedantic
 #ANSI_CFLAGS += -Wextra
 #ANSI_CFLAGS += -Wall
 
-CFLAGS   =  -O2  -Wno-format -Wno-nonnull -std=c99
+CFLAGS   =  -O2 -std=c99 -Wno-format -fPIC
 FCFLAGS  = -module ./  # ifort
 #FCFLAGS  = -J ./  -fsyntax-only  #gfortran
 PASFLAGS  = x86-64
-ASFLAGS  =
+ASFLAGS  = 
 CPPFLAGS =
-LFLAGS   =  -pthread
+LFLAGS   =  -pthread 
 
-SHARED_CFLAGS = -fpic
+SHARED_CFLAGS = -fPIC
 SHARED_LFLAGS = -shared
 
-DEFINES  = -D_GNU_SOURCE
-DEFINES  += -DPAGE_ALIGNMENT=4096
+DEFINES  = -DPAGE_ALIGNMENT=4096
 DEFINES  += -DLIKWID_MONITOR_LOCK
 DEFINES  += -DDEBUGLEV=0
 
diff --git a/make/include_GCCX86.mk b/make/include_GCCX86.mk
index 19add95..2d44301 100644
--- a/make/include_GCCX86.mk
+++ b/make/include_GCCX86.mk
@@ -1,22 +1,22 @@
 CC  = gcc
 AS  = as
 AR  = ar
-PAS = ./perl/AsmGen.pl
-GEN_PAS = ./perl/generatePas.pl
-GEN_GROUPS = ./perl/generateGroups.pl
-GEN_PMHEADER = ./perl/gen_events.pl
+PAS = ./perl/AsmGen.pl 
+GEN_PAS = ./perl/generatePas.pl 
+GEN_GROUPS = ./perl/generateGroups.pl 
+GEN_PMHEADER = ./perl/gen_events.pl 
 
-#ANSI_CFLAGS   = -std=c99
+ANSI_CFLAGS   = -std=c99
 #ANSI_CFLAGS += -pedantic
 #ANSI_CFLAGS += -Wextra
 #ANSI_CFLAGS += -Wall
 
-CFLAGS   =  -O2 -m32 -Wno-format -std=c99
+CFLAGS   =  -O2 -g -m32 -Wno-format
 FCFLAGS  = -J ./  -fsyntax-only
 PASFLAGS  = x86
-ASFLAGS  = --32
+ASFLAGS  = --32 -g
 CPPFLAGS =
-LFLAGS   = -m32 -pthread
+LFLAGS   = -m32 -g -pthread 
 
 SHARED_CFLAGS = -fpic
 SHARED_LFLAGS = -shared
diff --git a/make/include_ICC.mk b/make/include_ICC.mk
index ce49bfe..6efd38a 100644
--- a/make/include_ICC.mk
+++ b/make/include_ICC.mk
@@ -7,19 +7,20 @@ GEN_PAS = ./perl/generatePas.pl
 GEN_GROUPS = ./perl/generateGroups.pl 
 GEN_PMHEADER = ./perl/gen_events.pl 
 
-ANSI_CFLAGS += -std=c99
+ANSI_CFLAGS  = -std=c99 #-strict-ansi
 
-CFLAGS   =  -O1 -Wno-format
-FCFLAGS  = -module ./
+CFLAGS   =  -O1 -Wno-format -vec-report=0 -fPIC -pthread
+FCFLAGS  = -module ./ 
 ASFLAGS  = -gdwarf-2
 PASFLAGS  = x86-64
 CPPFLAGS =
 LFLAGS   = -pthread
 
-SHARED_CFLAGS = -fpic
-SHARED_LFLAGS = -shared
+SHARED_CFLAGS = -fPIC -pthread
+SHARED_LFLAGS = -shared -pthread
 
 DEFINES  = -D_GNU_SOURCE
+DEFINES  += -DMAX_NUM_THREADS=128
 DEFINES  += -DPAGE_ALIGNMENT=4096
 #enable this option to build likwid-bench with marker API for likwid-perfctr
 #DEFINES  += -DPERFMON
diff --git a/make/include_MIC.mk b/make/include_MIC.mk
index aa3c39a..fed4d86 100644
--- a/make/include_MIC.mk
+++ b/make/include_MIC.mk
@@ -7,12 +7,12 @@ GEN_PAS = ./perl/generatePas.pl
 GEN_GROUPS = ./perl/generateGroups.pl 
 GEN_PMHEADER = ./perl/gen_events.pl 
 
-#ANSI_CFLAGS   = -std=c99
-#ANSI_CFLAGS += -pedantic
+ANSI_CFLAGS   = -std=c99 -fPIC
+ANSI_CFLAGS += -pedantic
 #ANSI_CFLAGS += -Wextra
 #ANSI_CFLAGS += -Wall
 
-CFLAGS   = -mmic -O2 -Wno-format -std=c99
+CFLAGS   = -mmic -O1 -g -Wno-format -fPIC
 FCFLAGS  = -J ./  -fsyntax-only
 #FCFLAGS  = -module ./ 
 ASFLAGS  =  -mmic -c
diff --git a/monitoring/README.agent b/monitoring/README.agent
new file mode 100644
index 0000000..756d015
--- /dev/null
+++ b/monitoring/README.agent
@@ -0,0 +1,66 @@
+The likwid-agent application is a daemon that reads hardware performance
+counters in a periodic fashion. Which counters can be measured is determined by
+the system's CPU architecture. Each architecture has its own set of events and
+corresponding counter registers. For the measurement the likwid library is used
+and interfaced through the Lua interface. The measured values can be exported in
+multiple ways like RRD, syslog or gmetric from the Ganglia Monitoring System.
+
+
+The configuration file needs to be given at startup and has the following
+format:
+GROUPPATH <PATH_TO_GROUPS> # default is set during installation
+EVENTSET <SPACE_SEPARATED_LIST_OF_GROUPS>
+DURATION <TIME_IN_SECONDS_TO_MEASURE_EACH_GROUP>
+ACCESSMODE <0/1> # 0 is direct access, 1 forward access to the accessDaemon
+LOGPATH <PATH_TO_STORE_LOGFILES> # each monitoring group creates a logfile there named likwid.<GROUP>.log
+LOGSTYLE <log/update> # log appends new lines, update clears file previously
+GMETRIC <True/False> # send measured values to Ganglia
+GMETRICPATH <PATH_TO_THE_GMETRIC_EXECUTABLE>
+GMETRICCONFIG <EXTRA_CONFIG_OPTIONS_TO_GMETRIC>
+RRD <True/False> # write measured values to RRD files, one RRD per group
+RRDPATH <PATH_TO_STORE_RRD_FILES>
+SYSLOG <True/False> # write measured values to syslog
+SYSLOGPRIO <prio> # Use priority level <prio> for syslog, default is local0.notice
+
+
+
+The group files cannot lie directly in GROUPPATH; you need to create a folder
+named after the short name of the architecture, like sandybridge or ivybridge.
+This allows the same group path to be shared across a set of systems with
+different CPU architectures. The format of a group file is the following:
+SHORT <SHORT_NAME_OF_THE_GROUP>
+
+EVENTSET // Starts event/counter definitions
+FIXC0 INSTR_RETIRED_ANY // Measure event INSTR_RETIRED_ANY in counter FIXC0
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PMC0  L1D_REPLACEMENT
+PMC1  L2_TRANS_L1D_WB
+PMC2  L2_LINES_IN_ALL
+PMC3  L2_LINES_OUT_DEMAND_DIRTY
+
+METRICS // Starts section of derived metrics and output items
+ONCE Runtime (RDTSC) [s] time # Output runtime only once
+MIN CPI FIXC1/FIXC0 # Output the minimum of the formula FIXC1/FIXC0 named CPI
+AVG CPI FIXC1/FIXC0 # Output the average of the same formula
+MAX L2 load bandwidth [MBytes/s]  1.0E-06*PMC0*64.0/time # Calculate bandwidth and output only the maximum
+MIN L2 load data volume [GBytes]  1.0E-09*PMC0*64.0
+SUM L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time # Sum up all the values of all CPUs
+SUM L2 data volume [GBytes] 1.0E-09*(PMC0+PMC1)*64.0
+SUM L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
+SUM L3 data volume [GBytes] 1.0E-09*(PMC2+PMC3)*64.0
+
+LONG
+<LONG DESCRIPTION OF THE GROUP>
+
+Possible functions are:
+ONCE: Output only once (CPU core 0), no aggregation is done
+MIN: Output the minimum of all cores
+MAX: Output the maximum of all cores
+AVG: Output the average of all cores
+SUM: Output the sum of all cores' values
+If no function is set, the values of all HW threads are written to output and
+T<ID> is written in front of the name.
+
+The output metric names can be equal; the function is glued to the output name for later separation.
+
diff --git a/monitoring/groups/atom/BW_MEM.txt b/monitoring/groups/atom/BW_MEM.txt
new file mode 100644
index 0000000..8eb701f
--- /dev/null
+++ b/monitoring/groups/atom/BW_MEM.txt
@@ -0,0 +1,10 @@
+SHORT Memory bandwidth
+
+EVENTSET
+PMC0  BUS_TRANS_MEM_THIS_CORE_THIS_A
+
+METRICS
+SUM Memory bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+
+
+LONG
diff --git a/monitoring/groups/atom/FLOPS_DP.txt b/monitoring/groups/atom/FLOPS_DP.txt
new file mode 100644
index 0000000..14961f0
--- /dev/null
+++ b/monitoring/groups/atom/FLOPS_DP.txt
@@ -0,0 +1,13 @@
+SHORT Double Precision MFlops/s
+
+EVENTSET
+PMC0  SIMD_COMP_INST_RETIRED_PACKED_DOUBLE
+PMC1  SIMD_COMP_INST_RETIRED_SCALAR_DOUBLE
+
+METRICS
+DP MFlops/s    1.0E-06*(PMC0*2.0+PMC1)/time
+
+
+LONG
+Double Precision MFlops/s Double Precision MFlops/s
+
diff --git a/monitoring/groups/atom/FLOPS_SP.txt b/monitoring/groups/atom/FLOPS_SP.txt
new file mode 100644
index 0000000..d67704f
--- /dev/null
+++ b/monitoring/groups/atom/FLOPS_SP.txt
@@ -0,0 +1,12 @@
+SHORT Single Precision MFlops/s
+
+EVENTSET
+PMC0  SIMD_COMP_INST_RETIRED_PACKED_SINGLE
+PMC1  SIMD_COMP_INST_RETIRED_SCALAR_SINGLE
+
+METRICS
+SP MFlops/s (SP assumed) 1.0E-06*(PMC0*4.0+PMC1)/time
+
+LONG
+Single Precision MFlops/s Double Precision MFlops/s
+
diff --git a/monitoring/groups/broadwell/BW.txt b/monitoring/groups/broadwell/BW.txt
new file mode 100644
index 0000000..3a2eb90
--- /dev/null
+++ b/monitoring/groups/broadwell/BW.txt
@@ -0,0 +1,13 @@
+SHORT Cache and memory bandwidths
+
+EVENTSET
+PMC0  L1D_REPLACEMENT
+PMC1  L2_TRANS_L1D_WB
+PMC2  L2_LINES_IN_ALL
+PMC3  L2_LINES_OUT_DEMAND_DIRTY
+
+METRICS
+SUM L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+SUM L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
+
+LONG
diff --git a/monitoring/groups/broadwell/ENERGY.txt b/monitoring/groups/broadwell/ENERGY.txt
new file mode 100644
index 0000000..7256f1e
--- /dev/null
+++ b/monitoring/groups/broadwell/ENERGY.txt
@@ -0,0 +1,18 @@
+SHORT Energy, CPI and Clock
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0 PWR_PKG_ENERGY
+PWR3 PWR_DRAM_ENERGY
+
+METRICS
+ONCE Runtime (RDTSC) [s] time
+SUM Retired instructions FIXC0
+SUM Total Power [W] PWR0/time
+SUM Total Power DRAM [W] PWR3/time
+CPU Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC0/FIXC1
+
+LONG
diff --git a/monitoring/groups/broadwell/FLOPS_DP.txt b/monitoring/groups/broadwell/FLOPS_DP.txt
new file mode 100644
index 0000000..53b2463
--- /dev/null
+++ b/monitoring/groups/broadwell/FLOPS_DP.txt
@@ -0,0 +1,22 @@
+SHORT Double Precision MFlops/s
+
+EVENTSET
+PMC0  FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE
+PMC1  FP_ARITH_INST_RETIRED_SCALAR_DOUBLE
+PMC2  FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE
+
+METRICS
+MFlops/s  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
+AVX MFlops/s  1.0E-06*(PMC2*4.0)/time
+Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+
+LONG
+Formula:
+MFlops/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE*2+FP_ARITH_INST_RETIRED_SCALAR_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
+AVX MFlops/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE*4)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE+FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_DOUBLE/runtime
+-
+AVX/SSE scalar and packed double precision flop rates.
+
diff --git a/monitoring/groups/broadwell/FLOPS_SP.txt b/monitoring/groups/broadwell/FLOPS_SP.txt
new file mode 100644
index 0000000..b04f87a
--- /dev/null
+++ b/monitoring/groups/broadwell/FLOPS_SP.txt
@@ -0,0 +1,22 @@
+SHORT Single Precision MFlops/s
+
+EVENTSET
+PMC0  FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE
+PMC1  FP_ARITH_INST_RETIRED_SCALAR_SINGLE
+PMC2  FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE
+
+METRICS
+MFlops/s  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
+AVX MFlops/s  1.0E-06*(PMC2*8.0)/time
+Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+
+LONG
+Formula:
+MFlops/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE*4+FP_ARITH_INST_RETIRED_SCALAR_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
+AVX MFlops/s = 1.0E-06*(FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE*8)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE+FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_ARITH_INST_RETIRED_SCALAR_SINGLE/runtime
+-
+AVX/SSE scalar and packed single precision flop rates.
+
diff --git a/monitoring/groups/broadwellEP/BW.txt b/monitoring/groups/broadwellEP/BW.txt
new file mode 100644
index 0000000..3a2eb90
--- /dev/null
+++ b/monitoring/groups/broadwellEP/BW.txt
@@ -0,0 +1,13 @@
+SHORT Cache and memory bandwidths
+
+EVENTSET
+PMC0  L1D_REPLACEMENT
+PMC1  L2_TRANS_L1D_WB
+PMC2  L2_LINES_IN_ALL
+PMC3  L2_LINES_OUT_DEMAND_DIRTY
+
+METRICS
+SUM L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+SUM L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
+
+LONG
diff --git a/monitoring/groups/broadwellEP/ENERGY.txt b/monitoring/groups/broadwellEP/ENERGY.txt
new file mode 100644
index 0000000..7256f1e
--- /dev/null
+++ b/monitoring/groups/broadwellEP/ENERGY.txt
@@ -0,0 +1,18 @@
+SHORT Energy, CPI and Clock
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0 PWR_PKG_ENERGY
+PWR3 PWR_DRAM_ENERGY
+
+METRICS
+ONCE Runtime (RDTSC) [s] time
+SUM Retired instructions FIXC0
+SUM Total Power [W] PWR0/time
+SUM Total Power DRAM [W] PWR3/time
+CPU Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC0/FIXC1
+
+LONG
diff --git a/monitoring/groups/core2/BW_L2.txt b/monitoring/groups/core2/BW_L2.txt
new file mode 100644
index 0000000..6d73bf8
--- /dev/null
+++ b/monitoring/groups/core2/BW_L2.txt
@@ -0,0 +1,11 @@
+SHORT Cache bandwidth
+
+EVENTSET
+PMC0  L1D_REPL
+PMC1  L1D_M_EVICT
+
+METRICS
+SUM L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+
+
+LONG
diff --git a/monitoring/groups/core2/BW_MEM.txt b/monitoring/groups/core2/BW_MEM.txt
new file mode 100644
index 0000000..8eb701f
--- /dev/null
+++ b/monitoring/groups/core2/BW_MEM.txt
@@ -0,0 +1,10 @@
+SHORT Memory bandwidth
+
+EVENTSET
+PMC0  BUS_TRANS_MEM_THIS_CORE_THIS_A
+
+METRICS
+SUM Memory bandwidth [MBytes/s] 1.0E-06*PMC0*64.0/time
+
+
+LONG
diff --git a/monitoring/groups/haswell/BW.txt b/monitoring/groups/haswell/BW.txt
new file mode 100644
index 0000000..3a2eb90
--- /dev/null
+++ b/monitoring/groups/haswell/BW.txt
@@ -0,0 +1,13 @@
+SHORT Cache and memory bandwidths
+
+EVENTSET
+PMC0  L1D_REPLACEMENT
+PMC1  L2_TRANS_L1D_WB
+PMC2  L2_LINES_IN_ALL
+PMC3  L2_LINES_OUT_DEMAND_DIRTY
+
+METRICS
+SUM L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+SUM L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
+
+LONG
diff --git a/monitoring/groups/haswell/ENERGY.txt b/monitoring/groups/haswell/ENERGY.txt
new file mode 100644
index 0000000..7256f1e
--- /dev/null
+++ b/monitoring/groups/haswell/ENERGY.txt
@@ -0,0 +1,18 @@
+SHORT Energy, CPI and Clock
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0 PWR_PKG_ENERGY
+PWR3 PWR_DRAM_ENERGY
+
+METRICS
+ONCE Runtime (RDTSC) [s] time
+SUM Retired instructions FIXC0
+SUM Total Power [W] PWR0/time
+SUM Total Power DRAM [W] PWR3/time
+CPU Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC0/FIXC1
+
+LONG
diff --git a/monitoring/groups/haswellEP/BW.txt b/monitoring/groups/haswellEP/BW.txt
new file mode 100644
index 0000000..e6f4b73
--- /dev/null
+++ b/monitoring/groups/haswellEP/BW.txt
@@ -0,0 +1,32 @@
+SHORT Cache and memory bandwidths
+
+EVENTSET
+PMC0  L1D_REPLACEMENT
+PMC1  L2_TRANS_L1D_WB
+PMC2  L2_LINES_IN_ALL
+PMC3  L2_LINES_OUT_DEMAND_DIRTY
+MBOX0C0 CAS_COUNT_RD
+MBOX0C1 CAS_COUNT_WR
+MBOX1C0 CAS_COUNT_RD
+MBOX1C1 CAS_COUNT_WR
+MBOX2C0 CAS_COUNT_RD
+MBOX2C1 CAS_COUNT_WR
+MBOX3C0 CAS_COUNT_RD
+MBOX3C1 CAS_COUNT_WR
+MBOX4C0 CAS_COUNT_RD
+MBOX4C1 CAS_COUNT_WR
+MBOX5C0 CAS_COUNT_RD
+MBOX5C1 CAS_COUNT_WR
+MBOX6C0 CAS_COUNT_RD
+MBOX6C1 CAS_COUNT_WR
+MBOX7C0 CAS_COUNT_RD
+MBOX7C1 CAS_COUNT_WR
+
+METRICS
+SUM L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+SUM L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
+SUM Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time
+SUM Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+SUM Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+
+LONG
diff --git a/monitoring/groups/haswellEP/ENERGY.txt b/monitoring/groups/haswellEP/ENERGY.txt
new file mode 100644
index 0000000..7256f1e
--- /dev/null
+++ b/monitoring/groups/haswellEP/ENERGY.txt
@@ -0,0 +1,18 @@
+SHORT Energy, CPI and Clock
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0 PWR_PKG_ENERGY
+PWR3 PWR_DRAM_ENERGY
+
+METRICS
+ONCE Runtime (RDTSC) [s] time
+SUM Retired instructions FIXC0
+SUM Total Power [W] PWR0/time
+SUM Total Power DRAM [W] PWR3/time
+CPU Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC0/FIXC1
+
+LONG
diff --git a/monitoring/groups/interlagos/BW.txt b/monitoring/groups/interlagos/BW.txt
new file mode 100644
index 0000000..3f465f6
--- /dev/null
+++ b/monitoring/groups/interlagos/BW.txt
@@ -0,0 +1,16 @@
+SHORT Cache and memory bandwidths
+
+EVENTSET
+PMC0  DATA_CACHE_REFILLS_ALL
+PMC1  DATA_CACHE_REFILLS_SYSTEM
+PMC2  L2_FILL_WB_FILL
+PMC3  L2_FILL_WB_WB
+UPMC0  UNC_DRAM_ACCESSES_DCT0_ALL
+UPMC1  UNC_DRAM_ACCESSES_DCT1_ALL
+
+METRICS
+SUM L2 bandwidth [MBytes/s] 1.0E-06*(PMC0-PMC1)*64.0/time
+SUM L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
+SUM Memory bandwidth [MBytes/s] 1.0E-06*(UPMC0+UPMC1)*64.0/time
+
+LONG
diff --git a/monitoring/groups/interlagos/CPI.txt b/monitoring/groups/interlagos/CPI.txt
new file mode 100644
index 0000000..d599a34
--- /dev/null
+++ b/monitoring/groups/interlagos/CPI.txt
@@ -0,0 +1,19 @@
+SHORT  Cycles per instruction
+
+EVENTSET
+PMC0  RETIRED_INSTRUCTIONS
+PMC1  CPU_CLOCKS_UNHALTED
+PMC2  RETIRED_UOPS
+
+METRICS
+CPI   PMC1/PMC0
+Cycles per UOPS  PMC1/PMC2
+IPC   PMC0/PMC1
+
+LONG
+This group measures how efficiently the processor works with
+regard to instruction throughput. Also important as a standalone
+metric is RETIRED_INSTRUCTIONS as it tells you how many instructions
+you need to execute for a task. An optimization might show very
+low CPI values but execute many more instructions for it.
+
diff --git a/monitoring/groups/interlagos/FLOPS.txt b/monitoring/groups/interlagos/FLOPS.txt
new file mode 100644
index 0000000..7bfb29a
--- /dev/null
+++ b/monitoring/groups/interlagos/FLOPS.txt
@@ -0,0 +1,18 @@
+SHORT Floating point operations
+
+EVENTSET
+PMC0  RETIRED_FLOPS_DOUBLE_ALL
+PMC1  RETIRED_FLOPS_SINGLE_ALL
+
+METRICS
+DP MFlops/s    1.0E-06*(PMC0)/time
+SP MFlops/s    1.0E-06*(PMC1)/time
+
+LONG
+Formulas:
+DP MFlops/s = 1.0E-06*(RETIRED_FLOPS_DOUBLE_ALL)/time
+SP MFlops/s = 1.0E-06*(RETIRED_FLOPS_SINGLE_ALL)/time
+-
+Profiling group to measure double and single precision flop rates.
+
+
diff --git a/monitoring/groups/ivybridge/BW.txt b/monitoring/groups/ivybridge/BW.txt
new file mode 100644
index 0000000..3a2eb90
--- /dev/null
+++ b/monitoring/groups/ivybridge/BW.txt
@@ -0,0 +1,13 @@
+SHORT Cache and memory bandwidths
+
+EVENTSET
+PMC0  L1D_REPLACEMENT
+PMC1  L2_TRANS_L1D_WB
+PMC2  L2_LINES_IN_ALL
+PMC3  L2_LINES_OUT_DEMAND_DIRTY
+
+METRICS
+SUM L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+SUM L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
+
+LONG
diff --git a/monitoring/groups/ivybridge/ENERGY.txt b/monitoring/groups/ivybridge/ENERGY.txt
new file mode 100644
index 0000000..7256f1e
--- /dev/null
+++ b/monitoring/groups/ivybridge/ENERGY.txt
@@ -0,0 +1,18 @@
+SHORT Energy, CPI and Clock
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0 PWR_PKG_ENERGY
+PWR3 PWR_DRAM_ENERGY
+
+METRICS
+ONCE Runtime (RDTSC) [s] time
+SUM Retired instructions FIXC0
+SUM Total Power [W] PWR0/time
+SUM Total Power DRAM [W] PWR3/time
+CPU Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC0/FIXC1
+
+LONG
diff --git a/monitoring/groups/ivybridge/FLOPS_DP.txt b/monitoring/groups/ivybridge/FLOPS_DP.txt
new file mode 100644
index 0000000..496b8a5
--- /dev/null
+++ b/monitoring/groups/ivybridge/FLOPS_DP.txt
@@ -0,0 +1,23 @@
+SHORT Double Precision MFlops/s
+
+EVENTSET
+PMC0  FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE
+PMC1  FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE
+PMC2  SIMD_FP_256_PACKED_DOUBLE
+
+METRICS
+MFlops/s  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
+AVX MFlops/s  1.0E-06*(PMC2*4.0)/time
+Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+
+LONG
+Formula:
+MFlops/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE+SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+AVX MFlops/s = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE/runtime
+-
+SSE scalar and packed double precision flop rates. Please note that the current flop measurements on IvyBridge are
+potentially wrong. So you cannot trust these counters at the moment!
+
diff --git a/monitoring/groups/ivybridge/FLOPS_SP.txt b/monitoring/groups/ivybridge/FLOPS_SP.txt
new file mode 100644
index 0000000..64edd19
--- /dev/null
+++ b/monitoring/groups/ivybridge/FLOPS_SP.txt
@@ -0,0 +1,24 @@
+SHORT Single Precision MFlops/s
+
+EVENTSET
+PMC0  FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE
+PMC1  FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE
+PMC2  SIMD_FP_256_PACKED_SINGLE
+
+METRICS
+MFlops/s  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
+AVX MFlops/s  1.0E-06*(PMC2*8.0)/time
+Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+
+LONG
+Formula:
+MFlops/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE+SIMD_FP_256_PACKED_SINGLE*8)/runtime
+AVX MFlops/s = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE/runtime
+-
+SSE scalar and packed single precision flop rates. Please note that the current
+flop measurements on IvyBridge are potentially wrong. So you cannot trust
+these counters at the moment!
+
diff --git a/monitoring/groups/ivybridgeEP/BW.txt b/monitoring/groups/ivybridgeEP/BW.txt
new file mode 100644
index 0000000..e6f4b73
--- /dev/null
+++ b/monitoring/groups/ivybridgeEP/BW.txt
@@ -0,0 +1,32 @@
+SHORT Cache and memory bandwidths
+
+EVENTSET
+PMC0  L1D_REPLACEMENT
+PMC1  L2_TRANS_L1D_WB
+PMC2  L2_LINES_IN_ALL
+PMC3  L2_LINES_OUT_DEMAND_DIRTY
+MBOX0C0 CAS_COUNT_RD
+MBOX0C1 CAS_COUNT_WR
+MBOX1C0 CAS_COUNT_RD
+MBOX1C1 CAS_COUNT_WR
+MBOX2C0 CAS_COUNT_RD
+MBOX2C1 CAS_COUNT_WR
+MBOX3C0 CAS_COUNT_RD
+MBOX3C1 CAS_COUNT_WR
+MBOX4C0 CAS_COUNT_RD
+MBOX4C1 CAS_COUNT_WR
+MBOX5C0 CAS_COUNT_RD
+MBOX5C1 CAS_COUNT_WR
+MBOX6C0 CAS_COUNT_RD
+MBOX6C1 CAS_COUNT_WR
+MBOX7C0 CAS_COUNT_RD
+MBOX7C1 CAS_COUNT_WR
+
+METRICS
+SUM L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+SUM L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
+SUM Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0)*64.0/time
+SUM Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+SUM Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX4C0+MBOX5C0+MBOX6C0+MBOX7C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1+MBOX4C1+MBOX5C1+MBOX6C1+MBOX7C1)*64.0/time
+
+LONG
diff --git a/monitoring/groups/ivybridgeEP/ENERGY.txt b/monitoring/groups/ivybridgeEP/ENERGY.txt
new file mode 100644
index 0000000..7256f1e
--- /dev/null
+++ b/monitoring/groups/ivybridgeEP/ENERGY.txt
@@ -0,0 +1,18 @@
+SHORT Energy, CPI and Clock
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0 PWR_PKG_ENERGY
+PWR3 PWR_DRAM_ENERGY
+
+METRICS
+ONCE Runtime (RDTSC) [s] time
+SUM Retired instructions FIXC0
+SUM Total Power [W] PWR0/time
+SUM Total Power DRAM [W] PWR3/time
+CPU Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC0/FIXC1
+
+LONG
diff --git a/monitoring/groups/ivybridgeEP/FLOPS_DP.txt b/monitoring/groups/ivybridgeEP/FLOPS_DP.txt
new file mode 100644
index 0000000..496b8a5
--- /dev/null
+++ b/monitoring/groups/ivybridgeEP/FLOPS_DP.txt
@@ -0,0 +1,23 @@
+SHORT Double Precision MFlops/s
+
+EVENTSET
+PMC0  FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE
+PMC1  FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE
+PMC2  SIMD_FP_256_PACKED_DOUBLE
+
+METRICS
+MFlops/s  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
+AVX MFlops/s  1.0E-06*(PMC2*4.0)/time
+Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+
+LONG
+Formula:
+MFlops/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE+SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+AVX MFlops/s = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE/runtime
+-
+SSE scalar and packed double precision flop rates. Please note that the current flop measurements on IvyBridge are
+potentially wrong. So you cannot trust these counters at the moment!
+
diff --git a/monitoring/groups/ivybridgeEP/FLOPS_SP.txt b/monitoring/groups/ivybridgeEP/FLOPS_SP.txt
new file mode 100644
index 0000000..64edd19
--- /dev/null
+++ b/monitoring/groups/ivybridgeEP/FLOPS_SP.txt
@@ -0,0 +1,24 @@
+SHORT Single Precision MFlops/s
+
+EVENTSET
+PMC0  FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE
+PMC1  FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE
+PMC2  SIMD_FP_256_PACKED_SINGLE
+
+METRICS
+MFlops/s  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
+AVX MFlops/s  1.0E-06*(PMC2*8.0)/time
+Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+
+LONG
+Formula:
+MFlops/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE+SIMD_FP_256_PACKED_SINGLE*8)/runtime
+AVX MFlops/s = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE/runtime
+-
+SSE scalar and packed single precision flop rates. Please note that the current
+flop measurements on IvyBridge are potentially wrong. So you cannot trust
+these counters at the moment!
+
diff --git a/monitoring/groups/kabini/BW.txt b/monitoring/groups/kabini/BW.txt
new file mode 100644
index 0000000..7e34078
--- /dev/null
+++ b/monitoring/groups/kabini/BW.txt
@@ -0,0 +1,14 @@
+SHORT Cache and memory bandwidth
+
+EVENTSET
+PMC0  DATA_CACHE_REFILLS_ALL
+PMC1  DATA_CACHE_EVICTED_ALL
+UPMC0  UNC_DRAM_ACCESSES_DCT0_ALL
+UPMC1  UNC_DRAM_ACCESSES_DCT1_ALL
+
+
+METRICS
+SUM L2 bandwidth [MBytes/s]   1.0E-06*(PMC0+PMC1)*64.0/time
+SUM Memory bandwidth [MBytes/s]   1.0E-06*(UPMC0+UPMC1)*64.0/time
+
+LONG
diff --git a/monitoring/groups/kabini/CPI.txt b/monitoring/groups/kabini/CPI.txt
new file mode 100644
index 0000000..d599a34
--- /dev/null
+++ b/monitoring/groups/kabini/CPI.txt
@@ -0,0 +1,19 @@
+SHORT  Cycles per instruction
+
+EVENTSET
+PMC0  RETIRED_INSTRUCTIONS
+PMC1  CPU_CLOCKS_UNHALTED
+PMC2  RETIRED_UOPS
+
+METRICS
+CPI   PMC1/PMC0
+Cycles per UOPS  PMC1/PMC2
+IPC   PMC0/PMC1
+
+LONG
+This group measures how efficiently the processor works with
+regard to instruction throughput. Also important as a standalone
+metric is RETIRED_INSTRUCTIONS as it tells you how many instructions
+you need to execute for a task. An optimization might show very
+low CPI values but execute many more instructions for it.
+
diff --git a/monitoring/groups/kabini/FLOPS.txt b/monitoring/groups/kabini/FLOPS.txt
new file mode 100644
index 0000000..ccb2f92
--- /dev/null
+++ b/monitoring/groups/kabini/FLOPS.txt
@@ -0,0 +1,14 @@
+SHORT Floating point operations
+
+EVENTSET
+PMC0  RETIRED_FLOPS_DOUBLE_ALL
+PMC1  RETIRED_FLOPS_SINGLE_ALL
+
+METRICS
+DP MFlops/s    1.0E-06*(PMC0)/time
+SP MFlops/s    1.0E-06*(PMC1)/time
+
+LONG
+Formulas:
+DP MFlops/s = 1.0E-06*(RETIRED_FLOPS_DOUBLE_ALL)/time
+SP MFlops/s = 1.0E-06*(RETIRED_FLOPS_SINGLE_ALL)/time
diff --git a/monitoring/groups/nehalem/BW.txt b/monitoring/groups/nehalem/BW.txt
new file mode 100644
index 0000000..ddc8c82
--- /dev/null
+++ b/monitoring/groups/nehalem/BW.txt
@@ -0,0 +1,20 @@
+SHORT Cache and memory bandwidth
+
+EVENTSET
+PMC0  L1D_REPL
+PMC1  L1D_M_EVICT
+PMC2  L2_LINES_IN_ANY
+PMC3  L2_LINES_OUT_DEMAND_DIRTY
+UPMC0  UNC_QMC_NORMAL_READS_ANY
+UPMC1  UNC_QMC_WRITES_FULL_ANY
+UPMC2  UNC_QHL_REQUESTS_REMOTE_READS
+UPMC3  UNC_QHL_REQUESTS_REMOTE_WRITES
+
+
+METRICS
+SUM L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+SUM L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
+SUM Memory bandwidth [MBytes/s] 1.0E-06*(UPMC0+UPMC1)*64.0/time
+SUM Remote memory bandwidth [MBytes/s] 1.0E-06*(UPMC2+UPMC3)*64.0/time
+
+LONG
diff --git a/monitoring/groups/nehalem/CPI.txt b/monitoring/groups/nehalem/CPI.txt
new file mode 100644
index 0000000..9852da8
--- /dev/null
+++ b/monitoring/groups/nehalem/CPI.txt
@@ -0,0 +1,14 @@
+SHORT Cycles per instruction
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+
+METRICS
+CPI  FIXC1/FIXC0
+IPC  FIXC0/FIXC1
+
+
+LONG
+CPI = CPU_CLK_UNHALTED_CORE/INSTR_RETIRED_ANY
+IPC = INSTR_RETIRED_ANY/CPU_CLK_UNHALTED_CORE
diff --git a/monitoring/groups/nehalem/FLOPS.txt b/monitoring/groups/nehalem/FLOPS.txt
new file mode 100644
index 0000000..e372504
--- /dev/null
+++ b/monitoring/groups/nehalem/FLOPS.txt
@@ -0,0 +1,20 @@
+SHORT Floating point operations
+
+EVENTSET
+PMC0  FP_COMP_OPS_EXE_SSE_FP_PACKED
+PMC1  FP_COMP_OPS_EXE_SSE_FP_SCALAR
+PMC2  FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION
+PMC3  FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION
+
+METRICS
+Packed MUOPS/s   1.0E-06*PMC0/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+SP MUOPS/s 1.0E-06*PMC2/time
+DP MUOPS/s 1.0E-06*PMC3/time
+
+LONG
+Formulas:
+Packed MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime
+SP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime
+DP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime
diff --git a/monitoring/groups/nehalemEX/BW.txt b/monitoring/groups/nehalemEX/BW.txt
new file mode 100644
index 0000000..473ce76
--- /dev/null
+++ b/monitoring/groups/nehalemEX/BW.txt
@@ -0,0 +1,29 @@
+SHORT Cache and memory bandwidth
+
+EVENTSET
+PMC0  L1D_REPL
+PMC1  L1D_M_EVICT
+PMC2  L2_LINES_IN_ANY
+PMC3  L2_LINES_OUT_DEMAND_DIRTY
+MBOX0C0 FVC_EV0_BBOX_CMDS_READS
+MBOX0C1 DRAM_CMD_CAS_WR_OPN
+MBOX1C0 FVC_EV0_BBOX_CMDS_READS
+MBOX1C1 DRAM_CMD_CAS_WR_OPN
+
+
+METRICS
+SUM L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+SUM L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
+SUM Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX0C1+MBOX1C1)*64/time
+
+LONG
+Formula:
+L2 bandwidth [MBytes/s] = 1.0E-06*(L1D_REPL+L1D_M_EVICT)*64/time
+L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ANY+L2_LINES_OUT_DEMAND_DIRTY)*64/time
+Memory bandwidth [MBytes/s] = 1.0E-06*(SUM(FVC_EV0_BBOX_CMDS_READS)+SUM(DRAM_CMD_CAS_WR_OPN))*64.0/time
+
+On Nehalem EX it is not possible to measure the write operations with the
+FVC_EV0_BBOX_CMDS_WRITES event at the same time as the FVC_EV0_BBOX_CMDS_READS
+because they set contrary bits. The DRAM_CMD_CAS_WR_OPN is an alternative but
+it only measures write operations to open pages, hence writes to closed pages
+are not included here.
diff --git a/monitoring/groups/nehalemEX/CPI.txt b/monitoring/groups/nehalemEX/CPI.txt
new file mode 100644
index 0000000..0e4faa3
--- /dev/null
+++ b/monitoring/groups/nehalemEX/CPI.txt
@@ -0,0 +1,12 @@
+SHORT Cycles per instruction
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+
+METRICS
+CPI  FIXC1/FIXC0
+
+
+LONG
+CPI = CPU_CLK_UNHALTED_CORE/INSTR_RETIRED_ANY
diff --git a/monitoring/groups/nehalemEX/FLOPS.txt b/monitoring/groups/nehalemEX/FLOPS.txt
new file mode 100644
index 0000000..e372504
--- /dev/null
+++ b/monitoring/groups/nehalemEX/FLOPS.txt
@@ -0,0 +1,20 @@
+SHORT Floating point operations
+
+EVENTSET
+PMC0  FP_COMP_OPS_EXE_SSE_FP_PACKED
+PMC1  FP_COMP_OPS_EXE_SSE_FP_SCALAR
+PMC2  FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION
+PMC3  FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION
+
+METRICS
+Packed MUOPS/s   1.0E-06*PMC0/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+SP MUOPS/s 1.0E-06*PMC2/time
+DP MUOPS/s 1.0E-06*PMC3/time
+
+LONG
+Formulas:
+Packed MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime
+SP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime
+DP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime
diff --git a/monitoring/groups/pentiumm/BW.txt b/monitoring/groups/pentiumm/BW.txt
new file mode 100644
index 0000000..5877abc
--- /dev/null
+++ b/monitoring/groups/pentiumm/BW.txt
@@ -0,0 +1,12 @@
+SHORT Cache and memory bandwidth
+
+EVENTSET
+PMC0  L2_LINES_IN_ALL_ALL
+PMC1  L2_LINES_OUT_ALL_ALL
+
+METRICS
+L3 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+
+LONG
+Formulas:
+L3 bandwidth [MBytes/s] = 1.0E-06*(L2_LINES_IN_ALL_ALL+L2_LINES_OUT_ALL_ALL)*64/time
diff --git a/monitoring/groups/pentiumm/CPI.txt b/monitoring/groups/pentiumm/CPI.txt
new file mode 100644
index 0000000..fb0d97b
--- /dev/null
+++ b/monitoring/groups/pentiumm/CPI.txt
@@ -0,0 +1,17 @@
+SHORT  Cycles per instruction
+
+EVENTSET
+PMC0  UOPS_RETIRED
+PMC1  CPU_CLK_UNHALTED
+
+METRICS
+CPI   PMC1/PMC0
+IPC   PMC0/PMC1
+
+LONG
+This group measures how efficiently the processor works with
+regard to instruction throughput. Also important as a standalone
+metric is UOPS_RETIRED as it tells you how many uops
+you need to execute for a task. An optimization might show very
+low CPI values but execute many more instructions for it.
+
diff --git a/monitoring/groups/phi/CPI.txt b/monitoring/groups/phi/CPI.txt
new file mode 100644
index 0000000..0ce61cd
--- /dev/null
+++ b/monitoring/groups/phi/CPI.txt
@@ -0,0 +1,17 @@
+SHORT  Cycles per instruction
+
+EVENTSET
+PMC0  INSTRUCTIONS_EXECUTED
+PMC1  CPU_CLK_UNHALTED
+
+METRICS
+CPI   PMC1/PMC0
+IPC   PMC0/PMC1
+
+LONG
+This group measures how efficiently the processor works with
+regard to instruction throughput. Also important as a standalone
+metric is INSTRUCTIONS_EXECUTED as it tells you how many instructions
+you need to execute for a task. An optimization might show very
+low CPI values but execute many more instructions for it.
+
diff --git a/monitoring/groups/sandybridge/BW.txt b/monitoring/groups/sandybridge/BW.txt
new file mode 100644
index 0000000..3a2eb90
--- /dev/null
+++ b/monitoring/groups/sandybridge/BW.txt
@@ -0,0 +1,13 @@
+SHORT Cache and memory bandwidths
+
+EVENTSET
+PMC0  L1D_REPLACEMENT
+PMC1  L2_TRANS_L1D_WB
+PMC2  L2_LINES_IN_ALL
+PMC3  L2_LINES_OUT_DEMAND_DIRTY
+
+METRICS
+SUM L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+SUM L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
+
+LONG
diff --git a/monitoring/groups/sandybridge/ENERGY.txt b/monitoring/groups/sandybridge/ENERGY.txt
new file mode 100644
index 0000000..7256f1e
--- /dev/null
+++ b/monitoring/groups/sandybridge/ENERGY.txt
@@ -0,0 +1,18 @@
+SHORT Energy, CPI and Clock
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0 PWR_PKG_ENERGY
+PWR3 PWR_DRAM_ENERGY
+
+METRICS
+ONCE Runtime (RDTSC) [s] time
+SUM Retired instructions FIXC0
+SUM Total Power [W] PWR0/time
+SUM Total Power DRAM [W] PWR3/time
+CPU Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC0/FIXC1
+
+LONG
diff --git a/monitoring/groups/sandybridge/FLOPS_DP.txt b/monitoring/groups/sandybridge/FLOPS_DP.txt
new file mode 100644
index 0000000..c004b88
--- /dev/null
+++ b/monitoring/groups/sandybridge/FLOPS_DP.txt
@@ -0,0 +1,24 @@
+SHORT Double Precision MFlops/s
+
+EVENTSET
+PMC0  FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE
+PMC1  FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE
+PMC2  SIMD_FP_256_PACKED_DOUBLE
+
+METRICS
+MFlops/s  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
+AVX MFlops/s  1.0E-06*(PMC2*4.0)/time
+Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+
+LONG
+Formula:
+MFlops/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE+SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+AVX MFlops/s = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE/runtime
+-
+SSE scalar and packed double precision flop rates. Please note that the current
+flop measurements on SandyBridge are potentially wrong. So you cannot trust
+these counters at the moment!
+
diff --git a/monitoring/groups/sandybridge/FLOPS_SP.txt b/monitoring/groups/sandybridge/FLOPS_SP.txt
new file mode 100644
index 0000000..f9e6df7
--- /dev/null
+++ b/monitoring/groups/sandybridge/FLOPS_SP.txt
@@ -0,0 +1,24 @@
+SHORT Single Precision MFlops/s
+
+EVENTSET
+PMC0  FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE
+PMC1  FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE
+PMC2  SIMD_FP_256_PACKED_SINGLE
+
+METRICS
+MFlops/s  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
+AVX MFlops/s  1.0E-06*(PMC2*8.0)/time
+Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+
+LONG
+Formula:
+MFlops/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE+SIMD_FP_256_PACKED_SINGLE*8)/runtime
+AVX MFlops/s = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE/runtime
+-
+SSE scalar and packed single precision flop rates. Please note that the current
+flop measurements on SandyBridge are potentially wrong. So you cannot trust
+these counters at the moment!
+
diff --git a/monitoring/groups/sandybridgeEP/BW.txt b/monitoring/groups/sandybridgeEP/BW.txt
new file mode 100644
index 0000000..18eea4f
--- /dev/null
+++ b/monitoring/groups/sandybridgeEP/BW.txt
@@ -0,0 +1,24 @@
+SHORT Cache and memory bandwidths
+
+EVENTSET
+PMC0  L1D_REPLACEMENT
+PMC1  L2_TRANS_L1D_WB
+PMC2  L2_LINES_IN_ALL
+PMC3  L2_LINES_OUT_DEMAND_DIRTY
+MBOX0C0 CAS_COUNT_RD
+MBOX0C1 CAS_COUNT_WR
+MBOX1C0 CAS_COUNT_RD
+MBOX1C1 CAS_COUNT_WR
+MBOX2C0 CAS_COUNT_RD
+MBOX2C1 CAS_COUNT_WR
+MBOX3C0 CAS_COUNT_RD
+MBOX3C1 CAS_COUNT_WR
+
+METRICS
+SUM L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+SUM L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
+SUM Memory read bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0)*64.0/time
+SUM Memory write bandwidth [MBytes/s] 1.0E-06*(MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time
+SUM Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX2C0+MBOX3C0+MBOX0C1+MBOX1C1+MBOX2C1+MBOX3C1)*64.0/time
+
+LONG
diff --git a/monitoring/groups/sandybridgeEP/ENERGY.txt b/monitoring/groups/sandybridgeEP/ENERGY.txt
new file mode 100644
index 0000000..7256f1e
--- /dev/null
+++ b/monitoring/groups/sandybridgeEP/ENERGY.txt
@@ -0,0 +1,18 @@
+SHORT Energy, CPI and Clock
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0 PWR_PKG_ENERGY
+PWR3 PWR_DRAM_ENERGY
+
+METRICS
+ONCE Runtime (RDTSC) [s] time
+SUM Retired instructions FIXC0
+SUM Total Power [W] PWR0/time
+SUM Total Power DRAM [W] PWR3/time
+CPU Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+
+LONG
diff --git a/monitoring/groups/sandybridgeEP/FLOPS_DP.txt b/monitoring/groups/sandybridgeEP/FLOPS_DP.txt
new file mode 100644
index 0000000..c004b88
--- /dev/null
+++ b/monitoring/groups/sandybridgeEP/FLOPS_DP.txt
@@ -0,0 +1,24 @@
+SHORT Double Precision MFlops/s
+
+EVENTSET
+PMC0  FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE
+PMC1  FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE
+PMC2  SIMD_FP_256_PACKED_DOUBLE
+
+METRICS
+MFlops/s  1.0E-06*(PMC0*2.0+PMC1+PMC2*4.0)/time
+AVX MFlops/s  1.0E-06*(PMC2*4.0)/time
+Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+
+LONG
+Formula:
+MFlops/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE*2+FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE+SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+AVX MFlops/s = 1.0E-06*(SIMD_FP_256_PACKED_DOUBLE*4)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE+SIMD_FP_256_PACKED_DOUBLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE/runtime
+-
+SSE scalar and packed double precision flop rates. Please note that the current
+flop measurements on SandyBridge are potentially wrong. So you cannot trust
+these counters at the moment!
+
diff --git a/monitoring/groups/sandybridgeEP/FLOPS_SP.txt b/monitoring/groups/sandybridgeEP/FLOPS_SP.txt
new file mode 100644
index 0000000..f9e6df7
--- /dev/null
+++ b/monitoring/groups/sandybridgeEP/FLOPS_SP.txt
@@ -0,0 +1,24 @@
+SHORT Single Precision MFlops/s
+
+EVENTSET
+PMC0  FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE
+PMC1  FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE
+PMC2  SIMD_FP_256_PACKED_SINGLE
+
+METRICS
+MFlops/s  1.0E-06*(PMC0*4.0+PMC1+PMC2*8.0)/time
+AVX MFlops/s  1.0E-06*(PMC2*8.0)/time
+Packed MUOPS/s   1.0E-06*(PMC0+PMC2)/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+
+LONG
+Formula:
+MFlops/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE*4+FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE+SIMD_FP_256_PACKED_SINGLE*8)/runtime
+AVX MFlops/s = 1.0E-06*(SIMD_FP_256_PACKED_SINGLE*8)/runtime
+Packed MUOPS/s = 1.0E-06*(FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE+SIMD_FP_256_PACKED_SINGLE)/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE/runtime
+-
+SSE scalar and packed single precision flop rates. Please note that the current
+flop measurements on SandyBridge are potentially wrong. So you cannot trust
+these counters at the moment!
+
diff --git a/monitoring/groups/silvermont/BW.txt b/monitoring/groups/silvermont/BW.txt
new file mode 100644
index 0000000..952e64a
--- /dev/null
+++ b/monitoring/groups/silvermont/BW.txt
@@ -0,0 +1,12 @@
+SHORT Cache and memory bandwidths
+
+EVENTSET
+PMC0  LONGEST_LAT_CACHE_MISS
+PMC1  OFFCORE_RESPONSE_1_WB_ANY
+
+METRICS
+SUM Memory read bandwidth [MBytes/s] 1.0E-06*(PMC0)*64.0/time
+SUM Memory writeback bandwidth [MBytes/s] 1.0E-06*(PMC1)*64.0/time
+SUM Memory bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+
+LONG
diff --git a/monitoring/groups/silvermont/CPI.txt b/monitoring/groups/silvermont/CPI.txt
new file mode 100644
index 0000000..4eb4d40
--- /dev/null
+++ b/monitoring/groups/silvermont/CPI.txt
@@ -0,0 +1,14 @@
+SHORT Cycles per instruction
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+
+METRICS
+CPI FIXC1/FIXC0
+IPC FIXC0/FIXC1
+
+LONG
+CPI = CPU_CLK_UNHALTED_CORE/INSTR_RETIRED_ANY
+IPC = INSTR_RETIRED_ANY/CPU_CLK_UNHALTED_CORE
diff --git a/monitoring/groups/silvermont/ENERGY.txt b/monitoring/groups/silvermont/ENERGY.txt
new file mode 100644
index 0000000..3814560
--- /dev/null
+++ b/monitoring/groups/silvermont/ENERGY.txt
@@ -0,0 +1,16 @@
+SHORT Energy, CPI and Clock
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+FIXC2 CPU_CLK_UNHALTED_REF
+PWR0 PWR_PKG_ENERGY
+
+METRICS
+ONCE Runtime (RDTSC) [s] time
+SUM Retired instructions FIXC0
+SUM Total Power [W] PWR0/time
+CPU Clock [MHz] 1.E-06*(FIXC1/FIXC2)/inverseClock
+CPI FIXC1/FIXC0
+
+LONG
diff --git a/monitoring/groups/westmere/BW.txt b/monitoring/groups/westmere/BW.txt
new file mode 100644
index 0000000..4925077
--- /dev/null
+++ b/monitoring/groups/westmere/BW.txt
@@ -0,0 +1,19 @@
+SHORT Cache and memory bandwidth
+
+EVENTSET
+PMC0  L1D_REPL
+PMC1  L1D_M_EVICT
+PMC2  L2_LINES_IN_ANY
+PMC3  L2_LINES_OUT_ANY
+UPMC0  UNC_QMC_NORMAL_READS_ANY
+UPMC1  UNC_QMC_WRITES_FULL_ANY
+UPMC2  UNC_QHL_REQUESTS_REMOTE_READS
+UPMC3  UNC_QHL_REQUESTS_REMOTE_WRITES
+
+METRICS
+SUM L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+SUM L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
+SUM Memory bandwidth [MBytes/s] 1.0E-06*(UPMC0+UPMC1)*64.0/time
+SUM Remote memory bandwidth [MBytes/s] 1.0E-06*(UPMC2+UPMC3)*64.0/time
+
+LONG
diff --git a/monitoring/groups/westmere/CPI.txt b/monitoring/groups/westmere/CPI.txt
new file mode 100644
index 0000000..9852da8
--- /dev/null
+++ b/monitoring/groups/westmere/CPI.txt
@@ -0,0 +1,14 @@
+SHORT Cycles per instruction
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+
+METRICS
+CPI  FIXC1/FIXC0
+IPC  FIXC0/FIXC1
+
+
+LONG
+CPI = CPU_CLK_UNHALTED_CORE/INSTR_RETIRED_ANY
+IPC = INSTR_RETIRED_ANY/CPU_CLK_UNHALTED_CORE
diff --git a/monitoring/groups/westmere/FLOPS.txt b/monitoring/groups/westmere/FLOPS.txt
new file mode 100644
index 0000000..e372504
--- /dev/null
+++ b/monitoring/groups/westmere/FLOPS.txt
@@ -0,0 +1,20 @@
+SHORT Floating point operations
+
+EVENTSET
+PMC0  FP_COMP_OPS_EXE_SSE_FP_PACKED
+PMC1  FP_COMP_OPS_EXE_SSE_FP_SCALAR
+PMC2  FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION
+PMC3  FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION
+
+METRICS
+Packed MUOPS/s   1.0E-06*PMC0/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+SP MUOPS/s 1.0E-06*PMC2/time
+DP MUOPS/s 1.0E-06*PMC3/time
+
+LONG
+Formulas:
+Packed MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime
+SP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime
+DP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime
diff --git a/monitoring/groups/westmereEX/BW.txt b/monitoring/groups/westmereEX/BW.txt
new file mode 100644
index 0000000..a960025
--- /dev/null
+++ b/monitoring/groups/westmereEX/BW.txt
@@ -0,0 +1,20 @@
+SHORT Cache and memory bandwidth
+
+EVENTSET
+PMC0  L1D_REPL
+PMC1  L1D_M_EVICT
+PMC2  L2_LINES_IN_ANY
+PMC3  L2_LINES_OUT_ANY
+MBOX0C0 FVC_EV0_BBOX_CMDS_READS
+MBOX0C1 DRAM_CMD_CAS_WR_OPN
+MBOX0C2 DRAM_MISC_CAS_WR_CLS
+MBOX1C0 FVC_EV0_BBOX_CMDS_READS
+MBOX1C1 DRAM_CMD_CAS_WR_OPN
+MBOX1C2 DRAM_MISC_CAS_WR_CLS
+
+METRICS
+SUM L2 bandwidth [MBytes/s] 1.0E-06*(PMC0+PMC1)*64.0/time
+SUM L3 bandwidth [MBytes/s] 1.0E-06*(PMC2+PMC3)*64.0/time
+SUM Memory bandwidth [MBytes/s] 1.0E-06*(MBOX0C0+MBOX1C0+MBOX0C1+MBOX1C1+MBOX0C2+MBOX1C2)*64/time
+
+LONG
diff --git a/monitoring/groups/westmereEX/CPI.txt b/monitoring/groups/westmereEX/CPI.txt
new file mode 100644
index 0000000..9852da8
--- /dev/null
+++ b/monitoring/groups/westmereEX/CPI.txt
@@ -0,0 +1,14 @@
+SHORT Cycles per instruction
+
+EVENTSET
+FIXC0 INSTR_RETIRED_ANY
+FIXC1 CPU_CLK_UNHALTED_CORE
+
+METRICS
+CPI  FIXC1/FIXC0
+IPC  FIXC0/FIXC1
+
+
+LONG
+CPI = CPU_CLK_UNHALTED_CORE/INSTR_RETIRED_ANY
+IPC = INSTR_RETIRED_ANY/CPU_CLK_UNHALTED_CORE
diff --git a/monitoring/groups/westmereEX/FLOPS.txt b/monitoring/groups/westmereEX/FLOPS.txt
new file mode 100644
index 0000000..e372504
--- /dev/null
+++ b/monitoring/groups/westmereEX/FLOPS.txt
@@ -0,0 +1,20 @@
+SHORT Floating point operations
+
+EVENTSET
+PMC0  FP_COMP_OPS_EXE_SSE_FP_PACKED
+PMC1  FP_COMP_OPS_EXE_SSE_FP_SCALAR
+PMC2  FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION
+PMC3  FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION
+
+METRICS
+Packed MUOPS/s   1.0E-06*PMC0/time
+Scalar MUOPS/s 1.0E-06*PMC1/time
+SP MUOPS/s 1.0E-06*PMC2/time
+DP MUOPS/s 1.0E-06*PMC3/time
+
+LONG
+Formulas:
+Packed MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_PACKED/runtime
+Scalar MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_FP_SCALAR/runtime
+SP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_SINGLE_PRECISION/runtime
+DP MUOPS/s = 1.0E-06*FP_COMP_OPS_EXE_SSE_DOUBLE_PRECISION/runtime
diff --git a/monitoring/likwid-agent.conf b/monitoring/likwid-agent.conf
new file mode 100644
index 0000000..2d999b6
--- /dev/null
+++ b/monitoring/likwid-agent.conf
@@ -0,0 +1,52 @@
+### Global section ###
+
+# Set path to monitoring group files. Default is the normal LIKWID group path
+# <PREFIX>/share/likwid/mongroups
+#GROUPPATH <path_to_mon_groups>
+# List of monitoring groups that should be measured
+#EVENTSET <group1> <group2> ...
+# Define access mode for LIKWID. If likwid-agent runs as root, use 0 for direct
+# access to the MSR and PCI registers. If you run it as a regular user, you
+# have to select 1 to use the accessDaemon of LIKWID. Default is 1.
+#ACCESSMODE <0/1>
+# Define the time in seconds that each given monitoring group should be measured
+#DURATION 1
+
+
+### Output section ###
+
+## Simple logfile output ##
+# Specify path for the logfile. For each monitoring group, a separate logfile
+# is created with the format likwid.<group>.log
+#LOGPATH <path>
+# Specify the logfile writing style. The two possible options are log and
+# update.
+# log appends all new messages to the logfile, while update empties the logfile
+# before performing any writing. The update option is recommended when the
+# output is further parsed with other tools. If LOGPATH is set but no LOGSTYLE
+# set, the style log is selected.
+#LOGSTYLE <log/update>
+
+## Syslog output ##
+# De/Activate the output to the syslog system using shell tool logger
+#SYSLOG <True/False>
+# Define the priority value for logger. Default priority is local0.notice.
+#SYSLOGPRIO local0.notice
+
+## RRD output ##
+# Likwid-agent tries to create basic RRD configurations for the selected
+# groups. Each monitoring group gets its own RRD file containing all metrics
+# as data sources. For better printing, RRAs are created to hold the min, max
+# and average values for every 10 minutes in the last hour, every hour for the
+# last day and every day for the last month.
+#RRD <True/False>
+# Store the RRDs in RRDPATH
+#RRDPATH <path>
+
+## GMetric output ##
+# De/Activate the output to the Ganglia Monitoring System using the gmetric tool
+#GMETRIC <True/False>
+# Set path to the executable of gmetric.
+#GMETRICPATH <path_to_gmetric>
+# Some environments require a special config file to be passed to gmetric.
+#GMETRICCONFIG <path_to_gmetric_config>
diff --git a/perl/feedGnuplot b/perl/feedGnuplot
index 67aaf37..d379981 100755
--- a/perl/feedGnuplot
+++ b/perl/feedGnuplot
@@ -1,27 +1,36 @@
 #!/usr/bin/perl
+
+package feedgnuplot; # for the metacpan indexer
+
 use strict;
 use warnings;
 use Getopt::Long;
-use Time::HiRes qw( usleep );
+use Time::HiRes qw( usleep gettimeofday tv_interval );
 use IO::Handle;
 use List::Util qw( first );
+use Scalar::Util qw( looks_like_number );
 use Text::ParseWords;
 use threads;
 use threads::shared;
 use Thread::Queue;
 use Pod::Usage;
+use Time::Piece;
 
-
-our $VERSION = '1.11';
+my $VERSION = 1.34;
 
 my %options;
-interpretCommandline(\%options);
+interpretCommandline();
+
+# list containing the plot data. Each element is a hashref of parameters.
+# $curve->{datastring} is a string of all the data in this curve that can be
+# sent directly to gnuplot. $curve->{datastring_meta} is a hashref {domain =>
+# ..., offset_start => ...}. offset_start represents a position in the
+# datastring where this particular data element begins. As the data is culled
+# with --xlen, the offsets are preserved by using $curve->{datastring_offset} to
+# represent the offset IN THE ORIGINAL STRING of the current start of the
+# datastring
 
-my $gnuplotVersion = getGnuplotVersion();
 
-# list containing the plot data. Each element is a reference to a list, representing the data for
-# one curve. The first 'point' is a hash describing various curve parameters. The rest are all
-# references to lists of (x,y) tuples
 my @curves = ();
 
 # list mapping curve names to their indices in the @curves list
@@ -29,24 +38,32 @@ my %curveIndices = ();
 
 # now start the data acquisition and plotting threads
 my $dataQueue;
-my $xwindow;
+
+# Whether any new data has arrived since the last replot
+my $haveNewData;
+
+# when the last replot happened
+my $last_replot_time = [gettimeofday];
+
+# whether the previous replot was timer based
+my $last_replot_is_from_timer = 1;
 
 my $streamingFinished : shared = undef;
+
 if($options{stream})
 {
-  if( $options{hardcopy})
-  {
-    $options{stream} = undef;
-  }
-
   $dataQueue  = Thread::Queue->new();
   my $addThr  = threads->create(\&mainThread);
-  my $plotThr = threads->create(\&plotThread);
+
+  # spawn the plot updating thread. If I'm replotting from a data trigger, I don't need this
+  my $plotThr = threads->create(\&plotUpdateThread) if $options{stream} > 0;
 
   while(<>)
   {
     chomp;
 
+    last if /^exit/;
+
     # place every line of input to the queue, so that the plotting thread can process it. if we are
     # using an implicit domain (x = line number), then we send it on the data queue also, since
     # $. is not meaningful in the plotting thread
@@ -58,8 +75,9 @@ if($options{stream})
   }
 
   $streamingFinished = 1;
+  $dataQueue->enqueue(undef);
 
-  $plotThr->join();
+  $plotThr->join() if defined $plotThr;
   $addThr->join();
 }
 else
@@ -81,86 +99,296 @@ sub interpretCommandline
     unshift @ARGV, shellwords shift @ARGV;
   }
 
-  my $options = shift;
-
   # everything off by default:
   # do not stream in the data by default
   # point plotting by default.
   # no monotonicity checks by default
+  # normal histograms by default
   $options{ maxcurves } = 100;
+  $options{ histstyle}  = 'freq';
+
+  # Previously I was using 'legend=s%' and 'curvestyle=s%' for curve addressing. This had cleaner
+  # syntax, but disregarded the order of the given options. This resulted in arbitrarily ordered
+  # curves. I thus parse these into lists, and then also make hashes, for later use
+
+  # needed for these to be parsed into an array-ref, these default to []
+  $options{legend}     = [];
+  $options{curvestyle} = [];
+  $options{style}      = [];
+  $options{histogram}  = [];
+  $options{y2}         = [];
+  $options{extracmds}  = [];
+  $options{set}        = [];
+  $options{unset}      = [];
+
+  $options{curvestyleall} = '';
+  $options{styleall}      = '';
+  $options{with}          = '';
+
+  $options{rangesize} = [];
+
+  GetOptions(\%options, 'stream:s', 'domain!', 'dataid!', '3d!', 'colormap!', 'lines!', 'points!',
+             'circles', 'legend=s{2}', 'autolegend!', 'xlabel=s', 'ylabel=s', 'y2label=s', 'zlabel=s',
+             'title=s', 'xlen=f', 'ymin=f', 'ymax=f', 'xmin=s', 'xmax=s', 'y2min=f', 'y2max=f',
+             'zmin=f', 'zmax=f', 'y2=s@',
+             'style=s{2}', 'curvestyle=s{2}', 'curvestyleall=s', 'styleall=s', 'with=s', 'extracmds=s@', 'set=s@', 'unset=s@',
+             'square!', 'square_xy!', 'hardcopy=s', 'maxcurves=i', 'monotonic!', 'timefmt=s',
+             'histogram=s@', 'binwidth=f', 'histstyle=s',
+             'terminal=s',
+             'rangesize=s{2}', 'rangesizeall=i', 'extraValuesPerPoint=i',
+             'help', 'dump', 'exit', 'version',
+             'geometry=s') or pod2usage( -exitval => 1,
+                                         -verbose => 1, # synopsis and args
+                                         -output  => \*STDERR );
 
-  GetOptions($options, 'stream!', 'domain!', 'dataid!', '3d!', 'colormap!', 'lines!', 'points!',
-             'circles', 'legend=s%', 'autolegend!', 'xlabel=s', 'ylabel=s', 'y2label=s', 'zlabel=s',
-             'title=s', 'xlen=f', 'ymin=f', 'ymax=f', 'xmin=f', 'xmax=f', 'y2min=f', 'y2max=f',
-             'zmin=f', 'zmax=f', 'y2=s@', 'curvestyle=s%', 'curvestyleall=s', 'extracmds=s@',
-             'size=s', 'square!', 'square_xy!', 'hardcopy=s', 'maxcurves=i', 'monotonic!',
-             'extraValuesPerPoint=i', 'help', 'dump') or pod2usage(1);
 
   # handle various cmdline-option errors
-  if ( $options->{help} )
-  { pod2usage(0); }
+  if ( $options{help} )
+  {
+    pod2usage( -exitval => 0,
+               -verbose => 1, # synopsis and args
+               -output  => \*STDOUT );
+  }
+
+  if( $options{version} )
+  {
+    print "feedgnuplot version $VERSION\n";
+    exit 0;
+  }
+
+  # expand options that are given as comma-separated lists
+  for my $listkey (qw(histogram y2))
+  {
+    @{$options{$listkey}} = map split('\s*,\s*', $_), @{$options{$listkey}}
+      if defined $options{$listkey};
+  }
+
+  # --style and --curvestyle are synonyms, as are --styleall and
+  # --curvestyleall, so fill that in
+  if( $options{styleall} )
+  {
+    if($options{curvestyleall} )
+    {
+      $options{curvestyleall} .= " $options{styleall}";
+    }
+    else
+    {
+      $options{curvestyleall} = $options{styleall};
+    }
+  }
+  push @{$options{curvestyle}}, @{$options{style}};
+
+
+  # --legend and --curvestyle options are conceptually hashes, but are parsed as
+  # arrays in order to preserve the ordering. I parse both of these into hashes
+  # because those are useful to have later. After this I can access individual
+  # legends with $options{legend_hash}{curveid}
+  for my $listkey (qw(legend curvestyle rangesize))
+  {
+    $options{"${listkey}_hash"} = {};
+
+    my $n = scalar @{$options{$listkey}}/2;
+    foreach my $idx (0..$n-1)
+    {
+      $options{"${listkey}_hash"}{$options{$listkey}[$idx*2]} = $options{$listkey}[$idx*2 + 1];
+    }
+  }
+
+  if ( defined $options{hardcopy} && defined $options{stream} )
+  {
+    print STDERR "--stream doesn't make sense together with --hardcopy\n";
+    exit -1;
+  }
+
+  if ( defined $options{rangesizeall} && defined $options{extraValuesPerPoint} )
+  {
+    print STDERR "Only one of --rangesizeall and --extraValuesPerPoint may be given\n";
+    exit -1;
+  }
+
+
+  # I now set up the rangesize to always be
+  #  $options{rangesize_hash}{$id} // $options{rangesize_default}
+  if ( $options{rangesizeall} )
+  {
+      $options{rangesize_default} = $options{rangesizeall};
+  }
+  else
+  {
+      $options{rangesize_default} = 1;
+
+      $options{rangesize_default} += $options{extraValuesPerPoint} if ($options{extraValuesPerPoint});
+      $options{rangesize_default}++                                if ($options{colormap});
+      $options{rangesize_default}++                                if ($options{circles} );
+  }
 
-  $options->{curvestyleall} = '' unless defined $options->{curvestyleall};
 
-  if ($options->{colormap})
+  # parse stream option. Allowed only numbers >= 0 or 'trigger'. After this code
+  # $options{stream} is
+  #  -1 for triggered replotting
+  #  >0 for timed replotting
+  #  undef if not streaming
+  if(defined $options{stream})
+  {
+    # if no streaming period is given, default to 1Hz.
+    $options{stream} = 1 if $options{stream} eq '';
+
+    if( !looks_like_number $options{stream} )
+    {
+      if($options{stream} eq 'trigger')
+      {
+        $options{stream} = 0;
+      }
+      else
+      {
+        print STDERR "--stream can only take in values >=0 or 'trigger'\n";
+        exit -1;
+      }
+    }
+
+    if ( $options{stream} == 0 )
+    {
+      $options{stream} = -1;
+    }
+    elsif ( $options{stream} <= 0)
+    {
+      print STDERR "--stream can only take in values >=0 or 'trigger'\n";
+      exit -1;
+    }
+  }
+
+  if( $options{curvestyleall} && $options{with} )
+  {
+    print STDERR "--curvestyleall and --with are mutually exclusive. Please just use one.\n";
+    exit -1;
+  }
+  if( $options{with} )
+  {
+    $options{curvestyleall} = "with $options{with}";
+    $options{with} = '';
+  }
+
+  if ($options{colormap})
   {
     # colormap styles all curves with palette. Seems like there should be a way to do this with a
     # global setting, but I can't get that to work
-    $options->{curvestyleall} .= ' palette';
+    $options{curvestyleall} .= ' palette';
   }
 
-  if ( $options->{'3d'} )
+  if ( $options{'3d'} )
   {
-    if ( !$options->{domain} )
+    if ( !$options{domain} )
     {
       print STDERR "--3d only makes sense with --domain\n";
       exit -1;
     }
 
-    if ( defined $options->{y2min} || defined $options->{y2max} || defined $options->{y2} )
+    if ( $options{timefmt} )
+    {
+      print STDERR "--3d makes no sense with --timefmt\n";
+      exit -1;
+    }
+
+    if ( defined $options{y2min} || defined $options{y2max} || @{$options{y2}} )
     {
       print STDERR "--3d does not make sense with --y2...\n";
       exit -1;
     }
 
-    if ( defined $options->{xlen} )
+    if ( defined $options{xlen} )
     {
       print STDERR "--3d does not make sense with --xlen\n";
       exit -1;
     }
 
-    if ( defined $options->{monotonic} )
+    if ( defined $options{monotonic} )
     {
       print STDERR "--3d does not make sense with --monotonic\n";
       exit -1;
     }
+
+    if ( defined $options{binwidth} || @{$options{histogram}} )
+    {
+      print STDERR "--3d does not make sense with histograms\n";
+      exit -1;
+    }
+
+    if ( defined $options{circles} )
+    {
+      print STDERR "--3d does not make sense with circles (gnuplot doesn't support this)\n";
+      exit -1;
+    }
   }
   else
   {
-    if(!$options->{colormap})
+    if ( $options{timefmt} && !$options{domain} )
+    {
+      print STDERR "--timefmt makes sense only with --domain\n";
+      exit -1;
+    }
+
+    if(!$options{colormap})
     {
-      if ( defined $options->{zmin} || defined $options->{zmax} || defined $options->{zlabel} )
+      if ( defined $options{zmin} || defined $options{zmax} || defined $options{zlabel} )
       {
         print STDERR "--zmin/zmax/zlabel only makes sense with --3d or --colormap\n";
         exit -1;
       }
     }
 
-    if ( defined $options->{square_xy} )
+    if ( defined $options{square_xy} )
     {
       print STDERR "--square_xy only makes sense with --3d\n";
       exit -1;
     }
   }
 
-  if(defined $options{xlen} && !defined $options{stream} )
+  if(defined $options{xlen} && !$options{stream} )
   {
     print STDERR "--xlen does not make sense without --stream\n";
     exit -1;
   }
 
+  if($options{stream} && defined $options{xlen} &&
+     ( defined $options{xmin} || defined $options{xmax}))
+  {
+    print STDERR "With --stream and --xlen the X bounds are set, so neither --xmin nor --xmax make sense\n";
+    exit -1;
+  }
+
   # --xlen implies an order to the data, so I force monotonicity
-  $options{monotonic} = defined $options{xlen};
+  $options{monotonic} = 1 if defined $options{xlen};
+
+  if( $options{histstyle} !~ /freq|cum|uniq|cnorm/ )
+  {
+    print STDERR "unknown histstyle. Allowed are 'freq...', 'cum...', 'uniq...', 'cnorm...'\n";
+    exit -1;
+  }
+
+  # deal with timefmt
+  if ( $options{timefmt} )
+  {
+    # I need to compute a regex to match the time field and I need to count how
+    # many whitespace-separated fields there are.
+
+    # strip leading and trailing whitespace
+    $options{timefmt} =~ s/^\s*//;
+    $options{timefmt} =~ s/\s*$//;
+
+    my $Nfields = () = split /\s+/, $options{timefmt}, -1;
+    $options{timefmt_Ncols} = $Nfields;
+
+    # make sure --xlen is an integer. With a timefmt xlen goes through strptime
+    # and strftime, and those are integer-only
+    if( defined $options{xlen} )
+    {
+      if( $options{xlen} - int($options{xlen}) )
+      {
+        say STDERR "When streaming --xlen MUST be an integer. Rounding up to the nearest second";
+        $options{xlen} = 1 + int($options{xlen});
+      }
+    }
+  }
 }
 
 sub getGnuplotVersion
@@ -177,31 +405,60 @@ sub getGnuplotVersion
   return $gnuplotVersion;
 }
 
-sub plotThread
+sub plotUpdateThread
 {
   while(! $streamingFinished)
   {
-    sleep(1);
-    $dataQueue->enqueue('Plot now');
+    usleep( $options{stream} * 1e6 );
+
+    # indicate that the timer was the replot source
+    $dataQueue->enqueue('replot timertick');
   }
+}
 
-  $dataQueue->enqueue(undef);
+sub sendRangeCommand
+{
+  my ($name, $min, $max) = @_;
+
+  return unless defined $min || defined $max;
+
+  if( defined $min )
+  { $min = "\"$min\""; }
+  else
+  { $min = ''; }
+
+  if( defined $max )
+  { $max = "\"$max\""; }
+  else
+  { $max = ''; }
 
+  my $cmd = "set $name [$min:$max]\n";
+  print PIPE $cmd;
 }
 
-sub mainThread
+sub makeDomainNumeric
 {
-    my $valuesPerPoint = 1;
-    if($options{extraValuesPerPoint}) { $valuesPerPoint += $options{extraValuesPerPoint}; }
-    if($options{colormap})            { $valuesPerPoint++; }
-    if($options{circles} )            { $valuesPerPoint++; }
+  my ($domain0) = @_;
+
+  if ( $options{timefmt} )
+  {
+    my $timepiece = Time::Piece->strptime( $domain0, $options{timefmt} )
+      or die "Couldn't parse time format. String '$domain0' doesn't fit format '$options{timefmt}'";
+
+    return $timepiece->epoch();
+  }
+
+  return $domain0;
+}
 
+sub mainThread
+{
     local *PIPE;
     my $dopersist = '';
 
-    if($gnuplotVersion >= 4.3)
+    if( !$options{stream} && getGnuplotVersion() >= 4.3)
     {
-      $dopersist = '--persist' if(!$options{stream});
+      $dopersist = '--persist';
     }
 
     if(exists $options{dump})
@@ -210,51 +467,43 @@ sub mainThread
     }
     else
     {
-      open PIPE, "|gnuplot $dopersist" or die "Can't initialize gnuplot\n";
+      my $geometry = defined $options{geometry} ?
+        "-geometry $options{geometry}" : '';
+      open PIPE, "|gnuplot $geometry $dopersist" or die "Can't initialize gnuplot\n";
     }
     autoflush PIPE 1;
 
     my $outputfile;
     my $outputfileType;
-    if( $options{hardcopy})
+    if( defined $options{hardcopy})
     {
       $outputfile = $options{hardcopy};
-      ($outputfileType) = $outputfile =~ /\.(eps|ps|pdf|png)$/;
-      if(!$outputfileType) { die("Only .eps, .ps, .pdf and .png supported\n"); }
+      if( $outputfile =~ /^[^|]                       # starts with anything other than |
+                          .*                          # stuff in the middle
+                          \.(eps|ps|pdf|png|svg)$/ix) # ends with a known extension
+      {
+        $outputfileType = lc $1;
+      }
 
       my %terminalOpts =
       ( eps  => 'postscript solid color enhanced eps',
         ps   => 'postscript solid color landscape 10',
         pdf  => 'pdfcairo solid color font ",10" size 11in,8.5in',
-        png  => 'png size 1280,1024' );
-
-      print PIPE "set terminal $terminalOpts{$outputfileType}\n";
-      print PIPE "set output \"$outputfile\"\n";
-    }
-    else
-    {
-      print PIPE "set terminal x11\n";
-    }
+        png  => 'png size 1280,1024',
+        svg  => 'svg');
 
-    # If a bound isn't given I want to set it to the empty string, so I can communicate it simply to
-    # gnuplot
-    $options{xmin}  = '' unless defined $options{xmin};
-    $options{xmax}  = '' unless defined $options{xmax};
-    $options{ymin}  = '' unless defined $options{ymin};
-    $options{ymax}  = '' unless defined $options{ymax};
-    $options{y2min} = '' unless defined $options{y2min};
-    $options{y2max} = '' unless defined $options{y2max};
-    $options{zmin}  = '' unless defined $options{zmin};
-    $options{zmax}  = '' unless defined $options{zmax};
+      if( !defined $options{terminal} &&
+           defined $outputfileType    &&
+           $terminalOpts{$outputfileType} )
+      {
+        $options{terminal} = $terminalOpts{$outputfileType};
+      }
 
-    print PIPE "set xtics\n";
-    if($options{y2})
-    {
-      print PIPE "set ytics nomirror\n";
-      print PIPE "set y2tics\n";
-      # if any of the ranges are given, set the range
-      print PIPE "set y2range [". $options{y2min} . ":" . $options{y2max} ."]\n" if length( $options{y2min} . $options{y2max} );
+      die "Asked to plot to file '$outputfile', but I don't know which terminal to use, and no --terminal given"
+        unless $options{terminal};
     }
+    print PIPE "set terminal $options{terminal}\n" if $options{terminal};
+    print PIPE "set output \"$outputfile\"\n"      if $outputfile;
 
     # set up plotting style
     my $style = '';
@@ -265,94 +514,143 @@ sub mainThread
       $options{curvestyleall} = "with circles $options{curvestyleall}";
     }
 
-    # if any of the ranges are given, set the range
-    print PIPE "set xrange [". $options{xmin} . ":" . $options{xmax} ."]\n" if length( $options{xmin} . $options{xmax} );
-    print PIPE "set yrange [". $options{ymin} . ":" . $options{ymax} ."]\n" if length( $options{ymin} . $options{ymax} );
-    print PIPE "set zrange [". $options{zmin} . ":" . $options{zmax} ."]\n" if length( $options{zmin} . $options{zmax} );
     print PIPE "set style data $style\n" if $style;
     print PIPE "set grid\n";
 
-    print(PIPE "set xlabel  \"" . $options{xlabel } . "\"\n") if defined $options{xlabel};
-    print(PIPE "set ylabel  \"" . $options{ylabel } . "\"\n") if defined $options{ylabel};
-    print(PIPE "set zlabel  \"" . $options{zlabel } . "\"\n") if defined $options{zlabel};
-    print(PIPE "set y2label \"" . $options{y2label} . "\"\n") if defined $options{y2label};
-    print(PIPE "set title   \"" . $options{title  } . "\"\n") if defined $options{title};
+    print(PIPE "set xlabel  \"$options{xlabel }\"\n") if defined $options{xlabel};
+    print(PIPE "set ylabel  \"$options{ylabel }\"\n") if defined $options{ylabel};
+    print(PIPE "set zlabel  \"$options{zlabel }\"\n") if defined $options{zlabel};
+    print(PIPE "set y2label \"$options{y2label}\"\n") if defined $options{y2label};
+    print(PIPE "set title   \"$options{title  }\"\n") if defined $options{title};
 
     if($options{square})
     {
       # set a square aspect ratio. Gnuplot does this differently for 2D and 3D plots
       if(! $options{'3d'})
       {
-        $options{size} = '' unless defined $options{size};
-        $options{size} .= ' ratio -1';
+        print(PIPE "set size ratio -1\n");
       }
       else
       {
         print(PIPE "set view equal xyz\n");
       }
     }
-    print(PIPE "set size $options{size}\n")                     if defined $options{size};
 
     if($options{square_xy})
     {
       print(PIPE "set view equal xy\n");
     }
 
-    if($options{colormap})
-    {
-      print PIPE "set cbrange [". $options{zmin} . ":" . $options{zmax} ."]\n" if length( $options{zmin} . $options{zmax} );
-    }
-
 # For the specified values, set the legend entries to 'title "blah blah"'
-    if($options{legend})
+    if(@{$options{legend}})
     {
-      foreach my $id (keys %{$options{legend}})
+      # @{$options{legend}} is a list where consecutive pairs are (curveID,
+      # legend). I use $options{legend} here instead of $options{legend_hash}
+      # because I create a new curve when I see a new one, and the hash is
+      # unordered, thus messing up the ordering
+      my $n = scalar @{$options{legend}}/2;
+      foreach my $idx (0..$n-1)
       {
-        setCurveLabel($id, $options{legend}{$id});
+        setCurveLabel($options{legend}[$idx*2    ],
+                      $options{legend}[$idx*2 + 1]);
       }
     }
 
 # add the extra curve options
-    if($options{curvestyle})
+    if(@{$options{curvestyle}})
     {
-      foreach my $id (keys %{$options{curvestyle}})
+      # @{$options{curvestyle}} is a list where consecutive pairs are (curveID,
+      # style). I use $options{curvestyle} here instead of
+      # $options{curvestyle_hash} because I create a new curve when I see a new
+      # one, and the hash is unordered, thus messing up the ordering
+      my $n = scalar @{$options{curvestyle}}/2;
+      foreach my $idx (0..$n-1)
       {
-        addCurveOption($id, $options{curvestyle}{$id});
+        addCurveOption($options{curvestyle}[$idx*2    ],
+                       $options{curvestyle}[$idx*2 + 1]);
       }
     }
 
 # For the values requested to be printed on the y2 axis, set that
-    foreach (@{$options{y2}})
+    addCurveOption($_, 'axes x1y2') foreach (@{$options{y2}});
+
+# timefmt
+    if( $options{timefmt} )
     {
-      addCurveOption($_, 'axes x1y2 linewidth 3');
+      print(PIPE "set timefmt '$options{timefmt}'\n");
+      print(PIPE "set xdata time\n");
     }
 
 # add the extra global options
-    if($options{extracmds})
+    print(PIPE "$_\n")       foreach (@{$options{extracmds}});
+    print(PIPE "set $_\n")   foreach (@{$options{set}});
+    print(PIPE "unset $_\n") foreach (@{$options{unset}});
+
+# set up histograms
+    $options{binwidth} ||= 1;   # if no binwidth given, set it to 1
+    print PIPE
+      "set boxwidth $options{binwidth}\n" .
+      "histbin(x) = $options{binwidth} * floor(0.5 + x/$options{binwidth})\n";
+
+    setCurveAsHistogram( $_ ) foreach (@{$options{histogram}});
+
+# set all the axis ranges
+    # If a bound isn't given I want to set it to the empty string, so I can communicate it simply to
+    # gnuplot
+    print PIPE "set xtics\n";
+
+    if(@{$options{y2}})
     {
-      foreach (@{$options{extracmds}})
-      {
-        print(PIPE "$_\n");
-      }
+      print PIPE "set ytics nomirror\n";
+      print PIPE "set y2tics\n";
+      # if any of the ranges are given, set the range
+      sendRangeCommand( "y2range", $options{y2min}, $options{y2max} );
     }
 
-    # regexp for a possibly floating point, possibly scientific notation number
-    my $numRE   = '-?\d*\.?\d+(?:[Ee][-+]?\d+)?';
+    # if any of the ranges are given, set the range
+    sendRangeCommand( "xrange",  $options{xmin}, $options{xmax} );
+    sendRangeCommand( "yrange",  $options{ymin}, $options{ymax} );
+    sendRangeCommand( "zrange",  $options{zmin}, $options{zmax} );
+    sendRangeCommand( "cbrange", $options{zmin}, $options{zmax} ) if($options{colormap});
+
 
-    # a point may be preceded by an id
-    my $pointRE = $options{dataid} ? '(\w+)\s+' : '()';
-    $pointRE .= '(' . join('\s+', ($numRE) x $valuesPerPoint) . ')';
-    $pointRE = qr/$pointRE/;
 
+
+    # latest domain variable present in our data
+    my $latestX;
+
+    # The domain of the current point
     my @domain;
-    my $haveNewData;
+
+    # The x-axis domain represented as a number. This is exactly the same as
+    # $domain[0] unless the x-axis domain uses a timefmt. Then this is the
+    # number of seconds since the UNIX epoch.
+    my $domain0_numeric;
 
     # I should be using the // operator, but I'd like to be compatible with perl 5.8
     while( $_ = (defined $dataQueue ? $dataQueue->dequeue() : <>))
     {
       next if /^#/o;
 
-      if($_ ne 'Plot now')
+      if( $options{stream} )
+      {
+        if(/^clear/o )
+        {
+          clearCurves();
+          next;
+        }
+
+        if(/^replot/o )
+        {
+          # /timertick/ determines if the timer was the source of the replot
+          replot( $domain0_numeric, /timertick/ );
+          next;
+        }
+
+        # /exit/ is handled in the data-reading thread
+      }
+
+      if(! /^replot/o)
       {
         # parse the incoming data lines. The format is
         # x id0 dat0 id1 dat1 ....
@@ -364,14 +662,49 @@ sub mainThread
         # line is used)
         # 3d plots require $options{domain}, and dictate "x y" for the domain instead of just "x"
 
+        my @fields = split;
+
         if($options{domain})
         {
-          /($numRE)/go or next;
-          $domain[0] = $1;
-          if($options{'3d'})
+          if( $options{timefmt} )
           {
-            /($numRE)/go or next;
-            $domain[1] = $1;
+              # no point in doing anything unless I have at least the domain and
+              # 1 piece of data
+              next if @fields < $options{timefmt_Ncols}+1;
+
+              $domain[0] = join (' ', splice( @fields, 0, $options{timefmt_Ncols}) );
+              $domain0_numeric = makeDomainNumeric( $domain[0] );
+          }
+          elsif(!$options{'3d'})
+          {
+              # no point in doing anything unless I have at least the domain and
+              # 1 piece of data
+              next if @fields < 1+1;
+
+              $domain[0] = $domain0_numeric = shift @fields;
+          }
+          else
+          {
+              # no point in doing anything unless I have at least the domain and
+              # 1 piece of data
+              next if @fields < 2+1;
+
+              @domain = splice(@fields, 0, 2);
+          }
+
+          if( $options{monotonic} )
+          {
+            if( defined $latestX && $domain0_numeric < $latestX )
+            {
+              # the x-coordinate of the new point is in the past, so I wipe out
+              # all the data and start anew. Before I wipe the old data, I
+              # replot the old data
+              replot( $domain0_numeric );
+              clearCurves();
+              $latestX = undef;
+            }
+            else
+            { $latestX = $domain0_numeric; }
           }
         }
         else
@@ -380,53 +713,67 @@ sub mainThread
           # $. on the data queue in that case
           if(defined $dataQueue)
           {
-            s/ ([\d]+)$//o;
-            $domain[0] = $1;
+            $domain[0] = pop @fields;
           }
           else
           {
             $domain[0] = $.;
           }
+          $domain0_numeric = makeDomainNumeric( $domain[0] );
         }
 
         my $id = -1;
-        while (/$pointRE/go)
-        {
-          if($1 ne '') {$id = $1;}
-          else         {$id++;   }
-
-          $haveNewData = 1;
-          pushPoint(getCurve($id),
-                    [@domain, split( /\s+/, $2)]);
-        }
-      }
-
-      elsif($options{stream})
-      {
-        # only redraw a streaming plot if there's new data to plot
-        next unless $haveNewData;
-        $haveNewData = undef;
 
-        if( $options{xlen} )
+        while(@fields)
         {
-          pruneOldData($domain[0] - $options{xlen});
-          plotStoredData($domain[0] - $options{xlen}, $domain[0]);
+            if($options{dataid})
+            {
+                $id = shift @fields;
+            }
+            else
+            {
+                $id++;
+            }
+
+            # I'd like to use //, but I guess some people are still on perl 5.8
+            my $rangesize = exists $options{rangesize_hash}{$id} ?
+              $options{rangesize_hash}{$id} :
+              $options{rangesize_default};
+
+            last if @fields < $rangesize;
+
+            pushPoint(getCurve($id),
+                      join(' ',
+                           @domain,
+                           splice( @fields, 0, $rangesize ) ) . "\n",
+                      $domain0_numeric);
         }
-        else
-        { plotStoredData(); }
       }
     }
 
+    # if we were streaming, we're now done!
+    if( $options{stream} )
+    {
+      return;
+    }
+
     # finished reading in all. Plot what we have
     plotStoredData();
 
-    if ( $options{hardcopy})
+    if ( defined $options{hardcopy})
     {
       print PIPE "set output\n";
-      # sleep until the plot file exists, and it is closed. Sometimes the output is
-      # still being written at this point
-      usleep(100_000) until -e $outputfile;
-      usleep(100_000) until(system("fuser -s \"$outputfile\""));
+
+      # sleep until the plot file exists, and it is closed. Sometimes the output
+      # is still being written at this point. If the output filename starts with
+      # '|', gnuplot pipes the output to that process, instead of writing to a
+      # file. In that case I don't make sure the file exists, since there IS no
+      # file
+      if( $options{hardcopy} !~ /^\|/ )
+      {
+        usleep(100_000) until -e $outputfile;
+        usleep(100_000) until(system("fuser -s \"$outputfile\""));
+      }
 
       print "Wrote output to $outputfile\n";
       return;
@@ -435,46 +782,53 @@ sub mainThread
     # we persist gnuplot, so we shouldn't need this sleep. However, once
     # gnuplot exits, but the persistent window sticks around, you can no
     # longer interactively zoom the plot. So we still sleep
-    sleep(100000);
+    sleep(100000) unless $options{dump} || $options{exit};
 }
 
 sub pruneOldData
 {
   my ($oldestx) = @_;
 
-  foreach my $xy (@curves)
+  foreach my $curve (@curves)
   {
-    if( @$xy > 1 )
+    next unless $curve->{datastring};
+
+    my $meta = $curve->{datastring_meta};
+
+    my $firstInWindow = first {$meta->[$_]{domain} >= $oldestx} 0..$#$meta;
+    if ( !defined $firstInWindow )
     {
-      if( my $firstInWindow = first {$xy->[$_][0] >= $oldestx} 1..$#$xy )
-      { splice( @$xy, 1, $firstInWindow-1 ); }
-      else
-      { splice( @$xy, 1); }
+      # everything is too old. Clear out all the data
+      $curve->{datastring}        = '';
+      $curve->{datastring_meta}   = [];
+      $curve->{datastring_offset} = 0;
+    }
+    elsif ( $firstInWindow >= 2 )
+    {
+      # clear out everything that's too old, except for one point. This point
+      # will be off the plot, but if we're plotting lines there will be a
+      # connecting line to it. Some of the line will be visible
+      substr( $curve->{datastring}, 0,
+              $meta->[$firstInWindow-1]{offset_start} - $curve->{datastring_offset},
+              '' );
+      $curve->{datastring_offset} = $meta->[$firstInWindow-1]{offset_start};
     }
   }
 }
 
 sub plotStoredData
 {
-  my ($xmin, $xmax) = @_;
-  print PIPE "set xrange [$xmin:$xmax]\n" if defined $xmin;
+  # get the options for those curves that have any data
+  my @nonemptyCurves = grep { $_->{datastring} } @curves;
+  my @extraopts = map {$_->{options}} @nonemptyCurves;
 
-  # get the options for those curves that have any data
-  my @nonemptyCurves = grep {@$_ > 1} @curves;
-  my @extraopts = map {$_->[0]{options}} @nonemptyCurves;
-
-  my $body = join(', ' , map({ '"-"' . $_} @extraopts) );
+  my $body = join(', ' , map({ "'-' $_" } @extraopts) );
   if($options{'3d'}) { print PIPE "splot $body\n"; }
   else               { print PIPE  "plot $body\n"; }
 
-  foreach my $buf (@nonemptyCurves)
+  foreach my $curve (@nonemptyCurves)
   {
-    # send each point to gnuplot. Ignore the first "point" since it's the
-    # curve options
-    for my $elem (@{$buf}[1..$#$buf])
-    {
-      print PIPE "@$elem\n";
-    }
+    print PIPE $curve->{datastring};
     print PIPE "e\n";
   }
 }
@@ -486,19 +840,51 @@ sub updateCurveOptions
   # case. When no title is specified, gnuplot will still add a legend entry with an unhelpful '-'
   # label. Thus I explicitly do 'notitle' for that case
 
-  my ($curveoptions, $id) = @_;
+  my ($curve, $id) = @_;
 
   # use the given title, unless we're generating a legend automatically. Given titles
   # override autolegend
   my $title;
-  if(defined $curveoptions->{title})
-  { $title = $curveoptions->{title}; }
+  if(defined $curve->{title})
+  { $title = $curve->{title}; }
   elsif( $options{autolegend} )
   { $title = $id; }
 
   my $titleoption = defined $title ? "title \"$title\"" : "notitle";
-  my $extraoption = defined $options{curvestyleall} ? $options{curvestyleall} : '';
-  $curveoptions->{options} = "$titleoption $curveoptions->{extraoptions} $extraoption";
+
+  my ($curvestyleall);
+  if( defined $options{curvestyle_hash}{$id} )
+  {
+    # I have a curve-specific style set with --curvestyle. This style lives in
+    # $curve->{extraoptions}, and it overrides the global styles
+    $curvestyleall = '';
+  }
+  else
+  {
+    $curvestyleall = $options{curvestyleall};
+  }
+
+  my $histoptions = $curve->{histoptions} || '';
+
+  my $usingoptions = '';
+  if( $options{timefmt} )
+  {
+      # with --timefmt I need an explicit 'using' specification. I specify the
+      # columns as 1:2:3..... I need the right number of columns (this is given
+      # as 1 + rangesize). I also need to start the range at the first column
+      # past the timefmt
+
+      # I'd like to use //, but I guess some people are still on perl 5.8
+      my $rangesize = exists $options{rangesize_hash}{$id} ?
+        $options{rangesize_hash}{$id} :
+        $options{rangesize_default};
+
+      my @rest = map {$_ + $options{timefmt_Ncols}} (1..$rangesize);
+
+      $usingoptions = "using 1:" . join(':', @rest);
+  }
+
+  $curve->{options} = "$histoptions $usingoptions $titleoption $curve->{extraoptions} $curvestyleall";
 }
 
 sub getCurve
@@ -510,17 +896,20 @@ sub getCurve
   {
     print STDERR "Tried to exceed the --maxcurves setting.\n";
     print STDERR "Invoke with a higher --maxcurves limit if you really want to do this.\n";
-    exit;
+    exit -1;
   }
 
   my ($id) = @_;
 
   if( !exists $curveIndices{$id} )
   {
-    push @curves, [{extraoptions => ' '}]; # push a curve with no data and no options
+    push @curves, {extraoptions      => ' ',
+                   datastring        => '',
+                   datastring_meta   => [],
+                   datastring_offset => 0}; # push a curve with no data and no options
     $curveIndices{$id} =  $#curves;
 
-    updateCurveOptions($curves[$#curves][0], $id);
+    updateCurveOptions($curves[$#curves], $id);
   }
   return $curves[$curveIndices{$id}];
 }
@@ -530,8 +919,8 @@ sub addCurveOption
   my ($id, $str) = @_;
 
   my $curve = getCurve($id);
-  $curve->[0]{extraoptions} .= "$str ";
-  updateCurveOptions($curve->[0], $id);
+  $curve->{extraoptions} .= "$str ";
+  updateCurveOptions($curve, $id);
 }
 
 sub setCurveLabel
@@ -539,37 +928,114 @@ sub setCurveLabel
   my ($id, $str) = @_;
 
   my $curve = getCurve($id);
-  $curve->[0]{title} = $str;
-  updateCurveOptions($curve->[0], $id);
+  $curve->{title} = $str;
+  updateCurveOptions($curve, $id);
 }
 
-# function to add a point to the plot. Assumes that the curve indexed by $idx already exists
-sub pushPoint
+sub setCurveAsHistogram
 {
-  my ($curve, $xy) = @_;
+  my ($id, $str) = @_;
+
+  my $curve = getCurve($id);
+  $curve->{histoptions} = 'using (histbin($2)):(1.0) smooth ' . $options{histstyle};
+
+  updateCurveOptions($curve, $id);
+}
+
+# remove all the curve data
+sub clearCurves
+{
+  foreach my $curve(@curves)
+  {
+    $curve->{datastring}        = '';
+    $curve->{datastring_meta}   = [];
+    $curve->{datastring_offset} = 0;
+  }
+}
+
+sub replot
+{
+  return unless $haveNewData;
+  $haveNewData = undef;
+
+  return if !$options{stream};
 
-  if($options{monotonic})
+
+  # The logic involving domain rollover replotting due to --monotonic is a bit
+  # tricky. I want this:
+
+  # if( domain rolls over slowly )
+  # {
+  #   should update on a timer;
+  #   when the domain rolls over, --monotonic should force a replot
+  # }
+  # if( domain rolls over quickly )
+  # {
+  #   should update when the domain rolls over,
+  #     at most as quickly as the timer indicates
+  # }
+
+
+  my ($domain0_numeric, $replot_is_from_timer) = @_;
+
+  my $now = [gettimeofday];
+
+  if( # If there is no replot timer at all, replot at any indication
+      $options{stream} < 0 ||
+
+      # if the last replot was timer-based, but this one isn't, force a replot.
+      # This makes sure that a replot happens for a domain rollover shortly
+      # after a timer replot
+      !$replot_is_from_timer && $last_replot_is_from_timer ||
+
+      # if enough time has elapsed since the last replot, it's ok to replot
+      tv_interval ( $last_replot_time, $now ) > 0.8*$options{stream} )
   {
-    if( @$curve > 1 && $xy->[0] < $curve->[$#{$curve}][0] )
+    # ok, then. We really need to replot
+    if ( defined $options{xlen} )
     {
-      # the x-coordinate of the new point is in the past, so I wipe out all the data for this curve
-      # and start anew
-      splice( @$curve, 1, @$curve-1 );
+      # we have an --xlen, so we need to clean out the old data
+      pruneOldData( $domain0_numeric - $options{xlen} );
+
+      my ($xmin, $xmax) = ($domain0_numeric - $options{xlen}, $domain0_numeric);
+      if ( defined $options{timefmt} )
+      {
+        # if we're using a timefmt, I need to convert my xmin range from
+        # seconds-since-the-epoch BACK to the timefmt. Sheesh
+        ($xmin, $xmax) = map {Time::Piece->strptime( $_, '%s' )->strftime( $options{timefmt} ) } ($xmin, $xmax);
+      }
+      sendRangeCommand( "xrange", $xmin, $xmax );
     }
+
+    plotStoredData();
+
+
+    # update replot state
+    $last_replot_time          = $now;
+    $last_replot_is_from_timer = $replot_is_from_timer;
   }
+}
+
+# function to add a point to the plot. Assumes that the curve indexed by $idx already exists
+sub pushPoint
+{
+  my ($curve, $datastring, $domain0_numeric) = @_;
+
+  push @{$curve->{datastring_meta}}, { offset_start => length( $curve->{datastring} ) + $curve->{datastring_offset},
+                                       domain       => $domain0_numeric };
+  $curve->{datastring} .= $datastring;
 
-  push @$curve, $xy;
+  $haveNewData = 1;
 }
 
-__END__
 
 =head1 NAME
 
-feedGnuplot - A pipe-oriented frontend to Gnuplot
+feedgnuplot - General purpose pipe-oriented plotting tool
 
 =head1 SYNOPSIS
 
-Simple plotting of stored data:
+Simple plotting of piped data:
 
  $ seq 5 | awk '{print 2*$1, $1*$1}'
  2 1
@@ -579,14 +1045,55 @@ Simple plotting of stored data:
  10 25
 
  $ seq 5 | awk '{print 2*$1, $1*$1}' |
-   feedGnuplot --lines --points --legend 0="data 0" --title "Test plot" --y2 1
+   feedgnuplot --lines --points --legend 0 "data 0" --title "Test plot" --y2 1
+               --terminal 'dumb 80,40' --exit
+
+                                  Test plot
+
+  10 ++------+--------+-------+-------+-------+--------+-------+------*A 25
+     +       +        +       +       +       +        +       +    **#+
+     |       :        :       :       :       :        : data 0+**A*** |
+     |       :        :       :       :       :        :       :** #   |
+   9 ++.......................................................**.##....|
+     |       :        :       :       :       :        :    ** :#      |
+     |       :        :       :       :       :        :  **   #       |
+     |       :        :       :       :       :        :**   ##:      ++ 20
+   8 ++................................................A....#..........|
+     |       :        :       :       :       :      **:   #   :       |
+     |       :        :       :       :       :    **  : ##    :       |
+     |       :        :       :       :       :  **    :#      :       |
+     |       :        :       :       :       :**      B       :       |
+   7 ++......................................**......##................|
+     |       :        :       :       :    ** :    ##  :       :      ++ 15
+     |       :        :       :       :  **   :   #    :       :       |
+     |       :        :       :       :**     : ##     :       :       |
+   6 ++..............................*A.......##.......................|
+     |       :        :       :    ** :     ##:        :       :       |
+     |       :        :       :  **   :    #  :        :       :       |
+     |       :        :       :**     :  ##   :        :       :      ++ 10
+   5 ++......................**........##..............................|
+     |       :        :    ** :      #B       :        :       :       |
+     |       :        :  **   :    ## :       :        :       :       |
+     |       :        :**     :  ##   :       :        :       :       |
+   4 ++...............A.......###......................................|
+     |       :      **:     ##:       :       :        :       :       |
+     |       :    **  :   ##  :       :       :        :       :      ++ 5
+     |       :  **    : ##    :       :       :        :       :       |
+     |       :**    ##B#      :       :       :        :       :       |
+   3 ++.....**..####...................................................|
+     |    **####      :       :       :       :        :       :       |
+     |  **## :        :       :       :       :        :       :       |
+     B**     +        +       +       +       +        +       +       +
+   2 A+------+--------+-------+-------+-------+--------+-------+------++ 0
+     1      1.5       2      2.5      3      3.5       4      4.5      5
+
 
 Simple real-time plotting example: plot how much data is received on the wlan0
 network interface in bytes/second (uses bash, awk and Linux):
 
  $ while true; do sleep 1; cat /proc/net/dev; done |
-   awk '/wlan0/ {if(b) {print $2-b; fflush()} b=$2}' |
-   feedGnuplot --lines --stream --xlen 10 --ylabel 'Bytes/sec' --xlabel seconds
+   gawk '/wlan0/ {if(b) {print $2-b; fflush()} b=$2}' |
+   feedgnuplot --lines --stream --xlen 10 --ylabel 'Bytes/sec' --xlabel seconds
 
 =head1 DESCRIPTION
 
@@ -595,23 +1102,31 @@ plots from data coming in on STDIN or given in a filename passed on the
 commandline. Various data representations are supported, as is hardcopy
 output and streaming display of live data. A simple example:
 
- $ seq 5 | awk '{print 2*$1, $1*$1}' | feedGnuplot
+ $ seq 5 | awk '{print 2*$1, $1*$1}' | feedgnuplot
 
 You should see a plot with two curves. The C<awk> command generates some data to
-plot and the C<feedGnuplot> reads it in from STDIN and generates the plot. The
+plot and the C<feedgnuplot> reads it in from STDIN and generates the plot. The
 C<awk> invocation is just an example; more interesting things would be plotted
 in normal usage. No commandline-options are required for the most basic
 plotting. Input parsing is flexible; every line need not have the same number of
 points. New curves will be created as needed.
 
 The most commonly used functionality of gnuplot is supported directly by the
-script. Anything not directly supported can still be done with the
-C<--extracmds> and C<--curvestyle> options. Arbitrary gnuplot commands can be
-passed in with C<--extracmds>. For example, to turn off the grid, pass in
-C<--extracmds 'unset grid'>. As many of these options as needed can be passed
-in. To add arbitrary curve styles, use C<--curvestyle curveID=extrastyle>. Pass
-these more than once to affect more than one curve. To apply an extra style to
-I<all> the curves, pass in C<--curvestyleall extrastyle>.
+script. Anything not directly supported can still be done with options such as
+C<--set>, C<--extracmds>, C<--style>, etc. Arbitrary gnuplot commands can be
+passed in with C<--extracmds>. For example, to turn off the grid, you can pass
+in C<--extracmds 'unset grid'>. Options C<--set> and C<--unset> exist to
+provide nicer syntax, so this is equivalent to passing C<--unset grid>. As many
+of these options as needed can be passed in. To add arbitrary curve styles, use
+C<--style curveID extrastyle>. Pass these more than once to affect more than one
+curve.
+
+To apply an extra style to I<all> the curves that lack an explicit C<--style>,
+pass in C<--styleall extrastyle>. In the most common case, the extra style is
+C<with something>. To support this more simply, you can pass in C<--with
+something> instead of C<--styleall 'with something'>. C<--styleall> and
+C<--with> are mutually exclusive. Furthermore any curve-specific C<--style>
+overrides the global C<--styleall> or C<--with> setting.
 
 =head2 Data formats
 
@@ -627,9 +1142,9 @@ interpreted as the I<X>-value for the rest of the data on that line. Without
 C<--domain> the I<X>-value is the line number, and the first value on a line is
 a plain data point like the others. Default is C<--nodomain>. Thus the original
 example above produces 2 curves, with B<1,2,3,4,5> as the I<X>-values. If we run
-the same command with --domain:
+the same command with C<--domain>:
 
- $ seq 5 | awk '{print 2*$1, $1*$1}' | feedGnuplot --domain
+ $ seq 5 | awk '{print 2*$1, $1*$1}' | feedgnuplot --domain
 
 we get only 1 curve, with B<2,4,6,8,10> as the I<X>-values. As many points as
 desired can appear on a single line, but all points on a line are associated
@@ -642,7 +1157,7 @@ data is to be plotted. With the C<--dataid> option, each point is represented by
 2 values: a string identifying the curve, and the value itself. If we add
 C<--dataid> to the original example:
 
- $ seq 5 | awk '{print 2*$1, $1*$1}' | feedGnuplot --dataid --autolegend
+ $ seq 5 | awk '{print 2*$1, $1*$1}' | feedgnuplot --dataid --autolegend
 
 we get 5 different curves with one point in each. The first column, as produced
 by C<awk>, is B<2,4,6,8,10>. These are interpreted as the IDs of the curves to
@@ -654,18 +1169,24 @@ conjunction with C<--dataid>.
 =head3 Multi-value style support
 
 Depending on how gnuplot is plotting the data, more than one value may be needed
-to represent a single point. For example, the script has support to plot all the
-data with C<--circles>. This requires a radius to be specified for each point in
-addition to the position of the point. Thus, when plotting with C<--circles>, 2
-numbers are read for each data point instead of 1. A similar situation exists
-with C<--colormap> where each point contains the position I<and> the
-color. There are other gnuplot styles that require more data (such as error
-bars), but none of these are directly supported by the script. They can still be
-used, though, by specifying the specific style with C<--curvestyle>, and
-specifying how many extra values are needed for each point with
-C<--extraValuesPerPoint extra>. C<--extraValuesPerPoint> is ONLY needed for the
-styles not explicitly supported; supported styles set that variable
-automatically.
+to represent the range of a single point. Basic 2D plots have 2 numbers
+representing each point: 1 domain and 1 range. But if plotting with
+C<--circles>, for instance, then there's an extra range value: the radius. A
+similar situation exists with C<--colormap> where each point contains the
+position I<and> the color. There are other gnuplot styles that require more data
+(such as error bars), but none of these are directly supported by the script.
+They can still be used, however, by specifying the specific style with
+C<--style>, and specifying how many values are needed for each point with
+C<--rangesizeall> or C<--rangesize> or C<--extraValuesPerPoint>. Those options
+that specify the range size are required I<only> for styles not explicitly
+supported by feedgnuplot; supported styles do the right thing automatically.
+
+More examples: if making a 2d plot of y error bars where gnuplot expects a
+(x,y,ydelta) tuple for each point, you want C<--rangesizeall 2> because you have
+one domain value (x) and 2 range values (y,ydelta). Gnuplot can also plot
+lopsided y errorbars by giving a tuple (x,y,ylow,yhigh). This is similar to
+before, but you want C<--rangesizeall 3> instead.
+
 
 =head3 3D data
 
@@ -676,21 +1197,96 @@ instead of I<Y> as a function of I<X>). Thus the first 2 values on each line are
 interpreted as the domain instead of just 1. The rest of the processing happens
 the same way as before.
 
+=head3 Time/date data
+
+If the input data domain is a time/date, this can be interpreted with
+C<--timefmt>. This option takes a single argument: the format to use to parse
+the data. The format is documented in 'set timefmt' in gnuplot, although the
+common flags that C<strftime> understands are generally supported. The backslash
+sequences in the format are I<not> supported, so if you want a tab, put in a tab
+instead of \t. Whitespace in the format I<is> supported. When this flag is
+given, some other options act a little bit differently:
+
+=over
+
+=item
+
+C<--xlen> is an I<integer> in seconds
+
+=item
+
+C<--xmin> and C<--xmax> I<must> use the format passed in to C<--timefmt>
+
+=back
+
+Using this option changes both the way the input is parsed I<and> the way the
+x-axis tics are labelled. Gnuplot tries to be intelligent in this labelling, but
+it doesn't always do what the user wants. The labelling can be controlled with
+the gnuplot C<set format> command, which takes the same type of format string as
+C<--timefmt>. Example:
+
+ $ sar 1 -1 |
+   awk '$1 ~ /..:..:../ && $8 ~/^[0-9\.]*$/ {print $1,$8; fflush()}' |
+   feedgnuplot --stream --domain
+                --lines --timefmt '%H:%M:%S'
+                --set 'format x "%H:%M:%S"'
+
+This plots the 'idle' CPU consumption against time.
+
+Note that while gnuplot supports the time/date on any axis, I<feedgnuplot>
+currently supports it I<only> as the x-axis domain. This may change in the
+future.
+
 =head2 Real-time streaming data
 
-To plot real-time data, pass in the C<--stream> option. Data will then be
-plotted as it is received, with the refresh rate limited to 1Hz (currently
-hard-coded). To plot only the most recent data (instead of I<all> the data),
-C<--xlen windowsize> can be given. This will create an constantly-updating,
-scrolling view of the recent past. C<windowsize> should be replaced by the
-desired length of the domain window to plot, in domain units (passed-in values
-if C<--domain> or line numbers otherwise).
+To plot real-time data, pass in the C<--stream [refreshperiod]> option. Data
+will then be plotted as it is received. The plot will be updated every
+C<refreshperiod> seconds. If the period isn't specified, a 1Hz refresh rate is
+used. To refresh at specific intervals indicated by the data, set the
+refreshperiod to 0 or to 'trigger'. The plot will then I<only> be refreshed when
+a data line 'replot' is received. This 'replot' command works in both triggered
+and timed modes, but in triggered mode, it's the only way to replot. Look in
+L</"Special data commands"> for more information.
+
+To plot only the most recent data (instead of I<all> the data), C<--xlen
+windowsize> can be given. This will create a constantly-updating, scrolling
+view of the recent past. C<windowsize> should be replaced by the desired length
+of the domain window to plot, in domain units (passed-in values if C<--domain>
+or line numbers otherwise). If the domain is a time/date via C<--timefmt>, then
+C<windowsize> is an I<integer> in seconds.
+
+=head3 Special data commands
+
+If we are reading streaming data, the input stream can contain special commands
+in addition to the raw data. Feedgnuplot looks for these at the start of every
+input line. If a command is detected, the rest of the line is discarded. These
+commands are
+
+=over
+
+=item C<replot>
+
+This command refreshes the plot right now, instead of waiting for the next
+refresh time indicated by the timer. This command works in addition to the timed
+refresh, as indicated by C<--stream [refreshperiod]>.
+
+=item C<clear>
+
+This command clears out the current data in the plot. The plotting process
+continues, however, to any data following the C<clear>.
+
+=item C<exit>
+
+This command causes feedgnuplot to exit.
+
+=back
 
 =head2 Hardcopy output
 
 The script is able to produce hardcopy output with C<--hardcopy outputfile>. The
-output type is inferred from the filename with B<.ps>, B<.eps>, B<.pdf> and
-B<.png> currently supported.
+output type can be inferred from the filename, if B<.ps>, B<.eps>, B<.pdf>,
+B<.svg> or B<.png> is requested. If any other file type is requested,
+C<--terminal> I<must> be passed in to tell gnuplot how to make the plot.
 
 =head2 Self-plotting data files
 
@@ -702,7 +1298,7 @@ doing this: with a shebang (#!) or with inline perl data.
 A self-plotting, executable data file C<data> is formatted as
 
  $ cat data
- #!/usr/bin/feedGnuplot --lines --points
+ #!/usr/bin/feedgnuplot --lines --points
  2 1
  4 4
  6 9
@@ -724,10 +1320,10 @@ data file can be plotted simply with
 
  $ ./data
 
-The caveats here are that on Linux the whole #! line is limited to 127 charaters
-and that the full path to feedGnuplot must be given. The 127 character limit is
-a serious limitation, but this can likely be resolved with a kernel patch. I
-have only tried on Linux 2.6.
+The caveats here are that on Linux the whole #! line is limited to 127
+characters and that the full path to feedgnuplot must be given. The 127
+character limit is a serious limitation, but this can likely be resolved with a
+kernel patch. I have only tried on Linux 2.6.
 
 =head3 Self-plotting data with perl inline data
 
@@ -739,7 +1335,7 @@ create self-plotting files:
  use strict;
  use warnings;
 
- open PLOT, "| feedGnuplot --lines --points" or die "Couldn't open plotting pipe";
+ open PLOT, "| feedgnuplot --lines --points" or die "Couldn't open plotting pipe";
  while( <DATA> )
  {
    my @xy = split;
@@ -763,127 +1359,407 @@ create self-plotting files:
  30 225
 
 This is especially useful if the logged data is not in a format directly
-supported by feedGnuplot. Raw data can be stored after the __DATA__ directive,
+supported by feedgnuplot. Raw data can be stored after the __DATA__ directive,
 with a small perl script to manipulate the data into a useable format and send
 it to the plotter.
 
 =head1 ARGUMENTS
 
-  --[no]domain         If enabled, the first element of each line is the
-                       domain variable.  If not, the point index is used
+=over
+
+=item
+
+C<--[no]domain>
+
+If enabled, the first element of each line is the domain variable. If not, the
+point index is used
+
+=item
+
+C<--[no]dataid>
+
+If enabled, each data point is preceded by the ID of the data set that point
+corresponds to. This ID is interpreted as a string, NOT as just a number. If not
+enabled, the order of the point is used.
+
+As an example, if line 3 of the input is "0 9 1 20" then
+
+=over
+
+=item
+
+C<--nodomain --nodataid> would parse the 4 numbers as points in 4 different
+curves at x=3
+
+=item
+
+C<--domain --nodataid> would parse the 4 numbers as points in 3 different
+curves at x=0. Here, 0 is the x-variable and 9,1,20 are the data values
+
+=item
+
+C<--nodomain --dataid> would parse the 4 numbers as points in 2 different
+curves at x=3. Here 0 and 1 are the data IDs and 9 and 20 are the
+data values
+
+=item
+
+C<--domain --dataid> would parse the 4 numbers as a single point at
+x=0. Here 9 is the data ID and 1 is the data value. 20 is an extra
+value, so it is ignored. If another value followed 20, we'd get another
+point in curve ID 20
+
+=back
+
+=item
+
+C<--[no]3d>
+
+Do [not] plot in 3D. This only makes sense with C<--domain>. Each domain here is
+an (x,y) tuple
+
+=item
+
+C<--timefmt [format]>
+
+Interpret the X data as a time/date, parsed with the given format
+
+=item
+
+C<--colormap>
+
+Show a colormapped xy plot. Requires extra data for the color. zmin/zmax can be
+used to set the extents of the colors. Automatically sets the C<--rangesize>.
+
+=item
+
+C<--stream [period]>
+
+Plot the data as it comes in, in realtime. If period is given, replot every
+period seconds. If no period is given, replot at 1Hz. If the period is given as
+0 or 'trigger', replot I<only> when the incoming data dictates this. See the
+L</"Real-time streaming data"> section of the man page.
+
+=item
+
+C<--[no]lines>
+
+Do [not] draw lines to connect consecutive points
+
+=item
+
+C<--[no]points>
 
-  --[no]dataid         If enabled, each data point is preceded by the ID
-                       of the data set that point corresponds to. This ID is
-                       interpreted as a string, NOT as just a number. If not
-                       enabled, the order of the point is used.
+Do [not] draw points
 
-As an example, if line 3 of the input is "0 9 1 20"
- '--nodomain --nodataid' would parse the 4 numbers as points in 4
-   different curves at x=3
-
- '--domain --nodataid' would parse the 4 numbers as points in 3 different
-   curves at x=0. Here, 0 is the x-variable and 9,1,20 are the data values
-
- '--nodomain --dataid' would parse the 4 numbers as points in 2 different
-   curves at x=3. Here 0 and 1 are the data IDs and 9 and 20 are the
-   data values
+=item
 
- '--domain --dataid' would parse the 4 numbers as a single point at
-   x=0. Here 9 is the data ID and 1 is the data value. 20 is an extra
-   value, so it is ignored. If another value followed 20, we'd get another
-   point in curve ID 20
+C<--circles>
 
-  --[no]3d             Do [not] plot in 3D. This only makes sense with --domain.
-                       Each domain here is an (x,y) tuple
+Plot with circles. This requires a radius be specified for each point.
+Automatically sets the C<--rangesize>. I<Not> supported for 3d plots.
 
-  --colormap           Show a colormapped xy plot. Requires extra data for the color.
-                       zmin/zmax can be used to set the extents of the colors.
-                       Automatically increments extraValuesPerPoint
+=item
 
-  --[no]stream         Do [not] display the data a point at a time, as it
-                       comes in
+C<--title xxx>
 
-  --[no]lines          Do [not] draw lines to connect consecutive points
-  --[no]points         Do [not] draw points
-  --circles            Plot with circles. This requires a radius be specified for
-                       each point. Automatically increments extraValuesPerPoint
+Set the title of the plot
 
-  --xlabel xxx         Set x-axis label
-  --ylabel xxx         Set y-axis label
-  --y2label xxx        Set y2-axis label. Does not apply to 3d plots
-  --zlabel xxx         Set y-axis label. Only applies to 3d plots
+=item
 
-  --title  xxx         Set the title of the plot
+C<--legend curveID legend>
 
-  --legend curveID=legend
-                       Set the label for a curve plot. Use this option multiple times
-                       for multiple curves. With --dataid, curveID is the ID. Otherwise,
-                       it's the index of the curve, starting at 0
+Set the label for a curve plot. Use this option multiple times for multiple
+curves. With C<--dataid>, curveID is the ID. Otherwise, it's the index of the
+curve, starting at 0
 
-  --autolegend         Use the curve IDs for the legend. Titles given with --legend
-                       override these
+=item
 
-  --xlen xxx           When using --stream, sets the size of the x-window to plot.
-                       Omit this or set it to 0 to plot ALL the data. Does not
-                       make sense with 3d plots. Implies --monotonic
+C<--autolegend>
 
-  --xmin  xxx          Set the range for the x axis. These are ignored in a
-                       streaming plot
-  --xmax  xxx          Set the range for the x axis. These are ignored in a
-                       streaming plot
-  --ymin  xxx          Set the range for the y axis.
-  --ymax  xxx          Set the range for the y axis.
-  --y2min xxx          Set the range for the y2 axis. Does not apply to 3d plots.
-  --y2max xxx          Set the range for the y2 axis. Does not apply to 3d plots.
-  --zmin  xxx          Set the range for the z axis. Only applies to 3d plots or colormaps.
-  --zmax  xxx          Set the range for the z axis. Only applies to 3d plots or colormaps.
+Use the curve IDs for the legend. Titles given with C<--legend> override these
 
-  --y2    xxx          Plot the data specified by this curve ID on the y2 axis.
-                       Without --dataid, the ID is just an ordered 0-based index.
-                       Does not apply to 3d plots.
+=item
 
-  --curvestyle curveID=style
-                       Additional styles per curve. With --dataid, curveID is the
-                       ID. Otherwise, it's the index of the curve, starting at 0. Use
-                       this option multiple times for multiple curves
+C<--xlen xxx>
 
-  --curvestyleall xxx  Additional styles for ALL curves.
+When using C<--stream>, sets the size of the x-window to plot. Omit this or set
+it to 0 to plot ALL the data. Does not make sense with 3d plots. Implies
+C<--monotonic>
 
-  --extracmds xxx      Additional commands. These could contain extra global styles
-                       for instance
+=item
 
-  --size  xxx          Gnuplot size option
+C<--xmin/xmax/ymin/ymax/y2min/y2max/zmin/zmax xxx>
 
-  --square             Plot data with aspect ratio 1. For 3D plots, this controls the
-                       aspect ratio for all 3 axes
+Set the range for the given axis. These x-axis bounds are ignored in a streaming
+plot. The y2-axis bounds do not apply in 3d plots. The z-axis bounds apply
+I<only> to 3d plots or colormaps.
 
-  --square_xy          For 3D plots, set square aspect ratio for ONLY the x,y axes
+=item
 
-  --hardcopy xxx       If not streaming, output to a file specified here. Format
-                       inferred from filename
+C<--xlabel/ylabel/y2label/zlabel xxx>
 
-  --maxcurves xxx      The maximum allowed number of curves. This is 100 by default,
-                       but can be reset with this option. This exists purely to
-                       prevent perl from allocating all of the system's memory when
-                       reading bogus data
+Label the given axis. The y2-axis label does not apply to 3d plots while the
+z-axis label applies I<only> to 3d plots.
 
-  --monotonic          If --domain is given, checks to make sure that the x-
-                       coordinate in the input data is monotonically increasing.
-                       If a given x-variable is in the past, all data currently
-                       cached for this curve is purged. Without --monotonic, all
-                       data is kept. Does not make sense with 3d plots.
-                       No --monotonic by default.
-
-  --extraValuesPerPoint xxx
-                       How many extra values are given for each data point. Normally this
-                       is 0, and does not need to be specified, but sometimes we want
-                       extra data, like for colors or point sizes or error bars, etc.
-                       feedGnuplot options that require this (colormap, circles)
-                       automatically set it. This option is ONLY needed if unknown styles are
-                       used, with --curvestyleall for instance
-
-  --dump               Instead of printing to gnuplot, print to STDOUT. For
-                       debugging.
+=item
+
+C<--y2 xxx>
+
+Plot the data specified by this curve ID on the y2 axis. Without C<--dataid>,
+the ID is just an ordered 0-based index. Does not apply to 3d plots. Can be
+passed multiple times, or passed a comma-separated list. By default the y2-axis
+curves look the same as the y-axis ones. I.e. the viewer of the resulting plot
+has to be told which is which via an axes label, legend, etc. Prior to version
+1.25 of feedgnuplot the curves plotted on the y2 axis were drawn with a thicker
+line. This is no longer the case, but that behavior can be brought back by
+passing something like
+
+ --y2 curveid --style curveid 'linewidth 3'
+
+=item
+
+C<--histogram curveID>
+
+
+Set up this specific curve to plot a histogram. The bin width is given with
+the C<--binwidth> option (assumed 1.0 if omitted). C<--histogram> does I<not>
+touch the drawing style. It is often desired to plot these with boxes, and this
+I<must> be explicitly requested by C<--with boxes>. This works with C<--domain>
+and/or C<--stream>, but in those cases the x-value is used I<only> to cull old
+data because of C<--xlen> or C<--monotonic>. I.e. the x-values are I<not> drawn
+in any way. Can be passed multiple times, or passed a comma-separated list
+
+=item
+
+C<--binwidth width>
+
+The width of bins when making histograms. This setting applies to ALL histograms
+in the plot. Defaults to 1.0 if not given.
+
+=item
+
+C<--histstyle style>
+
+Normally, histograms are generated with the 'smooth freq' gnuplot style.
+C<--histstyle> can be used to select different 'smooth' settings. Allowed are
+'unique', 'cumulative' and 'cnormal'. 'unique' indicates whether a bin has at
+least one item in it: instead of counting the items, it'll always report 0 or 1.
+'cumulative' is the integral of the "normal" histogram. 'cnormal' is like
+'cumulative', but rescaled to end up at 1.0.
+
+=item
+
+C<--style curveID style>
+
+Additional styles per curve. With C<--dataid>, curveID is the ID. Otherwise,
+it's the index of the curve, starting at 0. Use this option multiple times for
+multiple curves. C<--styleall> does I<not> apply to curves that have a
+C<--style>
+
+=item
+
+C<--curvestyle curveID>
+
+Synonym for C<--style>
+
+=item
+
+C<--styleall xxx>
+
+Additional styles for all curves that have no C<--style>. This is overridden by
+any applicable C<--style>. Exclusive with C<--with>.
+
+=item
+
+C<--curvestyleall xxx>
+
+Synonym for C<--styleall>
+
+=item
+
+C<--with xxx>
+
+Same as C<--styleall>, but prefixed with "with". Thus
+
+ --with boxes
+
+is equivalent to
+
+ --styleall 'with boxes'
+
+Exclusive with C<--styleall>.
+
+=item
+
+C<--extracmds xxx>
+
+Additional commands to pass on to gnuplot verbatim. These could contain extra
+global styles for instance. Can be passed multiple times.
+
+=item
+
+C<--set xxx>
+
+Additional 'set' commands to pass on to gnuplot verbatim. C<--set 'a b c'> will
+result in gnuplot seeing a C<set a b c> command. Can be passed multiple times.
+
+=item
+
+C<--unset xxx>
+
+Additional 'unset' commands to pass on to gnuplot verbatim. C<--unset 'a b c'>
+will result in gnuplot seeing a C<unset a b c> command. Can be passed multiple
+times.
+
+=item
+
+C<--square>
+
+Plot data with aspect ratio 1. For 3D plots, this controls the aspect ratio for
+all 3 axes
+
+=item
+
+C<--square_xy>
+
+For 3D plots, set square aspect ratio for ONLY the x,y axes
+
+=item
+
+C<--hardcopy xxx>
+
+If not streaming, output to a file specified here. Format inferred from
+filename, unless specified by C<--terminal>
+
+=item
+
+C<--terminal xxx>
+
+String passed to 'set terminal'. No attempts are made to validate this.
+C<--hardcopy> sets this to some sensible defaults if --hardcopy is given .png,
+.pdf, .ps, .eps or .svg. If any other file type is desired, use both
+C<--hardcopy> and C<--terminal>
+
+=item
+
+C<--maxcurves xxx>
+
+The maximum allowed number of curves. This is 100 by default, but can be reset
+with this option. This exists purely to prevent perl from allocating all of the
+system's memory when reading bogus data
+
+=item
+
+C<--monotonic>
+
+If C<--domain> is given, checks to make sure that the x-coordinate in the input
+data is monotonically increasing. If a given x-variable is in the past, all data
+currently cached for this curve is purged. Without C<--monotonic>, all data is
+kept. Does not make sense with 3d plots. No C<--monotonic> by default. The data is
+replotted before being purged
+
+=item
+
+C<--rangesize curveID xxx>
+
+The options C<--rangesizeall>, C<--rangesize> and C<--extraValuesPerPoint> set
+the number of values needed to represent each point being plotted (see
+L</"Multi-value style support"> above). These options are I<only> needed if
+unknown styles are used, with C<--styleall> or C<--with> for instance.
+
+C<--rangesize> is used to set how many values are needed to represent the range
+of a point for a particular curve. This overrides any defaults that may exist
+for this curve only.
+
+=item
+
+C<--rangesizeall xxx>
+
+Like C<--rangesize>, but applies to I<all> the curves.
+
+C<--extraValuesPerPoint xxx>
+
+Like C<--rangesizeall>, but instead of overriding the default, adds to it. For
+example, if plotting non-lopsided y errorbars gnuplot wants (x,y,ydelta) tuples.
+These can be specified both with C<--rangesizeall 2> (because there are 2 range
+values) or C<--extraValuesPerPoint 1> (because there's 1 more value than usual).
+
+This option is I<only> needed if unknown styles are used, with C<--styleall> or
+C<--with> for instance.
+
+=item
+
+C<--dump>
+
+Instead of printing to gnuplot, print to STDOUT. Very useful for debugging. It
+is possible to send the output produced this way to gnuplot directly.
+
+=item
+
+C<--exit>
+
+Terminate the feedgnuplot process after passing data to gnuplot. The window will
+persist but will not be interactive. Without this option feedgnuplot keeps
+running and must be killed by the user. Note that this option works only with
+later versions of gnuplot and only with some gnuplot terminals.
+
+=item
+
+C<--geometry>
+
+If using X11, specifies the size, position of the plot window
+
+=item
+
+C<--version>
+
+Print the version and exit
+
+=back
+
+=head1 RECIPES
+
+=head2 Basic plotting of piped data
+
+ $ seq 5 | awk '{print 2*$1, $1*$1}'
+ 2 1
+ 4 4
+ 6 9
+ 8 16
+ 10 25
+
+ $ seq 5 | awk '{print 2*$1, $1*$1}' |
+   feedgnuplot --lines --points --legend 0 "data 0" --title "Test plot" --y2 1
+
+=head2 Realtime plot of network throughput
+
+Looks at wlan0 on Linux.
+
+ $ while true; do sleep 1; cat /proc/net/dev; done |
+   gawk '/wlan0/ {if(b) {print $2-b; fflush()} b=$2}' |
+   feedgnuplot --lines --stream --xlen 10 --ylabel 'Bytes/sec' --xlabel seconds
+
+=head2 Realtime plot of battery charge in respect to time
+
+Uses the result of the C<acpi> command.
+
+ $ while true; do acpi; sleep 15; done |
+   perl -nE 'BEGIN{ $| = 1; } /([0-9]*)%/; say join(" ", time(), $1);' |
+   feedgnuplot --stream --ymin 0 --ymax 100 --lines --domain --xlabel 'Time' --timefmt '%s' --ylabel "Battery charge (%)"
+
+=head2 Realtime plot of temperatures in an IBM Thinkpad
+
+Uses C</proc/acpi/ibm/thermal>, which reports temperatures at various locations
+in a Thinkpad.
+
+ $ while true; do cat /proc/acpi/ibm/thermal | awk '{$1=""; print}' ; sleep 1; done |
+   feedgnuplot --stream --xlen 100 --lines --autolegend --ymax 100 --ymin 20 --ylabel 'Temperature (deg C)'
+
+=head2 Plotting a histogram of file sizes in a directory
+
+ $ ls -l | awk '{print $5/1e6}' |
+   feedgnuplot --histogram 0 --with boxes --ymin 0 --xlabel 'File size (MB)' --ylabel Frequency
 
 =head1 ACKNOWLEDGEMENT
 
@@ -897,11 +1773,11 @@ L<https://github.com/dkogan/feedgnuplot>
 
 =head1 AUTHOR
 
-Dima Kogan, C<< <dkogan at cds.caltech.edu> >>
+Dima Kogan, C<< <dima at secretsauce.net> >>
 
 =head1 LICENSE AND COPYRIGHT
 
-Copyright 2011 Dima Kogan.
+Copyright 2011-2012 Dima Kogan.
 
 This program is free software; you can redistribute it and/or modify it
 under the terms of either: the GNU General Public License as published
@@ -910,3 +1786,4 @@ by the Free Software Foundation; or the Artistic License.
 See http://dev.perl.org/licenses/ for more information.
 
 =cut
+
diff --git a/perl/gen_events.pl b/perl/gen_events.pl
index f5736ad..4833ccc 100755
--- a/perl/gen_events.pl
+++ b/perl/gen_events.pl
@@ -5,11 +5,16 @@ use warnings;
 
 my $arch;
 my $key;
+my $optkey = "";
 my $eventId;
+my $eventname;
 my $limit;
 my $umask;
 my $cmask;
 my $cfg;
+my $opts = "";
+my $defoptkey = "";
+my $defopts = "";
 my $num_events=0;
 my @events = ();
 
@@ -33,31 +38,94 @@ while (<INFILE>) {
     if (/^#/) {
         # Skip comment
     }elsif (/(EVENT_[A-Z0-9_]*)[ ]+(0x[A-F0-9]+)[ ]+([A-Z0-9|]+)/) {
+        $eventname = $1;
         $eventId = $2;
         $limit = $3;
+        $opts = "EVENT_OPTION_NONE_MASK";
     } elsif (/UMASK_([A-Z0-9_]*)[ ]*(0x[A-F0-9]+)[ ]*(0x[A-F0-9]+)[ ]*(0x[A-F0-9]+)/) {
         $key   = $1;
         $umask = $2;
         $cfg   = $3;
         $cmask = $4;
+        my $defaultopts = "{";
+        my $nropts = 0;
+        if ($key ne $optkey or $optkey eq "")
+        {
+            $opts = "EVENT_OPTION_NONE_MASK";
+        }
+        if ($key =~ m/$defoptkey[A-Z0-9_]*/)
+        {
+            my @optlist = split(",", $defopts);
+            foreach my $opt (@optlist)
+            {
+                my @tmplist = split("=", $opt);
+                $defaultopts = $defaultopts."{".$tmplist[0].",".$tmplist[1]."},";
+                $nropts++;
+            }
+        }
+        if (length($defaultopts) > 1)
+        {
+            substr($defaultopts,length($defaultopts)-1,1) = '}';
+        }
+        else
+        {
+            $defaultopts = $defaultopts."}";
+        }
         push(@events,{name=>$key,
                 limit=>$limit,
                 eventId=>$eventId,
                 cfg=>$cfg,
                 cmask=>$cmask,
-                mask=>$umask});
+                mask=>$umask,
+                nropts=>$nropts,
+                opts=>$opts,
+                defopts=>$defaultopts});
         $num_events++;
     } elsif (/UMASK_([A-Z0-9_]*)[ ]*(0x[A-F0-9]+)/) {
         $key = $1;
         $umask = $2;
+        my $defaultopts = "{";
+        my $nropts = 0;
+        if ($key ne $optkey or $optkey eq "")
+        {
+            $opts = "EVENT_OPTION_NONE_MASK"
+        }
+        if ($key =~ m/$defoptkey[A-Z0-9_]*/)
+        {
+            my @optlist = split(",", $defopts);
+            foreach my $opt (@optlist)
+            {
+                my @tmplist = split("=", $opt);
+                $defaultopts = $defaultopts."{".$tmplist[0].",".$tmplist[1]."},";
+                $nropts++;
+            }
+        }
+        if (length($defaultopts) > 1)
+        {
+            substr($defaultopts,length($defaultopts)-1,1) = '}';
+        }
+        else
+        {
+            $defaultopts = $defaultopts."}";
+        }
         push(@events,{name=>$key,
                 limit=>$limit,
                 eventId=>$eventId,
                 cfg=>0x00,
                 cmask=>0x00,
-                mask=>$umask});
+                mask=>$umask,
+                nropts=>$nropts,
+                opts=>$opts,
+                defopts=>$defaultopts});
         $num_events++;
     }
+    elsif (/DEFAULT_OPTIONS_([A-Z0-9_]*)[ ]*([xA-Z0-9_=,]*)/) {
+        $defoptkey = $1;
+        $defopts = $2;
+    } elsif (/OPTIONS_([A-Z0-9_]*)[ ]*([A-Z0-9_\|]+)/) {
+        $optkey = $1;
+        $opts = $2;
+    }
 }
 close INFILE;
 
@@ -72,11 +140,8 @@ print OUTFILE "#define NUM_ARCH_EVENTS_$ucArch $num_events\n\n";
 print OUTFILE "static PerfmonEvent  ".$arch."_arch_events[NUM_ARCH_EVENTS_$ucArch] = {\n";
 
 foreach my $event (@events) {
-
     print OUTFILE <<END;
-$delim {\"$event->{name}\",
-   \"$event->{limit}\", 
-   $event->{eventId},$event->{mask},$event->{cfg},$event->{cmask}}
+$delim {\"$event->{name}\", \"$event->{limit}\", $event->{eventId},$event->{mask},$event->{cfg},$event->{cmask},$event->{nropts},$event->{opts},$event->{defopts}}
 END
     $delim = ',';
 }
diff --git a/perl/generatePas.pl b/perl/generatePas.pl
index 9c1dcd1..520cbc6 100755
--- a/perl/generatePas.pl
+++ b/perl/generatePas.pl
@@ -98,7 +98,7 @@ while (defined(my $file = readdir(DIR))) {
                 }
             } elsif ($line =~ /TYPE[ ]+(SINGLE|DOUBLE)/) {
                 $type = $1;
-            } elsif ($line =~ /FLOPS[ ]+([0-9.]+)/) {
+            } elsif ($line =~ /FLOPS[ ]+([0-9]+)/) {
                 $flops = $1;
             } elsif ($line =~ /BYTES[ ]+([0-9]+)/) {
                 $bytes = $1;
diff --git a/perl/likwid-mpirun b/perl/likwid-mpirun
deleted file mode 100755
index b922359..0000000
--- a/perl/likwid-mpirun
+++ /dev/null
@@ -1,456 +0,0 @@
-#!/usr/bin/perl
-# =======================================================================================
-#
-#      Filename:  likwid-mpirun
-#
-#      Description:  Wrapper application to mpi startup mechanisms. Builds on
-#                    likwid to control affinity and has integrated perfctr support.
-#
-#      Version:   <VERSION>
-#      Released:  <DATE>
-#
-#      Author:  Jan Treibig (jt), jan.treibig at gmail.com
-#      Project:  likwid
-#
-#      Copyright (C) 2014 Jan Treibig
-#
-#      This program is free software: you can redistribute it and/or modify it under
-#      the terms of the GNU General Public License as published by the Free Software
-#      Foundation, either version 3 of the License, or (at your option) any later
-#      version.
-#
-#      This program is distributed in the hope that it will be useful, but WITHOUT ANY
-#      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
-#      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
-#
-#      You should have received a copy of the GNU General Public License along with
-#      this program.  If not, see <http://www.gnu.org/licenses/>.
-#
-# =======================================================================================
-
-use Getopt::Long;
-##############################
-#       CONFIGURATION        #
-##############################
-my $LIKWIDPIN  = '<PREFIX>/bin/likwid-pin';
-my $LIKWIDPERF = '<PREFIX>/bin/likwid-perfctr';
-my $MPIROOT_openmpi  =  $ENV{'MPIHOME'};
-my $MPIROOT_intelmpi =  $ENV{'MPIHOME'};
-my $MPIEXEC_openmpi  = "$MPIROOT_openmpi/bin/mpiexec";
-my $MPIEXEC_intelmpi = "$MPIROOT_intelmpi/bin/mpiexec";
-my $MPIEXEC_mvapich2 = "mpirun";
-##############################
-
-my $OMPType = '';
-my $MPIType = '';
-my $WrapperScript = "mpiexec.$$";
-my %Domains;
-my $NP = 0;
-my $PPN = 0;
-my $NperNode = 0;
-my %NodeList;
-my $NumberOfNodes = 0;
-my $NumberOfUsedNodes = 0;
-my $Hostfilename = 0;
-my $Hostfile = '';
-my $PerformanceGroup = '';
-my $LikwidCall = "$LIKWIDPIN -c ";
-my $debug = 0;
-my $marker = '';
-
-sub readHostfile
-{
-    open FILE, "<$Hostfilename";
-
-    while (<FILE>) {
-        chomp;
-        if (not exists $NodeList{$host}) {
-            $NodeList{$_} = 1;
-        }
-    }
-    close FILE;
-
-    $NumberOfNodes = keys %NodeList;
-}
-
-# MPI implementations
-# OpenMPI  #<# 
-sub generateNodelist_openmpi
-{
-    open FILE, ">$Hostfilename-openmpi";
-
-    #FIXME  Order may be different
-    foreach my $node (keys %NodeList) {
-        print FILE "$node slots=$PPN\n"
-    }
-
-    close FILE;
-
-    $Hostfile = "-hostfile $Hostfilename-openmpi";
-}
-
-sub setEnvironment_openmpi
-{
-}
-
-sub executeMPI_openmpi
-{
-    if ($debug) {
-        print "$MPIEXEC_openmpi $Hostfile -np $NP -npernode $NperNode ./$WrapperScript";
-    }
-
-    system ("$MPIEXEC_openmpi $Hostfile -np $NP -npernode $NperNode ./$WrapperScript");
-}
-#>#
-
-# mvapich2  #<# 
-sub generateNodelist_mvapich2
-{
-}
-
-sub setEnvironment_mvapich2
-{
-    $ENV{'MV2_ENABLE_AFFINITY'}='0';
-}
-
-#tw
-#mvapich2: pinning aus
-# Hybrid programming options:
-#    -ranks-per-proc                  assign so many ranks to each process
-#
-#  Processor topology options:
-#    -binding                         process-to-core binding mode
-#    -topolib                         processor topology library ( hwloc plpa)
-
-sub executeMPI_mvapich2
-{    
-    if ($debug) {
-        print "$MPIEXEC_mvapich2 $Hostfile -np $NP -npernode $NperNode ./$WrapperScript";
-    }
-
-    system ("$MPIEXEC_mvapich2 $Hostfile -np $NP -ppn $NperNode ./$WrapperScript");
-
-}
-
-#generate wrapper script
-#mpirank
-#mpitype = mvapich
-
-#>#
-
-# Intel MPI  #<# 
-sub generateNodelist_intelmpi
-{
-    open FILE, ">$Hostfilename-intelmpi";
-
-    #FIXME  Order may be different
-    foreach my $node (keys %NodeList) {
-        print FILE "$node\:$NperNode\n"
-    }
-
-    close FILE;
-
-    $Hostfile = "-f $Hostfilename-intelmpi";
-}
-
-sub setEnvironment_intelmpi
-{
-    $ENV{'I_MPI_PIN'}='off';
-    $ENV{'KMP_AFFINITY'}='disabled';
-}
-
-sub executeMPI_intelmpi
-{
-    if ($debug) {
-        print "$MPIROOT_intelmpi/bin/mpdboot -r ssh -n $NumberOfNodes $Hostfile \n";
-        print "$MPIROOT_intelmpi/bin/mpiexec -np $NP $WrapperScript \n";
-        print "$MPIROOT_intelmpi/bin/mpdallexit \n";
-    }
-
-    system ("$MPIROOT_intelmpi/bin/mpdboot -r ssh -n $NumberOfNodes $Hostfile ");
-    system ("$MPIROOT_intelmpi/bin/mpiexec  -perhost $NperNode -np $NP ./$WrapperScript");
-    system ("$MPIROOT_intelmpi/bin/mpdallexit");
-}
-#>#
-
-sub generateHostlist  #<# 
-{
-    $ppnHost = '';
-    open FILE, "<$ENV{'PBS_NODEFILE'}";
-    my @hostArray = <FILE>;
-    close FILE;
-
-    $ppnhost = $hostArray[0];
-    chomp $ppnhost;
-
-    # generate unique host list
-    foreach my $host (@hostArray) {
-        chomp $host;
-        if ($ppnhost eq $host) {
-            $PPN++;
-        }
-        if (not exists $NodeList{$host}) {
-            $NodeList{$host} = 1;
-        }
-    }
-
-    $NumberOfNodes = keys %NodeList;
-}
-#>#
-
-sub usage  #<# 
-{
-    print <<END;
-usage: $0 -np <NUMPROC>
-
-Required:
--np <NUMPROC> : number of MPI processes
-
-Optional:
--h                     : this (help) message
--d                     : debug run
--hostfile <argument>   : Specify nodes if not in in a scheduler
--nperdomain <argument> : Run specified number of processes per domain.
-                         Supported domains are:
-                         N Node
-                         S Socket
-                         C last level cache group
-                         M NUMA domain
--pin <argument>        : Specify pinning for hybrid execution.
-                         Processes are separated by underscore.
-                         The threaded pinning must be a valid likwid-pin list.
--omp <argument>        : Enables support for specific hybrid setup. Use only 
-                         together with -pin option. Currently recognized values: intel
--mpi <argument>        : Specify which mpi implementation should be used. Current recognized 
-                         values: intelmpi, openmpi, mvapich2
---                     : Stop the likwid-mpirun parser. Useful for saving options to
-                         the MPI application.
-
-You can either use -nperdomain OR -pin for specifying pinning.
-For pure MPI pinning use only the nperdomain option. For hybrid use the pin option.
-
-Example: 
-$0 -np 32 ./a.out
-
-$0 will use as many processes per node as available in ppn 
-
-Example with pinning:
-$0 -np 32 -nperdomain S:2 ./a.out
-starts 2 processes per socket.
-
-Example for hybrid run:
-$0 -np 32 -pin M0:0-3_M1:0-3
-starts 2 processes per node. Threads of first process are pinned to first four
-cores in NUMA domain 0. Threads of second process are pinned to first four cores 
-in NUMA domain 1.
-END
-
-exit(0);
-}
-#>#
-
-sub generateDomains  #<# 
-{
-    my $output = `$LIKWIDPIN -p`;
-
-    foreach my $line (split("\n",$output)) {
-        if ($line =~ /Tag ([NSCM])[0-9]*: ([0-9 ]+)/) {
-            if (exists $Domains{$1}) {
-                $Domains{$1}++;
-            } else {
-                $Domains{$1} = 1;
-            }
-
-            if ($1 eq 'N') {
-                $PPN =  split(/ /,$2);
-            }
-        }
-    }
-}
-#>#
-
-sub generateWrapperScript  #<# 
-{
-    my $pinStrings = shift;
-    my $mpiType = shift;
-    open FILE, ">$WrapperScript";
-    my $environment = '';
-    my $doRest = '';
-
-    if ($mpiType eq 'openmpi') {
-        $environment = 'OMPI_COMM_WORLD_RANK';
-    } elsif ($mpiType eq 'intelmpi') {
-        $environment = 'PMI_RANK';
-    } elsif ($mpiType eq 'mvapich2') {
-        $environment = 'PMI_RANK'; #tw maybe????
-    } 
-
-    if ($NP % $NperNode) {
-        my $rest = $NP-($NP % $NperNode);
-        $doRest = "if (\$myRank >= $rest) {\$localId = \$myRank - $rest;}\n";
-    }
-
-    print FILE <<END;
-#!/usr/bin/perl 
-use strict;
-use warnings;
-
-my \$args = join \@ARGV;
-my \$myRank = \$ENV{$environment};
-
-my \$localId = \$myRank \% $NperNode  ;
-
-$doRest
-
-if (\$localId == 0) {
-    system ("$LikwidCall $pinStrings->[0] $PerformanceGroup $OMPType  $cmdline \$args ");
-} 
-END
-
-    foreach my $process ( 1 .. ($NperNode-1) ) {
-    print FILE <<END;
-elsif (\$localId == $process) {
-    system ("$LikwidCall $pinStrings->[$process] $PerformanceGroup $OMPType  $cmdline \$args ");
-} 
-END
-    }
-
-    close FILE;
-}
-#>#
-
-my $pinString = '';
-my $domain = '';
-my @pinStrings;
-
-GetOptions ('np=i'         => \$NP,
-            'nperdomain=s' => \$NperDomain,
-            'hostfile=s'   => \$Hostfilename,
-            'pin=s'        => \$pinString,
-            'mpi=s'        => \$MPIType,
-            'omp=s'        => \$OMPType,
-            'perf=s'       => \$PerformanceGroup,
-            'debug'        => \$debug,
-            'marker'       => sub { $marker = ' -m '; },
-            'help'         => \&usage);
-
-# MPI implementation switch
-$generateNodelist = "generateNodelist_$MPIType";
-$setEnvironment = "setEnvironment_$MPIType";
-$executeMPI = "executeMPI_$MPIType";
-
-generateDomains();
-
-# check for PBS batch system
-if (not defined ($ENV{'PBS_JOBID'})) {
-    readHostfile();
-} else {
-    $NumberOfNodes = `uniq \$PBS_NODEFILE | wc -l`;
-}
-
-if ($pinString) {
-    @pinStrings = split('_',$pinString);
-    $NperNode = ($#pinStrings+1);
-
-    if ($MPIType eq 'openmpi') {
-        if ($OMPType eq 'intel') {
-            $OMPType = '';
-            $OMPType = '-s 0xF';
-        }
-    } elsif ($MPIType eq 'intelmpi') {
-        if ($OMPType eq 'intel' and ($NumberOfNodes == 1)) {
-            $OMPType = '-t intel';
-        } elsif ($OMPType eq 'intel') {
-            $OMPType = '-s 0x7';
-        }
-    }elsif ($MPIType eq 'mvapich2') {
-        if ($OMPType eq 'intel' and ($NumberOfNodes == 1)) {
-            $OMPType = '-t intel';
-        } elsif ($OMPType eq 'intel') {
-            $OMPType = '-s 0x7';
-        }
-    }
-
-} elsif ($NperDomain) {
-
-    $OMPType = '';
-    if ($NperDomain =~ /([NSCM]):([0-9]+)/) {
-        $domain = $1;
-        $NperDomain = $2;
-    } else {
-        die "Parse Error \n";
-    }
-
-    $NperNode = $NperDomain * $Domains{$domain};
-
-    if (not $domain eq 'N') {
-        foreach my $currentDomain ( 0 .. ($Domains{$domain}-1)) {
-            foreach my $currentProcess ( 0 .. ($NperDomain-1)) {
-                push @pinStrings, "$domain"."$currentDomain".":$currentProcess";
-            }
-        }
-    } else {
-        foreach my $currentProcess ( 0 .. ($NperDomain-1)) {
-            push @pinStrings, "$domain".":$currentProcess";
-        }
-    }
-} elsif ($NP) {
-    print "PPN = $PPN\n";
-    $NperNode = $PPN;
-    $OMPType = '';
-
-    foreach my $currentProcess ( 0 .. ($PPN-1)) {
-        push @pinStrings, "N".":$currentProcess";
-    }
-} else {
-    usage();
-}
-
-if (not defined ($ENV{'PBS_JOBID'})) {
-    $Hostfilename .= $$;
-    &{$generateNodelist}();
-} else {
-    if ($MPIType eq 'intelmpi') {
-        $Hostfilename = "pbshosts$$";
-        generateHostlist();
-        &{$generateNodelist}();
-    }
-}
-
-map {$cmdline .= "$_ " ;}  @ARGV;
-$NumberOfUsedNodes = $NP / $NperNode;
-
-if ($NumberOfUsedNodes > $NumberOfNodes) {
-    die "ERROR: Require $NumberOfUsedNodes nodes, but only $NumberOfNodes available!";
-}
-
-if ($NumberOfUsedNodes < 1) {
-    die "ERROR: Requested $NperNode processes per Node with only $NP total processes!";
-}
-
-if ($PerformanceGroup) {
-    $LikwidCall = "$LIKWIDPERF -C";
-    $PerformanceGroup  = ' -g '.$PerformanceGroup ;
-    $PerformanceGroup .= " $marker -o perf_%h_%r.txt ";
-} else {
-    $PerformanceGroup  = ' -q ';
-}
-
-generateWrapperScript(\@pinStrings,$MPIType);
-chmod 0755,$WrapperScript;
-&{$setEnvironment}();
-
-if ($debug) {
-    print  "Number of nodes: $NumberOfNodes \n";
-    $NumberOfUsedNodes = $NP / $NperNode;
-    print  "Number of used nodes: $NumberOfUsedNodes \n";
-    print  "Number of processes per node: $NperNode \n";
-}
-&{$executeMPI}();
-
-if (-e $WrapperScript and not $debug) {
-    unlink ($WrapperScript);
-    unlink ($Hostfilename);
-}
-
-# vim: foldmethod=marker foldmarker=#<#,#>#
diff --git a/perl/likwid-perfscope b/perl/likwid-perfscope
deleted file mode 100755
index 84f99da..0000000
--- a/perl/likwid-perfscope
+++ /dev/null
@@ -1,110 +0,0 @@
-#!/usr/bin/perl
-use strict;
-use warnings;
-
-use Getopt::Long;
-
-sub usage  #<# 
-{
-    print <<END;
-usage: $0 --group <Performance Group> --cores <physical core list>
-
-Required:
--cores <CORELIST> : list of physical cores
-
-Optional:
--h                     : this (help) message
--freq                  : frequency of updates, in ms or s (e.g. 500ms), default: 1s
--group <PERFGROUP>     : Specify what to plot, default FLOPS_DP
-
-Example:
-$0 -group FLOPS_DP -cores 0-3 
-END
-
-exit(0);
-}
-#>#
-
-my $CONFIG = {   #<# 
-    "FLOPS_DP" => {
-        "group" => 'FLOPS_DP',
-        "expr" => 'DP MFlops/s',
-        "title" => 'Double Precision Flop Rate',
-        "yaxis" => 'MFlops/s'},
-    "FLOPS_SP" => {
-        "group" => 'FLOPS_SP',
-        "expr" => 'SP MFlops/s',
-        "title" => 'Single Precision Flop Rate',
-        "yaxis" => 'MFlops/s'},
-    "L2" => {
-        "group" => 'L2',
-        "expr" => 'L2 bandwidth [MBytes/s]',
-        "title" => 'L2 cache bandwidth',
-        "yaxis" => 'bandwidth [MB/s]'},
-    "L3" => {
-        "group" => 'L3',
-        "expr" => 'L3 bandwidth [MBytes/s]',
-        "title" => 'L3 cache bandwidth',
-        "yaxis" => 'bandwidth [MB/s]'},
-    "CLOCK" => {
-        "group" => 'CLOCK',
-        "expr"  => 'Clock [MHz]',
-        "title" => 'Clock rate',
-        "yaxis" => 'MHz'},
-    "NUMA" => {
-        "group" => 'MEM',
-        "expr" => 'Remote BW [MBytes/s]',
-        "title" => 'Remote NUMA bandwidth',
-        "yaxis" => 'bandwidth [MB/s]'},
-    "MEM" => {
-        "group" => 'MEM',
-        "expr" => 'MBytes/s',
-        "title" => 'Main memory bandwidth',
-        "yaxis" => 'bandwidth [MB/s]'}};
-#>#
-
-my $FREQ = '1s';
-my $CORES = '';
-my $optGroup = 'FLOPS_DP';
-my $optPlot;
-
-GetOptions ('group=s' => \$optGroup, 'freq=s' => \$FREQ, 'cores=s' => \$CORES, 'plot=s' => \$optPlot, 'help' => \&usage);
-
-my $GROUP = $CONFIG->{$optGroup}->{'group'};
-my $yaxis = $CONFIG->{$optGroup}->{'yaxis'};
-my $title = $CONFIG->{$optGroup}->{'title'};
-my $expr  = $CONFIG->{$optGroup}->{'expr'};
-my $legend = '';
-
-open (INPUT, "likwid-perfctr -g $GROUP -d $FREQ -c $CORES |");
-
-select((select(INPUT), $| = 1)[0]);
-
-while (<INPUT>) {
-    if (/CORES: ([0-9 ]+)/) {
-        my @cores = split ' ',$1;
-        my $coreNumber = 0;
-
-        foreach my $core (@cores) {
-            $legend .= " --legend $coreNumber=\"core $core\" ";
-            $coreNumber++;
-        }
-        last;
-    }
-}
-
-open (OUTPUT, "| feedGnuplot --lines  --domain --stream --xlabel \"seconds\" --ylabel \"$yaxis\" --title \"$title\" $legend");
-
-select((select(OUTPUT), $| = 1)[0]);
-
-while (<INPUT>) {
-    if (/$expr/) {
-        s/$expr//;
-        print OUTPUT;
-    }
-}
-close(INPUT);
-close(OUTPUT);
-
-
-# vim: foldmethod=marker foldmarker=#<#,#># 
diff --git a/perl/likwid-setFrequencies b/perl/likwid-setFrequencies
deleted file mode 100755
index 5834441..0000000
--- a/perl/likwid-setFrequencies
+++ /dev/null
@@ -1,185 +0,0 @@
-#!/usr/bin/perl
-# =======================================================================================
-#
-#      Filename:  likwid-setFrequencies
-#
-#      Description:  Application allowing to change core frequencies
-#
-#      Version:   <VERSION>
-#      Released:  <DATE>
-#
-#      Author:  Jan Treibig (jt), jan.treibig at gmail.com
-#      Project:  likwid
-#
-#      Copyright (C) 2014 Jan Treibig
-#
-#      This program is free software: you can redistribute it and/or modify it under
-#      the terms of the GNU General Public License as published by the Free Software
-#      Foundation, either version 3 of the License, or (at your option) any later
-#      version.
-#
-#      This program is distributed in the hope that it will be useful, but WITHOUT ANY
-#      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
-#      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
-#
-#      You should have received a copy of the GNU General Public License along with
-#      this program.  If not, see <http://www.gnu.org/licenses/>.
-#
-# =======================================================================================
-
-use Getopt::Std;
-
-my $LIKWIDPIN  = '<PREFIX>/bin/likwid-pin';
-my $SYSPATH = '/sys/devices/system/cpu';
-my $SYSCMD = '<PREFIX>/sbin/likwid-setFreq';
-my $domain = 'N';
-my $governor = 'ondemand';
-my @processors;
-my %frequencies;
-my $freq_string;
-use vars qw/ %opt /;
-
-sub init
-{
-    my $opt_string = 'g:c:f:lph';
-    getopts( $opt_string, \%opt ) or usage();
-    usage() if $opt{h};
-    if (scalar(keys %opt) == 0)
-    {
-    	usage();
-    }
-}
-
-sub usage
-{
-    print STDERR << "EOF";
-
-This script allows to switch governors and set fixed
-frequencies on Linux system.
-
-usage: $0 [-hlp] [-g governor] [-c domain] [-f frequency]
--h          : this (help) message
--p          : print current frequencies
--l          : list available frequencies
--c domain   : likwid thread domain which to apply settings
-              (set to N if omitted)
--g governor : set governor (ondemand, performance, turbo)
-              (set to ondemand if omitted)
--f frequency: set fixed frequency, implicitly sets userspace
-              governor
-
-example: $0 -c S0 -f 2.7 (set all CPUs on socket 0 to 2.7 GHz)
-EOF
-    exit;
-}
-
-sub extractAvailableFrequencies
-{
-    my @tmp_keys;
-    open FILE, "<$SYSPATH/cpu0/cpufreq/scaling_available_frequencies";
-    my $tmp = <FILE>;
-    my @list = split(/ /,$tmp);
-    close FILE;
-    $frequencies{'turbo'} = $list[0];
-
-    foreach my $item ( @list ) {
-        if( not $item =~ /\n/ ) {
-            my $key = $item/1000000.0;
-            push @tmp_keys, $key;
-            $frequencies{$key} = $item;
-        }
-    }
-
-    $freq_string = join(' ', sort @tmp_keys);
-}
-
-sub extractProcessorList
-{
-    my $output = `$LIKWIDPIN -p`;
-    my $found = 0;
-
-    foreach my $line (split("\n",$output)) {
-        if ($line =~ /Tag ([NSCM][0-9]*): ([0-9 ]+)/) {
-            if ($domain eq $1) {
-                $found = 1;
-                @processors =  split(/ /,$2);
-                last;
-            }
-        }
-    }
-
-    if ( not $found ) {
-		print "Domain $domain not available!\n";
-        exit;
-    }
-}
-
-
-init();
-
-if (! -s $SYSCMD) {
-    die "ERROR Binary $SYSCMD not existing!\n\n";
-}
-
-if ( defined $opt{c}) {
-    $domain = $opt{c};
-}
-
-extractProcessorList();
-extractAvailableFrequencies();
-
-if ($opt{f}) {
-    $freq = $opt{f};
-
-	if (not exists($frequencies{$freq})) {
-		print "Frequency $freq not available!\nPlease select one of $freq_string\n";
-		exit;
-	}
-
-	foreach my $processID (@processors) {
-#		print "$SYSCMD $processID $frequencies{$freq}\n";
-        system("$SYSCMD $processID $frequencies{$freq}");
-	}
-}
-
-if ($opt{p}) {
-	foreach my $processID (@processors) {
-		open FILE,"<$SYSPATH/cpu".$processID."/cpufreq/scaling_governor";
-		my $gov = <FILE>;
-		chomp $gov;
-		close FILE;
-		open FILE,"<$SYSPATH/cpu".$processID."/cpufreq/scaling_cur_freq";
-		my $freq = <FILE>;
-		chomp $freq;
-		close FILE;
-		print "CPU $processID: governor $gov frequency $freq\n"
-	}
-	exit;
-}
-
-if ($opt{l}) {
-    print "Available frequencies: $freq_string\n";
-    exit;
-}
-
-if ($opt{g} eq 'turbo') {
-    foreach my $processID (@processors) {
-#        print "$SYSCMD $processID $frequencies{turbo}\n";
-        system("$SYSCMD $processID $frequencies{turbo}");
-    }
-    exit;
-}
-
-if ($opt{g}) {
-    $governor = $opt{g};
-    if (($governor ne "ondemand") and ($governor ne "performance")) {
-        print "Governor $governor not valid\n";
-    } else {
-        print "Set governor in domain $domain to $governor \n";
-        foreach my $processID (@processors) {
-            system("$SYSCMD $processID 0 $governor");
-        }
-    }
-}
-
-# vim: foldmethod=marker foldmarker=#<#,#>#
diff --git a/perl/set_license.pl b/perl/set_license.pl
index f80326d..1eb590d 100755
--- a/perl/set_license.pl
+++ b/perl/set_license.pl
@@ -8,15 +8,18 @@ use File::Copy;
 my $mc = '#';
 my $cc = ' *';
 my $fc = '!';
+my $lc = ' *';
 
 #my $VERSION   = '<VERSION>';
 #my $DATE   = '<DATE>';
-my $VERSION   = '3.1.3';
-my $DATE   = '4.11.2014';
-my $YEAR  = '2014';
-my $AUTHOR = 'Jan Treibig';
+my $VERSION   = '4.0';
+my $DATE   = '16.6.2015';
+my $YEAR  = '2015';
+my $AUTHOR = 'RRZE, University Erlangen-Nuremberg';
 my $LICENSE = 'gpl';
 
+my @SKIPLIST = ('ghash.c','ghash.h','loadData.S','bstrlib.c','bstrlib.h');
+
 sub print_copyright
 {
     my $fh = shift;
@@ -72,108 +75,143 @@ END
     }
 }
 
-sub wanted 
+sub wanted
 {
-	my $filename;
-
-	if (scalar(@_)) {
-		$filename = shift;
-	} else {
-		$filename = $_;
-	}
-
-	if (($filename =~ /^\./) or (-d $_)) {
-		return;
-	}
-
-	my $in_copyright = 0;
-	my $in_header = 0;
-	my $style = $cc;
-	my $enter = 0;
-	open INFILE, "< $filename";
-	open OUTFILE, "> $filename.tmp";
-	print "Process $filename\n";
-
-	while( <INFILE> ) {
-
-		if (/\/\*/ and !$enter) {
-			$style = $cc;
-			$enter = 1;
-			$in_header = 1;
-			print  OUTFILE "/*\n";
-			print  OUTFILE "$style =======================================================================================\n";
-			next;
-		} elsif (/# =/ and !$enter) {
-			$style = $mc;
-			$enter = 1;
-			$in_header = 1;
-			print  OUTFILE "$style =======================================================================================\n";
-			next;
-		} elsif (/! =/ and !$enter) {
-			$style = $fc;
-			$enter = 1;
-			$in_header = 1;
-			print  OUTFILE "$style =======================================================================================\n";
-			next;
-		} elsif (!$enter) {
-			print "Skip $filename: No header found!\n";
-			return;
-		}
-
-		if ($in_header) {
-			if(/Filename:[ ]+([A-za-z0-9._\-]+)/) {
-				if ($1 ne $filename) {
-					print "File name mismatch: $filename header says $1\n";
-				}
-				print  OUTFILE "$_";
-			} elsif(/Version:/) {
-				print OUTFILE  "$style      Version:   $VERSION\n";
-			} elsif(/Released:/) {
-				print  OUTFILE "$style      Released:  $DATE\n";
-			} elsif(/Company:/) {
-				#Skip company from header
-			} elsif(/Copyright/) {
-				$in_copyright = 1;
-#				print  OUTFILE "$style\n";
-				print_copyright(\*OUTFILE, $style);
-			} elsif(/# =/ or /! =/) {
-				$in_copyright = 0;
-				$in_header = 0;
-			} elsif (/\*\//) {
-				$in_copyright = 0;
-				$in_header = 0;
-				print  OUTFILE " */\n";
-			} elsif (/\* =/) {
-				# Skip initial hline
-			} else {
-				if($in_copyright eq 0) {
-					print  OUTFILE "$_";
-				}
-			}
-
-		} else {
-			print  OUTFILE "$_";
-		}
-	}
-
-	close INFILE;
-	close OUTFILE;
-
-	unlink $filename or die  "Failed to delete file $filename\n";
-	copy ("$filename.tmp", $filename) or die "Copy failed\n";
-	unlink "$filename.tmp" or die  "Failed to delete file $filename\n";
+    my $filename;
+
+    if (scalar(@_)) {
+        $filename = shift;
+    } else {
+        $filename = $_;
+    }
+
+    if (($filename =~ /^\./) or (-d $filename)) {
+        return;
+    }
+
+    foreach my $filter ( @SKIPLIST ) {
+        if ( $filename eq $filter ) {
+            print "SKIP $filename\n";
+            return;
+        }
+    }
+
+    my $in_copyright = 0;
+    my $in_header = 0;
+    my $style = $cc;
+    my $enter = 0;
+    open INFILE, "< $filename";
+    open OUTFILE, "> $filename.tmp";
+    print "Process $filename\n";
+
+    while( <INFILE> ) {
+        # Ensure UNIX line ending
+        $_ =~ s/\cM\cJ|\cM|\cJ/\n/g;
+
+        if (/\/\*/ and !$enter) {
+            $style = $cc;
+            $enter = 1;
+            $in_header = 1;
+            print  OUTFILE "/*\n";
+            print  OUTFILE "$style =======================================================================================\n";
+            next;
+        } elsif (/# =/ and !$enter) {
+            $style = $mc;
+            $enter = 1;
+            $in_header = 1;
+            print  OUTFILE "$style =======================================================================================\n";
+            next;
+        } elsif (/! =/ and !$enter) {
+            $style = $fc;
+            $enter = 1;
+            $in_header = 1;
+            print  OUTFILE "$style =======================================================================================\n";
+            next;
+        } elsif (/#!/ and !$enter) {
+            $style = $lc;
+            $enter = 1;
+            $in_header = 1;
+            print  OUTFILE "$_";
+            print  OUTFILE "--[[\n";
+            print  OUTFILE "$style =======================================================================================\n";
+            next;
+        } elsif (/\-\-\[\[/ and !$enter) {
+            $style = $lc;
+            $enter = 1;
+            $in_header = 1;
+            print  OUTFILE "--[[\n";
+            print  OUTFILE "$style =======================================================================================\n";
+            next;
+        } elsif (!$enter) {
+            print "Skip $filename: No header found!\n";
+            unlink "$filename.tmp" or die  "Failed to delete file $filename\n";
+            return;
+        }
+
+        if ($in_header) {
+            if(/Filename:[ ]+([A-za-z0-9._\-]+)/) {
+                if ($1 ne $filename) {
+                    print "File name mismatch: $filename header says $1\n";
+                }
+                print  OUTFILE "$_";
+            } elsif(/Version:/) {
+                print OUTFILE  "$style      Version:   $VERSION\n";
+            } elsif(/Released:/) {
+                print  OUTFILE "$style      Released:  $DATE\n";
+            } elsif(/Copyright/) {
+                $in_copyright = 1;
+                print_copyright(\*OUTFILE, $style);
+            } elsif(/# =/ or /! =/) {
+                $in_copyright = 0;
+                $in_header = 0;
+            } elsif (/\*\//) {
+                $in_copyright = 0;
+                $in_header = 0;
+                print  OUTFILE " */\n";
+            } elsif (/\]\]$/) {
+                $in_copyright = 0;
+                $in_header = 0;
+                print  OUTFILE "]]\n";
+            } elsif (/\* =/ or /\-\-\[\[/) {
+                # Skip initial hline
+            } else {
+                if($in_copyright eq 0) {
+                    print  OUTFILE "$_";
+                }
+            }
+        } else {
+            print  OUTFILE "$_";
+        }
+    }
+
+    close INFILE;
+    close OUTFILE;
+
+    unlink $filename or die  "Failed to delete file $filename\n";
+    copy ("$filename.tmp", $filename) or die "Copy failed\n";
+    unlink "$filename.tmp" or die  "Failed to delete file $filename\n";
 }
 
 
 if (defined $ARGV[0]) {
     my $filename = $ARGV[0];
     wanted($filename);
-	exit (0);
+    exit (0);
 }
 
 my @directories;
 push @directories, 'src';
+push @directories, 'bench/src';
+push @directories, 'bench/includes';
+push @directories, 'examples';
 
 find(\&wanted,  @directories);
 
+# single files
+wanted('Makefile');
+chdir 'bench';
+wanted('Makefile');
+wanted('likwid-bench.c');
+
+
 
diff --git a/perl/templates/group.tt b/perl/templates/group.tt
index 2122caf..5676318 100644
--- a/perl/templates/group.tt
+++ b/perl/templates/group.tt
@@ -2,65 +2,14 @@
 
 #define NUM_GROUPS_[% arch FILTER upper %] [% numGroups %]
 
-[% FOREACH group IN groups %]
-static const char* group_names_[% arch FILTER ucfirst %]_[% group.name %] [] = {[% FOREACH metric IN group.metrics %] "[% metric.label %]", [% END %] NULL};
-[% END %]
-
 static PerfmonGroupMap [% arch %]_group_map[NUM_GROUPS_[% arch FILTER upper %]] = {
 [% FOREACH group IN groups %]
-    {"[% group.name %]",[% group.name %],[% group.isUncore %],"[% group.shortHelp %]","[% group.eventSet %]", 0 [% FOREACH metric IN group.metrics %] +1 [% END %], group_names_[% arch FILTER ucfirst %]_[% group.name %]
-    },
+    {"[% group.name %]",[% group.name %],[% group.isUncore %],"[% group.shortHelp %]","[% group.eventSet %]"},
 [% END %]
 };
 
-void perfmon_getDerivedCounterValues[% arch FILTER ucfirst %](PerfmonGroup group, float * values, float * out_max, float * out_min){
-    double time = rdtscTime;
-    double inverseClock = 1.0 /(double) timer_getCpuClock();
-
-    values[0] = time;
-    out_min[0] = time;
-    out_max[0] = time;
-
-    switch ( group ) {
-    [% FOREACH group IN groups %]
-        case [% group.name %]:{
-            int threadId;
-            int counter = 0;
-            double sum,min,max;
-
-        [% FOREACH metric IN group.metrics %]
-            sum = 0;
-            min = 1e300;
-            max = 0;
-
-            for(threadId=0; threadId < perfmon_numThreads; threadId++)
-            {
-                double cur = [% metric.rule %];
-                cur = isnan(cur) ? 0.0 : cur;
-                sum += cur;
-                max = max > cur ? max : cur;
-                min = min < cur ? min : cur;                        
-            }
-
-            values[counter] = (float) sum / perfmon_numThreads;
-            out_min[counter] = (float) min;
-            out_max[counter] = (float) max;
-            counter++;
-        [% END %]
-        return;        
-        }
-    [% END %]
-
-        default:
-            fprintf (stderr, "perfmon_getDerivedCounterValues[% arch %]: Unknown group! Exiting!\n" );
-            exit (EXIT_FAILURE);
-            break;
-    }
-}
-
-
-void
-perfmon_printDerivedMetrics[% arch FILTER ucfirst %](PerfmonGroup groupId)
+/*void
+perfmon_printDerivedMetrics[% arch FILTER ucfirst %](PerfmonGroup group)
 {
     int threadId;
     double time = rdtscTime;
@@ -76,7 +25,7 @@ perfmon_printDerivedMetrics[% arch FILTER ucfirst %](PerfmonGroup groupId)
     uint64_t cpi_cyc  = 0;
     int cpi_index = 0;
 
-    switch ( groupId ) 
+    switch ( group ) 
     {
 [% FOREACH group IN groups %]
         case [% group.name %]:
@@ -136,7 +85,7 @@ perfmon_printDerivedMetrics[% arch FILTER ucfirst %](PerfmonGroup groupId)
     printResultTable(&tableData);
     freeResultTable(&tableData);
 
-    /* for threaded results print sum, max, min and avg */
+    // for threaded results print sum, max, min and avg 
     if (perfmon_numThreads > 1)
     {
         initStatisticTable(&tableData, fc, numRows);
@@ -196,7 +145,7 @@ perfmon_logDerivedMetrics[% arch FILTER ucfirst %](PerfmonGroup group, double ti
                 exit (EXIT_FAILURE);
                 break;
     }
-}
+}*/
 
 
 
diff --git a/src/access-daemon/Makefile b/src/access-daemon/Makefile
index afd751b..17fb996 100644
--- a/src/access-daemon/Makefile
+++ b/src/access-daemon/Makefile
@@ -4,13 +4,13 @@
 #
 #      Description:  accessDaemon Makefile
 #
-#      Version:   3.1.3
-#      Released:  4.11.2014
+#      Version:   4.0
+#      Released:  16.6.2015
 #
 #      Author:  Jan Treibig (jt), jan.treibig at gmail.com
 #      Project:  likwid
 #
-#      Copyright (C) 2014 Jan Treibig
+#      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
 #
 #      This program is free software: you can redistribute it and/or modify it under
 #      the terms of the GNU General Public License as published by the Free Software
@@ -32,19 +32,22 @@ include  ../../make/include_$(COMPILER).mk
 DAEMON_TARGET = likwid-accessD
 SETFREQ_TARGET = likwid-setFreq
 
-DEFINES   = -D_GNU_SOURCE -DMAX_NUM_THREADS=$(MAX_NUM_THREADS)
+DEFINES   = -D_GNU_SOURCE -DMAX_NUM_THREADS=$(MAX_NUM_THREADS) -DPCILIST="$(PCI_SOCKETS)"
 INCLUDES  = -I../includes
 ifeq ($(COMPILER),GCC)
-CFLAGS    +=  -pedantic -Wall -Wextra -std=c99
+CFLAGS    +=  -Wall -Wextra -std=c99
+endif
+ifeq ($(COMPILER),ICC)
+CFLAGS  += -std=c99
 endif
 CPPFLAGS :=  $(DEFINES) $(INCLUDES)
-Q=
 
 all: $(DAEMON_TARGET) $(SETFREQ_TARGET)
 
 $(DAEMON_TARGET): accessDaemon.c
-	$(CC) $(ANSI_CFLAGS) $(CFLAGS) $(CPPFLAGS) -o ../../$(DAEMON_TARGET) accessDaemon.c
+	$(Q)$(CC) $(CFLAGS) $(CPPFLAGS) -o ../../$(DAEMON_TARGET) accessDaemon.c
 
 $(SETFREQ_TARGET): setFreq.c
-	$(CC) $(ANSI_CFLAGS) $(CFLAGS) $(CPPFLAGS) -o ../../$(SETFREQ_TARGET) setFreq.c
+	$(Q)$(CC) $(CFLAGS) $(CPPFLAGS) -o ../../$(SETFREQ_TARGET) setFreq.c
+
 
diff --git a/src/access-daemon/accessDaemon.c b/src/access-daemon/accessDaemon.c
index 5679a92..ea9c1bd 100644
--- a/src/access-daemon/accessDaemon.c
+++ b/src/access-daemon/accessDaemon.c
@@ -5,14 +5,15 @@
  *
  *      Description:  Implementation of access daemon.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
  *      Authors:  Michael Meier, michael.meier at rrze.fau.de
- *                Jan Treibig (jt), jan.treibig at gmail.com
+ *                Jan Treibig (jt), jan.treibig at gmail.com,
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -50,93 +51,49 @@
 #include <lock.h>
 #include <accessClient_types.h>
 
+
 /* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
 #define SA struct sockaddr
 #define str(x) #x
 
 #define CHECK_ERROR(func, msg)  \
-    if ((func) < 0) { \
-        syslog(LOG_ERR, "ERROR - [%s:%d] " str(msg) " - %s \n", __FILE__, __LINE__, strerror(errno)); \
-    }
+    if ((func) < 0) { syslog(LOG_ERR, "ERROR - [%s:%d] " str(msg) " - %s \n", __FILE__, __LINE__, strerror(errno)); }
 
 #define CHECK_FILE_ERROR(func, msg)  \
-    if ((func) == 0) { \
-        syslog(LOG_ERR, "ERROR - [%s:%d] " str(msg) " - %s \n", __FILE__, __LINE__, strerror(errno)); \
-    }
+    if ((func) == 0) { syslog(LOG_ERR, "ERROR - [%s:%d] " str(msg) " - %s \n", __FILE__, __LINE__, strerror(errno)); }
 
 
 #define EXIT_IF_ERROR(func, msg)  \
-    if ((func) < 0) { \
-        syslog(LOG_ERR, "ERROR - [%s:%d] " str(msg) " - %s \n", __FILE__, __LINE__, strerror(errno)); \
-        stop_daemon(); \
-        exit(EXIT_FAILURE); \
-    }
+    if ((func) < 0) { syslog(LOG_ERR, "ERROR - [%s:%d] " str(msg) " - %s \n", __FILE__, __LINE__, strerror(errno)); stop_daemon(); exit(EXIT_FAILURE); }
 
 
-#define CPUID \
-    __asm__ volatile ("cpuid" \
-            : "=a" (eax), "=b" (ebx) \
+#define CPUID                    \
+    __asm__ volatile ("cpuid"    \
+            : "=a" (eax),            \
+            "=b" (ebx)             \
             : "0" (eax))
 
 
-/* Intel P6 */
-#define PENTIUM_M_BANIAS     0x09U
-#define PENTIUM_M_DOTHAN     0x0DU
-#define CORE_DUO             0x0EU
-#define CORE2_65             0x0FU
-#define CORE2_45             0x17U
-#define ATOM                 0x1CU
-#define ATOM_45              0x26U
-#define ATOM_32              0x36U
-#define ATOM_22              0x27U
-#define ATOM_SILVERMONT      0x4DU
-#define NEHALEM              0x1AU
-#define NEHALEM_BLOOMFIELD   0x1AU
-#define NEHALEM_LYNNFIELD    0x1EU
-#define NEHALEM_LYNNFIELD_M  0x1FU
-#define NEHALEM_WESTMERE     0x2CU
-#define NEHALEM_WESTMERE_M   0x25U
+#define  P6_FAMILY        0x6U
+#define  K8_FAMILY        0xFU
+#define  K10_FAMILY       0x10U
+#define  K15_FAMILY       0x15U
+#define  K16_FAMILY       0x16U
+
 #define SANDYBRIDGE          0x2AU
 #define SANDYBRIDGE_EP       0x2DU
-#define HASWELL              0x3CU
-#define HASWELL_EX           0x3FU
-#define HASWELL_M1           0x45U
-#define HASWELL_M2           0x46U
 #define IVYBRIDGE            0x3AU
 #define IVYBRIDGE_EP         0x3EU
-#define NEHALEM_EX           0x2EU
-#define WESTMERE_EX          0x2FU
-#define XEON_MP              0x1DU
-
-/* Intel MIC */
-#define XEON_PHI           0x01U
-
-/* AMD K10 */
-#define BARCELONA      0x02U
-#define SHANGHAI       0x04U
-#define ISTANBUL       0x08U
-#define MAGNYCOURS     0x09U
-
-/* AMD K8 */
-#define OPTERON_SC_1MB  0x05U
-#define OPTERON_DC_E    0x21U
-#define OPTERON_DC_F    0x41U
-#define ATHLON64_X2     0x43U
-#define ATHLON64_X2_F   0x4BU
-#define ATHLON64_F1     0x4FU
-#define ATHLON64_F2     0x5FU
-#define ATHLON64_X2_G   0x6BU
-#define ATHLON64_G1     0x6FU
-#define ATHLON64_G2     0x7FU
-
-
-#define  P6_FAMILY        0x6U
-#define  MIC_FAMILY       0xBU
-#define  NETBURST_FAMILY  0xFFU
-#define  K15_FAMILY       0x15U
-#define  K16_FAMILY       0x16U
-#define  K10_FAMILY       0x10U
-#define  K8_FAMILY        0xFU
+#define HASWELL              0x3CU
+#define HASWELL_EP           0x3FU
+#define ATOM_SILVERMONT_E    0x37U
+#define ATOM_SILVERMONT_C    0x4DU
+#define ATOM_SILVERMONT_Z1   0x4AU
+#define ATOM_SILVERMONT_Z2   0x5AU
+#define ATOM_SILVERMONT_F    0x5DU
+#define BROADWELL            0x3DU
+#define BROADWELL_E          0x4FU
+#define BROADWELL_D          0x56U
 
 #define PCI_ROOT_PATH    "/proc/bus/pci/"
 #define MAX_PATH_LENGTH   60
@@ -156,25 +113,9 @@ static char* filepath;
 static const char* ident = "accessD";
 static FuncPrototype allowed = NULL;
 static int FD_MSR[MAX_NUM_THREADS];
-static int FD_PCI[MAX_NUM_NODES][MAX_NUM_DEVICES];
+static int FD_PCI[MAX_NUM_NODES][MAX_NUM_PCI_DEVICES];
 static int isPCIUncore = 0;
-
-static char* pci_DevicePath[MAX_NUM_DEVICES] = {
- "13.5",   /* PCI_R3QPI_DEVICE_LINK_0 */
- "13.6",   /* PCI_R3QPI_DEVICE_LINK_1 */
- "13.1",   /* PCI_R2PCIE_DEVICE */
- "10.0",   /* PCI_IMC_DEVICE_CH_0 */
- "10.1",   /* PCI_IMC_DEVICE_CH_1 */
- "10.4",   /* PCI_IMC_DEVICE_CH_2 */
- "10.5",   /* PCI_IMC_DEVICE_CH_3 */
- "0e.1",   /* PCI_HA_DEVICE */
- "08.2",   /* PCI_QPI_DEVICE_PORT_0 */
- "09.2",   /* PCI_QPI_DEVICE_PORT_1 */
- "08.6",   /* PCI_QPI_MASK_DEVICE_PORT_0 */
- "09.6",   /* PCI_QPI_MASK_DEVICE_PORT_1 */
- "08.0",   /* PCI_QPI_MISC_DEVICE_PORT_0 */
- "09.0" }; /* PCI_QPI_MISC_DEVICE_PORT_1 */
-
+static PciDevice* pci_devices = NULL;
 static char pci_filepath[MAX_PATH_LENGTH];
 
 /* Socket to bus mapping -- will be determined at runtime;
@@ -185,8 +126,78 @@ static char pci_filepath[MAX_PATH_LENGTH];
  *   2                  0xbf
  *   3                  0xff
  */
-static char* socket_bus[MAX_NUM_NODES];
-
+static char* socket_bus[MAX_NUM_NODES] = { [0 ... (MAX_NUM_NODES-1)] = NULL};
+
+
+static PciDevice sandybridgeEP_pci_devices[MAX_NUM_PCI_DEVICES] = { /* SandyBridge-EP uncore PCI devices, indexed by device id; selected in main() when model == SANDYBRIDGE_EP */
+ [MSR_DEV] = {NONE, NULL, NULL, NULL, 0x0, 0}, /* placeholder slot without a PCI path */
+ [PCI_R3QPI_DEVICE_LINK_0] = {R3QPI, "13.5", "PCI_R3QPI_DEVICE_LINK_0", "RBOX0", 0x3c44, 0}, /* the "13.5"-style string is the path appended to the socket's bus directory */
+ [PCI_R3QPI_DEVICE_LINK_1] = {R3QPI, "13.6", "PCI_R3QPI_DEVICE_LINK_1", "RBOX1", 0x3c45, 0},
+ [PCI_R2PCIE_DEVICE] = {R2PCIE, "13.1", "PCI_R2PCIE_DEVICE", "PBOX0", 0x3c43, 0},
+ [PCI_IMC_DEVICE_0_CH_0] = {IMC, "10.0", "PCI_IMC_DEVICE_CH_0", "MBOX0", 0x3cb0, 0},
+ [PCI_IMC_DEVICE_0_CH_1] = {IMC, "10.1", "PCI_IMC_DEVICE_CH_1", "MBOX1", 0x3cb1, 0},
+ [PCI_IMC_DEVICE_0_CH_2] = {IMC, "10.4", "PCI_IMC_DEVICE_CH_2", "MBOX2", 0x3cb4, 0},
+ [PCI_IMC_DEVICE_0_CH_3] = {IMC, "10.5", "PCI_IMC_DEVICE_CH_3", "MBOX3", 0x3cb5, 0},
+ [PCI_HA_DEVICE_0] = {HA, "0e.1", "PCI_HA_DEVICE", "BBOX", 0x3c46, 0},
+ [PCI_QPI_DEVICE_PORT_0] = {QPI, "08.2", "PCI_QPI_DEVICE_PORT_0", "SBOX0", 0x3c41, 0},
+ [PCI_QPI_DEVICE_PORT_1] = {QPI, "09.2", "PCI_QPI_DEVICE_PORT_1", "SBOX1", 0x3c42, 0},
+ [PCI_QPI_MASK_DEVICE_PORT_0] = {QPI, "08.6", "PCI_QPI_MASK_DEVICE_PORT_0", NULL, 0x3c86, 0},
+ [PCI_QPI_MASK_DEVICE_PORT_1] = {QPI, "09.6", "PCI_QPI_MASK_DEVICE_PORT_1", NULL, 0x3c96, 0},
+ [PCI_QPI_MISC_DEVICE_PORT_0] = {QPI, "08.0", "PCI_QPI_MISC_DEVICE_PORT_0", "SBOX0FIX",0x3c80, 0},
+ [PCI_QPI_MISC_DEVICE_PORT_1] = {QPI, "09.0", "PCI_QPI_MISC_DEVICE_PORT_1", "SBOX1FIX", 0x3c91, 0}, /* NOTE(review): port 0 uses 0x3c80; confirm that 0x3c91 (not 0x3c90) is intended */
+};
+
+
+static PciDevice ivybridgeEP_pci_devices[MAX_NUM_PCI_DEVICES] = { /* IvyBridge-EP uncore PCI devices, indexed by device id; selected in main() when model == IVYBRIDGE_EP */
+ [MSR_DEV] = {NONE, "", "", "", 0x0, 0}, /* placeholder slot; main() starts probing at index 1 */
+ [PCI_R3QPI_DEVICE_LINK_0] = {R3QPI, "13.5", "PCI_R3QPI_DEVICE_LINK_0", "RBOX0", 0x0e36, 0}, /* the "13.5"-style string is the path appended to the socket's bus directory */
+ [PCI_R3QPI_DEVICE_LINK_1] = {R3QPI, "13.6", "PCI_R3QPI_DEVICE_LINK_1", "RBOX1", 0x0e37, 0},
+ [PCI_R3QPI_DEVICE_LINK_2] = {R3QPI, "12.5", "PCI_R3QPI_DEVICE_LINK_2", "RBOX2", 0x0e3e, 0},
+ [PCI_R2PCIE_DEVICE] = {R2PCIE, "13.1", "PCI_R2PCIE_DEVICE", "PBOX0", 0x0e34, 0},
+ [PCI_IMC_DEVICE_0_CH_0] = {IMC, "10.4", "PCI_IMC_DEVICE_0_CH_0", "MBOX0", 0x0eb4, 0},
+ [PCI_IMC_DEVICE_0_CH_1] = {IMC, "10.5", "PCI_IMC_DEVICE_0_CH_1", "MBOX1", 0x0eb5, 0},
+ [PCI_IMC_DEVICE_0_CH_2] = {IMC, "10.0", "PCI_IMC_DEVICE_0_CH_2", "MBOX2", 0x0eb0, 0},
+ [PCI_IMC_DEVICE_0_CH_3] = {IMC, "10.1", "PCI_IMC_DEVICE_0_CH_3", "MBOX3", 0x0eb1, 0},
+ [PCI_HA_DEVICE_0] = {HA, "0e.1", "PCI_HA_DEVICE_0", "BBOX0", 0x0e30, 0},
+ [PCI_HA_DEVICE_1] = {HA, "1c.1", "PCI_HA_DEVICE_1", "BBOX1", 0x0e38, 0},
+ [PCI_IMC_DEVICE_1_CH_0] = {IMC, "1e.4", "PCI_IMC_DEVICE_1_CH_0", "MBOX4", 0x0ef4, 0}, /* second IMC continues the MBOX numbering (4..7), matching the Haswell-EP table */
+ [PCI_IMC_DEVICE_1_CH_1] = {IMC, "1e.5", "PCI_IMC_DEVICE_1_CH_1", "MBOX5", 0x0ef5, 0},
+ [PCI_IMC_DEVICE_1_CH_2] = {IMC, "1e.0", "PCI_IMC_DEVICE_1_CH_2", "MBOX6", 0x0ef0, 0},
+ [PCI_IMC_DEVICE_1_CH_3] = {IMC, "1e.1", "PCI_IMC_DEVICE_1_CH_3", "MBOX7", 0x0ef1, 0},
+ [PCI_IRP_DEVICE] = {IRP, "05.6", "PCI_IRP_DEVICE", NULL, 0x0e39, 0},
+ [PCI_QPI_DEVICE_PORT_0] = {QPI, "08.2", "PCI_QPI_DEVICE_PORT_0", "SBOX0", 0x0e32, 0},
+ [PCI_QPI_DEVICE_PORT_1] = {QPI, "09.2", "PCI_QPI_DEVICE_PORT_1", "SBOX1", 0x0e33, 0},
+ [PCI_QPI_DEVICE_PORT_2] = {QPI, "0a.2", "PCI_QPI_DEVICE_PORT_2", "SBOX2", 0x0e3a, 0},
+ [PCI_QPI_MASK_DEVICE_PORT_0] = {QPI, "08.6", "PCI_QPI_MASK_DEVICE_PORT_0", NULL, 0x0e86, 0},
+ [PCI_QPI_MASK_DEVICE_PORT_1] = {QPI, "09.6", "PCI_QPI_MASK_DEVICE_PORT_1", NULL, 0x0e96, 0},
+ [PCI_QPI_MASK_DEVICE_PORT_2] = {QPI, "0a.6", "PCI_QPI_MASK_DEVICE_PORT_2", NULL, 0x0ec6, 0},
+ [PCI_QPI_MISC_DEVICE_PORT_0] = {QPI, "08.0", "PCI_QPI_MISC_DEVICE_PORT_0/1", "SBOX01FIX",0x0e80, 0},
+ [PCI_QPI_MISC_DEVICE_PORT_2] = {QPI, "0a.0", "PCI_QPI_MISC_DEVICE_PORT_2", "SBOX2FIX", 0x0ec0, 0},
+};
+
+static PciDevice haswellEP_pci_devices[MAX_NUM_PCI_DEVICES] = { /* Haswell-EP uncore PCI devices, indexed by device id; selected in main() when model == HASWELL_EP */
+ [MSR_DEV] = {NONE, NULL, NULL, NULL, 0x0, 0}, /* placeholder slot without a PCI path */
+ [PCI_R3QPI_DEVICE_LINK_0] = {R3QPI, "0b.1", "PCI_R3QPI_DEVICE_LINK_0", "RBOX0", 0x2f36, 0}, /* the "0b.1"-style string is the path appended to the socket's bus directory */
+ [PCI_R3QPI_DEVICE_LINK_1] = {R3QPI, "0b.2", "PCI_R3QPI_DEVICE_LINK_1", "RBOX1", 0x2f37, 0},
+ [PCI_R2PCIE_DEVICE] = {R2PCIE, "10.1", "PCI_R2PCIE_DEVICE", "PBOX0", 0x2f34, 0},
+ [PCI_IMC_DEVICE_0_CH_0] = {IMC, "14.0", "PCI_IMC_DEVICE_0_CH_0", "MBOX0", 0x2fb4, 0},
+ [PCI_IMC_DEVICE_0_CH_1] = {IMC, "14.1", "PCI_IMC_DEVICE_0_CH_1", "MBOX1", 0x2fb5, 0},
+ [PCI_IMC_DEVICE_0_CH_2] = {IMC, "15.0", "PCI_IMC_DEVICE_0_CH_2", "MBOX2", 0x2fb0, 0},
+ [PCI_IMC_DEVICE_0_CH_3] = {IMC, "15.1", "PCI_IMC_DEVICE_0_CH_3", "MBOX3", 0x2fb1, 0},
+ [PCI_HA_DEVICE_0] = {HA, "12.1", "PCI_HA_DEVICE_0", "BBOX0", 0x2f30, 0},
+ [PCI_HA_DEVICE_1] = {HA, "12.5", "PCI_HA_DEVICE_1", "BBOX1", 0x2f38, 0},
+ [PCI_IMC_DEVICE_1_CH_0] = {IMC, "17.0", "PCI_IMC_DEVICE_1_CH_0", "MBOX4", 0x2fd4, 0},
+ [PCI_IMC_DEVICE_1_CH_1] = {IMC, "17.1", "PCI_IMC_DEVICE_1_CH_1", "MBOX5", 0x2fd5, 0},
+ [PCI_IMC_DEVICE_1_CH_2] = {IMC, "18.0", "PCI_IMC_DEVICE_1_CH_2", "MBOX6", 0x2fd0, 0},
+ [PCI_IMC_DEVICE_1_CH_3] = {IMC, "18.1", "PCI_IMC_DEVICE_1_CH_3", "MBOX7", 0x2fd1, 0},
+ [PCI_IRP_DEVICE] = {IRP, "05.6", "PCI_IRP_DEVICE", NULL, 0x2f39, 0},
+ [PCI_QPI_DEVICE_PORT_0] = {QPI, "08.2", "PCI_QPI_DEVICE_PORT_0", "SBOX0", 0x2f32, 0},
+ [PCI_QPI_DEVICE_PORT_1] = {QPI, "09.2", "PCI_QPI_DEVICE_PORT_1", "SBOX1", 0x2f33, 0},
+ [PCI_QPI_MASK_DEVICE_PORT_0] = {QPI, "08.6", "PCI_QPI_MASK_DEVICE_PORT_0", NULL, 0x2f86, 0},
+ [PCI_QPI_MASK_DEVICE_PORT_1] = {QPI, "09.6", "PCI_QPI_MASK_DEVICE_PORT_1", NULL, 0x2f96, 0},
+ [PCI_QPI_MISC_DEVICE_PORT_0] = {QPI, "08.0", "PCI_QPI_MISC_DEVICE_PORT_0", "SBOX0FIX", 0x2f80, 0},
+ [PCI_QPI_MISC_DEVICE_PORT_1] = {QPI, "09.0", "PCI_QPI_MISC_DEVICE_PORT_1", "SBOX1FIX", 0x2f90, 0}, /* 09.0 has its own id; it must not repeat port 0's 0x2f80 */
+};
 
 /* #####   FUNCTION DEFINITIONS  -  LOCAL TO THIS SOURCE FILE   ########### */
 
@@ -203,31 +214,7 @@ static int allowed_intel(uint32_t reg)
             (reg == 0x19C)  ||
             (reg == 0x1A2)  ||
             (reg == 0x1AD)  ||
-            (reg == 0x1A6))
-    {
-        return 1;
-    }
-    else
-    {
-        return 0;
-    }
-}
-
-static int allowed_silvermont(uint32_t reg)
-{
-    if ( ((reg & 0x0F8U) == 0x0C0U) ||
-            ((reg & 0xFF0U) == 0x180U) ||
-            ((reg & 0xF00U) == 0x300U) ||
-            ((reg & 0xF00U) == 0x600U) ||
-            ((reg & 0xF00U) == 0xC00U) ||
-            ((reg & 0xF00U) == 0xD00U) ||
-            (reg == 0x1A0)  ||
-            (reg == 0x0CE)  ||
-            (reg == 0x1AD)  ||
-            (reg == 0x19C)  ||
-            (reg == 0x1A2)  ||
-            (reg == 0x1A6) ||
-            (reg == 0x1A6) ||
+            (reg == 0x1A6)  ||
             (reg == 0x1A7))
     {
         return 1;
@@ -238,33 +225,21 @@ static int allowed_silvermont(uint32_t reg)
     }
 }
 
-static int allowed_westmereEX(uint32_t reg)
+static int allowed_sandybridge(uint32_t reg)
 {
-    if (allowed_intel(reg) == 1)
-    {
-        return 1;
-    }
-    else if ((reg & 0xF00) == 0xF00)
+    if ((allowed_intel(reg)) ||
+        (((reg & 0xF00U) == 0x600U)))
     {
         return 1;
     }
     return 0;
 }
 
-static int allowed_sandybridge(uint32_t reg)
+static int allowed_haswell(uint32_t reg)
 {
-    if ( ((reg & 0x0F8U) == 0x0C0U) ||
-            ((reg & 0xFF0U) == 0x180U) ||
-            ((reg & 0xF00U) == 0x300U) ||
-            ((reg & 0xF00U) == 0x600U) ||
-            ((reg & 0xF00U) == 0xC00U) ||
-            ((reg & 0xF00U) == 0xD00U) ||
-            (reg == 0x1A0)  ||
-            (reg == 0x0CE)  ||
-            (reg == 0x1AD)  ||
-            (reg == 0x19C)  ||
-            (reg == 0x1A2)  ||
-            (reg == 0x1A6))
+    if ((allowed_intel(reg)) ||
+        (allowed_sandybridge(reg)) ||
+        (((reg & 0xF00U) == 0x700U)))
     {
         return 1;
     }
@@ -274,22 +249,23 @@ static int allowed_sandybridge(uint32_t reg)
     }
 }
 
-static int allowed_haswell(uint32_t reg)
+
+static int allowed_silvermont(uint32_t reg)
 {
+
     if ( ((reg & 0x0F8U) == 0x0C0U) ||
             ((reg & 0xFF0U) == 0x180U) ||
             ((reg & 0xF00U) == 0x300U) ||
+            ((reg & 0xF00U) == 0x600U) ||
             ((reg & 0xF00U) == 0xC00U) ||
             ((reg & 0xF00U) == 0xD00U) ||
-            ((reg & 0xF00U) == 0xE00U) ||
-            ((reg & 0xF00U) == 0x600U) ||
-            ((reg & 0xF00U) == 0x700U) ||
             (reg == 0x1A0)  ||
             (reg == 0x0CE)  ||
+            (reg == 0x1AD)  ||
             (reg == 0x19C)  ||
             (reg == 0x1A2)  ||
-            (reg == 0x1AD)  ||
-            (reg == 0x1A6))
+            (reg == 0x1A6) ||
+            (reg == 0x1A7))
     {
         return 1;
     }
@@ -347,21 +323,16 @@ static void msr_read(AccessDataRecord * dRecord)
     dRecord->errorcode = ERR_NOERROR;
     dRecord->data = 0;
 
-    if (FD_MSR[cpu] == -2)
-    {
-        dRecord->errorcode = ERR_NODEV;
-        return;
-    }
     if (!allowed(reg))
     {
-        syslog(LOG_ERR, "attempt to read from restricted register 0x%x", reg);
+        syslog(LOG_ERR, "Attempt to read from restricted register 0x%x on core %u", reg, cpu);
         dRecord->errorcode = ERR_RESTREG;
         return;
     }
 
     if (pread(FD_MSR[cpu], &data, sizeof(data), reg) != sizeof(data))
     {
-        syslog(LOG_ERR, "Failed to read data from msr device file on core %u", cpu);
+        syslog(LOG_ERR, "Failed to read data from register 0x%x on core %u", reg, cpu);
         dRecord->errorcode = ERR_RWFAIL;
         return;
     }
@@ -377,22 +348,16 @@ static void msr_write(AccessDataRecord * dRecord)
 
     dRecord->errorcode = ERR_NOERROR;
 
-    if (FD_MSR[cpu] == -2)
-    {
-        dRecord->errorcode = ERR_NODEV;
-        return;
-    }
-
     if (!allowed(reg))
     {
-        syslog(LOG_ERR, "attempt to write to restricted register %x", reg);
+        syslog(LOG_ERR, "Attempt to write to restricted register 0x%x on core %u", reg, cpu);
         dRecord->errorcode = ERR_RESTREG;
         return;
     }
 
     if (pwrite(FD_MSR[cpu], &data, sizeof(data), reg) != sizeof(data))
     {
-        syslog(LOG_ERR, "Failed to write data to msr device file on core %u", cpu);
+        syslog(LOG_ERR, "Failed to write data to register 0x%x on core %u", reg, cpu);
         dRecord->errorcode = ERR_RWFAIL;
         return;
     }
@@ -413,26 +378,29 @@ static void pci_read(AccessDataRecord* dRecord)
         dRecord->errorcode = ERR_NODEV;
         return;
     }
-    else if ( !FD_PCI[socketId][device] )
+
+    if ( !FD_PCI[socketId][device] )
     {
         strncpy(pci_filepath, PCI_ROOT_PATH, 30);
         strncat(pci_filepath, socket_bus[socketId], 10);
-        strncat(pci_filepath, pci_DevicePath[device], 20);
-
+        strncat(pci_filepath, pci_devices[device].path, 20);
         FD_PCI[socketId][device] = open( pci_filepath, O_RDWR);
 
         if ( FD_PCI[socketId][device] < 0)
         {
-            syslog(LOG_ERR, "Failed to open device file %s on socket %u", pci_filepath, socketId);
+            syslog(LOG_ERR, "Failed to open device file %s for device %s (%s) on socket %u", pci_filepath,
+                    pci_types[pci_devices[device].type].name, pci_devices[device].name, socketId);
             dRecord->errorcode = ERR_OPENFAIL;
             return;
         }
+        syslog(LOG_ERR, "Open device file %s for device %s (%s) on socket %u", pci_filepath,
+                    pci_types[pci_devices[device].type].name, pci_devices[device].name, socketId);
     }
 
-    if ( pread(FD_PCI[socketId][device], &data, sizeof(data), reg) != sizeof(data))
+    if (FD_PCI[socketId][device] > 0 && pread(FD_PCI[socketId][device], &data, sizeof(data), reg) != sizeof(data))
     {
-        syslog(LOG_ERR, "Failed to read data from pci device file on socket %u device %u",
-                socketId, device);
+        syslog(LOG_ERR, "Failed to read data from pci device file %s for device %s (%s) on socket %u",
+                pci_filepath,pci_types[pci_devices[device].type].name, pci_devices[device].name,socketId);
         dRecord->errorcode = ERR_RWFAIL;
         return;
     }
@@ -450,30 +418,36 @@ static void pci_write(AccessDataRecord* dRecord)
     uint32_t data = (uint32_t) dRecord->data;
 
     dRecord->errorcode = ERR_NOERROR;
+
     if (FD_PCI[socketId][device] == -2)
     {
         dRecord->errorcode = ERR_NODEV;
         return;
     }
-    else if ( !FD_PCI[socketId][device] )
+
+    if ( !FD_PCI[socketId][device] )
     {
         strncpy(pci_filepath, PCI_ROOT_PATH, 30);
         strncat(pci_filepath, socket_bus[socketId], 10);
-        strncat(pci_filepath, pci_DevicePath[device], 20);
+        strncat(pci_filepath, pci_devices[device].path, 20);
 
         FD_PCI[socketId][device] = open( pci_filepath, O_RDWR);
 
         if ( FD_PCI[socketId][device] < 0)
         {
-            syslog(LOG_ERR, "Failed to open device file %s on socket %u", pci_filepath, socketId);
+            syslog(LOG_ERR, "Failed to open device file %s for device %s (%s) on socket %u", pci_filepath,
+                        pci_types[pci_devices[device].type].name, pci_devices[device].name, socketId);
             dRecord->errorcode = ERR_OPENFAIL;
             return;
         }
+        syslog(LOG_ERR, "Open device file %s for device %s (%s) on socket %u", pci_filepath,
+                    pci_types[pci_devices[device].type].name, pci_devices[device].name, socketId);
     }
 
-    if (pwrite(FD_PCI[socketId][device], &data, sizeof data, reg) != sizeof data)
+    if (FD_PCI[socketId][device] > 0 && pwrite(FD_PCI[socketId][device], &data, sizeof data, reg) != sizeof data)
     {
-        syslog(LOG_ERR, "Failed to write data to pci device file on socket %u", socketId);
+        syslog(LOG_ERR, "Failed to write data to pci device file %s for device %s (%s) on socket %u",pci_filepath,
+                pci_types[pci_devices[device].type].name, pci_devices[device].name, socketId);
         dRecord->errorcode = ERR_RWFAIL;
         return;
     }
@@ -496,6 +470,13 @@ static void stop_daemon(void)
 {
     kill_client();
     syslog(LOG_NOTICE, "daemon exiting");
+    for (int i=0;i<MAX_NUM_NODES;i++)
+    {
+        if (socket_bus[i] != NULL)
+        {
+            free(socket_bus[i]);
+        }
+    }
 
     if (sockfd != -1)
     {
@@ -507,6 +488,41 @@ static void stop_daemon(void)
     exit(EXIT_SUCCESS);
 }
 
+int getBusFromSocket(const uint32_t socket) /* Returns the PCI bus number found for socket index `socket`, or -1 on any failure. */
+{
+    int cur_bus = 0; /* bus whose 05.0 device file is probed next; the walk starts at bus 0 */
+    uint32_t cur_socket = 0; /* socket index the current probe belongs to */
+    char pci_filepath[1024]; /* local buffer; shadows the file-scope pci_filepath */
+    int fp;
+    int ret = 0;
+    while(cur_socket <= socket)
+    {
+        sprintf(pci_filepath, "%s%02x/05.0", PCI_ROOT_PATH, cur_bus); /* PCI_ROOT_PATH + two-digit hex bus + "/05.0" */
+        fp = open(pci_filepath, O_RDONLY);
+        if (fp < 0)
+        {
+            return -1; /* device file could not be opened */
+        }
+        uint32_t cpubusno = 0;
+        ret = pread(fp, &cpubusno, sizeof(uint32_t), 0x108); /* 32-bit read at offset 0x108; presumably the CPUBUSNO register -- TODO confirm */
+        if (ret != sizeof(uint32_t))
+        {
+            close(fp);
+            return -1; /* short or failed read */
+        }
+        cur_bus = (cpubusno >> 8) & 0x0ff; /* bits 15:8 of the value are taken as this socket's bus number */
+        close(fp);
+        if(socket == cur_socket)
+            return cur_bus; /* reached the requested socket */
+        ++cur_socket;
+        ++cur_bus; /* the next socket is probed starting at the following bus number */
+        if(cur_bus > 0x0ff) /* give up once the bus number no longer fits in 8 bits */
+           return -1;
+    }
+
+    return -1; /* fallback when the loop ends without returning */
+}
+
 static void Signal_Handler(int sig)
 {
     if (sig == SIGPIPE)
@@ -516,7 +532,7 @@ static void Signal_Handler(int sig)
     }
 
     /* For SIGALRM we just return - we're just here to create a EINTR */
-    if ((sig == SIGTERM))
+    if (sig == SIGTERM)
     {
         stop_daemon();
     }
@@ -543,7 +559,7 @@ static void daemonize(int* parentPid)
     /* If we got a good PID, then we can exit the parent process. */
     if (pid > 0)
     {
-        exit(ERR_NOERROR);
+        exit(EXIT_SUCCESS);
     }
 
     /* At this point we are executing as the child process */
@@ -585,90 +601,88 @@ int main(void)
     mode_t oldumask;
     uint32_t numHWThreads = sysconf(_SC_NPROCESSORS_CONF);
     uint32_t model;
-    int isIntel = 1;
+#ifdef REVERSE_HASWELL_PCI_SOCKETS
+    char** socket_bus_copy;
+#endif
+
+
+    openlog(ident, 0, LOG_USER);
 
     if (!lock_check())
     {
-        fprintf(stderr,"Access to performance counters is locked. Exiting!\n");
-        exit(EXIT_FAILURE);
+        syslog(LOG_ERR,"Access to performance counters is locked.\n");
+        stop_daemon();
     }
 
-    for ( uint32_t i=0; i < numHWThreads; i++ )
-    {
-        FD_MSR[i] = -1;
-    }
+    daemonize(&pid);
 
-    uint32_t  eax = 0x00;
-    uint32_t  ebx = 0x00;
-    
-    CPUID;
-    if (ebx == 0x68747541U)
     {
-        isIntel = 0;
-    }
+        uint32_t  eax = 0x00;
+        uint32_t  ebx = 0x00;
+        /*int isIntel = 1;
+        CPUID;
+        if (ebx == 0x68747541U)
+        {
+            isIntel = 0;
+        }*/
 
-    eax = 0x01;
-    CPUID;
-    uint32_t family = ((eax >> 8) & 0xFU) + ((eax >> 20) & 0xFFU);
-    model  = (((eax >> 16) & 0xFU) << 4) + ((eax >> 4) & 0xFU);
+        eax = 0x01;
+        CPUID;
+        uint32_t family = ((eax >> 8) & 0xFU) + ((eax >> 20) & 0xFFU);
+        model  = (((eax >> 16) & 0xFU) << 4) + ((eax >> 4) & 0xFU);
 
-    switch (family)
-    {
-        case P6_FAMILY:
-            allowed = allowed_intel;
+        switch (family)
+        {
+            case P6_FAMILY:
+                allowed = allowed_intel;
 
-            if (isIntel && ((model == SANDYBRIDGE)    ||
-                            (model == SANDYBRIDGE_EP) ||
-                            (model == IVYBRIDGE)      ||
-                            (model == IVYBRIDGE_EP) ))
-            {
-                allowed = allowed_sandybridge;
-                isPCIUncore = 1;
-            }
-            else if (isIntel && ((model == HASWELL)    ||
-                                 (model == HASWELL_M1) ||
-                                 (model == HASWELL_M2) ||
-                                 (model == HASWELL_EX)))
-            {
-                allowed = allowed_haswell;
-            }
-            else if (isIntel && (model == ATOM_SILVERMONT))
-            {
-                allowed = allowed_silvermont;
-            }
-            else if (isIntel && (model == WESTMERE_EX))
-            {
-                allowed = allowed_westmereEX;
-            }
-            break;
-        case K8_FAMILY:
-        case K10_FAMILY:
-            if (!isIntel) 
-            {
+                if ((model == SANDYBRIDGE) || (model == IVYBRIDGE))
+                {
+                    allowed = allowed_sandybridge;
+                }
+                else if ((model == SANDYBRIDGE_EP) || (model == IVYBRIDGE_EP))
+                {
+                    allowed = allowed_sandybridge;
+                    isPCIUncore = 1;
+                }
+                else if ((model == HASWELL) ||
+                         (model == BROADWELL) ||
+                         (model == BROADWELL_D) ||
+                         (model == BROADWELL_E))
+                {
+                    allowed = allowed_haswell;
+                }
+                else if (model == HASWELL_EP)
+                {
+                    isPCIUncore = 1;
+                    allowed = allowed_haswell;
+                }
+                else if ((model == ATOM_SILVERMONT_C) ||
+                         (model == ATOM_SILVERMONT_E) ||
+                         (model == ATOM_SILVERMONT_Z1) ||
+                         (model == ATOM_SILVERMONT_Z2) ||
+                         (model == ATOM_SILVERMONT_F))
+                {
+                    allowed = allowed_silvermont;
+                }
+                break;
+            case K8_FAMILY:
+            case K10_FAMILY:
                 allowed = allowed_amd;
-            }
-            break;
-        case K15_FAMILY:
-            if (!isIntel) 
-            {
+                break;
+            case K15_FAMILY:
                 allowed = allowed_amd15;
-            }
-            break;
-        case K16_FAMILY:
-            if (!isIntel) 
-            {
+                break;
+            case K16_FAMILY:
                 allowed = allowed_amd16;
-            }
             break;
-        default:
-            fprintf(stderr, "ERROR - [%s:%d] - Unsupported processor. Exiting!\n",
-                    __FILE__, __LINE__);
-            exit(EXIT_FAILURE);
+            default:
+                syslog(LOG_ERR, "ERROR - [%s:%d] - Unsupported processor. Exiting!  \n",
+                        __FILE__, __LINE__);
+                exit(EXIT_FAILURE);
+        }
     }
 
-    openlog(ident, 0, LOG_USER);
-    daemonize(&pid);
-
     /* setup filename for socket */
     filepath = (char*) calloc(sizeof(addr1.sun_path), 1);
     snprintf(filepath, sizeof(addr1.sun_path), "/tmp/likwid-%d", pid);
@@ -691,10 +705,6 @@ int main(void)
     EXIT_IF_ERROR(listen(sockfd, 1), listen failed);
     EXIT_IF_ERROR(chmod(filepath, S_IRUSR|S_IWUSR), chmod failed);
 
-    /* Restore the old umask and fs ids. */
-    (void) umask(oldumask);
-    CHECK_ERROR(setfsuid(geteuid()), setfsuid failed);
-
     socklen = sizeof(addr1);
 
     { /* Init signal handler */
@@ -730,6 +740,10 @@ int main(void)
     CHECK_ERROR(unlink(filepath), unlink of socket failed);
     syslog(LOG_NOTICE, "daemon accepted client");
 
+    /* Restore the old umask and fs ids. */
+    (void) umask(oldumask);
+    CHECK_ERROR(setfsuid(geteuid()), setfsuid failed);
+
     {
         char* msr_file_name = (char*) malloc(MAX_PATH_LENGTH * sizeof(char));
 
@@ -737,75 +751,66 @@ int main(void)
          * NOTICE: This assumes consecutive processor Ids! */
         for ( uint32_t i=0; i < numHWThreads; i++ )
         {
-#ifdef __MIC
-            sprintf(msr_file_name,"/dev/msr%d",i);
-            if (access(msr_file_name, F_OK))
-            {
-                sprintf(msr_file_name,"/dev/cpu/%d/msr",i);
-            }
-#else
             sprintf(msr_file_name,"/dev/cpu/%d/msr",i);
-#endif
             FD_MSR[i] = open(msr_file_name, O_RDWR);
 
             if ( FD_MSR[i] < 0 )
             {
-                syslog(LOG_ERR, "Failed to open device file %s.",msr_file_name);
-                FD_MSR[i] = -2;
+                syslog(LOG_ERR, "Failed to open device file %s: %s, trying /dev/msr%d", msr_file_name, strerror(errno), i);
+                sprintf(msr_file_name,"/dev/msr%d",i);
+                FD_MSR[i] = open(msr_file_name, O_RDWR);
+                if ( FD_MSR[i] < 0 )
+                {
+                    syslog(LOG_ERR, "Failed to open device file %s: %s.", msr_file_name, strerror(errno));
+                }
             }
         }
 
         free(msr_file_name);
-
         if (isPCIUncore)
         {
-            for (int j=0; j<MAX_NUM_NODES; j++)
-            {
-                socket_bus[j] = "N-A";
-                for (int i=0; i<MAX_NUM_DEVICES; i++)
-                {
-                    FD_PCI[j][i] = -2;
-                }
-            }
-
-            /* determine PCI-BUSID mapping ... */
-            FILE *fptr;
-            char buf[1024];
-            uint32_t testDevice;
-            uint32_t sbus, sdevfn, svend;
             int cntr = 0;
             int socket_count = 0;
-
             if (model == SANDYBRIDGE_EP)
             {
-                testDevice = 0x80863c44;
+                //testDevice = 0x80863c44;
+                pci_devices = sandybridgeEP_pci_devices;
             }
             else if (model == IVYBRIDGE_EP)
             {
-                testDevice = 0x80860e36;
+                //testDevice = 0x80860e36;
+                pci_devices = ivybridgeEP_pci_devices;
+            }
+            else if (model == HASWELL_EP)
+            {
+                //testDevice = 0x80862f30;
+                pci_devices = haswellEP_pci_devices;
             }
             else
             {
-                testDevice = 0;
+                //testDevice = 0;
                 syslog(LOG_NOTICE, "PCI Uncore not supported on this system");
             }
 
-            if ( ((fptr = fopen("/proc/bus/pci/devices", "r")) == NULL) || !testDevice)
-            {
-                syslog(LOG_NOTICE, "Unable to open /proc/bus/pci/devices");
-            }
-            else
+            for (int j=0; j<MAX_NUM_NODES; j++)
             {
-                while( fgets(buf, sizeof(buf)-1, fptr) )
+                socket_bus[j] = NULL; /* must not be a string literal: stop_daemon() free()s every non-NULL entry */
+                for (int i=0; i<MAX_NUM_PCI_DEVICES; i++)
                 {
-                    if ( sscanf(buf, "%2x%2x %8x", &sbus, &sdevfn, &svend) == 3 &&
-                            svend == testDevice )
-                    {
-                        socket_bus[cntr] = (char*)malloc(4);
-                        sprintf(socket_bus[cntr++], "%02x/", sbus);
-                    }
+                    FD_PCI[j][i] = -2;
                 }
-                fclose(fptr);
+            }
+
+            /* determine PCI-BUSID mapping ... */
+            int sbus = -1;
+            cntr = 0;
+            sbus = getBusFromSocket(cntr);
+            while (sbus != -1)
+            {
+                socket_bus[cntr] = (char*)malloc(4);
+                sprintf(socket_bus[cntr], "%02x/", sbus);
+                cntr++;
+                sbus = getBusFromSocket(cntr);
             }
 
             if ( cntr == 0 )
@@ -815,20 +820,25 @@ int main(void)
             else
             {
                 socket_count = cntr;
-
+                int fd;
                 for (int j=0; j<socket_count; j++)
                 {
-                    for (int i=0; i<MAX_NUM_DEVICES; i++)
+                    for (int i=1; i<MAX_NUM_PCI_DEVICES; i++)
                     {
-                        sprintf(pci_filepath, "%s%s%s",PCI_ROOT_PATH,socket_bus[j],pci_DevicePath[i]);
-
-                        if (!access(pci_filepath,F_OK))
-                        {
-                            FD_PCI[j][i] = 0;
-                        }
-                        else
+                        if (pci_devices[i].path)
                         {
-                            syslog(LOG_NOTICE, "Device %s not found, excluded it from device list\n",pci_filepath);
+                            sprintf(pci_filepath, "%s%s%s", PCI_ROOT_PATH, socket_bus[j], pci_devices[i].path);
+                            fd = open(pci_filepath, O_RDWR);
+                            if (fd > 0)
+                            {
+                                FD_PCI[j][i] = 0;
+                                pci_devices[i].online = 1;
+                                close(fd);
+                            }
+                            else if (j==0)
+                            {
+                                syslog(LOG_NOTICE, "Device %s for socket %d not found at path %s, excluded it from device list: %s\n",pci_devices[i].name,j, pci_filepath, strerror(errno));
+                            }
                         }
                     }
                 }
@@ -846,7 +856,7 @@ int main(void)
                     __FILE__, __LINE__, strerror(errno));
             stop_daemon();
         }
-        else if (ret == 0)
+        else if ((ret == 0) && (dRecord.type != DAEMON_EXIT))
         {
             syslog(LOG_ERR, "ERROR - [%s:%d] zero read", __FILE__, __LINE__);
             stop_daemon();
diff --git a/src/access-daemon/setFreq.c b/src/access-daemon/setFreq.c
index 967dbbf..50edc6d 100644
--- a/src/access-daemon/setFreq.c
+++ b/src/access-daemon/setFreq.c
@@ -1,18 +1,18 @@
 /*
  * =======================================================================================
- *
- *      Filename:  setFreq.c
- *
- *      Description:  Wrapper for accessing setfreq kernel FS files
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Authors:  Michael Meier, michael.meier at rrze.fau.de
- *                Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
+ *
+ *      Filename:  setFreq.c
+ *
+ *      Description:  Implementation of frequency daemon
+ *
+ *      Version:   4.0
+ *      Released:  16.6.2015
+ *
+ *      Authors:  Jan Treibig (jt), jan.treibig at gmail.com,
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -28,103 +28,120 @@
  *
  * =======================================================================================
  */
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-
-static int get_numCPUs()
-{
-    int cpucount = 0;
-    char line[1024];
-    FILE* fp = fopen("/proc/cpuinfo","r");
-    if (fp != NULL)
-    {
-        while( fgets(line,1024,fp) )
-        {
-            if (strncmp(line, "processor", 9) == 0)
-            {
-                cpucount++;
-            }
-        }
-    }
-    return cpucount;
-}
-
-int main (int argn, char** argv)
-{
-    int cpuid;
-    int freq;
-    int numCPUs = 0;
-    char* gov;
-    char* gpath = malloc(100);
-    char* fpath = malloc(100);
-    FILE* f;
-
-    if (argn < 3 || argn > 4)
-    {
-        fprintf(stderr, "Usage: %s <processorID> <frequency> [<governor>] \n",argv[0]);
-        exit(EXIT_FAILURE);
-    }
-
-    cpuid = atoi(argv[1]);
-    numCPUs = get_numCPUs();
-    if (cpuid < 0 || cpuid > numCPUs)
-    {
-        fprintf(stderr, "CPU %d not a valid CPU ID. Range from 0 to %d.\n",cpuid,numCPUs);
-        exit(EXIT_FAILURE);
-    }
-    freq  = atoi(argv[2]);
-    if (freq < 0)
-    {
-        fprintf(stderr, "Frequency must be greater than 0.\n");
-        exit(EXIT_FAILURE);
-    }
-    snprintf(gpath, 60, "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_governor", cpuid);
-    snprintf(fpath, 60, "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_setspeed", cpuid);
-
-    if (argn == 4)
-    {
-        gov = argv[3];
-
-        if ((strncmp(gov,"ondemand",12)) && (strncmp(gov,"performance",12)))
-        {
-            fprintf(stderr, "Invalid governor %s!\n",gov);
-            return (EXIT_FAILURE);
-        }
-
-        f = fopen(gpath, "w");
-        if (f == NULL)
-        {
-            fprintf(stderr, "Unable to open path for writing\n");
-            return (EXIT_FAILURE);
-        }
-        fprintf(f,"%s",gov);
-        fclose(f);
-        return(EXIT_SUCCESS);
-    }
-    else
-    {
-        f = fopen(gpath, "w");
-        if (f == NULL)
-        {
-            fprintf(stderr, "Unable to open path for writing\n");
-            return (EXIT_FAILURE);
-        }
-        fprintf(f,"userspace");
-        fclose(f);
-    }
-
-    f = fopen(fpath, "w");
-    if (f == NULL)
-    {
-        fprintf(stderr, "Unable to open path for writing\n");
-        return (EXIT_FAILURE);
-    }
-    fprintf(f,"%d",freq);
-    fclose(f);
-
-    return(EXIT_SUCCESS);
-}
-
-
+/* #####   HEADER FILE INCLUDES   ######################################### */
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+
+/* #####   FUNCTION DEFINITIONS  -  LOCAL TO THIS SOURCE FILE   ########### */
+static int get_numCPUs()
+{
+    int cpucount = 0;
+    char line[1024];
+    FILE* fp = fopen("/proc/cpuinfo","r");
+    if (fp != NULL)
+    {
+        while( fgets(line,1024,fp) )
+        {
+            if (strncmp(line, "processor", 9) == 0)
+            {
+                cpucount++;
+            }
+        }
+    }
+    return cpucount;
+}
+
+/* #####  MAIN FUNCTION DEFINITION   ################## */
+int main (int argn, char** argv)
+{
+    int cpuid;
+    int freq;
+    int numCPUs = 0;
+    char* gov;
+    char* gpath = malloc(100);
+    char* fpath = malloc(100);
+
+    if (argn < 3 || argn > 4)
+    {
+        fprintf(stderr, "Usage: %s <processorID> <frequency> [<governor>] \n",argv[0]);
+        free(gpath);
+        free(fpath);
+        exit(EXIT_FAILURE);
+    }
+
+    cpuid = atoi(argv[1]);
+    numCPUs = get_numCPUs();
+    if (cpuid < 0 || cpuid > numCPUs)
+    {
+        fprintf(stderr, "CPU %d not a valid CPU ID. Range from 0 to %d.\n",cpuid,numCPUs);
+        free(gpath);
+        free(fpath);
+        exit(EXIT_FAILURE);
+    }
+    freq  = atoi(argv[2]);
+    if (freq <= 0)
+    {
+        fprintf(stderr, "Frequency must be greater than 0.\n");
+        free(gpath);
+        free(fpath);
+        exit(EXIT_FAILURE);
+    }
+
+    if (argn == 4)
+    {
+        gov = argv[3];
+
+        if ((strncmp(gov,"ondemand",8)) &&
+            (strncmp(gov,"performance",11)) &&
+            (strncmp(gov,"conservative",12)) &&
+            (strncmp(gov,"powersave",9))) {
+            fprintf(stderr, "Invalid governor %s!\n",gov);
+            free(gpath);
+            free(fpath);
+            return (EXIT_FAILURE);
+        }
+        snprintf(gpath, 60, "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_governor", cpuid);
+
+        FILE* f = fopen(gpath, "w");
+        if (f == NULL) {
+            fprintf(stderr, "Unable to open path for writing\n");
+            free(gpath);
+            free(fpath);
+            return (EXIT_FAILURE);
+        }
+        fprintf(f,"%s",gov);
+        fclose(f);
+        free(gpath);
+        free(fpath);
+        return(EXIT_SUCCESS);
+    }
+
+    snprintf(gpath, 60, "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_governor", cpuid);
+    snprintf(fpath, 60, "/sys/devices/system/cpu/cpu%d/cpufreq/scaling_setspeed", cpuid);
+
+    FILE* f = fopen(gpath, "w");
+    if (f == NULL) {
+        fprintf(stderr, "Unable to open path for writing\n");
+        free(gpath);
+        free(fpath);
+        return (EXIT_FAILURE);
+    }
+    fprintf(f,"userspace");
+    fclose(f);
+
+    f = fopen(fpath, "w");
+    if (f == NULL) {
+        fprintf(stderr, "Unable to open path for writing\n");
+        free(gpath);
+        free(fpath);
+        return (EXIT_FAILURE);
+    }
+    fprintf(f,"%d",freq);
+    fclose(f);
+    free(gpath);
+    free(fpath);
+    return(EXIT_SUCCESS);
+}
+
+
diff --git a/src/access.c b/src/access.c
new file mode 100644
index 0000000..2f7bbae
--- /dev/null
+++ b/src/access.c
@@ -0,0 +1,224 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  access.c
+ *
+ *      Description:  Interface for the different register access modules.
+ *
+ *      Version:   4.0
+ *      Released:  16.6.2015
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <fcntl.h>
+#include <string.h>
+#include <unistd.h>
+#include <signal.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <sys/un.h>
+
+
+#include <types.h>
+#include <error.h>
+#include <topology.h>
+#include <msr.h>
+#include <pci.h>
+#include <accessClient.h>
+#include <perfmon.h>
+#include <access.h>
+
+
+static int globalSocket = -1;
+static int cpuSockets[MAX_NUM_THREADS] = { [0 ... MAX_NUM_THREADS-1] = -1};
+static int registeredCpus = 0;
+
+int _HPMinit(int cpu_id)
+{
+    int ret = 0;
+    if (accessClient_mode == ACCESSMODE_DIRECT)
+    {
+        ret = msr_init(0);
+        if (ret == 0)
+        {
+            if (cpuid_info.supportUncore)
+            {
+                ret = pci_init(0);
+            }
+        }
+    }
+    else if (accessClient_mode == ACCESSMODE_DAEMON)
+    {
+        accessClient_init(&cpuSockets[cpu_id]);
+        if (globalSocket == -1)
+        {
+            globalSocket = cpuSockets[cpu_id];
+            ret = msr_init(globalSocket);
+            if (ret == 0)
+            {
+                if (cpuid_info.supportUncore)
+                {
+                    ret = pci_init(globalSocket);
+                }
+            }
+        }
+    }
+    if (ret == 0)
+    {
+        registeredCpus++;
+    }
+    return 0;
+}
+
+int HPMinit(void)
+{
+    return _HPMinit(0);
+}
+
+int HPMinitialized(void)
+{
+    return registeredCpus;
+}
+
+int HPMaddThread(int cpu_id)
+{
+    if (((cpuSockets[cpu_id] == -1) && (accessClient_mode == ACCESSMODE_DAEMON)) ||
+         (accessClient_mode == ACCESSMODE_DIRECT))
+    {
+        return _HPMinit(cpu_id);
+    }
+    return 0;
+}
+
+void HPMfinalize(void)
+{
+    msr_finalize();
+    pci_finalize();
+    if (accessClient_mode == ACCESSMODE_DAEMON)
+    {
+        for (int i=0;i<cpuid_topology.numHWThreads; i++)
+        {
+            if (cpuSockets[i] != -1)
+            {
+                close(cpuSockets[i]);
+                cpuSockets[i] = -1;
+                registeredCpus--;
+            }
+        }
+    }
+    globalSocket = -1;
+    return;
+}
+
+int HPMread(int cpu_id, PciDeviceIndex dev, uint32_t reg, uint64_t* data)
+{
+    int socket = globalSocket;
+    uint64_t tmp = 0x0ULL;
+    int err = 0;
+    if ((dev >= MAX_NUM_PCI_DEVICES) || (data == NULL))
+    {
+        return -EFAULT;
+    }
+    if ((cpu_id < 0) || (cpu_id >= cpuid_topology.numHWThreads))
+    {
+        return -ERANGE;
+    }
+    if (accessClient_mode == ACCESSMODE_DAEMON)
+    {
+        if ((cpuSockets[cpu_id] >= 0) && (cpuSockets[cpu_id] != socket))
+        {
+            socket = cpuSockets[cpu_id];
+        }
+        else if (socket < 0)
+        {
+            return -ENOENT;
+        }
+    }
+    *data = 0x0ULL;
+    DEBUG_PRINT(DEBUGLEV_DEVELOP, READ S[%d] C[%d] DEV[%d] R 0x%X, socket, cpu_id, dev, reg);
+    if (dev == MSR_DEV)
+    {
+        err = msr_tread(socket, cpu_id, reg, &tmp);
+        *data = tmp;
+    }
+    else if (pci_checkDevice(dev, cpu_id))
+    {
+        err = pci_tread(socket, cpu_id, dev, reg, (uint32_t*)&tmp);
+        *data = tmp;
+    }
+    DEBUG_PRINT(DEBUGLEV_DEVELOP, READ S[%d] C[%d] DEV[%d] R 0x%X = 0x%llX ERR[%d], socket, cpu_id, dev, reg, LLU_CAST tmp, err);
+    return err;
+}
+
+int HPMwrite(int cpu_id, PciDeviceIndex dev, uint32_t reg, uint64_t data)
+{
+    int socket = globalSocket;
+    int err = 0;
+    uint64_t tmp;
+    if (dev >= MAX_NUM_PCI_DEVICES)
+    {
+        ERROR_PRINT(MSR WRITE D %d NOT VALID, dev);
+        return -EFAULT;
+    }
+    if ((cpu_id < 0) || (cpu_id >= cpuid_topology.numHWThreads))
+    {
+        ERROR_PRINT(MSR WRITE C %d OUT OF RANGE, cpu_id);
+        return -ERANGE;
+    }
+    if (accessClient_mode == ACCESSMODE_DAEMON)
+    {
+        if ((cpuSockets[cpu_id] >= 0) && (cpuSockets[cpu_id] != socket))
+        {
+            socket = cpuSockets[cpu_id];
+        }
+        if (socket < 0)
+        {
+            ERROR_PRINT(MSR WRITE S %d INVALID, socket);
+            return -ENOENT;
+        }
+    }
+    DEBUG_PRINT(DEBUGLEV_DEVELOP, WRITE S[%d] C[%d] DEV[%d] R 0x%X D 0x%llX, socket, cpu_id, dev, reg, LLU_CAST data);
+    if (dev == MSR_DEV)
+    {
+        err = msr_twrite(socket, cpu_id, reg, data);
+        DEBUG_PRINT(DEBUGLEV_DEVELOP, WRITE S[%d] C[%d] DEV[%d] R 0x%X D 0x%llX ERR[%d], socket, cpu_id, dev, reg, LLU_CAST data, err);
+        if (perfmon_verbosity == DEBUGLEV_DEVELOP)
+        {
+            int err2 = msr_tread(socket, cpu_id, reg, &tmp);
+            DEBUG_PRINT(DEBUGLEV_DEVELOP, VERIFY S[%d] C[%d] DEV[%d] R 0x%X D 0x%llX ERR[%d] CMP %d, socket, cpu_id, dev, reg, LLU_CAST tmp, err2, (data == tmp));
+        }
+    }
+    else if (pci_checkDevice(dev, cpu_id))
+    {
+        err = pci_twrite(socket, cpu_id, dev, reg, data);
+        DEBUG_PRINT(DEBUGLEV_DEVELOP, WRITE S[%d] C[%d] DEV[%d] R 0x%X D 0x%llX ERR[%d], socket, cpu_id, dev, reg, LLU_CAST data, err);
+        if (perfmon_verbosity == DEBUGLEV_DEVELOP)
+        {
+            int err2 = pci_tread(socket, cpu_id, dev, reg, (uint32_t*)&tmp);
+            DEBUG_PRINT(DEBUGLEV_DEVELOP, VERIFY S[%d] C[%d] DEV[%d] R 0x%X D 0x%llX ERR[%d] CMP %d, socket, cpu_id, dev, reg, LLU_CAST tmp, err2, (data == tmp));
+        }
+    }
+    return err;
+}
diff --git a/src/accessClient.c b/src/accessClient.c
index ba4cb59..abfc202 100644
--- a/src/accessClient.c
+++ b/src/accessClient.c
@@ -5,16 +5,16 @@
  *
  *      Description:  Implementation of client to the access daemon.
  *                   Provides API to read and write values to MSR or
- *                   PCI Cfg Adresses. This module is used by the
+ *                   PCI Cfg Adresses. This module is used by the 
  *                   msr and pci modules.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -50,6 +50,8 @@
 #include <error.h>
 #include <cpuid.h>
 #include <accessClient.h>
+#include <perfmon.h>
+#include <configuration.h>
 
 int accessClient_mode = ACCESSMODE;
 
@@ -60,7 +62,8 @@ int accessClient_mode = ACCESSMODE;
 /* #####   VARIABLES  -  LOCAL TO THIS SOURCE FILE   ###################### */
 
 /* #####   FUNCTION DEFINITIONS  -  LOCAL TO THIS SOURCE FILE   ########### */
-static char* accessClient_strerror(AccessErrorType det)
+static char*
+accessClient_strerror(AccessErrorType det)
 {
     switch (det)
     {
@@ -70,14 +73,29 @@ static char* accessClient_strerror(AccessErrorType det)
         case ERR_OPENFAIL:   return "failed to open device file";
         case ERR_RWFAIL:     return "failed to read/write register";
         case ERR_DAEMONBUSY: return "daemon already has a same/higher priority client";
-        case ERR_LOCKED:     return "access to HPM is locked";
-        case ERR_UNSUPPORTED: return "unsupported processor";
-        case ERR_NODEV:      return "no such device";
+        case ERR_NODEV:      return "no such pci device";
         default:             return "UNKNOWN errorcode";
     }
 }
 
-static int startDaemon(void)
+static int
+accessClient_errno(AccessErrorType det)
+{
+    switch (det)
+    {
+        case ERR_NOERROR:    return 0;
+        case ERR_UNKNOWN:    return -EFAULT;
+        case ERR_RESTREG:    return -EPERM;
+        case ERR_OPENFAIL:   return -ENXIO;
+        case ERR_RWFAIL:     return -EIO;
+        case ERR_DAEMONBUSY: return -EBUSY;
+        case ERR_NODEV:      return -ENODEV;
+        default:             return -EFAULT;
+    }
+}
+
+static int
+startDaemon(void)
 {
     /* Check the function of the daemon here */
     char* filepath;
@@ -91,26 +109,36 @@ static int startDaemon(void)
     int timeout = 1000;
     int socket_fd = -1;
 
-    if (accessClient_mode == DAEMON_AM_ACCESS_D)
+    if (accessClient_mode == ACCESSMODE_DIRECT)
+    {
+        return 0;
+    }
+
+    if (config.daemonPath != NULL)
+    {
+        strcpy(exeprog, config.daemonPath);
+    }
+
+    if (access(exeprog, X_OK))
+    {
+        ERROR_PRINT(Failed to find the daemon '%s'\n, exeprog);
+        exit(EXIT_FAILURE);
+    }
+
+    if (accessClient_mode == ACCESSMODE_DAEMON)
     {
-        if (access(exeprog, F_OK))
-        {
-            fprintf(stderr, "Daemon '%s' cannot be found\n", exeprog);
-            exit(EXIT_FAILURE);
-        }
-        if (access(exeprog, X_OK))
-        {
-            fprintf(stderr, "Daemon '%s' not executable\n", exeprog);
-            exit(EXIT_FAILURE);
-        }
         pid = fork();
 
         if (pid == 0)
         {
             ret = execve (exeprog, newargv, newenv);
-            ERRNO_PRINT;
-            fprintf(stderr, "Failed to execute the daemon '%s' (see error above)\n", exeprog);
-            exit(EXIT_FAILURE);
+
+            if (ret < 0)
+            {
+                //ERRNO_PRINT;
+                ERROR_PRINT(Failed to execute the daemon '%s'\n, exeprog);
+                exit(EXIT_FAILURE);
+            }
         }
         else if (pid < 0)
         {
@@ -124,9 +152,10 @@ static int startDaemon(void)
     address_length = sizeof(address);
     snprintf(address.sun_path, sizeof(address.sun_path), "/tmp/likwid-%d", pid);
     filepath = strdup(address.sun_path);
-    DEBUG_PRINT(0, "%ssocket pathname is %s\n",
-            ((accessClient_mode == DAEMON_AM_ACCESS_D) ? "Generated " : ""),
-            filepath);
+    if (accessClient_mode == ACCESSMODE_DAEMON)
+    {
+        DEBUG_PRINT(DEBUGLEV_INFO, Socket pathname is %s, filepath);
+    }
 
     while (timeout > 0)
     {
@@ -140,9 +169,9 @@ static int startDaemon(void)
         }
 
         timeout--;
-        DEBUG_PRINT(1, "%s\n", "Still waiting for socket...");
+        DEBUG_PRINT(DEBUGLEV_INFO, Still waiting for socket %s ..., filepath);
     }
-
+    
     if (timeout <= 0)
     {
         ERRNO_PRINT;  /* should hopefully still work, as we make no syscalls in between. */
@@ -151,12 +180,9 @@ static int startDaemon(void)
         fprintf(stderr, "Consult the error message above this to find out why.\n");
         fprintf(stderr, "If the error is 'no such file or directoy', \
                 it usually means that likwid-accessD just failed to start.\n");
-        fprintf(stderr, "In case the daemon itself output an error', \
-                ignore this.\n");
         exit(EXIT_FAILURE);
     }
-
-    DEBUG_PRINT(0, "%s\n", "Successfully opened socket to daemon.");
+    DEBUG_PRINT(DEBUGLEV_INFO, Successfully opened socket %s to daemon, filepath);
     free(filepath);
 
     return socket_fd;
@@ -164,26 +190,32 @@ static int startDaemon(void)
 
 /* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
 
-void accessClient_setaccessmode(int mode)
+void 
+accessClient_setaccessmode(int mode)
 {
-    if ((accessClient_mode > DAEMON_AM_ACCESS_D) || (accessClient_mode < DAEMON_AM_DIRECT))
+    if ((accessClient_mode > ACCESSMODE_DAEMON) || (accessClient_mode < ACCESSMODE_DIRECT))
     {
-        fprintf(stderr, "Invalid accessmode %d\n", accessClient_mode);
+        ERROR_PRINT(Invalid accessmode %d, accessClient_mode);
         exit(EXIT_FAILURE);
     }
-
     accessClient_mode = mode;
 }
 
-void accessClient_init(int* socket_fd)
+void 
+accessClient_init(int* socket_fd)
 {
-    if ((accessClient_mode == DAEMON_AM_ACCESS_D))
+    if (config.daemonMode != -1)
+    {
+        accessClient_mode = config.daemonMode;
+    }
+    if ((accessClient_mode == ACCESSMODE_DAEMON) && (*socket_fd == -1))
     {
         (*socket_fd) = startDaemon();
     }
 }
 
-void accessClient_finalize(int socket_fd)
+void 
+accessClient_finalize(int socket_fd)
 {
     if ( socket_fd != -1 )
     { /* Only if a socket is actually open */
@@ -195,11 +227,13 @@ void accessClient_finalize(int socket_fd)
 }
 
 
-uint64_t accessClient_read(
+int
+accessClient_read(
         int socket_fd,
         const int cpu,
         const int device,
-        uint32_t reg)
+        uint32_t reg,
+        uint64_t *result)
 {
     AccessDataRecord data;
 
@@ -214,16 +248,16 @@ uint64_t accessClient_read(
 
     if (data.errorcode != ERR_NOERROR)
     {
-        fprintf(stderr, "Failed to read data through daemon: "
-                "daemon returned error %d '%s' for cpu %d reg 0x%x\n",
-                data.errorcode, accessClient_strerror(data.errorcode), cpu, reg);
-        //exit(EXIT_FAILURE);
+        DEBUG_PRINT(DEBUGLEV_DEVELOP, Got error '%s' from access daemon reading reg 0x%X at CPU %d, accessClient_strerror(data.errorcode), data.reg, data.cpu);
+        *result = 0;
+        return accessClient_errno(data.errorcode);
     }
-
-    return data.data;
+    *result = data.data;
+    return 0;
 }
 
-void accessClient_write(
+int 
+accessClient_write(
         int socket_fd,
         const int cpu,
         const int device,
@@ -242,16 +276,11 @@ void accessClient_write(
 
     if (data.errorcode != ERR_NOERROR)
     {
-        fprintf(stderr, "Failed to write data through daemon: "
-                "daemon returned error %d '%s' for cpu %d reg 0x%x\n",
-                data.errorcode, accessClient_strerror(data.errorcode), cpu, reg);
-        //exit(EXIT_FAILURE);
+        DEBUG_PRINT(DEBUGLEV_DEVELOP, Got error '%s' from access daemon writing reg 0x%X at CPU %d, accessClient_strerror(data.errorcode), data.reg, data.cpu);
+        return accessClient_errno(data.errorcode);
     }
 
-    if (data.data != 0x00ULL)
-    {
-        ERROR_PLAIN_PRINT(daemon write failed);
-    }
+    return 0;
 }
 
 
diff --git a/src/affinity.c b/src/affinity.c
index 59b05da..4cb4813 100644
--- a/src/affinity.c
+++ b/src/affinity.c
@@ -5,13 +5,14 @@
  *
  *      Description:  Implementation of affinity module.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com,
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -33,7 +34,6 @@
 #include <stdlib.h>
 #include <stdio.h>
 #include <string.h>
-#include <math.h>
 #include <sys/types.h>
 #include <sys/syscall.h>
 #include <sys/time.h>
@@ -42,13 +42,15 @@
 #include <sched.h>
 #include <time.h>
 #include <pthread.h>
+#include <math.h>
 
-#include <error.h>
 #include <types.h>
+#include <error.h>
+#include <likwid.h>
 #include <numa.h>
 #include <affinity.h>
-#include <cpuid.h>
 #include <tree.h>
+#include <topology.h>
 
 /* #####   EXPORTED VARIABLES   ########################################### */
 
@@ -64,6 +66,8 @@ int affinity_core2node_lookup[MAX_NUM_THREADS];
 static int  affinity_numberOfDomains = 0;
 static AffinityDomain*  domains;
 
+AffinityDomains affinityDomains;
+
 /* #####   FUNCTION DEFINITIONS  -  LOCAL TO THIS SOURCE FILE   ########### */
 
 static int
@@ -81,7 +85,7 @@ getProcessorID(cpu_set_t* cpu_set)
     return processorId;
 }
 
-static void
+static int
 treeFillNextEntries(
     TreeNode* tree,
     int* processorIds,
@@ -101,8 +105,7 @@ treeFillNextEntries(
 
         if ( node == NULL )
         {
-          printf("ERROR: Socket %d not existing!",i);
-          exit(EXIT_FAILURE);
+            DEBUG_PRINT(DEBUGLEV_DEVELOP, Cannot find socket %d in topology tree, i);
         }
     }
 
@@ -114,10 +117,10 @@ treeFillNextEntries(
 
         if ( node == NULL )
         {
-          printf("ERROR: Core %d on socket %d not existing!",i,socketId);
-          exit(EXIT_FAILURE);
+            DEBUG_PRINT(DEBUGLEV_DEVELOP, Cannot find core %d in topology tree, i);
         }
     }
+
     /* Traverse horizontal */
     while ( node != NULL )
     {
@@ -127,12 +130,20 @@ treeFillNextEntries(
 
         while ( thread != NULL )
         {
-            processorIds[numberOfEntries-counter] = thread->id;
-            thread = tree_getNextNode(thread);
-            counter--;
+            if (cpuid_topology.threadPool[thread->id].inCpuSet)
+            {
+                processorIds[numberOfEntries-counter] = thread->id;
+                thread = tree_getNextNode(thread);
+                counter--;
+            }
+            else
+            {
+                thread = tree_getNextNode(thread);
+            }
         }
         node = tree_getNextNode(node);
     }
+    return numberOfEntries-counter;
 }
 
 /* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
@@ -144,7 +155,8 @@ affinity_init()
     int currentDomain;
     int subCounter = 0;
     int offset = 0;
-    int numberOfSocketDomains = cpuid_topology.numSockets;;
+    int tmp;
+    int numberOfSocketDomains = cpuid_topology.numSockets;
     int numberOfNumaDomains = numa_info.numberOfNodes;
     int numberOfProcessorsPerSocket =
         cpuid_topology.numCoresPerSocket * cpuid_topology.numThreadsPerCore;
@@ -156,6 +168,7 @@ affinity_init()
 
     int numberOfProcessorsPerCache =
         cpuid_topology.cacheLevels[cpuid_topology.numCacheLevels-1].threads;
+    int numberOfCoresPerNUMA = 
 
     /* for the cache domain take only into account last level cache and assume
      * all sockets to be uniform. */
@@ -165,36 +178,44 @@ affinity_init()
         (cpuid_topology.numCoresPerSocket/numberOfCoresPerCache);
 
     /* determine total number of domains */
-    if ( numberOfNumaDomains > 1 )
-    {
-        numberOfDomains += numberOfSocketDomains + numberOfCacheDomains + numberOfNumaDomains;
-    }
-    else
-    {
-        numberOfDomains += numberOfSocketDomains + numberOfCacheDomains;
-    }
+    numberOfDomains += numberOfSocketDomains + numberOfCacheDomains + numberOfNumaDomains;
     domains = (AffinityDomain*) malloc(numberOfDomains * sizeof(AffinityDomain));
     if (!domains)
     {
-        fprintf(stderr, "Cannot allocate affinity domain memory\n");
+        fprintf(stderr,"No more memory for %ld bytes for array of affinity domains\n",numberOfDomains * sizeof(AffinityDomain));
         return;
     }
 
     /* Node domain */
-    domains[0].numberOfProcessors = cpuid_topology.numHWThreads;
+    domains[0].numberOfProcessors = cpuid_topology.activeHWThreads;
     domains[0].numberOfCores = cpuid_topology.numSockets * cpuid_topology.numCoresPerSocket;
-    domains[0].processorList = (int*) malloc(cpuid_topology.numHWThreads*sizeof(int));
     domains[0].tag = bformat("N");
+    domains[0].processorList = (int*) malloc(cpuid_topology.numHWThreads*sizeof(int));
+    if (!domains[0].processorList)
+    {
+        fprintf(stderr,"No more memory for %ld bytes for processor list of affinity domain %s\n",
+                cpuid_topology.numHWThreads*sizeof(int), 
+                bdata(domains[0].tag));
+        return;
+    }
     offset = 0;
 
-    for (int i=0; i<numberOfSocketDomains; i++)
+    if (numberOfSocketDomains > 1)
     {
-      treeFillNextEntries(
-          cpuid_topology.topologyTree,
-          domains[0].processorList + offset,
-          i, 0, numberOfProcessorsPerSocket);
-
-      offset += numberOfProcessorsPerSocket;
+        for (int i=0; i<numberOfSocketDomains; i++)
+        {
+          tmp = treeFillNextEntries(cpuid_topology.topologyTree,
+                                    domains[0].processorList + offset,
+                                    i, 0, numberOfProcessorsPerSocket);
+          offset += tmp;
+        }
+    }
+    else
+    {
+        tmp = treeFillNextEntries(cpuid_topology.topologyTree,
+                                  domains[0].processorList,
+                                  0, 0, domains[0].numberOfProcessors);
+        domains[0].numberOfProcessors = tmp;
     }
 
     /* Socket domains */
@@ -202,91 +223,155 @@ affinity_init()
 
     for (int i=0; i < numberOfSocketDomains; i++ )
     {
-      domains[currentDomain + i].numberOfProcessors = numberOfProcessorsPerSocket;
-      domains[currentDomain + i].numberOfCores =  cpuid_topology.numCoresPerSocket;
-      domains[currentDomain + i].processorList = (int*) malloc( domains[currentDomain + i].numberOfProcessors * sizeof(int));
-      domains[currentDomain + i].tag = bformat("S%d", i);
-
-      treeFillNextEntries(
-          cpuid_topology.topologyTree,
-          domains[currentDomain + i].processorList,
-          i, 0, domains[currentDomain + i].numberOfProcessors);
+        domains[currentDomain + i].numberOfProcessors = numberOfProcessorsPerSocket;
+        domains[currentDomain + i].numberOfCores =  cpuid_topology.numCoresPerSocket;
+        domains[currentDomain + i].tag = bformat("S%d", i);
+        domains[currentDomain + i].processorList = (int*) malloc( domains[currentDomain + i].numberOfProcessors * sizeof(int));
+        if (!domains[currentDomain + i].processorList)   
+        {
+            fprintf(stderr,"No more memory for %ld bytes for processor list of affinity domain %s\n",
+                    domains[currentDomain + i].numberOfProcessors * sizeof(int),
+                    bdata(domains[currentDomain + i].tag));
+            return;
+        }
+
+        tmp = treeFillNextEntries(cpuid_topology.topologyTree,
+                                  domains[currentDomain + i].processorList,
+                                  i, 0, domains[currentDomain + i].numberOfProcessors);
+        domains[currentDomain + i].numberOfProcessors = tmp;
     }
 
     /* Cache domains */
     currentDomain += numberOfSocketDomains;
     subCounter = 0;
-
     for (int i=0; i < numberOfSocketDomains; i++ )
     {
-      offset = 0;
-
-      for ( int j=0; j < (numberOfCacheDomains/numberOfSocketDomains); j++ )
-      {
-        domains[currentDomain + subCounter].numberOfProcessors = numberOfProcessorsPerCache;
-        domains[currentDomain + subCounter].numberOfCores =  numberOfCoresPerCache;
-        domains[currentDomain + subCounter].processorList = (int*) malloc(numberOfProcessorsPerCache*sizeof(int));
-        domains[currentDomain + subCounter].tag = bformat("C%d", subCounter);
-
-        treeFillNextEntries(
-            cpuid_topology.topologyTree,
-            domains[currentDomain + subCounter].processorList,
-            i, offset, domains[currentDomain + subCounter].numberOfProcessors);
-
-        offset += numberOfCoresPerCache;
-        subCounter++;
-      }
-    }
+        offset = 0;
 
-    if ( numberOfNumaDomains > 1 )
-    {
-        /* Memory domains */
-        currentDomain += numberOfCacheDomains;
-        subCounter = 0;
+        for ( int j=0; j < (numberOfCacheDomains/numberOfSocketDomains); j++ )
+        {
+            domains[currentDomain + subCounter].numberOfProcessors = numberOfProcessorsPerCache;
+            domains[currentDomain + subCounter].numberOfCores =  numberOfCoresPerCache;
+            domains[currentDomain + subCounter].tag = bformat("C%d", subCounter);
+            domains[currentDomain + subCounter].processorList = (int*) malloc(numberOfProcessorsPerCache*sizeof(int));
+            if (!domains[currentDomain + subCounter].processorList)   
+            {
+                fprintf(stderr,"No more memory for %ld bytes for processor list of affinity domain %s\n",
+                        numberOfProcessorsPerCache*sizeof(int),
+                        bdata(domains[currentDomain + subCounter].tag));
+                return;
+            }
 
+            tmp = treeFillNextEntries(cpuid_topology.topologyTree,
+                                      domains[currentDomain + subCounter].processorList,
+                                      i, offset,
+                                      domains[currentDomain + subCounter].numberOfProcessors);
+            domains[currentDomain + subCounter].numberOfProcessors = tmp;
+            offset += (tmp < numberOfCoresPerCache ? tmp : numberOfCoresPerCache);
+            subCounter++;
+        }
+    }
+    /* Memory domains */
+    currentDomain += numberOfCacheDomains;
+    subCounter = 0;
+    if ((numberOfNumaDomains >= numberOfSocketDomains) && (numberOfNumaDomains > 1))
+    {
         for (int i=0; i < numberOfSocketDomains; i++ )
         {
             offset = 0;
-            for ( int j=0; j < (int)ceil((double)numberOfNumaDomains/numberOfSocketDomains); j++ )
+            for ( int j=0; j < (int)ceil((double)(numberOfNumaDomains)/numberOfSocketDomains); j++ )
             {
-                domains[currentDomain + subCounter].numberOfProcessors = numa_info.nodes[subCounter].numberOfProcessors;
-                domains[currentDomain + subCounter].numberOfCores =  numberOfCoresPerCache;
-                domains[currentDomain + subCounter].processorList = (int*) malloc(numa_info.nodes[subCounter].numberOfProcessors*sizeof(int));
+                domains[currentDomain + subCounter].numberOfProcessors =
+                                numa_info.nodes[subCounter].numberOfProcessors;
+                domains[currentDomain + subCounter].numberOfCores =
+                                numa_info.nodes[subCounter].numberOfProcessors/cpuid_topology.numThreadsPerCore;
                 domains[currentDomain + subCounter].tag = bformat("M%d", subCounter);
-
-                treeFillNextEntries(
-                        cpuid_topology.topologyTree,
-                        domains[currentDomain + subCounter].processorList,
-                        i, offset, domains[currentDomain + subCounter].numberOfProcessors);
-
+                domains[currentDomain + subCounter].processorList =
+                                (int*) malloc(numa_info.nodes[subCounter].numberOfProcessors*sizeof(int));
+                if (!domains[currentDomain + subCounter].processorList)
+                {
+                    fprintf(stderr,"No more memory for %ld bytes for processor list of affinity domain %s\n",
+                            numa_info.nodes[subCounter].numberOfProcessors*sizeof(int),
+                            bdata(domains[currentDomain + subCounter].tag));
+                    return;
+                }
+
+                tmp = treeFillNextEntries(cpuid_topology.topologyTree,
+                                          domains[currentDomain + subCounter].processorList,
+                                          i, offset,
+                                          domains[currentDomain + subCounter].numberOfProcessors);
+                domains[currentDomain + subCounter].numberOfProcessors = tmp;
                 offset += domains[currentDomain + subCounter].numberOfCores;
-
                 subCounter++;
             }
         }
+    }
+    else
+    {
+        offset = 0;
+        int NUMAthreads = numberOfProcessorsPerSocket * numberOfSocketDomains;
+        domains[currentDomain + subCounter].numberOfProcessors = NUMAthreads;
+        domains[currentDomain + subCounter].numberOfCores =  numberOfProcessorsPerSocket;
+        domains[currentDomain + subCounter].tag = bformat("M%d", subCounter);
+        domains[currentDomain + subCounter].processorList = (int*) malloc(NUMAthreads*sizeof(int));
+        if (!domains[currentDomain + subCounter].processorList)
+        {
+            fprintf(stderr,"No more memory for %ld bytes for processor list of affinity domain %s\n",
+                    NUMAthreads*sizeof(int), 
+                    bdata(domains[currentDomain + subCounter].tag));
+            return;
+        }
+        tmp = 0;
+        for (int i=0; i < numberOfSocketDomains; i++ )
+        {
+            tmp += treeFillNextEntries(
+                cpuid_topology.topologyTree,
+                &(domains[currentDomain + subCounter].processorList[offset]),
+                i, 0, numberOfProcessorsPerSocket);
+            offset += numberOfProcessorsPerSocket;
+        }
+        domains[currentDomain + subCounter].numberOfProcessors = tmp;
+    }
 
-        /* This is redundant ;-). Create thread to node lookup */
-        for ( uint32_t i = 0; i < numa_info.numberOfNodes; i++ )
+    /* This is redundant ;-). Create thread to node lookup */
+    for ( uint32_t i = 0; i < numa_info.numberOfNodes; i++ )
+    {
+        for ( int j = 0; j < numa_info.nodes[i].numberOfProcessors; j++ )
         {
-            for ( int j = 0; j < numa_info.nodes[i].numberOfProcessors; j++ )
-            {
-                affinity_core2node_lookup[numa_info.nodes[i].processors[j]] = i;
-            }
+            affinity_core2node_lookup[numa_info.nodes[i].processors[j]] = i;
         }
     }
 
     affinity_numberOfDomains = numberOfDomains;
+    affinityDomains.numberOfAffinityDomains = numberOfDomains;
+    affinityDomains.numberOfSocketDomains = numberOfSocketDomains;
+    affinityDomains.numberOfNumaDomains = numberOfNumaDomains;
+    affinityDomains.numberOfProcessorsPerSocket = numberOfProcessorsPerSocket;
+    affinityDomains.numberOfCacheDomains = numberOfCacheDomains;
+    affinityDomains.numberOfCoresPerCache = numberOfCoresPerCache;
+    affinityDomains.numberOfProcessorsPerCache = numberOfProcessorsPerCache;
+    affinityDomains.domains = domains;
 }
 
 
 void
 affinity_finalize()
 {
-    for ( int i=0; i < affinity_numberOfDomains; i++ )
+    if (!affinityDomains.domains)
+    {
+        return;
+    }
+    for ( int i=0; i < affinityDomains.numberOfAffinityDomains; i++ )
+    {
+        if (affinityDomains.domains[i].processorList)
+        {
+            free(affinityDomains.domains[i].processorList);
+        }
+    }
+    if (affinityDomains.domains)
     {
-        free(domains[i].processorList);
+        free(affinityDomains.domains);
     }
-    free(domains);
 }
 
 
@@ -347,6 +432,20 @@ affinity_pinProcess(int processorId)
     sched_setaffinity(0, sizeof(cpu_set_t), &cpuset);
 }
 
+/*
+ * Restrict the calling process to the set of processors given in
+ * processorIds[0 .. cpu_count-1]. The result of sched_setaffinity()
+ * is not evaluated.
+ */
+void
+affinity_pinProcesses(int cpu_count, int* processorIds)
+{
+    cpu_set_t mask;
+    int idx = 0;
+
+    CPU_ZERO(&mask);
+    while (idx < cpu_count)
+    {
+        CPU_SET(processorIds[idx], &mask);
+        idx++;
+    }
+    sched_setaffinity(0, sizeof(cpu_set_t), &mask);
+}
+
 
 const AffinityDomain*
 affinity_getDomain(bstring domain)
@@ -364,22 +463,24 @@ affinity_getDomain(bstring domain)
 }
 
 void
-affinity_printDomains(FILE* OUTSTREAM)
+affinity_printDomains()
 {
-    if (OUTSTREAM)
+    /* Write index, tag and processor list of every domain to stdout */
+    for ( int d=0; d < affinity_numberOfDomains; d++ )
     {
-        for ( int i=0; i < affinity_numberOfDomains; i++ )
-        {
-            fprintf(OUTSTREAM, "Domain %d:\n", i);
-            fprintf(OUTSTREAM, "\tTag %s:", bdata(domains[i].tag));
+        AffinityDomain* current = &domains[d];
+        printf("Domain %d:\n",d);
+        printf("\tTag %s:",bdata(current->tag));
 
-            for ( uint32_t j=0; j < domains[i].numberOfProcessors; j++ )
-            {
-                fprintf(OUTSTREAM, " %d", domains[i].processorList[j]);
-            }
-            fprintf(OUTSTREAM, "\n");
-            fflush(OUTSTREAM);
+        for ( uint32_t p=0; p < current->numberOfProcessors; p++ )
+        {
+            printf(" %d",current->processorList[p]);
         }
+        printf("\n");
     }
 }
 
+/*
+ * Return a handle to the affinityDomains object that affinity_init()
+ * fills in. No copy is made.
+ */
+AffinityDomains_t
+get_affinityDomains(void)
+{
+    AffinityDomains_t handle = &affinityDomains;
+
+    return handle;
+}
+
diff --git a/src/allocator.c b/src/allocator.c
deleted file mode 100644
index 83e8164..0000000
--- a/src/allocator.c
+++ /dev/null
@@ -1,199 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  allocator.c
- *
- *      Description:  Implementation of allocator module.
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-/* #####   HEADER FILE INCLUDES   ######################################### */
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-
-#include <error.h>
-#include <types.h>
-#include <allocator.h>
-#include <affinity.h>
-
-/* #####   EXPORTED VARIABLES   ########################################### */
-
-
-/* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
-
-
-
-/* #####   VARIABLES  -  LOCAL TO THIS SOURCE FILE   ###################### */
-
-static int numberOfAllocatedVectors = 0;
-static void** allocations;
-
-
-/* #####   FUNCTION DEFINITIONS  -  LOCAL TO THIS SOURCE FILE   ########### */
-
-
-/* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
-
-void
-allocator_init(int numVectors)
-{
-    allocations = (void**) malloc(numVectors * sizeof(void*));
-}
-
-
-void
-allocator_finalize()
-{
-    int i;
-
-    for (i=0; i<numberOfAllocatedVectors; i++)
-    {
-        free(allocations[i]);
-    }
-}
-
-void
-allocator_allocateVector(
-        FILE* OUTSTREAM,
-        void** ptr,
-        int alignment,
-        uint64_t size,
-        int offset,
-        DataType type,
-        bstring domainString)
-{
-    size_t bytesize = 0;
-    const AffinityDomain* domain;
-    int errorCode;
-
-    switch ( type )
-    {
-        case SINGLE:
-        case SINGLE_RAND:
-            bytesize = (size+offset) * sizeof(float);
-            break;
-
-        case DOUBLE_RAND:
-        case DOUBLE:
-            bytesize = (size+offset) * sizeof(double);
-            break;
-    }
-
-    errorCode =  posix_memalign(ptr, alignment, bytesize);
-
-    if (errorCode)
-    {
-        if (errorCode == EINVAL) 
-        {
-            fprintf(stderr,
-                    "Alignment parameter is not a power of two\n");
-            exit(EXIT_FAILURE);
-        }
-        if (errorCode == ENOMEM) 
-        {
-            fprintf(stderr,
-                    "Insufficient memory to fulfill the request\n");
-            exit(EXIT_FAILURE);
-        }
-    }
-
-    if ((*ptr) == NULL)
-    {
-            fprintf(stderr, "posix_memalign failed!\n");
-            exit(EXIT_FAILURE);
-
-    }
-
-    allocations[numberOfAllocatedVectors] = *ptr;
-    numberOfAllocatedVectors++;
-    domain = affinity_getDomain(domainString);
-    affinity_pinProcess(domain->processorList[0]);
-
-    if (OUTSTREAM)
-    {
-        fprintf(OUTSTREAM, "Allocate: Process running on core %d - Vector length %llu Offset %d\n",
-            affinity_processGetProcessorId(),
-            LLU_CAST size,
-            offset);
-    }
-
-    switch ( type )
-    {
-        case SINGLE:
-            {
-                float* sptr = (float*) (*ptr);
-                sptr += offset;
-
-                for ( uint64_t i=0; i < size; i++ )
-                {
-                    sptr[i] = 1.0;
-                }
-                *ptr = (void*) sptr;
-
-            }
-            break;
-
-        case DOUBLE:
-            {
-                double* dptr = (double*) (*ptr);
-                dptr += offset;
-
-                for ( uint64_t i=0; i < size; i++ )
-                {
-                    dptr[i] = 1.0;
-                }
-                *ptr = (void*) dptr;
-            }
-            break;
-        case SINGLE_RAND:
-            {
-                srand((uint64_t)ptr);
-                float* sptr = (float*) (*ptr);
-                sptr += offset;
-
-                for ( uint64_t i=0; i < size; i++ )
-                {
-                    sptr[i] = rand()/((float)RAND_MAX)*2.0-1.0;
-                }
-                *ptr = (void*) sptr;
-            }
-            break;
-        case DOUBLE_RAND:
-            {
-                srand((uint64_t)ptr);
-                double* dptr = (double*) (*ptr);
-                dptr += offset;
-
-                for ( uint64_t i=0; i < size; i++ )
-                {
-                    dptr[i] = rand()/((double)RAND_MAX)*2.0-1.0;
-                }
-                *ptr = (void*) dptr;
-            }
-            break;
-        
-    }
-}
-
diff --git a/src/applications/likwid-agent.lua b/src/applications/likwid-agent.lua
new file mode 100644
index 0000000..76f2906
--- /dev/null
+++ b/src/applications/likwid-agent.lua
@@ -0,0 +1,573 @@
+#!<PREFIX>/bin/likwid-lua
+--[[
+ * =======================================================================================
+ *
+ *      Filename:  likwid-agent.lua
+ *
+ *      Description:  A monitoring daemon for hardware performance counters.
+ *
+ *      Version:   4.0
+ *      Released:  16.6.2015
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at gmail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+]]
+
+package.path = '<PREFIX>/share/lua/?.lua;' .. package.path
+local likwid = require("likwid")
+
+-- Daemon configuration with its built-in defaults. read_daemon_config()
+-- overwrites single entries with the values found in the config file.
+-- Entries set to nil are listed only to name all known keys.
+dconfig = {
+    groupStrings = {},
+    groupData = {},
+    accessmode = 1,
+    duration = 1,
+    groupPath = "<PREFIX>/share/likwid/mongroups",
+    logPath = nil,
+    logStyle = "log",
+    gmetric = false,
+    gmetricPath = "gmetric",
+    gmetricConfig = nil,
+    gmetricHasUnit = false,
+    rrd = false,
+    rrdPath = ".",
+    syslog = false,
+    syslogPrio = "local0.notice",
+    stdout = false,
+}
+
+-- Per group list of metric descriptions stored in the RRD, see create_rrd()
+rrdconfig = {}
+
+
+-- Read the daemon configuration file and store the recognized settings in
+-- the global dconfig table. Lines starting with '#' are ignored.
+-- Exits the program if the file name is invalid or the file cannot be opened.
+-- Numeric options (DURATION, ACCESSMODE) keep their previous value when the
+-- argument is not a number.
+local function read_daemon_config(filename)
+    if filename == nil or filename == "" then
+        print("Not a valid config filename")
+        os.exit(1)
+    end
+    local f = io.open(filename, "r")
+    if f == nil then
+        print("Cannot open config file "..filename)
+        os.exit(1)
+    end
+    local t = f:read("*all")
+    f:close()
+
+    -- Split a line with the whitespace pattern and drop the first entry
+    -- (the keyword), leaving only the arguments that follow it.
+    local function arguments(line)
+        local linelist = likwid.stringsplit(line, "%s+", nil, "%s+")
+        table.remove(linelist, 1)
+        return linelist
+    end
+
+    for _, line in pairs(likwid.stringsplit(t,"\n")) do
+
+        if not line:match("^#") then
+            if line:match("^GROUPPATH%a*") ~= nil then
+                dconfig["groupPath"] = arguments(line)[1]
+            end
+
+            if line:match("^EVENTSET%a*") ~= nil then
+                local linelist = arguments(line)
+                -- Walk backwards like before to keep the resulting group
+                -- order, but stop at index 1 and skip empty fragments.
+                for i=#linelist,1,-1 do
+                    if linelist[i] ~= "" then
+                        table.insert(dconfig["groupStrings"], linelist[i])
+                    end
+                end
+            end
+
+            if line:match("^DURATION%a*") ~= nil then
+                local value = tonumber(arguments(line)[1])
+                if value ~= nil then
+                    dconfig["duration"] = value
+                else
+                    print("DURATION argument not valid, must be a number. Keeping "..tostring(dconfig["duration"])..".")
+                end
+            end
+
+            if line:match("^ACCESSMODE%a*") ~= nil then
+                local value = tonumber(arguments(line)[1])
+                if value ~= nil then
+                    dconfig["accessmode"] = value
+                else
+                    print("ACCESSMODE argument not valid, must be a number. Keeping "..tostring(dconfig["accessmode"])..".")
+                end
+            end
+
+            if line:match("^LOGPATH%a*") ~= nil then
+                dconfig["logPath"] = arguments(line)[1]
+            end
+
+            if line:match("^LOGSTYLE%a*") ~= nil then
+                local style = arguments(line)[1]
+                if style ~= "log" and style ~= "update" then
+                    print("LOGSTYLE argument not valid, available are log and update. Fallback to log.")
+                else
+                    dconfig["logStyle"] = style
+                end
+            end
+
+            if line:match("^GMETRIC%s%a*") ~= nil then
+                if arguments(line)[1] == "True" then
+                    dconfig["gmetric"] = true
+                end
+            end
+
+            if line:match("^GMETRICPATH%a*") ~= nil then
+                dconfig["gmetricPath"] = arguments(line)[1]
+            end
+
+            if line:match("^GMETRICCONFIG%a*") ~= nil then
+                dconfig["gmetricConfig"] = arguments(line)[1]
+            end
+
+            -- Require whitespace after the keyword (as for GMETRIC and
+            -- SYSLOG) so that RRDPATH lines are not treated as RRD lines.
+            if line:match("^RRD%s%a*") ~= nil then
+                if arguments(line)[1] == "True" then
+                    dconfig["rrd"] = true
+                end
+            end
+
+            if line:match("^RRDPATH%a*") ~= nil then
+                dconfig["rrdPath"] = arguments(line)[1]
+            end
+
+            if line:match("^SYSLOG%s%a*") ~= nil then
+                if arguments(line)[1] == "True" then
+                    dconfig["syslog"] = true
+                end
+            end
+
+            if line:match("^SYSLOGPRIO%a*") ~= nil then
+                dconfig["syslogPrio"] = arguments(line)[1]
+            end
+        end
+    end
+end
+
+-- Add up results[thread][key] over all threads reported by
+-- likwid.getNumberOfThreads().
+local function calc_sum(key, results)
+    local total = 0.0
+    for thread = 1, likwid.getNumberOfThreads() do
+        total = total + results[thread][key]
+    end
+    return total
+end
+
+-- Arithmetic mean of results[thread][key] over all threads reported by
+-- likwid.getNumberOfThreads().
+local function calc_avg(key, results)
+    local threadCount = likwid.getNumberOfThreads()
+    local total = 0.0
+    local thread = 1
+    while thread <= threadCount do
+        total = total + results[thread][key]
+        thread = thread + 1
+    end
+    return total/threadCount
+end
+
+-- Smallest results[thread][key] over all threads reported by
+-- likwid.getNumberOfThreads(). Returns math.huge if there is no thread.
+local function calc_min(key, results)
+    local smallest = math.huge
+    for thread = 1, likwid.getNumberOfThreads() do
+        local value = results[thread][key]
+        if value < smallest then
+            smallest = value
+        end
+    end
+    return smallest
+end
+
+-- Largest results[thread][key] over all threads reported by
+-- likwid.getNumberOfThreads(). Returns 0 if there is no thread.
+local function calc_max(key, results)
+    -- Seed the maximum from the data itself instead of 0, otherwise a set
+    -- of purely negative values would wrongly report 0 as its maximum.
+    local max = nil
+    local numThreads = likwid.getNumberOfThreads()
+    for thread=1, numThreads do
+        local value = results[thread][key]
+        if max == nil or value > max then
+            max = value
+        end
+    end
+    return max or 0
+end
+
+-- Check that the configured log path can be listed with 'ls'.
+-- Returns true on success, false otherwise.
+local function check_logfile()
+    local g = io.popen("ls "..dconfig["logPath"], "r")
+    if g == nil then
+        print("Logfile path ".. dconfig["logPath"].. " does not exist.")
+        return false
+    end
+    -- io.popen hands out a handle even if the command fails, the outcome is
+    -- only visible in the status returned when closing the handle.
+    -- Drain the output first so that 'ls' can terminate normally.
+    g:read("*a")
+    local ok = g:close()
+    if not ok then
+        print("Logfile path ".. dconfig["logPath"].. " does not exist.")
+        return false
+    end
+    return true
+end
+
+-- Write all entries of results except "Timestamp" to a log file below
+-- dconfig["logPath"], one line "<timestamp>,<key>,<value>" per entry.
+-- The file is appended to, or rewritten if the log style is "update".
+-- The file name carries the group string, or the group ID if the group
+-- string contains a ':'.
+local function logfile(groupID, results)
+    -- keep the helpers local, they must not leak into the global namespace
+    local open_function = "a"
+    if dconfig["logStyle"] == "update" then
+        open_function = "w"
+    end
+    local groupString = dconfig["groupData"][groupID]["GroupString"]
+    local filename = "likwid."..tostring(groupID)..".log"
+    if not groupString:find(":") then
+        filename = "likwid."..groupString..".log"
+    end
+    local f = io.open(dconfig["logPath"].."/"..filename, open_function)
+    if f == nil then
+        print("Cannot open logfile ".. dconfig["logPath"].."/"..filename)
+        return
+    end
+    local timestamp = results["Timestamp"]
+    for k,v in pairs(results) do
+        if k ~= "Timestamp" then
+            -- parentheses are stripped from the key before writing
+            f:write(timestamp..","..k:gsub("%(",""):gsub("%)","").. ","..v.."\n")
+        end
+    end
+    f:close()
+end
+
+-- Check whether the 'logger' tool can be found with 'which'.
+-- Returns true if 'which' reports success, false otherwise.
+local function check_logger()
+    local cmd = "which logger"
+    local f = io.popen(cmd)
+    if f == nil then
+        return false
+    end
+    -- io.popen hands out a handle even if the command fails, the outcome is
+    -- only visible in the status returned when closing the handle.
+    f:read("*a")
+    local ok = f:close()
+    if not ok then
+        return false
+    end
+    return true
+end
+
+-- Send all entries of results except "Timestamp" to syslog by calling the
+-- 'logger' tool once per entry with the tag LIKWID. If a syslog priority
+-- is configured, it is passed with -p.
+local function logger(results)
+    local cmd = "logger -t LIKWID "
+    if dconfig["syslogPrio"] ~= nil then
+        -- extend cmd itself, otherwise the priority is never passed on
+        cmd = cmd .."-p "..dconfig["syslogPrio"].." "
+    end
+    for k,v in pairs(results) do
+        if k ~= "Timestamp" then
+            -- parentheses are stripped from the key before logging
+            local resultcmd = cmd .. k:gsub("%(",""):gsub("%)","") .. " " ..v
+            local f = io.popen(resultcmd)
+            if f == nil then
+                print("Cannot use logger, maybe not in $PATH")
+                return
+            end
+            f:close()
+        end
+    end
+
+end
+
+-- Run the configured gmetric executable with -h and look for "units=" in
+-- its output to find out whether it accepts a unit option. The finding is
+-- stored in dconfig["gmetricHasUnit"].
+-- Returns false if no path is configured or the pipe cannot be opened.
+local function check_gmetric()
+    local tool = dconfig["gmetricPath"]
+    if tool == nil then
+        return false
+    end
+    local pipe = io.popen(tool.." -h","r")
+    if pipe == nil then
+        return false
+    end
+    local helptext = pipe:read("*a")
+    local hasUnit = helptext:match("units=")
+    if hasUnit then
+        dconfig["gmetricHasUnit"] = true
+    end
+    pipe:close()
+    return true
+end
+
+-- Call the configured gmetric executable once per entry of results.
+-- A key of the form "name [unit]" is split into name and unit; the unit is
+-- only passed if check_gmetric() found unit support. Numeric values are
+-- sent formatted with %f, all other values as their string representation.
+local function gmetric(gdata, results)
+    if dconfig["gmetricPath"] == nil then
+        return
+    end
+    -- local on purpose, the list must not leak into the global namespace
+    local execList = {}
+    table.insert(execList, dconfig["gmetricPath"])
+    if dconfig["gmetricConfig"] ~= nil then
+        table.insert(execList, "-c")
+        table.insert(execList, dconfig["gmetricConfig"])
+    end
+    if gdata["GroupString"] ~= gdata["EventString"] then
+        table.insert(execList, "-g")
+        table.insert(execList, gdata["GroupString"])
+    end
+    -- the leading part of the command line is the same for every entry
+    local prefix = table.concat(execList, " ")
+    for k,v in pairs(results) do
+        local execStr = prefix
+        if k ~= "Timestamp" then
+            execStr = execStr .. " -t double "
+        else
+            execStr = execStr .. " -t string "
+        end
+        local name = k
+        local unit = nil
+        local s = k:find("%[")
+        if s ~= nil then
+            -- text before '[' is the name, text up to the last character
+            -- is the unit; both are trimmed
+            name = k:sub(0,s-1):gsub("^%s*(.-)%s*$", "%1")
+            unit = k:sub(s+1,k:len()-1):gsub("^%s*(.-)%s*$", "%1")
+        end
+        execStr = execStr .. " --name=\"" .. name .."\""
+        if dconfig["gmetricHasUnit"] and unit ~= nil then
+            execStr = execStr .. " --units=\"" .. unit .."\""
+        end
+        local value = tonumber(v)
+        if value ~= nil then
+            execStr = execStr .. " --value=\"" .. string.format("%f", value) .."\""
+        elseif v ~= nil then
+            -- not a number (e.g. a string value): pass it on unchanged
+            execStr = execStr .. " --value=\"" .. tostring(v) .."\""
+        else
+            execStr = execStr .. " --value=\"0\""
+        end
+        os.execute(execStr)
+    end
+end
+
+-- Turn a metric description into a data source name: blanks become
+-- underscores, the characters ( ) [ ] / are removed and the result is cut
+-- to at most 19 characters.
+local function normalize_rrd_string(str)
+    local cleaned = str:gsub(" ", "_"):gsub("[%(%)%[%]/]", "")
+    local shortened = cleaned:sub(1,19)
+    return shortened
+end
+
+-- Check whether 'rrdtool' can be found with 'which'.
+-- Returns true if 'which' reports success, false otherwise.
+local function check_rrd()
+    -- Opening a pipe succeeds even if the command does not exist, so the
+    -- lookup is delegated to 'which' and judged by its exit status.
+    local f = io.popen("which rrdtool")
+    if f == nil then
+        return false
+    end
+    f:read("*a")
+    local ok = f:close()
+    if not ok then
+        return false
+    end
+    return true
+end
+
+-- Create the round robin database <rrdPath>/<GroupString>.rrd with one
+-- GAUGE data source per metric of the group and record the metric
+-- descriptions in rrdconfig so that rrd() can update them in the same order.
+-- The step width is numGroups*duration.
+local function create_rrd(numGroups, duration, groupData)
+    local rrdname = dconfig["rrdPath"].."/".. groupData["GroupString"] .. ".rrd"
+    local rrdstring = "rrdtool create "..rrdname.." --step ".. tostring(numGroups*duration)
+    if rrdconfig[groupData["GroupString"]] == nil then
+        rrdconfig[groupData["GroupString"]] = {}
+    end
+    -- the parameter is called groupData, 'groupdata' would be an unset global
+    for i, metric in pairs(groupData["Metrics"]) do
+        rrdstring = rrdstring .. " DS"..":" .. normalize_rrd_string(metric["description"]) ..":GAUGE:"
+        rrdstring = rrdstring ..tostring(numGroups*duration) ..":0:U"
+        table.insert(rrdconfig[groupData["GroupString"]], metric["description"])
+    end
+    -- Archives as {seconds, rows}: 60/10, then "hours of last day" (3600/24)
+    -- and "day of last month" (86400/31). Each one is created with
+    -- average, min and max consolidation.
+    local archives = { {60, 10}, {3600, 24}, {86400, 31} }
+    local consolidations = { "AVERAGE", "MIN", "MAX" }
+    for _, archive in ipairs(archives) do
+        local steps = tostring(archive[1]/duration)
+        local rows = tostring(archive[2])
+        for _, cf in ipairs(consolidations) do
+            rrdstring = rrdstring .." RRA:"..cf..":0.5:" .. steps..":"..rows
+        end
+    end
+    os.execute(rrdstring)
+end
+
+-- Update the round robin database of the group with the current values.
+-- The values are taken from results in the order of the metric
+-- descriptions stored in rrdconfig for this group.
+local function rrd(groupData, results)
+    local group = groupData["GroupString"]
+    local fields = { "rrdtool update "..dconfig["rrdPath"].."/".. group .. ".rrd".." N" }
+    for _, id in pairs(rrdconfig[group]) do
+        fields[#fields + 1] = tostring(results[id])
+    end
+    os.execute(table.concat(fields, ":"))
+end
+
+-- Read commandline arguments
+if #arg ~= 1 then
+    print("Usage:")
+    print(arg[0] .. " <configFile>")
+    os.exit(1)
+end
+
+-- Get architectural information for the current system
+local cpuinfo = likwid.getCpuInfo()
+local cputopo = likwid.getCpuTopology()
+-- Read LIKWID configuration file, mainly to avoid topology lookup
+local config = likwid.getConfiguration()
+-- Read LIKWID daemon configuration file
+read_daemon_config(arg[1])
+
+if #dconfig["groupStrings"] == 0 then
+    print("No monitoring groups defined, exiting...")
+    os.exit(1)
+end
+if dconfig["duration"] == 0 then
+    print("Invalid value 0 for duration. Sanitizing to 1 second.")
+    dconfig["duration"] = 1
+end
+
+if dconfig["syslog"] then
+    if check_logger() == false then
+        print("Cannot find tool logger, disabling syslog output.")
+        dconfig["syslog"] = false
+    end
+end
+if dconfig["logPath"] then
+    if check_logfile() == false then
+        print("Cannot create logfile path "..dconfig["logPath"]..". Deactivating logfile output.")
+        dconfig["logPath"] = nil
+    end
+end
+if dconfig["gmetric"] then
+    if check_gmetric() == false then
+        print("Cannot find gmetric using path "..dconfig["gmetricPath"]..". Deactivating gmetric output.")
+        dconfig["gmetric"] = false
+    end
+end
+if dconfig["rrd"] then
+    if check_rrd() == false then
+        print("Cannot find rrdtool. Deactivating rrd output.")
+        dconfig["rrd"] = false
+    end
+end
+
+-- Activate output to stdout only if no other backend is set
+if dconfig["logPath"] == nil and dconfig["rrd"] == false and dconfig["gmetric"] == false and dconfig["syslog"] == false then
+    dconfig["stdout"] = true
+end
+
+-- Add all cpus to the cpulist
+local cpulist = {}
+for i, thread in pairs(cputopo["threadPool"]) do
+    table.insert(cpulist, thread["apicId"])
+end
+
+-- Select access mode to msr devices, try configuration file first; missing, non-numeric or out-of-range values fall back to 1
+access_mode = tonumber(dconfig["accessmode"])
+if access_mode == nil or access_mode < 0 or access_mode > 1 then
+    access_mode = 1
+end
+if likwid.setAccessClientMode(access_mode) ~= 0 then
+    os.exit(1)
+end
+
+-- Select group directory for monitoring
+likwid.groupfolder = dconfig["groupPath"]
+
+-- Evaluate the event sets listed in the daemon configuration. If it's a group, resolve to events
+for k,v in pairs(dconfig["groupStrings"]) do
+    local gdata = nil
+    gdata = likwid.get_groupdata(v)
+    if gdata ~= nil then
+        table.insert(dconfig["groupData"], gdata)
+    end
+end
+if #dconfig["groupData"] == 0 then
+    print("None of the event strings can be added for current architecture.")
+    os.exit(1)
+end
+
+-- Initialize likwid perfctr
+likwid.init(cputopo["numHWThreads"], cpulist)
+for k,v in pairs(dconfig["groupData"]) do
+    local groupID = likwid.addEventSet(v["EventString"])
+    if dconfig["rrd"] then
+        create_rrd(#dconfig["groupData"], dconfig["duration"], v)
+    end
+end
+likwid.catchSignal()
+while likwid.getSignalState() == 0 do
+    -- Each pass measures every configured group once; the signal state is only re-checked between passes.
+    for groupID,gdata in pairs(dconfig["groupData"]) do
+        local old_mtime = likwid_getRuntimeOfGroup(groupID)
+        local cur_time = os.time()
+        likwid.setupCounters(groupID)
+
+        -- Perform the measurement
+        likwid.startCounters()
+        sleep(dconfig["duration"])
+        likwid.stopCounters()
+
+        -- temporary table collecting counter values for each thread for metric calculation
+        local threadResults = {}
+        local mtime = likwid_getRuntimeOfGroup(groupID)
+        local clock = likwid_getCpuClock()
+
+        for event=1, likwid.getNumberOfEvents(groupID) do
+            for thread=1, likwid.getNumberOfThreads() do
+                if threadResults[thread] == nil then
+                    threadResults[thread] = {}
+                end
+                threadResults[thread]["time"] = mtime - old_mtime
+                threadResults[thread]["inverseClock"] = 1.0/clock
+                local result = likwid.getResult(groupID, event, thread)
+                if threadResults[thread][gdata["Events"][event]["Counter"]] == nil then -- first value wins if a counter name repeats
+                    threadResults[thread][gdata["Events"][event]["Counter"]] = result
+                end
+            end
+        end
+        -- Evaluate the group's metric formulas per thread, reduce them as requested by each
+        -- metric description and pass the result to every enabled output backend.
+        if gdata["Metrics"] then
+            local threadOutput = {}
+            for i, metric in pairs(gdata["Metrics"]) do
+                for thread=1, likwid.getNumberOfThreads() do
+                    if threadOutput[thread] == nil then
+                        threadOutput[thread] = {}
+                    end
+                    local result = likwid.calculate_metric(metric["formula"], threadResults[thread])
+                    threadOutput[thread][metric["description"]] = result
+                end
+            end
+            local output = {}
+            output["Timestamp"] = os.date("%m/%d/%Y_%X",cur_time)
+            for i, metric in pairs(gdata["Metrics"]) do
+                -- The first word of the description (AVG/SUM/MIN/MAX/ONCE) selects the
+                -- reduction helper; any other word yields one output entry per thread.
+                local itemlist = likwid.stringsplit(metric["description"], "%s+", nil, "%s+")
+                local func = itemlist[1]
+                if func == "AVG" then
+                    output[metric["description"]] = calc_avg(metric["description"], threadOutput)
+                elseif func == "SUM" then
+                    output[metric["description"]] = calc_sum(metric["description"], threadOutput)
+                elseif func == "MIN" then
+                    output[metric["description"]] = calc_min(metric["description"], threadOutput)
+                elseif func == "MAX" then
+                    output[metric["description"]] = calc_max(metric["description"], threadOutput)
+                elseif func == "ONCE" then
+                    output[metric["description"]] = threadOutput[1][metric["description"]]
+                else
+                    for thread=1, likwid.getNumberOfThreads() do
+                        output["T"..cpulist[thread] .. " " .. metric["description"]] = threadOutput[thread][metric["description"]]
+                    end
+                end
+            end
+            if dconfig["logPath"] ~= nil then
+                logfile(groupID, output)
+            end
+            if dconfig["syslog"] then
+                logger(output)
+            end
+            if dconfig["gmetric"] then
+                gmetric(gdata, output)
+            end
+            if dconfig["rrd"] then
+                rrd(gdata, output)
+            end
+            if dconfig["stdout"] then
+                for i,o in pairs(output) do
+                    print(i,o)
+                end
+                print(likwid.hline)
+            end
+        end
+    end
+end
+
+-- Finalize likwid perfctr
+likwid.catchSignal()
+likwid.finalize()
+likwid.putConfiguration()
+likwid.putTopology()
diff --git a/src/applications/likwid-bench.c b/src/applications/likwid-bench.c
deleted file mode 100644
index 15f6f0d..0000000
--- a/src/applications/likwid-bench.c
+++ /dev/null
@@ -1,536 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  likwid-bench.c
- *
- *      Description:  A flexible and extensible benchmarking toolbox
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-#include <stdlib.h>
-#include <errno.h>
-#include <stdio.h>
-#include <string.h>
-#include <sys/types.h>
-#include <unistd.h>
-#include <ctype.h>
-
-#include <bstrlib.h>
-#include <types.h>
-#include <error.h>
-#include <cpuid.h>
-#include <numa.h>
-#include <affinity.h>
-#include <timer.h>
-#include <threads.h>
-#include <barrier.h>
-#include <testcases.h>
-#include <strUtil.h>
-#include <allocator.h>
-
-#include <likwid.h>
-#ifdef PAPI
-#include <papi.h>
-#include <omp.h>
-#endif
-
-extern void* runTest(void* arg);
-
-/* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
-
-#define HELP_MSG \
-    fprintf(stdout, "Threaded Memory Hierarchy Benchmark --  Version  %d.%d \n\n",VERSION,RELEASE); \
-    fprintf(stdout, "\n"); \
-    fprintf(stdout, "Supported Options:\n"); \
-    fprintf(stdout, "-h\t Help message\n"); \
-    fprintf(stdout, "-v\t Version information\n"); \
-    fprintf(stdout, "-q\t Silent without output\n"); \
-    fprintf(stdout, "-a\t list available benchmarks \n"); \
-    fprintf(stdout, "-p\t list available thread domains\n"); \
-    fprintf(stdout, "-l <TEST>\t list properties of benchmark \n"); \
-    fprintf(stdout, "-i <INT>\t number of iterations \n"); \
-    fprintf(stdout, "-g <INT>\t number of workgroups (mandatory)\n"); \
-    fprintf(stdout, "-t <TEST>\t type of test \n"); \
-    fprintf(stdout, "-w\t <thread_domain>:<size>[:<num_threads>[:<chunk size>:<stride>][-<streamId>:<domain_id>[:<offset>]], size in kB, MB or GB  (mandatory)\n"); \
-    fprintf(stdout, "Processors are in compact ordering. Optionally every stream can be placed. Either no stream or all streams must be placed. Multiple streams are separated by commas.\n"); \
-    fprintf(stdout, "Usage: likwid-bench -t copy -i 1000 -g 1 -w S0:100kB:10:1:2 \n"); \
-    fprintf(stdout, "\tRun 10 threads on socket 0 using physical cores only (presuming SMT2 system).\n"); \
-    fprintf(stdout, "Example with data placement: likwid-bench -t copy -i 1000 -g 1 -w S0:100kB:20-0:S1,1:S1 \n"); \
-    fprintf(stdout, "\tRun 20 threads on socket 0 and place both arrays of the copy test case on socket 1.\n"); \
-    fflush(stdout);
-
-#define VERSION_MSG \
-    fprintf(stdout, "likwid-bench   %d.%d \n\n",VERSION,RELEASE); \
-    fflush(stdout);
-
-/* #####   FUNCTION DEFINITIONS  -  LOCAL TO THIS SOURCE FILE  ############ */
-
-void copyThreadData(ThreadUserData* src,ThreadUserData* dst)
-{
-    uint32_t i;
-
-    *dst = *src;
-    dst->processors = (int*) malloc(src->numberOfThreads*sizeof(int));
-    dst->streams = (void**) malloc(src->test->streams*sizeof(void*));
-
-    for (i=0; i<  src->test->streams; i++)
-    {
-        dst->streams[i] = src->streams[i];
-    }
-
-    for (i=0; i<src->numberOfThreads; i++)
-    {
-        dst->processors[i] = src->processors[i];
-    }
-}
-
-
-
-/* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
-
-int main(int argc, char** argv)
-{
-    int iter = 100;
-    uint32_t i;
-    uint32_t j;
-    int globalNumberOfThreads = 0;
-    int optPrintDomains = 0;
-    int c;
-    ThreadUserData myData;
-    bstring testcase = bfromcstr("none");
-    uint32_t numberOfWorkgroups = 0;
-    int tmp = 0;
-    double time;
-    const TestCase* test = NULL;
-    Workgroup* currentWorkgroup = NULL;
-    Workgroup* groups = NULL;
-    FILE* OUTSTREAM = stdout;
-
-    if (cpuid_init() == EXIT_FAILURE)
-    {
-        ERROR_PLAIN_PRINT(Unsupported processor!);
-    }
-    numa_init();
-    affinity_init();
-
-    /* Handling of command line options */
-    if (argc ==  1)
-    {
-        HELP_MSG;
-        affinity_finalize();
-        exit(EXIT_SUCCESS);
-    }
-    opterr = 0;
-    while ((c = getopt (argc, argv, "g:w:t:i:l:aphvq")) != -1) {
-        switch (c)
-        {
-            case 'h':
-                HELP_MSG;
-                affinity_finalize();
-                if (groups)
-                {
-                    free(groups);
-                }
-                exit (EXIT_SUCCESS);
-            case 'v':
-                VERSION_MSG;
-                affinity_finalize();
-                if (groups)
-                {
-                    free(groups);
-                }
-                exit (EXIT_SUCCESS);
-            case 'a':
-                if (OUTSTREAM)
-                {
-                    fprintf(OUTSTREAM, TESTS"\n");
-                    fflush(OUTSTREAM);
-                }
-                affinity_finalize();
-                if (groups)
-                {
-                    free(groups);
-                }
-                exit (EXIT_SUCCESS);
-            case 'q':
-                OUTSTREAM = NULL;
-                break;
-            case 'w':
-                tmp--;
-
-                if (tmp == -1)
-                {
-                    fprintf (stderr, "More workgroups configured than allocated!\n"
-                        "Did you forget to set the number of workgroups with -g?\n");
-                    affinity_finalize();
-                    if (groups)
-                    {
-                        free(groups);
-                    }
-                    return EXIT_FAILURE;
-                }
-                if (!test)
-                {
-                    fprintf (stderr, "You need to specify a test case first!\n");
-                    affinity_finalize();
-                    if (groups)
-                    {
-                        free(groups);
-                    }
-                    return EXIT_FAILURE;
-                }
-                testcase = bfromcstr(optarg);
-                currentWorkgroup = groups+tmp;  /*FIXME*/
-                bstr_to_workgroup(currentWorkgroup, testcase, test->type, test->streams);
-                bdestroy(testcase);
-
-                for (i=0; i<  test->streams; i++)
-                {
-                    if (currentWorkgroup->streams[i].offset%test->stride)
-                    {
-                        fprintf (stderr, "Stream %d: offset is not a multiple of stride!\n",i);
-                        affinity_finalize();
-                        if (groups)
-                        {
-                            free(groups);
-                        }
-                        return EXIT_FAILURE;
-                    }
-
-                    allocator_allocateVector(OUTSTREAM,
-                            &(currentWorkgroup->streams[i].ptr),
-                            PAGE_ALIGNMENT,
-                            currentWorkgroup->size,
-                            currentWorkgroup->streams[i].offset,
-                            test->type,
-                            currentWorkgroup->streams[i].domain);
-                }
-
-                break;
-            case 'i':
-                iter =  atoi(optarg);
-                if (iter <= 0)
-                {
-                    fprintf(stderr, "Iterations must be greater than 0.\n");
-                    exit(EXIT_FAILURE);
-                }
-                break;
-            case 'l':
-                testcase = bfromcstr(optarg);
-                for (i=0; i<NUMKERNELS; i++)
-                {
-                    if (biseqcstr(testcase, kernels[i].name))
-                    {
-                        test = kernels+i;
-                        break;
-                    }
-                }
-
-                if (biseqcstr(testcase,"none") || !test)
-                {
-                    fprintf (stderr, "Unknown test case %s\n",optarg);
-                    if (OUTSTREAM)
-                    {
-                        fprintf(OUTSTREAM, "Available test cases:\n");
-                        fprintf(OUTSTREAM, TESTS"\n");
-                        fflush(OUTSTREAM);
-                    }
-                    affinity_finalize();
-                    if (groups)
-                    {
-                        free(groups);
-                    }
-                    return EXIT_FAILURE;
-                }
-                else
-                {
-                    if (OUTSTREAM)
-                    {
-                        fprintf(OUTSTREAM, "Name: %s\n",test->name);
-                        fprintf(OUTSTREAM, "Number of streams: %d\n",test->streams);
-                        fprintf(OUTSTREAM, "Loop stride: %d\n",test->stride);
-                        fprintf(OUTSTREAM, "Flops: %d\n", (int) test->flops);
-                        fprintf(OUTSTREAM, "Bytes: %d\n",test->bytes);
-                        switch (test->type)
-                        {
-                            case SINGLE:
-                                fprintf(OUTSTREAM, "Data Type: Single precision float\n");
-                                break;
-                            case DOUBLE:
-                                fprintf(OUTSTREAM, "Data Type: Double precision float\n");
-                                break;
-                        }
-                        fflush(OUTSTREAM);
-                    }
-                }
-                bdestroy(testcase);
-                affinity_finalize();
-                if (groups)
-                {
-                    free(groups);
-                }
-                exit (EXIT_SUCCESS);
-
-                break;
-            case 'p':
-                optPrintDomains = 1;
-                break;
-            case 'g':
-                numberOfWorkgroups =  atoi(optarg);
-                if (numberOfWorkgroups <= 0)
-                {
-                    fprintf(stderr, "Number of Workgroups must be 1 or greater.\n");
-                    exit(EXIT_FAILURE);
-                }
-                allocator_init(numberOfWorkgroups * MAX_STREAMS);
-                tmp = numberOfWorkgroups;
-                groups = (Workgroup*) malloc(numberOfWorkgroups*sizeof(Workgroup));
-                break;
-            case 't':
-                testcase = bfromcstr(optarg);
-
-                for (i=0; i<NUMKERNELS; i++)
-                {
-                    if (biseqcstr(testcase, kernels[i].name))
-                    {
-                        test = kernels+i;
-                        break;
-                    }
-                }
-                if (biseqcstr(testcase,"none"))
-                {
-                    fprintf (stderr, "Unknown test case %s\n",optarg);
-                    affinity_finalize();
-                    if (groups)
-                    {
-                        free(groups);
-                    }
-                    return EXIT_FAILURE;
-                }
-                bdestroy(testcase);
-                break;
-            case '?':
-                if (optopt == 'l' || optopt == 'g' || optopt == 'w' || 
-                        optopt == 't' || optopt == 'i')
-                    fprintf (stderr, "Option `-%c' requires an argument.\n", optopt);
-                else if (isprint (optopt))
-                    fprintf (stderr, "Unknown option `-%c'.\n", optopt);
-                else
-                    fprintf (stderr,
-                            "Unknown option character `\\x%x'.\n",
-                            optopt);
-                affinity_finalize();
-                if (groups)
-                {
-                    free(groups);
-                }
-                return EXIT_FAILURE;
-            default:
-                HELP_MSG;
-        }
-    }
-
-    if (numberOfWorkgroups == 0 && !optPrintDomains)
-    {
-        fprintf(stderr, "Number of Workgroups must be 1 or greater.\n");
-        affinity_finalize();
-        allocator_finalize();
-        if (groups)
-        {
-            free(groups);
-        }
-        exit(EXIT_FAILURE);
-    }
-    if (tmp > 0 && iter > 0)
-    {
-        fprintf(stderr, "%d workgroups requested but only %d given on commandline\n",numberOfWorkgroups,numberOfWorkgroups-tmp);
-        affinity_finalize();
-        allocator_finalize();
-        if (groups)
-        {
-            free(groups);
-        }
-        exit(EXIT_FAILURE);
-    }
-    if (iter <= 0)
-    {
-        fprintf(stderr,"Iterations must be greater than 0\n");
-        affinity_finalize();
-        allocator_finalize();
-        if (groups)
-        {
-            free(groups);
-        }
-        exit(EXIT_FAILURE);
-    }
-    if (test && !(currentWorkgroup || groups))
-    {
-        fprintf(stderr, "Workgroups must be set on commandline\n");
-        affinity_finalize();
-        allocator_finalize();
-        if (groups)
-        {
-            free(groups);
-        }
-        exit(EXIT_FAILURE);
-    }
-
-    if (optPrintDomains)
-    {
-        affinity_printDomains(OUTSTREAM);
-        affinity_finalize();
-        allocator_finalize();
-        if (groups)
-        {
-            free(groups);
-        }
-        exit (EXIT_SUCCESS);
-    }
-    timer_init();
-
-    /* :WARNING:05/04/2010 08:58:05 AM:jt: At the moment the thread
-     * module only allows equally sized thread groups*/
-    for (i=0; i<numberOfWorkgroups; i++)
-    {
-        globalNumberOfThreads += groups[i].numberOfThreads;
-    }
-
-    threads_init(OUTSTREAM, globalNumberOfThreads);
-    threads_createGroups(numberOfWorkgroups);
-
-    /* we configure global barriers only */
-    barrier_init(1);
-    barrier_registerGroup(globalNumberOfThreads);
-
-#ifdef PERFMON
-    if (OUTSTREAM)
-    {
-        fprintf(OUTSTREAM, "Using likwid\n");
-        fflush(OUTSTREAM);
-    }
-    likwid_markerInit();
-#endif
-#ifdef PAPI
-    if (OUTSTREAM)
-    {
-        fprintf(OUTSTREAM, "Using PAPI\n");
-    }
-    PAPI_library_init (PAPI_VER_CURRENT);
-    PAPI_thread_init((unsigned long (*)(void))(omp_get_thread_num));
-#endif
-
-
-    /* initialize data structures for threads */
-    for (i=0; i<numberOfWorkgroups; i++)
-    {
-        myData.iter = iter;
-        myData.size = groups[i].size;
-        myData.test = test;
-        myData.numberOfThreads = groups[i].numberOfThreads;
-        myData.processors = (int*) malloc(myData.numberOfThreads * sizeof(int));
-        myData.streams = (void**) malloc(test->streams * sizeof(void*));
-
-        for (j=0; j<groups[i].numberOfThreads; j++)
-        {
-            myData.processors[j] = groups[i].processorIds[j];
-        }
-
-        for (j=0; j<  test->streams; j++)
-        {
-            myData.streams[j] = groups[i].streams[j].ptr;
-        }
-        threads_registerDataGroup(i, &myData, copyThreadData);
-
-        free(myData.processors);
-        free(myData.streams);
-    }
-
-    if (OUTSTREAM)
-    {
-        fprintf(OUTSTREAM, HLINE);
-        fprintf(OUTSTREAM, "LIKWID MICRO BENCHMARK\n");
-        fprintf(OUTSTREAM, "Test: %s\n",test->name);
-        fprintf(OUTSTREAM, HLINE);
-        fprintf(OUTSTREAM, "Using %d work groups\n",numberOfWorkgroups);
-        fprintf(OUTSTREAM, "Using %d threads\n",globalNumberOfThreads);
-        fprintf(OUTSTREAM, HLINE);
-        fflush(OUTSTREAM);
-    }
-
-    threads_create(runTest);
-    threads_join();
-    allocator_finalize();
-
-    uint32_t realSize = 0;
-    uint64_t realCycles = 0;
-    int current_id = 0;
-
-    if (OUTSTREAM)
-    {
-        fprintf(OUTSTREAM, HLINE);
-        for(j=0;j<numberOfWorkgroups;j++)
-        {
-            current_id = j*groups[j].numberOfThreads;
-            realCycles += threads_data[current_id].cycles;
-            realSize += groups[j].numberOfThreads * threads_data[current_id].data.size;
-        }
-        time = (double) realCycles / (double) timer_getCpuClock();
-        fprintf(OUTSTREAM, "Cycles: %llu \n", LLU_CAST realCycles);
-        fprintf(OUTSTREAM, "Iterations: %llu \n", LLU_CAST iter);
-        fprintf(OUTSTREAM, "Size %d \n",  realSize );
-        fprintf(OUTSTREAM, "Vectorlength: %llu \n", LLU_CAST threads_data[current_id].data.size);
-        fprintf(OUTSTREAM, "Time: %e sec\n", time);
-        fprintf(OUTSTREAM, "Number of Flops: %llu \n", LLU_CAST (iter * realSize *  test->flops));
-        fprintf(OUTSTREAM, "MFlops/s: %.2f\n",
-                1.0E-06 * ((double) iter * realSize *  test->flops/  time));
-        fprintf(OUTSTREAM, "MByte/s: %.2f\n",
-                1.0E-06 * ( (double) iter * realSize *  test->bytes/ time));
-        fprintf(OUTSTREAM, "Cycles per update: %f\n",
-                ((double) realCycles / (double) (iter * numberOfWorkgroups * threads_data[current_id].numberOfThreads *  threads_data[current_id].data.size)));
-
-        switch ( test->type )
-        {
-            case SINGLE:
-                fprintf(OUTSTREAM, "Cycles per cacheline: %f\n",
-                        (16.0 * (double) realCycles / (double) (iter * realSize)));
-                break;
-            case DOUBLE:
-                fprintf(OUTSTREAM, "Cycles per cacheline: %f\n",
-                        (8.0 * (double) realCycles / (double) (iter * realSize)));
-                break;
-        }
-
-        fprintf(OUTSTREAM, HLINE);
-        fflush(OUTSTREAM);
-    }
-    threads_destroy(numberOfWorkgroups);
-    barrier_destroy();
-    
-    affinity_finalize();
-#ifdef PERFMON
-    likwid_markerClose();
-#endif
-
-    return EXIT_SUCCESS;
-}
-
diff --git a/src/applications/likwid-features.c b/src/applications/likwid-features.c
deleted file mode 100644
index 6fe5477..0000000
--- a/src/applications/likwid-features.c
+++ /dev/null
@@ -1,191 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  likwid-features.c
- *
- *      Description:  An application to read out and set the feature flag
- *                  register on Intel Core 2 processors.
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <sched.h>
-#include <sys/types.h>
-#include <unistd.h>
-#include <ctype.h>
-
-#include <types.h>
-#include <error.h>
-#include <strUtil.h>
-#include <accessClient.h>
-#include <msr.h>
-#include <cpuid.h>
-#include <cpuFeatures.h>
-
-#define HELP_MSG \
-    fprintf(stdout, "\nlikwid-features --  Version  %d.%d \n\n",VERSION,RELEASE); \
-    fprintf(stdout, "A tool to print and toggle the feature flag msr on Intel CPUS.\n"); \
-    fprintf(stdout, "Supported Features: HW_PREFETCHER, CL_PREFETCHER, DCU_PREFETCHER, IP_PREFETCHER.\n\n"); \
-    fprintf(stdout, "Options:\n"); \
-    fprintf(stdout, "-h\t Help message\n"); \
-    fprintf(stdout, "-v\t Version information\n"); \
-    fprintf(stdout, "-s <FEATURE>\t set cpu feature \n"); \
-    fprintf(stdout, "-u <FEATURE>\t unset cpu feature \n"); \
-    fprintf(stdout, "-c <ID>\t core id\n\n"); \
-    fflush(stdout);
-
-#define VERSION_MSG \
-    fprintf(stdout, "likwid-features  %d.%d \n\n",VERSION,RELEASE); \
-    fflush(stdout);
-
-int main (int argc, char** argv)
-{ 
-    int socket_fd = -1;
-    int optSetFeature = 0;
-    int cpuId = 0;
-    int c;
-    bstring argString;
-    CpuFeature feature = HW_PREFETCHER ;
-
-    while ((c = getopt (argc, argv, "c:s:u:hv")) != -1)
-    {
-        switch (c)
-        {
-            case 'h':
-                HELP_MSG;
-                exit (EXIT_SUCCESS);
-            case 'v':
-                VERSION_MSG;
-                exit (EXIT_SUCCESS);
-            case 'u':
-                optSetFeature = 2;
-            case 's':
-                if (! (argString = bSecureInput(40,optarg)))
-                {
-                    fprintf(stderr,"Failed to read argument string!\n");
-                    exit(EXIT_FAILURE);
-                }
-
-                if (biseqcstr(argString,"HW_PREFETCHER"))
-                {
-                    feature = HW_PREFETCHER;
-                }
-                else if (biseqcstr(argString,"CL_PREFETCHER"))
-                {
-                    feature = CL_PREFETCHER;
-                }
-                else if (biseqcstr(argString,"DCU_PREFETCHER"))
-                {
-                    feature = DCU_PREFETCHER;
-                }
-                else if (biseqcstr(argString,"IP_PREFETCHER"))
-                {
-                    feature = IP_PREFETCHER;
-                }
-                else
-                {
-                    fprintf(stderr,"Feature not supported!\n");
-                    exit(EXIT_FAILURE);
-                }
-
-
-                if (!optSetFeature)
-                {
-                    optSetFeature = 1;
-                }
-                break;
-            case 'c':
-                if (! (argString = bSecureInput(20,optarg)))
-                {
-                    fprintf(stderr,"Failed to read argument string!\n");
-                    exit(EXIT_FAILURE);
-                }
-
-                cpuId = str2int((char*) argString->data);
-
-                break;
-            case '?':
-                if (isprint (optopt))
-                {
-                    fprintf (stderr, "Unknown option `-%c'.\n", optopt);
-                }
-                else
-                {
-                    fprintf (stderr,
-                            "Unknown option character `\\x%x'.\n",
-                            optopt);
-                }
-                return EXIT_FAILURE;
-            default:
-                HELP_MSG;
-                exit (EXIT_SUCCESS);
-        }
-    }
-
-    if (cpuid_init() == EXIT_FAILURE)
-    {
-        ERROR_PLAIN_PRINT(Unsupported processor!);
-    }
-
-    fprintf(stdout, HLINE);
-    fprintf(stdout, "CPU name:\t%s \n",cpuid_info.name);
-    fprintf(stdout, "CPU core id:\t%d \n", cpuId);
-    fflush(stdout);
-
-    if (cpuid_info.family != P6_FAMILY)
-    {
-        fprintf (stderr, "likwid-features only supports Intel P6 based processors!\n");
-        exit(EXIT_FAILURE);
-    }
-
-    if (cpuId >= (int) cpuid_topology.numHWThreads)
-    {
-        fprintf (stderr, "This processor has only %d HWthreads! \n",cpuid_topology.numHWThreads);
-        exit(EXIT_FAILURE);
-    }
-
-    accessClient_init(&socket_fd);
-    msr_init(socket_fd);
-    cpuFeatures_init(cpuId);
-    cpuFeatures_print(cpuId);
-
-    if (optSetFeature == 1)
-    {
-        fprintf(stdout, SLINE);
-        cpuFeatures_enable(cpuId, feature);
-        fprintf(stdout, SLINE);
-    }
-    else if (optSetFeature == 2)
-    {
-        fprintf(stdout, SLINE);
-        cpuFeatures_disable(cpuId, feature);
-        fprintf(stdout, SLINE);
-    }
-    fflush(stdout);
-
-    msr_finalize();
-    return EXIT_SUCCESS;
-}
-
diff --git a/src/applications/likwid-genCfg.c b/src/applications/likwid-genCfg.c
deleted file mode 100644
index 97147fd..0000000
--- a/src/applications/likwid-genCfg.c
+++ /dev/null
@@ -1,122 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  likwid-genCfg.c
- *
- *      Description:  An application to dump the cpu topology information to
- *      a config file.
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <sched.h>
-#include <sys/types.h>
-#include <unistd.h>
-#include <ctype.h>
-
-#include <types.h>
-#include <error.h>
-#include <cpuid.h>
-
-/* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
-
-#define HELP_MSG \
-    fprintf(stdout, "\nlikwid-genCfg --  Version  %d.%d \n\n",VERSION,RELEASE); \
-    fprintf(stdout, "A tool to dump node topology information into a file.\n"); \
-    fprintf(stdout, "Options:\n"); \
-    fprintf(stdout, "-h\t Help message\n"); \
-    fprintf(stdout, "-v\t Version information\n"); \
-    fprintf(stdout, "-o\t output file path (optional)\n\n"); \
-    fflush(stdout);
-
-#define VERSION_MSG \
-    fprintf(stdout, "likwid-genCfg  %d.%d \n\n",VERSION,RELEASE); \
-    fflush(stdout);
-
-
-int main (int argc, char** argv)
-{
-    FILE *file;
-    char *filepath = TOSTRING(CFGFILE);
-    size_t size;
-    int c;
-
-    while ((c = getopt (argc, argv, "ho:v")) != -1)
-    {
-        switch (c)
-        {
-            case 'h':
-                HELP_MSG;
-                exit (EXIT_SUCCESS);
-            case 'o':
-                filepath = optarg;
-                break;
-            case 'v':
-                VERSION_MSG;
-                exit (EXIT_SUCCESS);
-            case '?':
-                if (isprint (optopt))
-                {
-                    fprintf (stderr, "Unknown option `-%c'.\n", optopt);
-                }
-                else
-                {
-                    fprintf (stderr,
-                            "Unknown option character `\\x%x'.\n",
-                            optopt);
-                }
-                return EXIT_FAILURE;
-            default:
-                HELP_MSG;
-                exit (EXIT_SUCCESS);
-        }
-    }
-
-    cpuid_init();
-    fprintf(stdout, HLINE);
-    fprintf(stdout, "CPU name:\t%s \n",cpuid_info.name);
-    fflush(stdout);
-
-    if ((file = fopen(filepath, "wb")) != NULL) 
-    {
-        size = fwrite((void*) &cpuid_topology, sizeof(CpuTopology), 1, file);
-
-        size = fwrite((void*) cpuid_topology.threadPool,
-                sizeof(HWThread), cpuid_topology.numHWThreads, file);
-
-        size = fwrite((void*) cpuid_topology.cacheLevels,
-                sizeof(CacheLevel), cpuid_topology.numCacheLevels, file);
-
-        fclose(file);
-    }
-    else
-    {
-        fprintf(stderr,"Cfg file could not be written to %s\n", filepath);
-        ERROR;
-    }
-
-    return EXIT_SUCCESS;
-}
-
diff --git a/src/applications/likwid-genTopoCfg.lua b/src/applications/likwid-genTopoCfg.lua
new file mode 100644
index 0000000..d367077
--- /dev/null
+++ b/src/applications/likwid-genTopoCfg.lua
@@ -0,0 +1,140 @@
+#!<PREFIX>/bin/likwid-lua
+--[[
+ * =======================================================================================
+ *
+ *      Filename:  likwid-genTopoCfg.lua
+ *
+ *      Description:  An application to create a file of the underlying system configuration
+ *                    that is used by likwid to avoid reading the systems architecture at
+ *                    each start.
+ *
+ *      Version:   4.0
+ *      Released:  16.6.2015
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at gmail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+]]
+package.path = '<PREFIX>/share/lua/?.lua;' .. package.path
+
+local likwid = require("likwid")
+
+local filename = "<PREFIX>/etc/likwid_topo.cfg"
+
+function version()
+    print(string.format("likwid-genTopoCfg --  Version %d.%d",likwid.version,likwid.release))
+end
+
+function usage()
+    version()
+    print("A tool to store the system's architecture to a config file for LIKWID.\n")
+    print("Options:")
+    print("-h, --help\t\t Help message")
+    print("-v, --version\t\t Version information")
+    print("-o, --output <file>\t Use <file> instead of default "..filename)
+    print("\t\t\t Likwid searches at startup per default:")
+    print("\t\t\t /etc/likwid_topo.cfg and <PREFIX>/etc/likwid_topo.cfg")
+    print("\t\t\t Another location can be configured in the configuration file /etc/likwid.cfg,")
+    print("\t\t\t <PREFIX>/etc/likwid.cfg or the path defined at the build process of Likwid.")
+end
+
+for opt,arg in likwid.getopt(arg, {"h","v","help","version", "o:", "output:"}) do
+    if opt == "h" or opt == "help" then
+        usage()
+        os.exit(0)
+    elseif opt == "v" or opt == "version" then
+        version()
+        os.exit(0)
+    elseif opt == "o" or opt == "output" then
+        filename = arg
+    elseif opt == "?" then
+        print("Invalid commandline option -"..arg)
+        os.exit(1)
+    end
+end
+
+os.remove(filename) -- drop a stale config before querying; the file is (re)created below
+local cpuinfo = likwid.getCpuInfo()
+local cputopo = likwid.getCpuTopology()
+local numainfo = likwid.getNumaInfo()
+local affinity = likwid.getAffinityInfo()
+cpuinfo["clock"] = likwid.getCpuClock()
+-- Open only after os.remove(): unlinking an already-open file would discard everything written.
+local file = io.open(filename, "w")
+if file == nil then
+    print("Cannot open file "..filename.." for writing")
+    os.exit(1)
+end
+
+local threadPool_order = {"threadId", "coreId", "packageId", "apicId"}
+local cacheLevels_order = {"type", "associativity", "sets", "lineSize", "size", "threads", "inclusive"}
+
+for field, value in pairs(cpuinfo) do
+    file:write("cpuid_info " .. field .. " = " .. tostring(value).."\n")
+end
+
+for field, value in pairs(cputopo) do
+    if (field ~= "threadPool" and field ~= "cacheLevels" and field ~= "topologyTree") then
+        if field ~= "activeHWThreads" then
+            file:write("cpuid_topology " .. field .. " = " .. tostring(value).."\n")
+        end
+    elseif (field == "threadPool") then
+        --file:write("cpuid_topology threadPool count = "..tostring(likwid.tablelength(cputopo["threadPool"])).."\n")
+        for id, tab in pairs(cputopo["threadPool"]) do
+            str = "cpuid_topology threadPool "..tostring(id).." "
+            for k,v in pairs(threadPool_order) do
+                file:write(str..tostring(v).." = "..tostring(tab[v]).."\n")
+            end
+            
+        end
+    elseif (field == "cacheLevels") then
+        for id, tab in pairs(cputopo["cacheLevels"]) do
+            str = "cpuid_topology cacheLevels "..tostring(id).." "
+            for k,v in pairs(cacheLevels_order) do
+                file:write(str..tostring(v).." = "..tostring(tab[v]).."\n")
+            end
+            
+        end
+    end
+end
+
+file:write("numa_info numberOfNodes = "..tostring(numainfo["numberOfNodes"]).."\n")
+for field, value in pairs(numainfo["nodes"]) do
+    for id, tab in pairs(value) do
+        if id ~= "processors" and id ~= "distances" then
+            file:write("numa_info nodes "..tostring(field).." "..tostring(id).." = "..tostring(tab).."\n")
+        elseif id == "processors" then
+            for k,v in pairs(tab) do
+                -- One line per processor entry; no accumulator string is needed (it was never read).
+                file:write("numa_info nodes "..tostring(field).." "..tostring(id).." "..tostring(k).." = "..tostring(v).."\n")
+            end
+        elseif id == "distances" then
+            for k,v in pairs(tab) do
+                for k1,v1 in pairs(v) do
+                    file:write("numa_info nodes "..tostring(field).." "..tostring(id).." "..tostring(k1).." = "..tostring(v1).."\n")
+                end
+            end
+        end
+    end
+end
+
+file:close()
+likwid.putAffinityInfo()
+likwid.putNumaInfo()
+likwid.putTopology()
+
diff --git a/src/applications/likwid-memsweeper.c b/src/applications/likwid-memsweeper.c
deleted file mode 100644
index 4806763..0000000
--- a/src/applications/likwid-memsweeper.c
+++ /dev/null
@@ -1,138 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  likwid-memsweeper.c
- *
- *      Description:  An application to clean up NUMA memory domains.
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <sched.h>
-#include <sys/types.h>
-#include <unistd.h>
-#include <ctype.h>
-
-#include <types.h>
-#include <strUtil.h>
-#include <error.h>
-#include <cpuid.h>
-#include <numa.h>
-#include <affinity.h>
-#include <memsweep.h>
-
-/* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
-
-#define HELP_MSG \
-    fprintf(stdout, "\nlikwid-memsweeper --  Version  %d.%d \n\n",VERSION,RELEASE); \
-    fprintf(stdout, "A tool clean up NUMA memory domains and last level caches.\n"); \
-    fprintf(stdout, "Options:\n"); \
-    fprintf(stdout, "-h\t Help message\n"); \
-    fprintf(stdout, "-v\t Version information\n"); \
-    fprintf(stdout, "-q\t Silent without output\n"); \
-    fprintf(stdout, "-c\t Specify NUMA domain ID to clean up\n"); \
-    fprintf(stdout, "\t If no specific domain is set, all domains are swept.\n"); \
-    fprintf(stdout, "Usage:\n"); \
-    fprintf(stdout, "To clean specific domain: likwid-memsweeper -c 2 \n"); \
-    fflush(stdout);
-
-#define VERSION_MSG \
-    fprintf(stdout, "likwid-memsweeper  %d.%d \n\n",VERSION,RELEASE); \
-    fflush(stdout);
-
-
-int main (int argc, char** argv)
-{
-    int domainId = -1;
-    int c;
-    int optSilent = 0;
-    bstring argString;
-    FILE* OUTSTREAM = stdout;
-
-    while ((c = getopt (argc, argv, "+c:hvq")) != -1)
-    {
-        switch (c)
-        {
-            case 'h':
-                HELP_MSG;
-                exit (EXIT_SUCCESS);
-            case 'v':
-                VERSION_MSG;
-                exit (EXIT_SUCCESS);
-            case 'q':
-                optSilent = 1;
-                OUTSTREAM = NULL;
-                break;
-            case 'c':
-                if (! (argString = bSecureInput(10,optarg)))
-                {
-                    fprintf(stderr,"Failed to read argument string!\n");
-                    exit(EXIT_FAILURE);
-                }
-
-                domainId = str2int((char*) argString->data);
-
-                break;
-            case '?':
-                if (isprint (optopt))
-                {
-                    fprintf (stderr, "Unknown option `-%c'.\n", optopt);
-                }
-                else
-                {
-                    fprintf (stderr,
-                            "Unknown option character `\\x%x'.\n",
-                            optopt);
-                }
-                return EXIT_FAILURE;
-            default:
-                HELP_MSG;
-                exit (EXIT_SUCCESS);
-        }
-    }
-
-    if (cpuid_init() == EXIT_FAILURE)
-    {
-        ERROR_PLAIN_PRINT(Unsupported processor!);
-    }
-    numa_init();
-
-    if (domainId < 0) 
-    {
-        memsweep_node(OUTSTREAM);
-    }
-    else if (domainId < numa_info.numberOfNodes)
-    {
-        memsweep_domain(OUTSTREAM, domainId);
-    }
-    else
-    {
-        fprintf(stderr, "Unknown NUMA domain %d\n", domainId);
-        exit(EXIT_FAILURE);
-    }
-
-    return EXIT_SUCCESS;
-}
-
diff --git a/src/applications/likwid-memsweeper.lua b/src/applications/likwid-memsweeper.lua
new file mode 100644
index 0000000..cf6b80e
--- /dev/null
+++ b/src/applications/likwid-memsweeper.lua
@@ -0,0 +1,86 @@
+#!<PREFIX>/bin/likwid-lua
+--[[
+ * =======================================================================================
+ *
+ *      Filename:  likwid-memsweeper.lua
+ *
+ *      Description:  An application to clean up NUMA memory domains.
+ *
+ *      Version:   4.0
+ *      Released:  16.6.2015
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at gmail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+]]
+package.path = '<PREFIX>/share/lua/?.lua;' .. package.path
+local likwid = require("likwid")
+
+local function version()
+    print(string.format("likwid-memsweeper --  Version %d.%d",likwid.version,likwid.release))
+end
+
+local function examples()
+    print("Examples:")
+    print("To clean specific domain:")
+    print("likwid-memsweeper -c 2")
+    print("To clean a range of domains:")
+    print("likwid-memsweeper -c 1-2")
+    print("To clean specific domains:")
+    print("likwid-memsweeper -c 0,1-2")
+
+end
+
+local function usage()
+    version()
+    print("A tool clean up NUMA memory domains.\n")
+    print("Options:")
+    print("-h\t\t Help message")
+    print("-v\t\t Version information")
+    print("-c <list>\t Specify NUMA domain ID to clean up")
+    print("")
+    examples()
+end
+
+numainfo = likwid.getNumaInfo()
+nodes = {}
+for i,_ in pairs(numainfo["nodes"]) do
+    if tonumber(numainfo["nodes"][i]["id"]) ~= nil then
+        table.insert(nodes,numainfo["nodes"][i]["id"])
+    end
+end
+
+for opt,arg in likwid.getopt(arg, {"c:", "h", "v", "help", "version"}) do
+    if opt == "h" or opt == "help" then
+        usage()
+        os.exit(0)
+    elseif opt == "v" or opt == "version" then
+        version()
+        os.exit(0)
+    elseif (opt == "c") then
+        num_nodes, nodes = likwid.nodestr_to_nodelist(arg)
+    elseif opt == "?" then
+        print("Invalid commandline option -"..arg)
+        os.exit(1)
+    end
+end
+
+for i,socket in pairs(nodes) do
+    likwid.memSweepDomain(socket)
+end
+likwid.putNumaInfo()
diff --git a/src/applications/likwid-mpirun.lua b/src/applications/likwid-mpirun.lua
new file mode 100644
index 0000000..1a7057f
--- /dev/null
+++ b/src/applications/likwid-mpirun.lua
@@ -0,0 +1,1490 @@
+#!<PREFIX>/bin/likwid-lua
+--[[
+ * =======================================================================================
+ *
+ *      Filename:  likwid-mpirun.lua
+ *
+ *      Description: A wrapper script to pin threads spawned by MPI processes and 
+ *                   measure hardware performance counters
+ *
+ *      Version:   4.0
+ *      Released:  16.6.2015
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+]]
+package.path = '<PREFIX>/share/lua/?.lua;' .. package.path
+
+local likwid = require("likwid")
+
+local function version()
+    print(string.format("likwid-mpirun --  Version %d.%d",likwid.version,likwid.release))
+end
+
+local function examples()
+    print("Examples:")
+    print("Run 32 processes on hosts in hostlist")
+    print("likwid-mpirun -np 32 ./a.out")
+    print("")
+    print("Run 1 MPI process on each socket")
+    print("likwid-mpirun -nperdomain S:1 ./a.out")
+    print("Total amount of MPI processes is calculated using the number of hosts in the hostfile")
+    print("")
+    print("For hybrid MPI/OpenMP jobs you need to set the -pin option")
+    print("Starts 2 MPI processes on each host, one on socket 0 and one on socket 1")
+    print("Each MPI processes starts 2 OpenMP threads pinned to the same socket")
+    print("likwid-mpirun -pin S0:2_S1:2 ./a.out")
+    print("")
+    print("Run 2 processes on each socket and measure the MEM performance group")
+    print("likwid-mpirun -nperdomain S:2 -g MEM ./a.out")
+    print("Only one process on a socket measures the Uncore/RAPL counters, the other one(s) only core-local counters")
+    print("")
+end
+
+local function usage()
+    version()
+    print("A wrapper script to pin threads spawned by MPI processes and measure hardware performance counters.\n")
+    print("Options:")
+    print("-h, --help\t\t Help message")
+    print("-v, --version\t\t Version information")
+    print("-d, --debug\t\t Debugging output")
+    print("-n/-np <count>\t\t Set the number of processes")
+    print("-nperdomain <domain>\t Set the number of processes per node by giving an affinity domain and count")
+    print("-pin <list>\t\t Specify pinning of threads. CPU expressions like likwid-pin separated with '_'")
+    print("-s, --skip <hex>\t Bitmask with threads to skip")
+    print("-mpi <id>\t\t Specify which MPI should be used. Possible values: openmpi, intelmpi and mvapich2")
+    print("\t\t\t If not set, module system is checked")
+    print("-omp <id>\t\t Specify which OpenMP should be used. Possible values: gnu and intel")
+    print("\t\t\t Only required for statically linked executables.")
+    print("-hostfile\t\t Use custom hostfile instead of searching the environment")
+    print("-g/-group <perf>\t Set a likwid-perfctr conform event set for measuring on nodes")
+    print("-m/-marker\t\t Activate marker API mode")
+    print("")
+    print("Processes are pinned to physical CPU cores first.")
+    print("")
+    examples()
+end
+
+local np = 0
+local ppn = 0
+local nperdomain = nil
+local npernode = 0
+local cpuexprs = {}
+local perfexprs = {}
+local hostfile = nil
+local hosts = {}
+local perf = {}
+local mpitype = nil
+local omptype = nil
+local skipStr = ""
+local executable = {}
+local debug = false
+local use_marker = false
+local use_csv = false
+
+local LIKWID_PIN="<PREFIX>/bin/likwid-pin"
+local LIKWID_PERFCTR="<PREFIX>/bin/likwid-perfctr"
+local MPIINFO = {}
+local MPIROOT = os.getenv("MPIHOME")
+if MPIROOT == nil then
+    MPIROOT = os.getenv("MPI_ROOT")
+end
+if MPIROOT == nil then
+    print("Please load a MPI module or set path to MPI folder in MPIHOME environment variable")
+    print("$MPIHOME/bin/<MPI launcher> should be valid")
+    os.exit(1)
+end
+local MPIEXEC = { openmpi=MPIROOT.."/bin/mpiexec", intelmpi=MPIROOT.."/bin/mpiexec.hydra", mvapich2="mpirun"}
+
+
+local readHostfile = nil
+local writeHostfile = nil
+local getEnvironment = nil
+
+
+-- Parse a hostfile with lines of the form "host [slots=N] [max-slots=M]" into a list of
+-- {hostname, slots, maxslots} tables; a host listed several times is merged into one entry.
+local function readHostfileOpenMPI(filename)
+    local hostlist = {}
+    if filename == nil or filename == "" then
+        return {}
+    end
+    local f = io.open(filename, "r")
+    if f == nil then
+        print("ERROR: Cannot open hostfile "..filename)
+        os.exit(1)
+    end
+    local t = f:read("*all")
+    f:close()
+    for i, line in pairs(likwid.stringsplit(t,"\n")) do
+        if line:match("^#") == nil and line:match("^%s*$") == nil then
+            hostname, slots, maxslots = line:match("^([%.%a%d]+)%s+slots=(%d*)%s+max%-slots=(%d*)")
+            if not hostname then
+                hostname, slots = line:match("^([%.%a%d]+)%s+slots=(%d*)")
+                if not hostname then
+                    hostname = line:match("^([%.%a%d]+)")
+                    slots = nil
+                    maxslots = nil
+                end
+            end
+            local found = false
+            for i, host in pairs(hostlist) do
+                if host["hostname"] == hostname then
+                    if slots and host["slots"] then
+                        host["slots"] = host["slots"] + tonumber(slots)
+                    end
+                    if maxslots and host["maxslots"] then
+                        host["maxslots"] = host["maxslots"] + tonumber(maxslots)
+                    end
+                    found = true
+                    break
+                end
+            end
+            if not found then
+                table.insert(hostlist, {hostname=hostname, slots=tonumber(slots), maxslots=tonumber(maxslots)})
+            end
+        end
+    end
+    return hostlist
+end
+
+local function writeHostfileOpenMPI(hostlist, filename)
+    if filename == nil or filename == "" then
+        return
+    end
+
+    local f = io.open(filename, "w")
+    if f == nil then
+        print("ERROR: Cannot open hostfile "..filename)
+        os.exit(1)
+    end
+    for i, hostcontent in pairs(hostlist) do
+        str = hostcontent["hostname"]
+        if hostcontent["slots"] then
+            str = str .. string.format(" slots=%d", hostcontent["slots"])
+        end
+        if hostcontent["maxslots"] then
+            str = str .. string.format(" max-slots=%d", hostcontent["maxslots"])
+        end
+        f:write(str .. "\n")
+    end
+    f:close()
+end
+
+local function getEnvironmentOpenMPI()
+    return {}
+end
+
+local function executeOpenMPI(wrapperscript, hostfile, env, nrNodes)
+    local bindstr = ""
+    if wrapperscript:sub(1,1) ~= "/" then -- method call: tests the path's first character
+        wrapperscript = os.getenv("PWD").."/"..wrapperscript
+    end
+    -- Pick the "no binding" flag spelling that matches the version reported by the launcher.
+    local f = io.popen(string.format("%s -V", MPIINFO["openmpi"]["MPIEXEC"]), "r")
+    if f ~= nil then
+        local input = f:read("*a")
+        ver1,ver2,ver3 = input:match("(%d+)%.(%d+)%.(%d+)")
+        if ver1 == "1" then
+            if ver2 == "7" then
+                bindstr = "--bind-to none"
+            elseif ver2 == "6" then
+                bindstr = "--bind-to-none"
+            end
+        end
+        f:close()
+    end
+
+    local cmd = string.format("%s -hostfile %s %s -np %d -npernode %d %s",
+                                MPIINFO["openmpi"]["MPIEXEC"], hostfile, bindstr,
+                                np, ppn, wrapperscript)
+    if debug then
+        print("EXEC: "..cmd)
+    end
+    os.execute(cmd)
+end
+
+local function readHostfileIntelMPI(filename)
+    local hostlist = {}
+    if filename == nil or filename == "" then
+        return {}
+    end
+    local f = io.open(filename, "r")
+    if f == nil then
+        print("ERROR: Cannot open hostfile "..filename)
+        os.exit(1)
+    end
+    local t = f:read("*all")
+    f:close()
+    for i, line in pairs(likwid.stringsplit(t,"\n")) do
+        if line:match("^#") == nil and line:match("^%s*$") == nil then
+            hostname, slots = line:match("^([%.%a%d]+):(%d+)")
+            if not hostname then
+                hostname = line:match("^([%.%a%d]+)")
+                slots = nil
+            end
+            table.insert(hostlist, {hostname=hostname, slots=slots})
+        end
+    end
+    return hostlist
+end
+
+local function writeHostfileIntelMPI(hostlist, filename)
+    if filename == nil or filename == "" then
+        return
+    end
+
+    local f = io.open(filename, "w")
+    if f == nil then
+        print("ERROR: Cannot open hostfile "..filename)
+        os.exit(1)
+    end
+    for i, hostcontent in pairs(hostlist) do
+        str = hostcontent["hostname"]
+        if hostcontent["slots"] then
+            str = str .. string.format(":%d", hostcontent["slots"])
+        end
+        f:write(str .. "\n")
+    end
+    f:close()
+end
+
+local function getEnvironmentIntelMPI()
+    local env = {}
+    env['I_MPI_PIN']='off'
+    env['KMP_AFFINITY']='disabled'
+    return env
+end
+
+local function executeIntelMPI(wrapperscript, hostfile, env, nrNodes)
+    if wrapperscript:sub(1,1) ~= "/" then -- method call: tests the path's first character
+        wrapperscript = os.getenv("PWD").."/"..wrapperscript
+    end
+    if hostfile:sub(1,1) ~= "/" then
+        hostfile = os.getenv("PWD").."/"..hostfile
+    end
+    -- Sequence: mpdboot, mpiexec, mpdallexit (the three commands are echoed first when debugging).
+    if debug then
+        print(string.format("EXEC: %s/bin/mpdboot -r ssh -n %d -f %s", MPIROOT, nrNodes, hostfile))
+        print(string.format("EXEC: %s/bin/mpiexec -perhost %d -env I_MPI_PIN 0 -np %d %s", MPIROOT, ppn, np, wrapperscript))
+        print(string.format("EXEC: %s/bin/mpdallexit", MPIROOT))
+    end
+
+    os.execute(string.format("%s/bin/mpdboot -r ssh -n %d -f %s", MPIROOT, nrNodes, hostfile))
+    os.execute(string.format("%s/bin/mpiexec -perhost %d -env I_MPI_PIN 0 -np %d %s", MPIROOT, ppn, np, wrapperscript))
+    os.execute(string.format("%s/bin/mpdallexit", MPIROOT))
+end
+
+local function readHostfileMvapich2(filename)
+    local hostlist = {}
+    if filename == nil or filename == "" then
+        return {}
+    end
+    local f = io.open(filename, "r")
+    if f == nil then
+        print("ERROR: Cannot open hostfile "..filename)
+        os.exit(1)
+    end
+    local t = f:read("*all")
+    f:close()
+    for i, line in pairs(likwid.stringsplit(t,"\n")) do
+        if line:match("^#") == nil and line:match("^%s*$") == nil then
+            hostname, slots, interface = line:match("^([%.%a%d]+):(%d+):([%a%d]+)")
+            if not hostname then
+                hostname, slots = line:match("^([%.%a%d]+):(%d+)")
+                if not hostname then
+                    hostname = line:match("^([%.%a%d]+)")
+                    slots = 1
+                    interface = nil
+                else
+                    interface = nil
+                end
+            end
+            table.insert(hostlist, {hostname=hostname, slots=slots, interface=interface})
+        end
+    end
+    return hostlist
+end
+
+local function writeHostfileMvapich2(hostlist, filename)
+    if filename == nil or filename == "" then
+        return
+    end
+
+    local f = io.open(filename, "w")
+    if f == nil then
+        print("ERROR: Cannot open hostfile "..filename)
+        os.exit(1)
+    end
+    for i, hostcontent in pairs(hostlist) do
+        str = hostcontent["hostname"]
+        if hostcontent["slots"] then
+            str = str .. string.format(":%d", hostcontent["slots"])
+        end
+        if hostcontent["interface"] then
+            str = str .. string.format(":%s", hostcontent["interface"])
+        end
+        f:write(str .. "\n")
+    end
+    f:close()
+end
+
+local function getEnvironmentMvapich2()
+    local env = {}
+    env['MV2_ENABLE_AFFINITY'] = "0"
+    return env
+end
+
+local function executeMvapich2(wrapperscript, hostfile, env, nrNodes)
+    if wrapperscript:sub(1,1) ~= "/" then -- method call: tests the path's first character
+        wrapperscript = os.getenv("PWD").."/"..wrapperscript
+    end
+    if hostfile:sub(1,1) ~= "/" then
+        hostfile = os.getenv("PWD").."/"..hostfile
+    end
+    local cmd = string.format("%s -f %s -np %d -ppn %d %s",
+                                MPIINFO["mvapich2"]["MPIEXEC"], hostfile,
+                                np, ppn, wrapperscript)
+    if debug then
+        print("EXEC: "..cmd)
+    end
+    os.execute(cmd)
+end
+
+MPIINFO =      { openmpi={ MPIEXEC=MPIROOT.."/bin/mpiexec",
+                            readHostfile = readHostfileOpenMPI,
+                            writeHostfile = writeHostfileOpenMPI,
+                            getEnvironment = getEnvironmentOpenMPI,
+                            executeCommand = executeOpenMPI},
+                  intelmpi={MPIEXEC=MPIROOT.."/bin/mpiexec",
+                            readHostfile = readHostfileIntelMPI,
+                            writeHostfile = writeHostfileIntelMPI,
+                            getEnvironment = getEnvironmentIntelMPI,
+                            executeCommand = executeIntelMPI},
+                  mvapich2={MPIEXEC=MPIROOT.."/bin/mpiexec.hydra",
+                            readHostfile = readHostfileMvapich2,
+                            writeHostfile = writeHostfileMvapich2,
+                            getEnvironment = getEnvironmentMvapich2,
+                            executeCommand = executeMvapich2}
+                }
+
+-- Read a PBS-style node file in which every line names one host and a host
+-- that appears N times offers N slots.
+--   filename: path of the node file; nil or "" yields an empty list.
+-- Returns an array of {hostname=..., slots=N, maxslots=N} tables in order of
+-- first appearance. Exits the script if the file cannot be opened.
+local function readHostfilePBS(filename)
+    local hostlist = {}
+    if filename == nil or filename == "" then
+        return {}
+    end
+    local f = io.open(filename, "r")
+    if f == nil then
+        print("ERROR: Cannot open hostfile "..filename)
+        os.exit(1)
+    end
+    local t = f:read("*all")
+    f:close()
+    for _, line in pairs(likwid.stringsplit(t,"\n")) do
+        -- Skip comment lines and blank lines.
+        if line:match("^#") == nil and line:match("^%s*$") == nil then
+            -- Hostnames may contain '-' (and '_' on some sites); without them
+            -- in the class "node-01" and "node-02" would both become "node".
+            local hostname = line:match("^([%.%a%d_%-]+)")
+            -- Lines that do not start with a hostname are ignored instead of
+            -- producing an entry without a name.
+            if hostname ~= nil then
+                local found = false
+                for _, host in pairs(hostlist) do
+                    if host["hostname"] == hostname then
+                        host["slots"] = host["slots"] + 1
+                        host["maxslots"] = host["slots"]
+                        found = true
+                        break
+                    end
+                end
+                if not found then
+                    table.insert(hostlist, {hostname=hostname, slots=1, maxslots=1})
+                end
+            end
+        end
+    end
+    return hostlist
+end
+
+-- Count the distinct hostnames in hostlist.
+--   hostlist: array of tables with a "hostname" field.
+-- Returns the number of entries whose hostname did not occur at a lower index.
+local function getNumberOfNodes(hostlist)
+    local n = 0
+    for i, _ in pairs(hostlist) do
+        -- Only entries before i are inspected, so every hostname is counted
+        -- exactly once (at its first occurrence).
+        local exists = false
+        for j=1,i-1 do
+            if hostlist[i]["hostname"] == hostlist[j]["hostname"] then
+                exists = true
+                break
+            end
+        end
+        if not exists then
+            n = n + 1
+        end
+    end
+    return n
+end
+
+-- Detect the loaded MPI implementation from the environment-modules list.
+-- Returns "intelmpi", "openmpi" or "mvapich2" (the last matching module wins)
+-- or nil, after printing a warning, when no supported module is listed.
+local function getMpiType()
+    local mpitype = nil
+    local cmd = "tclsh /apps/modules/modulecmd.tcl sh list -t 2>&1"
+    local f = io.popen(cmd, 'r')
+    if f == nil then
+        -- Fall back to the shell's own module command. The "list" subcommand
+        -- is required; "module -t" alone does not print the loaded modules.
+        cmd = (os.getenv("SHELL") or "/bin/sh").." -c 'module -t list 2>&1'"
+        f = io.popen(cmd, 'r')
+    end
+    if f ~= nil then
+        local s = assert(f:read('*a'))
+        f:close()
+        s = string.gsub(s, '^%s+', '')
+        s = string.gsub(s, '%s+$', '')
+        for _, line in pairs(likwid.stringsplit(s, "\n")) do
+            if line:match("^intelmpi") then
+                mpitype = "intelmpi"
+                --libmpi%a*.so
+            elseif line:match("^openmpi") then
+                mpitype = "openmpi"
+                --libmpi.so
+            elseif line:match("^mvapich2") then
+                mpitype = "mvapich2"
+                --libmpich.so
+            end
+        end
+    end
+    if not mpitype then
+        print("WARN: No supported MPI loaded in module system")
+    end
+    return mpitype
+end
+
+-- Detect the OpenMP runtime used by the executable given on the command line.
+-- First inspects the `ldd` output of executable[1] for libgomp / libiomp and,
+-- if that is inconclusive, looks for "intel"/"gnu" modules in the module list.
+-- Returns "gnu", "intel" or nil (after printing warnings).
+local function getOmpType()
+    -- Kept local: the caller assigns the return value itself.
+    local omptype = nil
+    local cmd = string.format("ldd `which %s`", executable[1])
+    local f = assert(io.popen(cmd, 'r'))
+    local s = assert(f:read('*a'))
+    f:close()
+    for _, line in pairs(likwid.stringsplit(s, "\n")) do
+        -- '.' is escaped so that it only matches a literal dot.
+        if line:match("libgomp%.so") then
+            omptype = "gnu"
+            break
+        elseif line:match("libiomp%d*%.so") then
+            omptype = "intel"
+            break
+        end
+    end
+    if not omptype then
+        print("WARN: Cannot get OpenMP variant from executable, trying module system")
+        cmd = "tclsh /apps/modules/modulecmd.tcl sh list -t 2>&1"
+        local mf = io.popen(cmd, 'r')
+        -- NOTE(review): the tclsh pipe is only probed and closed here; the
+        -- module list is then read through the user's shell. getMpiType uses
+        -- the opposite condition (f == nil) - confirm which one is intended.
+        if mf ~= nil then
+            mf:close()
+            cmd = (os.getenv("SHELL") or "/bin/sh").." -c 'module -t list 2>&1'"
+            mf = io.popen(cmd, 'r')
+        end
+        if mf ~= nil then
+            local out = assert(mf:read('*a'))
+            mf:close()
+            out = string.gsub(out, '^%s+', '')
+            out = string.gsub(out, '%s+$', '')
+            for _, line in pairs(likwid.stringsplit(out, "\n")) do
+                if line:match("^intel") then
+                    omptype = "intel"
+                elseif line:match("^gnu") then
+                    omptype = "gnu"
+                end
+            end
+        end
+        if not omptype then
+            print("WARN: No supported OpenMP loaded in module system")
+        end
+    end
+    return omptype
+end
+
+-- Distribute np processes over the given hosts with (up to) ppn processes per
+-- host, cycling over the host list until all processes are placed.
+--   hosts: array of {hostname, slots, maxslots, interface}; entries whose
+--          maxslots limit is reached are removed from this table.
+--   np:    total number of processes to place.
+--   ppn:   requested processes per host.
+-- Returns the list of hosts to use (one entry per hostname, slots summed) and
+-- the resulting maximum number of processes per host.
+local function assignHosts(hosts, np, ppn)
+    local tmp = np
+    local newhosts = {}
+    local current = 0
+    local break_while = false
+    while tmp > 0 do
+        -- Hosts are dropped once their max-slots limit is hit; without this
+        -- guard an exhausted host list would loop forever.
+        if next(hosts) == nil then
+            print(string.format("WARN: No hosts left to place the remaining %d processes.", tmp))
+            break
+        end
+        for i, host in pairs(hosts) do
+            if host["slots"] and host["slots"] >= ppn then
+                if host["maxslots"] and host["maxslots"] < ppn then
+                    table.insert(newhosts, {hostname=host["hostname"],
+                                            slots=host["maxslots"],
+                                            maxslots=host["maxslots"],
+                                            interface=host["interface"]})
+                    current = host["maxslots"]
+                    hosts[i] = nil
+                else
+                    table.insert(newhosts, {hostname=host["hostname"],
+                                            slots=ppn,
+                                            maxslots=host["slots"],
+                                            interface=host["interface"]})
+                    current = ppn
+                end
+            elseif host["slots"] then
+                if host["maxslots"] then
+                    if host["maxslots"] < ppn then
+                        print(string.format("WARN: Oversubscription for host %s needed, but max-slots set to %d.",
+                                                host["hostname"], host["maxslots"]))
+                        table.insert(newhosts, {hostname=host["hostname"],
+                                                slots=host["maxslots"],
+                                                maxslots=host["maxslots"],
+                                                interface=host["interface"]})
+                        current = host["maxslots"]
+                        hosts[i] = nil
+                    else
+                        print(string.format("WARN: Oversubscription for host %s.", host["hostname"]))
+                        table.insert(newhosts, {hostname=host["hostname"],
+                                            slots=ppn,
+                                            maxslots=host["maxslots"],
+                                            interface=host["interface"]})
+                        current = ppn
+                    end
+                else
+                    print(string.format("WARN: Oversubscription for host %s.", host["hostname"]))
+                    table.insert(newhosts, {hostname=host["hostname"],
+                                        slots=ppn,
+                                        maxslots=host["slots"],
+                                        interface=host["interface"]})
+                    current = ppn
+                end
+            else
+                table.insert(newhosts, {hostname=host["hostname"],
+                                        slots=ppn,
+                                        maxslots=host["slots"],
+                                        interface=host["interface"]})
+                current = ppn
+            end
+            tmp = tmp - current
+            -- Stop only when every process is placed; stopping at one
+            -- remaining process would silently drop the last rank.
+            if tmp <= 0 then
+                break_while = true
+                break
+            elseif tmp < ppn then
+                ppn = tmp
+            end
+        end
+        if break_while then
+            break
+        end
+    end
+    -- Merge entries of the same host. The inner loop runs backwards so that
+    -- table.remove never shifts an entry that still has to be inspected.
+    local i = 1
+    while i <= #newhosts do
+        for j = #newhosts, i+1, -1 do
+            if newhosts[i]["hostname"] == newhosts[j]["hostname"] then
+                newhosts[i]["slots"] = newhosts[i]["slots"] + newhosts[j]["slots"]
+                if newhosts[i]["maxslots"] ~= nil and newhosts[j]["maxslots"] ~= nil then
+                    newhosts[i]["maxslots"] = newhosts[i]["maxslots"] + newhosts[j]["maxslots"]
+                end
+                if newhosts[i]["slots"] > ppn then
+                    ppn = newhosts[i]["slots"]
+                end
+                table.remove(newhosts, j)
+            end
+        end
+        i = i + 1
+    end
+    if debug then
+        print("DEBUG: Scheduling on hosts:")
+        for _, h in pairs(newhosts) do
+            local str
+            if h["maxslots"] ~= nil then
+                str = string.format("DEBUG: Host %s with %d processes (max. %d processes)",
+                                h["hostname"],h["slots"],h["maxslots"])
+            else
+                str = string.format("DEBUG: Host %s with %d processes", h["hostname"],h["slots"])
+            end
+            if h["interface"] then
+                str = str.. string.format(" using interface %s", h["interface"])
+            end
+            print(str)
+        end
+    end
+    return newhosts, ppn
+end
+
+-- Resolve every CPU expression to an explicit comma-separated CPU list.
+--   cpuexprs: table of CPU expression strings understood by
+--             likwid.cpustr_to_cpulist.
+-- Returns an array with one "c0,c1,..." string per input expression.
+local function calculatePinExpr(cpuexprs)
+    local newexprs = {}
+    for _, expr in pairs(cpuexprs) do
+        local strList = {}
+        -- The first return value (the CPU count) is not needed here.
+        local _, list = likwid.cpustr_to_cpulist(expr)
+        for _, c in pairs(list) do
+            table.insert(strList, c)
+        end
+        table.insert(newexprs, table.concat(strList,","))
+    end
+    return newexprs
+end
+
+-- Translate an -nperdomain string ("<domain>:<count>", optionally prefixed
+-- with "E:") into one CPU list per matching affinity domain.
+--   nperdomain: the domain/count string.
+--   cpuexprs:   unused by this function.
+-- Returns an array of comma-separated CPU lists, one per matching domain,
+-- each holding the first <count> CPUs of the domain in the order built below
+-- (every numThreadsPerCore-th entry of the processor list first).
+local function calculateCpuExprs(nperdomain, cpuexprs)
+    local topo = likwid.getCpuTopology()
+    local affinity = likwid.getAffinityInfo()
+    local domainlist = {}
+    local newexprs = {}
+    local domainname, count = nperdomain:match("[E:]*(%g*):(%d+)")
+    if domainname == nil then
+        print("ERROR: Cannot parse nperdomain string "..nperdomain)
+        return newexprs
+    end
+    -- The pattern captures a string; use a number as loop bound.
+    count = tonumber(count)
+
+    for i, domain in pairs(affinity["domains"]) do
+        if domain["tag"]:match(domainname.."%d*") then
+            table.insert(domainlist, i)
+        end
+    end
+    if debug then
+        local str = "DEBUG: NperDomain string "..nperdomain.." covers the domains: "
+        for _, idx in pairs(domainlist) do
+            str = str .. affinity["domains"][idx]["tag"] .. " "
+        end
+        print(str)
+    end
+
+    local threads = topo["numThreadsPerCore"]
+    for _, domidx in pairs(domainlist) do
+        local domain = affinity["domains"][domidx]
+        local percore = math.floor(domain["numberOfProcessors"]/threads)
+        local sortedlist = {}
+        for off=1,threads do
+            -- Indices 0..percore-1 cover the processor list exactly; the
+            -- previous bound went one entry past the end.
+            for c=0,percore-1 do
+                table.insert(sortedlist, domain["processorList"][off + (c*threads)])
+            end
+        end
+        local tmplist = {}
+        for j=1,count do
+            table.insert(tmplist, sortedlist[j])
+        end
+        table.insert(newexprs, table.concat(tmplist,","))
+    end
+    if debug then
+        local str = "DEBUG: Resolved NperDomain string "..nperdomain.." to CPUs: "
+        for _, expr in pairs(newexprs) do
+            str = str .. expr .. " "
+        end
+        print(str)
+    end
+    return newexprs
+end
+
+-- Build an "EVENT:COUNTER,EVENT:COUNTER,..." string from an event list.
+--   eventlist: array of tables with "Event" and "Counter" fields.
+-- Returns the joined string; an empty list yields "" instead of raising an
+-- error on the missing first element.
+local function createEventString(eventlist)
+    local parts = {}
+    for i=1,#eventlist do
+        parts[i] = eventlist[i]["Event"]..":"..eventlist[i]["Counter"]
+    end
+    return table.concat(parts, ",")
+end
+
+-- Build, for every performance group in perflist, the event strings that the
+-- individual processes on a node measure.
+--   perflist: table of group strings passed to likwid.get_groupdata.
+--   cpuexprs: table of comma-separated CPU lists (one per process).
+-- Returns perfexprs (perfexprs[k] = list of event strings for perflist[k])
+-- and grouplist (the group data of all groups that could be resolved).
+local function setPerfStrings(perflist, cpuexprs)
+    local uncore = false
+    local perfexprs = {}
+    local grouplist = {}
+    local cpuinfo = likwid.getCpuInfo()
+    local affinity = likwid.getAffinityInfo()
+    local socketList = {}
+    local socketListFlags = {}
+    -- Collect the CPU list of every socket domain (tag matching "S<n>") and
+    -- one flag per socket; 1 means the socket's uncore events are unassigned.
+    for i, d in pairs(affinity["domains"]) do
+        if d["tag"]:match("S%d+") then
+            local tmpList = {}
+            for j,cpu in pairs(d["processorList"]) do
+                table.insert(tmpList, cpu)
+            end
+            table.insert(socketList, tmpList)
+            table.insert(socketListFlags, 1)
+        end
+    end
+
+    for k, perfStr in pairs(perflist) do
+        local coreevents = {}
+        local uncoreevents = {}
+        local gdata = nil
+        gdata = likwid.get_groupdata(perfStr)
+        if gdata == nil then
+            print("Cannot get data for group "..perfStr..". Skipping...")
+        else
+            -- NOTE(review): grouplist is filled densely while perfexprs is
+            -- indexed by k, so the two differ in indexing after a skipped group.
+            table.insert(grouplist, gdata)
+            if perfexprs[k] == nil then
+                perfexprs[k] = {}
+            end
+
+            -- Counters named FIXC<n>, PMC<n> or TMP<n> are treated as core
+            -- events; every other counter is treated as an uncore event.
+            for i, e in pairs(gdata["Events"]) do
+                if  not e["Counter"]:match("FIXC%d") and
+                    not e["Counter"]:match("^PMC%d") and
+                    not e["Counter"]:match("TMP%d") then
+                    table.insert(uncoreevents, e)
+                else
+                    table.insert(coreevents, e)
+                end
+            end
+            
+            -- Work on a per-group copy of the socket flags.
+            local tmpSocketFlags = {}
+            for _,e in pairs(socketListFlags) do
+                table.insert(tmpSocketFlags, e)
+            end
+
+            -- For each CPU of each process: when the CPU's socket still has
+            -- its flag set, append core+uncore events, clear the flag and
+            -- stop scanning this process; otherwise append core events only.
+            -- NOTE(review): the core-only branch does not stop the scan, so a
+            -- process with several CPUs on an already claimed socket appears
+            -- to add one entry per CPU - confirm this is intended.
+            for i,cpuexpr in pairs(cpuexprs) do
+                for j, cpu in pairs(likwid.stringsplit(cpuexpr,",")) do
+                    local uncore = false
+                    for sidx, socket in pairs(socketList) do
+                        local switchedFlag = false
+                        for _,c in pairs(socket) do
+                            if c == tonumber(cpu) then
+                                if tmpSocketFlags[sidx] == 1 then
+                                    local eventStr = createEventString(coreevents)
+                                    if #uncoreevents > 0 then
+                                        eventStr = eventStr .. ","..createEventString(uncoreevents)
+                                    end
+                                    table.insert(perfexprs[k], eventStr)
+                                    tmpSocketFlags[sidx] = 0
+                                    switchedFlag = true
+                                    uncore = true
+                                    break
+                                else
+                                    table.insert(perfexprs[k], createEventString(coreevents))
+                                end
+                            end
+                        end
+                        if switchedFlag then break end
+                    end
+                    if uncore then break end
+                end
+            end
+
+            if debug then
+                for i, expr in pairs(perfexprs[k]) do
+                    print(string.format("DEBUG: Process %d measures with event set: %s", i-1, expr))
+                end
+            end
+        end
+    end
+    return perfexprs, grouplist
+end
+
+
+-- Write the bash wrapper script that every MPI rank executes. The script
+-- derives the rank's node-local index and starts the matching likwid-pin or
+-- likwid-perfctr command line.
+--   scriptname: path of the script to create; nil or "" makes this a no-op.
+--   execStr:    application command line appended to each command.
+--   hosts:      host list (only summed into tmphosts here).
+--   outputname: likwid-perfctr output file; made absolute via $PWD if relative.
+-- Reads the script-level settings mpitype, np, ppn, cpuexprs, perf,
+-- perfexprs, skipStr, use_marker, debug, LIKWID_PIN and LIKWID_PERFCTR.
+-- NOTE(review): tmphosts and the *_var names stay globals as in the original
+-- because the rest of the script is not visible from here.
+local function writeWrapperScript(scriptname, execStr, hosts, outputname)
+    if scriptname == nil or scriptname == "" then
+        return
+    end
+    local commands = {}
+    tmphosts = {}
+    for _, host in pairs(hosts) do
+        if tmphosts[host["hostname"]] ~= nil then
+            tmphosts[host["hostname"]] = tmphosts[host["hostname"]] + host["slots"]
+        else
+            tmphosts[host["hostname"]] = host["slots"]
+        end
+    end
+
+    if mpitype == "openmpi" then
+        glsize_var = "$OMPI_COMM_WORLD_SIZE"
+        glrank_var = "${OMPI_COMM_WORLD_RANK:-$(($GLOBALSIZE * 2))}"
+        losize_var = "$OMPI_COMM_WORLD_LOCAL_SIZE"
+    elseif mpitype == "intelmpi" then
+        glrank_var = "${PMI_RANK:-$(($GLOBALSIZE * 2))}"
+        glsize_var = tostring(np)
+        losize_var = tostring(ppn)
+    elseif mpitype == "mvapich2" then
+        glrank_var = "${PMI_RANK:-$(($GLOBALSIZE * 2))}"
+        glsize_var = tostring(np)
+        losize_var = tostring(ppn)
+    else
+        -- tostring() keeps this message from failing when mpitype is nil.
+        print("Invalid MPI vendor "..tostring(mpitype))
+        return
+    end
+
+    local f = io.open(scriptname, "w")
+    if f == nil then
+        print("ERROR: Cannot open wrapper script "..scriptname)
+        os.exit(1)
+    end
+
+    if outputname:sub(1,1) ~= "/" then
+        outputname = (os.getenv("PWD") or ".").."/"..outputname
+    end
+
+    -- One command line per node-local rank / CPU expression.
+    for i=1,#cpuexprs do
+        local cmd = {}
+        local cpuexpr_opt = "-c"
+        if #perf > 0 then
+            table.insert(cmd,LIKWID_PERFCTR)
+            if use_marker then
+                table.insert(cmd,"-m")
+            end
+            cpuexpr_opt = "-C"
+        else
+            table.insert(cmd,LIKWID_PIN)
+            table.insert(cmd,"-q")
+        end
+        table.insert(cmd,skipStr)
+        table.insert(cmd,cpuexpr_opt)
+        table.insert(cmd,cpuexprs[i])
+        if #perf > 0 then
+            for _, expr in pairs(perfexprs) do
+                table.insert(cmd,"-g")
+                table.insert(cmd,expr[i])
+            end
+            table.insert(cmd,"-o")
+            table.insert(cmd,outputname)
+        end
+        table.insert(cmd,execStr)
+        commands[i] = table.concat(cmd, " ")
+    end
+
+    f:write("#!/bin/bash\n")
+    f:write("GLOBALSIZE="..glsize_var.."\n")
+    f:write("GLOBALRANK="..glrank_var.."\n")
+    f:write("unset OMP_NUM_THREADS\n")
+    if mpitype == "intelmpi" then
+        f:write("export I_MPI_PIN=disable\n")
+    end
+    f:write("LOCALSIZE="..losize_var.."\n\n")
+
+    if mpitype == "openmpi" then
+        f:write("LOCALRANK=$OMPI_COMM_WORLD_LOCAL_RANK\n\n")
+    else
+        -- Ranks below 'full' sit on completely filled hosts; the remaining
+        -- ranks form the shorter tail.
+        -- NOTE(review): the tail formula yields 1 for the first tail rank -
+        -- confirm that this offset is intended.
+        local full = tostring(np - (np % ppn))
+        f:write("if [ \"$GLOBALRANK\" -lt "..full.." ]; then\n")
+        f:write("\tLOCALRANK=$(($GLOBALRANK % $LOCALSIZE))\n")
+        f:write("else\n")
+        f:write("\tLOCALRANK=$(($GLOBALRANK - ("..full.." - 1)))\n")
+        f:write("fi\n\n")
+    end
+
+    -- Try to load the likwid module on the node if the tool is not in PATH.
+    if #perf > 0 then
+        f:write("which `basename "..LIKWID_PERFCTR.."` 1>/dev/null 2>&1\n")
+    else
+        f:write("which `basename "..LIKWID_PIN.."` 1>/dev/null 2>&1\n")
+    end
+    f:write("if [ $? -eq 1 ]; then\n")
+    f:write("\tmodule load likwid 1>/dev/null 2>&1\n")
+    f:write("fi\n\n")
+
+    f:write("if [ \"$LOCALRANK\" -eq 0 ]; then\n")
+    if debug then
+        print("NODE_EXEC: "..commands[1])
+    end
+    f:write("\t"..commands[1].."\n")
+
+    for i=2,#commands do
+        f:write("elif [ \"$LOCALRANK\" -eq "..tostring(i-1).." ]; then\n")
+        if debug then
+            print("NODE_EXEC: "..commands[i])
+        end
+        f:write("\t"..commands[i].."\n")
+    end
+    f:write("else\n")
+    f:write("\techo \"Unknown local rank $LOCALRANK\"\n")
+    f:write("fi\n")
+
+    f:close()
+    os.execute("chmod +x "..scriptname)
+end
+
+
+-- List the regular files below dir whose name contains infilepart.
+--   dir:        directory handed to find(1).
+--   infilepart: substring used in the -name "*<infilepart>*" filter.
+-- Returns a sorted array of paths; empty if find could not be started.
+local function listdir(dir, infilepart)
+    local outlist = {}
+    local p = io.popen("find "..dir.." -type f -name \"*"..infilepart.."*\"")
+    if p == nil then
+        return outlist
+    end
+    for file in p:lines() do
+        table.insert(outlist, file)
+    end
+    p:close()
+    table.sort(outlist)
+    return outlist
+end
+
+
+-- Parse one likwid-perfctr CSV output file written by a rank.
+--   filename: path matching "output_<id>_<rank>_<host>.csv".
+-- Returns host, rank (number), results and cpulist where
+--   results[g][e][cpu] is the value of event e of group g on that CPU,
+--   results[g]["time"][cpu] holds the values of "[Rr]untime" lines and
+--   results["clock"] is the CPU clock in Hz.
+-- Exits the script if the file cannot be opened.
+local function parseOutputFile(filename)
+    local rank = 0
+    local host = nil
+    local cpulist = {}
+    local eventlist = {}
+    local counterlist = {}
+    local idx = 1
+    local gidx = 0
+    local results = {}
+    local f = io.open(filename, "r")
+    if f == nil then
+        print("ERROR: Cannot open output file "..filename)
+        os.exit(1)
+    end
+    rank, host = filename:match("output_%d+_(%d+)_(%g+).csv")
+
+    local t = f:read("*all")
+    f:close()
+    for _, line in pairs(likwid.stringsplit(t, "\n")) do
+        if (not line:match("^-")) and
+           (not line:match("^CPU type:")) and
+           (not line:match("^CPU name:")) and
+           (not line:match("^TABLE")) and
+           (not line:match("^STRUCT")) and
+           (not line:match("^%s*$")) and
+           (not line:match("STAT")) then
+            if line:match("^Event") and not line:match("Sum,Min,Max,Avg") then
+                -- Header line of a new group: the third and following
+                -- columns name the measured cores.
+                local linelist = likwid.stringsplit(line,",")
+                table.remove(linelist,1)
+                table.remove(linelist,1)
+                for _, cpustr in pairs(linelist) do
+                    local core = tonumber(cpustr:match("Core (%d+)"))
+                    if core ~= nil then
+                        -- Each core is recorded once, even though every
+                        -- group repeats the header.
+                        local known = false
+                        for _, cpu in pairs(cpulist) do
+                            if tonumber(cpu) == core then
+                                known = true
+                                break
+                            end
+                        end
+                        if not known then
+                            table.insert(cpulist, core)
+                        end
+                    end
+                end
+                gidx = gidx + 1
+                idx = 1
+                if results[gidx] == nil then
+                    results[gidx] = {}
+                    eventlist[gidx] = {}
+                    counterlist[gidx] = {}
+                    results[gidx]["time"] = {}
+                end
+            elseif not line:match("^CPU clock:") and not line:match("Sum,Min,Max,Avg") then
+                -- Data line: event name, counter name, one value per core.
+                local linelist = likwid.stringsplit(line,",")
+                local event = linelist[1]
+                local counter = linelist[2]
+                table.remove(linelist,1)
+                table.remove(linelist,1)
+                for j=#linelist,1,-1 do
+                    if linelist[j] == "" then
+                        table.remove(linelist, j)
+                    end
+                end
+                if results[gidx][idx] == nil then
+                    results[gidx][idx] = {}
+                end
+                local is_runtime = event:match("[Rr]untime")
+                for j, value in pairs(linelist) do
+                    if is_runtime then
+                        results[gidx]["time"][cpulist[j]] = tonumber(value)
+                    else
+                        results[gidx][idx][cpulist[j]] = tonumber(value)
+                    end
+                end
+                if not is_runtime then
+                    table.insert(eventlist[gidx], idx, event)
+                    table.insert(counterlist[gidx], idx, counter)
+                    idx = idx + 1
+                end
+            elseif line:match("^CPU clock:") then
+                -- The value is scaled by 1e9; an unparsable line is skipped
+                -- instead of raising an arithmetic error on nil.
+                local ghz = tonumber(line:match("^CPU clock:,([%d.]+)"))
+                if ghz ~= nil then
+                    results["clock"] = ghz*1.E09
+                end
+            end
+        end
+    end
+    return host, tonumber(rank), results, cpulist
+end
+
+-- Parse one likwid-perfctr CSV output file produced in marker mode.
+--   filename: path matching "output_<id>_<rank>_<host>.csv".
+-- Returns host, rank (number), results and cpulist where
+--   results[region][g][e][cpu] is the value of event e of group g,
+--   results[region][g]["time"][cpu] and ["calls"][cpu] hold the RDTSC time
+--   and call count of the region, and results[region]["clock"] is the CPU
+--   clock in Hz (0 if no clock line could be parsed).
+-- Exits the script if the file cannot be opened.
+local function parseMarkerOutputFile(filename)
+    local rank = 0
+    local host = nil
+    local cpulist = {}
+    local eventlist = {}
+    local counterlist = {}
+    local idx = 1
+
+    local results = {}
+    local f = io.open(filename, "r")
+    if f == nil then
+        print("ERROR: Cannot open output file "..filename)
+        os.exit(1)
+    end
+    rank, host = filename:match("output_%d+_(%d+)_(%g+).csv")
+    local t = f:read("*all")
+    f:close()
+    local parse_reg_info = false
+    local parse_reg_output = false
+    local current_region = nil
+    local gidx = 0
+    local clock = 0
+    for _, line in pairs(likwid.stringsplit(t, "\n")) do
+        if (not line:match("^-")) and
+           (not line:match("^CPU type:")) and
+           (not line:match("^CPU name:")) and
+           (not line:match("^TABLE")) and
+           (not line:match("STAT")) then
+
+            if line:match("^STRUCT,Info") and not parse_reg_info then
+                parse_reg_info = true
+            elseif line:match("^Event") and not line:match("Sum,Min,Max,Avg") then
+                parse_reg_info = false
+                parse_reg_output = true
+                idx = 1
+            elseif line:match("^Event") and line:match("Sum,Min,Max,Avg") then
+                parse_reg_output = false
+            elseif line:match("^CPU clock") then
+                -- Accept both "CPU clock," and "CPU clock:," (the form used
+                -- by parseOutputFile); keep the previous value when the
+                -- number cannot be parsed.
+                local ghz = tonumber(line:match("^CPU clock:?,([%d.]+)"))
+                if ghz ~= nil then
+                    clock = ghz*1.E09
+                end
+            elseif parse_reg_info and line:match("^%d+,%g+") then
+                -- "<group id>,<region name>,..." starts a region info block.
+                gidx, current_region = line:match("^(%d+),(%g-),")
+                gidx = tonumber(gidx)
+                if results[current_region] == nil then
+                    results[current_region] = {}
+                end
+                if results[current_region][gidx] == nil then
+                    results[current_region][gidx] = {}
+                    results[current_region][gidx]["time"] = {}
+                    results[current_region][gidx]["calls"] = {}
+                end
+            elseif parse_reg_info and line:match("^Region Info") then
+                local linelist = likwid.stringsplit(line,",")
+                table.remove(linelist,1)
+                for _, cpustr in pairs(linelist) do
+                    local core = tonumber(cpustr:match("Core (%d+)"))
+                    if core ~= nil then
+                        local known = false
+                        for _,cpu in pairs(cpulist) do
+                            if core == cpu then
+                                known = true
+                                break
+                            end
+                        end
+                        if not known then
+                            table.insert(cpulist, core)
+                        end
+                    end
+                end
+            elseif parse_reg_info and line:match("^RDTSC") then
+                local linelist = likwid.stringsplit(line,",")
+                table.remove(linelist,1)
+                for j, time in pairs(linelist) do
+                    if time ~= "" then
+                        results[current_region][gidx]["time"][cpulist[j]] = tonumber(time)
+                    end
+                end
+            elseif parse_reg_info and line:match("^call count") then
+                local linelist = likwid.stringsplit(line,",")
+                table.remove(linelist,1)
+                for j, calls in pairs(linelist) do
+                    if calls:match("%d+") then
+                        results[current_region][gidx]["calls"][cpulist[j]] = tonumber(calls)
+                    end
+                end
+            elseif parse_reg_output then
+                -- Data line: event name, counter name, one value per core.
+                local linelist = likwid.stringsplit(line,",")
+                table.remove(linelist,1)
+                table.remove(linelist,1)
+                for j=#linelist,1,-1 do
+                    if linelist[j] == "" then
+                        table.remove(linelist, j)
+                    end
+                end
+                if results[current_region][gidx][idx] == nil then
+                    results[current_region][gidx][idx] = {}
+                end
+                for j, value in pairs(linelist) do
+                    results[current_region][gidx][idx][cpulist[j]] = tonumber(value)
+                end
+                idx = idx + 1
+            end
+        end
+    end
+    for region, _ in pairs(results) do
+        results[region]["clock"] = clock
+    end
+
+    return host, tonumber(rank), results, cpulist
+end
+
+-- Print the collected results of all ranks, one table set per group.
+--   group_list:  array of group data tables ("Events", optional "Metrics").
+--   all_results: table indexed by rank (starting at 0); each entry provides
+--                "hostname", "cpus" and "results".
+-- For every group an event table (one column per host:rank:cpu) is printed
+-- and, if the group defines metrics, a metric table. With more than one
+-- thread the tables from likwid.tableToMinMaxAvgSum are printed as well.
+-- Output is CSV when the global use_csv is set, ASCII tables otherwise.
+function printMpiOutput(group_list, all_results)
+
+    if #group_list == 0 or likwid.tablelength(all_results) == 0 then
+        return
+    end
+    for gidx, gdata in pairs(group_list) do
+        local firsttab = {}
+        local firsttab_combined = {}
+        local secondtab = {}
+        local secondtab_combined = {}
+        local total_threads = 0
+        for rank = 0, #all_results do
+            total_threads = total_threads + #all_results[rank]["cpus"]
+        end
+        -- The runtime row is only shown when no metric table follows or
+        -- when there is just a single thread.
+        local show_time = (total_threads == 1 or not gdata["Metrics"])
+        local has_calls = all_results[0]["results"][1]["calls"]
+
+        local desc = {"Event"}
+        if show_time then
+            table.insert(desc, "Runtime (RDTSC) [s]")
+        end
+        if has_calls then
+            table.insert(desc, "Region calls")
+        end
+        for i=1,#gdata["Events"] do
+            table.insert(desc, gdata["Events"][i]["Event"])
+        end
+        table.insert(firsttab, desc)
+
+        desc = {"Counter"}
+        if show_time then
+            table.insert(desc, "TSC")
+        end
+        if has_calls then
+            table.insert(desc, "")
+        end
+        for i=1,#gdata["Events"] do
+            table.insert(desc, gdata["Events"][i]["Counter"])
+        end
+        table.insert(firsttab, desc)
+
+        for rank = 0, #all_results do
+            for _, cpu in pairs(all_results[rank]["cpus"]) do
+                local column = {all_results[rank]["hostname"]..":"..tostring(rank)..":"..tostring(cpu)}
+                if show_time then
+                    table.insert(column, all_results[rank]["results"][gidx]["time"][cpu])
+                end
+                if has_calls then
+                    table.insert(column, all_results[rank]["results"][gidx]["calls"][cpu])
+                end
+                for j=1,#gdata["Events"] do
+                    if all_results[rank]["results"][gidx][j] ~= nil then
+                        table.insert(column, all_results[rank]["results"][gidx][j][cpu])
+                    else
+                        table.insert(column, 0)
+                    end
+                end
+                table.insert(firsttab, column)
+            end
+        end
+
+        if total_threads > 1 then
+            firsttab_combined = likwid.tableToMinMaxAvgSum(firsttab, 2, 1)
+        end
+        if gdata["Metrics"] then
+            secondtab[1] = {"Metric"}
+            for j=1,#gdata["Metrics"] do
+                table.insert(secondtab[1], gdata["Metrics"][j]["description"])
+            end
+
+            for rank = 0, #all_results do
+                for _, cpu in pairs(all_results[rank]["cpus"]) do
+                    local counterlist = {}
+                    for j=1,#gdata["Events"] do
+                        local counter = gdata["Events"][j]["Counter"]
+                        if all_results[rank]["results"][gidx][j] ~= nil then
+                            counterlist[counter] = all_results[rank]["results"][gidx][j][cpu]
+                        else
+                            counterlist[counter] = 0
+                        end
+                    end
+                    counterlist["time"] = all_results[rank]["results"][gidx]["time"][cpu]
+                    counterlist["inverseClock"] = 1.0/all_results[rank]["results"]["clock"]
+                    local tmpList = {all_results[rank]["hostname"]..":"..tostring(rank)..":"..tostring(cpu)}
+                    -- Iterate over the metrics of the current group (gdata).
+                    for j=1,#gdata["Metrics"] do
+                        local tmp = likwid.calculate_metric(gdata["Metrics"][j]["formula"], counterlist)
+                        if tostring(tmp):len() > 12 then
+                            tmp = string.format("%e",tmp)
+                        end
+                        table.insert(tmpList, tostring(tmp))
+                    end
+                    table.insert(secondtab,tmpList)
+                end
+            end
+
+            if total_threads > 1 then
+                secondtab_combined = likwid.tableToMinMaxAvgSum(secondtab, 1, 1)
+            end
+        end
+        if use_csv then
+            print("Group,"..tostring(gidx))
+            likwid.printcsv(firsttab)
+            if total_threads > 1 then likwid.printcsv(firsttab_combined) end
+            if gdata["Metrics"] then
+                likwid.printcsv(secondtab)
+                if total_threads > 1 then likwid.printcsv(secondtab_combined) end
+            end
+        else
+            print("Group: "..tostring(gidx))
+            likwid.printtable(firsttab)
+            if total_threads > 1 then likwid.printtable(firsttab_combined) end
+            if gdata["Metrics"] then
+                likwid.printtable(secondtab)
+                if total_threads > 1 then likwid.printtable(secondtab_combined) end
+            end
+        end
+    end
+end
+
+-- Without any argument just show the usage text.
+if #arg == 0 then
+    usage()
+    os.exit(0)
+end
+
+-- Command line parsing. Inside the loop 'arg' is the loop variable holding
+-- the current option's argument and shadows the global argument vector.
+for opt,arg in likwid.getopt(arg, {"n:","np:", "nperdomain:","pin:","hostfile:","h","help","v","g:","group:","mpi:","omp:","d","m","O","debug","marker","version","s:","skip:"}) do
+    if (type(arg) == "string") then
+        -- An argument starting with '-' usually means that the real argument
+        -- was forgotten and the next option got consumed instead.
+        local s,e = arg:find("-")
+        if s == 1 then
+            print(string.format("ERROR: Argument %s to option -%s starts with invalid character -.", arg, opt))
+            print("ERROR: Did you forget an argument to an option?")
+            os.exit(1)
+        end
+    end
+
+    if opt == "h" or opt == "help" then
+        usage()
+        os.exit(0)
+    elseif opt == "v" or opt == "version" then
+        version()
+        os.exit(0)
+    elseif opt == "d" or opt == "debug" then
+        debug = true
+    elseif opt == "m" or opt == "marker" then
+        use_marker = true
+    elseif opt == "O" then
+        use_csv = true
+    elseif opt == "n" or opt == "np" then
+        np = tonumber(arg)
+        -- A non-numeric value would otherwise surface later as an
+        -- arithmetic error on nil.
+        if np == nil then
+            print("ERROR: Argument to -n/-np must be a number")
+            os.exit(1)
+        end
+    elseif opt == "nperdomain" then
+        nperdomain = arg
+        local domain, count = nperdomain:match("([NSCM]%d*):(%d+)")
+        if domain == nil then
+            print("Invalid option to -nperdomain")
+            os.exit(1)
+        end
+    elseif opt == "hostfile" then
+        hostfile = arg
+    elseif opt == "pin" then
+        -- Per-process CPU expressions are separated by '_'.
+        cpuexprs = likwid.stringsplit(arg, "_")
+    elseif opt == "g" or opt == "group" then
+        table.insert(perf, arg)
+    elseif opt == "mpi" then
+        mpitype = arg
+    elseif opt == "omp" then
+        omptype = arg
+    elseif opt == "s" or opt == "skip" then
+        skipStr = "-s "..arg
+    elseif opt == "?" then
+        print("Invalid commandline option -"..arg)
+        os.exit(1)
+    end
+end
+
+-- MPIROOT is set earlier in the script (not visible here); an empty value is treated as "no MPI installation found".
+if MPIROOT == "" then
+    print("Please load a MPI module or set path to MPI folder in MPIHOME environment variable")
+    print("$MPIHOME/bin/<MPI launcher> should be valid")
+    os.exit(1)
+end
+
+if np == 0 and nperdomain == nil and #cpuexprs == 0 then
+    print("ERROR: No option -n/-np, -nperdomain or -pin")
+    os.exit(1)
+end
+
+-- The entries still in the argument table form the command to run (presumably likwid.getopt removed the parsed options).
+for i=1,#arg do
+    table.insert(executable, arg[i])
+end
+if #executable == 0 then
+    print("ERROR: No executable given on commandline")
+    os.exit(1)
+elseif debug then
+    print("DEBUG: Executable given on commandline: "..table.concat(executable, " "))
+end
+
+-- Determine the MPI implementation unless it was given with -mpi.
+if mpitype == nil then
+    mpitype = getMpiType()
+    if debug then
+        print("DEBUG: Using MPI implementation "..tostring(mpitype))  -- tostring: do not crash here if detection returned nil
+    end
+end
+if mpitype ~= "intelmpi" and mpitype ~= "mvapich2" and mpitype ~= "openmpi" then
+    print("ERROR: Unknown MPI given. Possible values: openmpi, intelmpi, mvapich2")
+    os.exit(1)
+end
+
+-- Same for OpenMP (-omp); a failed detection only produces a warning.
+if omptype == nil then
+    omptype = getOmpType()
+    if omptype == nil then
+        print("WARN: Cannot extract OpenMP vendor from executable or commandline, assuming no OpenMP")
+    elseif debug then
+        print("DEBUG: Using OpenMP implementation "..omptype)
+    end
+end
+
+-- Without -hostfile, take the first non-empty host file variable of the batch system.
+if not hostfile then
+    for _, envname in ipairs({"PBS_NODEFILE", "LOADL_HOSTFILE", "SLURM_HOSTFILE"}) do
+        hostfile = os.getenv(envname)
+        if hostfile and hostfile ~= "" then break end
+    end
+    if not hostfile or hostfile == "" then
+        print("ERROR: No hostfile given and not in batch environment")
+        os.exit(1)
+    end
+    -- NOTE(review): all three batch variables are read with the PBS reader; confirm the file formats match.
+    hosts = readHostfilePBS(hostfile)
+else
+    hosts = MPIINFO[mpitype]["readHostfile"](hostfile)
+end
+
+local givenNrNodes = getNumberOfNodes(hosts)
+-- If no skip mask was given with -s/--skip, pick one from the MPI/OpenMP combination and the number of nodes.
+if skipStr == "" then
+    if mpitype == "intelmpi" then
+        if omptype == "intel" then
+            skipStr = '-s 0x3'
+        elseif omptype == "gnu" then
+            skipStr = '-s 0x1'
+        end
+    elseif mpitype == "mvapich2" then
+        if omptype == "intel" and givenNrNodes > 1 then
+            skipStr = '-s 0x7'
+        end
+    elseif mpitype == "openmpi" then
+        if omptype == "intel" and givenNrNodes > 1 then
+            skipStr = '-s 0x7'
+        elseif omptype == "intel" and givenNrNodes == 1 then
+            skipStr = '-s 0x1'
+        elseif omptype == "gnu" and givenNrNodes > 1 then
+            skipStr = '-s 0x7'
+        elseif omptype == "gnu" and givenNrNodes == 1 then
+            skipStr = '-s 0x3'
+        end
+    end
+end
+if debug and skipStr ~= "" then
+    print(string.format("DEBUG: Using skip option %s to skip pinning of shepherd threads", skipStr))
+end
+
+
+-- Derive np (processes), ppn (processes per node) and cpuexprs from whichever of -pin, -nperdomain or -np was given.
+if #cpuexprs > 0 then
+    cpuexprs = calculatePinExpr(cpuexprs)
+    ppn = #cpuexprs
+    if np == 0 then
+        if debug then
+            print(string.format("DEBUG: No -np given , setting according to pin expression and number of available hosts"))
+        end
+        np = givenNrNodes
+    end
+    newhosts = assignHosts(hosts, np, ppn)
+    if np > #cpuexprs*#newhosts then
+        print("WARN: Oversubscribing not allowed.")
+        print(string.format("WARN: You want %d processes but the pinning expression has only expressions for %d for %d hosts", np, #cpuexprs*#newhosts, #newhosts))
+        np = #cpuexprs*#newhosts
+        print(string.format("WARN: Sanitizing number of processes to %d.", np))
+    end
+elseif nperdomain ~= nil then
+    cpuexprs = calculateCpuExprs(nperdomain, cpuexprs)
+    ppn = #cpuexprs
+    if np == 0 then
+        np = givenNrNodes * ppn
+    end
+    if np < ppn then
+        if debug then
+            print("WARN: Removing additional cpu expressions to get requested number of processes")
+        end
+        for i=np+1,ppn do  -- drop expressions from the end until only np remain
+            if debug then
+                print("WARN: Remove cpuexpr: "..cpuexprs[#cpuexprs])
+            end
+            table.remove(cpuexprs, #cpuexprs)
+        end
+        ppn = np
+    elseif np > (givenNrNodes * ppn) then
+        print("WARN: Oversubscribing nodes not allowed!")
+        print(string.format("WARN: You want %d processes with %d on each of the %d hosts", np, ppn, givenNrNodes))
+        np = givenNrNodes * ppn
+        print(string.format("WARN: Sanitizing number of processes to %d.", np))
+    end
+    newhosts, ppn = assignHosts(hosts, np, ppn)
+elseif ppn == 0 and np > 0 then
+    ppn = math.floor(np/givenNrNodes)  -- NOTE(review): ppn is 0 when np < number of hosts and np is then sanitized to 0 below; confirm intended
+    if (ppn * givenNrNodes) ~= np then
+        print("WARN: Processes cannot be equally distributed")
+        print(string.format("WARN: You want %d processes on %d hosts.", np, givenNrNodes))
+        np = givenNrNodes * ppn
+        print(string.format("WARN: Sanitizing number of processes to %d", np))
+    end
+    local newexprs = calculateCpuExprs("E:N:"..tostring(ppn), cpuexprs)
+    for i, expr in pairs(newexprs) do
+        local exprlist = likwid.stringsplit(expr, ",")
+        local seclength = #exprlist/ppn
+        -- Cut the comma-separated list into ppn consecutive sections; each section becomes one CPU expression.
+        for p=1, ppn do
+            local str = ""
+            for j=1, seclength do
+                str = str .. exprlist[((p-1)*seclength) + j] ..","
+            end
+            str = str:sub(1,#str-1)  -- strip the trailing comma
+            table.insert(cpuexprs, str)
+        end
+    end
+
+    newhosts, ppn = assignHosts(hosts, np, ppn)
+else
+    print("ERROR: Commandline settings are not supported.")
+    os.exit(1)
+end
+
+local grouplist = {}
+if #perf > 0 then
+    perfexprs, grouplist = setPerfStrings(perf, cpuexprs)
+end
+
+local nrNodes = getNumberOfNodes(newhosts)
+-- All temporary files carry the PID; %%r and %%h leave literal %r/%h placeholders in the output file name.
+local pid = likwid.getpid()
+local hostfilename = string.format(".hostfile_%s.txt", pid)
+local scriptfilename = string.format(".likwidscript_%s.txt", pid)
+local outfilename = os.getenv("PWD")..string.format("/.output_%s_%%r_%%h.csv", pid)  -- PWD stays out of the format string: a '%' in the path must not be interpreted
+
+MPIINFO[mpitype]["writeHostfile"](newhosts, hostfilename)
+writeWrapperScript(scriptfilename, table.concat(executable, " "), newhosts, outfilename)
+local env = MPIINFO[mpitype]["getEnvironment"]()
+MPIINFO[mpitype]["executeCommand"](scriptfilename, hostfilename, env, nrNodes)
+
+os.remove(scriptfilename)
+os.remove(hostfilename)
+-- Look up the result files of this run (listdir gets the working directory and the ".output_<pid>" name part), parse and delete them.
+infilepart = ".output_"..pid
+filelist = listdir(os.getenv("PWD"), infilepart)
+all_results = {}
+if not use_marker then
+    for i, file in pairs(filelist) do
+        local host, rank, results, cpulist = parseOutputFile(file)
+        all_results[rank] = all_results[rank] or {}
+        all_results[rank]["hostname"] = host
+        all_results[rank]["results"] = results
+        all_results[rank]["cpus"] = cpulist
+        os.remove(file)
+    end
+    printMpiOutput(grouplist, all_results)
+else
+    local tmpList = {}  -- rank -> marker results as returned by parseMarkerOutputFile
+    for i, file in pairs(filelist) do
+        local host, rank, results, cpulist = parseMarkerOutputFile(file)
+        all_results[rank] = all_results[rank] or {}
+        all_results[rank]["hostname"] = host
+        all_results[rank]["cpus"] = cpulist
+        tmpList[rank] = results
+        os.remove(file)
+    end
+    if tmpList[0] == nil then  -- rank 0 supplies the region names for the loop below; without it nothing can be printed
+        print("ERROR: No marker results found for MPI rank 0")
+        os.exit(1)
+    end
+    for reg, _ in pairs(tmpList[0]) do
+        print("Region: "..reg)
+        for rank,_ in pairs(all_results) do
+            all_results[rank]["results"] = tmpList[rank][reg]
+        end
+        printMpiOutput(grouplist, all_results)
+    end
+end
diff --git a/src/applications/likwid-perfctr.c b/src/applications/likwid-perfctr.c
deleted file mode 100644
index 6c9f98f..0000000
--- a/src/applications/likwid-perfctr.c
+++ /dev/null
@@ -1,528 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  likwid-perfctr.c
- *
- *      Description:  An application to read out performance counter registers
- *                  on x86 processors
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-/* #####   HEADER FILE INCLUDES   ######################################### */
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <sched.h>
-#include <time.h>
-#include <sys/types.h>
-#include <unistd.h>
-#include <ctype.h>
-#include <signal.h>
-
-#include <error.h>
-#include <types.h>
-#include <bitUtil.h>
-#include <accessClient.h>
-#include <msr.h>
-#include <timer.h>
-#include <cpuid.h>
-#include <affinity.h>
-#include <cpuFeatures.h>
-#include <perfmon.h>
-#include <daemon.h>
-#include <bstrlib.h>
-#include <numa.h>
-#include <strUtil.h>
-
-
-/* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
-#define HELP_MSG \
-fprintf(stdout, "likwid-perfctr --  Version  %d.%d \n\n",VERSION,RELEASE); \
-fprintf(stdout, "\n"); \
-fprintf(stdout, "Example Usage: likwid-perfctr -C 2  ./a.out \n"); \
-fprintf(stdout, "Supported Options:\n"); \
-fprintf(stdout, "-h\t Help message\n"); \
-fprintf(stdout, "-v\t Version information\n"); \
-fprintf(stdout, "-V\t verbose output\n"); \
-fprintf(stdout, "-g\t performance group or event set string\n"); \
-fprintf(stdout, "-H\t Get group help (together with -g switch) \n"); \
-fprintf(stdout, "-t\t timeline mode with frequency in s or ms, e.g. 300ms\n"); \
-fprintf(stdout, "-S\t stethoscope mode with duration in s\n"); \
-fprintf(stdout, "-m\t use markers inside code \n"); \
-fprintf(stdout, "-s\t bitmask with threads to skip\n"); \
-fprintf(stdout, "-o\t Store output to file, with output conversation according to file suffix\n"); \
-fprintf(stdout, "\t Conversation scripts can be supplied in %s\n",TOSTRING(LIKWIDFILTERPATH)); \
-fprintf(stdout, "-O\t Output easily parseable CSV instead of fancy tables\n"); \
-fprintf(stdout, "-M\t set how MSR registers are accessed: 0=direct, 1=msrd\n"); \
-fprintf(stdout, "-a\t list available performance groups\n"); \
-fprintf(stdout, "-e\t list available counters and events\n"); \
-fprintf(stdout, "-i\t print cpu info\n"); \
-fprintf(stdout, "-c\t processor ids to measure (required), e.g 0,3-4,8\n"); \
-fprintf(stdout, "-C\t processor ids to measure (this variant also cares for pinning of process/threads)\n"); \
-fprintf(stdout, "\t\t for -c and -C, see likwid-pin -h for details\n"); \
-fflush(stdout);
-
-
-#define VERSION_MSG \
-fprintf(stdout, "likwid-perfctr  %d.%d \n\n",VERSION,RELEASE); \
-fflush(stdout);
-
-/* To be able to give useful error messages instead of just dieing without a
- * comment. Mainly happens because we get a SIGPIPE if the daemon drops us. */
-static void Signal_Handler(int sig)
-{
-    fprintf(stderr, "ERROR - [%s:%d] Signal %d caught\n", __FILE__, __LINE__, sig);
-}
-
-/* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
-int main (int argc, char** argv)
-{
-    int optInfo = 0;
-    int optPrintGroups = 0;
-    int optPrintGroupHelp = 0;
-    int optPrintEvents = 0;
-    int optUseMarker = 0;
-    int optReport = 0;
-    int optTimeline = 0;
-    int optStethoscope = 0;
-    int optPin = 0;
-    int c;
-    bstring eventString = bfromcstr("_NOGROUP");
-    bstring  argString;
-    bstring  pinString;
-    bstring  skipString;
-    bstring  filterScript = bfromcstr("NO");
-    int skipMask = -1;
-    BitMask counterMask;
-    bstring filepath = bformat("/tmp/likwid_%u.txt", (uint32_t) getpid());
-    int numThreads = 0;
-    int threads[MAX_NUM_THREADS];
-    threads[0] = 0;
-    int i,j;
-    FILE* OUTSTREAM = stdout;
-    struct timespec interval;
-
-    if (argc ==  1)
-    {
-        HELP_MSG;
-        bdestroy(filepath);
-        bdestroy(eventString);
-        exit (EXIT_SUCCESS);
-    }
-
-    if (cpuid_init() == EXIT_FAILURE)
-    {
-        ERROR_PLAIN_PRINT(Unsupported processor!);
-    }
-    numa_init();
-    affinity_init();
-
-    while ((c = getopt (argc, argv, "+ac:C:d:eg:hHimM:o:OPs:S:t:vV")) != -1)
-    {
-        switch (c)
-        {
-            case 'a':
-                numThreads = 1; /*to get over the error message */
-                threads[0] = 0;
-                optPrintGroups = 1;
-                break;
-            case 'C':
-                optPin = 1;
-                CHECK_OPTION_STRING;
-                numThreads = bstr_to_cpuset(threads, argString);
-
-                if(!numThreads)
-                {
-                    ERROR_PLAIN_PRINT(Failed to parse cpu list.);
-                }
-
-                break;
-            case 'c':
-                CHECK_OPTION_STRING;
-                numThreads = bstr_to_cpuset(threads, argString);
-                if(!numThreads)
-                {
-                    ERROR_PLAIN_PRINT(Failed to parse cpu list.);
-                }
-
-                break;
-            case 'd':
-                fprintf(stdout, "Option -d for daemon mode is deprecated. Daemon mode has be renamed to timeline mode (Option -t)!\n");
-                fflush(stdout);
-                break;
-            case 'e':
-                numThreads=1; /*to get over the error message */
-                threads[0]=0;
-                optPrintEvents = 1;
-                break;
-            case 'g':
-                CHECK_OPTION_STRING;
-                eventString = bstrcpy(argString);
-                break;
-            case 'h':
-                HELP_MSG;
-                cpuid_print();
-                bdestroy(filepath);
-                bdestroy(eventString);
-                exit (EXIT_SUCCESS);
-            case 'H':
-                numThreads=1; /*to get over the error message */
-                threads[0]=0;
-                optPrintGroupHelp = 1;
-                break;
-            case 'i':
-                numThreads=1; /*to get over the error message */
-                threads[0]=0;
-                optInfo = 1;
-                perfmon_verbose = 1;
-                break;
-            case 'm':
-                optUseMarker = 1;
-                break;
-            case 'M':  /* Set MSR Access mode */
-                CHECK_OPTION_STRING;
-                accessClient_setaccessmode(str2int((char*) argString->data));
-                break;
-            case 'o':
-                CHECK_OPTION_STRING;
-                OUTSTREAM = bstr_to_outstream(argString, filterScript);
-
-                if(!OUTSTREAM)
-                {
-                    ERROR_PLAIN_PRINT(Failed to parse out file pattern.);
-                }
-                break;
-            case 'O':
-                perfmon_setCSVMode(1);
-                break;
-            case 's':
-                CHECK_OPTION_STRING;
-                skipMask = strtoul((char*) argString->data,NULL,16);
-                break;
-            case 'S':
-                CHECK_OPTION_STRING;
-                optStethoscope = str2int((char*) argString->data);
-                if (optStethoscope <= 0)
-                {
-                    fprintf(stderr, "The measurement time must be larger than 0\n\n");
-                    HELP_MSG;
-                    exit(EXIT_FAILURE);
-                }
-                break;
-            case 't':
-                CHECK_OPTION_STRING;
-                bstr_to_interval(argString, &interval);
-                optTimeline = 1;
-                break;
-            case 'v':
-                VERSION_MSG;
-                bdestroy(filepath);
-                bdestroy(eventString);
-                exit (EXIT_SUCCESS);
-            case 'V':
-                perfmon_verbose = 1;
-                break;
-            case '?':
-                if (optopt == 'S'||optopt == 't'||optopt == 'c'||optopt == 'C'||
-                    optopt == 'o'||optopt == 'M'||optopt == 'g')
-                {
-
-                }
-                else if (isprint (optopt))
-                {
-                    fprintf (stderr, "Unknown option `-%c'.\n", optopt);
-                }
-                else
-                {
-                    fprintf (stderr,
-                            "Unknown option character `\\x%x'.\n",
-                            optopt);
-                }
-                return EXIT_FAILURE;
-            default:
-                HELP_MSG;
-                bdestroy(filepath);
-                bdestroy(eventString);
-                exit (EXIT_SUCCESS);
-        }
-    }
-
-    if (!numThreads)
-    {
-        fprintf (stderr, "ERROR: Required -c. You must specify at least one processor.\n");
-        HELP_MSG;
-        exit(EXIT_FAILURE);
-    }
-
-    if (optPin)
-    {
-
-        if ( getenv("OMP_NUM_THREADS") == NULL )
-        {
-            argString = bformat("%d",numThreads);
-            setenv("OMP_NUM_THREADS",(char*) argString->data , 0);
-        }
-
-        if (numThreads > 1)
-        {
-            bstring ldPreload = bfromcstr(getenv("LD_PRELOAD"));
-
-            pinString = bformat("%d",threads[1]);
-
-            for (i=2; i < numThreads;i++)
-            {
-                bformata(pinString,",%d",threads[i]);
-            }
-
-            bformata(pinString,",%d",threads[0]);
-
-            if (skipMask > 0)
-            {
-                skipString = bformat("%d",skipMask);
-                setenv("LIKWID_SKIP",(char*) skipString->data , 1);
-            }
-            setenv("KMP_AFFINITY", "disabled", 1);
-            setenv("LIKWID_PIN",(char*) pinString->data , 1);
-
-            setenv("LIKWID_SILENT","true", 1);
-            if (ldPreload == NULL)
-            {
-                setenv("LD_PRELOAD",TOSTRING(LIBLIKWIDPIN), 1);
-            }
-            else
-            {
-                bconchar(ldPreload, ':');
-                bcatcstr(ldPreload, TOSTRING(LIBLIKWIDPIN));
-                setenv("LD_PRELOAD", bdata(ldPreload), 1);
-            }
-        }
-
-        affinity_pinProcess(threads[0]);
-    }
-
-
-    for (i = 0; i< numThreads;i++)
-    {
-        for (j = 0; j< numThreads;j++)
-        {
-            if(i != j && threads[i] == threads[j])
-            {
-                fprintf (stderr, "ERROR: Processor list (%d",threads[0]);
-                for (c=1;c<numThreads;c++)
-                {
-                    fprintf (stderr, ",%d",threads[c]);
-                }
-                fprintf (stderr, ") is not unique.\n");
-                exit(EXIT_FAILURE);
-            }
-        }
-    }
-
-    { /* Init signal handler */
-        struct sigaction sia;
-        sia.sa_handler = Signal_Handler;
-        sigemptyset(&sia.sa_mask);
-        sia.sa_flags = 0;
-        sigaction(SIGPIPE, &sia, NULL);
-    }
-
-    perfmon_init(numThreads, threads, OUTSTREAM);
-
-    if (perfmon_verbose)
-    {
-        fprintf(OUTSTREAM,"CPU family:\t%u \n",cpuid_info.family);
-        fprintf(OUTSTREAM,"CPU model:\t%u \n", cpuid_info.model);
-        fprintf(OUTSTREAM,"CPU stepping:\t%u \n", cpuid_info.stepping);
-        fprintf(OUTSTREAM,"CPU features:\t%s \n", cpuid_info.features);
-
-        if( cpuid_info.family == P6_FAMILY && cpuid_info.perf_version)
-        {
-            fprintf(OUTSTREAM,HLINE);
-            fprintf(OUTSTREAM,"PERFMON version:\t%u \n",cpuid_info.perf_version);
-            fprintf(OUTSTREAM,"PERFMON number of counters:\t%u \n",cpuid_info.perf_num_ctr);
-            fprintf(OUTSTREAM,"PERFMON width of counters:\t%u \n",cpuid_info.perf_width_ctr);
-            fprintf(OUTSTREAM,"PERFMON number of fixed counters:\t%u \n",cpuid_info.perf_num_fixed_ctr);
-        }
-    }
-    fprintf(OUTSTREAM,HLINE);
-    fflush(OUTSTREAM);
-
-    if (optInfo)
-    {
-        exit (EXIT_SUCCESS);
-    }
-    if (optPrintGroups)
-    {
-        perfmon_printAvailableGroups();
-        exit (EXIT_SUCCESS);
-    }
-    if (optPrintGroupHelp)
-    {
-        perfmon_printGroupHelp(eventString);
-        exit (EXIT_SUCCESS);
-    }
-    if (optPrintEvents)
-    {
-        perfmon_printCounters();
-        perfmon_printEvents();
-        exit (EXIT_SUCCESS);
-    }
-    if ((!optTimeline && !optStethoscope) && (optind == argc))
-    {
-        fprintf(OUTSTREAM,"NOTICE: You have to specify a program to measure as argument!\n");
-        exit (EXIT_SUCCESS);
-    }
-    argv +=  optind;
-    bstring exeString = bfromcstr(argv[0]);
-    for (i=1; i<(argc-optind); i++)
-        {
-            bconchar(exeString, ' ');
-            bcatcstr(exeString, argv[i]);
-        }
-    if (blength(exeString) == 0 && !optStethoscope)
-    {
-        fprintf(OUTSTREAM, "Executable must be given on commandline\n");
-        fflush(OUTSTREAM);
-        exit(EXIT_FAILURE);
-    }
-    if (biseqcstr(eventString,"_NOGROUP"))
-    {
-        fprintf(OUTSTREAM,"NOTICE: You have to specify a group or event set to measure using the -g option.\n");
-        fprintf(OUTSTREAM,"        Use likwid-perfctr -a to get a list of available groups and likwid-perfctr -e for supported events.\n\n");
-        exit (EXIT_SUCCESS);
-    }
-
-    timer_init();
-
-    fprintf(OUTSTREAM,HLINE);
-    fprintf(OUTSTREAM,"CPU type:\t%s \n",cpuid_info.name);
-    fprintf(OUTSTREAM,"CPU clock:\t%3.2f GHz \n",  (float) timer_getCpuClock() * 1.E-09);
-    fflush(OUTSTREAM);
-
-    fprintf(OUTSTREAM,HLINE);
-    fflush(OUTSTREAM);
-
-    if (optStethoscope)
-    {
-        perfmon_setupEventSet(eventString, &counterMask);
-        perfmon_startCounters();
-        sleep(optStethoscope);
-        perfmon_stopCounters();
-        perfmon_printCounterResults();
-    }
-    else if (optTimeline)
-    {
-        fprintf(OUTSTREAM,"CORES: %d", threads[0]);
-        for (int i=1; i<numThreads; i++)
-        {
-            fprintf(OUTSTREAM," %d", threads[i]);
-        }
-        fprintf(OUTSTREAM," \n");
-        fflush(OUTSTREAM);
-
-        daemon_start(eventString, interval);
-        if (system(bdata(exeString)) == EOF)
-        {
-            fprintf(stderr, "Failed to execute %s!\n", bdata(exeString));
-            exit(EXIT_FAILURE);
-        }
-        daemon_stop(SIGINT);
-    }
-    else
-    {
-        if (perfmon_verbose)
-        {
-            fprintf(OUTSTREAM,"Executing: %s \n",bdata(exeString));
-            fflush(OUTSTREAM);
-        }
-
-        if (optReport)
-        {
-            //        multiplex_start();
-        }
-        else if (!optUseMarker && !optTimeline)
-        {
-            perfmon_setupEventSet(eventString, &counterMask);
-            perfmon_startCounters();
-        }
-        else
-        {
-            if (getenv("LIKWID_FILEPATH") == NULL)
-                setenv("LIKWID_FILEPATH",(char*) filepath->data, 1);
-            perfmon_setupEventSet(eventString, &counterMask);
-            char* modeStr = (char*) malloc(40 * sizeof(char));
-            sprintf(modeStr,"%d",accessClient_mode);
-            setenv("LIKWID_MODE", modeStr, 1);
-            bitMask_toString(modeStr,counterMask);
-            setenv("LIKWID_MASK", modeStr, 1);
-            free(modeStr);
-
-            perfmon_startCounters();
-        }
-
-        if (system(bdata(exeString)) == EOF)
-        {
-            fprintf(stderr, "Failed to execute %s!\n", bdata(exeString));
-            exit(EXIT_FAILURE);
-        }
-
-        if (optReport)
-        {
-            //        multiplex_stop();
-            //        perfmon_printReport(&set);
-        }
-        else
-        {
-            if (optUseMarker)
-            {
-                perfmon_stopCounters();
-                perfmon_printMarkerResults(filepath);
-            }
-            else
-            {
-                perfmon_stopCounters();
-                perfmon_printCounterResults();
-            }
-        }
-    }
-
-    bdestroy(filepath);
-    bdestroy(exeString);
-    perfmon_finalize();
-    fflush(OUTSTREAM);
-    fclose(OUTSTREAM);
-    /* call filterscript if specified */
-    if (!biseqcstr(filterScript,"NO"))
-    {
-        bcatcstr(filterScript, " perfctr");
-        if (system(bdata(filterScript)) == EOF)
-        {
-            fprintf(stderr, "Failed to execute filter %s!\n", bdata(filterScript));
-            exit(EXIT_FAILURE);
-        }
-    }
-
-    return EXIT_SUCCESS;
-}
-
diff --git a/src/applications/likwid-perfctr.lua b/src/applications/likwid-perfctr.lua
new file mode 100644
index 0000000..2ad115d
--- /dev/null
+++ b/src/applications/likwid-perfctr.lua
@@ -0,0 +1,715 @@
+#!<PREFIX>/bin/likwid-lua
+--[[
+ * =======================================================================================
+ *
+ *      Filename:  likwid-perfctr.lua
+ *
+ *      Description:  An application to read out performance counter registers
+ *                    on x86 processors
+ *
+ *      Version:   4.0
+ *      Released:  16.6.2015
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+]]
+
+package.path = '<PREFIX>/share/lua/?.lua;' .. package.path
+
+local likwid = require("likwid")
+
+local function version() -- print the tool name followed by likwid.version and likwid.release
+    print(("likwid-perfctr --  Version %d.%d"):format(likwid.version, likwid.release))
+end
+
+local function examples() -- print one sample invocation
+    print "Examples:"
+    print "Run command on CPU 2 and measure performance group TEST:"
+    print "likwid-perfctr -C 2 -g TEST ./a.out"
+end
+
+local function usage() -- print the version banner, the option overview and the example
+    version()
+    print "A tool to read out performance counter registers on x86 processors\n"
+    print "Options:"
+    print "-h, --help\t\t Help message"
+    print "-v, --version\t\t Version information"
+    print "-V, --verbose <level>\t Verbose output, 0 (only errors), 1 (info), 2 (details), 3 (developer)"
+    print "-c <list>\t\t Processor ids to measure (required), e.g. 1,2-4,8"
+    print "-C <list>\t\t Processor ids to pin threads and measure, e.g. 1,2-4,8"
+    print "\t\t\t For information about the <list> syntax, see likwid-pin"
+    print "-g, --group <string>\t Performance group or custom event set string"
+    print "-H\t\t\t Get group help (together with -g switch)"
+    print "-s, --skip <hex>\t Bitmask with threads to skip"
+    print "-M <0|1>\t\t Set how MSR registers are accessed, 0=direct, 1=accessDaemon"
+    print "-a\t\t\t List available performance groups"
+    print "-e\t\t\t List available events and counter registers"
+    print "-E <string>\t\t List available events and corresponding counters that match <string>"
+    print "-i, --info\t\t Print CPU info"
+    print "-T <time>\t\t Switch eventsets with given frequency"
+    print "Modes:"
+    print "-S <time>\t\t Stethoscope mode with duration in s, ms or us, e.g 20ms"
+    print "-t <time>\t\t Timeline mode with frequency in s, ms or us, e.g. 300ms"
+    print "-m, --marker\t\t Use Marker API inside code"
+    print "Output options:"
+    print "-o, --output <file>\t Store output to file. (Optional: Apply text filter according to filename suffix)"
+    print "-O\t\t\t Output easily parseable CSV instead of fancy tables"
+    print "\n"
+    examples()
+end
+
+
+local config = likwid.getConfiguration()
+verbose = 0 -- numeric verbosity level, changed by -V (and set to boolean true by -i)
+print_groups = false
+print_events = false
+print_event = nil
+print_info = false
+cpulist = nil
+num_cpus = 0
+pin_cpus = false
+group_string = nil -- not read again in this script
+event_string = nil
+event_string_list = {} -- one entry per -g option, in commandline order
+avail_groups = {}
+num_avail_groups = 0
+group_list = {}
+group_ids = {}
+activeGroup = 0
+print_group_help = false
+skip_mask = "0x0"
+counter_mask = {} -- not read again in this script
+access_flags = "e" -- flag string later passed to likwid.msr_available()
+if config["daemonMode"] < 0 then -- negative configured mode: fall back to access mode 1
+    access_mode = 1
+else
+    access_mode = config["daemonMode"]
+    if access_mode == 0 then
+        access_flags = "rw"
+    end
+end
+set_access_modes = false -- becomes true only when -M is given
+use_marker = false
+use_stethoscope = false
+use_timeline = false
+daemon_run = 0 -- not read again in this script
+use_wrapper = false
+duration = 2.E06 -- presumably microseconds: values >= 1.E06 go to sleep(duration/1.E06), smaller ones to usleep(duration)
+switch_interval = 5 -- not read again in this script (-T stores its value in duration)
+output = "" -- not read again in this script
+use_csv = false
+execString = nil
+outfile = nil
+gotC = false -- true once -c or -C was seen, even if no valid CPU resulted
+markerFile = string.format("/tmp/likwid_%d.txt",likwid.getpid("pid"))
+print_stdout = print -- keep the original print; the -o handler replaces the global print
+likwid.catchSignal()
+
+if #arg == 0 then -- no arguments at all: show help and leave
+    usage()
+    os.exit(0)
+end
+
+for opt,arg in likwid.getopt(arg, {"a", "c:", "C:", "e", "E:", "g:", "h", "H", "i", "m", "M:", "o:", "O", "P", "s:", "S:", "t:", "v", "V:","T:", "group:", "help", "info", "version", "verbose:", "output:", "skip:", "marker"}) do
+    if (type(arg) == "string") then -- a value starting with '-' is most likely the next option
+        local s,e = arg:find("-");
+        if s == 1 then
+            print_stdout(string.format("Argument %s to option -%s starts with invalid character -.", arg, opt))
+            print_stdout("Did you forget an argument to an option?")
+            os.exit(1)
+        end
+    end
+    if opt == "h" or opt == "help" then
+        usage()
+        os.exit(0)
+    elseif opt == "v" or opt == "version" then
+        version()
+        os.exit(0)
+    elseif opt == "V" or opt == "verbose" then
+        verbose = tonumber(arg)
+        likwid.setVerbosity(verbose)
+    elseif (opt == "c") then -- measure only
+        num_cpus, cpulist = likwid.cpustr_to_cpulist(arg)
+        gotC = true
+    elseif (opt == "C") then -- measure and pin
+        num_cpus, cpulist = likwid.cpustr_to_cpulist(arg)
+        pin_cpus = true
+        gotC = true
+    elseif (opt == "a") then
+        print_groups = true
+    elseif (opt == "e") then
+        print_events = true
+    elseif (opt == "E") then
+        print_event = arg
+    elseif opt == "g" or opt == "group" then -- may be given several times
+        table.insert(event_string_list, arg)
+    elseif (opt == "H") then
+        print_group_help = true
+    elseif opt == "s" or opt == "skip" then
+        if arg:match("0x[0-9A-F]") then
+            skip_mask = arg
+        else
+            if arg:match("[0-9A-F]") then
+                print("Given skip mask looks like hex, sanitizing arg to 0x"..arg)
+                skip_mask = "0x"..arg
+            else
+                print("Skip mask must be given in hex") -- the previous skip_mask stays in effect
+            end
+        end
+    elseif (opt == "M") then
+        access_mode = tonumber(arg)
+        -- Reject everything but 0 and 1 (including non-numeric input) before the mode is used.
+        if access_mode ~= 0 and access_mode ~= 1 then
+            print_stdout("Access mode must be 0 for direct access and 1 for access daemon")
+            os.exit(1)
+        end
+        set_access_modes = true
+        access_flags = "e"
+        if access_mode == 0 then
+            access_flags = "rw"
+        end
+    elseif opt == "i" or opt == "info" then
+        print_info = true
+        verbose = true
+    elseif opt == "m" or opt == "marker" then
+        use_marker = true
+        use_wrapper = true
+    elseif (opt == "S") then
+        use_stethoscope = true
+        duration = likwid.parse_time(arg)
+    elseif (opt == "t") then
+        use_timeline = true
+        duration = likwid.parse_time(arg)
+    elseif (opt == "T") then
+        duration = likwid.parse_time(arg)
+    elseif opt == "o" or opt == "output" then
+        local suffix = string.match(arg, ".-[^\\/]-%.?([^%.\\/]*)$")
+        if suffix ~= "txt" then
+            use_csv = true
+        end
+        outfile = arg:gsub("%%h", likwid.gethostname()) -- expand %h, %p, %j and %r placeholders
+        outfile = outfile:gsub("%%p", likwid.getpid())
+        outfile = outfile:gsub("%%j", likwid.getjid())
+        outfile = outfile:gsub("%%r", likwid.getMPIrank())
+        io.output(outfile:gsub(string.match(arg, ".-[^\\/]-%.?([^%.\\/]*)$"),"tmp")) -- write to a temporary name first
+        print = function(...) for k,v in pairs({...}) do io.write(v .. "\n") end end -- print now goes to that file
+    elseif (opt == "O") then
+        use_csv = true
+    elseif opt == "?" then
+        print("Invalid commandline option -"..arg)
+        os.exit(1)
+    end
+end
+
+io.stdout:setvbuf("no") -- switch stdout to unbuffered mode
+cpuinfo = likwid.getCpuInfo()
+
+if not likwid.msr_available(access_flags) then -- access_flags is "rw" for access mode 0, "e" otherwise
+    if access_mode == 1 then
+        print_stdout("MSR device files not available")
+        print_stdout("Please load msr kernel module before retrying")
+        os.exit(1)
+    else
+        print_stdout("MSR device files not readable and writeable")
+        print_stdout("Be sure that you have enough permissions to access the MSR files directly")
+        os.exit(1)
+    end
+end
+
+if num_cpus == 0 and -- no CPU list given at all and no listing/info option that works without one
+   not gotC and
+   not print_events and
+   print_event == nil and
+   not print_groups and
+   not print_group_help and
+   not print_info then
+    print_stdout("Option -c <list> or -C <list> must be given on commandline")
+    usage()
+    os.exit(1)
+elseif num_cpus == 0 and -- a CPU list was given but it resolved to zero CPUs
+       gotC and
+       not print_events and
+       print_event == nil and
+       not print_groups and
+       not print_group_help and
+       not print_info then
+    print_stdout("CPUs given on commandline are not valid in current environment, maybe it's limited by a cpuset.")
+    os.exit(1)
+end
+
+
+if num_cpus > 0 then -- pairwise comparison: abort when the same CPU id is listed twice
+    for i,cpu1 in pairs(cpulist) do
+        for j, cpu2 in pairs(cpulist) do
+            if i ~= j and cpu1 == cpu2 then
+                print_stdout("List of CPUs is not unique, got two times CPU " .. tostring(cpu1))
+                os.exit(1)
+            end
+        end
+    end
+end
+
+
+
+if print_events == true then -- option -e: list all counters and events, then exit
+    local tab = likwid.getEventsAndCounters()
+    print_stdout(string.format("This architecture has %d counters.", #tab["Counters"]))
+    local outstr = "Counters names: " -- this initial value is overwritten before it is ever printed
+    print_stdout("Counter tags(name, type<, options>):")
+    for _, counter in pairs(tab["Counters"]) do
+        outstr = string.format("%s, %s", counter["Name"], counter["TypeName"]);
+        if counter["Options"]:len() > 0 then
+            outstr = outstr .. string.format(", %s",counter["Options"])
+        end
+        print_stdout(outstr)
+    end
+    print_stdout(string.format("This architecture has %d events.",#tab["Events"]))
+    print_stdout("Event tags (tag, id, umask, counters<, options>):")
+    for _, eventTab in pairs(tab["Events"]) do
+        outstr = eventTab["Name"] .. ", "
+        outstr = outstr .. string.format("0x%X, 0x%X, ",eventTab["ID"],eventTab["UMask"])
+        outstr = outstr .. eventTab["Limit"]
+        if #eventTab["Options"] > 0 then
+            outstr = outstr .. string.format(", %s",eventTab["Options"])
+        end
+        print_stdout(outstr)
+    end
+    os.exit(0) -- NOTE(review): the sibling listing branches call putTopology/putConfiguration before exiting
+end
+
+if print_event ~= nil then -- option -E: list matching events and the counters usable for them, then exit
+    local tab = likwid.getEventsAndCounters()
+    local events = {}
+    local counters = {}
+    local outstr = ""
+    for _, eventTab in pairs(tab["Events"]) do
+        if eventTab["Name"]:match(print_event) then -- the search key is used as a Lua pattern
+            table.insert(events, eventTab)
+        end
+    end
+    for _, counter in pairs(tab["Counters"]) do
+        for _, event in pairs(events) do
+            if counter["Name"]:match(event["Limit"]) then -- the event's Limit is matched against counter names
+                counters[counter["Name"]] = counter -- keyed by name, so each counter is kept once
+            end
+        end
+    end
+    print_stdout(string.format("Found %d event(s) with search key %s:", #events, print_event))
+    for _, eventTab in pairs(events) do
+        outstr = eventTab["Name"] .. ", "
+        outstr = outstr .. string.format("0x%X, 0x%X, ",eventTab["ID"],eventTab["UMask"])
+        outstr = outstr .. eventTab["Limit"]
+        if #eventTab["Options"] > 0 then
+            outstr = outstr .. string.format(", %s",eventTab["Options"])
+        end
+        print_stdout(outstr)
+    end
+    print_stdout("\nUsable counter(s) for above event(s):")
+    for i, counter in pairs(counters) do
+        outstr = string.format("%s, %s", counter["Name"], counter["TypeName"]);
+        if counter["Options"]:len() > 0 then
+            outstr = outstr .. string.format(", %s",counter["Options"])
+        end
+        print_stdout(outstr)
+    end
+    likwid.putTopology()
+    likwid.putConfiguration()
+    os.exit(0)
+end
+
+num_avail_groups, avail_groups = likwid.get_groups()
+
+if print_groups == true then -- option -a: one line per group with its short description, then exit
+    for i,g in pairs(avail_groups) do
+        local gdata = likwid.get_groupdata(g)
+        if gdata ~= nil then -- groups without readable data are silently left out
+            print_stdout(string.format("%10s\t%s",g,gdata["ShortDescription"]))
+        end
+    end
+    likwid.putTopology()
+    likwid.putConfiguration()
+    os.exit(0)
+end
+
+if print_group_help == true then -- option -H: long description of every group named with -g, then exit
+    if #event_string_list == 0 then
+        print_stdout("Group(s) must be given on commandline to get group help")
+        os.exit(1)
+    end
+    for i,event_string in pairs(event_string_list) do
+        local s,e = event_string:find(":")
+        if s ~= nil then -- a ':' marks a custom event string, which has no group help
+            print_stdout("Given string is no group")
+            os.exit(1)
+        end
+        for i,g in pairs(avail_groups) do -- names not found in avail_groups produce no output
+            if event_string == g then
+                local gdata = likwid.get_groupdata(event_string)
+                print_stdout(string.format("Group %s:",event_string))
+                print_stdout(gdata["LongDescription"])
+            end
+        end
+    end
+    likwid.putTopology()
+    likwid.putConfiguration()
+    os.exit(0)
+end
+
+if #event_string_list == 0 and not print_info then -- a measurement needs at least one -g
+    print_stdout("Option(s) -g <string> must be given on commandline")
+    usage()
+    likwid.putTopology()
+    likwid.putConfiguration()
+    os.exit(1)
+end
+
+if outfile == nil then -- the CPU header is only printed when no -o file was requested
+    print_stdout(likwid.hline)
+    print_stdout(string.format("CPU name:\t%s",cpuinfo["osname"]))
+    print_stdout(string.format("CPU type:\t%s",cpuinfo["name"]))
+    if (cpuinfo["clock"] > 0) then
+        print_stdout(string.format("CPU clock:\t%3.2f GHz",cpuinfo["clock"] * 1.E-09))
+    else -- no clock in cpuinfo: ask likwid.getCpuClock() instead
+        print_stdout(string.format("CPU clock:\t%3.2f GHz",likwid.getCpuClock() * 1.E-09))
+    end
+end
+
+if print_info or verbose > 0 then -- print_info is tested first, so a boolean verbose (set by -i) is never compared
+    print_stdout(string.format("CPU family:\t%u", cpuinfo["family"]))
+    print_stdout(string.format("CPU model:\t%u", cpuinfo["model"]))
+    print_stdout(string.format("CPU stepping:\t%u", cpuinfo["stepping"]))
+    print_stdout(string.format("CPU features:\t%s", cpuinfo["features"]))
+    P6_FAMILY = 6
+    if cpuinfo["family"] == P6_FAMILY and cpuinfo["perf_version"] > 0 then
+        print_stdout(likwid.hline)
+        print_stdout(string.format("PERFMON version:\t%u",cpuinfo["perf_version"]))
+        print_stdout(string.format("PERFMON number of counters:\t%u",cpuinfo["perf_num_ctr"]))
+        print_stdout(string.format("PERFMON width of counters:\t%u",cpuinfo["perf_width_ctr"]))
+        print_stdout(string.format("PERFMON number of fixed counters:\t%u",cpuinfo["perf_num_fixed_ctr"]))
+    end
+    print_stdout(likwid.hline)
+    if not print_info then likwid.printSupportedCPUs() end
+    likwid.putTopology()
+    likwid.putConfiguration()
+    if print_info then os.exit(0) end -- -i ends here; with only -V the script continues
+end
+
+if use_stethoscope == false and use_timeline == false and use_marker == false then -- no mode chosen: plain wrapper mode
+    use_wrapper = true
+end
+
+if use_wrapper and likwid.tablelength(arg)-2 == 0 and print_info == false then -- same word count the exec string is later built from
+    print_stdout("No Executable can be found on commandline")
+    usage()
+    likwid.putTopology()
+    likwid.putConfiguration()
+    os.exit(1) -- a missing executable is an error, like the missing -g case
+end
+
+
+
+if pin_cpus then -- option -C: prepare the environment for thread pinning and pin this process
+    local omp_threads = os.getenv("OMP_NUM_THREADS")
+    if omp_threads == nil then
+        likwid.setenv("OMP_NUM_THREADS",tostring(num_cpus))
+    elseif num_cpus > tonumber(omp_threads) then -- only a message; the existing value is kept
+        print_stdout(string.format("Environment variable OMP_NUM_THREADS already set to %s but %d cpus required", omp_threads,num_cpus))
+    end
+    
+    if num_cpus > 1 then
+        local preload = os.getenv("LD_PRELOAD")
+        local pinString = tostring(cpulist[2]) -- list starts with the second CPU ...
+        for i=3,likwid.tablelength(cpulist) do
+            pinString = pinString .. "," .. cpulist[i]
+        end
+        pinString = pinString .. "," .. cpulist[1] -- ... and ends with the first one
+        skipString = skip_mask
+
+        likwid.setenv("KMP_AFFINITY","disabled")
+        likwid.setenv("LIKWID_PIN", pinString)
+        likwid.setenv("LIKWID_SKIP",skipString)
+        likwid.setenv("LIKWID_SILENT","true")
+        if preload == nil then
+            likwid.setenv("LD_PRELOAD",likwid.pinlibpath)
+        else -- keep libraries that were already preloaded, after the pin library
+            likwid.setenv("LD_PRELOAD",likwid.pinlibpath .. ":" .. preload)
+        end
+    end
+    likwid.pinProcess(cpulist[1], 1)
+end
+
+
+
+for i, event_string in pairs(event_string_list) do -- resolve every -g argument through likwid.get_groupdata()
+    local groupdata = likwid.get_groupdata(event_string)
+    if groupdata == nil then
+        print_stdout("Cannot read event string, it's neither a performance group nor a proper event string <event>:<counter>:<options>,...")
+        usage()
+        likwid.putTopology()
+        likwid.putConfiguration()
+        os.exit(1)
+    end
+    table.insert(group_list, groupdata)
+    event_string_list[i] = groupdata["EventString"] -- from here on the list holds the resolved event strings
+end
+
+
+if set_access_modes then -- only when -M was given
+    if likwid.setAccessClientMode(access_mode) ~= 0 then
+        likwid.putTopology()
+        likwid.putConfiguration()
+        os.exit(1)
+    end
+end
+if likwid.init(num_cpus, cpulist) < 0 then
+    likwid.putTopology()
+    likwid.putConfiguration()
+    os.exit(1)
+end
+
+for i, event_string in pairs(event_string_list) do
+    if event_string:len() > 0 then -- empty event strings are skipped
+        local gid = likwid.addEventSet(event_string)
+        if gid < 0 then
+            likwid.putTopology()
+            likwid.putConfiguration()
+            likwid.finalize()
+            os.exit(1)
+        end
+        table.insert(group_ids, gid)
+    end
+end
+if #group_ids == 0 then
+    print("ERROR: No valid eventset given on commandline. Exiting...")
+    likwid.putTopology()
+    likwid.putConfiguration()
+    likwid.finalize()
+    os.exit(1)
+end
+
+activeGroup = group_ids[1] -- measurement starts with the first event set that was added
+likwid.setupCounters(activeGroup)
+if outfile == nil then
+    print_stdout(likwid.hline)
+end
+
+if use_marker == true then -- pass the measurement setup on through environment variables
+    likwid.setenv("LIKWID_FILEPATH", markerFile)
+    likwid.setenv("LIKWID_MODE", tostring(access_mode))
+    likwid.setenv("LIKWID_DEBUG", tostring(verbose))
+    local str = table.concat(event_string_list, "|")
+    likwid.setenv("LIKWID_EVENTS", str)
+    likwid.setenv("LIKWID_THREADS", table.concat(cpulist,","))
+end
+
+execString = table.concat(arg," ",1, likwid.tablelength(arg)-2)
+if verbose > 0 then -- verbose is a number here: -i, which stores a boolean, has already exited
+    print_stdout(string.format("Executing: %s",execString))
+end
+
+
+if use_timeline == true then -- timeline output starts with the list of measured CPUs
+    local cores_string = "CORES: "
+    for i, cpu in pairs(cpulist) do
+        cores_string = cores_string .. tostring(cpu) .. " "
+    end
+    print_stdout(cores_string:sub(1,cores_string:len()-1)) -- drop the trailing blank
+end
+
+local ret = likwid.startCounters()
+if ret < 0 then -- the negated return value is used as index into cpulist
+    print_stdout(string.format("Error starting counters for cpu %d.",cpulist[ret * (-1)]))
+    os.exit(1)
+end
+
+
+io.stdout:flush()
+local int_results = {} -- intermediate timeline results, keyed by the accumulated time
+if use_wrapper or use_timeline then
+    local start = likwid.startClock()
+    local stop = 0
+    local alltime = 0
+    local groupsums = {}
+    local nr_events = likwid.getNumberOfEvents(activeGroup)
+    local nr_threads = likwid.getNumberOfThreads()
+    for i,g in pairs(group_ids) do -- zero-filled sums: groupsums[group][event][thread]
+        groupsums[g] = {}
+        for j=1, likwid.getNumberOfEvents(g) do
+            groupsums[g][j] = {}
+            for k=1,nr_threads do
+                groupsums[g][j][k] = 0
+            end
+        end
+    end
+
+    if use_wrapper and #group_ids == 1 then -- a single group needs no switching, so wake up less often
+        duration = 30.E06
+    end
+
+    local pid = likwid.startProgram(execString)
+    if not pid then
+        print_stdout("Failed to execute command: ".. execString)
+        likwid.stopCounters()
+        likwid.finalize()
+        likwid.putTopology()
+        likwid.putConfiguration()
+        os.exit(1)
+    end
+    while true do -- one iteration per interval, until the program ends or a signal arrives
+        if likwid.getSignalState() ~= 0 then
+            likwid.killProgram()
+            break
+        end
+        if duration >= 1.E06 then
+            local remain = sleep(duration/1.E06)
+            if remain > 0 or not likwid.checkProgram() then
+                io.stdout:flush()
+                break
+            end
+        else
+            local status = usleep(duration)
+            if status ~= 0 or not likwid.checkProgram() then
+                io.stdout:flush()
+                break
+            end
+        end
+        if use_timeline == true then
+            stop = likwid.stopClock()
+            likwid.readCounters()
+            local time = likwid.getClock(start, stop)
+            alltime = alltime + time
+            int_results[alltime] = likwid.getResults()
+            local str = tostring(activeGroup) .. ","..tostring(nr_events) .. "," .. tostring(nr_threads) .. ","..tostring(alltime)
+            for ie, e in pairs(int_results[alltime][activeGroup]) do
+                for it, t in pairs(e) do
+                    -- accumulate per event and thread; the running sum is what gets written
+                    groupsums[activeGroup][ie][it] = groupsums[activeGroup][ie][it] + t
+                    str = str .. "," .. tostring(groupsums[activeGroup][ie][it])
+                end
+            end
+            io.stderr:write(str.."\n") -- timeline lines go to stderr
+        end
+        if #group_ids > 1 then
+            likwid.switchGroup(activeGroup + 1)
+            activeGroup = likwid.getIdOfActiveGroup()
+            nr_events = likwid.getNumberOfEvents(activeGroup)
+        end
+        start = likwid.startClock()
+    end
+    stop = likwid.stopClock()
+    if use_timeline == true then -- final sample after the loop ended
+        likwid.readCounters()
+        local time = likwid.getClock(start, stop)
+        alltime = alltime + time
+        int_results[alltime] = likwid.getResults() -- same key as inside the loop, so no earlier entry is overwritten
+        local str = tostring(activeGroup) .. ","..tostring(nr_events) .. "," .. tostring(nr_threads) .. ","..tostring(alltime)
+        for ie, e in pairs(int_results[alltime][activeGroup]) do
+            for it, t in pairs(e) do
+                groupsums[activeGroup][ie][it] = groupsums[activeGroup][ie][it] + t
+                str = str .. "," .. tostring(groupsums[activeGroup][ie][it])
+            end
+        end
+        io.stderr:write(str.."\n")
+    end
+elseif use_stethoscope then -- just wait for the requested time while the counters run
+    if duration >= 1.E06 then
+        sleep(duration/1.E06)
+    else
+        usleep(duration)
+    end
+elseif use_marker then -- NOTE(review): -m also sets use_wrapper, so the first branch is taken instead
+    local ret = os.execute(execString)
+    if ret == nil then
+        print_stdout("Failed to execute command: ".. execString)
+        likwid.stopCounters()
+        likwid.finalize()
+        likwid.putTopology()
+        likwid.putConfiguration()
+        os.exit(1)
+    end
+end
+
+local ret = likwid.stopCounters()
+if ret < 0 then -- the negated return value is reported as the failing thread
+    print_stdout(string.format("Error stopping counters for thread %d.",ret * (-1)))
+    likwid.finalize()
+    likwid.putTopology()
+    likwid.putConfiguration()
+    os.exit(1)
+end
+io.stdout:flush()
+if outfile == nil then
+    print_stdout(likwid.hline)
+end
+
+
+if use_marker == true then -- Marker API run: results are read back from the marker file
+    groups, results = likwid.getMarkerResults(markerFile, group_list, num_cpus)
+    if #groups == 0 and #results == 0 then
+        likwid.finalize()
+        likwid.putTopology()
+        likwid.putConfiguration()
+        os.exit(1)
+    end
+    likwid.print_markerOutput(groups, results, group_list, cpulist)
+else -- all other modes: take the results from the library
+    results = likwid.getResults()
+    groups = {}
+    for g,gr in pairs(group_list) do -- one entry per group that carries only its index as "ID"
+        if groups[g] == nil then
+            groups[g] = {}
+        end
+        groups[g]["ID"] = g
+    end
+    likwid.printOutput(groups, results, group_list, cpulist)
+end
+
+if outfile then -- move or filter the temporary output file into the file requested with -o
+    local suffix = string.match(outfile, ".-[^\\/]-%.?([^%.\\/]*)$")
+    local command = "<PREFIX>/share/likwid/filter/" .. suffix
+    local tmpfile = outfile:gsub(suffix,"tmp") -- same substitution the -o handler used to name the file it opened
+    if likwid.access(command, "x") then -- NOTE(review): looks inverted; confirm what likwid.access returns (0 is truthy in Lua)
+        print_stdout("Cannot find filter script, save output in CSV format to file "..outfile)
+        os.rename(tmpfile, outfile)
+    else
+        if suffix ~= "txt" and suffix ~= "csv" then
+            command = command .." ".. tmpfile .. " perfctr"
+            local f = io.popen(command) -- no assert, so the fallback below can handle a failed popen
+            if f ~= nil then
+                local o = f:read("*a")
+                f:close()
+                if o:len() > 0 then -- any output of the filter script is treated as a failure
+                    print_stdout(string.format("Failed to execute filter script %s.",command))
+                end
+            else
+                print_stdout("Failed to call filter script, save output in CSV format to file "..outfile)
+                os.rename(tmpfile, outfile)
+            end
+        else
+            os.rename(tmpfile, outfile)
+            -- no os.remove(tmpfile): after a failed rename it would delete the only copy of the results
+        end
+    end
+end
+
+likwid.finalize()
+likwid.putTopology()
+likwid.putNumaInfo()
+likwid.putConfiguration()
+os.exit(0)
diff --git a/src/applications/likwid-perfscope.lua b/src/applications/likwid-perfscope.lua
new file mode 100644
index 0000000..ecc0ac8
--- /dev/null
+++ b/src/applications/likwid-perfscope.lua
@@ -0,0 +1,599 @@
+#!<PREFIX>/bin/likwid-lua
+--[[
+ * =======================================================================================
+ *
+ *      Filename:  likwid-perfscope.lua
+ *
+ *      Description:  An application to use the timeline mode of likwid-perfctr to generate
+ *                    realtime plots using feedGnuplot
+ *
+ *      Version:   4.0
+ *      Released:  16.6.2015
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+]]
+
+package.path = '<PREFIX>/share/lua/?.lua;' .. package.path
+
+local likwid = require("likwid")
+
+PERFCTR="/home/rrze/unrz/unrz139/TMP/likwid-base/trunk/likwid-perfctr" -- NOTE(review): hard-coded developer path; the shebang suggests <PREFIX>/bin - confirm install location
+FEEDGNUPLOT="/home/rrze/unrz/unrz139/TMP/likwid-base/trunk/perl/feedGnuplot" -- NOTE(review): hard-coded developer path - confirm install location
+
+local predefined_plots = { -- keys are the plot names that can be given with -g
+    FLOPS_DP = {
+        perfgroup = "FLOPS_DP", -- compared against the group names returned by likwid.get_groups()
+        ymetricmatch = "MFlops/s",
+        title = "Double Precision Flop Rate",
+        ytitle = "MFlops/s",
+        y2title = nil,
+        xtitle = "Time"
+    },
+    FLOPS_SP = {
+        perfgroup = "FLOPS_SP",
+        ymetricmatch = "MFlops/s",
+        title = "Single Precision Flop Rate",
+        ytitle = "MFlops/s",
+        y2title = nil,
+        xtitle = "Time"
+    },
+    L2_BAND = {
+        perfgroup = "L2",
+        ymetricmatch = "L2 bandwidth [MBytes/s]",
+        title = "L2 cache bandwidth",
+        ytitle = "Bandwidth [MBytes/s]",
+        y2title = nil,
+        xtitle = "Time"
+    },
+    L3_BAND = {
+        perfgroup = "L3",
+        ymetricmatch = "L3 bandwidth [MBytes/s]",
+        title = "L3 cache bandwidth",
+        ytitle = "Bandwidth [MBytes/s]",
+        y2title = nil,
+        xtitle = "Time"
+    },
+    MEM_BAND = {
+        perfgroup = "MEM",
+        ymetricmatch = "Memory bandwidth [MBytes/s]",
+        title = "Memory bandwidth",
+        ytitle = "Bandwidth [MBytes/s]",
+        y2title = nil,
+        xtitle = "Time"
+    },
+    QPI_BAND = { -- second metric without y2title: the -a listing shows ytitle for the y2-axis
+        perfgroup = "QPI",
+        ymetricmatch = "QPI data bandwidth [MByte/s]",
+        title = "QPI bandwidth",
+        ytitle = "Bandwidth [MBytes/s]",
+        y2title = nil,
+        xtitle = "Time",
+        y2metricmatch = "QPI link bandwidth [MByte/s]"
+    },
+    POWER = {
+        perfgroup = "ENERGY",
+        ymetricmatch = "Power [W]",
+        title = "Consumed power",
+        ytitle = "Power [W]",
+        y2title = "Power DRAM [W]",
+        y2metricmatch = "Power DRAM [W]",
+        xtitle = "Time"
+    },
+    TEMP = {
+        perfgroup = "ENERGY",
+        ymetricmatch = "Temperature [C]",
+        title = "Temperature",
+        ytitle = "Temperature [C]",
+        y2title = nil,
+        xtitle = "Time"
+    },
+    NUMA = { -- second metric without y2title, handled like QPI_BAND in the -a listing
+        perfgroup = "NUMA",
+        ymetricmatch = "Local DRAM bandwidth [MByte/s]",
+        title = "NUMA separated memory bandwidth",
+        ytitle = "Bandwidth [MBytes/s]",
+        y2metricmatch = "Remote DRAM bandwidth [MByte/s]",
+        y2title = nil,
+        xtitle = "Time"
+    },
+}
+
+local function version() -- print the tool name followed by likwid.version and likwid.release
+    print(("likwid-perfscope --  Version %d.%d"):format(likwid.version, likwid.release))
+end
+
+local function examples() -- print one sample invocation
+    print("Examples:")
+    print("Run command on CPU 2 and measure performance group TEST:")
+    print("likwid-perfscope -C 2 -g TEST -t 1s ./a.out") -- the time option is -t; the option table has no -f
+end
+
+local function usage() -- print the version banner, the option overview and the example
+    version()
+    print "A tool to generate pictures on-the-fly from likwid-perfctr measurements\n"
+    print "Options:"
+    print "-h, --help\t\t Help message"
+    print "-v, --version\t\t Version information"
+    print "-V, --verbose <level>\t Verbose output, 0 (only errors), 1 (info), 2 (details), 3 (developer)"
+    print "-a\t\t\t Print all preconfigured plot configurations for the current system."
+    print "-c <list>\t\t Processor ids to measure, e.g. 1,2-4,8"
+    print "-C <list>\t\t Processor ids to pin threads and measure, e.g. 1,2-4,8"
+    print "-g, --group <string>\t Preconfigured plot group or custom event set string with plot config. See man page for information."
+    print "-t, --time <time>\t Frequency in s, ms or us, e.g. 300ms, for the timeline mode of likwid-perfctr"
+    print "-d, --dump\t\t Print output as it is send to feedGnuplot."
+    print "-p, --plotdump\t\t Use dump functionality of feedGnuplot. Plots out plot configurations plus data to directly submit to gnuplot"
+    print "--host <host>\t\t Run likwid-perfctr on the selected host using SSH. Evaluation and plotting is done locally."
+    print "\t\t\t This can be used for machines that have no gnuplot installed. All paths must be similar to the local machine."
+    print "\n"
+    examples()
+end
+
+local function test_gnuplot() -- returns true only if `which` prints a path for gnuplot
+    local f = io.popen("which gnuplot 2>/dev/null")
+    if f == nil then
+        return false
+    end
+    local path = f:read("*l") -- popen itself succeeds even when gnuplot is missing, so check the output
+    f:close()
+    return path ~= nil and path:len() > 0
+end
+
+local eventStrings = {} -- one entry per -g option
+local terminal = "x11"
+local num_cpus = 0
+local cpulist = {}
+local matchstring = nil
+local group_list = {}
+local timeline = "1s" -- raw time string, replaced by -t
+local print_configs = false
+local pinning = false -- true when the CPU list came from -C
+local dump = false
+local plotdump = false
+local nrgroups, allgroups = likwid.get_groups()
+local mfreq = 1.0 -- recomputed from the -t value as likwid.parse_time(...) * 1.E-6
+local plotrange = 0
+local host = nil
+
+if #arg == 0 then -- no arguments at all: show help and leave
+    usage()
+    os.exit(0)
+end
+
+for opt,arg in likwid.getopt(arg, {"h","v","g:","C:","c:","t:","r:","a","d","p","help", "version","group:","time:","dump","range:","plotdump","all", "host:"}) do -- NOTE(review): usage() advertises -V/--verbose, which is not in this list
+    if opt == "h" or opt == "help" then
+        usage()
+        os.exit(0)
+    elseif opt == "v" or opt == "version" then
+        version()
+        os.exit(0)
+    elseif opt == "g" or opt == "group" then -- may be given several times
+        table.insert(eventStrings, arg)
+    elseif (opt == "c") then
+        num_cpus, cpulist = likwid.cpustr_to_cpulist(arg)
+    elseif (opt == "C") then
+        num_cpus, cpulist = likwid.cpustr_to_cpulist(arg)
+        pinning = true
+    elseif opt == "t" or opt == "time" then
+        timeline = arg
+        mfreq = likwid.parse_time(timeline) * 1.E-6
+    elseif opt == "d" or opt == "dump" then
+        dump = true
+    elseif opt == "p" or opt == "plotdump" then
+        plotdump = true
+    elseif opt == "r" or opt == "range" then -- accepted here but not listed in usage()
+        plotrange = tonumber(arg)
+    elseif opt == "a" or opt == "all" then
+        print_configs = true
+    elseif opt == "host" then
+        host = arg
+    elseif opt == "?" then
+        print("Invalid commandline option -"..arg)
+        os.exit(1)
+    end
+end
+
+-- -a/--all: list every predefined plot configuration whose performance group
+-- is contained in the groups returned by likwid.get_groups(), then exit.
+if print_configs then
+    local _, available = likwid.get_groups()
+
+    -- true if `groupname` is one of the available performance groups
+    local function is_available(groupname)
+        for _, candidate in pairs(available) do
+            if candidate == groupname then
+                return true
+            end
+        end
+        return false
+    end
+
+    for plotname, plot in pairs(predefined_plots) do
+        if is_available(plot.perfgroup) then
+            print("Group "..plotname)
+            print("\tPerfctr group: "..plot.perfgroup)
+            print("\tMatch for metric: "..plot.ymetricmatch)
+            print("\tTitle of plot: "..plot.title)
+            print("\tTitle of x-axis: "..plot.xtitle)
+            print("\tTitle of y-axis: "..plot.ytitle)
+            if plot.y2metricmatch then
+                print("\tMatch for second metric: "..plot.y2metricmatch)
+            end
+            -- a second axis without its own title reuses the y-axis title
+            if plot.y2title then
+                print("\tTitle of y2-axis: "..plot.y2title)
+            elseif plot.y2metricmatch then
+                print("\tTitle of y2-axis: "..plot.ytitle)
+            end
+            print("")
+        end
+    end
+    os.exit(0)
+end
+
+-- Preconditions for a measurement run. The checks are evaluated in order and
+-- the first failing one prints its message and terminates with exit code 1.
+do
+    local function bail_out(message)
+        print(message)
+        os.exit(1)
+    end
+
+    if not test_gnuplot() then
+        bail_out("GnuPlot not available")
+    end
+    if num_cpus == 0 then
+        bail_out("ERROR: CPU string must be given")
+    end
+    if #arg == 0 then
+        bail_out("ERROR: Executable must be given on commandline")
+    end
+end
+
+-- Translate every -g argument into an entry of group_list.
+-- A -g argument is a comma separated list. Its first element may name a
+-- predefined plot configuration; otherwise the list is used as event string.
+-- If the list has more than one element, the last one holds ':' separated
+-- plot options (title=, ytitle=, y2title=, xtitle= or <name>=<formula>).
+-- Each group_list entry gets the fields gdata, eventstring, counterlist,
+-- formulas and, if set, title, ytitle, y2title, y2funcindex and xtitle.
+for i, event_def in pairs(eventStrings) do
+    local eventlist = likwid.stringsplit(event_def,",")
+    -- All working variables are local and start fresh for every group, so the
+    -- plot options and formulas of one -g argument cannot leak into the next.
+    local event_string = nil
+    local plotgroup = nil
+    local plotgroupconfig = nil
+    local outopts = nil
+    local formulalist = nil
+    -- A predefined plot is only used if its performance group is available
+    for plotname, preconf in pairs(predefined_plots) do
+        if eventlist[1] == plotname then
+            for _,g in pairs(allgroups) do
+                if g == preconf["perfgroup"] then
+                    event_string = preconf["perfgroup"]
+                    plotgroupconfig = preconf
+                    plotgroup = plotname
+                    break
+                end
+            end
+            break
+        end
+    end
+    if #eventlist > 1 then
+        outopts = eventlist[#eventlist]
+        table.remove(eventlist, #eventlist)
+    end
+    -- No usable predefined plot: take the remaining list as event string
+    if event_string == nil then
+        event_string = table.concat(eventlist,",")
+    end
+
+    local groupdata = likwid.get_groupdata(event_string)
+    if groupdata == nil then
+        print("Cannot read event string, it's neither a performance group nor a proper event string <event>:<counter>:<options>,...")
+        usage()
+        os.exit(1)
+    end
+    if group_list[i] == nil then
+        group_list[i] = {}
+    end
+    group_list[i]["gdata"] = groupdata
+
+    local title = nil
+    local ytitle = nil
+    local y2title = nil
+    local y2funcindex = nil
+    local xtitle = nil
+    if plotgroup ~= nil then
+        -- Defaults taken from the predefined plot configuration
+        title = plotgroupconfig["title"]
+        ytitle = plotgroupconfig["ytitle"]
+        xtitle = plotgroupconfig["xtitle"]
+        if plotgroupconfig["y2title"] ~= nil then
+            y2title = plotgroupconfig["y2title"]
+        elseif plotgroupconfig["y2metricmatch"] ~= nil then
+            y2title = plotgroupconfig["ytitle"]
+        end
+        -- The match strings are used as Lua patterns, therefore square
+        -- brackets are escaped. Both patterns do not depend on the metric.
+        local ymatch = "%a*"..plotgroupconfig["ymetricmatch"]:gsub("%[","%%["):gsub("%]","%%]").."%a*"
+        local y2match = nil
+        if plotgroupconfig["y2metricmatch"] ~= nil then
+            y2match = "%a*"..plotgroupconfig["y2metricmatch"]:gsub("%[","%%["):gsub("%]","%%]").."%a*"
+        end
+        for _,mconfig in pairs(groupdata["Metrics"]) do
+            if mconfig["description"]:match(ymatch) then
+                formulalist = {{name=mconfig["description"], formula=mconfig["formula"]}}
+            end
+            if y2match ~= nil and mconfig["description"]:match(y2match) then
+                -- the y2 metric may match although no y metric matched so far
+                if formulalist == nil then formulalist = {} end
+                table.insert(formulalist, {name=mconfig["description"], formula=mconfig["formula"]})
+            end
+        end
+    end
+    -- Plot options given on the command line override the defaults
+    if outopts ~= nil then
+        for _,estr in pairs(likwid.stringsplit(outopts, ":")) do
+            if estr:match("^title=([%g%s]+)") then
+                title = estr:match("^title=([%g%s]+)")
+            elseif estr:match("^TITLE=([%g%s]+)") then
+                title = estr:match("^TITLE=([%g%s]+)")
+            elseif estr:match("ytitle=([%g%s]+)") then
+                ytitle = estr:match("ytitle=([%g%s]+)")
+            elseif estr:match("YTITLE=([%g%s]+)")then
+                ytitle = estr:match("YTITLE=([%g%s]+)")
+            elseif estr:match("y2title=(%d+)-([%g%s]+)") then
+                -- form y2title=<index>-<title>: index of the formula for the y2 axis
+                y2funcindex, y2title = estr:match("y2title=(%d+)-([%g%s]+)")
+            elseif estr:match("Y2TITLE=(%d+)-([%g%s]+)") then
+                y2funcindex, y2title = estr:match("Y2TITLE=(%d+)-([%g%s]+)")
+            elseif estr:match("y2title=([%g%s]+)") then
+                y2title = estr:match("y2title=([%g%s]+)")
+            elseif estr:match("Y2TITLE=([%g%s]+)") then
+                y2title = estr:match("Y2TITLE=([%g%s]+)")
+            elseif estr:match("xtitle=([%g%s]+)") then
+                xtitle = estr:match("xtitle=([%g%s]+)")
+            elseif estr:match("XTITLE=([%g%s]+)")then
+                xtitle = estr:match("XTITLE=([%g%s]+)")
+            elseif estr:match("[%g%s]+=[%g]+") then
+                -- everything else of the form <name>=<formula> adds a curve
+                local fname, form = estr:match("([%g%s]+)=([%g]+)")
+                if formulalist == nil then formulalist = {} end
+                table.insert(formulalist, {name=fname, formula=form})
+            end
+        end
+    end
+    group_list[i]["eventstring"] = event_string
+    group_list[i]["counterlist"] = {}
+    for k=1,#groupdata["Events"] do
+        table.insert(group_list[i]["counterlist"], groupdata["Events"][k]["Counter"])
+    end
+    if title then
+        group_list[i]["title"] = title
+    end
+    if ytitle then
+        group_list[i]["ytitle"] = ytitle
+    end
+    if y2title then
+        group_list[i]["y2title"] = y2title
+    end
+    -- The stored index is zero-based. Without an explicit index the last
+    -- formula is used.
+    if y2funcindex then
+        group_list[i]["y2funcindex"] = y2funcindex - 1
+    else
+        if formulalist ~= nil then
+            group_list[i]["y2funcindex"] = #formulalist - 1
+        end
+    end
+    if xtitle then
+        group_list[i]["xtitle"] = xtitle
+    end
+    if formulalist ~= nil then
+        group_list[i]["formulas"] = formulalist
+    else
+        group_list[i]["formulas"] = {}
+    end
+end
+
+-- Assemble the likwid-perfctr command line piece by piece and start it with
+-- io.popen(). With --host the whole command is wrapped into an ssh call that
+-- runs it through /bin/bash -c on the given host.
+do
+    local remote = (host ~= nil)
+    local pieces = {}
+    if remote then
+        pieces[#pieces + 1] = "ssh "..host.. " \"/bin/bash -c \\\" "
+    end
+    pieces[#pieces + 1] = " " ..PERFCTR
+    -- the CPU list is forwarded with the same option it was given with
+    local cpuopt = " -c %s"
+    if pinning then
+        cpuopt = " -C %s"
+    end
+    pieces[#pieces + 1] = string.format(cpuopt, table.concat(cpulist,","))
+    pieces[#pieces + 1] = string.format(" -t %s", timeline)
+    for _, entry in pairs(group_list) do
+        pieces[#pieces + 1] = " -g "..entry["eventstring"]
+    end
+    -- the remaining command line is the application to run
+    pieces[#pieces + 1] = " ".. table.concat(arg, " ")
+    -- since io.popen can only read stdout we swap stdout and stderr
+    -- application output is written to stderr, we catch stdout
+    pieces[#pieces + 1] = " 3>&1 1>&2 2>&3 3>&-"
+    if remote then
+        pieces[#pieces + 1] = " \\\" \" "
+    end
+    cmd = table.concat(pieces)
+    perfctr = assert (io.popen (cmd))
+end
+
+
+-- Start one feedGnuplot process per group. Its command line is collected in
+-- a list of fragments and joined at the end; the resulting write pipe is
+-- stored in the "output" field of the group.
+for idx, entry in pairs(group_list) do
+    local fragments = {}
+    local function add(fmt, ...)
+        fragments[#fragments + 1] = string.format(fmt, ...)
+    end
+
+    add("%s --stream %f --with linespoints --domain --nodataid", FEEDGNUPLOT, mfreq/#group_list)
+    if plotrange > 0 then
+        add(" --xlen %d", plotrange)
+    else
+        add(" --xmin 0")
+    end
+
+    -- with several groups the title is prefixed with the group number
+    local plottitle = entry["title"]
+    if plottitle ~= nil then
+        if #group_list > 1 then
+            plottitle = "Group "..idx..": "..plottitle
+        end
+        add(" --title %q", plottitle)
+    end
+
+    local xlabel = "Time"
+    if entry["xtitle"] ~= nil then
+        xlabel = entry["xtitle"]
+    end
+    add(" --xlabel %q", xlabel)
+
+    if entry["ytitle"] ~= nil then
+        add(" --ylabel %q", entry["ytitle"])
+    end
+    if entry["y2title"] ~= nil then
+        add(" --y2 %d --y2label %q", entry["y2funcindex"], entry["y2title"])
+    end
+
+    -- one legend entry per curve; with more than one CPU the curves are
+    -- numbered consecutively, CPU by CPU, and labelled with the CPU ID
+    local formulas = entry["formulas"]
+    if formulas then
+        if #cpulist == 1 then
+            for pos, fdesc in pairs(formulas) do
+                add(" --legend %d %q", pos - 1, fdesc["name"])
+            end
+        else
+            local curve = 0
+            for _, cpu in pairs(cpulist) do
+                for _, fdesc in pairs(formulas) do
+                    add(" --legend %d %q", curve, "C"..cpu..": "..fdesc["name"])
+                    curve = curve + 1
+                end
+            end
+        end
+    end
+
+    if plotdump then
+        fragments[#fragments + 1] = " --dump"
+    else
+        fragments[#fragments + 1] = " 1>/dev/null 2>&1"
+    end
+
+    gnucmd = table.concat(fragments)
+    group_list[idx]["output"] = assert(io.popen(gnucmd,"w"))
+end
+
+
+-- likwid.catchSignal() is called before any data is processed; its state is
+-- polled with likwid.getSignalState() at the end of the script.
+likwid.catchSignal()
+-- Time between two measurements, per group; filled in by the read loop
+local mtime = {}
+-- Send an all-zero first sample to every plot: one value for the x axis
+-- followed by one value for every combination of CPU and formula.
+for idx, entry in pairs(group_list) do
+    local zeros = "0 " .. string.rep("0 ", #cpulist * #entry["formulas"])
+    mtime[idx] = nil
+    local plot = entry["output"]
+    plot:write(zeros.."\n")
+    plot:flush()
+    if dump then
+        print(tostring(idx).." ".. zeros)
+    end
+end
+
+
+-- Main loop: read the output of likwid-perfctr line by line, turn the raw
+-- counter values into per-interval differences, evaluate the formulas of the
+-- group and forward one data line per measurement to the group's plot.
+-- olddata[group][thread][counter]: raw counter values of the previous sample
+-- oldmetric[group][thread][formula]: formula results of the previous sample
+olddata = {}
+oldmetric = {}
+local perfctr_exited = false
+local oldtime = 0
+local clock = likwid.getCpuClock()
+while true do
+    local l = perfctr:read("*line")
+    -- stop at end of input or at the first blank line; a line starting with
+    -- ERROR additionally marks the run as failed
+    if l == nil or l:match("^%s*$") then
+        break
+    elseif l:match("^ERROR") then
+        perfctr_exited = true
+        break
+    end
+    -- data lines have the form <group>,<#events>,<#threads>,<time>,<values...>
+    -- all other lines are ignored
+    if l:match("^%d+,%d+,%d+,[%d.]+,%d+") then
+        local data = {}
+        local diff = {}
+        linelist = likwid.stringsplit(l, ",")
+        group = tonumber(linelist[1])
+        nr_events = tonumber(linelist[2])
+        nr_threads = tonumber(linelist[3])
+        time = tonumber(linelist[4])
+        -- drop the four header fields, only counter values remain
+        table.remove(linelist, 1)
+        table.remove(linelist, 1)
+        table.remove(linelist, 1)
+        table.remove(linelist, 1)
+
+        -- the values are consumed event by event, for each event one value
+        -- per thread: data[thread][counter name] = value
+        for i=1,nr_events do
+            for j=1,nr_threads do
+                if data[j] == nil then data[j] = {} end
+                data[j][group_list[group]["counterlist"][i]] = tonumber(linelist[1])
+                table.remove(linelist, 1)
+            end
+        end
+
+        -- use the measured distance to the previous line if it deviates at
+        -- most 10% from the requested interval, otherwise the interval itself
+        if time - oldtime <= mfreq + (0.1*mfreq) and time - oldtime >= mfreq - (0.1*mfreq) then
+            mtime[group] = time - oldtime
+        else
+            mtime[group] = mfreq
+        end
+
+        if olddata[group] == nil then
+            -- first sample of this group: the raw values are used as they are
+            olddata[group] = {}
+            for l1,v1 in pairs(data) do
+                diff[l1] = {}
+                olddata[group][l1] = {}
+                if type(v1) == "table" then
+                    for l2,v2 in pairs(data[l1]) do
+                        diff[l1][l2] = data[l1][l2]
+                        olddata[group][l1][l2] = data[l1][l2]
+                    end
+                else
+                    -- NOTE(review): data[j] is always created as a table above, so
+                    -- this branch looks unreachable; it indexes data[group] instead
+                    -- of data and the assignments below would index a number
+                    diff[l1] = data[l1]
+                    olddata[group][l1] = data[group][l1]
+                end
+                -- extra variables made available to the formulas
+                diff[l1]["time"] = mtime[group]
+                diff[l1]["inverseClock"] = 1.0/clock
+            end
+        else
+            -- later samples: difference to the previous sample of this group
+            for l1,v1 in pairs(data) do
+                diff[l1] = {}
+                if type(v1) == "table" then
+                    for l2,v2 in pairs(v1) do
+                        diff[l1][l2] = data[l1][l2] - olddata[group][l1][l2]
+                        olddata[group][l1][l2] = data[l1][l2]
+                    end
+                else
+                    -- NOTE(review): same apparently unreachable branch as above
+                    diff[l1] = data[l1] - olddata[group][l1]
+                    olddata[group][l1] = data[group][l1]
+                end
+                diff[l1]["time"] = mtime[group]
+                diff[l1]["inverseClock"] = 1.0/clock
+            end
+        end
+
+
+        -- build the data line: time stamp followed by one value per thread
+        -- and formula
+        str = tostring(time)
+        if oldmetric[group] == nil then
+            -- first sample: emit the formula results and remember them
+            oldmetric[group] = {}
+            for i, thread in pairs(diff) do
+                oldmetric[group][i] = {}
+                for j,fdesc in pairs(group_list[group]["formulas"]) do
+                    oldmetric[group][i][j] = likwid.calculate_metric(fdesc["formula"], thread)
+                    str = str .. " " .. tostring(oldmetric[group][i][j])
+                end
+            end
+        else
+            for i, thread in pairs(diff) do
+                for j,fdesc in pairs(group_list[group]["formulas"]) do
+                    local tmp = likwid.calculate_metric(fdesc["formula"], thread)
+                    -- with a single group, formulas containing "/time" are
+                    -- emitted as difference to the previous result, clamped
+                    -- at zero; everything else is emitted unchanged
+                    if #group_list > 1 or fdesc["formula"]:match("(%/time)") == nil then
+                        str = str .. " " ..tostring(tmp)
+                    else
+                        if tmp - oldmetric[group][i][j] >= 0 then
+                            str = str .. " " .. tostring(tmp - oldmetric[group][i][j])
+                        else
+                            str = str .. " 0"
+                        end
+                    end
+                    oldmetric[group][i][j] = tmp
+                end
+            end
+        end
+        
+        -- hand the line to the plot of this group and optionally echo it
+        group_list[group]["output"]:write(str.."\n")
+        group_list[group]["output"]:flush()
+        if dump then
+            print(tostring(group).." ".. str)
+        end
+        -- NOTE(review): oldtime is shared by all groups, so with several
+        -- groups the distance is measured to the previous line of any group
+        oldtime = time
+    end
+end
+
+-- Unless likwid-perfctr reported an error, wait until
+-- likwid.getSignalState() becomes non-zero before shutting down.
+if not perfctr_exited then
+    while likwid.getSignalState() == 0 do
+        sleep(1)
+    end
+end
+-- Tell every plotting process to exit and close all pipes
+for _, entry in pairs(group_list) do
+    local plot = entry["output"]
+    plot:write("exit\n")
+    plot:close()
+end
+perfctr:close()
+
+
+
diff --git a/src/applications/likwid-pin.c b/src/applications/likwid-pin.c
deleted file mode 100644
index 3d9e85b..0000000
--- a/src/applications/likwid-pin.c
+++ /dev/null
@@ -1,346 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  likwid-pin.c
- *
- *      Description:  An application to pin a program including threads
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-/* #####   HEADER FILE INCLUDES   ######################################### */
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <sched.h>
-#include <errno.h>
-#include <sys/types.h>
-#include <unistd.h>
-#include <ctype.h>
-
-#include <error.h>
-#include <types.h>
-#include <bstrlib.h>
-#include <cpuid.h>
-#include <affinity.h>
-#include <numa.h>
-#include <memsweep.h>
-#include <strUtil.h>
-
-#ifdef COLOR
-#include <textcolor.h>
-#endif
-
-/* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
-#define HELP_MSG \
-    fprintf(stdout, "likwid-pin --  Version %d.%d \n\n",VERSION,RELEASE); \
-    fprintf(stdout, "\n"); \
-    fprintf(stdout, "Supported Options:\n"); \
-    fprintf(stdout, "-h\t Help message\n"); \
-    fprintf(stdout, "-v\t Version information\n"); \
-    fprintf(stdout, "-i\t Set numa interleave policy with all involved numa nodes\n"); \
-    fprintf(stdout, "-S\t Sweep memory in involved numa nodes\n"); \
-    fprintf(stdout, "-c\t comma separated list of processor ids or expression\n"); \
-    fprintf(stdout, "-s\t bitmask with threads to skip\n"); \
-    fprintf(stdout, "-p\t Print available domains with mapping on physical ids\n"); \
-    fprintf(stdout, "  \t If used together with -c option outputs a physical processor ids.\n"); \
-    fprintf(stdout, "-d\t Delimiter used for using -p to output physical processor list, default is comma.\n\n"); \
-    fprintf(stdout, "-q\t Silent without output\n\n"); \
-    fprintf(stdout, "There are three possibilities to provide a thread to processor list:\n\n"); \
-    fprintf(stdout, "1. Thread list with physical or logical thread numberings and physical cores first.\n"); \
-    fprintf(stdout, "Example usage thread list: likwid-pin -c N:0,4-6 ./myApp\n"); \
-    fprintf(stdout, "You can pin with the following numberings:\n");  \
-    fprintf(stdout, "\t1. Physical numbering of OS.\n");  \
-    fprintf(stdout, "\t2. Logical numbering inside node. e.g. -c N:0-3\n");  \
-    fprintf(stdout, "\t3. Logical numbering inside socket. e.g. -c S0:0-3\n");  \
-    fprintf(stdout, "\t4. Logical numbering inside last level cache group. e.g. -c C0:0-3\n");  \
-    fprintf(stdout, "\t5. Logical numbering inside NUMA domain. e.g. -c M0:0-3\n");  \
-    fprintf(stdout, "\tYou can also mix domains separated by  @, e.g. -c S0:0-3@S1:0-3 \n\n");  \
-    fprintf(stdout, "2. Expressions based thread list generation with compact processor numbering.\n"); \
-    fprintf(stdout, "Example usage expression: likwid-pin -c E:N:8 ./myApp\n"); \
-    fprintf(stdout, "This will generate a compact list of thread to processor mapping for the node domain with eight threads.\n");  \
-    fprintf(stdout, "The following syntax variants are available:\n");  \
-    fprintf(stdout, "\t1. -c E:<thread domain>:<number of threads>\n");  \
-    fprintf(stdout, "\t2. -c E:<thread domain>:<number of threads>:<chunk size>:<stride>\n");  \
-    fprintf(stdout, "\t   For two SMT threads per core on a SMT 4 machine use e.g. -c E:N:122:2:4\n\n");  \
-    fprintf(stdout, "3. Scatter policy among thread domain type.\n"); \
-    fprintf(stdout, "Example usage scatter: likwid-pin -c M:scatter ./myApp\n"); \
-    fprintf(stdout, "This will generate a thread to processor mapping scattered among all memory domains with physical cores first.\n\n");  \
-    fprintf(stdout, "4. Logical pinning.\n"); \
-    fprintf(stdout, "Example usage logical pinning: likwid-pin -c L:0,3,4 ./myApp\n"); \
-    fprintf(stdout, "This will generate a mapping containing the processors with index 0, 3 and 4 in the currently available processor list.\n");  \
-    fprintf(stdout, "If you are running inside a cpuset (taskset, cgroup) the sorted list of allowed processors is taken as processor list.\n");  \
-    fprintf(stdout, "Example usage logical pinning inside cpuset:\n"); \
-    fprintf(stdout, "taskset -c 4,7,2,1,5 likwid-pin -c L:0,2,4 ./myApp\n"); \
-    fprintf(stdout, "This maps the application to the processors 1,4,7.\n\n");  \
-    fprintf(stdout, "If you ommit the -c option likwid will use all processors available on the node\n"); \
-    fprintf(stdout, "with physical cores first. likwid-pin will also set OMP_NUM_THREADS with as many\n"); \
-    fprintf(stdout, "threads as specified in your pin expression if OMP_NUM_THREADS is not present\n"); \
-    fprintf(stdout, "in your environment.\n\n"); \
-    fflush(stdout);
-
-#define VERSION_MSG \
-    fprintf(stdout, "likwid-pin   %d.%d \n\n",VERSION,RELEASE); \
-    fflush(stdout);
-
-/* #####   FUNCTION DEFINITIONS  -  LOCAL TO THIS SOURCE FILE   ########### */
-    static void
-pinPid(int cpuid, int silent)
-{
-    int status;
-    cpu_set_t cpuset;
-
-    CPU_ZERO(&cpuset);
-    CPU_SET(cpuid, &cpuset);
-
-    status = sched_setaffinity(0, sizeof(cpu_set_t), &cpuset);
-
-    if (status == -1)
-    {
-        fprintf(stderr, "sched_setaffinity failed : %s \n",strerror(errno));
-    }
-    else
-    {
-        if(!silent)
-        {
-#ifdef COLOR
-            color_on(BRIGHT, COLOR);
-#endif
-            fprintf(stdout, "[likwid-pin] Main PID -> core %d - OK",  cpuid);
-#ifdef COLOR
-            color_reset();
-#endif
-            fprintf(stdout, "\n");
-            fflush(stdout);
-        }
-    }
-}
-
-
-/* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
-
-int main (int argc, char** argv)
-{
-    int i;
-    int c;
-    int skipMask = -1;
-    int optInterleaved = 0;
-    int optMemSweep = 0;
-    int optPrintDomains = 0;
-    int optSilent = 0;
-    int hasAffinity = 0;
-    bstring  pinString;
-    bstring  skipString;
-    bstring  argString;
-    int numThreads=0;
-    int threads[MAX_NUM_THREADS];
-    char delimiter = ',';
-    FILE* OUTSTREAM = stdout;
-    threads[0] = 0;
-
-    if (argc ==  1) {
-        HELP_MSG;
-        exit (EXIT_SUCCESS);
-    }
-
-    if (cpuid_init() == EXIT_SUCCESS)
-    {
-        numa_init();
-        affinity_init();
-        hasAffinity = 1;
-    }
-
-    while ((c = getopt (argc, argv, "+c:d:hipqs:Sv")) != -1)
-    {
-        switch (c)
-        {
-            case 'c':
-                CHECK_OPTION_STRING;
-                if (hasAffinity)
-                {
-                    numThreads = bstr_to_cpuset(threads, argString);
-                }
-                else
-                {
-                    numThreads = bstr_to_cpuset_physical((uint32_t*) threads, argString);
-                }
-
-                if(!numThreads)
-                {
-                    ERROR_PLAIN_PRINT(Failed to parse cpu list.);
-                }
-                break;
-            case 'd':
-                delimiter = optarg[0];
-                break;
-            case 'h':
-                HELP_MSG;
-                exit (EXIT_SUCCESS);
-            case 'i':
-                optInterleaved = 1;
-                break;
-            case 'p':
-                if (!hasAffinity)
-                {
-                    fprintf(stderr, "Option -p is not supported for unknown processor!\n");
-                    exit(EXIT_SUCCESS);
-                }
-                optPrintDomains = 1;
-                break;
-            case 'q':
-                optSilent = 1;
-                OUTSTREAM = NULL;
-                setenv("LIKWID_SILENT","true", 1);
-                break;
-            case 's':
-                CHECK_OPTION_STRING;
-                skipMask = strtoul((char*) argString->data,NULL,16);
-                break;
-            case 'S':
-                if (!hasAffinity)
-                {
-                    fprintf(stderr, "Option -S is not supported for unknown processor!\n");
-                    exit(EXIT_SUCCESS);
-                }
-                optMemSweep = 1;
-                break;
-            case 'v':
-                VERSION_MSG;
-                exit (EXIT_SUCCESS);
-            default:
-                HELP_MSG;
-                exit(EXIT_FAILURE);
-        }
-    }
-    if (optind == argc && !optPrintDomains)
-    {
-        fprintf(stderr,"Executable must be given on commandline\n");
-        exit(EXIT_FAILURE);
-    }
-
-    if (optPrintDomains && numThreads)
-    {
-        if ((!optSilent) && (OUTSTREAM))
-        {
-            fprintf(OUTSTREAM, "%d",threads[0]);
-
-            for ( i=1; i< numThreads; i++)
-            {
-                fprintf(OUTSTREAM, "%c%d",delimiter,threads[i]);
-            }
-            fprintf(OUTSTREAM, "\n");
-            fflush(OUTSTREAM);
-        }
-        exit (EXIT_SUCCESS);
-    }
-    else if ( optPrintDomains )
-    {
-        affinity_printDomains(OUTSTREAM);
-        exit (EXIT_SUCCESS);
-    }
-
-    if (!numThreads)
-    {
-        argString = bformat("N:0-%u", cpuid_topology.numHWThreads-1);
-        numThreads = bstr_to_cpuset(threads, argString);
-    }
-
-    /* CPU List:
-     * pthread (default): pin main pid + all thread tids
-     *
-     * OpenMP: Pin OMP_NUM_THREADS
-     * intel openmp: pin main pid + all thread tids (skip thread 1)
-     * gcc openmp: pin main pid + all thread tids (one less)
-     */
-
-    if (optInterleaved)
-    {
-        if ((!optSilent) && (OUTSTREAM))
-        {
-            fprintf(OUTSTREAM, "Set mem_policy to interleaved\n");
-            fflush(OUTSTREAM);
-        }
-        numa_setInterleaved(threads, numThreads);
-    }
-
-    if (optMemSweep)
-    {
-        if ((!optSilent) && (OUTSTREAM))
-        {
-            fprintf(OUTSTREAM, "Sweeping memory\n");
-            fflush(OUTSTREAM);
-        }
-        memsweep_threadGroup(OUTSTREAM, threads, numThreads);
-    }
-
-    if ( getenv("OMP_NUM_THREADS") == NULL )
-    {
-        argString = bformat("%d",numThreads);
-        setenv("OMP_NUM_THREADS",(char*) argString->data , 0);
-    }
-
-    if (numThreads > 1)
-    {
-        bstring ldPreload = bfromcstr(getenv("LD_PRELOAD"));
-
-        pinString = bformat("%d",threads[1]);
-
-        for (i=2; i < numThreads;i++)
-        {
-            bformata(pinString,",%d",threads[i]);
-        }
-
-        bformata(pinString,",%d",threads[0]);
-
-        if (skipMask >= 0)
-        {
-            skipString = bformat("%d",skipMask);
-            setenv("LIKWID_SKIP",(char*) bdata(skipString) , 1);
-        }
-
-        setenv("KMP_AFFINITY", "disabled", 1);
-        setenv("LIKWID_PIN",(char*) bdata(pinString) , 1);
-
-
-        if (ldPreload == NULL)
-        {
-            setenv("LD_PRELOAD",TOSTRING(LIBLIKWIDPIN), 1);
-        }
-        else
-        {
-            bconchar(ldPreload, ':');
-            bcatcstr(ldPreload, TOSTRING(LIBLIKWIDPIN));
-            setenv("LD_PRELOAD", bdata(ldPreload), 1);
-        }
-    }
-
-    pinPid(threads[0], optSilent);
-    fflush(stdout);
-
-    argv +=  optind;
-    execvp(argv[0], argv);
-    perror("execvp");
-    fprintf(stderr,"failed to execute %s\n", argv[0]);
-
-    return EXIT_SUCCESS;
-}
-
diff --git a/src/applications/likwid-pin.lua b/src/applications/likwid-pin.lua
new file mode 100644
index 0000000..868f541
--- /dev/null
+++ b/src/applications/likwid-pin.lua
@@ -0,0 +1,250 @@
+#!<PREFIX>/bin/likwid-lua
+--[[
+ * =======================================================================================
+ *
+ *      Filename:  likwid-pin.lua
+ *
+ *      Description:  An application to pin a program including threads
+ *
+ *      Version:   4.0
+ *      Released:  16.6.2015
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at gmail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+]]
+package.path = '<PREFIX>/share/lua/?.lua;' .. package.path
+
+local likwid = require("likwid")
+
+local function version()  -- Print the tool name followed by the LIKWID version and release numbers.
+    print(("likwid-pin.lua --  Version %d.%d"):format(likwid.version, likwid.release))
+end
+
+local function examples()  -- Print usage examples for the four supported -c list notations to stdout.
+    print("Examples:")
+    print("There are four possibilities to provide a thread to processor list:")
+    print("1. Thread list with physical thread IDs")
+    print("Example: likwid-pin.lua -c 0,4-6 ./myApp")
+    print("Pins the application to cores 0,4,5 and 6")
+    print("2. Thread list with logical thread numberings in physical cores first sorted list.")
+    print("Example usage thread list: likwid-pin.lua -c N:0,4-6 ./myApp")
+    print("You can pin with the following numberings:")
+    print("\t2. Logical numbering inside node.\n\t   e.g. -c N:0,1,2,3 for the first 4 physical cores of the node")
+    print("\t3. Logical numbering inside socket.\n\t   e.g. -c S0:0-1 for the first 2 physical cores of the socket")
+    print("\t4. Logical numbering inside last level cache group.\n\t   e.g. -c C0:0-3  for the first 4 physical cores in the first LLC")
+    print("\t5. Logical numbering inside NUMA domain.\n\t   e.g. -c M0:0-3 for the first 4 physical cores in the first NUMA domain")
+    print("\tYou can also mix domains separated by  @,\n\te.g. -c S0:0-3@S1:0-3 for the 4 first physical cores on both sockets.")  -- '@' is the literal domain separator
+    print("3. Expressions based thread list generation with compact processor numbering.")
+    print("Example usage expression: likwid-pin.lua -c E:N:8 ./myApp")
+    print("This will generate a compact list of thread to processor mapping for the node domain")
+    print("with eight threads.")
+    print("The following syntax variants are available:")
+    print("\t1. -c E:<thread domain>:<number of threads>")
+    print("\t2. -c E:<thread domain>:<number of threads>:<chunk size>:<stride>")
+    print("\tFor two SMT threads per core on a SMT 4 machine use e.g. -c E:N:122:2:4")
+    print("4. Scatter policy among thread domain type.")
+    print("Example usage scatter: likwid-pin.lua -c M:scatter ./myApp")
+    print("This will generate a thread to processor mapping scattered among all memory domains")
+    print("with physical cores first.")
+    print("")
+    print("likwid-pin sets OMP_NUM_THREADS with as many threads as specified")
+    print("in your pin expression if OMP_NUM_THREADS is not present in your environment.")
+end
+
+local function usage()  -- Print the version banner, the option summary and the examples to stdout.
+    version()
+    print("An application to pin a program including threads.\n")
+    print("Options:")
+    print("-h, --help\t\t Help message")
+    print("-v, --version\t\t Version information")
+    print("-V, --verbose <level>\t Verbose output, 0 (only errors), 1 (info), 2 (details), 3 (developer)")
+    print("-i\t\t\t Set numa interleave policy with all involved numa nodes")
+    print("-S, --sweep\t\t Sweep memory and LLC of involved NUMA nodes")
+    print("-c <list>\t\t Comma separated processor IDs or expression")
+    print("-s, --skip <hex>\t Bitmask with threads to skip")
+    print("-p\t\t\t Print available domains with mapping on physical IDs")
+    print("\t\t\t If used together with the -c option, outputs the physical processor IDs.")  -- -p plus a parsed -c list prints that list
+    print("-d <string>\t\t Delimiter used for using -p to output physical processor list, default is comma.")
+    print("-q, --quiet\t\t Silent without output")
+    print("\n")
+    examples()
+end
+
+delimiter = ','               -- separator used when -p prints the processor list; replaced by -d
+quiet = 0                     -- set to 1 by -q/--quiet and handed to likwid.pinProcess
+sweep_sockets = false         -- set by -S/--sweep
+interleaved_policy = false    -- set by -i
+print_domains = false         -- set by -p
+cpu_list = {}                 -- processor IDs produced from the -c expression
+skip_mask = "0x0"             -- hex string; replaced by -s/--skip and exported as LIKWID_SKIP when more than one thread is pinned
+affinity = nil                -- placeholder, assigned from likwid.getAffinityInfo() below
+num_threads = 0               -- stays 0 until a -c expression has been parsed
+
+config = likwid.getConfiguration()
+cputopo = likwid.getCpuTopology()
+affinity = likwid.getAffinityInfo()
+
+if (#arg == 0) then  -- no command line arguments at all: show the help text and exit successfully
+    usage()
+    os.exit(0)
+end
+
+for opt,arg in likwid.getopt(arg, {"c:", "d:", "h", "i", "p", "q", "s:", "S", "t:", "v", "V:", "verbose:", "help", "version", "skip:","sweep", "quiet"}) do  -- a trailing ':' appears to mark options that take a value; "skip" needs one like "s"
+    if opt == "h" or opt == "help" then
+        usage()
+        likwid.putTopology()
+        likwid.putAffinityInfo()
+        likwid.putConfiguration()
+        os.exit(0)
+    elseif opt == "v" or opt == "version" then
+        version()
+        likwid.putTopology()
+        likwid.putAffinityInfo()
+        likwid.putConfiguration()
+        os.exit(0)
+    elseif opt == "V" or opt == "verbose" then
+        verbose = tonumber(arg)
+        likwid.setVerbosity(verbose)
+    elseif (opt == "c") then
+        if (affinity ~= nil) then  -- without affinity information only the physical-ID parser is used
+            num_threads,cpu_list = likwid.cpustr_to_cpulist(arg)
+        else
+            num_threads,cpu_list = likwid.cpustr_to_cpulist_physical(arg)
+        end
+        if (num_threads == 0) then
+            print("Failed to parse cpulist " .. arg)
+            likwid.putTopology()
+            likwid.putAffinityInfo()
+            likwid.putConfiguration()
+            os.exit(1)
+        end
+    elseif (opt == "d") then
+        delimiter = arg
+    elseif opt == "S" or opt == "sweep" then
+        if (affinity == nil) then
+            print("Option -S is not supported for unknown processor!")
+            likwid.putTopology()
+            likwid.putAffinityInfo()
+            likwid.putConfiguration()
+            os.exit(1)
+        end
+        sweep_sockets = true
+    elseif (opt == "i") then
+        interleaved_policy = true
+    elseif (opt == "p") then
+        print_domains = true
+    elseif opt == "s" or opt == "skip" then
+        local s,e = arg:find("^0x")  -- anchored: the mask has to start with 0x, as the message below states
+        if s == nil then
+            print("Skip mask must be given in hex, hence start with 0x")
+            os.exit(1)
+        end
+        skip_mask = arg
+    elseif opt == "q" or opt == "quiet" then
+        likwid.setenv("LIKWID_SILENT","true")
+        quiet = 1
+    elseif opt == "?" then  -- reported for options that are not in the list above
+        print("Invalid commandline option -"..arg)
+        likwid.putTopology()
+        likwid.putAffinityInfo()
+        likwid.putConfiguration()
+        os.exit(1)
+    end
+end
+
+-- -p handling: with a parsed -c list print that list joined by the delimiter, otherwise
+-- print every affinity domain; both variants release the LIKWID state and exit with 0.
+if print_domains then
+    if num_threads > 0 then
+        outstr = ""
+        for _, cpu in pairs(cpu_list) do
+            outstr = outstr .. delimiter .. cpu
+        end
+        print(outstr:sub(#delimiter + 1))  -- drop the whole leading delimiter, which may be longer than one character
+    else
+        for _, domain in pairs(affinity["domains"]) do
+            print(string.format("Domain %s:", domain["tag"]))
+            print("\t" .. table.concat(domain["processorList"], ","))
+            print("")
+        end
+    end
+    likwid.putTopology()
+    likwid.putAffinityInfo()
+    likwid.putConfiguration()
+    os.exit(0)
+end
+
+if num_threads == 0 then  -- no -c list was parsed: build one from the node domain expression "N:0-<numHWThreads-1>"
+    num_threads, cpu_list = likwid.cpustr_to_cpulist("N:0-"..cputopo["numHWThreads"]-1)
+end
+
+if interleaved_policy then  -- requested with -i
+    print("Set mem_policy to interleaved")
+    likwid.setMemInterleaved(num_threads, cpu_list)
+end
+
+if sweep_sockets then  -- requested with -S/--sweep
+    print("Sweeping memory")
+    likwid.memSweep(num_threads, cpu_list)
+end
+
+local omp_threads = os.getenv("OMP_NUM_THREADS")  -- a value already present in the environment is never overwritten
+if omp_threads == nil then
+    likwid.setenv("OMP_NUM_THREADS",tostring(num_threads))
+elseif num_threads > (tonumber(omp_threads) or num_threads) then  -- non-numeric values (e.g. a list like "4,2") skip the warning instead of raising an error
+    print(string.format("Environment variable OMP_NUM_THREADS already set to %s but %d cpus required", omp_threads,num_threads))
+end
+
+
+if num_threads > 1 then  -- the pinning environment is only prepared when more than one processor is in the list
+    local preload = os.getenv("LD_PRELOAD")
+    local pinString = tostring(cpu_list[2])  -- the list starts with the second entry ...
+    for i=3,likwid.tablelength(cpu_list) do
+        pinString = pinString .. "," .. cpu_list[i]
+    end
+    pinString = pinString .. "," .. cpu_list[1]  -- ... and the first entry is appended last
+    skipString = skip_mask
+
+    likwid.setenv("KMP_AFFINITY","disabled")
+    likwid.setenv("LIKWID_PIN", pinString)
+    likwid.setenv("LIKWID_SKIP",skipString)
+
+    if preload == nil then
+        likwid.setenv("LD_PRELOAD",likwid.pinlibpath)
+    else  -- keep an existing LD_PRELOAD value, placed after the pin library path
+        likwid.setenv("LD_PRELOAD",likwid.pinlibpath .. ":" .. preload)
+    end
+end
+
+likwid.pinProcess(cpu_list[1], quiet)  -- pin this process to the first entry of the list
+local exec = table.concat(arg," ",1, likwid.tablelength(arg)-2)  -- command line built from what is left in arg after option parsing
+local err
+err = os.execute(exec)
+if (not err) or (type(err) == "number" and err ~= 0) then  -- failure is nil in Lua 5.2+ and a non-zero status in Lua 5.1; it is never false
+    print("Failed to execute command: ".. exec)
+    likwid.putTopology()
+    likwid.putAffinityInfo()
+    likwid.putConfiguration()
+    os.exit(1)
+end
+
+likwid.putAffinityInfo()
+likwid.putTopology()
+likwid.putConfiguration()
+os.exit(0)
diff --git a/src/applications/likwid-powermeter.c b/src/applications/likwid-powermeter.c
deleted file mode 100644
index 4daa393..0000000
--- a/src/applications/likwid-powermeter.c
+++ /dev/null
@@ -1,507 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  likwid-powermeter.c
- *
- *      Description:  An application to get information about power 
- *      consumption on architectures implementing the RAPL interface.
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <sched.h>
-#include <sys/types.h>
-#include <unistd.h>
-#include <ctype.h>
-
-#include <types.h>
-#include <strUtil.h>
-#include <error.h>
-#include <lock.h>
-#include <timer.h>
-#include <cpuid.h>
-#include <numa.h>
-#include <accessClient.h>
-#include <msr.h>
-#include <affinity.h>
-#include <perfmon.h>
-#include <power.h>
-#include <thermal.h>
-#include <bstrlib.h>
-
-/* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
-
-#define HELP_MSG \
-fprintf(stdout, "\nlikwid-powermeter --  Version  %d.%d \n\n",VERSION,RELEASE); \
-fprintf(stdout, "A tool to print Power and Clocking information on Intel SandyBridge CPUS.\n"); \
-fprintf(stdout, "Options:\n"); \
-fprintf(stdout, "-h\t\t Help message\n"); \
-fprintf(stdout, "-v\t\t Version information\n"); \
-fprintf(stdout, "-M <0|1>\t set how MSR registers are accessed: 0=direct, 1=msrd \n"); \
-fprintf(stdout, "-c <list>\t specify sockets to measure\n"); \
-fprintf(stdout, "-i\t\t print information from MSR_PKG_POWER_INFO register and Turbo Mode\n"); \
-fprintf(stdout, "-s <duration>\t set measure duration in sec. (default 2s) \n"); \
-fprintf(stdout, "-p\t\t print dynamic clocking and CPI values (requires executable)\n\n");   \
-fprintf(stdout, "Usage: likwid-powermeter -s 4 -c 1 \n");  \
-fprintf(stdout, "Alternative as wrapper: likwid-powermeter -c 1 ./a.out\n"); \
-fflush(stdout);
-
-#define VERSION_MSG \
-fprintf(stdout, "likwid-powermeter  %d.%d \n\n",VERSION,RELEASE); \
-fflush(stdout);
-
-
-int main (int argc, char** argv)
-{
-    int socket_fd = -1;
-    int optInfo = 0;
-    int optClock = 0;
-    int optStethoscope = 0;
-    int optSockets = 0;
-    int optTemp = 0;
-    double runtime;
-    int hasDRAM = 0;
-    int hasPP0 = 0;
-    int hasPP1 = 0;
-    int c, i;
-    bstring argString;
-    bstring eventString = bfromcstr("CLOCK");
-    int numSockets=1;
-    int numThreads=0;
-    int threadsSockets[MAX_NUM_NODES*2];
-    int threads[MAX_NUM_THREADS];
-    const AffinityDomain* socketDomains[MAX_NUM_NODES*2];
-    threadsSockets[0] = 0;
-
-    if (argc == 1)
-    {
-        HELP_MSG;
-        exit (EXIT_SUCCESS);
-    }
-
-    while ((c = getopt (argc, argv, "+c:hiM:ps:vt")) != -1)
-    {
-        switch (c)
-        {
-            case 'c':
-                CHECK_OPTION_STRING;
-                numSockets = bstr_to_cpuset_physical((uint32_t*) threadsSockets, argString);
-                bdestroy(argString);
-                optSockets = 1;
-                break;
-
-            case 'h':
-                HELP_MSG;
-                exit (EXIT_SUCCESS);
-            case 'i':
-                optInfo = 1;
-                break;
-            case 'M':  /* Set MSR Access mode */
-                CHECK_OPTION_STRING;
-                accessClient_setaccessmode(str2int((char*) argString->data));
-                bdestroy(argString);
-                break;
-            case 'p':
-                optClock = 1;
-                break;
-            case 's':
-                CHECK_OPTION_STRING;
-                optStethoscope = str2int((char*) argString->data);
-                bdestroy(argString);
-                break;
-            case 'v':
-                VERSION_MSG;
-                exit (EXIT_SUCCESS);
-            case 't':
-                optTemp = 1;
-                break;
-            case '?':
-                if (optopt == 's' || optopt == 'M' || optopt == 'c')
-                {
-                    HELP_MSG;
-                }
-                else if (isprint (optopt))
-                {
-                    fprintf (stderr, "Unknown option `-%c'.\n", optopt);
-                }
-                else
-                {
-                    fprintf (stderr,
-                            "Unknown option character `\\x%x'.\n",
-                            optopt);
-                }
-                exit( EXIT_FAILURE);
-            default:
-                HELP_MSG;
-                exit (EXIT_SUCCESS);
-        }
-    }
-
-    if (!lock_check())
-    {
-        fprintf(stderr,"Access to performance counters is locked.\n");
-        exit(EXIT_FAILURE);
-    }
-    if (optClock && optind == argc)
-    {
-        fprintf(stderr,"Commandline option -p requires an executable.\n");
-        exit(EXIT_FAILURE);
-    }
-    if (optSockets && !optStethoscope && optind == argc)
-    {
-        fprintf(stderr,"Commandline option -c requires an executable if not used in combination with -s.\n");
-        exit(EXIT_FAILURE);
-    }
-    if (optStethoscope == 0 && optind == argc && !optInfo)
-    {
-        fprintf(stderr,"Either -s <seconds> or executable must be given on commandline.\n");
-        exit(EXIT_FAILURE);
-    }
-
-    if (cpuid_init() == EXIT_FAILURE)
-    {
-        fprintf(stderr, "CPU not supported\n");
-        exit(EXIT_FAILURE);
-    }
-    if (numSockets > cpuid_topology.numSockets)
-    {
-        fprintf(stderr, "System has only %d sockets but %d are given on commandline.\n",
-                        cpuid_topology.numSockets, numSockets);
-        exit(EXIT_FAILURE);
-    }
-
-    numa_init();
-    affinity_init();
-
-    for (c = 0; c < numSockets; c++)
-    {
-        if (threadsSockets[c] >= cpuid_topology.numSockets)
-        {
-            fprintf(stderr, "System has no socket %d\n", threadsSockets[c]);
-            exit(EXIT_FAILURE);
-        }
-        bstring socketStr = bformat("S%d",threadsSockets[c]);
-        socketDomains[threadsSockets[c]] = affinity_getDomain(socketStr);
-    }
-
-    accessClient_init(&socket_fd);
-    msr_init(socket_fd);
-    timer_init();
-
-    /* check for supported processors */
-    if ((cpuid_info.model == SANDYBRIDGE_EP) ||
-            (cpuid_info.model == SANDYBRIDGE) ||
-            (cpuid_info.model == IVYBRIDGE) ||
-            (cpuid_info.model == IVYBRIDGE_EP) ||
-            (cpuid_info.model == HASWELL) ||
-            (cpuid_info.model == HASWELL_EX) ||
-            (cpuid_info.model == NEHALEM_BLOOMFIELD) ||
-            (cpuid_info.model == NEHALEM_LYNNFIELD) ||
-            (cpuid_info.model == NEHALEM_WESTMERE) ||
-            (cpuid_info.model == ATOM_SILVERMONT_C) ||
-            (cpuid_info.model == ATOM_SILVERMONT_E) ||
-            (cpuid_info.model == ATOM_SILVERMONT_F1) ||
-            (cpuid_info.model == ATOM_SILVERMONT_F2) ||
-            (cpuid_info.model == ATOM_SILVERMONT_F3))
-    {
-        if (numSockets == 0)
-        {
-            numSockets = numa_info.numberOfNodes;
-        }
-        for(int i=0; i<numSockets; i++)
-        {
-            power_init(socketDomains[threadsSockets[i]]->processorList[0]);
-        }
-    }
-    else
-    {
-        fprintf (stderr, "Query Turbo Mode only supported on Intel Nehalem/Westmere/SandyBridge/IvyBridge/Haswell/Silvermont processors!\n");
-        exit(EXIT_FAILURE);
-    }
-
-    double clock = (double) timer_getCpuClock();
-
-    fprintf(stdout, HLINE);
-    fprintf(stdout, "CPU name:\t%s \n",cpuid_info.name);
-    fprintf(stdout, "CPU clock:\t%3.2f GHz \n",  (float) clock * 1.E-09);
-    fprintf(stdout, HLINE);
-    fflush(stdout);
-
-    if (optInfo)
-    {
-        if (power_info.turbo.numSteps != 0)
-        {
-            fprintf(stdout, "Base clock:\t%.2f MHz \n",  power_info.baseFrequency );
-            fprintf(stdout, "Minimal clock:\t%.2f MHz \n",  power_info.minFrequency );
-            fprintf(stdout, "Turbo Boost Steps:\n");
-            for (int i=0; i < power_info.turbo.numSteps; i++ )
-            {
-                fprintf(stdout, "C%d %.2f MHz \n",i+1,  power_info.turbo.steps[i] );
-            }
-        }
-        fprintf(stdout, HLINE);
-        fflush(stdout);
-    }
-
-    if ((cpuid_info.model == SANDYBRIDGE_EP) ||
-        (cpuid_info.model == IVYBRIDGE_EP) ||
-        (cpuid_info.model == HASWELL_EX) ||
-        (cpuid_info.model == HASWELL))
-    {
-        hasDRAM = 1;
-    }
-    if ((cpuid_info.model == SANDYBRIDGE_EP) ||
-        (cpuid_info.model == SANDYBRIDGE) ||
-        (cpuid_info.model == IVYBRIDGE_EP) ||
-        (cpuid_info.model == IVYBRIDGE) ||
-        (cpuid_info.model == HASWELL) ||
-        (cpuid_info.model == ATOM_SILVERMONT_E) ||
-        (cpuid_info.model == ATOM_SILVERMONT_F1) ||
-        (cpuid_info.model == ATOM_SILVERMONT_F2) ||
-        (cpuid_info.model == ATOM_SILVERMONT_F3))
-    {
-        hasPP0 = 1;
-    }
-    if ((cpuid_info.model == HASWELL) ||
-        (cpuid_info.model == SANDYBRIDGE) ||
-        (cpuid_info.model == IVYBRIDGE))
-    {
-        hasPP1 = 1;
-    }
-    if ((cpuid_info.model != SANDYBRIDGE) &&
-        (cpuid_info.model != SANDYBRIDGE_EP)  &&
-        (cpuid_info.model != IVYBRIDGE)  &&
-        (cpuid_info.model != IVYBRIDGE_EP)  &&
-        (cpuid_info.model != HASWELL) &&
-        (cpuid_info.model != HASWELL_M1) &&
-        (cpuid_info.model != HASWELL_M2) &&
-        (cpuid_info.model != HASWELL_EX) &&
-        (cpuid_info.model != ATOM_SILVERMONT_C) &&
-        (cpuid_info.model != ATOM_SILVERMONT_E) &&
-        (cpuid_info.model != ATOM_SILVERMONT_F1) &&
-        (cpuid_info.model != ATOM_SILVERMONT_F2) &&
-        (cpuid_info.model != ATOM_SILVERMONT_F3))
-    {
-        fprintf (stderr, "RAPL not supported on this processor!\n");
-        exit(EXIT_FAILURE);
-    }
-
-    if (optInfo)
-    {
-        fprintf(stdout, "Thermal Spec Power: %g Watts \n", power_info.tdp );
-        fprintf(stdout, "Minimum  Power: %g Watts \n", power_info.minPower);
-        fprintf(stdout, "Maximum  Power: %g Watts \n", power_info.maxPower);
-        fprintf(stdout, "Maximum  Time Window: %g micro sec \n", power_info.maxTimeWindow);
-        fprintf(stdout, HLINE);
-        fflush(stdout);
-        exit(EXIT_SUCCESS);
-    }
-
-    if (optClock)
-    {
-        affinity_init();
-        argString = bformat("S%u:0-%u", threadsSockets[0],
-                        socketDomains[threadsSockets[0]]->numberOfProcessors-1);
-        for (int i=1; i<numSockets; i++)
-        {
-            bstring tExpr = bformat("@S%u:0-%u", threadsSockets[i],
-                                socketDomains[threadsSockets[i]]->numberOfProcessors-1);
-            bconcat(argString, tExpr);
-        }
-        numThreads = bstr_to_cpuset(threads, argString);
-        bdestroy(argString);
-        perfmon_init(numThreads, threads, stdout);
-        perfmon_setupEventSet(eventString, NULL);
-    }
-
-    {
-        PowerData pDataPkg[MAX_NUM_NODES*2];
-        PowerData pDataDram[MAX_NUM_NODES*2];
-        PowerData pDataPP0[MAX_NUM_NODES*2];
-        PowerData pDataPP1[MAX_NUM_NODES*2];
-        fprintf(stdout, "Measure on sockets: %d", threadsSockets[0]);
-        for (int i=1; i<numSockets; i++)
-        {
-            fprintf(stdout, ", %d", threadsSockets[i]);
-        }
-        fprintf(stdout, "\n");
-        fflush(stdout);
-
-        if (optStethoscope)
-        {
-            if (optClock)
-            {
-                perfmon_startCounters();
-            }
-            else
-            {
-                for (int i=0; i<numSockets; i++)
-                {
-                    int cpuId = socketDomains[threadsSockets[i]]->processorList[0];
-                    if (hasDRAM) power_start(&(pDataDram[i]), cpuId, DRAM);
-                    if (hasPP0) power_start(&(pDataPP0[i]), cpuId, PP0);
-                    if (hasPP1) power_start(&(pDataPP1[i]), cpuId, PP1);
-                    power_start(&(pDataPkg[i]), cpuId, PKG);
-                }
-            }
-            sleep(optStethoscope);
-
-            if (optClock)
-            {
-                perfmon_stopCounters();
-                perfmon_printCounterResults();
-                perfmon_finalize();
-            }
-            else
-            {
-                for (int i=0; i<numSockets; i++)
-                {
-                    int cpuId = socketDomains[threadsSockets[i]]->processorList[0];
-                    power_stop(&(pDataPkg[i]), cpuId, PKG);
-                    if (hasPP1) power_stop(&(pDataPP1[i]), cpuId, PP1);
-                    if (hasPP0) power_stop(&(pDataPP0[i]), cpuId, PP0);
-                    if (hasDRAM) power_stop(&(pDataDram[i]), cpuId, DRAM);
-                }
-            }
-            runtime = (double) optStethoscope;
-        }
-        else
-        {
-            TimerData time;
-            argv +=  optind;
-            bstring exeString = bfromcstr(argv[0]);
-
-            for (int i=1; i<(argc-optind); i++)
-            {
-                bconchar(exeString, ' ');
-                bcatcstr(exeString, argv[i]);
-            }
-            fprintf(stdout, "Executing: %s\n",bdata(exeString));
-            fflush(stdout);
-
-
-            if (optClock)
-            {
-                perfmon_startCounters();
-            }
-            else
-            {
-                for (int i=0; i<numSockets; i++)
-                {
-                    int cpuId = socketDomains[threadsSockets[i]]->processorList[0];
-                    if (hasDRAM) power_start(&(pDataDram[i]), cpuId, DRAM);
-                    if (hasPP0) power_start(&(pDataPP0[i]), cpuId, PP0);
-                    if (hasPP1) power_start(&(pDataPP1[i]), cpuId, PP1);
-                    power_start(&(pDataPkg[i]), cpuId, PKG);
-                }
-
-                timer_start(&time);
-            }
-
-            if (system(bdata(exeString)) == EOF)
-            {
-                fprintf(stderr, "Failed to execute %s!\n", bdata(exeString));
-                exit(EXIT_FAILURE);
-            }
-
-            if (optClock)
-            {
-                perfmon_stopCounters();
-                perfmon_printCounterResults();
-                perfmon_finalize();
-            }
-            else
-            {
-                timer_stop(&time);
-
-                for (int i=0; i<numSockets; i++)
-                {
-                    int cpuId = socketDomains[threadsSockets[i]]->processorList[0];
-                    power_stop(&(pDataPkg[i]), cpuId, PKG);
-                    if (hasDRAM) power_stop(&(pDataDram[i]), cpuId, DRAM);
-                    if (hasPP0) power_stop(&(pDataPP0[i]), cpuId, PP0);
-                    if (hasPP1) power_stop(&(pDataPP1[i]), cpuId, PP1);
-                }
-                runtime = timer_print(&time);
-            }
-        }
-
-        if (!optClock)
-        {
-            fprintf(stdout, "Runtime: %g second \n",runtime);
-            fprintf(stdout, HLINE);
-            for (int i=0; i<numSockets; i++)
-            {
-                fprintf(stdout, "Socket %d (Measured on CPU %d)\n",threadsSockets[i],
-                                    socketDomains[threadsSockets[i]]->processorList[0]);
-                fprintf(stdout, "Domain: PKG \n");
-                fprintf(stdout, "Energy consumed: %g Joules \n", power_printEnergy(&(pDataPkg[i])));
-                fprintf(stdout, "Power consumed: %g Watts \n", power_printEnergy(&(pDataPkg[i])) / runtime );
-                if (hasDRAM)
-                {
-                    fprintf(stdout, "Domain: DRAM \n");
-                    fprintf(stdout, "Energy consumed: %g Joules \n", power_printEnergy(&(pDataDram[i])));
-                    fprintf(stdout, "Power consumed: %g Watts \n", power_printEnergy(&(pDataDram[i])) / runtime );
-                }
-                if (hasPP0)
-                {
-                    fprintf(stdout, "Domain: PP0 \n");
-                    fprintf(stdout, "Energy consumed: %g Joules \n", power_printEnergy(&(pDataPP0[i])));
-                    fprintf(stdout, "Power consumed: %g Watts \n", power_printEnergy(&(pDataPP0[i])) / runtime );
-                }
-                if (hasPP1)
-                {
-                    fprintf(stdout, "Domain: PP1 \n");
-                    fprintf(stdout, "Energy consumed: %g Joules \n", power_printEnergy(&(pDataPP1[i])));
-                    fprintf(stdout, "Power consumed: %g Watts \n", power_printEnergy(&(pDataPP1[i])) / runtime );
-                }
-                fprintf(stdout, "\n");
-            }
-            fflush(stdout);
-        }
-    }
-
-
-    if ( optTemp && cpuid_hasFeature(TM2))
-    {
-        printf("Current core temperatures:\n");
-        for (i = 0; i < numSockets; i++)
-        {
-            printf("Socket %d\n",threadsSockets[i]);
-            for (c = 0; c < socketDomains[threadsSockets[i]]->numberOfProcessors; c++ )
-            {
-                thermal_init(i);
-                printf("Core %d: %u C\n",
-                        socketDomains[threadsSockets[i]]->processorList[c],
-                        thermal_read(socketDomains[threadsSockets[i]]->processorList[c]));
-            }
-        }
-    }
-
-
-    msr_finalize();
-    return EXIT_SUCCESS;
-}
-
diff --git a/src/applications/likwid-powermeter.lua b/src/applications/likwid-powermeter.lua
new file mode 100644
index 0000000..ce3501d
--- /dev/null
+++ b/src/applications/likwid-powermeter.lua
@@ -0,0 +1,331 @@
+#!<PREFIX>/bin/likwid-lua
+--[[
+ * =======================================================================================
+ *
+ *      Filename:  likwid-powermeter.lua
+ *
+ *      Description:  An application to get information about power 
+ *      consumption on architectures implementing the RAPL interface.
+ *
+ *      Version:   4.0
+ *      Released:  16.6.2015
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at gmail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+]]
+package.path = '<PREFIX>/share/lua/?.lua;' .. package.path
+
+local likwid = require("likwid")
+
+local function version() -- one-line banner: tool name plus major.minor version
+    print(("likwid-powermeter --  Version %d.%d"):format(likwid.version, likwid.release))
+end
+
+local function examples() -- prints two sample invocations; called at the end of usage()
+    print("Examples:")
+    print("Measure the power consumption for 4 seconds on socket 1")
+    print("likwid-powermeter -s 4 -c 1")
+    print("")
+    print("Use it as wrapper for an application to measure the energy for the whole execution")
+    print("likwid-powermeter -c 1 ./a.out")
+end
+
+local function usage() -- full help text: version banner, option summary, then examples()
+    version()
+    print("A tool to print power and clocking information on x86 CPUs.\n")
+    print("Options:")
+    print("-h, --help\t Help message")
+    print("-v, --version\t Version information")
+    print("-V, --verbose <level>\t Verbose output, 0 (only errors), 1 (info), 2 (details), 3 (developer)")
+    print("-M <0|1>\t\t Set how MSR registers are accessed, 0=direct, 1=accessDaemon")
+    print("-c <list>\t\t Specify sockets to measure")
+    print("-i, --info\t Print information from MSR_PKG_POWER_INFO register and Turbo mode")
+    print("-s <duration>\t Set measure duration in us, ms or s. (default 2s)")
+    print("-p\t\t Print dynamic clocking and CPI values, uses likwid-perfctr")
+    print("-t\t\t Print current temperatures of all CPU cores")
+    print("-f\t\t Print current temperatures in Fahrenheit")
+    print("")
+    examples()
+end
+
+local config = likwid.getConfiguration();
+-- Defaults for the command line switches that are parsed further down.
+print_info = false
+use_perfctr = false
+stethoscope = false
+fahrenheit = false
+print_temp = false
+verbose = 0
+if config["daemonMode"] < 0 then -- negative value in the configuration: fall back to mode 1 (accessDaemon, see -M)
+    access_mode = 1
+else
+    access_mode = config["daemonMode"]
+end
+time_interval = 2.E06 -- measurement duration in microseconds (2 s)
+sockets = {}
+domainList = {"PKG", "PP0", "PP1", "DRAM"}
+-- Topology data; the affinity domains are used below to map sockets to CPUs.
+cpuinfo = likwid.getCpuInfo()
+cputopo = likwid.getCpuTopology()
+numatopo = likwid.getNumaInfo()
+affinity = likwid.getAffinityInfo()
+
+for opt,arg in likwid.getopt(arg, {"V:", "c:", "h", "i", "M:", "p", "s:", "v", "f", "t", "help", "info", "version", "verbose:"}) do
+    if (type(arg) == "string") and arg:sub(1,1) == "-" then -- the next option was swallowed as an argument
+        print(string.format("Argument %s to option -%s starts with invalid character -.", arg, opt))
+        print("Did you forget an argument to an option?")
+        os.exit(1)
+    end
+    if opt == "h" or opt == "help" then
+        usage()
+        os.exit(0)
+    elseif opt == "v" or opt == "version" then
+        version()
+        os.exit(0)
+    elseif (opt == "c") then
+        num_sockets, sockets = likwid.sockstr_to_socklist(arg)
+        if num_sockets == 0 then
+            os.exit(1)
+        end
+    elseif (opt == "M") then
+        access_mode = tonumber(arg)
+        if (access_mode == nil) then
+            print("Access mode (-M) must be a number")
+            usage()
+            os.exit(1)
+        elseif (access_mode < 0) or (access_mode > 1) then
+            print(string.format("Access mode (-M) %d not valid.",access_mode))
+            usage()
+            os.exit(1)
+        end
+    elseif opt == "i" or opt == "info" then
+        print_info = true
+    elseif (opt == "p") then
+        use_perfctr = true
+    elseif (opt == "f") then
+        fahrenheit = true
+        print_temp = true
+    elseif (opt == "t") then
+        print_temp = true
+    elseif opt == "V" or opt == "verbose" then
+        verbose = tonumber(arg)
+        if (verbose == nil) then -- a nil level would break the later "verbose > 0" comparison
+            print("Verbosity (-V) must be a number")
+            os.exit(1)
+        end
+        likwid.setVerbosity(verbose)
+    elseif (opt == "s") then
+        time_interval = likwid.parse_time(arg)
+        stethoscope = true
+    elseif opt == "?" then
+        print("Invalid commandline option -"..arg)
+        os.exit(1)
+    end
+end
+
+
+
+cpulist = {}
+before = {}
+after = {}
+-- Registers the first CPU of a socket domain as the CPU used for measuring that
+-- socket and zero-initializes its before/after counters for every RAPL domain.
+local function addSocketCpu(domain)
+    local cpu = domain["processorList"][1]
+    table.insert(cpulist, cpu)
+    before[cpu] = {}
+    after[cpu] = {}
+    for _, id in pairs(domainList) do
+        before[cpu][id] = 0
+        after[cpu][id] = 0
+    end
+end
+if #sockets > 0 then
+    -- Sockets were selected with -c: look up the affinity domain "S<id>" of each one.
+    for _, socketId in pairs(sockets) do
+        local wanted = "S"..tostring(socketId)
+        for _, domain in pairs(affinity["domains"]) do
+            if domain["tag"] == wanted then addSocketCpu(domain) end
+        end
+    end
+else
+    -- No selection: take every socket domain and record its id in sockets.
+    for _, domain in pairs(affinity["domains"]) do
+        if domain["tag"]:match("S%d+") then
+            addSocketCpu(domain)
+            table.insert(sockets, domain["tag"]:match("S(%d+)"))
+        end
+    end
+end
+
+
+if likwid.setAccessClientMode(access_mode) ~= 0 then
+    os.exit(1)
+end
+-- Without power information nothing below can work, so give up early.
+power = likwid.getPowerInfo()
+if not power then
+    print(string.format("The %s does not support reading power data",cpuinfo["name"]))
+    os.exit(1)
+end
+
+-- The CPU header is skipped with -p (use_perfctr).
+if not use_perfctr then
+    print(likwid.hline);
+    print(string.format("CPU name:\t%s",cpuinfo["osname"]))
+    print(string.format("CPU type:\t%s",cpuinfo["name"]))
+    if cpuinfo["clock"] > 0 then
+        print(string.format("CPU clock:\t%3.2f GHz",cpuinfo["clock"] *  1.E-09))
+    else
+        print(string.format("CPU clock:\t%3.2f GHz",likwid.getCpuClock() *  1.E-09))
+    end
+    print(likwid.hline)
+end
+-- Clock and turbo step overview, shown with -i or any verbosity above 0.
+if print_info or verbose > 0 then
+    if (power["turbo"]["numSteps"] > 0) then
+        print(string.format("Base clock:\t%.2f MHz", power["baseFrequency"]))
+        print(string.format("Minimal clock:\t%.2f MHz", power["minFrequency"]))
+        print("Turbo Boost Steps:")
+        for i,step in pairs(power["turbo"]["steps"]) do
+            print(string.format("C%d %.2f MHz",i-1,power["turbo"]["steps"][i]))
+        end
+    end
+    print(likwid.hline)
+end
+
+if power["hasRAPL"] == 0 then
+    print("Measuring power is not supported on this machine")
+    os.exit(1)
+end
+-- -i: print the limits of every domain in domainList whose supportInfo flag is set.
+if (print_info) then
+    for i, dname in pairs(domainList) do
+        local domain = power["domains"][dname]
+        if domain["supportInfo"] then
+            print(string.format("Info for RAPL domain %s:", dname));
+            print(string.format("Thermal Spec Power: %g Watt",domain["tdp"]*1E-6))
+            print(string.format("Minimum Power: %g Watt",domain["minPower"]*1E-6))
+            print(string.format("Maximum Power: %g Watt",domain["maxPower"]*1E-6))
+            print(string.format("Maximum Time Window: %g micro sec",domain["maxTimeWindow"]))
+            print()
+        end
+    end
+    print(likwid.hline)
+end
+-- A duration given with -s must not be shorter than the reported time unit.
+if (stethoscope) and (time_interval < power["timeUnit"]) then
+    print("Time interval too short, minimum measurement time is "..tostring(power["timeUnit"]).. " us")
+    os.exit(1)
+end
+
+local execString = ""
+if (use_perfctr) then -- -p: prefix the command with likwid-perfctr measuring the CLOCK group on all selected sockets
+    affinity = likwid.getAffinityInfo()
+    argString = ""
+    for i,socket in pairs(sockets) do
+        argString = argString .. string.format("S%u:0-%u",socket,(cputopo["numCoresPerSocket"]*cputopo["numThreadsPerCore"])-1)
+        if (i < #sockets) then
+            argString = argString .. "@"
+        end
+    end
+    execString = string.format("<PREFIX>/bin/likwid-perfctr -C %s -g CLOCK ",argString)
+end
+
+-- Empty arg table: nothing to execute, fall back to the timed (stethoscope) measurement.
+if #arg == 0 then
+    stethoscope = true
+else
+    for i=1,#arg do
+        execString = execString .. arg[i] .. " "
+    end
+end
+
+if not print_info and not print_temp then -- measurement mode; skipped for -i, -t and -f
+    for i,socket in pairs(sockets) do
+        cpu = cpulist[i]
+        for idx, dom in pairs(domainList) do
+            if (power["domains"][dom]["supportStatus"]) then before[cpu][dom] = likwid.startPower(cpu, idx) end
+        end
+    end
+    time_before = likwid.startClock()
+    if (stethoscope) then
+        if time_interval >= 1.E06 then
+            sleep(time_interval/1.E06)
+        else
+            usleep(time_interval)
+        end
+    else
+        err = os.execute(execString)
+        if (err == false) then -- NOTE(review): Lua 5.2 os.execute signals failure with nil, not false; confirm this can fire
+            print(string.format("Failed to execute %s!",execString))
+            likwid.finalize()
+            os.exit(1)
+        end
+    end
+    time_after = likwid.stopClock()
+    -- Second reading of all supported domains on the same CPUs.
+    for i,socket in pairs(sockets) do
+        cpu = cpulist[i]
+        for idx, dom in pairs(domainList) do
+            if (power["domains"][dom]["supportStatus"]) then after[cpu][dom] = likwid.stopPower(cpu, idx) end
+        end
+    end
+    runtime = likwid.getClock(time_before, time_after)
+    print(likwid.hline)
+    print(string.format("Runtime: %g s",runtime))
+    -- Every supported domain is reported, so each value is labelled with its domain name.
+    for i,socket in pairs(sockets) do
+        cpu = cpulist[i]
+        print(string.format("Measure for socket %d on CPU %d", socket,cpu ))
+        for j, dom in pairs(domainList) do
+            if power["domains"][dom]["supportStatus"] then
+                local energy = likwid.calcPower(before[cpu][dom], after[cpu][dom], 0) -- NOTE(review): always 0 here while start/stopPower get idx; confirm
+                print(string.format("Domain %s:\nEnergy consumed: %g Joules",dom,energy))
+                print(string.format("Power consumed: %g Watt",energy/runtime))
+            end
+        end
+        if i < #sockets then print("") end
+    end
+    print(likwid.hline)
+end
+
+if print_temp and (string.find(cpuinfo["features"],"TM2") ~= nil) then -- -t/-f: only if the CPU lists the TM2 feature
+    print(likwid.hline)
+    print("Current core temperatures:");
+    for i=1,cputopo["numSockets"] do
+        local tag = "S" .. tostring(i-1)
+        for _, domain in pairs(affinity["domains"]) do
+            if domain["tag"] == tag then
+                for j=1,#domain["processorList"] do
+                    local cpuid = domain["processorList"][j]
+                    likwid.initTemp(cpuid); -- per CPU; the former single call used cpulist[i] with i == nil at file scope
+                    if (fahrenheit) then
+                        print(string.format("Socket %d Core %d: %u F",i-1,cpuid, 1.8*likwid.readTemp(cpuid)+32));
+                    else
+                        print(string.format("Socket %d Core %d: %u C",i-1,cpuid, likwid.readTemp(cpuid)));
+                    end
+                end
+            end
+        end
+    end
+    print(likwid.hline)
+end
+
+likwid.finalize()
diff --git a/src/applications/likwid-setFrequencies.lua b/src/applications/likwid-setFrequencies.lua
new file mode 100644
index 0000000..202930d
--- /dev/null
+++ b/src/applications/likwid-setFrequencies.lua
@@ -0,0 +1,314 @@
+#!<PREFIX>/bin/likwid-lua
+--[[
+ * =======================================================================================
+ *
+ *      Filename:  likwid-setFrequencies.lua
+ *
+ *      Description:  An application to set the CPU frequency of CPU cores and domains.
+ *
+ *      Version:   4.0
+ *      Released:  16.6.2015
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at gmail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+]]
+
+package.path = '<PREFIX>/share/lua/?.lua;' .. package.path
+
+local likwid = require("likwid")
+
+sys_base_path = "/sys/devices/system/cpu"
+set_command = "<PREFIX>/sbin/likwid-setFreq"
+
+
+function version() -- one-line banner: tool name plus major.minor version
+    print(("likwid-setFrequencies --  Version %d.%d"):format(likwid.version, likwid.release))
+end
+
+function usage() -- help text; the governor list shown for -g is obtained from getAvailGovs()
+    version()
+    print("A tool to adjust frequencies and governors on x86 CPUs.\n")
+    print("Options:")
+    print("-h\t Help message")
+    print("-v\t Version information")
+    print("-c dom\t Likwid thread domain which to apply settings (default are all CPUs)")
+    print("\t See likwid-pin -h for details")
+    print("-g gov\t Set governor (" .. table.concat(getAvailGovs(nil), ", ") .. ") (set to ondemand if omitted)")
+    print("-f freq\t Set fixed frequency, implicitly sets userspace governor")
+    print("-p\t Print current frequencies")
+    print("-l\t List available frequencies")
+    print("-m\t List available governors")
+end
+
+function getAvailFreq(cpuid)
+    -- Returns the frequency list of CPU cpuid (default 0) as strings scaled by 1E-6, plus the first entry as turbo.
+    if (cpuid == nil) or (cpuid < 1) then cpuid = 0 end
+    local path = sys_base_path .. "/" .. string.format("cpu%d",cpuid) .. "/cpufreq/scaling_available_frequencies"
+    if verbosity == 3 then print("Reading "..path ) end
+    local fp = io.open(path)
+    local line = fp and fp:read("*l")
+    if fp then fp:close() end
+    -- Exit with a message instead of a nil-index error when the list cannot be read.
+    if not line then print("Cannot read "..path) os.exit(1) end
+    local tmp = likwid.stringsplit(line:gsub("^%s*(.-)%s*$", "%1"), " ", nil, " ")
+    local avail = {}
+    local turbo = tostring(tonumber(tmp[1])/1E6)
+    for i=2,#tmp do
+        avail[i-1] = tostring(tonumber(tmp[i])/1E6)
+        if not avail[i-1]:match("%d.%d") then
+            avail[i-1] = avail[i-1] ..".0"
+        end
+    end
+    if verbosity == 1 then
+        print(string.format("The system provides %d scaling frequencies, frequency %s is taken as turbo mode", #avail,turbo))
+    end
+    return avail, turbo
+end
+
+function getCurFreq()
+    -- Reads, for every hardware thread, scaling_cur_freq (stored as a string scaled by 1E-6)
+    -- and scaling_governor. Both returned tables are indexed by CPU id starting at 0.
+    local curFreqs = {}
+    local curGovs = {}
+    for cpuid=0,topo["numHWThreads"]-1 do
+        local base = sys_base_path .. "/" .. string.format("cpu%d",cpuid) .. "/cpufreq/"
+        local freqFile = io.open(base .. "scaling_cur_freq")
+        if verbosity == 3 then print("Reading "..base .. "scaling_cur_freq" ) end
+        local raw = freqFile:read("*l")
+        freqFile:close()
+        local scaled = tostring(tonumber(raw)/1E6)
+        -- Append ".0" when the string does not match the pattern %d.%d.
+        if not scaled:match("%d.%d") then
+            scaled = scaled ..".0"
+        end
+        curFreqs[cpuid] = scaled
+        local govFile = io.open(base .. "scaling_governor")
+        if verbosity == 3 then print("Reading "..base .. "scaling_governor" ) end
+        curGovs[cpuid] = govFile:read("*l")
+        govFile:close()
+    end
+    return curFreqs, curGovs
+end
+
+function getAvailGovs(cpuid)
+    -- Governors of CPU cpuid (default 0) without "userspace" and with "turbo" appended; {} if sysfs is unreadable.
+    if (cpuid == nil) or (cpuid < 1) then cpuid = 0 end
+    local path = sys_base_path .. "/" .. string.format("cpu%d",cpuid) .. "/cpufreq/scaling_available_governors"
+    if verbosity == 3 then print("Reading "..path ) end
+    local fp = io.open(path)
+    if not fp then return {} end
+    local line = fp:read("*l")
+    fp:close()
+    if not line then return {} end
+    local avail = likwid.stringsplit(line:gsub("^%s*(.-)%s*$", "%1"), "%s+", nil, "%s+")
+    for i=1,#avail do
+        if avail[i] == "userspace" then
+            table.remove(avail, i)
+            break
+        end
+    end
+    table.insert(avail, "turbo")
+    if verbosity == 1 then
+        print(string.format("The system provides %d scaling governors", #avail))
+    end
+    return avail
+end
+
+local function testDriver()
+    -- Returns true only if cpu0's scaling_driver file reads "acpi-cpufreq".
+    -- A missing or unreadable file counts as "not acpi-cpufreq" instead of raising an error.
+    local path = sys_base_path .. "/" .. string.format("cpu%d",0) .. "/cpufreq/scaling_driver"
+    if verbosity == 3 then print("Reading "..path ) end
+    local fp = io.open(path)
+    -- io.open returns nil when the file cannot be opened.
+    if not fp then return false end
+    local line = fp:read("*l")
+    fp:close()
+    return line == "acpi-cpufreq"
+end
+
+verbosity = 0 -- NOTE(review): no option in the getopt list below sets this, so the verbosity == 1/3 branches look unreachable
+governor = nil
+frequency = nil
+domain = nil
+printCurFreq = false
+printAvailFreq = false
+printAvailGovs = false
+-- Called without any argument: show the help text and stop.
+if #arg == 0 then
+    usage()
+    os.exit(0)
+end
+
+-- Option values are only recorded here; they are checked and applied further down.
+for opt,arg in likwid.getopt(arg, {"g:", "c:", "f:", "l", "p", "h", "v", "m", "help","version","freq:"}) do
+    if opt == "h" or opt == "help" then
+        usage()
+        os.exit(0)
+    elseif opt == "v" or opt == "version" then
+        version()
+        os.exit(0)
+    elseif (opt == "c") then
+        domain = arg
+    elseif (opt == "g") then
+        governor = arg
+    elseif opt == "f" or opt == "freq" then
+        frequency = arg
+    elseif (opt == "p") then
+        printCurFreq = true
+    elseif (opt == "l") then
+        printAvailFreq = true
+    elseif (opt == "m") then
+        printAvailGovs = true
+    elseif opt == "?" then
+        print("Invalid commandline option -"..arg)
+        os.exit(1)
+    end
+end
+if not testDriver() then
+    print("The system does not use the acpi-cpufreq driver, other drivers are not usable with likwid-setFrequencies.")
+    os.exit(1)
+end
+-- No -c given: select all hardware threads through the "N" domain.
+topo = likwid.getCpuTopology()
+affinity = likwid.getAffinityInfo()
+if not domain then
+    domain = "N:0-" .. tostring(topo["numHWThreads"]-1)
+end
+if domain:match("^[SCM]%d+$") then -- bare tag like "S0": expand to all processors of that domain
+    for i, dom in pairs(affinity["domains"]) do
+        if dom["tag"] == domain then -- literal compare; the former pattern match also accepted "S10" for "S1"
+            domain = domain..":0-"..tostring(dom["numberOfProcessors"]-1)
+        end
+    end
+end
+cpulist = {}
+numthreads, cpulist = likwid.cpustr_to_cpulist(domain)
+if verbosity == 3 then
+    print(string.format("Given CPU expression expands to %d CPU cores:", numthreads))
+    local str = tostring(cpulist[1])
+    for i=2, numthreads  do
+        str = str .. "," .. tostring(cpulist[i])
+    end
+    print(str)
+end
+
+
+-- -m: list the selectable governors.
+if printAvailGovs then
+    local governors = getAvailGovs(nil)
+    print("Available governors:")
+    print(table.concat(governors, ", "))
+end
+-- -l: list the available frequencies, turbo frequency first.
+if printAvailFreq then
+    local available, turboFreq = getAvailFreq(nil)
+    print("Available frequencies:")
+    print(turboFreq .. ", " .. table.concat(available, ", "))
+end
+-- -p: show governor and frequency of every selected CPU.
+if printCurFreq then
+    print("Current frequencies:")
+    local curFreqs, curGovs = getCurFreq()
+    for idx=1,#cpulist do
+        local cpu = cpulist[idx]
+        print(string.format("CPU %d: governor %12s frequency %5s GHz",cpu,curGovs[cpu], curFreqs[cpu]))
+    end
+end
+-- The listing options are informational only: stop before changing anything.
+if printAvailGovs or printAvailFreq or printCurFreq then
+    os.exit(0)
+end
+
+if frequency then
+    -- The request (a string) must equal the turbo frequency or one of the available frequencies.
+    local available, turboFreq = getAvailFreq(nil)
+    local isValid = (frequency == turboFreq)
+    if not isValid then
+        for _, candidate in pairs(available) do
+            if frequency == candidate then
+                isValid = true
+                break
+            end
+        end
+    end
+    if not isValid then
+        print(string.format("Frequency %s not available! Please select one of\n%s", frequency, table.concat(available, ", ")))
+        os.exit(1)
+    end
+    local target = tostring(tonumber(frequency)*1E6)
+    for idx=1,#cpulist do
+        local cmd = set_command .. " " .. tostring(cpulist[idx]) .. " " .. target
+        if governor then
+            cmd = cmd .. " " .. governor
+        end
+        if verbosity == 3 then
+            print("Execute: ".. cmd)
+        end
+        local ok = os.execute(cmd)
+        if ok == false or ok == nil then
+            print("Failed to set frequency for CPU "..tostring(cpulist[idx]))
+        end
+    end
+    -- A governor, if given, was passed along with the frequency; clearing it skips the governor section.
+    governor = nil
+end
+
+if governor then -- only reached with -g and without -f (the frequency section clears governor)
+    local govs = getAvailGovs(nil)
+    local freqs, turbo = getAvailFreq(nil)
+    local cur_freqs, cur_govs = getCurFreq()
+    local valid_gov = false
+    for k,v in pairs(govs) do
+        if (governor == v) then
+            valid_gov = true
+            break
+        end
+    end
+    if (governor == "turbo") then -- "turbo" is the pseudo governor appended by getAvailGovs()
+        valid_gov = true
+        for i=1,#cpulist do
+            cur_freqs[cpulist[i]] = turbo
+        end
+    end
+    if not valid_gov then
+        print(string.format("Governor %s not available! Please select one of\n%s", governor, table.concat(govs, ", ")))
+        os.exit(1)
+    end
+    for i=1,#cpulist do
+        if governor ~= cur_govs[cpulist[i]] then -- CPUs that already report this governor are left alone
+            local cmd = set_command .. " " .. tostring(cpulist[i]) .. " "
+            if governor == "turbo" then -- turbo: pass only the turbo frequency, no governor name
+                cmd = cmd .. tostring(tonumber(turbo)*1E6)
+            else -- otherwise: keep the CPU's current frequency and append the governor name
+                cmd = cmd .. tostring(tonumber(cur_freqs[cpulist[i]])*1E6) .. " " .. governor
+            end
+            if verbosity == 3 then
+                print("Execute: ".. cmd)
+            end
+            local err = os.execute(cmd)
+            if err == false or err == nil then
+                print("Failed to set governor for CPU "..tostring(cpulist[i]))
+            end
+        end
+    end
+end
+likwid.putAffinityInfo()
+likwid.putTopology()
+os.exit(0)
diff --git a/src/applications/likwid-topology.c b/src/applications/likwid-topology.c
deleted file mode 100644
index 7ba0e33..0000000
--- a/src/applications/likwid-topology.c
+++ /dev/null
@@ -1,509 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  likwid-topology.c
- *
- *      Description:  A application to determine the thread and cache topology
- *                    on x86 processors.
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-/* #####   HEADER FILE INCLUDES   ######################################### */
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <sched.h>
-#include <sys/types.h>
-#include <unistd.h>
-#include <ctype.h>
-
-#include <types.h>
-#include <error.h>
-#include <cpuid.h>
-#include <timer.h>
-#include <affinity.h>
-#include <numa.h>
-#include <cpuFeatures.h>
-#include <tree.h>
-#include <asciiBoxes.h>
-#include <strUtil.h>
-
-/* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
-
-#define HELP_MSG \
-    fprintf(OUTSTREAM, "\nlikwid-topology --  Version %d.%d \n\n",VERSION,RELEASE); \
-    fprintf(OUTSTREAM, "A tool to print the thread and cache topology on x86 CPUs.\n"); \
-    fprintf(OUTSTREAM, "Options:\n"); \
-    fprintf(OUTSTREAM, "-h\t Help message\n"); \
-    fprintf(OUTSTREAM, "-v\t Version information\n"); \
-    fprintf(OUTSTREAM, "-c\t list cache information\n"); \
-    fprintf(OUTSTREAM, "-C\t measure processor clock\n"); \
-    fprintf(OUTSTREAM, "-o\t Store output to file, with output conversion according to file suffix\n"); \
-    fprintf(OUTSTREAM, "\t Conversion scripts can be supplied in %s\n",TOSTRING(LIKWIDFILTERPATH)); \
-    fprintf(OUTSTREAM, "-g\t graphical output\n\n"); \
-    fflush(OUTSTREAM);
-
-#define VERSION_MSG \
-    fprintf(OUTSTREAM, "likwid-topology  %d.%d \n\n",VERSION,RELEASE); \
-    fflush(OUTSTREAM);
-
-/* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
-
-int main (int argc, char** argv)
-{
-    int optGraphical = 0;
-    int optCaches = 0;
-    int optClock = 0;
-    int c;
-    int tmp;
-    TreeNode* socketNode;
-    TreeNode* coreNode;
-    TreeNode* threadNode;
-    BoxContainer* container;
-    bstring  argString;
-    bstring  filterScript = bfromcstr("NO");
-    FILE* OUTSTREAM = stdout;
-
-    while ((c = getopt (argc, argv, "hvcCgo:")) != -1)
-    {
-        switch (c)
-        {
-            case 'h':
-                HELP_MSG;
-                exit (EXIT_SUCCESS);
-            case 'v':
-                VERSION_MSG;
-                exit (EXIT_SUCCESS);
-            case 'g':
-                optGraphical = 1;
-                break;
-            case 'c':
-                optCaches = 1;
-                break;
-            case 'C':
-                optClock = 1;
-                break;
-            case 'o':
-                if (! (argString = bSecureInput(200,optarg)))
-                {
-                    fprintf(stderr, "Failed to read argument string!\n");
-                }
-
-                OUTSTREAM = bstr_to_outstream(argString, filterScript);
-
-                if(!OUTSTREAM)
-                {
-                    fprintf(stderr, "Failed to parse out file pattern.\n");
-                }
-                break;
-            case '?':
-                if (isprint (optopt))
-                {
-                    fprintf (stderr, "Unknown option `-%c'.\n", optopt);
-                }
-                else
-                {
-                    fprintf (stderr,
-                            "Unknown option character `\\x%x'.\n",
-                            optopt);
-                }
-                return EXIT_FAILURE;
-            default:
-                HELP_MSG;
-                exit (EXIT_SUCCESS);
-        }
-    }
-
-    if (cpuid_init() == EXIT_FAILURE)
-    {
-        ERROR_PLAIN_PRINT(Unsupported processor!);
-    }
-    affinity_init();
-    numa_init();
-
-    fprintf(OUTSTREAM, HLINE);
-    fprintf(OUTSTREAM, "CPU type:\t%s\n",cpuid_info.name);
-
-    if (optClock)
-    {
-        timer_init();
-        fprintf(OUTSTREAM, "CPU clock:\t%3.2f GHz\n",  (float) timer_getCpuClock() * 1.E-09);
-    }
-
-    /*----------------------------------------------------------------------
-     *  Thread Topology
-     *----------------------------------------------------------------------*/
-    fprintf(OUTSTREAM, SLINE);
-    fprintf(OUTSTREAM, "Hardware Thread Topology\n");
-    fprintf(OUTSTREAM, SLINE);
-    fprintf(OUTSTREAM, "Sockets:\t%u \n", cpuid_topology.numSockets);
-    fprintf(OUTSTREAM, "Cores per socket:\t%u \n", cpuid_topology.numCoresPerSocket);
-    fprintf(OUTSTREAM, "Threads per core:\t%u \n", cpuid_topology.numThreadsPerCore);
-    fprintf(OUTSTREAM, HLINE);
-    fprintf(OUTSTREAM, "HWThread\tThread\t\tCore\t\tSocket\n");
-
-    for ( uint32_t i=0; i <  cpuid_topology.numHWThreads; i++)
-    {
-        fprintf(OUTSTREAM, "%d\t\t%u\t\t%u\t\t%u\n",i
-                ,cpuid_topology.threadPool[i].threadId
-                ,cpuid_topology.threadPool[i].coreId
-                ,cpuid_topology.threadPool[i].packageId);
-    }
-    fprintf(OUTSTREAM, HLINE);
-
-    socketNode = tree_getChildNode(cpuid_topology.topologyTree);
-    while (socketNode != NULL)
-    {
-        fprintf(OUTSTREAM, "Socket %d: ( ",socketNode->id);
-        coreNode = tree_getChildNode(socketNode);
-
-        while (coreNode != NULL)
-        {
-            threadNode = tree_getChildNode(coreNode);
-
-            while (threadNode != NULL)
-            {
-                fprintf(OUTSTREAM, "%d ",threadNode->id);
-                threadNode = tree_getNextNode(threadNode);
-            }
-            coreNode = tree_getNextNode(coreNode);
-        }
-        socketNode = tree_getNextNode(socketNode);
-        fprintf(OUTSTREAM, ")\n");
-    }
-    fprintf(OUTSTREAM, HLINE"\n");
-    fflush(OUTSTREAM);
-
-    /*----------------------------------------------------------------------
-     *  Cache Topology
-     *----------------------------------------------------------------------*/
-    fprintf(OUTSTREAM, SLINE);
-    fprintf(OUTSTREAM, "Cache Topology\n");
-    fprintf(OUTSTREAM, SLINE);
-
-    for ( uint32_t i=0; i <  cpuid_topology.numCacheLevels; i++)
-    {
-        if (cpuid_topology.cacheLevels[i].type != INSTRUCTIONCACHE)
-        {
-            fprintf(OUTSTREAM, "Level:\t%d\n",cpuid_topology.cacheLevels[i].level);
-            if (cpuid_topology.cacheLevels[i].size < 1048576)
-            {
-                fprintf(OUTSTREAM, "Size:\t%d kB\n",
-                        cpuid_topology.cacheLevels[i].size/1024);
-            }
-            else
-            {
-                fprintf(OUTSTREAM, "Size:\t%d MB\n",
-                        cpuid_topology.cacheLevels[i].size/1048576);
-            }
-
-            if( optCaches)
-            {
-                switch (cpuid_topology.cacheLevels[i].type) {
-                    case DATACACHE:
-                        fprintf(OUTSTREAM, "Type:\tData cache\n");
-                        break;
-
-                    case INSTRUCTIONCACHE:
-                        fprintf(OUTSTREAM, "Type:\tInstruction cache\n");
-                        break;
-
-                    case UNIFIEDCACHE:
-                        fprintf(OUTSTREAM, "Type:\tUnified cache\n");
-                        break;
-                    default:
-                        /* make the compiler happy */
-                        break;
-                }
-                fprintf(OUTSTREAM, "Associativity:\t%d\n",
-                        cpuid_topology.cacheLevels[i].associativity);
-                fprintf(OUTSTREAM, "Number of sets:\t%d\n",
-                        cpuid_topology.cacheLevels[i].sets);
-                fprintf(OUTSTREAM, "Cache line size:\t%d\n",
-                        cpuid_topology.cacheLevels[i].lineSize);
-                if(cpuid_topology.cacheLevels[i].inclusive)
-                {
-                    fprintf(OUTSTREAM, "Non Inclusive cache\n");
-                }
-                else
-                {
-                    fprintf(OUTSTREAM, "Inclusive cache\n");
-                }
-                fprintf(OUTSTREAM, "Shared among %d threads\n",
-                        cpuid_topology.cacheLevels[i].threads);
-            }
-            fprintf(OUTSTREAM, "Cache groups:\t");
-            tmp = cpuid_topology.cacheLevels[i].threads;
-            socketNode = tree_getChildNode(cpuid_topology.topologyTree);
-            fprintf(OUTSTREAM, "( ");
-            while (socketNode != NULL)
-            {
-                coreNode = tree_getChildNode(socketNode);
-
-                while (coreNode != NULL)
-                {
-                    threadNode = tree_getChildNode(coreNode);
-
-                    while (threadNode != NULL)
-                    {
-
-                        if (tmp)
-                        {
-                            fprintf(OUTSTREAM, "%d ",threadNode->id);
-                            tmp--;
-                        }
-                        else
-                        {
-                            fprintf(OUTSTREAM, ") ( %d ",threadNode->id);
-                            tmp = cpuid_topology.cacheLevels[i].threads;
-                            tmp--;
-                        }
-
-                        threadNode = tree_getNextNode(threadNode);
-                    }
-                    coreNode = tree_getNextNode(coreNode);
-                }
-                socketNode = tree_getNextNode(socketNode);
-            }
-            fprintf(OUTSTREAM, ")\n");
-
-            fprintf(OUTSTREAM, HLINE);
-        }
-    }
-
-    fprintf(OUTSTREAM, "\n");
-    fflush(OUTSTREAM);
-
-    /*----------------------------------------------------------------------
-     *  NUMA Topology
-     *----------------------------------------------------------------------*/
-    fprintf(OUTSTREAM, SLINE);
-    fprintf(OUTSTREAM, "NUMA Topology\n");
-    fprintf(OUTSTREAM, SLINE);
-
-    if (numa_init() < 0)
-    {
-        fprintf(OUTSTREAM, "NUMA is not supported on this node!\n");
-    }
-    else
-    {
-        fprintf(OUTSTREAM, "NUMA domains: %d \n",numa_info.numberOfNodes);
-        fprintf(OUTSTREAM, HLINE);
-
-        for ( uint32_t i = 0; i < numa_info.numberOfNodes; i++)
-        {
-            fprintf(OUTSTREAM, "Domain %d:\n", i);
-            fprintf(OUTSTREAM, "Processors: ");
-
-            for ( int j = 0; j < numa_info.nodes[i].numberOfProcessors; j++)
-            {
-                fprintf(OUTSTREAM, " %d",numa_info.nodes[i].processors[j]);
-            }
-            fprintf(OUTSTREAM, "\n");
-
-            fprintf(OUTSTREAM, "Relative distance to nodes: ");
-
-            for ( int j = 0; j < numa_info.nodes[i].numberOfDistances; j++)
-            {
-                fprintf(OUTSTREAM, " %d",numa_info.nodes[i].distances[j]);
-            }
-            fprintf(OUTSTREAM, "\n");
-
-            fprintf(OUTSTREAM, "Memory: %g MB free of total %g MB\n",
-                    numa_info.nodes[i].freeMemory/1024.0, numa_info.nodes[i].totalMemory/1024.0);
-            fprintf(OUTSTREAM, HLINE);
-        }
-    }
-    fprintf(OUTSTREAM, "\n");
-    fflush(OUTSTREAM);
-
-    /*----------------------------------------------------------------------
-     *  Graphical topology
-     *----------------------------------------------------------------------*/
-    if(optGraphical)
-    {
-        int j;
-        bstring  boxLabel = bfromcstr("0");
-
-        fprintf(OUTSTREAM, SLINE);
-        fprintf(OUTSTREAM, "Graphical:\n");
-        fprintf(OUTSTREAM, SLINE);
-
-        /* Allocate without instruction cache */
-        if ( cpuid_info.family == P6_FAMILY || cpuid_info.family == MIC_FAMILY )
-        {
-            container = asciiBoxes_allocateContainer(
-                    cpuid_topology.numCacheLevels,
-                    cpuid_topology.numCoresPerSocket);
-        }
-        else
-        {
-            container = asciiBoxes_allocateContainer(
-                    cpuid_topology.numCacheLevels+1,
-                    cpuid_topology.numCoresPerSocket);
-        }
-
-        socketNode = tree_getChildNode(cpuid_topology.topologyTree);
-        while (socketNode != NULL)
-        {
-            fprintf(OUTSTREAM, "Socket %d:\n",socketNode->id);
-            j=0;
-            coreNode = tree_getChildNode(socketNode);
-
-            /* add threads */
-            while (coreNode != NULL)
-            {
-                threadNode = tree_getChildNode(coreNode);
-                tmp =0;
-
-                while (threadNode != NULL)
-                {
-                    if (tmp > 0)
-                    {
-                        bformata(boxLabel,"  %d", threadNode->id);
-                    }
-                    else
-                    {
-                        boxLabel = bformat("%d",threadNode->id);
-                    }
-                    tmp++;
-                    threadNode = tree_getNextNode(threadNode);
-                }
-                asciiBoxes_addBox(container, 0, j, boxLabel);
-                j++;
-                coreNode = tree_getNextNode(coreNode);
-            }
-
-            /* add caches */
-            {
-                int columnCursor=0;
-                int lineCursor=1;
-                uint32_t sharedCores;
-                int numCachesPerLevel;
-                int cacheWidth;
-
-                for ( uint32_t i=0; i < cpuid_topology.numCacheLevels; i++ )
-                {
-                    sharedCores = cpuid_topology.cacheLevels[i].threads /
-                        cpuid_topology.numThreadsPerCore;
-
-                    if (cpuid_topology.cacheLevels[i].type != INSTRUCTIONCACHE)
-                    {
-                        if ( sharedCores > cpuid_topology.numCoresPerSocket )
-                        {
-                            numCachesPerLevel = 1;
-                        }
-                        else
-                        {
-                            numCachesPerLevel =
-                                cpuid_topology.numCoresPerSocket/sharedCores;
-                        }
-
-                        columnCursor=0;
-                        for ( j=0; j < numCachesPerLevel; j++ )
-                        {
-                            if (cpuid_topology.cacheLevels[i].size < 1048576)
-                            {
-                                boxLabel = bformat("%dkB",
-                                        cpuid_topology.cacheLevels[i].size/1024);
-                            }
-                            else
-                            {
-                                boxLabel = bformat("%dMB",
-                                        cpuid_topology.cacheLevels[i].size/1048576);
-                            }
-
-                            if (sharedCores > 1)
-                            {
-                                if (sharedCores > cpuid_topology.numCoresPerSocket)
-                                {
-                                    cacheWidth = cpuid_topology.numCoresPerSocket-1;
-                                }
-                                else
-                                {
-                                    cacheWidth = sharedCores-1;
-                                }
-                                asciiBoxes_addJoinedBox(
-                                        container,
-                                        lineCursor,
-                                        columnCursor,
-                                        columnCursor+cacheWidth,
-                                        boxLabel);
-
-                                columnCursor += sharedCores;
-                            }
-                            else
-                            {
-                                asciiBoxes_addBox(
-                                        container,
-                                        lineCursor,
-                                        columnCursor,
-                                        boxLabel);
-
-                                columnCursor++;
-                            }
-
-                        }
-                        lineCursor++;
-                    }
-                }
-            }
-
-            asciiBoxes_print(OUTSTREAM, container);
-            socketNode = tree_getNextNode(socketNode);
-        }
-        bdestroy(boxLabel);
-    }
-
-    fflush(OUTSTREAM);
-
-    /* call filterscript if specified */
-    if (!biseqcstr(filterScript,"NO"))
-    {
-        struct bstrList* tokens;
-        tokens = bsplit(filterScript,' ');
-        if (access(bdata(tokens->entry[0]), F_OK))
-        {
-            fprintf(stderr, "Cannot find filter %s!\n", bdata(tokens->entry[0]));
-            bstrListDestroy(tokens);
-            exit(EXIT_FAILURE);
-        }
-        if (access(bdata(tokens->entry[0]), X_OK))
-        {
-            fprintf(stderr, "Cannot execute filter %s!\n", bdata(tokens->entry[0]));
-            bstrListDestroy(tokens);
-            exit(EXIT_FAILURE);
-        }
-        bstrListDestroy(tokens);
-        bcatcstr(filterScript, " topology");
-
-        if (system(bdata(filterScript)) == EOF)
-        {
-            fprintf(stderr, "Failed to execute filter %s!\n", bdata(filterScript));
-            exit(EXIT_FAILURE);
-        }
-    }
-
-    return EXIT_SUCCESS;
-}
-
diff --git a/src/applications/likwid-topology.lua b/src/applications/likwid-topology.lua
new file mode 100644
index 0000000..ee7d1fb
--- /dev/null
+++ b/src/applications/likwid-topology.lua
@@ -0,0 +1,383 @@
+#!<PREFIX>/bin/likwid-lua
+--[[
+ * =======================================================================================
+ *
+ *      Filename:  likwid-topology.lua
+ *
+ *      Description:  An application to determine the thread and cache topology
+ *                    on x86 processors.
+ *
+ *      Version:   4.0
+ *      Released:  16.6.2015
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at gmail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+]]
+package.path = '<PREFIX>/share/lua/?.lua;' .. package.path
+
+local likwid = require("likwid")
+stdout_print = print
+
+function version()
+    print(string.format("likwid-topology --  Version %d.%d",likwid.version,likwid.release))
+end
+
+function usage()
+    version()
+    print("A tool to print the thread and cache topology on x86 CPUs.\n")
+    print("Options:")
+    print("-h, --help\t\t Help message")
+    print("-v, --version\t\t Version information")
+    print("-V, --verbose <level>\t Set verbosity")
+    print("-c, --caches\t\t List cache information")
+    print("-C, --clock\t\t Measure processor clock")
+    print("-O\t\t\t CSV output")
+    print("-o, --output <file>\t Store output to file. (Optional: Apply text filter)")
+    print("-g\t\t\t Graphical output")
+end
+
+print_caches = false
+print_graphical = false
+measure_clock = false
+outfile = nil
+output_csv = {}
+
+for opt,arg in likwid.getopt(arg, {"h","v","c","C","g","o:","V:","O","help","version","verbose:","clock","caches","output:"}) do
+    -- A value that itself begins with "-" is taken as a sign that the real option value was omitted.
+    if type(arg) == "string" and arg:sub(1,1) == "-" then
+        print(string.format("Argmument %s to option -%s starts with invalid character -.", arg, opt))
+        print("Did you forget an argument to an option?")
+        os.exit(1)
+    end
+    if opt == "h" or opt == "help" then
+        usage()
+        os.exit(0)
+    elseif opt == "v" or opt == "version" then
+        version()
+        os.exit(0)
+    elseif opt == "V" or opt == "verbose" then
+        -- tonumber() yields nil for non-numeric input; check that before comparing to avoid a runtime error.
+        local level = tonumber(arg)
+        if level and level >= 0 and level <= 3 then
+            likwid.setVerbosity(level)
+        else
+            print("Verbosity level not valid. Must be between 0 (only errors) and 3 (developer output)")
+        end
+    elseif opt == "c" or opt == "caches" then
+        print_caches = true
+    elseif opt == "C" or opt == "clock" then
+        measure_clock = true
+    elseif opt == "g" then
+        print_graphical = true
+    elseif opt == "O" then
+        print_csv = true
+    elseif opt == "o" or opt == "output" then
+        outfile = arg:gsub("%%h", likwid.gethostname()) -- expand the %h placeholder first so every later name uses it
+        local suffix = string.match(outfile, ".-[^\\/]-%.?([^%.\\/]*)$")
+        if suffix ~= "txt" then
+            print_csv = true -- any suffix other than "txt" switches to CSV rows
+        end
+        io.output((outfile:gsub("."..suffix,".tmp",1))) -- same temp-file name the final output stage derives from outfile
+        print = function(...) for k,v in pairs({...}) do io.write(v .. "\n") end end
+    elseif opt == "?" then
+        print("Invalid commandline option -"..arg)
+        os.exit(1)
+    end
+end
+
+local config = likwid.getConfiguration()
+local cpuinfo = likwid.getCpuInfo()
+local cputopo = likwid.getCpuTopology()
+local numainfo = likwid.getNumaInfo()
+local affinity = likwid.getAffinityInfo()
+
+
+table.insert(output_csv, likwid.hline)
+local lines = 3
+if measure_clock then
+    lines = 4
+end
+table.insert(output_csv, "STRUCT,Info,"..tostring(lines))
+table.insert(output_csv, string.format("CPU name:\t%s",cpuinfo["osname"]))
+table.insert(output_csv, string.format("CPU type:\t%s",cpuinfo["name"]))
+table.insert(output_csv, string.format("CPU stepping:\t%s",cpuinfo["stepping"]))
+if (measure_clock) then
+    if cpuinfo["clock"] == 0 then
+        table.insert(output_csv, string.format("CPU clock:\t%3.2f GHz", likwid.getCpuClock() * 1.E-09))
+    else
+        table.insert(output_csv, string.format("CPU clock:\t%3.2f GHz", cpuinfo["clock"] * 1.E-09))
+    end
+end
+
+table.insert(output_csv, likwid.sline)
+table.insert(output_csv, "STRUCT,Hardware Thread Topology,3")
+table.insert(output_csv, "Hardware Thread Topology")
+table.insert(output_csv, likwid.sline)
+table.insert(output_csv, string.format("Sockets:\t\t%u",cputopo["numSockets"]))
+table.insert(output_csv, string.format("Cores per socket:\t%u",cputopo["numCoresPerSocket"]))
+table.insert(output_csv, string.format("Threads per core:\t%u",cputopo["numThreadsPerCore"]))
+table.insert(output_csv, likwid.hline)
+table.insert(output_csv, "TABLE,Topology,"..tostring(cputopo["numHWThreads"]))
+table.insert(output_csv, "HWThread\tThread\t\tCore\t\tSocket\t\tAvailable")
+
+for cntr=0,cputopo["numHWThreads"]-1 do
+    if cputopo["threadPool"][cntr]["inCpuSet"] then
+        table.insert(output_csv, string.format("%d\t\t%u\t\t%u\t\t%u\t\t*",cntr,
+                            cputopo["threadPool"][cntr]["threadId"],
+                            cputopo["threadPool"][cntr]["coreId"],
+                            cputopo["threadPool"][cntr]["packageId"]))
+    else
+        table.insert(output_csv, string.format("%d\t\t%u\t\t%u\t\t%u",cntr,
+                            cputopo["threadPool"][cntr]["threadId"],
+                            cputopo["threadPool"][cntr]["coreId"],
+                            cputopo["threadPool"][cntr]["packageId"]))
+    end
+end
+table.insert(output_csv, likwid.hline)
+
+table.insert(output_csv, "STRUCT,Sockets,"..tostring(cputopo["numSockets"]))
+for socket=0,cputopo["numSockets"]-1 do
+    csv_str = string.format("Socket %d:\t\t( ",cputopo["topologyTree"][socket]["ID"])
+    for core=0,cputopo["numCoresPerSocket"]-1 do
+        for thread=0, cputopo["numThreadsPerCore"]-1 do
+            csv_str = csv_str ..tostring(cputopo["topologyTree"][socket]["Childs"][core]["Childs"][thread]).. ","
+        end
+    end
+    table.insert(output_csv, csv_str:sub(1,#csv_str-1).." )")
+end
+
+table.insert(output_csv, likwid.hline)
+
+
+table.insert(output_csv, likwid.sline)
+table.insert(output_csv, "Cache Topology")
+table.insert(output_csv, likwid.sline)
+
+for level=1,cputopo["numCacheLevels"] do
+    if (cputopo["cacheLevels"][level]["type"] ~= "INSTRUCTIONCACHE") then
+        lines = 3
+        if print_caches then lines = 9 end
+        table.insert(output_csv, string.format("STRUCT,Cache Topology L%d,%d", cputopo["cacheLevels"][level]["level"],lines))
+        table.insert(output_csv, string.format("Level:\t\t\t%d",cputopo["cacheLevels"][level]["level"]))
+        if (cputopo["cacheLevels"][level]["size"] < 1048576) then
+            table.insert(output_csv, string.format("Size:\t\t\t%d kB",cputopo["cacheLevels"][level]["size"]/1024))
+        else
+            table.insert(output_csv, string.format("Size:\t\t\t%d MB",cputopo["cacheLevels"][level]["size"]/1048576))
+        end
+        
+        if (print_caches) then
+            if (cputopo["cacheLevels"][level]["type"] == "DATACACHE") then
+                table.insert(output_csv, "Type:\t\t\tData cache")
+            elseif (cputopo["cacheLevels"][level]["type"] == "UNIFIEDCACHE") then
+                table.insert(output_csv, "Type:\t\t\tUnified cache")
+            end
+
+            table.insert(output_csv, string.format("Associativity:\t\t%d",cputopo["cacheLevels"][level]["associativity"]))
+            table.insert(output_csv, string.format("Number of sets:\t\t%d",cputopo["cacheLevels"][level]["sets"]))
+            table.insert(output_csv, string.format("Cache line size:\t%d",cputopo["cacheLevels"][level]["lineSize"]))
+            
+            if (cputopo["cacheLevels"][level]["inclusive"] == 0) then
+                table.insert(output_csv, "Cache type:\t\tNon Inclusive")
+            else
+                table.insert(output_csv, "Cache type:\t\tInclusive")
+            end
+            table.insert(output_csv, string.format("Shared by threads:\t%d",cputopo["cacheLevels"][level]["threads"]))
+        end
+        local threads = cputopo["cacheLevels"][level]["threads"]
+        str = "Cache groups:\t\t( "
+        for socket=0,cputopo["numSockets"]-1 do
+            for core=0,cputopo["numCoresPerSocket"]-1 do
+                for cpu=0,cputopo["numThreadsPerCore"]-1 do
+                    if (threads ~= 0) then
+                        str = str .. cputopo["topologyTree"][socket]["Childs"][core]["Childs"][cpu] .. " "
+                        threads = threads - 1
+                    else
+                        str = str .. string.format(") ( %d ",cputopo["topologyTree"][socket]["Childs"][core]["Childs"][cpu])
+                        threads = cputopo["cacheLevels"][level]["threads"]
+                        threads = threads - 1
+                    end
+                end
+            end
+        end
+        str = str .. ")"
+        table.insert(output_csv, str)
+        table.insert(output_csv, likwid.hline)
+    end
+end
+
+
+table.insert(output_csv, likwid.sline)
+table.insert(output_csv, "NUMA Topology")
+table.insert(output_csv, likwid.sline)
+
+if (numainfo["numberOfNodes"] == 0) then
+    table.insert(output_csv, "No NUMA")
+else
+    table.insert(output_csv, string.format("NUMA domains:\t\t%d",numainfo["numberOfNodes"]))
+    table.insert(output_csv, likwid.hline)
+    for node=1,numainfo["numberOfNodes"] do
+        table.insert(output_csv, string.format("STRUCT,NUMA Topology %d,5",numainfo["nodes"][node]["id"]))
+        table.insert(output_csv, string.format("Domain:\t\t\t%d",numainfo["nodes"][node]["id"]))
+        csv_str = "Processors:\t\t( "
+        for cpu=1,numainfo["nodes"][node]["numberOfProcessors"] do
+            csv_str = csv_str .. numainfo["nodes"][node]["processors"][cpu] .. ","
+        end
+        table.insert(output_csv, csv_str:sub(1,#csv_str-1).. " )")
+        csv_str = "Distances:\t\t"
+        for cpu=1,numainfo["nodes"][node]["numberOfDistances"] do
+            csv_str = csv_str .. numainfo["nodes"][node]["distances"][cpu][cpu-1] .. ","
+        end
+        table.insert(output_csv, csv_str:sub(1,#csv_str-1))
+        table.insert(output_csv, string.format("Free memory:\t\t%g MB",tonumber(numainfo["nodes"][node]["freeMemory"]/1024.0)))
+        table.insert(output_csv, string.format("Total memory:\t\t%g MB",tonumber(numainfo["nodes"][node]["totalMemory"]/1024.0)))
+        table.insert(output_csv, likwid.hline)
+    end
+end
+
+
+
+if print_csv then -- CSV mode: convert tab-aligned text to comma-separated rows and pad all rows to equal width
+    longest_line = 0
+    for i=#output_csv,1,-1 do -- walk backwards so table.remove() does not shift rows still to be visited
+        output_csv[i] = output_csv[i]:gsub("[\t]+",",")
+        output_csv[i] = output_csv[i]:gsub("%( ","")
+        output_csv[i] = output_csv[i]:gsub(" %)[%s]*",",")
+        output_csv[i] = output_csv[i]:gsub(",$","")
+        if  output_csv[i]:sub(1,1) == "*" or
+            output_csv[i]:sub(1,1) == "-" or
+            output_csv[i]:match("^Hardware Thread Topology") or
+            output_csv[i]:match("^Cache Topology") or
+            output_csv[i]:match("^NUMA Topology") then
+            table.remove(output_csv,i) -- separator and heading lines carry no data
+        else
+            local tmpList = likwid.stringsplit(output_csv[i],",") -- measure surviving rows only; after a removal output_csv[i] can be nil
+            if #tmpList > longest_line then longest_line = #tmpList end
+        end
+    end
+    for i=1,#output_csv do
+        local tmpList = likwid.stringsplit(output_csv[i],",")
+        if #tmpList < longest_line then
+            output_csv[i] = output_csv[i]..string.rep(",",longest_line-#tmpList)
+        end
+    end
+else -- text mode: drop the STRUCT/TABLE marker rows and turn remaining commas into spaces
+    for i=#output_csv,1,-1 do
+        output_csv[i] = output_csv[i]:gsub(","," ")
+        if output_csv[i]:match("^TABLE") or
+           output_csv[i]:match("^STRUCT") then
+            table.remove(output_csv,i)
+        end
+    end
+end
+
+for _,line in pairs(output_csv) do print(line) end
+
+if print_graphical and not print_csv then -- ASCII box drawing per socket; skipped whenever CSV output is active
+    print("\n")
+    print(likwid.sline)
+    print("Graphical Topology")
+    print(likwid.sline)
+    for socket=0,cputopo["numSockets"]-1 do
+        print(string.format("Socket %d:",cputopo["topologyTree"][socket]["ID"]))
+        container = {} -- a fresh (global) box container is started for every socket
+        for core=0,cputopo["numCoresPerSocket"]-1 do
+            local tmpString = "" -- label listing the hardware thread entries of this core
+            for thread=0,cputopo["numThreadsPerCore"]-1 do
+                if thread == 0 then
+                    tmpString = tmpString .. tostring(cputopo["topologyTree"][socket]["Childs"][core]["Childs"][thread])
+                else
+                    tmpString = tmpString .. " " .. tostring(cputopo["topologyTree"][socket]["Childs"][core]["Childs"][thread]).. " "
+                end
+            end
+            likwid.addSimpleAsciiBox(container, 1, core+1, tmpString) -- thread boxes go to position (1, core+1)
+        end
+        
+        local columnCursor = 1
+        local lineCursor = 2 -- cache boxes start one line below the thread boxes
+        for cache=1,cputopo["numCacheLevels"] do
+            if cputopo["cacheLevels"][cache]["type"] ~= "INSTRUCTIONCACHE" then -- instruction caches are not drawn
+                local cachesAtCurLevel = 0
+                local sharedCores = cputopo["cacheLevels"][cache]["threads"]/cputopo["numThreadsPerCore"] -- cores sharing one cache instance
+                if sharedCores >= cputopo["numCoresPerSocket"] then
+                    cachesAtCurLevel = 1
+                else
+                    cachesAtCurLevel = cputopo["numCoresPerSocket"]/sharedCores
+                end
+                columnCursor = 1
+                for cachesAtLevel=1,cachesAtCurLevel do
+                    local tmpString = "" -- size label, in kB below 1 MiB and in MB otherwise
+                    local cacheWidth = 0
+                    if cputopo["cacheLevels"][cache]["size"] < 1048576 then
+                        tmpString = string.format("%dkB", cputopo["cacheLevels"][cache]["size"]/1024)
+                    else
+                        tmpString = string.format("%dMB", cputopo["cacheLevels"][cache]["size"]/1048576)
+                    end
+                    if sharedCores > 1 then
+                        if sharedCores > cputopo["numCoresPerSocket"] then
+                            cacheWidth = sharedCores -- NOTE(review): the removed C version used numCoresPerSocket-1 here; confirm which width addJoinedAsciiBox expects
+                        else
+                            cacheWidth = sharedCores - 1
+                        end
+                        likwid.addJoinedAsciiBox(container, lineCursor, columnCursor,columnCursor + cacheWidth, tmpString)
+                        columnCursor = columnCursor + cacheWidth -- NOTE(review): the removed C version advanced by sharedCores; verify neighbouring joined boxes do not share a column
+                    else
+                        likwid.addSimpleAsciiBox(container, lineCursor, columnCursor, tmpString)
+                        columnCursor = columnCursor + 1
+                    end
+                end
+                lineCursor = lineCursor + 1 -- each drawn cache level gets its own line
+            end
+        end
+        likwid.printAsciiBox(container);
+    end
+end
+
+if outfile then -- post-process the temporary file written through io.write: filter it or move it to its final name
+    local suffix = string.match(outfile, ".-[^\\/]-%.?([^%.\\/]*)$")
+    local command = "<PREFIX>/share/likwid/filter/" .. suffix -- the filter script is selected by the file suffix
+    local tmpfile = outfile:gsub("."..suffix,".tmp",1)
+    io.output():flush() -- push buffered output to disk before the file is renamed or read by a filter
+    if not likwid.access(command,"x") then -- NOTE(review): relies on likwid.access returning a false value on failure; confirm
+        stdout_print("Cannot find filter script, save output in CSV format to file "..outfile)
+        os.rename(tmpfile, outfile)
+    elseif suffix ~= "txt" and suffix ~= "csv" then
+        command = command .." ".. tmpfile .. " topology"
+        local f = io.popen(command) -- no assert(): a failed popen must reach the fallback branch below
+        if f ~= nil then
+            local o = f:read("*a")
+            f:close() -- release the pipe once the filter output has been read
+            if o:len() > 0 then -- any output from the filter is reported as a failure
+                stdout_print(string.format("Failed to executed filter script %s.",command))
+            end
+        else
+            stdout_print("Failed to call filter script, save output in CSV format to file "..outfile)
+            os.rename(tmpfile, outfile) -- keep the unfiltered data rather than losing it
+        end
+    else
+        os.rename(tmpfile, outfile)
+        os.remove(tmpfile)
+    end
+end
+
+likwid.putAffinityInfo()
+likwid.putNumaInfo()
+likwid.putTopology()
+likwid.putConfiguration()
+os.exit(0)
diff --git a/src/applications/likwid.lua b/src/applications/likwid.lua
new file mode 100644
index 0000000..efdc95c
--- /dev/null
+++ b/src/applications/likwid.lua
@@ -0,0 +1,1532 @@
+--[[
+ * =======================================================================================
+ *
+ *      Filename:  likwid.lua
+ *
+ *      Description:  Lua LIKWID interface library
+ *
+ *      Version:   4.0
+ *      Released:  16.6.2015
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at gmail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+]]
+
+-- Module table; everything exported by this file is stored in it.
+local likwid = {}
+-- Search <PREFIX>/lib first when loading C modules.
+-- NOTE(review): <PREFIX>, <VERSION> and <RELEASE> look like placeholders that are
+-- substituted at build/install time -- confirm against the Makefile.
+package.cpath = '<PREFIX>/lib/?.so;' .. package.cpath
+-- Presumably registers the global likwid_* functions (and sleep/usleep) used below;
+-- none of them is defined in this file.
+require("liblikwid")
+require("math")
+
+-- Folder containing the performance group definitions (one sub folder per architecture,
+-- see get_groups()).
+likwid.groupfolder = "<PREFIX>/share/likwid/perfgroups"
+
+likwid.version = <VERSION>
+likwid.release = <RELEASE>
+likwid.pinlibpath = "<PREFIX>/lib/liblikwidpin.so"
+-- 80 character wide separator lines for text output.
+likwid.dline = string.rep("=",80)
+likwid.hline =  string.rep("-",80)
+likwid.sline = string.rep("*",80)
+
+
+
+-- Re-export the global functions under the likwid table.
+likwid.getConfiguration = likwid_getConfiguration
+likwid.putConfiguration = likwid_putConfiguration
+likwid.setAccessClientMode = likwid_setAccessClientMode
+likwid.init = likwid_init
+likwid.addEventSet = likwid_addEventSet
+likwid.setupCounters = likwid_setupCounters
+likwid.startCounters = likwid_startCounters
+likwid.stopCounters = likwid_stopCounters
+likwid.readCounters = likwid_readCounters
+likwid.switchGroup = likwid_switchGroup
+likwid.finalize = likwid_finalize
+likwid.getEventsAndCounters = likwid_getEventsAndCounters
+likwid.getResult = likwid_getResult
+likwid.getNumberOfGroups = likwid_getNumberOfGroups
+likwid.getRuntimeOfGroup = likwid_getRuntimeOfGroup
+likwid.getIdOfActiveGroup = likwid_getIdOfActiveGroup
+likwid.getNumberOfEvents = likwid_getNumberOfEvents
+likwid.getNumberOfThreads = likwid_getNumberOfThreads
+likwid.getCpuInfo = likwid_getCpuInfo
+likwid.getCpuTopology = likwid_getCpuTopology
+likwid.putTopology = likwid_putTopology
+likwid.getNumaInfo = likwid_getNumaInfo
+likwid.putNumaInfo = likwid_putNumaInfo
+likwid.setMemInterleaved = likwid_setMemInterleaved
+likwid.getAffinityInfo = likwid_getAffinityInfo
+likwid.putAffinityInfo = likwid_putAffinityInfo
+likwid.getPowerInfo = likwid_getPowerInfo
+likwid.putPowerInfo = likwid_putPowerInfo
+likwid.getOnlineDevices = likwid_getOnlineDevices
+likwid.printSupportedCPUs = likwid_printSupportedCPUs
+likwid.getCpuClock = likwid_getCpuClock
+likwid.startClock = likwid_startClock
+likwid.stopClock = likwid_stopClock
+likwid.getClockCycles = likwid_getClockCycles
+likwid.getClock = likwid_getClock
+likwid.sleep = sleep
+likwid.usleep = usleep
+likwid.startPower = likwid_startPower
+likwid.stopPower = likwid_stopPower
+-- Note: the exported name differs from the bound function (calcPower -> likwid_printEnergy).
+likwid.calcPower = likwid_printEnergy
+likwid.getPowerLimit = likwid_powerLimitGet
+likwid.setPowerLimit = likwid_powerLimitSet
+likwid.statePowerLimit = likwid_powerLimitState
+likwid.initTemp = likwid_initTemp
+likwid.readTemp = likwid_readTemp
+likwid.memSweep = likwid_memSweep
+likwid.memSweepDomain = likwid_memSweepDomain
+likwid.pinProcess = likwid_pinProcess
+likwid.setenv = likwid_setenv
+likwid.getpid = likwid_getpid
+likwid.setVerbosity = likwid_setVerbosity
+likwid.access = likwid_access
+likwid.startProgram = likwid_startProgram
+likwid.checkProgram = likwid_checkProgram
+likwid.killProgram = likwid_killProgram
+likwid.catchSignal = likwid_catchSignal
+likwid.getSignalState = likwid_getSignalState
+
+-- Global (not local) alias for math.huge; calculate_metric() compares results against it.
+infinity = math.huge
+
+
+-- Command line option iterator.
+--
+-- args:     list of command line arguments. The list is consumed: handled entries are
+--           removed with table.remove(args, 1).
+-- ostrlist: list of accepted option names. Each name is used as a Lua pattern
+--           (anchored with ^ and $); a trailing ":" marks an option that takes an argument.
+--
+-- Returns a function that yields on each call one of
+--   nil             if args is empty, the first entry does not start with "-",
+--                   or the option text is a single "-"
+--   optopt, true    for a known option without argument
+--   optopt, arg     for a known option with argument (attached or the following entry)
+--   "?", optopt     for an unknown option, or (see the ":" check below) a missing argument
+--   ":"             for a missing argument, depending on the ":" check below
+--   args[1], nil    after removing an entry that is "--" or has "-" as third character;
+--                   the value returned is the entry that follows the removed one
+local function getopt(args, ostrlist)
+    local arg, place,placeend = nil, 0, 0;
+    return function ()
+        if place == 0 then -- update scanning pointer
+            place = 1
+            if #args == 0 or args[1]:sub(1, 1) ~= '-' then place = 0; return nil end
+            if #args[1] >= 2 then
+                if args[1]:sub(2, 2) == '-' then
+                    if #args[1] == 2 then -- found "--"
+                        place = 0
+                        table.remove(args, 1)
+                        return args[1], nil
+                    end
+                    -- skip the second dash of a "--name" option
+                    place = place + 1
+                end
+                if args[1]:sub(3, 3) == '-' then
+                    place = 0
+                    table.remove(args, 1)
+                    return args[1], nil
+                end
+                place = place + 1
+                placeend = #args[1]
+            end
+        end
+        -- optopt is the text from position place up to placeend of the current entry
+        local optopt = args[1]:sub(place, placeend)
+        place = place + 1;
+        local givopt = ""
+        local needarg = false
+        -- find the first entry of ostrlist whose pattern matches optopt completely
+        for _, ostr in pairs(ostrlist) do
+            local matchstring = "^"..ostr.."$"
+            placeend = place + #ostr -1
+            if ostr:sub(#ostr,#ostr) == ":" then
+                matchstring = "^"..ostr:sub(1,#ostr-1).."$"
+                needarg = true
+                placeend = place + #ostr -2
+            end
+            if optopt:match(matchstring) then
+                givopt = ostr
+                break
+            end
+            needarg = false
+        end
+        if givopt == "" then -- unknown option
+            if optopt == '-' then return nil end
+            if place > #args[1] then
+                table.remove(args, 1)
+                place = 0;
+            end
+            return '?',  optopt;
+        end
+
+        if not needarg then -- do not need argument
+            arg = true;
+            table.remove(args, 1)
+            place = 0;
+        else -- need an argument
+            if placeend < #args[1] then -- no white space
+                arg = args[1]:sub(placeend,#args[1])
+            else
+                table.remove(args, 1);
+                if #args == 0 then -- an option requiring argument is the last one
+                    place = 0
+                    -- NOTE(review): this checks position placeend of the matched ostrlist
+                    -- entry, not of the user input -- confirm this is intended
+                    if givopt:sub(placeend, placeend) == ':' then return ':' end
+                    return '?', optopt
+                else arg = args[1] end
+            end
+            table.remove(args, 1)
+            place = 0;
+        end
+        return optopt, arg
+    end
+end
+
+
+likwid.getopt = getopt
+
+-- Count all entries of a table by walking it with pairs(), so non-sequence
+-- keys are counted as well. Anything that is not a table (including nil) yields 0.
+local function tablelength(T)
+    if type(T) ~= "table" then
+        return 0
+    end
+    local entries = 0
+    for _ in pairs(T) do
+        entries = entries + 1
+    end
+    return entries
+end
+
+likwid.tablelength = tablelength
+
+-- Print a table on a single line as "[v1,v2,...]".
+--
+-- T:    table to print; nil, non-table values and empty tables print "[]".
+-- long: if true every entry is printed as "[key] = value".
+--
+-- If T[0] or T[1] exists the entries from that index up to #T are printed in
+-- index order, otherwise all entries are printed in pairs() order.
+local function tableprint(T, long)
+    if type(T) ~= "table" or next(T) == nil then
+        print("[]")
+        return
+    end
+    local function render(key, value)
+        if long then
+            return "[" .. tostring(key) .. "] = " .. tostring(value)
+        end
+        return tostring(value)
+    end
+    -- Tables may be 0-based or 1-based
+    local start_index = 0
+    if T[start_index] == nil then
+        start_index = 1
+    end
+    -- Collect the entries locally (the output string used to leak as a global)
+    local parts = {}
+    if T[start_index] ~= nil then
+        for i=start_index,#T do
+            parts[#parts+1] = render(i, T[i])
+        end
+    else
+        for k,v in pairs(T) do
+            parts[#parts+1] = render(k, v)
+        end
+    end
+    print("["..table.concat(parts, ",").."]")
+end
+
+likwid.tableprint = tableprint
+
+-- Compute the blanks needed to center str in a field of max_space characters.
+-- Returns two strings: the padding in front of str and the padding behind it.
+-- If the free space is odd, the front padding gets the extra blank.
+-- min_space is accepted but not used.
+local function get_spaces(str, min_space, max_space)
+    local free = max_space - str:len()
+    local lead = math.ceil(free / 2)
+    local trail = free - lead
+    return string.rep(" ", lead), string.rep(" ", trail)
+end
+
+-- Evaluate a metric formula.
+--
+-- formula:            arithmetic expression containing counter names, e.g. "PMC0/PMC1"
+-- counters_to_values: table mapping counter names to measured values
+--
+-- Every counter name in the formula is replaced by its value, longest names first so
+-- that e.g. PMC10 is not clobbered by the replacement of PMC1. After substitution only
+-- digits and the characters + - * / ( ) . e E are accepted.
+--
+-- Returns the numeric result, 0 if the result is nil, NaN or +/-infinity, and the
+-- string "Nan" if the formula cannot be substituted completely or cannot be evaluated.
+local function calculate_metric(formula, counters_to_values)
+    local function longer_first(a,b)
+        return a:len() > b:len()
+    end
+    local result = "Nan"
+    local clist = {}
+    for counter in pairs(counters_to_values) do
+        table.insert(clist, counter)
+    end
+    table.sort(clist, longer_first)
+    -- ipairs guarantees the sorted order
+    for _,counter in ipairs(clist) do
+        -- Escape punctuation so the name is matched literally and not as a Lua pattern;
+        -- escape % in the replacement for the same reason
+        local pattern = tostring(counter):gsub("%p", "%%%0")
+        local value = tostring(counters_to_values[counter]):gsub("%%", "%%%%")
+        formula = string.gsub(formula, pattern, value)
+    end
+    if formula:find("[^%d%+%-%*/%(%)%.eE]") ~= nil then
+        print("Not all formula entries can be substituted with measured values")
+        print("Current formula: "..formula)
+        return result
+    end
+    -- loadstring was removed in Lua 5.2 where load accepts strings
+    local compile = loadstring or load
+    local chunk = compile("return (" .. formula .. ")")
+    if chunk == nil then
+        -- syntactically invalid formula, e.g. "1+"
+        print("Cannot evaluate formula: "..formula)
+        return result
+    end
+    local ok, value = pcall(chunk)
+    if not ok then
+        print("Cannot evaluate formula: "..formula)
+        return result
+    end
+    -- value ~= value is only true for NaN
+    if value == nil or value ~= value or value == math.huge or value == -math.huge then
+        return 0
+    end
+    return value
+end
+
+likwid.calculate_metric = calculate_metric
+
+-- Print a table as ASCII box to stdout.
+--
+-- tab: list of columns, each column is a list of fields. All columns must have the
+--      same number of fields. The first field of each column is printed as header.
+--
+-- Each column is as wide as its longest field, the fields are centered with get_spaces().
+local function printtable(tab)
+    local nr_columns = tablelength(tab)
+    if nr_columns == 0 then
+        print("Table has no columns. Empty table?")
+        return
+    end
+    local nr_lines = tablelength(tab[1])
+    local min_lengths = {}
+    local max_lengths = {}
+    for i, col in pairs(tab) do
+        if tablelength(col) ~= nr_lines then
+            print("Not all columns have the same row count, nr_lines"..tostring(nr_lines)..", current "..tablelength(col))
+            return
+        end
+        if min_lengths[i] == nil then
+            min_lengths[i] = 10000000
+            max_lengths[i] = 0
+        end
+        for _, field in pairs(col) do
+            local width = tostring(field):len()
+            if width > max_lengths[i] then
+                max_lengths[i] = width
+            end
+            if width < min_lengths[i] then
+                min_lengths[i] = width
+            end
+        end
+    end
+    -- Horizontal separator; kept local so it does not leak into the global namespace
+    local hline = ""
+    for i=1,#max_lengths do
+        hline = hline .. "+-"..string.rep("-",max_lengths[i]).."-"
+    end
+    hline = hline .. "+"
+
+    -- Print line j of all columns
+    local function print_row(j)
+        local str = "| "
+        for i=1,nr_columns do
+            local cell = tostring(tab[i][j])
+            local front, back = get_spaces(cell, min_lengths[i],max_lengths[i])
+            str = str .. front.. cell ..back.. " | "
+        end
+        print(str)
+    end
+
+    print(hline)
+    print_row(1)
+    print(hline)
+    for j=2,nr_lines do
+        print_row(j)
+    end
+    if nr_lines > 1 then
+        print(hline)
+    end
+    print()
+end
+
+likwid.printtable = printtable
+
+-- Print a table in CSV format to stdout.
+--
+-- tab:        list of columns, each column is a list of fields. The number of lines
+--             is taken from the first column.
+-- linelength: minimal number of fields per line. Lines with fewer columns are
+--             padded with commas. May be nil, then no padding is added.
+local function printcsv(tab, linelength)
+    local nr_columns = tablelength(tab)
+    if nr_columns == 0 then
+        print("Table has no columns. Empty table?")
+        return
+    end
+    local nr_lines = tablelength(tab[1])
+    -- The padding is the same for all lines
+    local padding = ""
+    if linelength ~= nil and nr_columns < linelength then
+        padding = string.rep(",", linelength-nr_columns)
+    end
+    for j=1,nr_lines do
+        local fields = {}
+        for i=1,nr_columns do
+            fields[i] = tostring(tab[i][j])
+        end
+        print(table.concat(fields, ",") .. padding)
+    end
+end
+
+likwid.printcsv = printcsv
+
+-- Split a string at a separator.
+--
+-- astr:       string to split; nil and "" result in an empty list
+-- sSeparator: separator, must not be the empty string
+-- nMax:       maximal number of splits (nil for unlimited, otherwise >= 1);
+--             the remainder of the string becomes the last field
+-- bRegexp:    if set, sSeparator is a Lua pattern, otherwise it is matched literally
+--
+-- Returns the list of fields.
+local function stringsplit(astr, sSeparator, nMax, bRegexp)
+    assert(sSeparator ~= '')
+    assert(nMax == nil or nMax >= 1)
+    if astr == nil then return {} end
+    local aRecord = {}
+
+    if astr:len() > 0 then
+        local bPlain = not bRegexp
+        nMax = nMax or -1
+
+        -- Both variables must be local ("local a=1 b=1" declares only a)
+        local nField, nStart = 1, 1
+        local nFirst,nLast = astr:find(sSeparator, nStart, bPlain)
+        while nFirst and nMax ~= 0 do
+            -- A pattern matching the empty string would never advance nStart
+            if nLast < nFirst then break end
+            aRecord[nField] = astr:sub(nStart, nFirst-1)
+            nField = nField+1
+            nStart = nLast+1
+            nFirst,nLast = astr:find(sSeparator, nStart, bPlain)
+            nMax = nMax-1
+        end
+        aRecord[nField] = astr:sub(nStart)
+    end
+
+    return aRecord
+end
+
+likwid.stringsplit = stringsplit
+
+-- Reorder a CPU list: take every numThreadsPerCore-th entry starting at the first
+-- entry, then every numThreadsPerCore-th entry starting at the second and so on.
+-- (Presumably this puts one hardware thread per core first -- depends on the order
+-- of the input list, TODO confirm.)
+-- Returns a new list, the input list is not changed.
+local function cpulist_sort(cpulist)
+    local sorted = {}
+    if #cpulist == 0 then
+        return sorted
+    end
+    local threads_per_core = likwid_getCpuTopology()["numThreadsPerCore"]
+    local last_group = #cpulist/threads_per_core
+    for offset=1,threads_per_core do
+        for group=0,last_group do
+            -- Indices beyond the end of cpulist yield nil, appending nil is a no-op
+            sorted[#sorted+1] = cpulist[(group*threads_per_core)+offset]
+        end
+    end
+    return sorted
+end
+
+-- Append all entries of addlist to cpulist.
+-- cpulist is modified in place and returned.
+local function cpulist_concat(cpulist, addlist)
+    for _, cpu in pairs(addlist) do
+        cpulist[#cpulist+1] = cpu
+    end
+    return cpulist
+end
+
+-- Check a CPU expression for characters that are not allowed.
+-- Returns false if cpustr contains one of the characters in the list below,
+-- true otherwise. The list entries are Lua patterns, therefore the magic
+-- characters are escaped with %.
+local function cpustr_valid(cpustr)
+    -- local, the list used to leak into the global namespace
+    local invalidlist = {"%.", "_", ";", "!", "§", "%$", "%%", "%&", "/", "\\",  "%(","%)","=", "?","`","´" ,"~","°","|","%^","<",">", "{","}","%[","%]","#","\'","\"", "*"}
+    for _, inval in pairs(invalidlist) do
+        if cpustr:find(inval) ~= nil then
+            return false
+        end
+    end
+    return true
+end
+
+-- Resolve a scatter expression "<domain>:scatter" to a CPU list.
+--
+-- All affinity domains whose tag contains <domain> (pattern search) are selected.
+-- The CPUs of each selected domain are reordered with cpulist_sort() and then taken
+-- round-robin: first entry of every domain, second entry of every domain and so on.
+--
+-- Returns the CPU list or an empty list in case of an error.
+local function cpustr_to_cpulist_scatter(cpustr)
+    local cpulist = {}
+    local domain_list = {}
+    local domain_cpus = {}
+    if not cpustr_valid(cpustr) then
+        print("ERROR: Expression contains invalid characters")
+        return {}
+    end
+    local s = cpustr:find(":")
+    if s == nil then
+        print("ERROR: Cannot parse scatter expression, should look something like <domain>:scatter")
+        return {}
+    end
+    local domain = cpustr:sub(1,s-1)
+    local affinity = likwid_getAffinityInfo()
+    local topo = likwid_getCpuTopology()
+
+    for dom,content in pairs(affinity["domains"]) do
+        if content["tag"]:find(domain) ~= nil then
+            table.insert(domain_list, dom)
+            table.insert(domain_cpus, cpulist_sort(affinity["domains"][dom]["processorList"]))
+        end
+    end
+
+    local num_domains = tablelength(domain_list)
+    -- Without this check the loop limit below is x/0 = inf and the loop never ends
+    if num_domains == 0 then
+        print(string.format("ERROR: Invalid affinity domain %s", domain))
+        return {}
+    end
+    -- Take the i-th CPU of each selected domain in turn
+    for i=1,topo["activeHWThreads"]/num_domains do
+        for idx, _ in pairs(domain_list) do
+            table.insert(cpulist, domain_cpus[idx][i])
+        end
+    end
+    return cpulist
+end
+
+
+-- Resolve an expression "E:<domain>:<count>" or "E:<domain>:<count>:<chunk>:<stride>"
+-- to a CPU list.
+--
+-- Starting at the first entry of the processor list of <domain>, <chunk> consecutive
+-- entries are taken (default 1), then the start index is advanced by <stride>
+-- (default 1). If the start index exceeds the number of processors in the domain it
+-- is reset to 1. The selection stops after <count> entries.
+--
+-- Returns the CPU list or an empty list in case of an error.
+local function cpustr_to_cpulist_expression(cpustr)
+    local cpulist = {}
+    if not cpustr_valid(cpustr) then
+        print("ERROR: Expression contains invalid characters")
+        return {}
+    end
+    local affinity = likwid_getAffinityInfo()
+    local exprlist = stringsplit(cpustr, ":")
+    -- drop the leading "E"
+    table.remove(exprlist, 1)
+    local domain = 0
+
+    local tag = "X"
+    local count = 0
+    local chunk = 1
+    local stride = 1
+
+    if #exprlist == 2 then
+        tag = exprlist[1]
+        count = tonumber(exprlist[2])
+    elseif #exprlist == 4 then
+        tag = exprlist[1]
+        count = tonumber(exprlist[2])
+        chunk = tonumber(exprlist[3])
+        stride = tonumber(exprlist[4])
+    end
+    if tag == "X" or count == nil or chunk == nil or stride == nil then
+        print("ERROR: Invalid expression, cannot parse all needed values")
+        return {}
+    end
+    for domidx, domcontent in pairs(affinity["domains"]) do
+        if domcontent["tag"] == tag then
+            domain = domidx
+            break
+        end
+    end
+    if domain == 0 then
+        print(string.format("ERROR: Invalid affinity domain %s", tag))
+        return {}
+    end
+
+    local processors = affinity["domains"][domain]["processorList"]
+    -- local, both variables used to leak into the global namespace
+    local index = 1
+    local selected = 0
+    for i=1,count do
+        for j=0, chunk-1 do
+            -- NOTE(review): index+j may point behind the end of the list. Then nothing
+            -- is added but the entry is still counted -- confirm that this is intended
+            table.insert(cpulist, processors[index+j])
+            selected = selected+1
+            if (selected >= count) then break end
+        end
+        index = index + stride
+        if (index > affinity["domains"][domain]["numberOfProcessors"]) then
+            index = 1
+        end
+        if (selected >= count) then break end
+    end
+    return cpulist
+end
+
+
+-- Resolve a logical expression "L:<domain>:<indexlist>" to a CPU list.
+--
+-- <indexlist> is a comma separated list of 0-based indices or index ranges (a-b)
+-- into the processor list of <domain> after reordering it with cpulist_sort().
+--
+-- Returns the CPU list or an empty list in case of an error.
+local function cpustr_to_cpulist_logical(cpustr)
+    local cpulist = {}
+    local sorted_list = {}
+    if not cpustr_valid(cpustr) then
+        print("ERROR: Expression contains invalid characters")
+        return {}
+    end
+    local affinity = likwid_getAffinityInfo()
+    local exprlist = stringsplit(cpustr, ":")
+    -- drop the leading "L"
+    table.remove(exprlist, 1)
+    local domain = 0
+    if #exprlist ~= 2 then
+        print("ERROR: Invalid expression, should look like L:<domain>:<indexlist> or be in a cpuset")
+        return {}
+    end
+    local tag = exprlist[1]
+    local indexstr = exprlist[2]
+    for domidx, domcontent in pairs(affinity["domains"]) do
+        if domcontent["tag"] == tag then
+            domain = domidx
+            break
+        end
+    end
+    if domain == 0 then
+        print(string.format("ERROR: Invalid affinity domain %s", tag))
+        return {}
+    end
+    sorted_list = cpulist_sort(affinity["domains"][domain]["processorList"])
+
+    local indexlist = stringsplit(indexstr, ",")
+    for _, item in pairs(indexlist) do
+        if item:find("-") == nil then
+            local index = tonumber(item)
+            if index == nil then
+                print(string.format("ERROR: Invalid CPU index %s", item))
+                return {}
+            end
+            -- user indices are 0-based, Lua lists are 1-based
+            index = index + 1
+            if index > affinity["domains"][domain]["numberOfProcessors"] then
+                print(string.format("CPU index %s larger than number of processors in affinity group %s", item, tag))
+                return {}
+            end
+            table.insert(cpulist, sorted_list[index])
+        else
+            local start, ende = item:match("(%d*)-(%d*)")
+            -- Convert before doing arithmetic: an empty capture (e.g. "-3" or "3-")
+            -- cannot be used in an addition
+            start = tonumber(start)
+            ende = tonumber(ende)
+            if start == nil then
+                print("ERROR: CPU indices smaller than 0 are not allowed")
+                return {}
+            end
+            if ende == nil then
+                print(string.format("ERROR: CPU list %s invalid, end of range missing", item))
+                return {}
+            end
+            start = start + 1
+            ende = ende + 1
+            if start > ende then
+                print(string.format("ERROR: CPU list %s invalid, start %s is larger than end %s", item, start, ende))
+                return {}
+            end
+            if ende > #sorted_list then
+                print(string.format("ERROR: CPU list end %d larger than number of processors in affinity group %s", ende, tag))
+                return {}
+            end
+            for idx=start,ende do
+                table.insert(cpulist, sorted_list[idx])
+            end
+        end
+    end
+    return cpulist
+end
+
+-- Resolve a physical expression "<indexlist>" or "<domain>:<indexlist>" to a CPU list.
+--
+-- <indexlist> is a comma separated list of CPU IDs or ID ranges (a-b). Every CPU ID
+-- must be part of the processor list of the affinity domain. Without <domain> the
+-- domain with tag "N" is used.
+--
+-- Returns the CPU list or an empty list in case of an error.
+local function cpustr_to_cpulist_physical(cpustr)
+    local function present(list, check)
+        for _, item in pairs(list) do
+            if item == check then
+                return true
+            end
+        end
+        return false
+    end
+    local cpulist = {}
+    if not cpustr_valid(cpustr) then
+        print("ERROR: Expression contains invalid characters")
+        return {}
+    end
+    local affinity = likwid_getAffinityInfo()
+    local domain = 0
+    -- All variables below are local, they used to leak into the global namespace
+    local tag, indexstr = cpustr:match("^(%g+):(%g+)")
+    if tag == nil then
+        tag = "N"
+        indexstr = cpustr:match("^(%g+)")
+    end
+    for domidx, domcontent in pairs(affinity["domains"]) do
+        if domcontent["tag"] == tag then
+            domain = domidx
+            break
+        end
+    end
+    if domain == 0 then
+        print(string.format("ERROR: Invalid affinity domain %s", tag))
+        return {}
+    end
+    local processors = affinity["domains"][domain]["processorList"]
+    local indexlist = stringsplit(indexstr, ",")
+    for _, item in pairs(indexlist) do
+        if item:find("-") == nil then
+            if present(processors, tonumber(item)) then
+                table.insert(cpulist, tonumber(item))
+            else
+                print(string.format("ERROR: CPU %s not in affinity domain %s", item, tag))
+                return {}
+            end
+        else
+            local start, ende = item:match("^(%d*)-(%d*)")
+            start = tonumber(start)
+            ende = tonumber(ende)
+            if start == nil then
+                print("ERROR: CPU indices smaller than 0 are not allowed")
+                return {}
+            end
+            -- A range like "3-" has no end, comparing nil with a number raises an error
+            if ende == nil then
+                print(string.format("ERROR: CPU list %s invalid, end of range missing", item))
+                return {}
+            end
+            if ende >= affinity["domains"][domain]["numberOfProcessors"] then
+                print(string.format("ERROR: CPU list end %d larger than number of processors in affinity group %s", ende, tag))
+                return {}
+            end
+            for cpu=start,ende do
+                if present(processors, cpu) then
+                    table.insert(cpulist, cpu)
+                else
+                    print(string.format("ERROR: CPU %s not in affinity domain %s", cpu, tag))
+                    return {}
+                end
+            end
+        end
+    end
+    return cpulist
+end
+
+likwid.cpustr_to_cpulist_physical = cpustr_to_cpulist_physical
+
+
+-- Resolve a CPU string to a list of CPUs.
+--
+-- cpustr consists of one or more expressions separated by "@". Each expression is
+-- handed to the matching resolver:
+--   <domain>:scatter          scatter expression
+--   E:<...>                   expression syntax
+--   L:<...>                   logical numbering
+--   N:, S<n>:, C<n>:, M<n>:   logical numbering inside the given domain
+--   anything else             physical numbering
+-- If fewer hardware threads are active than available (cpuset), everything that is
+-- not a scatter, E: or L: expression is resolved with logical numbering.
+--
+-- Returns the number of CPUs and the CPU list.
+local function cpustr_to_cpulist(cpustr)
+    local strlist = stringsplit(cpustr, "@")
+    local topo = likwid_getCpuTopology()
+    local cpulist = {}
+    for pos, str in pairs(strlist) do
+        if str:match("^%a*:scatter") then
+            cpulist = cpulist_concat(cpulist, cpustr_to_cpulist_scatter(str))
+        elseif str:match("^E:%a") then
+            cpulist = cpulist_concat(cpulist, cpustr_to_cpulist_expression(str))
+        elseif str:match("^L:%a") then
+            cpulist = cpulist_concat(cpulist, cpustr_to_cpulist_logical(str))
+        elseif topo["activeHWThreads"] < topo["numHWThreads"] then
+            print(string.format("INFO: You are running LIKWID in a cpuset with %d CPUs, only logical numbering allowed",topo["activeHWThreads"]))
+            if str:match("^N:") or str:match("^S%d*:") or str:match("^C%d*:") or str:match("^M%d*:") then
+                cpulist = cpulist_concat(cpulist, cpustr_to_cpulist_logical("L:"..str))
+            else
+                cpulist = cpulist_concat(cpulist, cpustr_to_cpulist_logical("L:N:"..str))
+            end
+        elseif str:match("^N:") or str:match("^S%d*:") or str:match("^C%d*:") or str:match("^M%d*:") then
+            cpulist = cpulist_concat(cpulist, cpustr_to_cpulist_logical("L:"..str))
+        else
+            local tmplist = cpustr_to_cpulist_physical(str)
+            -- Comparing with {} is always false because tables are compared by
+            -- identity, so check the length instead
+            if #tmplist == 0 then
+                print(string.format("ERROR: Cannot analyze string %s", str))
+            else
+                cpulist = cpulist_concat(cpulist, tmplist)
+            end
+        end
+    end
+    return tablelength(cpulist),cpulist
+end
+
+likwid.cpustr_to_cpulist = cpustr_to_cpulist
+
+-- Resolve a comma separated list of domain indices to a list of numbers.
+--
+-- cpustr: comma separated list of indices, e.g. "0,1"
+-- prefix: prefix of the affinity domain tag; an index is accepted if an affinity
+--         domain with the tag prefix..index exists
+--
+-- Returns the number of indices and the list of indices, or 0 and an empty list
+-- in case of an error.
+local function cpuexpr_to_list(cpustr, prefix)
+    if not cpustr_valid(cpustr) then
+        print("ERROR: Expression contains invalid characters")
+        return 0, {}
+    end
+    local domains = likwid_getAffinityInfo()["domains"]
+    -- Is there an affinity domain with the tag prefix..expr?
+    local function domain_exists(expr)
+        for _, info in pairs(domains) do
+            if info["tag"] == prefix..expr then
+                return true
+            end
+        end
+        return false
+    end
+    local ids = {}
+    for _, expr in pairs(stringsplit(cpustr,",")) do
+        if not domain_exists(expr) then
+            print(string.format("ERROR: No affinity domain with index %s%s", prefix, expr))
+            return 0, {}
+        end
+        table.insert(ids, tonumber(expr))
+    end
+    return tablelength(ids),ids
+end
+
+-- Resolve a comma separated list of indices against the affinity domains with
+-- tag prefix "M" (see cpuexpr_to_list).
+-- Returns the number of indices and the list of indices.
+local function nodestr_to_nodelist(cpustr)
+    return cpuexpr_to_list(cpustr, "M")
+end
+
+likwid.nodestr_to_nodelist = nodestr_to_nodelist
+
+-- Resolve a comma separated list of indices against the affinity domains with
+-- tag prefix "S" (see cpuexpr_to_list).
+-- Returns the number of indices and the list of indices.
+local function sockstr_to_socklist(cpustr)
+    return cpuexpr_to_list(cpustr, "S")
+end
+
+likwid.sockstr_to_socklist = sockstr_to_socklist
+
+-- List the performance groups available for the current architecture.
+--
+-- The groups are the *.txt files in <groupfolder>/<short_name>, listed with ls.
+-- The group name is the file name without folder and without the .txt suffix.
+--
+-- Returns the number of groups and the list of group names, or 0 and an empty
+-- list if the CPU information or the file list cannot be read.
+local function get_groups()
+    -- local, the list used to leak into the global namespace
+    local groups = {}
+    local cpuinfo = likwid.getCpuInfo()
+    if cpuinfo == nil then return 0, {} end
+    local f = io.popen("ls " .. likwid.groupfolder .. "/" .. cpuinfo["short_name"] .."/*.txt 2>/dev/null")
+    if f == nil then
+        print("Cannot read groups for architecture "..cpuinfo["short_name"])
+        return 0, {}
+    end
+    local t = stringsplit(f:read("*a"),"\n")
+    f:close()
+    for _, a in pairs(t) do
+        if a ~= "" then
+            -- the position capture () yields the index of the last "/"; the last
+            -- four characters are the suffix ".txt"
+            table.insert(groups,a:sub((a:match'^.*()/')+1,a:len()-4))
+        end
+    end
+    return #groups,groups
+end
+
+likwid.get_groups = get_groups
+
+-- Create the group data for a custom event string.
+--
+-- eventString: comma separated list of <event>:<counter>[:<option>...] entries
+-- fix_ctrs:    number of fixed counters; nil is treated as 0. For each available
+--              fixed counter (FIXC0, FIXC1, FIXC2) that is not part of the event
+--              string the corresponding event is put in front of the event string.
+--
+-- Returns a table with the fields Events (list of tables with Event, Counter and
+-- optionally Options), EventString and GroupString. If eventString contains no ":",
+-- Events is empty and both strings are "".
+local function new_groupdata(eventString, fix_ctrs)
+    local gdata = {}
+    local num_events = 1
+    gdata["Events"] = {}
+    gdata["EventString"] = ""
+    gdata["GroupString"] = ""
+    if eventString:find(":") == nil then
+        return gdata
+    end
+    -- Comparing nil with a number raises an error
+    local fixed = fix_ctrs or 0
+    if fixed > 0 then
+        -- Prepended in reverse order, so the final order is FIXC0, FIXC1, FIXC2
+        if not eventString:match("FIXC2") and fixed == 3 then
+            eventString = "CPU_CLK_UNHALTED_REF:FIXC2,"..eventString
+        end
+        if not eventString:match("FIXC1") and fixed >= 2 then
+            eventString = "CPU_CLK_UNHALTED_CORE:FIXC1,"..eventString
+        end
+        if not eventString:match("FIXC0") and fixed >= 1 then
+            eventString = "INSTR_RETIRED_ANY:FIXC0,"..eventString
+        end
+    end
+    gdata["EventString"] = eventString
+    gdata["GroupString"] = eventString
+    local eventslist = likwid.stringsplit(eventString,",")
+    for _, entry in pairs(eventslist) do
+        -- local, the list used to leak into the global namespace
+        local eventlist = likwid.stringsplit(entry,":")
+        gdata["Events"][num_events] = {}
+        gdata["Events"][num_events]["Event"] = eventlist[1]
+        gdata["Events"][num_events]["Counter"] = eventlist[2]
+        if #eventlist > 2 then
+            -- Everything behind event and counter are options
+            table.remove(eventlist, 2)
+            table.remove(eventlist, 1)
+            gdata["Events"][num_events]["Options"] = eventlist
+        end
+        num_events = num_events + 1
+    end
+    return gdata
+end
+
+
+-- Read the definition of a performance group.
+--
+-- group: name of a performance group or a custom event string. If no group with
+--        this name exists for the current architecture, the string is handed to
+--        new_groupdata().
+--
+-- The group file <groupfolder>/<short_name>/<group>.txt consists of a line starting
+-- with SHORT and the sections EVENTSET, METRICS and LONG. A section ends at the
+-- first empty line.
+--   EVENTSET lines: <counter> <event> [<option>...]
+--   METRICS lines:  <description> <formula>  (the formula is the last field)
+--
+-- Returns a table with the fields EventString, Events, Metrics, LongDescription,
+-- GroupString and, if a SHORT line exists, ShortDescription. Returns nil if the
+-- CPU information cannot be read.
+local function get_groupdata(group)
+    -- All variables are local, most of them used to leak into the global namespace
+    local groupdata = {}
+    local group_exist = 0
+    local cpuinfo = likwid.getCpuInfo()
+    if cpuinfo == nil then return nil end
+
+    local num_groups, groups = get_groups()
+    for _, a in pairs(groups) do
+        if (a == group) then group_exist = 1 end
+    end
+    if (group_exist == 0) then return new_groupdata(group, cpuinfo["perf_num_fixed_ctr"]) end
+
+    local f = assert(io.open(likwid.groupfolder .. "/" .. cpuinfo["short_name"] .. "/" .. group .. ".txt", "r"))
+    local t = f:read("*all")
+    f:close()
+    local parse_eventset = false
+    local parse_metrics = false
+    local parse_long = false
+    groupdata["EventString"] = ""
+    groupdata["Events"] = {}
+    groupdata["Metrics"] = {}
+    groupdata["LongDescription"] = ""
+    groupdata["GroupString"] = group
+    local nr_events = 1
+    local nr_metrics = 1
+    for _, line in pairs(stringsplit(t,"\n")) do
+
+        -- An empty line ends the current section
+        if (parse_eventset or parse_metrics or parse_long) and line:len() == 0 then
+            parse_eventset = false
+            parse_metrics = false
+            parse_long = false
+        end
+
+        if line:match("^SHORT%a*") ~= nil then
+            local linelist = stringsplit(line, "%s+", nil, "%s+")
+            table.remove(linelist, 1)
+            groupdata["ShortDescription"] = table.concat(linelist, " ")
+        end
+
+        if line:match("^EVENTSET$") ~= nil then
+            parse_eventset = true
+        end
+
+        if line:match("^METRICS$") ~= nil then
+            parse_metrics = true
+        end
+
+        if line:match("^LONG$") ~= nil then
+            parse_long = true
+        end
+
+        if parse_eventset and line:match("^EVENTSET$") == nil then
+            local trimmed = line:gsub("^%s*(.-)%s*$", "%1")
+            local linelist = stringsplit(trimmed, "%s+", nil, "%s+")
+            -- Lines without counter and event (e.g. only white space) are skipped
+            if #linelist >= 2 then
+                -- Save counter and event before the list is reduced to the options
+                local counter = linelist[1]
+                local event = linelist[2]
+                local eventstring = event .. ":" .. counter
+                if #linelist > 2 then
+                    table.remove(linelist,2)
+                    table.remove(linelist,1)
+                    -- table.concat expects the list first and the separator second
+                    eventstring = eventstring .. ":".. table.concat(linelist, ":")
+                end
+                groupdata["EventString"] = groupdata["EventString"] .. "," .. eventstring
+                groupdata["Events"][nr_events] = {}
+                groupdata["Events"][nr_events]["Event"] = event
+                groupdata["Events"][nr_events]["Counter"] = counter
+                nr_events = nr_events + 1
+            end
+        end
+
+        if parse_metrics and line:match("^METRICS$") == nil then
+            local trimmed = line:gsub("^%s*(.-)%s*$", "%1")
+            local linelist = stringsplit(trimmed, "%s+", nil, "%s+")
+            -- Lines without any field (e.g. only white space) are skipped
+            if #linelist > 0 then
+                local formula = linelist[#linelist]
+                table.remove(linelist)
+                groupdata["Metrics"][nr_metrics] = {}
+                groupdata["Metrics"][nr_metrics]["description"] = table.concat(linelist, " ")
+                groupdata["Metrics"][nr_metrics]["formula"] = formula
+                nr_metrics = nr_metrics + 1
+            end
+        end
+
+        if parse_long and line:match("^LONG$") == nil then
+            groupdata["LongDescription"] = groupdata["LongDescription"] .. "\n" .. line
+        end
+    end
+    -- Remove the leading separator ("\n" resp. ",")
+    groupdata["LongDescription"] = groupdata["LongDescription"]:sub(2)
+    groupdata["EventString"] = groupdata["EventString"]:sub(2)
+
+    return groupdata
+
+end
+
+likwid.get_groupdata = get_groupdata
+
+
+
+
+-- Convert a time string to microseconds.
+--
+-- timestr: number followed by one of the units s, ms or us, e.g. "2s" or "500ms"
+--
+-- Returns the time in microseconds. If the unit is missing or the part in front of
+-- the unit is not a number, an error message is printed and the program exits with
+-- exit code 1.
+local function parse_time(timestr)
+    local function bad_format()
+        print("Cannot parse time, '" .. timestr .. "' not well formatted, we need a time unit like s, ms, us")
+        os.exit(1)
+    end
+    -- Position of the unit and factor to microseconds. ms and us must be checked
+    -- before s because both contain an s
+    local unitpos = timestr:find("ms")
+    local factor = 1.E03
+    if unitpos == nil then
+        unitpos = timestr:find("us")
+        factor = nil
+    end
+    if unitpos == nil then
+        unitpos = timestr:find("s")
+        factor = 1.E06
+    end
+    if unitpos == nil then
+        bad_format()
+        return 0
+    end
+    local duration = tonumber(timestr:sub(1,unitpos-1))
+    -- Multiplying nil raises an error, so check the number first
+    if duration == nil then
+        bad_format()
+        return 0
+    end
+    if factor ~= nil then
+        duration = duration * factor
+    end
+    return duration
+end
+
+likwid.parse_time = parse_time
+
+
+
+-- Determine minimum, maximum and average of the values in a table.
+--
+-- The maximum starts at 0.0, so it is 0.0 if all values are negative. For an
+-- empty table the minimum is math.huge and the average is 0/0.
+--
+-- Returns minimum, maximum and average.
+local function min_max_avg(values)
+    -- local, the variables used to leak into the global namespace (as min, max, ...)
+    local min = math.huge
+    local max = 0.0
+    local sum = 0.0
+    local count = 0
+    for _, value in pairs(values) do
+        if value ~= nil then
+            if (value < min) then min = value end
+            if (value > max) then max = value end
+            sum = sum + value
+            count = count + 1
+        end
+    end
+    return min, max, sum/count
+end
+
+-- Create a statistics table (sum, minimum, maximum and average per line).
+--
+-- inputtable: list of columns, each column is a list of fields
+-- skip_cols:  number of leading columns that contain descriptions, not values
+-- skip_lines: number of leading lines that contain headers, not values
+--
+-- The statistics of a line are calculated over all columns behind skip_cols.
+-- A field that is not a number resets sum, minimum and maximum of its line to 0.
+--
+-- Returns a list of columns: the first column of inputtable with " STAT" appended
+-- to all fields except the first one, the columns 2 to skip_cols unchanged and the
+-- columns Sum, Min, Max and Avg. Returns an empty list if inputtable has no columns
+-- or the first column has no fields.
+local function tableMinMaxAvgSum(inputtable, skip_cols, skip_lines)
+    local outputtable = {}
+    local nr_columns = #inputtable
+    if nr_columns == 0 then
+        return {}
+    end
+    local nr_lines = #inputtable[1]
+    if nr_lines == 0 then
+        return {}
+    end
+    -- local, the columns used to leak into the global namespace
+    local minOfLine = {"Min"}
+    local maxOfLine = {"Max"}
+    local sumOfLine = {"Sum"}
+    local avgOfLine = {"Avg"}
+    -- The first field of each statistics column is the header, so the values of
+    -- line i are stored at index i-skip_lines+1
+    for i=skip_lines+1,nr_lines do
+        local row = i-skip_lines+1
+        minOfLine[row] = math.huge
+        maxOfLine[row] = 0
+        sumOfLine[row] = 0
+        avgOfLine[row] = 0
+    end
+    for j=skip_cols+1,nr_columns do
+        for i=skip_lines+1, nr_lines do
+            local row = i-skip_lines+1
+            local res = tonumber(inputtable[j][i])
+            if res ~= nil then
+                minOfLine[row] = math.min(res, minOfLine[row])
+                maxOfLine[row] = math.max(res, maxOfLine[row])
+                sumOfLine[row] = sumOfLine[row] + res
+            else
+                minOfLine[row] = 0
+                maxOfLine[row] = 0
+                sumOfLine[row] = 0
+            end
+            avgOfLine[row] = sumOfLine[row]/(nr_columns-skip_cols)
+        end
+    end
+
+    local firstcol = {}
+    table.insert(firstcol, inputtable[1][1])
+    for j=2,#inputtable[1] do
+        table.insert(firstcol, inputtable[1][j].." STAT")
+    end
+    table.insert(outputtable, firstcol)
+    for i=2,skip_cols do
+        local tmptable = {}
+        table.insert(tmptable, inputtable[i][1])
+        for j=2,#inputtable[i] do
+            table.insert(tmptable, inputtable[i][j])
+        end
+        table.insert(outputtable, tmptable)
+    end
+    table.insert(outputtable, sumOfLine)
+    table.insert(outputtable, minOfLine)
+    table.insert(outputtable, maxOfLine)
+    table.insert(outputtable, avgOfLine)
+    return outputtable
+end
+
+likwid.tableToMinMaxAvgSum = tableMinMaxAvgSum
+
+-- Print the results of all measured groups, either as ASCII tables or as CSV.
+-- groups:    table of group descriptions; the "ID" field of each entry is
+--            used to index results and groupData
+-- results:   results[groupID][eventIdx][threadIdx] holds the counter value
+-- groupData: per group: "GroupString", "EventString", "Events" (list of
+--            entries with "Event" and "Counter") and optionally "Metrics"
+--            (list of entries with "description" and "formula")
+-- cpulist:   list of CPU identifiers, used for the "Core <id>" column labels
+-- For each group the raw event table is printed and, if the group defines
+-- metrics, a table of derived metrics. If cpulist has more than one entry,
+-- a statistics table (Sum/Min/Max/Avg) follows each of them.
+-- The names use_csv and outfile are not defined in this function; they are
+-- presumably set by the calling script (TODO confirm).
+local function printOutput(groups, results, groupData, cpulist)
+    local maxLineFields = 0
+    local cpuinfo = likwid_getCpuInfo()
+    local clock = likwid.getCpuClock()
+    for g, group in pairs(groups) do
+        local groupID = group["ID"]
+        local num_events = likwid_getNumberOfEvents(groupID);
+        local num_threads = likwid_getNumberOfThreads(groupID-1);
+        local runtime = likwid_getRuntimeOfGroup(groupID)
+        local groupName = groupData[groupID]["GroupString"]
+        -- A group whose name equals its event string was given as a plain
+        -- event list and is reported as "Custom"
+        if groupName == groupData[groupID]["EventString"] then
+            groupName = "Custom"
+        end
+        local firsttab =  {}
+        local firsttab_combined = {}
+        local secondtab = {}
+        local secondtab_combined = {}
+        firsttab[1] = {"Event"}
+        firsttab_combined[1] = {"Event"}
+        firsttab[2] = {"Counter"}
+        firsttab_combined[2] = {"Counter"}
+        -- Without metrics the runtime is shown as an extra line of the
+        -- raw event table
+        if not groupData[groupID]["Metrics"] then
+            table.insert(firsttab[1],"Runtime (RDTSC) [s]")
+            table.insert(firsttab[2],"TSC")
+        end
+
+        for i=1,num_events do
+            table.insert(firsttab[1],groupData[groupID]["Events"][i]["Event"])
+            table.insert(firsttab_combined[1],groupData[groupID]["Events"][i]["Event"] .. " STAT")
+        end
+
+        for i=1,num_events do
+            table.insert(firsttab[2],groupData[groupID]["Events"][i]["Counter"])
+            table.insert(firsttab_combined[2],groupData[groupID]["Events"][i]["Counter"])
+        end
+
+
+        -- One column per thread with the raw counter values
+        for j=1,num_threads do
+            -- tmpList is local; it used to be written to a global variable
+            local tmpList = {"Core "..tostring(cpulist[j])}
+            if not groupData[groupID]["Metrics"] then
+                table.insert(tmpList, string.format("%e",runtime))
+            end
+            for i=1,num_events do
+                local tmp = tostring(results[groupID][i][j])
+                -- Values with more than 12 characters are printed in
+                -- exponential notation
+                if tmp:len() > 12 then
+                    tmp = string.format("%e", results[groupID][i][j])
+                end
+                table.insert(tmpList, tmp)
+            end
+            table.insert(firsttab, tmpList)
+        end
+
+        if #cpulist > 1 then
+            firsttab_combined = tableMinMaxAvgSum(firsttab, 2, 1)
+        end
+
+        if groupData[groupID]["Metrics"] then
+            -- Variables available to the metric formulas
+            local counterlist = {}
+            counterlist["time"] = runtime
+            counterlist["inverseClock"] = 1.0/clock;
+
+            secondtab[1] = {"Metric"}
+            secondtab_combined[1] = {"Metric"}
+            for m=1,#groupData[groupID]["Metrics"] do
+                table.insert(secondtab[1],groupData[groupID]["Metrics"][m]["description"] )
+                table.insert(secondtab_combined[1],groupData[groupID]["Metrics"][m]["description"].." STAT" )
+            end
+            for j=1,num_threads do
+                local tmpList = {"Core "..tostring(cpulist[j])}
+                -- Publish the counter values of this thread under the
+                -- counter names before the formulas are evaluated
+                for i=1,num_events do
+                    counterlist[groupData[groupID]["Events"][i]["Counter"]] = results[groupID][i][j]
+                end
+                for m=1,#groupData[groupID]["Metrics"] do
+                    local tmp = calculate_metric(groupData[groupID]["Metrics"][m]["formula"], counterlist)
+                    if tostring(tmp):len() > 12 then
+                        tmp = string.format("%e",tmp)
+                    end
+                    table.insert(tmpList, tostring(tmp))
+                end
+                table.insert(secondtab,tmpList)
+            end
+
+            if #cpulist > 1 then
+                secondtab_combined = tableMinMaxAvgSum(secondtab, 1, 1)
+            end
+        end
+        -- Number of fields per CSV line: the widest of all tables
+        maxLineFields = math.max(#firsttab, #firsttab_combined,
+                                 #secondtab, #secondtab_combined)
+        if use_csv then
+            print(string.format("STRUCT,Info,3%s",string.rep(",",maxLineFields-3)))
+            print(string.format("CPU name:,%s%s", cpuinfo["osname"],string.rep(",",maxLineFields-2)))
+            print(string.format("CPU type:,%s%s", cpuinfo["name"],string.rep(",",maxLineFields-2)))
+            print(string.format("CPU clock:,%s GHz%s", clock*1.E-09,string.rep(",",maxLineFields-2)))
+            print(string.format("TABLE,Group %d Raw,%s,%d%s",groupID,groupName,#firsttab[1]-1,string.rep(",",maxLineFields-4)))
+            likwid.printcsv(firsttab, maxLineFields)
+        else
+            if outfile ~= nil then
+                print(likwid.hline)
+                print(string.format("CPU name:\t%s",cpuinfo["osname"]))
+                print(string.format("CPU type:\t%s",cpuinfo["name"]))
+                print(string.format("CPU clock:\t%3.2f GHz",clock * 1.E-09))
+                print(likwid.hline)
+            end
+            print("Group "..tostring(groupID)..": "..groupName)
+            likwid.printtable(firsttab)
+        end
+        if #cpulist > 1 then
+            if use_csv then
+                print(string.format("TABLE,Group %d Raw Stat,%s,%d%s",groupID,groupName,#firsttab_combined[1]-1,string.rep(",",maxLineFields-4)))
+                likwid.printcsv(firsttab_combined, maxLineFields)
+            else
+                likwid.printtable(firsttab_combined)
+            end
+        end
+        if groupData[groupID]["Metrics"] then
+            if use_csv then
+                print(string.format("TABLE,Group %d Metric,%s,%d%s",groupID,groupName,#secondtab[1]-1,string.rep(",",maxLineFields-4)))
+                likwid.printcsv(secondtab, maxLineFields)
+            else
+                likwid.printtable(secondtab)
+            end
+            if #cpulist > 1 then
+                if use_csv then
+                    print(string.format("TABLE,Group %d Metric Stat,%s,%d%s",groupID,groupName,#secondtab_combined[1]-1,string.rep(",",maxLineFields-4)))
+                    likwid.printcsv(secondtab_combined, maxLineFields)
+                else
+                    likwid.printtable(secondtab_combined)
+                end
+            end
+        end
+    end
+end
+
+
+likwid.printOutput = printOutput
+
+-- Print the results gathered per Marker API region, either as ASCII tables
+-- or as CSV.
+-- groups:    groups[g][r] describes region r of group g with the fields
+--            "Name", "Time" (per thread) and "Count" (per thread)
+-- results:   results[g][r][e][t] holds "Value" and "Counter" of event e
+--            measured by thread t
+-- groupData: per group: "GroupString", "EventString", "Events" (list of
+--            entries with "Event" and "Counter") and "Metrics" (list of
+--            entries with "description" and "formula")
+-- cpulist:   list of CPU identifiers, used for the "Core <id>" column labels
+-- Regions without any call count entry are skipped. For every other region
+-- an info table, the raw event table and, if metrics are defined, a metric
+-- table are printed. If cpulist has more than one entry, statistics tables
+-- (Sum/Min/Max/Avg) follow the event and metric tables.
+-- The names use_csv and outfile are not defined in this function; they are
+-- presumably set by the calling script (TODO confirm).
+local function printMarkerOutput(groups, results, groupData, cpulist)
+    local maxLineFields = 0
+    -- cpuinfo is needed for the header below; it was read without ever
+    -- being defined in this function
+    local cpuinfo = likwid_getCpuInfo()
+    local clock = likwid_getCpuClock();
+    for g, group in pairs(groups) do
+        local groupName = groupData[g]["GroupString"]
+        -- A group whose name equals its event string was given as a plain
+        -- event list and is reported as "Custom"
+        if groupName == groupData[g]["EventString"] then
+            groupName = "Custom"
+        end
+        for r, region in pairs(groups[g]) do
+            local nr_threads = likwid.tablelength(groups[g][r]["Time"])
+            local nr_events = likwid.tablelength(groupData[g]["Events"])
+            if likwid.tablelength(groups[g][r]["Count"]) > 0 then
+
+                local infotab = {}
+                local firsttab = {}
+                local firsttab_combined = {}
+                local secondtab = {}
+                local secondtab_combined = {}
+
+                -- Region info: runtime and call count per thread
+                infotab[1] = {"Region Info","RDTSC Runtime [s]","call count"}
+                for thread=1, nr_threads do
+                    local tmpList = {}
+                    table.insert(tmpList, "Core "..tostring(cpulist[thread]))
+                    table.insert(tmpList, string.format("%.6f", groups[g][r]["Time"][thread]))
+                    table.insert(tmpList, tostring(groups[g][r]["Count"][thread]))
+                    table.insert(infotab, tmpList)
+                end
+
+                firsttab[1] = {"Event"}
+                firsttab_combined[1] = {"Event"}
+                for e=1,nr_events do
+                    table.insert(firsttab[1],groupData[g]["Events"][e]["Event"])
+                    table.insert(firsttab_combined[1],groupData[g]["Events"][e]["Event"].." STAT")
+                end
+                firsttab[2] = {"Counter"}
+                firsttab_combined[2] = {"Counter"}
+                for e=1,nr_events do
+                    table.insert(firsttab[2],groupData[g]["Events"][e]["Counter"])
+                    table.insert(firsttab_combined[2],groupData[g]["Events"][e]["Counter"])
+                end
+                -- One column per thread with the raw counter values
+                for t=1,nr_threads do
+                    local tmpList = {}
+                    table.insert(tmpList, "Core "..tostring(cpulist[t]))
+                    for e=1,nr_events do
+                        local tmp = results[g][r][e][t]["Value"]
+                        -- Missing values are reported as zero
+                        if tmp == nil then
+                            tmp = 0
+                        end
+                        table.insert(tmpList, string.format("%e",tmp))
+                    end
+                    table.insert(firsttab, tmpList)
+                end
+
+                if #cpulist > 1 then
+                    firsttab_combined = tableMinMaxAvgSum(firsttab, 2, 1)
+                end
+
+
+                if likwid.tablelength(groupData[g]["Metrics"]) > 0 then
+
+                    -- tmpList and counterlist are local; they used to be
+                    -- written to global variables
+                    local tmpList = {"Metric"}
+                    for m=1,#groupData[g]["Metrics"] do
+                        table.insert(tmpList, groupData[g]["Metrics"][m]["description"])
+                    end
+                    table.insert(secondtab, tmpList)
+                    for t=1,nr_threads do
+                        -- Variables available to the metric formulas
+                        local counterlist = {}
+                        for e=1,nr_events do
+                            counterlist[ results[g][r][e][t]["Counter"] ] = results[g][r][e][t]["Value"]
+                        end
+                        counterlist["inverseClock"] = 1.0/clock
+                        counterlist["time"] = groups[g][r]["Time"][t]
+                        tmpList = {}
+                        table.insert(tmpList, "Core "..tostring(cpulist[t]))
+                        for m=1,#groupData[g]["Metrics"] do
+                            local tmp = likwid.calculate_metric(groupData[g]["Metrics"][m]["formula"],counterlist)
+                            if tmp == nil or tostring(tmp) == "-nan" then
+                                tmp = "0"
+                            elseif tostring(tmp):len() > 12 then
+                                tmp = string.format("%e",tmp)
+                            end
+                            table.insert(tmpList, tmp)
+                        end
+                        table.insert(secondtab,tmpList)
+                    end
+
+                    if #cpulist > 1 then
+                        secondtab_combined = tableMinMaxAvgSum(secondtab, 1, 1)
+                    end
+                end
+                -- Number of fields per CSV line: the widest of all tables
+                maxLineFields = math.max(#infotab, #firsttab, #firsttab_combined,
+                                         #secondtab, #secondtab_combined, 2)
+
+                -- Region headline; declared here because the CSV variant
+                -- is printed further below
+                local str
+                if use_csv then
+                    str = tostring(g)..","..groupName..","..groups[g][r]["Name"]
+                    if maxLineFields > 3 then
+                        str = str .. string.rep(",", maxLineFields-3)
+                    end
+                    -- The system header is printed once, with the first
+                    -- region of the first group
+                    if outfile ~= nil and g == 1 and r == 1 then
+                        print(string.format("STRUCT,Info,3%s",string.rep(",",maxLineFields-3)))
+                        print(string.format("CPU name:,%s%s", cpuinfo["osname"],string.rep(",",maxLineFields-2)))
+                        print(string.format("CPU type:,%s%s", cpuinfo["name"],string.rep(",",maxLineFields-2)))
+                        print(string.format("CPU clock:,%s GHz%s", clock*1.E-09,string.rep(",",maxLineFields-2)))
+                    end
+                else
+                    if outfile ~= nil and g == 1 and r == 1 then
+                        print(likwid.hline)
+                        print(string.format("CPU name:\t%s",cpuinfo["osname"]))
+                        print(string.format("CPU type:\t%s",cpuinfo["name"]))
+                        print(string.format("CPU clock:\t%3.2f GHz",clock * 1.E-09))
+                        print(likwid.hline)
+                    end
+                    print(likwid.dline)
+                    str = "Group "..tostring(g).." "..groupName..": Region "..groups[g][r]["Name"]
+                    print(str)
+                    print(likwid.dline)
+                end
+
+                if use_csv then
+                    print(string.format("STRUCT,Info,5%s",string.rep(",",maxLineFields-3)))
+                    print(str)
+                    likwid.printcsv(infotab, maxLineFields)
+                    -- NOTE(review): the value uses the factor 1.E-9 that is
+                    -- labelled GHz above, but this line says MHz - confirm
+                    -- the intended unit before changing the output format
+                    print(string.format("CPU clock,%f MHz%s",clock*1.E-9,string.rep(",",maxLineFields-2)))
+                else
+                    likwid.printtable(infotab)
+                end
+                if use_csv then
+                    print(string.format("TABLE,Group %d Raw,%s,%d%s",g,groupName,#firsttab[1]-1,string.rep(",",maxLineFields-3)))
+                    likwid.printcsv(firsttab, maxLineFields)
+                else
+                    likwid.printtable(firsttab)
+                end
+                if #cpulist > 1 then
+                    if use_csv then
+                        print(string.format("TABLE,Group %d Raw Stat,%s,%d%s",g,groupName,#firsttab_combined[1]-1,string.rep(",",maxLineFields-3)))
+                        likwid.printcsv(firsttab_combined, maxLineFields)
+                    else
+                        likwid.printtable(firsttab_combined)
+                    end
+                end
+                if likwid.tablelength(groupData[g]["Metrics"]) > 0 then
+                    if use_csv then
+                        print(string.format("TABLE,Group %d Metric,%s,%d%s",g,groupName,#secondtab[1]-1,string.rep(",",maxLineFields-3)))
+                        likwid.printcsv(secondtab, maxLineFields)
+                    else
+                        likwid.printtable(secondtab)
+                    end
+                    if #cpulist > 1 then
+                        if use_csv then
+                            print(string.format("TABLE,Group %d Metric Stat,%s,%d%s",g,groupName,#secondtab_combined[1]-1,string.rep(",",maxLineFields-3)))
+                            likwid.printcsv(secondtab_combined, maxLineFields)
+                        else
+                            likwid.printtable(secondtab_combined)
+                        end
+                    end
+                end
+            end
+        end
+    end
+end
+
+
+likwid.print_markerOutput = printMarkerOutput
+
+-- Collect all counter results into a nested table.
+-- Returns a table indexed as [group][event][thread]; every leaf is the
+-- value returned by likwid_getResult for that index triple.
+local function getResults()
+    local groupCount = likwid_getNumberOfGroups()
+    local threadCount = likwid_getNumberOfThreads()
+    local allResults = {}
+    for groupIdx = 1, groupCount do
+        local perGroup = {}
+        allResults[groupIdx] = perGroup
+        -- The number of events is queried separately for every group
+        local eventCount = likwid_getNumberOfEvents(groupIdx)
+        for eventIdx = 1, eventCount do
+            local perEvent = {}
+            perGroup[eventIdx] = perEvent
+            for threadIdx = 1, threadCount do
+                perEvent[threadIdx] = likwid_getResult(groupIdx, eventIdx, threadIdx)
+            end
+        end
+    end
+    return allResults
+end
+
+likwid.getResults = getResults
+
+-- Read the intermediate results file written by the Marker API.
+-- filename:   path of the marker results file
+-- group_list: per group a table whose "Events" list provides the "Counter"
+--             name of every event
+-- num_cpus:   number of CPUs configured for the measurement; it must match
+--             the thread count given in the file
+-- Returns two tables:
+--   group_data[g][r] with "ID", "Name", "Time" (per thread) and "Count"
+--                    (per thread, kept as the string read from the file)
+--   results[g][r][e][t] with "Value" and "Counter"
+-- All indices are one-based, the file uses zero-based identifiers.
+-- If the file cannot be opened, its first line does not consist of three
+-- fields, the thread count differs from num_cpus, or it lists no regions
+-- or no groups, a message is printed and two empty tables are returned.
+local function getMarkerResults(filename, group_list, num_cpus)
+    -- NOTE(review): both values are unused here; the calls are kept because
+    -- their side effects, if any, are not visible from this function
+    local cpuinfo = likwid_getCpuInfo()
+    local ctr_and_events = likwid_getEventsAndCounters()
+    local group_data = {}
+    local results = {}
+    local f = io.open(filename, "r")
+    if f == nil then
+        print("Have you called LIKWID_MARKER_CLOSE?")
+        print(string.format("Cannot find intermediate results file %s", filename))
+        return {}, {}
+    end
+    local lines = stringsplit(f:read("*all"),"\n")
+    f:close()
+
+    -- Read first line with general counts
+    local tmpList = stringsplit(lines[1]," ")
+    if #tmpList ~= 3 then
+        print(string.format("Marker file %s not in proper format",filename))
+        return {}, {}
+    end
+    local nr_threads = tonumber(tmpList[1])
+    if tonumber(nr_threads) ~= tonumber(num_cpus) then
+        print(string.format("Marker file lists only %d cpus, but perfctr configured %d cpus", nr_threads, num_cpus))
+        return {},{}
+    end
+    local nr_regions = tonumber(tmpList[2])
+    if tonumber(nr_regions) == 0 then
+        print("No region results can be found in marker API output file")
+        return {},{}
+    end
+    local nr_groups = tonumber(tmpList[3])
+    if tonumber(nr_groups) == 0 then
+        print("No group listed in the marker API output file")
+        return {},{}
+    end
+    table.remove(lines,1)
+
+    -- Read Region IDs and names from following lines
+    -- Every matching line is consumed from the front of the list, so the
+    -- line to inspect is always lines[1]
+    for l=1, #lines do
+        -- Local variables: the parsed fields used to be stored in globals
+        local r, gname, g = string.match(lines[1],"(%d+):([%a%g]*)-(%d+)")
+        if (r ~= nil and g ~= nil) then
+            g = tonumber(g)+1
+            r = tonumber(r)+1
+
+            if group_data[g] == nil then
+                group_data[g] = {}
+            end
+            if group_data[g][r] == nil then
+                group_data[g][r] = {}
+            end
+            group_data[g][r]["ID"] = g
+            group_data[g][r]["Name"] = gname
+            group_data[g][r]["Time"] = {}
+            group_data[g][r]["Count"] = {}
+            if results[g] == nil then
+                results[g] = {}
+            end
+            if results[g][r] == nil then
+                results[g][r]= {}
+            end
+            table.remove(lines, 1 )
+        else
+            -- First line that is no region description: result lines start
+            break
+        end
+    end
+
+    -- Remaining lines: <region> <group> <thread> <count> <time> <events>
+    -- followed by one value per event
+    for l, line in pairs(lines) do
+        if line:len() > 0 then
+            local r, g, t, count = string.match(line,"(%d+) (%d+) (%d+) (%d+) %a*")
+            if (r ~= nil and g ~= nil and t ~= nil and count ~= nil) then
+                r = tonumber(r)+1
+                g = tonumber(g)+1
+                t = tonumber(t)+1
+                tmpList = stringsplit(line, " ")
+                -- Drop the four fields already parsed by the pattern
+                table.remove(tmpList, 1)
+                table.remove(tmpList, 1)
+                table.remove(tmpList, 1)
+                table.remove(tmpList, 1)
+                local time = tonumber(tmpList[1])
+                local events = tonumber(tmpList[2])
+                -- Drop time and event count, only the values remain
+                table.remove(tmpList, 1)
+                table.remove(tmpList, 1)
+
+                table.insert(group_data[g][r]["Time"], t, time)
+                table.insert(group_data[g][r]["Count"], t, count)
+                for c=1, events do
+                    if results[g][r][c] == nil then
+                        results[g][r][c] = {}
+                    end
+                    if results[g][r][c][t] == nil then
+                        results[g][r][c][t] = {}
+                    end
+                    local tmp = tonumber(tmpList[c])
+                    results[g][r][c][t]["Value"] = tmp
+                    results[g][r][c][t]["Counter"] = group_list[g]["Events"][c]["Counter"]
+                end
+            end
+        end
+    end
+    return group_data, results
+end
+
+likwid.getMarkerResults = getMarkerResults
+
+
+-- Check whether one of the MSR device files can be accessed.
+-- flags: access flags handed unchanged to likwid_access
+-- Returns true as soon as likwid_access reports 0 for /dev/cpu/0/msr or,
+-- failing that, for /dev/msr0; false otherwise.
+local function msr_available(flags)
+    for _, devicePath in ipairs({"/dev/cpu/0/msr", "/dev/msr0"}) do
+        if likwid_access(devicePath, flags) == 0 then
+            return true
+        end
+    end
+    return false
+end
+likwid.msr_available = msr_available
+
+
+-- Append a box of width one to a line of an ASCII box container.
+-- container: table of lines, each line being a list of boxes
+-- lineIdx:   index of the line the box is appended to; the line is
+--            created if it does not exist yet
+-- colIdx:    not used, the box is always appended at the end of the line
+-- label:     text shown inside the box
+local function addSimpleAsciiBox(container,lineIdx, colIdx, label)
+    local row = container[lineIdx]
+    if row == nil then
+        row = {}
+        container[lineIdx] = row
+    end
+    table.insert(row, {width = 1, label = label})
+end
+likwid.addSimpleAsciiBox = addSimpleAsciiBox
+
+-- Append a box spanning several columns to a line of an ASCII box container.
+-- container:   table of lines, each line being a list of boxes
+-- lineIdx:     index of the line the box is appended to; the line is
+--              created if it does not exist yet
+-- startColIdx: first column covered by the box
+-- endColIdx:   last column covered by the box
+-- label:       text shown inside the box
+-- The stored width is endColIdx-startColIdx+1; the box itself is always
+-- appended at the end of the line.
+local function addJoinedAsciiBox(container,lineIdx, startColIdx, endColIdx, label)
+    local row = container[lineIdx]
+    if row == nil then
+        row = {}
+        container[lineIdx] = row
+    end
+    local span = endColIdx-startColIdx+1
+    table.insert(row, {width = span, label = label})
+end
+likwid.addJoinedAsciiBox = addJoinedAsciiBox
+
+-- Print a container of boxes as nested ASCII art.
+-- container: list of lines; each line is a list of boxes with the fields
+--            "width" (number of columns covered) and "label" (text)
+-- All boxes share one inner width: the length of the longest label plus
+-- two. A box covering several columns additionally gets three characters
+-- per joined column border. Labels are centred; if the free space is odd,
+-- the extra blank is placed in front of the label.
+local function printAsciiBox(container)
+    local boxwidth = 0
+    local numLines = #container
+    local maxNumColumns = 0
+    for i=1,numLines do
+        if #container[i] > maxNumColumns then
+            maxNumColumns = #container[i]
+        end
+        for j=1,#container[i] do
+            if container[i][j]["label"]:len() > boxwidth then
+                boxwidth = container[i][j]["label"]:len()
+            end
+        end
+    end
+    boxwidth = boxwidth + 2
+    -- The line buffers are local; they used to be created as globals
+    local boxline = "+" .. string.rep("-",((maxNumColumns * (boxwidth+2)) + maxNumColumns+1)) .. "+"
+    print(boxline)
+    for i=1,numLines do
+        local innerboxline = "| "
+        local boxlabelline = "| "
+        local numColumns = #container[i]
+        for j=1,numColumns do
+            local boxlen = container[i][j]["width"]
+            local label = container[i][j]["label"]
+            -- Inner width of this box; equals boxwidth for a width of one
+            local innerlen = boxlen * boxwidth + (boxlen-1)*3
+            innerboxline = innerboxline .. "+" .. string.rep("-", innerlen) .. "+ "
+
+            -- Centre the label. The half is rounded down explicitly since
+            -- string.rep needs a count with an integer representation
+            local space = innerlen - label:len()
+            local width = math.floor(space/2)
+            local offset = space%2
+            boxlabelline = boxlabelline .. "|" .. string.rep(" ",(width+offset))
+            boxlabelline = boxlabelline .. label
+            boxlabelline = boxlabelline ..  string.rep(" ",(width)) .. "| "
+        end
+        print(innerboxline .. "|")
+        print(boxlabelline .. "|")
+        print(innerboxline .. "|")
+    end
+    print(boxline)
+end
+likwid.printAsciiBox = printAsciiBox
+
+-- Some helpers for output file substitutions
+-- getpid already defined by Lua-C-Interface
+-- Return the output of the command "hostname -s" with leading and
+-- trailing whitespace removed.
+local function gethostname()
+    local pipe = io.popen("hostname -s","r")
+    local raw = pipe:read("*all")
+    pipe:close()
+    -- gsub returns the string and a match count, only the string is kept
+    local trimmed = raw:gsub("^%s*(.-)%s*$", "%1")
+    return trimmed
+end
+
+likwid.gethostname = gethostname
+
+-- Return the value of the environment variable PBS_JOBID, or "X" if the
+-- variable is not set.
+local function getjid()
+    return os.getenv("PBS_JOBID") or "X"
+end
+
+likwid.getjid = getjid
+
+-- Return the MPI rank found in the environment: the value of PMI_RANK if
+-- set, otherwise the value of OMPI_COMM_WORLD_RANK, otherwise "X".
+local function getMPIrank()
+    return os.getenv("PMI_RANK") or os.getenv("OMPI_COMM_WORLD_RANK") or "X"
+end
+
+likwid.getMPIrank = getMPIrank
+
+return likwid
diff --git a/src/asciiBoxes.c b/src/asciiBoxes.c
deleted file mode 100644
index a22dab5..0000000
--- a/src/asciiBoxes.c
+++ /dev/null
@@ -1,256 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  asciiBoxes.c
- *
- *      Description:  Module implementing output of nested ascii art boxes
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-/* #####   HEADER FILE INCLUDES   ######################################### */
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <stdint.h>
-#include <string.h>
-
-#include <error.h>
-#include <types.h>
-#include <asciiBoxes.h>
-
-/* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
-
-BoxContainer*
-asciiBoxes_allocateContainer(int numLines, int numColumns)
-{
-    BoxContainer* container;
-
-    container = (BoxContainer*) malloc(sizeof(BoxContainer));
-    container->numLines = numLines;
-    container->numColumns = numColumns;
-
-    container->boxes = (Box**) malloc(numLines * sizeof(Box*));
-
-    for ( int i=0; i < numLines; i++ )
-    {
-        container->boxes[i] = (Box*) malloc(numColumns * sizeof(Box));
-    }
-
-    for(int i=0; i<numLines; i++)
-    {
-        for(int j=0; j<numColumns; j++)
-        {
-            container->boxes[i][j].width = 0;
-            container->boxes[i][j].label = NULL;
-        }
-    }
-
-    return container;
-}
-
-void 
-asciiBoxes_addBox(BoxContainer* container, int line, int column, bstring label)
-{
-    if ( line >= container->numLines )
-    {
-        ERROR_PRINT(line id %d too large,line);
-    }
-    if ( column >= container->numColumns )
-    {
-        ERROR_PRINT(column id %d too large,column);
-    }
-
-    container->boxes[line][column].width = 1;
-    container->boxes[line][column].label = bstrcpy(label);
-}
-
-
-void
-asciiBoxes_addJoinedBox(
-        BoxContainer* container,
-        int line,
-        int startColumn,
-        int endColumn,
-        bstring label)
-{
-    if ( line >= container->numLines )
-    {
-        ERROR_PRINT(line id %d too large,line);
-    }
-
-    if ( endColumn >= container->numColumns )
-    {
-        ERROR_PRINT(column id %d too large,endColumn);
-    }
-
-    container->boxes[line][startColumn].width = (endColumn-startColumn)+1;
-    container->boxes[line][startColumn].label = bstrcpy(label);
-}
-
-void
-asciiBoxes_print(FILE* OUTSTREAM, BoxContainer* container)
-{
-    int width;
-    int boxwidth=0; /* box width is inner width of box */
-
-    /* determine maximum label width */
-    for ( int i=0; i < container->numLines; i++ )
-    {
-        for ( int j=0; j < container->numColumns; j++ )
-        {
-            btrimws(container->boxes[i][j].label);
-            boxwidth = MAX(boxwidth,blength(container->boxes[i][j].label));
-
-            /* if box is joined increase counter */
-            if ( container->boxes[i][j].width > 1 )
-            {
-                j +=  container->boxes[i][j].width;
-            }
-        }
-    }
-    boxwidth += 2;  /* add one space each side */
-
-    /* top line */
-    printf("+");
-
-    for ( int i=0; i < (container->numColumns * (boxwidth+2) +
-                (container->numColumns+1));  /* one space between boxes */
-            i++ )
-    {
-        printf("-");
-    }
-    printf("+\n");
-
-    for ( int i=0; i < container->numLines; i++ )
-    {
-        /* Box top line */
-        printf("| ");
-
-        for ( int j=0; j < container->numColumns; j++ )
-        {
-            printf("+");
-
-            if ( container->boxes[i][j].width == 1 )
-            {
-                for ( int k=0; k < boxwidth; k++ )
-                {
-                    printf("-");
-                }
-            }
-            else 
-            {
-                for ( int k=0; k < (container->boxes[i][j].width * boxwidth +
-                            (container->boxes[i][j].width-1)*3);
-                        k++)
-                {
-                    printf("-");
-                }
-                j += container->boxes[i][j].width-1;
-            }
-            printf("+ ");
-        }
-        printf("|\n");
-        printf("| ");
-
-        /* Box label line */
-        for ( int j=0; j < container->numColumns; j++ )
-        {
-            int offset=0;
-
-            /* center label */
-            if ( container->boxes[i][j].width == 1 )
-            {
-                width = (boxwidth - blength(container->boxes[i][j].label))/2;
-                offset = (boxwidth - blength(container->boxes[i][j].label))%2;
-            }
-            else
-            {
-                width = (container->boxes[i][j].width * boxwidth +
-                        ((container->boxes[i][j].width-1)*3) -
-                        blength(container->boxes[i][j].label))/2;
-
-                offset = (container->boxes[i][j].width * boxwidth +
-                        ((container->boxes[i][j].width-1)*3) -
-                        blength(container->boxes[i][j].label))%2;
-            }
-            printf("|");
-
-            for ( int k=0; k < (width+offset); k++ )
-            {
-                printf(" ");
-            }
-
-            printf("%s",container->boxes[i][j].label->data);
-
-            for ( int k=0; k < width; k++ )
-            {
-                printf(" ");
-            }
-            printf("| ");
-
-            if ( container->boxes[i][j].width != 1 )
-            {
-                j+= container->boxes[i][j].width-1;
-            }
-        }
-        printf("|\n");
-        printf("| ");
-
-        /* Box bottom line */
-        for ( int j=0; j < container->numColumns; j++ )
-        {
-            printf("+");
-
-            if ( container->boxes[i][j].width == 1 )
-            {
-                for ( int k=0; k < boxwidth; k++ )
-                {
-                    printf("-");
-                }
-            }
-            else 
-            {
-                for ( int k=0; k < (container->boxes[i][j].width * boxwidth +
-                            (container->boxes[i][j].width-1)*3);
-                        k++ )
-                {
-                    printf("-");
-                }
-                j+= container->boxes[i][j].width-1;
-            }
-            printf("+ ");
-        }
-        printf("|\n");
-    }
-
-    /* bottom line */
-    printf("+");
-    for ( int i=0; i < (container->numColumns * (boxwidth+2) + 
-                container->numColumns+1); i++ )
-    {
-        printf("-");
-    }
-    printf("+\n");
-    fflush(stdout);
-}
-
diff --git a/src/asciiTable.c b/src/asciiTable.c
deleted file mode 100644
index 29b615a..0000000
--- a/src/asciiTable.c
+++ /dev/null
@@ -1,236 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  asciiTable.c
- *
- *      Description:  Module implementing output of ascii table.
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-
-/* #####   HEADER FILE INCLUDES   ######################################### */
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <stdint.h>
-#include <string.h>
-
-#include <error.h>
-#include <types.h>
-#include <strUtil.h>
-#include <asciiTable.h>
-
-/* #####   LOCAL VARIABLES   ########################################### */
-
-static FILE* OUTPUT;
-
-/* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
-
-void
-asciiTable_setOutput(FILE* stream)
-{
-    OUTPUT = stream;
-}
-
-TableContainer*
-asciiTable_allocate(int numRows,int numColumns, bstrList* headerLabels)
-{
-    int i;
-    TableContainer* container;
-    OUTPUT = stdout;
-
-    container = (TableContainer*) malloc(sizeof(TableContainer));
-    container->numRows = numRows;
-    container->numColumns = numColumns;
-    container->currentRow = 0;
-    container->printed = 0;
-
-    if (numColumns != headerLabels->qty)
-    {
-        ERROR_PRINT(Number of columns %d not equal to number of header labels %d,numColumns,headerLabels->qty);
-    }
-
-    container->header = bstrListCreate();
-    bstrListAlloc (container->header, numColumns);
-
-    for(i=0; i<numColumns; i++)
-    {
-        container->header->entry[i] = bstrcpy(headerLabels->entry[i]);
-    }
-
-    container->rows = (bstrList**) malloc( numRows * sizeof(bstrList*));
-
-    for(i=0; i<numRows; i++)
-    {
-        container->rows[i] = bstrListCreate();
-        bstrListAlloc (container->rows[i], numColumns);
-    }
-
-    return container;
-}
-
-void 
-asciiTable_free(TableContainer* container)
-{
-    int i;
-
-    if(container == NULL)
-    {
-        ERROR_PLAIN_PRINT(Cannot free NULL reference);
-    }
-
-    bstrListDestroy(container->header);
-
-    for(i=0; i<container->numRows; i++)
-    {
-        bstrListDestroy(container->rows[i]);
-    }
-
-    free(container->rows);
-}
-
-void
-asciiTable_insertRow(TableContainer* container, int row, bstrList* fields)
-{
-    int i;
-
-    if (container->numColumns != fields->qty)
-    {
-        ERROR_PRINT(Number of colummns %d not equal to number of field labels %d,container->numColumns,fields->qty);
-    }
-
-    if (row >= container->numRows)
-    {
-        ERROR_PRINT(Number of Rows %d smaller than requested row index %d, container->numRows,row);
-    }
-
-    for(i=0; i<container->numColumns; i++)
-    {
-        container->rows[row]->entry[i] = bstrcpy(fields->entry[i]);
-        container->rows[row]->qty++;
-    }
-}
-
-void
-asciiTable_appendRow(TableContainer* container, bstrList* fields)
-{
-    asciiTable_insertRow(container, container->currentRow++, fields);
-}
-
-void
-asciiTable_setCurrentRow(TableContainer* container, int row)
-{
-    container->currentRow = row;
-}
-
-void
-asciiTable_print(TableContainer* container)
-{
-    int i;
-    int j;
-    int* boxwidth;
-
-    boxwidth = (int*) malloc(container->numColumns * sizeof(int));
-
-    for (j=0; j<container->numColumns; j++) boxwidth[j] = 0;
-
-    for (j=0; j<container->numColumns; j++)
-    {
-        boxwidth[j] = MAX(boxwidth[j],blength(container->header->entry[j]));
-    }
-
-    /* determine maximum label width in each column */
-    for (i=0; i<container->numRows; i++)
-    {
-        for (j=0; j<container->numColumns; j++)
-        {
-            //           btrimws(container->rows[i]->entry[j]);
-            boxwidth[j] = MAX(boxwidth[j],blength(container->rows[i]->entry[j]));
-        }
-    }
-
-    if (! container->printed)
-    {
-        /* Increase boxwidth with two spaces */
-        for (j=0; j<container->numColumns; j++) boxwidth[j] +=2;
-    }
-
-    /* print header */
-
-    for (j=0; j<container->numColumns; j++)
-    {
-        fprintf(OUTPUT,"+");
-        for (i=0;i<boxwidth[j];i++)
-        {
-            fprintf(OUTPUT,"-");
-        }
-    }
-    fprintf(OUTPUT,"+\n");
-
-    for (j=0; j<container->numColumns; j++)
-    {
-        fprintf(OUTPUT,"|");
-        bJustifyCenter(container->header->entry[j],boxwidth[j]);
-        fprintf(OUTPUT,"%s",bdata(container->header->entry[j]));
-    }
-    fprintf(OUTPUT,"|\n");
-
-    for (j=0; j<container->numColumns; j++)
-    {
-        fprintf(OUTPUT,"+");
-        for (i=0;i<boxwidth[j];i++)
-        {
-            fprintf(OUTPUT,"-");
-        }
-    }
-    fprintf(OUTPUT,"+\n");
-
-    for (i=0; i<container->numRows; i++)
-    {
-        for (j=0; j<container->numColumns; j++)
-        {
-            fprintf(OUTPUT,"|");
-            bJustifyCenter(container->rows[i]->entry[j],boxwidth[j]);
-            fprintf(OUTPUT,"%s",bdata(container->rows[i]->entry[j]));
-        }
-        fprintf(OUTPUT,"|\n");
-    }
-
-    for (j=0; j<container->numColumns; j++)
-    {
-        fprintf(OUTPUT,"+");
-        for (i=0;i<boxwidth[j];i++)
-        {
-            fprintf(OUTPUT,"-");
-        }
-    }
-    fprintf(OUTPUT,"+\n");
-    container->printed = 1;
-
-    free(boxwidth);
-}
-
-
-
-
diff --git a/src/barrier.c b/src/barrier.c
deleted file mode 100644
index 3a93f92..0000000
--- a/src/barrier.c
+++ /dev/null
@@ -1,155 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  barrier.c
- *
- *      Description:  Implementation of threaded spin loop barrier
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-/* #####   HEADER FILE INCLUDES   ######################################### */
-#include <stdlib.h>
-#include <stdio.h>
-
-#include <error.h>
-#include <types.h>
-#include <barrier.h>
-
-/* #####   EXPORTED VARIABLES   ########################################### */
-
-
-/* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
-
-#define CACHELINE_SIZE 64
-
-/* #####   VARIABLES  -  LOCAL TO THIS SOURCE FILE   ###################### */
-
-static BarrierGroup* groups;
-static int currentGroupId = 0;
-static int maxGroupId = 0;
-
-/* #####   FUNCTION DEFINITIONS  -  LOCAL TO THIS SOURCE FILE   ########### */
-
-
-/* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
-
-int
-barrier_registerGroup(int numThreads)
-{
-    int ret;
-
-    if (currentGroupId > maxGroupId)
-    {
-        ERROR_PRINT(Group ID %d larger than maxGroupID %d,currentGroupId,maxGroupId);
-    }
-
-    groups[currentGroupId].numberOfThreads = numThreads;
-    ret = posix_memalign(
-            (void**) &groups[currentGroupId].groupBval,
-            CACHELINE_SIZE, 
-            numThreads * 32 * sizeof(int));
-
-    if (ret < 0)
-    {
-        ERROR;
-    }
-
-
-    return currentGroupId++;
-}
-
-void
-barrier_registerThread(BarrierData* barr, int groupId, int threadId)
-{
-    int ret;
-    int i;
-    int j = 1;
-    if (groupId > currentGroupId)
-    {
-        ERROR_PLAIN_PRINT(Group not yet registered);
-    }
-    if (threadId > groups[groupId].numberOfThreads)
-    {
-        ERROR_PRINT(Thread ID %d too large,threadId);
-    }
-
-    barr->numberOfThreads = groups[groupId].numberOfThreads;
-    barr->offset = 0;
-    barr->val = 1;
-    barr->bval =  groups[groupId].groupBval;
-    ret = posix_memalign(
-            (void**) &(barr->index),
-            CACHELINE_SIZE, 
-            barr->numberOfThreads * sizeof(int));
-
-    if (ret < 0)
-    {
-        ERROR;
-    }
-
-
-    barr->index[0] = threadId;
-
-    for (i = 0; i < barr->numberOfThreads; i++)
-    {
-        if (!(i == threadId))
-        {
-            barr->index[j++] = i;
-        }
-    }
-}
-
-
-void
-barrier_init(int numberOfGroups) 
-{
-    maxGroupId = numberOfGroups-1;
-    groups = (BarrierGroup*) malloc(numberOfGroups * sizeof(BarrierGroup));
-}
-
-void
-barrier_synchronize(BarrierData* barr)
-{
-    int i;
-
-    barr->bval[barr->index[0] * 32 +  barr->offset * 16] = barr->val;
-
-    for (i = 1; i < barr->numberOfThreads; i++)
-    {
-        while (barr->bval[barr->index[i] * 32 + barr->offset * 16] != barr->val)
-        {
-            __asm__ ("pause");
-        }
-    } 
-    
-    if (barr->offset)
-    {
-        barr->val = !barr->val;
-    }
-    barr->offset = !barr->offset;
-}
-
-void barrier_destroy(void)
-{
-    free(groups);
-}
diff --git a/src/bench.c b/src/bench.c
deleted file mode 100644
index 3a0b81b..0000000
--- a/src/bench.c
+++ /dev/null
@@ -1,537 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  bench.c
- *
- *      Description:  Benchmarking framework for likwid-bench
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-/* #####   HEADER FILE INCLUDES   ######################################### */
-
-#include <pthread.h>
-#include <stdlib.h>
-#include <stdio.h>
-#include <sys/syscall.h>
-#include <string.h>
-#include <sched.h>
-#include <types.h>
-#include <unistd.h>
-
-#include <timer.h>
-#include <threads.h>
-#include <affinity.h>
-#include <barrier.h>
-#include <likwid.h>
-#ifdef PAPI
-#include <papi.h>
-#endif
-
-/* #####   EXPORTED VARIABLES   ########################################### */
-
-
-/* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
-
-//#define BARRIER pthread_barrier_wait(&threads_barrier)
-#define BARRIER   barrier_synchronize(&barr)
-
-#ifdef PERFMON
-#define START_PERFMON likwid_markerStartRegion("bench");
-#define STOP_PERFMON  likwid_markerStopRegion("bench");
-#define LIKWID_THREAD_INIT  likwid_markerThreadInit();
-#define EXECUTE EXECUTE_LIKWID
-#else
-#ifdef PAPI
-#define START_PERFMON(event_set) PAPI_start(event_set);
-#define STOP_PERFMON(event_set, result) PAPI_stop ( event_set ,result );
-#define LIKWID_THREAD_INIT
-#define EXECUTE EXECUTE_PAPI
-#else
-#define START_PERFMON
-#define STOP_PERFMON
-#define LIKWID_THREAD_INIT
-#define EXECUTE EXECUTE_LIKWID
-#endif
-#endif
-
-#define EXECUTE_LIKWID(func)   \
-    BARRIER; \
-    if (data->threadId == 0) \
-    { \
-        timer_start(&time); \
-    } \
-    START_PERFMON  \
-    for (i=0; i<  data->data.iter; i++) \
-    {   \
-    func; \
-    } \
-    BARRIER; \
-    STOP_PERFMON  \
-    if (data->threadId == 0) \
-    { \
-        timer_stop(&time); \
-        data->cycles = timer_printCycles(&time); \
-    } \
-    BARRIER 
-
-#define EXECUTE_PAPI(func)   \
-    BARRIER; \
-    if (data->threadId == 0) \
-    { \
-        timer_start(&time); \
-    } \
-    START_PERFMON(event_set)  \
-    for (i=0; i<  data->data.iter; i++) \
-    {   \
-    func; \
-    } \
-    BARRIER; \
-    STOP_PERFMON(event_set, &(result[0]))  \
-    if (data->threadId == 0) \
-    { \
-        timer_stop(&time); \
-        data->cycles = timer_printCycles(&time); \
-    } \
-    BARRIER
-/* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
-
-void* runTest(void* arg)
-{
-    int threadId;
-    int offset;
-    size_t size;
-    size_t i;
-    BarrierData barr;
-    ThreadData* data;
-    ThreadUserData* myData;
-    TimerData time;
-    FuncPrototype func;
-    FILE* OUTSTREAM;
-#ifdef PAPI
-    int event_set = PAPI_NULL;
-    char groupname[50];
-    char* group_ptr = &(groupname[0]);
-    long long int result[4] = {0,0,0,0};
-    group_ptr = getenv("PAPI_BENCH");
-    PAPI_create_eventset(&event_set);
-    PAPI_add_event(event_set, PAPI_TOT_CYC);
-    // L3 group
-    if (strncmp(group_ptr,"L3",2) == 0)
-    {
-        PAPI_add_event(event_set, PAPI_L3_TCA);
-    }
-    // L2 group
-    else if (strncmp(group_ptr,"L2",2) == 0)
-    {
-        PAPI_add_event(event_set, PAPI_L2_TCA);
-    }
-    // FLOPS_AVX
-    else if (strncmp(group_ptr,"FLOPS_AVX",9) == 0)
-    {
-        PAPI_add_event(event_set, PAPI_VEC_SP);
-        PAPI_add_event(event_set, PAPI_VEC_DP);
-        PAPI_add_event(event_set, PAPI_FP_INS);
-    }
-    // FLOPS_DP
-    else if (strncmp(group_ptr,"FLOPS_DP",8) == 0)
-    {
-        PAPI_add_event(event_set, PAPI_DP_OPS);
-    }
-    // FLOPS_SP
-    else if (strncmp(group_ptr,"FLOPS_SP",8) == 0)
-    {
-        PAPI_add_event(event_set, PAPI_SP_OPS);
-    }
-#endif
-
-    data = (ThreadData*) arg;
-    myData = &(data->data);
-    func = myData->test->kernel;
-    threadId = data->threadId;
-    OUTSTREAM = data->output;
-    barrier_registerThread(&barr, 0, data->globalThreadId);
-
-    /* Prepare ptrs for thread */
-    size = myData->size / data->numberOfThreads;
-    size -= (size%myData->test->stride);
-    offset = data->threadId * size;
-    myData->size = size;
-
-    switch ( myData->test->type )
-    {
-    	case SINGLE_RAND:
-        case SINGLE:
-            {
-                float* sptr;
-                for (i=0; i <  myData->test->streams; i++)
-                {
-                    sptr = (float*) myData->streams[i];
-                    sptr +=  offset;
-              //      sptr +=  size;
-                    myData->streams[i] = (float*) sptr;
-                }
-            }
-            break;
-        case DOUBLE_RAND:
-        case DOUBLE:
-            {
-                double* dptr;
-                for (i=0; i <  myData->test->streams; i++)
-                {
-                    dptr = (double*) myData->streams[i];
-                    dptr +=  offset;
-             //       dptr +=  size;
-                    myData->streams[i] = (double*) dptr;
-                }
-            }
-            break;
-    }
-
-    /* pint the thread */
-    affinity_pinThread(myData->processors[threadId]);
-
-    sleep(1);
-    LIKWID_THREAD_INIT;
-    BARRIER;
-    if (OUTSTREAM)
-    {
-        fprintf(OUTSTREAM, "Group: %d Thread %d Global Thread %d running on core %d - Vector length %llu Offset %d\n",
-                data->groupId,
-                threadId,
-                data->globalThreadId,
-                affinity_threadGetProcessorId(),
-                LLU_CAST size,
-                offset);
-    }
-    BARRIER;
-
-    /* Up to 10 streams the following registers are used for Array ptr:
-     * Size rdi
-     * in Registers: rsi  rdx  rcx  r8  r9
-     * passed on stack, then: r10  r11  r12  r13  r14  r15
-     * If more than 10 streams are used first 5 streams are in register, above 5 a macro must be used to
-     * load them from stack
-     * */
-
-    switch ( myData->test->streams ) {
-        case STREAM_1:
-            EXECUTE(func(size,myData->streams[0]));
-            break;
-        case STREAM_2:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1]));
-            break;
-        case STREAM_3:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2]));
-            break;
-        case STREAM_4:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3]));
-            break;
-        case STREAM_5:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4]));
-            break;
-        case STREAM_6:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4],myData->streams[5]));
-            break;
-        case STREAM_7:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4],myData->streams[5],myData->streams[6]));
-            break;
-        case STREAM_8:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7]));
-            break;
-        case STREAM_9:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
-                        myData->streams[8]));
-            break;
-        case STREAM_10:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
-                        myData->streams[8],myData->streams[9]));
-            break;
-        case STREAM_11:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
-                        myData->streams[8],myData->streams[9],myData->streams[10]));
-            break;
-        case STREAM_12:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
-                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11]));
-            break;
-        case STREAM_13:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
-                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
-                        myData->streams[12]));
-            break;
-        case STREAM_14:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
-                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
-                        myData->streams[12],myData->streams[13]));
-            break;
-        case STREAM_15:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
-                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
-                        myData->streams[12],myData->streams[13],myData->streams[14]));
-            break;
-        case STREAM_16:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
-                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
-                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15]));
-            break;
-        case STREAM_17:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
-                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
-                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
-                        myData->streams[16]));
-            break;
-        case STREAM_18:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
-                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
-                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
-                        myData->streams[16],myData->streams[17]));
-            break;
-        case STREAM_19:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
-                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
-                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
-                        myData->streams[16],myData->streams[17],myData->streams[18]));
-            break;
-        case STREAM_20:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
-                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
-                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
-                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19]));
-            break;
-        case STREAM_21:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
-                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
-                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
-                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
-                        myData->streams[20]));
-            break;
-        case STREAM_22:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
-                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
-                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
-                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
-                        myData->streams[20],myData->streams[21]));
-            break;
-        case STREAM_23:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
-                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
-                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
-                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
-                        myData->streams[20],myData->streams[21],myData->streams[22]));
-            break;
-        case STREAM_24:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
-                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
-                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
-                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
-                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23]));
-            break;
-        case STREAM_25:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
-                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
-                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
-                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
-                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
-                        myData->streams[24]));
-            break;
-        case STREAM_26:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
-                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
-                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
-                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
-                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
-                        myData->streams[24],myData->streams[25]));
-            break;
-        case STREAM_27:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
-                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
-                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
-                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
-                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
-                        myData->streams[24],myData->streams[25],myData->streams[26]));
-            break;
-        case STREAM_28:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
-                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
-                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
-                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
-                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
-                        myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27]));
-            break;
-        case STREAM_29:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
-                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
-                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
-                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
-                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
-                        myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
-                        myData->streams[28]));
-            break;
-        case STREAM_30:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
-                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
-                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
-                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
-                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
-                        myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
-                        myData->streams[28],myData->streams[29]));
-            break;
-        case STREAM_31:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
-                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
-                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
-                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
-                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
-                        myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
-                        myData->streams[28],myData->streams[29],myData->streams[30]));
-            break;
-        case STREAM_32:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
-                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
-                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
-                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
-                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
-                        myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
-                        myData->streams[28],myData->streams[29],myData->streams[30],myData->streams[31]));
-            break;
-        case STREAM_33:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
-                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
-                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
-                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
-                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
-                        myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
-                        myData->streams[28],myData->streams[29],myData->streams[30],myData->streams[31],
-                        myData->streams[32]));
-            break;
-        case STREAM_34:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
-                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
-                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
-                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
-                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
-                        myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
-                        myData->streams[28],myData->streams[29],myData->streams[30],myData->streams[31],
-                        myData->streams[32],myData->streams[33]));
-            break;
-        case STREAM_35:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
-                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
-                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
-                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
-                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
-                        myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
-                        myData->streams[28],myData->streams[29],myData->streams[30],myData->streams[31],
-                        myData->streams[32],myData->streams[33],myData->streams[34]));
-            break;
-        case STREAM_36:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
-                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
-                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
-                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
-                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
-                        myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
-                        myData->streams[28],myData->streams[29],myData->streams[30],myData->streams[31],
-                        myData->streams[32],myData->streams[33],myData->streams[34],myData->streams[35]));
-            break;
-        case STREAM_37:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
-                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
-                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
-                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
-                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
-                        myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
-                        myData->streams[28],myData->streams[29],myData->streams[30],myData->streams[31],
-                        myData->streams[32],myData->streams[33],myData->streams[34],myData->streams[35],
-                        myData->streams[36]));
-            break;
-        case STREAM_38:
-            EXECUTE(func(size,myData->streams[0],myData->streams[1],myData->streams[2],myData->streams[3],
-                        myData->streams[4],myData->streams[5],myData->streams[6],myData->streams[7],
-                        myData->streams[8],myData->streams[9],myData->streams[10],myData->streams[11],
-                        myData->streams[12],myData->streams[13],myData->streams[14],myData->streams[15],
-                        myData->streams[16],myData->streams[17],myData->streams[18],myData->streams[19],
-                        myData->streams[20],myData->streams[21],myData->streams[22],myData->streams[23],
-                        myData->streams[24],myData->streams[25],myData->streams[26],myData->streams[27],
-                        myData->streams[28],myData->streams[29],myData->streams[30],myData->streams[31],
-                        myData->streams[32],myData->streams[33],myData->streams[34],myData->streams[35],
-                        myData->streams[36],myData->streams[37]));
-            break;
-        default:
-            break;
-    }
-#ifdef PAPI
-    double papi_result = 0.0;
-    // L2 & L3 group
-    if (strncmp(group_ptr,"L3",2) == 0 ||
-        strncmp(group_ptr,"L2",2) == 0)
-    {
-        papi_result = ((double)result[1]) * 64.0;
-    }
-    // FLOPS_AVX
-    else if (strncmp(group_ptr,"FLOPS",5) == 0)
-    {
-        papi_result = (double) result[1]+ (double) result[2];
-    }
-    if (OUTSTREAM)
-    {
-        fprintf(OUTSTREAM, "Thread %d Result %f\n",threadId, papi_result);
-    }
-#endif
-    pthread_exit(NULL);
-}
-
-
diff --git a/src/bitUtil.c b/src/bitUtil.c
index cdce490..3547f23 100644
--- a/src/bitUtil.c
+++ b/src/bitUtil.c
@@ -5,13 +5,13 @@
  *
  *      Description:  Utility routines manipulating bit arrays.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
diff --git a/src/bstrlib.c b/src/bstrlib.c
index 52f5a99..380269c 100644
--- a/src/bstrlib.c
+++ b/src/bstrlib.c
@@ -64,27 +64,27 @@
 /* Compute the snapped size for a given requested size.  By snapping to powers
    of 2 like this, repeated reallocations are avoided. */
 static int snapUpSize (int i) {
-	if (i < 8) {
-		i = 8;
-	} else {
-		unsigned int j;
-		j = (unsigned int) i;
-
-		j |= (j >>  1);
-		j |= (j >>  2);
-		j |= (j >>  4);
-		j |= (j >>  8);		/* Ok, since int >= 16 bits */
+    if (i < 8) {
+        i = 8;
+    } else {
+        unsigned int j;
+        j = (unsigned int) i;
+
+        j |= (j >>  1);
+        j |= (j >>  2);
+        j |= (j >>  4);
+        j |= (j >>  8);        /* Ok, since int >= 16 bits */
 #if (UINT_MAX != 0xffff)
-		j |= (j >> 16);		/* For 32 bit int systems */
+        j |= (j >> 16);        /* For 32 bit int systems */
 #if (UINT_MAX > 0xffffffffUL)
-		j |= (j >> 32);		/* For 64 bit int systems */
+        j |= (j >> 32);        /* For 64 bit int systems */
 #endif
 #endif
-		/* Least power of two greater than i */
-		j++;
-		if ((int) j >= i) i = (int) j;
-	}
-	return i;
+        /* Least power of two greater than i */
+        j++;
+        if ((int) j >= i) i = (int) j;
+    }
+    return i;
 }
 
 /*  int balloc (bstring b, int len)
@@ -92,59 +92,59 @@ static int snapUpSize (int i) {
  *  Increase the size of the memory backing the bstring b to at least len.
  */
 int balloc (bstring b, int olen) {
-	int len;
-	if (b == NULL || b->data == NULL || b->slen < 0 || b->mlen <= 0 || 
-	    b->mlen < b->slen || olen <= 0) {
-		return BSTR_ERR;
-	}
+    int len;
+    if (b == NULL || b->data == NULL || b->slen < 0 || b->mlen <= 0 || 
+        b->mlen < b->slen || olen <= 0) {
+        return BSTR_ERR;
+    }
 
-	if (olen >= b->mlen) {
-		unsigned char * x;
+    if (olen >= b->mlen) {
+        unsigned char * x;
 
-		if ((len = snapUpSize (olen)) <= b->mlen) return BSTR_OK;
+        if ((len = snapUpSize (olen)) <= b->mlen) return BSTR_OK;
 
-		/* Assume probability of a non-moving realloc is 0.125 */
-		if (7 * b->mlen < 8 * b->slen) {
+        /* Assume probability of a non-moving realloc is 0.125 */
+        if (7 * b->mlen < 8 * b->slen) {
 
-			/* If slen is close to mlen in size then use realloc to reduce
-			   the memory defragmentation */
+            /* If slen is close to mlen in size then use realloc to reduce
+               the memory defragmentation */
 
-			reallocStrategy:;
+            reallocStrategy:;
 
-			x = (unsigned char *) bstr__realloc (b->data, (size_t) len);
-			if (x == NULL) {
+            x = (unsigned char *) bstr__realloc (b->data, (size_t) len);
+            if (x == NULL) {
 
-				/* Since we failed, try allocating the tighest possible 
-				   allocation */
+                /* Since we failed, try allocating the tighest possible 
+                   allocation */
 
-				if (NULL == (x = (unsigned char *) bstr__realloc (b->data, (size_t) (len = olen)))) {
-					return BSTR_ERR;
-				}
-			}
-		} else {
+                if (NULL == (x = (unsigned char *) bstr__realloc (b->data, (size_t) (len = olen)))) {
+                    return BSTR_ERR;
+                }
+            }
+        } else {
 
-			/* If slen is not close to mlen then avoid the penalty of copying
-			   the extra bytes that are allocated, but not considered part of
-			   the string */
+            /* If slen is not close to mlen then avoid the penalty of copying
+               the extra bytes that are allocated, but not considered part of
+               the string */
 
-			if (NULL == (x = (unsigned char *) bstr__alloc ((size_t) len))) {
+            if (NULL == (x = (unsigned char *) bstr__alloc ((size_t) len))) {
 
-				/* Perhaps there is no available memory for the two 
-				   allocations to be in memory at once */
+                /* Perhaps there is no available memory for the two 
+                   allocations to be in memory at once */
 
-				goto reallocStrategy;
+                goto reallocStrategy;
 
-			} else {
-				if (b->slen) bstr__memcpy ((char *) x, (char *) b->data, (size_t) b->slen);
-				bstr__free (b->data);
-			}
-		}
-		b->data = x;
-		b->mlen = len;
-		b->data[b->slen] = (unsigned char) '\0';
-	}
+            } else {
+                if (b->slen) bstr__memcpy ((char *) x, (char *) b->data, (size_t) b->slen);
+                bstr__free (b->data);
+            }
+        }
+        b->data = x;
+        b->mlen = len;
+        b->data[b->slen] = (unsigned char) '\0';
+    }
 
-	return BSTR_OK;
+    return BSTR_OK;
 }
 
 /*  int ballocmin (bstring b, int len)
@@ -154,24 +154,24 @@ int balloc (bstring b, int olen) {
  *  performance.
  */
 int ballocmin (bstring b, int len) {
-	unsigned char * s;
+    unsigned char * s;
 
-	if (b == NULL || b->data == NULL || (b->slen+1) < 0 || b->mlen <= 0 || 
-	    b->mlen < b->slen || len <= 0) {
-		return BSTR_ERR;
-	}
+    if (b == NULL || b->data == NULL || (b->slen+1) < 0 || b->mlen <= 0 || 
+        b->mlen < b->slen || len <= 0) {
+        return BSTR_ERR;
+    }
 
-	if (len < b->slen + 1) len = b->slen + 1;
+    if (len < b->slen + 1) len = b->slen + 1;
 
-	if (len != b->mlen) {
-		s = (unsigned char *) bstr__realloc (b->data, (size_t) len);
-		if (NULL == s) return BSTR_ERR;
-		s[b->slen] = (unsigned char) '\0';
-		b->data = s;
-		b->mlen = len;
-	}
+    if (len != b->mlen) {
+        s = (unsigned char *) bstr__realloc (b->data, (size_t) len);
+        if (NULL == s) return BSTR_ERR;
+        s[b->slen] = (unsigned char) '\0';
+        b->data = s;
+        b->mlen = len;
+    }
 
-	return BSTR_OK;
+    return BSTR_OK;
 }
 
 /*  bstring bfromcstr (const char * str)
@@ -184,21 +184,21 @@ bstring b;
 int i;
 size_t j;
 
-	if (str == NULL) return NULL;
-	j = (strlen) (str);
-	i = snapUpSize ((int) (j + (2 - (j != 0))));
-	if (i <= (int) j) return NULL;
+    if (str == NULL) return NULL;
+    j = (strlen) (str);
+    i = snapUpSize ((int) (j + (2 - (j != 0))));
+    if (i <= (int) j) return NULL;
 
-	b = (bstring) bstr__alloc (sizeof (struct tagbstring));
-	if (NULL == b) return NULL;
-	b->slen = (int) j;
-	if (NULL == (b->data = (unsigned char *) bstr__alloc (b->mlen = i))) {
-		bstr__free (b);
-		return NULL;
-	}
+    b = (bstring) bstr__alloc (sizeof (struct tagbstring));
+    if (NULL == b) return NULL;
+    b->slen = (int) j;
+    if (NULL == (b->data = (unsigned char *) bstr__alloc (b->mlen = i))) {
+        bstr__free (b);
+        return NULL;
+    }
 
-	bstr__memcpy (b->data, str, j+1);
-	return b;
+    bstr__memcpy (b->data, str, j+1);
+    return b;
 }
 
 /*  bstring bfromcstralloc (int mlen, const char * str)
@@ -212,23 +212,23 @@ bstring b;
 int i;
 size_t j;
 
-	if (str == NULL) return NULL;
-	j = (strlen) (str);
-	i = snapUpSize ((int) (j + (2 - (j != 0))));
-	if (i <= (int) j) return NULL;
+    if (str == NULL) return NULL;
+    j = (strlen) (str);
+    i = snapUpSize ((int) (j + (2 - (j != 0))));
+    if (i <= (int) j) return NULL;
 
-	b = (bstring) bstr__alloc (sizeof (struct tagbstring));
-	if (b == NULL) return NULL;
-	b->slen = (int) j;
-	if (i < mlen) i = mlen;
+    b = (bstring) bstr__alloc (sizeof (struct tagbstring));
+    if (b == NULL) return NULL;
+    b->slen = (int) j;
+    if (i < mlen) i = mlen;
 
-	if (NULL == (b->data = (unsigned char *) bstr__alloc (b->mlen = i))) {
-		bstr__free (b);
-		return NULL;
-	}
+    if (NULL == (b->data = (unsigned char *) bstr__alloc (b->mlen = i))) {
+        bstr__free (b);
+        return NULL;
+    }
 
-	bstr__memcpy (b->data, str, j+1);
-	return b;
+    bstr__memcpy (b->data, str, j+1);
+    return b;
 }
 
 /*  bstring blk2bstr (const void * blk, int len)
@@ -240,26 +240,26 @@ bstring blk2bstr (const void * blk, int len) {
 bstring b;
 int i;
 
-	if (blk == NULL || len < 0) return NULL;
-	b = (bstring) bstr__alloc (sizeof (struct tagbstring));
-	if (b == NULL) return NULL;
-	b->slen = len;
+    if (blk == NULL || len < 0) return NULL;
+    b = (bstring) bstr__alloc (sizeof (struct tagbstring));
+    if (b == NULL) return NULL;
+    b->slen = len;
 
-	i = len + (2 - (len != 0));
-	i = snapUpSize (i);
+    i = len + (2 - (len != 0));
+    i = snapUpSize (i);
 
-	b->mlen = i;
+    b->mlen = i;
 
-	b->data = (unsigned char *) bstr__alloc ((size_t) b->mlen);
-	if (b->data == NULL) {
-		bstr__free (b);
-		return NULL;
-	}
+    b->data = (unsigned char *) bstr__alloc ((size_t) b->mlen);
+    if (b->data == NULL) {
+        bstr__free (b);
+        return NULL;
+    }
 
-	if (len > 0) bstr__memcpy (b->data, blk, (size_t) len);
-	b->data[len] = (unsigned char) '\0';
+    if (len > 0) bstr__memcpy (b->data, blk, (size_t) len);
+    b->data[len] = (unsigned char) '\0';
 
-	return b;
+    return b;
 }
 
 /*  char * bstr2cstr (const_bstring s, char z)
@@ -273,18 +273,18 @@ char * bstr2cstr (const_bstring b, char z) {
 int i, l;
 char * r;
 
-	if (b == NULL || b->slen < 0 || b->data == NULL) return NULL;
-	l = b->slen;
-	r = (char *) bstr__alloc ((size_t) (l + 1));
-	if (r == NULL) return r;
+    if (b == NULL || b->slen < 0 || b->data == NULL) return NULL;
+    l = b->slen;
+    r = (char *) bstr__alloc ((size_t) (l + 1));
+    if (r == NULL) return r;
 
-	for (i=0; i < l; i ++) {
-		r[i] = (char) ((b->data[i] == '\0') ? z : (char) (b->data[i]));
-	}
+    for (i=0; i < l; i ++) {
+        r[i] = (char) ((b->data[i] == '\0') ? z : (char) (b->data[i]));
+    }
 
-	r[l] = (unsigned char) '\0';
+    r[l] = (unsigned char) '\0';
 
-	return r;
+    return r;
 }
 
 /*  int bcstrfree (char * s)
@@ -299,11 +299,11 @@ char * r;
  *  redefinitions.
  */
 int bcstrfree (char * s) {
-	if (s) {
-		bstr__free (s);
-		return BSTR_OK;
-	}
-	return BSTR_ERR;
+    if (s) {
+        bstr__free (s);
+        return BSTR_OK;
+    }
+    return BSTR_ERR;
 }
 
 /*  int bconcat (bstring b0, const_bstring b1)
@@ -314,28 +314,28 @@ int bconcat (bstring b0, const_bstring b1) {
 int len, d;
 bstring aux = (bstring) b1;
 
-	if (b0 == NULL || b1 == NULL || b0->data == NULL || b1->data == NULL) return BSTR_ERR;
+    if (b0 == NULL || b1 == NULL || b0->data == NULL || b1->data == NULL) return BSTR_ERR;
 
-	d = b0->slen;
-	len = b1->slen;
-	if ((d | (b0->mlen - d) | len | (d + len)) < 0) return BSTR_ERR;
+    d = b0->slen;
+    len = b1->slen;
+    if ((d | (b0->mlen - d) | len | (d + len)) < 0) return BSTR_ERR;
 
-	if (b0->mlen <= d + len + 1) {
-		ptrdiff_t pd = b1->data - b0->data;
-		if (0 <= pd && pd < b0->mlen) {
-			if (NULL == (aux = bstrcpy (b1))) return BSTR_ERR;
-		}
-		if (balloc (b0, d + len + 1) != BSTR_OK) {
-			if (aux != b1) bdestroy (aux);
-			return BSTR_ERR;
-		}
-	}
+    if (b0->mlen <= d + len + 1) {
+        ptrdiff_t pd = b1->data - b0->data;
+        if (0 <= pd && pd < b0->mlen) {
+            if (NULL == (aux = bstrcpy (b1))) return BSTR_ERR;
+        }
+        if (balloc (b0, d + len + 1) != BSTR_OK) {
+            if (aux != b1) bdestroy (aux);
+            return BSTR_ERR;
+        }
+    }
 
-	bBlockCopy (&b0->data[d], &aux->data[0], (size_t) len);
-	b0->data[d + len] = (unsigned char) '\0';
-	b0->slen = d + len;
-	if (aux != b1) bdestroy (aux);
-	return BSTR_OK;
+    bBlockCopy (&b0->data[d], &aux->data[0], (size_t) len);
+    b0->data[d + len] = (unsigned char) '\0';
+    b0->slen = d + len;
+    if (aux != b1) bdestroy (aux);
+    return BSTR_OK;
 }
 
 /*  int bconchar (bstring b, char c)
@@ -345,13 +345,13 @@ bstring aux = (bstring) b1;
 int bconchar (bstring b, char c) {
 int d;
 
-	if (b == NULL) return BSTR_ERR;
-	d = b->slen;
-	if ((d | (b->mlen - d)) < 0 || balloc (b, d + 2) != BSTR_OK) return BSTR_ERR;
-	b->data[d] = (unsigned char) c;
-	b->data[d + 1] = (unsigned char) '\0';
-	b->slen++;
-	return BSTR_OK;
+    if (b == NULL) return BSTR_ERR;
+    d = b->slen;
+    if ((d | (b->mlen - d)) < 0 || balloc (b, d + 2) != BSTR_OK) return BSTR_ERR;
+    b->data[d] = (unsigned char) c;
+    b->data[d + 1] = (unsigned char) '\0';
+    b->slen++;
+    return BSTR_OK;
 }
 
 /*  int bcatcstr (bstring b, const char * s)
@@ -362,22 +362,22 @@ int bcatcstr (bstring b, const char * s) {
 char * d;
 int i, l;
 
-	if (b == NULL || b->data == NULL || b->slen < 0 || b->mlen < b->slen
-	 || b->mlen <= 0 || s == NULL) return BSTR_ERR;
+    if (b == NULL || b->data == NULL || b->slen < 0 || b->mlen < b->slen
+     || b->mlen <= 0 || s == NULL) return BSTR_ERR;
 
-	/* Optimistically concatenate directly */
-	l = b->mlen - b->slen;
-	d = (char *) &b->data[b->slen];
-	for (i=0; i < l; i++) {
-		if ((*d++ = *s++) == '\0') {
-			b->slen += i;
-			return BSTR_OK;
-		}
-	}
-	b->slen += i;
+    /* Optimistically concatenate directly */
+    l = b->mlen - b->slen;
+    d = (char *) &b->data[b->slen];
+    for (i=0; i < l; i++) {
+        if ((*d++ = *s++) == '\0') {
+            b->slen += i;
+            return BSTR_OK;
+        }
+    }
+    b->slen += i;
 
-	/* Need to explicitely resize and concatenate tail */
-	return bcatblk (b, (const void *) s, (int) strlen (s));
+    /* Need to explicitely resize and concatenate tail */
+    return bcatblk (b, (const void *) s, (int) strlen (s));
 }
 
 /*  int bcatblk (bstring b, const void * s, int len)
@@ -387,16 +387,16 @@ int i, l;
 int bcatblk (bstring b, const void * s, int len) {
 int nl;
 
-	if (b == NULL || b->data == NULL || b->slen < 0 || b->mlen < b->slen
-	 || b->mlen <= 0 || s == NULL || len < 0) return BSTR_ERR;
+    if (b == NULL || b->data == NULL || b->slen < 0 || b->mlen < b->slen
+     || b->mlen <= 0 || s == NULL || len < 0) return BSTR_ERR;
 
-	if (0 > (nl = b->slen + len)) return BSTR_ERR; /* Overflow? */
-	if (b->mlen <= nl && 0 > balloc (b, nl + 1)) return BSTR_ERR;
+    if (0 > (nl = b->slen + len)) return BSTR_ERR; /* Overflow? */
+    if (b->mlen <= nl && 0 > balloc (b, nl + 1)) return BSTR_ERR;
 
-	bBlockCopy (&b->data[b->slen], s, (size_t) len);
-	b->slen = nl;
-	b->data[nl] = (unsigned char) '\0';
-	return BSTR_OK;
+    bBlockCopy (&b->data[b->slen], s, (size_t) len);
+    b->slen = nl;
+    b->data[nl] = (unsigned char) '\0';
+    return BSTR_OK;
 }
 
 /*  bstring bstrcpy (const_bstring b)
@@ -407,36 +407,36 @@ bstring bstrcpy (const_bstring b) {
 bstring b0;
 int i,j;
 
-	/* Attempted to copy an invalid string? */
-	if (b == NULL || b->slen < 0 || b->data == NULL) return NULL;
+    /* Attempted to copy an invalid string? */
+    if (b == NULL || b->slen < 0 || b->data == NULL) return NULL;
 
-	b0 = (bstring) bstr__alloc (sizeof (struct tagbstring));
-	if (b0 == NULL) {
-		/* Unable to allocate memory for string header */
-		return NULL;
-	}
+    b0 = (bstring) bstr__alloc (sizeof (struct tagbstring));
+    if (b0 == NULL) {
+        /* Unable to allocate memory for string header */
+        return NULL;
+    }
 
-	i = b->slen;
-	j = snapUpSize (i + 1);
+    i = b->slen;
+    j = snapUpSize (i + 1);
 
-	b0->data = (unsigned char *) bstr__alloc (j);
-	if (b0->data == NULL) {
-		j = i + 1;
-		b0->data = (unsigned char *) bstr__alloc (j);
-		if (b0->data == NULL) {
-			/* Unable to allocate memory for string data */
-			bstr__free (b0);
-			return NULL;
-		}
-	}
+    b0->data = (unsigned char *) bstr__alloc (j);
+    if (b0->data == NULL) {
+        j = i + 1;
+        b0->data = (unsigned char *) bstr__alloc (j);
+        if (b0->data == NULL) {
+            /* Unable to allocate memory for string data */
+            bstr__free (b0);
+            return NULL;
+        }
+    }
 
-	b0->mlen = j;
-	b0->slen = i;
+    b0->mlen = j;
+    b0->slen = i;
 
-	if (i) bstr__memcpy ((char *) b0->data, (char *) b->data, i);
-	b0->data[b0->slen] = (unsigned char) '\0';
+    if (i) bstr__memcpy ((char *) b0->data, (char *) b->data, i);
+    b0->data[b0->slen] = (unsigned char) '\0';
 
-	return b0;
+    return b0;
 }
 
 /*  int bassign (bstring a, const_bstring b)
@@ -444,19 +444,19 @@ int i,j;
  *  Overwrite the string a with the contents of string b.
  */
 int bassign (bstring a, const_bstring b) {
-	if (b == NULL || b->data == NULL || b->slen < 0)
-		return BSTR_ERR;
-	if (b->slen != 0) {
-		if (balloc (a, b->slen) != BSTR_OK) return BSTR_ERR;
-		bstr__memmove (a->data, b->data, b->slen);
-	} else {
-		if (a == NULL || a->data == NULL || a->mlen < a->slen || 
-		    a->slen < 0 || a->mlen == 0) 
-			return BSTR_ERR;
-	}
-	a->data[b->slen] = (unsigned char) '\0';
-	a->slen = b->slen;
-	return BSTR_OK;
+    if (b == NULL || b->data == NULL || b->slen < 0)
+        return BSTR_ERR;
+    if (b->slen != 0) {
+        if (balloc (a, b->slen) != BSTR_OK) return BSTR_ERR;
+        bstr__memmove (a->data, b->data, b->slen);
+    } else {
+        if (a == NULL || a->data == NULL || a->mlen < a->slen || 
+            a->slen < 0 || a->mlen == 0) 
+            return BSTR_ERR;
+    }
+    a->data[b->slen] = (unsigned char) '\0';
+    a->slen = b->slen;
+    return BSTR_OK;
 }
 
 /*  int bassignmidstr (bstring a, const_bstring b, int left, int len)
@@ -466,29 +466,29 @@ int bassign (bstring a, const_bstring b) {
  *  len are clamped to the ends of b as with the function bmidstr.
  */
 int bassignmidstr (bstring a, const_bstring b, int left, int len) {
-	if (b == NULL || b->data == NULL || b->slen < 0)
-		return BSTR_ERR;
+    if (b == NULL || b->data == NULL || b->slen < 0)
+        return BSTR_ERR;
 
-	if (left < 0) {
-		len += left;
-		left = 0;
-	}
+    if (left < 0) {
+        len += left;
+        left = 0;
+    }
 
-	if (len > b->slen - left) len = b->slen - left;
+    if (len > b->slen - left) len = b->slen - left;
 
-	if (a == NULL || a->data == NULL || a->mlen < a->slen ||
-	    a->slen < 0 || a->mlen == 0)
-		return BSTR_ERR;
+    if (a == NULL || a->data == NULL || a->mlen < a->slen ||
+        a->slen < 0 || a->mlen == 0)
+        return BSTR_ERR;
 
-	if (len > 0) {
-		if (balloc (a, len) != BSTR_OK) return BSTR_ERR;
-		bstr__memmove (a->data, b->data + left, len);
-		a->slen = len;
-	} else {
-		a->slen = 0;
-	}
-	a->data[a->slen] = (unsigned char) '\0';
-	return BSTR_OK;
+    if (len > 0) {
+        if (balloc (a, len) != BSTR_OK) return BSTR_ERR;
+        bstr__memmove (a->data, b->data + left, len);
+        a->slen = len;
+    } else {
+        a->slen = 0;
+    }
+    a->data[a->slen] = (unsigned char) '\0';
+    return BSTR_OK;
 }
 
 /*  int bassigncstr (bstring a, const char * str)
@@ -500,24 +500,24 @@ int bassignmidstr (bstring a, const_bstring b, int left, int len) {
 int bassigncstr (bstring a, const char * str) {
 int i;
 size_t len;
-	if (a == NULL || a->data == NULL || a->mlen < a->slen ||
-	    a->slen < 0 || a->mlen == 0 || NULL == str) 
-		return BSTR_ERR;
-
-	for (i=0; i < a->mlen; i++) {
-		if ('\0' == (a->data[i] = str[i])) {
-			a->slen = i;
-			return BSTR_OK;
-		}
-	}
-
-	a->slen = i;
-	len = strlen (str + i);
-	if (len > INT_MAX || i + len + 1 > INT_MAX ||
-	    0 > balloc (a, (int) (i + len + 1))) return BSTR_ERR;
-	bBlockCopy (a->data + i, str + i, (size_t) len + 1);
-	a->slen += (int) len;
-	return BSTR_OK;
+    if (a == NULL || a->data == NULL || a->mlen < a->slen ||
+        a->slen < 0 || a->mlen == 0 || NULL == str) 
+        return BSTR_ERR;
+
+    for (i=0; i < a->mlen; i++) {
+        if ('\0' == (a->data[i] = str[i])) {
+            a->slen = i;
+            return BSTR_OK;
+        }
+    }
+
+    a->slen = i;
+    len = strlen (str + i);
+    if (len > INT_MAX || i + len + 1 > INT_MAX ||
+        0 > balloc (a, (int) (i + len + 1))) return BSTR_ERR;
+    bBlockCopy (a->data + i, str + i, (size_t) len + 1);
+    a->slen += (int) len;
+    return BSTR_OK;
 }
 
 /*  int bassignblk (bstring a, const void * s, int len)
@@ -527,14 +527,14 @@ size_t len;
  *  occurs BSTR_ERR is returned and a is not overwritten.
  */
 int bassignblk (bstring a, const void * s, int len) {
-	if (a == NULL || a->data == NULL || a->mlen < a->slen ||
-	    a->slen < 0 || a->mlen == 0 || NULL == s || len + 1 < 1) 
-		return BSTR_ERR;
-	if (len + 1 > a->mlen && 0 > balloc (a, len + 1)) return BSTR_ERR;
-	bBlockCopy (a->data, s, (size_t) len);
-	a->data[len] = (unsigned char) '\0';
-	a->slen = len;
-	return BSTR_OK;
+    if (a == NULL || a->data == NULL || a->mlen < a->slen ||
+        a->slen < 0 || a->mlen == 0 || NULL == s || len + 1 < 1) 
+        return BSTR_ERR;
+    if (len + 1 > a->mlen && 0 > balloc (a, len + 1)) return BSTR_ERR;
+    bBlockCopy (a->data, s, (size_t) len);
+    a->data[len] = (unsigned char) '\0';
+    a->slen = len;
+    return BSTR_OK;
 }
 
 /*  int btrunc (bstring b, int n)
@@ -542,13 +542,13 @@ int bassignblk (bstring a, const void * s, int len) {
  *  Truncate the bstring to at most n characters.
  */
 int btrunc (bstring b, int n) {
-	if (n < 0 || b == NULL || b->data == NULL || b->mlen < b->slen ||
-	    b->slen < 0 || b->mlen <= 0) return BSTR_ERR;
-	if (b->slen > n) {
-		b->slen = n;
-		b->data[n] = (unsigned char) '\0';
-	}
-	return BSTR_OK;
+    if (n < 0 || b == NULL || b->data == NULL || b->mlen < b->slen ||
+        b->slen < 0 || b->mlen <= 0) return BSTR_ERR;
+    if (b->slen > n) {
+        b->slen = n;
+        b->data[n] = (unsigned char) '\0';
+    }
+    return BSTR_OK;
 }
 
 #define   upcase(c) (toupper ((unsigned char) c))
@@ -561,12 +561,12 @@ int btrunc (bstring b, int n) {
  */
 int btoupper (bstring b) {
 int i, len;
-	if (b == NULL || b->data == NULL || b->mlen < b->slen ||
-	    b->slen < 0 || b->mlen <= 0) return BSTR_ERR;
-	for (i=0, len = b->slen; i < len; i++) {
-		b->data[i] = (unsigned char) upcase (b->data[i]);
-	}
-	return BSTR_OK;
+    if (b == NULL || b->data == NULL || b->mlen < b->slen ||
+        b->slen < 0 || b->mlen <= 0) return BSTR_ERR;
+    for (i=0, len = b->slen; i < len; i++) {
+        b->data[i] = (unsigned char) upcase (b->data[i]);
+    }
+    return BSTR_OK;
 }
 
 /*  int btolower (bstring b)
@@ -575,12 +575,12 @@ int i, len;
  */
 int btolower (bstring b) {
 int i, len;
-	if (b == NULL || b->data == NULL || b->mlen < b->slen ||
-	    b->slen < 0 || b->mlen <= 0) return BSTR_ERR;
-	for (i=0, len = b->slen; i < len; i++) {
-		b->data[i] = (unsigned char) downcase (b->data[i]);
-	}
-	return BSTR_OK;
+    if (b == NULL || b->data == NULL || b->mlen < b->slen ||
+        b->slen < 0 || b->mlen <= 0) return BSTR_ERR;
+    for (i=0, len = b->slen; i < len; i++) {
+        b->data[i] = (unsigned char) downcase (b->data[i]);
+    }
+    return BSTR_OK;
 }
 
 /*  int bstricmp (const_bstring b0, const_bstring b1)
@@ -595,28 +595,28 @@ int i, len;
 int bstricmp (const_bstring b0, const_bstring b1) {
 int i, v, n;
 
-	if (bdata (b0) == NULL || b0->slen < 0 || 
-	    bdata (b1) == NULL || b1->slen < 0) return SHRT_MIN;
-	if ((n = b0->slen) > b1->slen) n = b1->slen;
-	else if (b0->slen == b1->slen && b0->data == b1->data) return BSTR_OK;
-
-	for (i = 0; i < n; i ++) {
-		v  = (char) downcase (b0->data[i])
-		   - (char) downcase (b1->data[i]);
-		if (0 != v) return v;
-	}
-
-	if (b0->slen > n) {
-		v = (char) downcase (b0->data[n]);
-		if (v) return v;
-		return UCHAR_MAX + 1;
-	}
-	if (b1->slen > n) {
-		v = - (char) downcase (b1->data[n]);
-		if (v) return v;
-		return - (int) (UCHAR_MAX + 1);
-	}
-	return BSTR_OK;
+    if (bdata (b0) == NULL || b0->slen < 0 || 
+        bdata (b1) == NULL || b1->slen < 0) return SHRT_MIN;
+    if ((n = b0->slen) > b1->slen) n = b1->slen;
+    else if (b0->slen == b1->slen && b0->data == b1->data) return BSTR_OK;
+
+    for (i = 0; i < n; i ++) {
+        v  = (char) downcase (b0->data[i])
+           - (char) downcase (b1->data[i]);
+        if (0 != v) return v;
+    }
+
+    if (b0->slen > n) {
+        v = (char) downcase (b0->data[n]);
+        if (v) return v;
+        return UCHAR_MAX + 1;
+    }
+    if (b1->slen > n) {
+        v = - (char) downcase (b1->data[n]);
+        if (v) return v;
+        return - (int) (UCHAR_MAX + 1);
+    }
+    return BSTR_OK;
 }
 
 /*  int bstrnicmp (const_bstring b0, const_bstring b1, int n)
@@ -632,31 +632,31 @@ int i, v, n;
 int bstrnicmp (const_bstring b0, const_bstring b1, int n) {
 int i, v, m;
 
-	if (bdata (b0) == NULL || b0->slen < 0 || 
-	    bdata (b1) == NULL || b1->slen < 0 || n < 0) return SHRT_MIN;
-	m = n;
-	if (m > b0->slen) m = b0->slen;
-	if (m > b1->slen) m = b1->slen;
+    if (bdata (b0) == NULL || b0->slen < 0 || 
+        bdata (b1) == NULL || b1->slen < 0 || n < 0) return SHRT_MIN;
+    m = n;
+    if (m > b0->slen) m = b0->slen;
+    if (m > b1->slen) m = b1->slen;
 
-	if (b0->data != b1->data) {
-		for (i = 0; i < m; i ++) {
-			v  = (char) downcase (b0->data[i]);
-			v -= (char) downcase (b1->data[i]);
-			if (v != 0) return b0->data[i] - b1->data[i];
-		}
-	}
+    if (b0->data != b1->data) {
+        for (i = 0; i < m; i ++) {
+            v  = (char) downcase (b0->data[i]);
+            v -= (char) downcase (b1->data[i]);
+            if (v != 0) return b0->data[i] - b1->data[i];
+        }
+    }
 
-	if (n == m || b0->slen == b1->slen) return BSTR_OK;
+    if (n == m || b0->slen == b1->slen) return BSTR_OK;
 
-	if (b0->slen > m) {
-		v = (char) downcase (b0->data[m]);
-		if (v) return v;
-		return UCHAR_MAX + 1;
-	}
+    if (b0->slen > m) {
+        v = (char) downcase (b0->data[m]);
+        if (v) return v;
+        return UCHAR_MAX + 1;
+    }
 
-	v = - (char) downcase (b1->data[m]);
-	if (v) return v;
-	return - (int) (UCHAR_MAX + 1);
+    v = - (char) downcase (b1->data[m]);
+    if (v) return v;
+    return - (int) (UCHAR_MAX + 1);
 }
 
 /*  int biseqcaseless (const_bstring b0, const_bstring b1)
@@ -670,17 +670,17 @@ int i, v, m;
 int biseqcaseless (const_bstring b0, const_bstring b1) {
 int i, n;
 
-	if (bdata (b0) == NULL || b0->slen < 0 || 
-	    bdata (b1) == NULL || b1->slen < 0) return BSTR_ERR;
-	if (b0->slen != b1->slen) return BSTR_OK;
-	if (b0->data == b1->data || b0->slen == 0) return 1;
-	for (i=0, n=b0->slen; i < n; i++) {
-		if (b0->data[i] != b1->data[i]) {
-			unsigned char c = (unsigned char) downcase (b0->data[i]);
-			if (c != (unsigned char) downcase (b1->data[i])) return 0;
-		}
-	}
-	return 1;
+    if (bdata (b0) == NULL || b0->slen < 0 || 
+        bdata (b1) == NULL || b1->slen < 0) return BSTR_ERR;
+    if (b0->slen != b1->slen) return BSTR_OK;
+    if (b0->data == b1->data || b0->slen == 0) return 1;
+    for (i=0, n=b0->slen; i < n; i++) {
+        if (b0->data[i] != b1->data[i]) {
+            unsigned char c = (unsigned char) downcase (b0->data[i]);
+            if (c != (unsigned char) downcase (b1->data[i])) return 0;
+        }
+    }
+    return 1;
 }
 
 /*  int bisstemeqcaselessblk (const_bstring b0, const void * blk, int len)
@@ -695,18 +695,18 @@ int i, n;
 int bisstemeqcaselessblk (const_bstring b0, const void * blk, int len) {
 int i;
 
-	if (bdata (b0) == NULL || b0->slen < 0 || NULL == blk || len < 0)
-		return BSTR_ERR;
-	if (b0->slen < len) return BSTR_OK;
-	if (b0->data == (const unsigned char *) blk || len == 0) return 1;
+    if (bdata (b0) == NULL || b0->slen < 0 || NULL == blk || len < 0)
+        return BSTR_ERR;
+    if (b0->slen < len) return BSTR_OK;
+    if (b0->data == (const unsigned char *) blk || len == 0) return 1;
 
-	for (i = 0; i < len; i ++) {
-		if (b0->data[i] != ((const unsigned char *) blk)[i]) {
-			if (downcase (b0->data[i]) != 
-			    downcase (((const unsigned char *) blk)[i])) return 0;
-		}
-	}
-	return 1;
+    for (i = 0; i < len; i ++) {
+        if (b0->data[i] != ((const unsigned char *) blk)[i]) {
+            if (downcase (b0->data[i]) != 
+                downcase (((const unsigned char *) blk)[i])) return 0;
+        }
+    }
+    return 1;
 }
 
 /*
@@ -717,18 +717,18 @@ int i;
 int bltrimws (bstring b) {
 int i, len;
 
-	if (b == NULL || b->data == NULL || b->mlen < b->slen ||
-	    b->slen < 0 || b->mlen <= 0) return BSTR_ERR;
+    if (b == NULL || b->data == NULL || b->mlen < b->slen ||
+        b->slen < 0 || b->mlen <= 0) return BSTR_ERR;
 
-	for (len = b->slen, i = 0; i < len; i++) {
-		if (!wspace (b->data[i])) {
-			return bdelete (b, 0, i);
-		}
-	}
+    for (len = b->slen, i = 0; i < len; i++) {
+        if (!wspace (b->data[i])) {
+            return bdelete (b, 0, i);
+        }
+    }
 
-	b->data[0] = (unsigned char) '\0';
-	b->slen = 0;
-	return BSTR_OK;
+    b->data[0] = (unsigned char) '\0';
+    b->slen = 0;
+    return BSTR_OK;
 }
 
 /*
@@ -739,20 +739,20 @@ int i, len;
 int brtrimws (bstring b) {
 int i;
 
-	if (b == NULL || b->data == NULL || b->mlen < b->slen ||
-	    b->slen < 0 || b->mlen <= 0) return BSTR_ERR;
+    if (b == NULL || b->data == NULL || b->mlen < b->slen ||
+        b->slen < 0 || b->mlen <= 0) return BSTR_ERR;
 
-	for (i = b->slen - 1; i >= 0; i--) {
-		if (!wspace (b->data[i])) {
-			if (b->mlen > i) b->data[i+1] = (unsigned char) '\0';
-			b->slen = i + 1;
-			return BSTR_OK;
-		}
-	}
+    for (i = b->slen - 1; i >= 0; i--) {
+        if (!wspace (b->data[i])) {
+            if (b->mlen > i) b->data[i+1] = (unsigned char) '\0';
+            b->slen = i + 1;
+            return BSTR_OK;
+        }
+    }
 
-	b->data[0] = (unsigned char) '\0';
-	b->slen = 0;
-	return BSTR_OK;
+    b->data[0] = (unsigned char) '\0';
+    b->slen = 0;
+    return BSTR_OK;
 }
 
 /*
@@ -763,21 +763,21 @@ int i;
 int btrimws (bstring b) {
 int i, j;
 
-	if (b == NULL || b->data == NULL || b->mlen < b->slen ||
-	    b->slen < 0 || b->mlen <= 0) return BSTR_ERR;
+    if (b == NULL || b->data == NULL || b->mlen < b->slen ||
+        b->slen < 0 || b->mlen <= 0) return BSTR_ERR;
 
-	for (i = b->slen - 1; i >= 0; i--) {
-		if (!wspace (b->data[i])) {
-			if (b->mlen > i) b->data[i+1] = (unsigned char) '\0';
-			b->slen = i + 1;
-			for (j = 0; wspace (b->data[j]); j++) {}
-			return bdelete (b, 0, j);
-		}
-	}
+    for (i = b->slen - 1; i >= 0; i--) {
+        if (!wspace (b->data[i])) {
+            if (b->mlen > i) b->data[i+1] = (unsigned char) '\0';
+            b->slen = i + 1;
+            for (j = 0; wspace (b->data[j]); j++) {}
+            return bdelete (b, 0, j);
+        }
+    }
 
-	b->data[0] = (unsigned char) '\0';
-	b->slen = 0;
-	return BSTR_OK;
+    b->data[0] = (unsigned char) '\0';
+    b->slen = 0;
+    return BSTR_OK;
 }
 
 /*  int biseq (const_bstring b0, const_bstring b1)
@@ -788,11 +788,11 @@ int i, j;
  *  O(1).  '\0' termination characters are not treated in any special way.
  */
 int biseq (const_bstring b0, const_bstring b1) {
-	if (b0 == NULL || b1 == NULL || b0->data == NULL || b1->data == NULL ||
-		b0->slen < 0 || b1->slen < 0) return BSTR_ERR;
-	if (b0->slen != b1->slen) return BSTR_OK;
-	if (b0->data == b1->data || b0->slen == 0) return 1;
-	return !bstr__memcmp (b0->data, b1->data, b0->slen);
+    if (b0 == NULL || b1 == NULL || b0->data == NULL || b1->data == NULL ||
+        b0->slen < 0 || b1->slen < 0) return BSTR_ERR;
+    if (b0->slen != b1->slen) return BSTR_OK;
+    if (b0->data == b1->data || b0->slen == 0) return 1;
+    return !bstr__memcmp (b0->data, b1->data, b0->slen);
 }
 
 /*  int bisstemeqblk (const_bstring b0, const void * blk, int len)
@@ -806,15 +806,15 @@ int biseq (const_bstring b0, const_bstring b1) {
 int bisstemeqblk (const_bstring b0, const void * blk, int len) {
 int i;
 
-	if (bdata (b0) == NULL || b0->slen < 0 || NULL == blk || len < 0)
-		return BSTR_ERR;
-	if (b0->slen < len) return BSTR_OK;
-	if (b0->data == (const unsigned char *) blk || len == 0) return 1;
+    if (bdata (b0) == NULL || b0->slen < 0 || NULL == blk || len < 0)
+        return BSTR_ERR;
+    if (b0->slen < len) return BSTR_OK;
+    if (b0->data == (const unsigned char *) blk || len == 0) return 1;
 
-	for (i = 0; i < len; i ++) {
-		if (b0->data[i] != ((const unsigned char *) blk)[i]) return BSTR_OK;
-	}
-	return 1;
+    for (i = 0; i < len; i ++) {
+        if (b0->data[i] != ((const unsigned char *) blk)[i]) return BSTR_OK;
+    }
+    return 1;
 }
 
 /*  int biseqcstr (const_bstring b, const char *s)
@@ -830,11 +830,11 @@ int i;
  */
 int biseqcstr (const_bstring b, const char * s) {
 int i;
-	if (b == NULL || s == NULL || b->data == NULL || b->slen < 0) return BSTR_ERR;
-	for (i=0; i < b->slen; i++) {
-		if (s[i] == '\0' || b->data[i] != (unsigned char) s[i]) return BSTR_OK;
-	}
-	return s[i] == '\0';
+    if (b == NULL || s == NULL || b->data == NULL || b->slen < 0) return BSTR_ERR;
+    for (i=0; i < b->slen; i++) {
+        if (s[i] == '\0' || b->data[i] != (unsigned char) s[i]) return BSTR_OK;
+    }
+    return s[i] == '\0';
 }
 
 /*  int biseqcstrcaseless (const_bstring b, const char *s)
@@ -851,14 +851,14 @@ int i;
  */
 int biseqcstrcaseless (const_bstring b, const char * s) {
 int i;
-	if (b == NULL || s == NULL || b->data == NULL || b->slen < 0) return BSTR_ERR;
-	for (i=0; i < b->slen; i++) {
-		if (s[i] == '\0' || 
-		    (b->data[i] != (unsigned char) s[i] && 
-		     downcase (b->data[i]) != (unsigned char) downcase (s[i])))
-			return BSTR_OK;
-	}
-	return s[i] == '\0';
+    if (b == NULL || s == NULL || b->data == NULL || b->slen < 0) return BSTR_ERR;
+    for (i=0; i < b->slen; i++) {
+        if (s[i] == '\0' || 
+            (b->data[i] != (unsigned char) s[i] && 
+             downcase (b->data[i]) != (unsigned char) downcase (s[i])))
+            return BSTR_OK;
+    }
+    return s[i] == '\0';
 }
 
 /*  int bstrcmp (const_bstring b0, const_bstring b1)
@@ -878,21 +878,21 @@ int i;
 int bstrcmp (const_bstring b0, const_bstring b1) {
 int i, v, n;
 
-	if (b0 == NULL || b1 == NULL || b0->data == NULL || b1->data == NULL ||
-		b0->slen < 0 || b1->slen < 0) return SHRT_MIN;
-	n = b0->slen; if (n > b1->slen) n = b1->slen;
-	if (b0->slen == b1->slen && (b0->data == b1->data || b0->slen == 0))
-		return BSTR_OK;
+    if (b0 == NULL || b1 == NULL || b0->data == NULL || b1->data == NULL ||
+        b0->slen < 0 || b1->slen < 0) return SHRT_MIN;
+    n = b0->slen; if (n > b1->slen) n = b1->slen;
+    if (b0->slen == b1->slen && (b0->data == b1->data || b0->slen == 0))
+        return BSTR_OK;
 
-	for (i = 0; i < n; i ++) {
-		v = ((char) b0->data[i]) - ((char) b1->data[i]);
-		if (v != 0) return v;
-		if (b0->data[i] == (unsigned char) '\0') return BSTR_OK;
-	}
+    for (i = 0; i < n; i ++) {
+        v = ((char) b0->data[i]) - ((char) b1->data[i]);
+        if (v != 0) return v;
+        if (b0->data[i] == (unsigned char) '\0') return BSTR_OK;
+    }
 
-	if (b0->slen > n) return 1;
-	if (b1->slen > n) return -1;
-	return BSTR_OK;
+    if (b0->slen > n) return 1;
+    if (b1->slen > n) return -1;
+    return BSTR_OK;
 }
 
 /*  int bstrncmp (const_bstring b0, const_bstring b1, int n)
@@ -908,24 +908,24 @@ int i, v, n;
 int bstrncmp (const_bstring b0, const_bstring b1, int n) {
 int i, v, m;
 
-	if (b0 == NULL || b1 == NULL || b0->data == NULL || b1->data == NULL ||
-		b0->slen < 0 || b1->slen < 0) return SHRT_MIN;
-	m = n;
-	if (m > b0->slen) m = b0->slen;
-	if (m > b1->slen) m = b1->slen;
+    if (b0 == NULL || b1 == NULL || b0->data == NULL || b1->data == NULL ||
+        b0->slen < 0 || b1->slen < 0) return SHRT_MIN;
+    m = n;
+    if (m > b0->slen) m = b0->slen;
+    if (m > b1->slen) m = b1->slen;
 
-	if (b0->data != b1->data) {
-		for (i = 0; i < m; i ++) {
-			v = ((char) b0->data[i]) - ((char) b1->data[i]);
-			if (v != 0) return v;
-			if (b0->data[i] == (unsigned char) '\0') return BSTR_OK;
-		}
-	}
+    if (b0->data != b1->data) {
+        for (i = 0; i < m; i ++) {
+            v = ((char) b0->data[i]) - ((char) b1->data[i]);
+            if (v != 0) return v;
+            if (b0->data[i] == (unsigned char) '\0') return BSTR_OK;
+        }
+    }
 
-	if (n == m || b0->slen == b1->slen) return BSTR_OK;
+    if (n == m || b0->slen == b1->slen) return BSTR_OK;
 
-	if (b0->slen > m) return 1;
-	return -1;
+    if (b0->slen > m) return 1;
+    return -1;
 }
 
 /*  bstring bmidstr (const_bstring b, int left, int len)
@@ -937,17 +937,17 @@ int i, v, m;
  */
 bstring bmidstr (const_bstring b, int left, int len) {
 
-	if (b == NULL || b->slen < 0 || b->data == NULL) return NULL;
+    if (b == NULL || b->slen < 0 || b->data == NULL) return NULL;
 
-	if (left < 0) {
-		len += left;
-		left = 0;
-	}
+    if (left < 0) {
+        len += left;
+        left = 0;
+    }
 
-	if (len > b->slen - left) len = b->slen - left;
+    if (len > b->slen - left) len = b->slen - left;
 
-	if (len <= 0) return bfromcstr ("");
-	return blk2bstr (b->data + left, len);
+    if (len <= 0) return bfromcstr ("");
+    return blk2bstr (b->data + left, len);
 }
 
 /*  int bdelete (bstring b, int pos, int len)
@@ -958,27 +958,27 @@ bstring bmidstr (const_bstring b, int left, int len) {
  *  len) is clamped to boundaries of the bstring b.
  */
 int bdelete (bstring b, int pos, int len) {
-	/* Clamp to left side of bstring */
-	if (pos < 0) {
-		len += pos;
-		pos = 0;
-	}
-
-	if (len < 0 || b == NULL || b->data == NULL || b->slen < 0 || 
-	    b->mlen < b->slen || b->mlen <= 0) 
-		return BSTR_ERR;
-	if (len > 0 && pos < b->slen) {
-		if (pos + len >= b->slen) {
-			b->slen = pos;
-		} else {
-			bBlockCopy ((char *) (b->data + pos),
-			            (char *) (b->data + pos + len), 
-			            b->slen - (pos+len));
-			b->slen -= len;
-		}
-		b->data[b->slen] = (unsigned char) '\0';
-	}
-	return BSTR_OK;
+    /* Clamp to left side of bstring */
+    if (pos < 0) {
+        len += pos;
+        pos = 0;
+    }
+
+    if (len < 0 || b == NULL || b->data == NULL || b->slen < 0 || 
+        b->mlen < b->slen || b->mlen <= 0) 
+        return BSTR_ERR;
+    if (len > 0 && pos < b->slen) {
+        if (pos + len >= b->slen) {
+            b->slen = pos;
+        } else {
+            bBlockCopy ((char *) (b->data + pos),
+                        (char *) (b->data + pos + len), 
+                        b->slen - (pos+len));
+            b->slen -= len;
+        }
+        b->data[b->slen] = (unsigned char) '\0';
+    }
+    return BSTR_OK;
 }
 
 /*  int bdestroy (bstring b)
@@ -989,21 +989,21 @@ int bdelete (bstring b, int pos, int len) {
  *  been bdestroyed is undefined.
  */
 int bdestroy (bstring b) {
-	if (b == NULL || b->slen < 0 || b->mlen <= 0 || b->mlen < b->slen ||
-	    b->data == NULL)
-		return BSTR_ERR;
+    if (b == NULL || b->slen < 0 || b->mlen <= 0 || b->mlen < b->slen ||
+        b->data == NULL)
+        return BSTR_ERR;
 
-	bstr__free (b->data);
+    bstr__free (b->data);
 
-	/* In case there is any stale usage, there is one more chance to 
-	   notice this error. */
+    /* In case there is any stale usage, there is one more chance to 
+       notice this error. */
 
-	b->slen = -1;
-	b->mlen = -__LINE__;
-	b->data = NULL;
+    b->slen = -1;
+    b->mlen = -__LINE__;
+    b->data = NULL;
 
-	bstr__free (b);
-	return BSTR_OK;
+    bstr__free (b);
+    return BSTR_OK;
 }
 
 /*  int binstr (const_bstring b1, int pos, const_bstring b2)
@@ -1023,74 +1023,74 @@ register unsigned char * d1;
 register unsigned char c1;
 register int i;
 
-	if (b1 == NULL || b1->data == NULL || b1->slen < 0 ||
-	    b2 == NULL || b2->data == NULL || b2->slen < 0) return BSTR_ERR;
-	if (b1->slen == pos) return (b2->slen == 0)?pos:BSTR_ERR;
-	if (b1->slen < pos || pos < 0) return BSTR_ERR;
-	if (b2->slen == 0) return pos;
+    if (b1 == NULL || b1->data == NULL || b1->slen < 0 ||
+        b2 == NULL || b2->data == NULL || b2->slen < 0) return BSTR_ERR;
+    if (b1->slen == pos) return (b2->slen == 0)?pos:BSTR_ERR;
+    if (b1->slen < pos || pos < 0) return BSTR_ERR;
+    if (b2->slen == 0) return pos;
 
-	/* No space to find such a string? */
-	if ((lf = b1->slen - b2->slen + 1) <= pos) return BSTR_ERR;
+    /* No space to find such a string? */
+    if ((lf = b1->slen - b2->slen + 1) <= pos) return BSTR_ERR;
 
-	/* An obvious alias case */
-	if (b1->data == b2->data && pos == 0) return 0;
+    /* An obvious alias case */
+    if (b1->data == b2->data && pos == 0) return 0;
 
-	i = pos;
+    i = pos;
 
-	d0 = b2->data;
-	d1 = b1->data;
-	ll = b2->slen;
+    d0 = b2->data;
+    d1 = b1->data;
+    ll = b2->slen;
 
-	/* Peel off the b2->slen == 1 case */
-	c0 = d0[0];
-	if (1 == ll) {
-		for (;i < lf; i++) if (c0 == d1[i]) return i;
-		return BSTR_ERR;
-	}
+    /* Peel off the b2->slen == 1 case */
+    c0 = d0[0];
+    if (1 == ll) {
+        for (;i < lf; i++) if (c0 == d1[i]) return i;
+        return BSTR_ERR;
+    }
 
-	c1 = c0;
-	j = 0;
-	lf = b1->slen - 1;
+    c1 = c0;
+    j = 0;
+    lf = b1->slen - 1;
 
-	ii = -1;
-	if (i < lf) do {
-		/* Unrolled current character test */
-		if (c1 != d1[i]) {
-			if (c1 != d1[1+i]) {
-				i += 2;
-				continue;
-			}
-			i++;
-		}
+    ii = -1;
+    if (i < lf) do {
+        /* Unrolled current character test */
+        if (c1 != d1[i]) {
+            if (c1 != d1[1+i]) {
+                i += 2;
+                continue;
+            }
+            i++;
+        }
 
-		/* Take note if this is the start of a potential match */
-		if (0 == j) ii = i;
+        /* Take note if this is the start of a potential match */
+        if (0 == j) ii = i;
 
-		/* Shift the test character down by one */
-		j++;
-		i++;
+        /* Shift the test character down by one */
+        j++;
+        i++;
 
-		/* If this isn't past the last character continue */
-		if (j < ll) {
-			c1 = d0[j];
-			continue;
-		}
+        /* If this isn't past the last character continue */
+        if (j < ll) {
+            c1 = d0[j];
+            continue;
+        }
 
-		N0:;
+        N0:;
 
-		/* If no characters mismatched, then we matched */
-		if (i == ii+j) return ii;
+        /* If no characters mismatched, then we matched */
+        if (i == ii+j) return ii;
 
-		/* Shift back to the beginning */
-		i -= j;
-		j  = 0;
-		c1 = c0;
-	} while (i < lf);
+        /* Shift back to the beginning */
+        i -= j;
+        j  = 0;
+        c1 = c0;
+    } while (i < lf);
 
-	/* Deal with last case if unrolling caused a misalignment */
-	if (i == lf && ll == j+1 && c1 == d1[i]) goto N0;
+    /* Deal with last case if unrolling caused a misalignment */
+    if (i == lf && ll == j+1 && c1 == d1[i]) goto N0;
 
-	return BSTR_ERR;
+    return BSTR_ERR;
 }
 
 /*  int binstrr (const_bstring b1, int pos, const_bstring b2)
@@ -1106,38 +1106,38 @@ int binstrr (const_bstring b1, int pos, const_bstring b2) {
 int j, i, l;
 unsigned char * d0, * d1;
 
-	if (b1 == NULL || b1->data == NULL || b1->slen < 0 ||
-	    b2 == NULL || b2->data == NULL || b2->slen < 0) return BSTR_ERR;
-	if (b1->slen == pos && b2->slen == 0) return pos;
-	if (b1->slen < pos || pos < 0) return BSTR_ERR;
-	if (b2->slen == 0) return pos;
+    if (b1 == NULL || b1->data == NULL || b1->slen < 0 ||
+        b2 == NULL || b2->data == NULL || b2->slen < 0) return BSTR_ERR;
+    if (b1->slen == pos && b2->slen == 0) return pos;
+    if (b1->slen < pos || pos < 0) return BSTR_ERR;
+    if (b2->slen == 0) return pos;
 
-	/* Obvious alias case */
-	if (b1->data == b2->data && pos == 0 && b2->slen <= b1->slen) return 0;
+    /* Obvious alias case */
+    if (b1->data == b2->data && pos == 0 && b2->slen <= b1->slen) return 0;
 
-	i = pos;
-	if ((l = b1->slen - b2->slen) < 0) return BSTR_ERR;
+    i = pos;
+    if ((l = b1->slen - b2->slen) < 0) return BSTR_ERR;
 
-	/* If no space to find such a string then snap back */
-	if (l + 1 <= i) i = l;
-	j = 0;
+    /* If no space to find such a string then snap back */
+    if (l + 1 <= i) i = l;
+    j = 0;
 
-	d0 = b2->data;
-	d1 = b1->data;
-	l  = b2->slen;
+    d0 = b2->data;
+    d1 = b1->data;
+    l  = b2->slen;
 
-	for (;;) {
-		if (d0[j] == d1[i + j]) {
-			j ++;
-			if (j >= l) return i;
-		} else {
-			i --;
-			if (i < 0) break;
-			j=0;
-		}
-	}
+    for (;;) {
+        if (d0[j] == d1[i + j]) {
+            j ++;
+            if (j >= l) return i;
+        } else {
+            i --;
+            if (i < 0) break;
+            j=0;
+        }
+    }
 
-	return BSTR_ERR;
+    return BSTR_ERR;
 }
 
 /*  int binstrcaseless (const_bstring b1, int pos, const_bstring b2)
@@ -1153,39 +1153,39 @@ int binstrcaseless (const_bstring b1, int pos, const_bstring b2) {
 int j, i, l, ll;
 unsigned char * d0, * d1;
 
-	if (b1 == NULL || b1->data == NULL || b1->slen < 0 ||
-	    b2 == NULL || b2->data == NULL || b2->slen < 0) return BSTR_ERR;
-	if (b1->slen == pos) return (b2->slen == 0)?pos:BSTR_ERR;
-	if (b1->slen < pos || pos < 0) return BSTR_ERR;
-	if (b2->slen == 0) return pos;
+    if (b1 == NULL || b1->data == NULL || b1->slen < 0 ||
+        b2 == NULL || b2->data == NULL || b2->slen < 0) return BSTR_ERR;
+    if (b1->slen == pos) return (b2->slen == 0)?pos:BSTR_ERR;
+    if (b1->slen < pos || pos < 0) return BSTR_ERR;
+    if (b2->slen == 0) return pos;
 
-	l = b1->slen - b2->slen + 1;
+    l = b1->slen - b2->slen + 1;
 
-	/* No space to find such a string? */
-	if (l <= pos) return BSTR_ERR;
+    /* No space to find such a string? */
+    if (l <= pos) return BSTR_ERR;
 
-	/* An obvious alias case */
-	if (b1->data == b2->data && pos == 0) return BSTR_OK;
+    /* An obvious alias case */
+    if (b1->data == b2->data && pos == 0) return BSTR_OK;
 
-	i = pos;
-	j = 0;
+    i = pos;
+    j = 0;
 
-	d0 = b2->data;
-	d1 = b1->data;
-	ll = b2->slen;
+    d0 = b2->data;
+    d1 = b1->data;
+    ll = b2->slen;
 
-	for (;;) {
-		if (d0[j] == d1[i + j] || downcase (d0[j]) == downcase (d1[i + j])) {
-			j ++;
-			if (j >= ll) return i;
-		} else {
-			i ++;
-			if (i >= l) break;
-			j=0;
-		}
-	}
+    for (;;) {
+        if (d0[j] == d1[i + j] || downcase (d0[j]) == downcase (d1[i + j])) {
+            j ++;
+            if (j >= ll) return i;
+        } else {
+            i ++;
+            if (i >= l) break;
+            j=0;
+        }
+    }
 
-	return BSTR_ERR;
+    return BSTR_ERR;
 }
 
 /*  int binstrrcaseless (const_bstring b1, int pos, const_bstring b2)
@@ -1201,38 +1201,38 @@ int binstrrcaseless (const_bstring b1, int pos, const_bstring b2) {
 int j, i, l;
 unsigned char * d0, * d1;
 
-	if (b1 == NULL || b1->data == NULL || b1->slen < 0 ||
-	    b2 == NULL || b2->data == NULL || b2->slen < 0) return BSTR_ERR;
-	if (b1->slen == pos && b2->slen == 0) return pos;
-	if (b1->slen < pos || pos < 0) return BSTR_ERR;
-	if (b2->slen == 0) return pos;
+    if (b1 == NULL || b1->data == NULL || b1->slen < 0 ||
+        b2 == NULL || b2->data == NULL || b2->slen < 0) return BSTR_ERR;
+    if (b1->slen == pos && b2->slen == 0) return pos;
+    if (b1->slen < pos || pos < 0) return BSTR_ERR;
+    if (b2->slen == 0) return pos;
 
-	/* Obvious alias case */
-	if (b1->data == b2->data && pos == 0 && b2->slen <= b1->slen) return BSTR_OK;
+    /* Obvious alias case */
+    if (b1->data == b2->data && pos == 0 && b2->slen <= b1->slen) return BSTR_OK;
 
-	i = pos;
-	if ((l = b1->slen - b2->slen) < 0) return BSTR_ERR;
+    i = pos;
+    if ((l = b1->slen - b2->slen) < 0) return BSTR_ERR;
 
-	/* If no space to find such a string then snap back */
-	if (l + 1 <= i) i = l;
-	j = 0;
+    /* If no space to find such a string then snap back */
+    if (l + 1 <= i) i = l;
+    j = 0;
 
-	d0 = b2->data;
-	d1 = b1->data;
-	l  = b2->slen;
+    d0 = b2->data;
+    d1 = b1->data;
+    l  = b2->slen;
 
-	for (;;) {
-		if (d0[j] == d1[i + j] || downcase (d0[j]) == downcase (d1[i + j])) {
-			j ++;
-			if (j >= l) return i;
-		} else {
-			i --;
-			if (i < 0) break;
-			j=0;
-		}
-	}
+    for (;;) {
+        if (d0[j] == d1[i + j] || downcase (d0[j]) == downcase (d1[i + j])) {
+            j ++;
+            if (j >= l) return i;
+        } else {
+            i --;
+            if (i < 0) break;
+            j=0;
+        }
+    }
 
-	return BSTR_ERR;
+    return BSTR_ERR;
 }
 
 
@@ -1244,10 +1244,10 @@ unsigned char * d0, * d1;
 int bstrchrp (const_bstring b, int c, int pos) {
 unsigned char * p;
 
-	if (b == NULL || b->data == NULL || b->slen <= pos || pos < 0) return BSTR_ERR;
-	p = (unsigned char *) bstr__memchr ((b->data + pos), (unsigned char) c, (b->slen - pos));
-	if (p) return (int) (p - b->data);
-	return BSTR_ERR;
+    if (b == NULL || b->data == NULL || b->slen <= pos || pos < 0) return BSTR_ERR;
+    p = (unsigned char *) bstr__memchr ((b->data + pos), (unsigned char) c, (b->slen - pos));
+    if (p) return (int) (p - b->data);
+    return BSTR_ERR;
 }
 
 /*  int bstrrchrp (const_bstring b, int c, int pos)
@@ -1258,11 +1258,11 @@ unsigned char * p;
 int bstrrchrp (const_bstring b, int c, int pos) {
 int i;
  
-	if (b == NULL || b->data == NULL || b->slen <= pos || pos < 0) return BSTR_ERR;
-	for (i=pos; i >= 0; i--) {
-		if (b->data[i] == (unsigned char) c) return i;
-	}
-	return BSTR_ERR;
+    if (b == NULL || b->data == NULL || b->slen <= pos || pos < 0) return BSTR_ERR;
+    for (i=pos; i >= 0; i--) {
+        if (b->data[i] == (unsigned char) c) return i;
+    }
+    return BSTR_ERR;
 }
 
 #if !defined (BSTRLIB_AGGRESSIVE_MEMORY_FOR_SPEED_TRADEOFF)
@@ -1274,8 +1274,8 @@ int i;
 struct charField { LONG_TYPE content[CFCLEN]; };
 #define testInCharField(cf,c) ((cf)->content[(c) >> LONG_LOG_BITS_QTY] & (((long)1) << ((c) & (LONG_BITS_QTY-1))))
 #define setInCharField(cf,idx) { \
-	unsigned int c = (unsigned int) (idx); \
-	(cf)->content[c >> LONG_LOG_BITS_QTY] |= (LONG_TYPE) (1ul << (c & (LONG_BITS_QTY-1))); \
+    unsigned int c = (unsigned int) (idx); \
+    (cf)->content[c >> LONG_LOG_BITS_QTY] |= (LONG_TYPE) (1ul << (c & (LONG_BITS_QTY-1))); \
 }
 
 #else
@@ -1290,27 +1290,27 @@ struct charField { unsigned char content[CFCLEN]; };
 /* Convert a bstring to charField */
 static int buildCharField (struct charField * cf, const_bstring b) {
 int i;
-	if (b == NULL || b->data == NULL || b->slen <= 0) return BSTR_ERR;
-	memset ((void *) cf->content, 0, sizeof (struct charField));
-	for (i=0; i < b->slen; i++) {
-		setInCharField (cf, b->data[i]);
-	}
-	return BSTR_OK;
+    if (b == NULL || b->data == NULL || b->slen <= 0) return BSTR_ERR;
+    memset ((void *) cf->content, 0, sizeof (struct charField));
+    for (i=0; i < b->slen; i++) {
+        setInCharField (cf, b->data[i]);
+    }
+    return BSTR_OK;
 }
 
 static void invertCharField (struct charField * cf) {
 int i;
-	for (i=0; i < CFCLEN; i++) cf->content[i] = ~cf->content[i];
+    for (i=0; i < CFCLEN; i++) cf->content[i] = ~cf->content[i];
 }
 
 /* Inner engine for binchr */
 static int binchrCF (const unsigned char * data, int len, int pos, const struct charField * cf) {
 int i;
-	for (i=pos; i < len; i++) {
-		unsigned char c = (unsigned char) data[i];
-		if (testInCharField (cf, c)) return i;
-	}
-	return BSTR_ERR;
+    for (i=pos; i < len; i++) {
+        unsigned char c = (unsigned char) data[i];
+        if (testInCharField (cf, c)) return i;
+    }
+    return BSTR_ERR;
 }
 
 /*  int binchr (const_bstring b0, int pos, const_bstring b1);
@@ -1321,21 +1321,21 @@ int i;
  */
 int binchr (const_bstring b0, int pos, const_bstring b1) {
 struct charField chrs;
-	if (pos < 0 || b0 == NULL || b0->data == NULL ||
-	    b0->slen <= pos) return BSTR_ERR;
-	if (1 == b1->slen) return bstrchrp (b0, b1->data[0], pos);
-	if (0 > buildCharField (&chrs, b1)) return BSTR_ERR;
-	return binchrCF (b0->data, b0->slen, pos, &chrs);
+    if (pos < 0 || b0 == NULL || b0->data == NULL ||
+        b0->slen <= pos) return BSTR_ERR;
+    if (1 == b1->slen) return bstrchrp (b0, b1->data[0], pos);
+    if (0 > buildCharField (&chrs, b1)) return BSTR_ERR;
+    return binchrCF (b0->data, b0->slen, pos, &chrs);
 }
 
 /* Inner engine for binchrr */
 static int binchrrCF (const unsigned char * data, int pos, const struct charField * cf) {
 int i;
-	for (i=pos; i >= 0; i--) {
-		unsigned int c = (unsigned int) data[i];
-		if (testInCharField (cf, c)) return i;
-	}
-	return BSTR_ERR;
+    for (i=pos; i >= 0; i--) {
+        unsigned int c = (unsigned int) data[i];
+        if (testInCharField (cf, c)) return i;
+    }
+    return BSTR_ERR;
 }
 
 /*  int binchrr (const_bstring b0, int pos, const_bstring b1);
@@ -1346,12 +1346,12 @@ int i;
  */
 int binchrr (const_bstring b0, int pos, const_bstring b1) {
 struct charField chrs;
-	if (pos < 0 || b0 == NULL || b0->data == NULL || b1 == NULL ||
-	    b0->slen < pos) return BSTR_ERR;
-	if (pos == b0->slen) pos--;
-	if (1 == b1->slen) return bstrrchrp (b0, b1->data[0], pos);
-	if (0 > buildCharField (&chrs, b1)) return BSTR_ERR;
-	return binchrrCF (b0->data, pos, &chrs);
+    if (pos < 0 || b0 == NULL || b0->data == NULL || b1 == NULL ||
+        b0->slen < pos) return BSTR_ERR;
+    if (pos == b0->slen) pos--;
+    if (1 == b1->slen) return bstrrchrp (b0, b1->data[0], pos);
+    if (0 > buildCharField (&chrs, b1)) return BSTR_ERR;
+    return binchrrCF (b0->data, pos, &chrs);
 }
 
 /*  int bninchr (const_bstring b0, int pos, const_bstring b1);
@@ -1362,11 +1362,11 @@ struct charField chrs;
  */
 int bninchr (const_bstring b0, int pos, const_bstring b1) {
 struct charField chrs;
-	if (pos < 0 || b0 == NULL || b0->data == NULL || 
-	    b0->slen <= pos) return BSTR_ERR;
-	if (buildCharField (&chrs, b1) < 0) return BSTR_ERR;
-	invertCharField (&chrs);
-	return binchrCF (b0->data, b0->slen, pos, &chrs);
+    if (pos < 0 || b0 == NULL || b0->data == NULL || 
+        b0->slen <= pos) return BSTR_ERR;
+    if (buildCharField (&chrs, b1) < 0) return BSTR_ERR;
+    invertCharField (&chrs);
+    return binchrCF (b0->data, b0->slen, pos, &chrs);
 }
 
 /*  int bninchrr (const_bstring b0, int pos, const_bstring b1);
@@ -1377,12 +1377,12 @@ struct charField chrs;
  */
 int bninchrr (const_bstring b0, int pos, const_bstring b1) {
 struct charField chrs;
-	if (pos < 0 || b0 == NULL || b0->data == NULL || 
-	    b0->slen < pos) return BSTR_ERR;
-	if (pos == b0->slen) pos--;
-	if (buildCharField (&chrs, b1) < 0) return BSTR_ERR;
-	invertCharField (&chrs);
-	return binchrrCF (b0->data, pos, &chrs);
+    if (pos < 0 || b0 == NULL || b0->data == NULL || 
+        b0->slen < pos) return BSTR_ERR;
+    if (pos == b0->slen) pos--;
+    if (buildCharField (&chrs, b1) < 0) return BSTR_ERR;
+    invertCharField (&chrs);
+    return binchrrCF (b0->data, pos, &chrs);
 }
 
 /*  int bsetstr (bstring b0, int pos, bstring b1, unsigned char fill)
@@ -1397,47 +1397,47 @@ int d, newlen;
 ptrdiff_t pd;
 bstring aux = (bstring) b1;
 
-	if (pos < 0 || b0 == NULL || b0->slen < 0 || NULL == b0->data || 
-	    b0->mlen < b0->slen || b0->mlen <= 0) return BSTR_ERR;
-	if (b1 != NULL && (b1->slen < 0 || b1->data == NULL)) return BSTR_ERR;
+    if (pos < 0 || b0 == NULL || b0->slen < 0 || NULL == b0->data || 
+        b0->mlen < b0->slen || b0->mlen <= 0) return BSTR_ERR;
+    if (b1 != NULL && (b1->slen < 0 || b1->data == NULL)) return BSTR_ERR;
 
-	d = pos;
+    d = pos;
 
-	/* Aliasing case */
-	if (NULL != aux) {
-		if ((pd = (ptrdiff_t) (b1->data - b0->data)) >= 0 && pd < (ptrdiff_t) b0->mlen) {
-			if (NULL == (aux = bstrcpy (b1))) return BSTR_ERR;
-		}
-		d += aux->slen;
-	}
+    /* Aliasing case */
+    if (NULL != aux) {
+        if ((pd = (ptrdiff_t) (b1->data - b0->data)) >= 0 && pd < (ptrdiff_t) b0->mlen) {
+            if (NULL == (aux = bstrcpy (b1))) return BSTR_ERR;
+        }
+        d += aux->slen;
+    }
 
-	/* Increase memory size if necessary */
-	if (balloc (b0, d + 1) != BSTR_OK) {
-		if (aux != b1) bdestroy (aux);
-		return BSTR_ERR;
-	}
+    /* Increase memory size if necessary */
+    if (balloc (b0, d + 1) != BSTR_OK) {
+        if (aux != b1) bdestroy (aux);
+        return BSTR_ERR;
+    }
 
-	newlen = b0->slen;
+    newlen = b0->slen;
 
-	/* Fill in "fill" character as necessary */
-	if (pos > newlen) {
-		bstr__memset (b0->data + b0->slen, (int) fill, (size_t) (pos - b0->slen));
-		newlen = pos;
-	}
+    /* Fill in "fill" character as necessary */
+    if (pos > newlen) {
+        bstr__memset (b0->data + b0->slen, (int) fill, (size_t) (pos - b0->slen));
+        newlen = pos;
+    }
 
-	/* Copy b1 to position pos in b0. */
-	if (aux != NULL) {
-		bBlockCopy ((char *) (b0->data + pos), (char *) aux->data, aux->slen);
-		if (aux != b1) bdestroy (aux);
-	}
+    /* Copy b1 to position pos in b0. */
+    if (aux != NULL) {
+        bBlockCopy ((char *) (b0->data + pos), (char *) aux->data, aux->slen);
+        if (aux != b1) bdestroy (aux);
+    }
 
-	/* Indicate the potentially increased size of b0 */
-	if (d > newlen) newlen = d;
+    /* Indicate the potentially increased size of b0 */
+    if (d > newlen) newlen = d;
 
-	b0->slen = newlen;
-	b0->data[newlen] = (unsigned char) '\0';
+    b0->slen = newlen;
+    b0->data[newlen] = (unsigned char) '\0';
 
-	return BSTR_OK;
+    return BSTR_OK;
 }
 
 /*  int binsert (bstring b1, int pos, bstring b2, unsigned char fill)
@@ -1452,40 +1452,40 @@ int d, l;
 ptrdiff_t pd;
 bstring aux = (bstring) b2;
 
-	if (pos < 0 || b1 == NULL || b2 == NULL || b1->slen < 0 || 
-	    b2->slen < 0 || b1->mlen < b1->slen || b1->mlen <= 0) return BSTR_ERR;
-
-	/* Aliasing case */
-	if ((pd = (ptrdiff_t) (b2->data - b1->data)) >= 0 && pd < (ptrdiff_t) b1->mlen) {
-		if (NULL == (aux = bstrcpy (b2))) return BSTR_ERR;
-	}
-
-	/* Compute the two possible end pointers */
-	d = b1->slen + aux->slen;
-	l = pos + aux->slen;
-	if ((d|l) < 0) return BSTR_ERR;
-
-	if (l > d) {
-		/* Inserting past the end of the string */
-		if (balloc (b1, l + 1) != BSTR_OK) {
-			if (aux != b2) bdestroy (aux);
-			return BSTR_ERR;
-		}
-		bstr__memset (b1->data + b1->slen, (int) fill, (size_t) (pos - b1->slen));
-		b1->slen = l;
-	} else {
-		/* Inserting in the middle of the string */
-		if (balloc (b1, d + 1) != BSTR_OK) {
-			if (aux != b2) bdestroy (aux);
-			return BSTR_ERR;
-		}
-		bBlockCopy (b1->data + l, b1->data + pos, d - l);
-		b1->slen = d;
-	}
-	bBlockCopy (b1->data + pos, aux->data, aux->slen);
-	b1->data[b1->slen] = (unsigned char) '\0';
-	if (aux != b2) bdestroy (aux);
-	return BSTR_OK;
+    if (pos < 0 || b1 == NULL || b2 == NULL || b1->slen < 0 || 
+        b2->slen < 0 || b1->mlen < b1->slen || b1->mlen <= 0) return BSTR_ERR;
+
+    /* Aliasing case */
+    if ((pd = (ptrdiff_t) (b2->data - b1->data)) >= 0 && pd < (ptrdiff_t) b1->mlen) {
+        if (NULL == (aux = bstrcpy (b2))) return BSTR_ERR;
+    }
+
+    /* Compute the two possible end pointers */
+    d = b1->slen + aux->slen;
+    l = pos + aux->slen;
+    if ((d|l) < 0) return BSTR_ERR;
+
+    if (l > d) {
+        /* Inserting past the end of the string */
+        if (balloc (b1, l + 1) != BSTR_OK) {
+            if (aux != b2) bdestroy (aux);
+            return BSTR_ERR;
+        }
+        bstr__memset (b1->data + b1->slen, (int) fill, (size_t) (pos - b1->slen));
+        b1->slen = l;
+    } else {
+        /* Inserting in the middle of the string */
+        if (balloc (b1, d + 1) != BSTR_OK) {
+            if (aux != b2) bdestroy (aux);
+            return BSTR_ERR;
+        }
+        bBlockCopy (b1->data + l, b1->data + pos, d - l);
+        b1->slen = d;
+    }
+    bBlockCopy (b1->data + pos, aux->data, aux->slen);
+    b1->data[b1->slen] = (unsigned char) '\0';
+    if (aux != b2) bdestroy (aux);
+    return BSTR_OK;
 }
 
 /*  int breplace (bstring b1, int pos, int len, bstring b2, 
@@ -1495,44 +1495,44 @@ bstring aux = (bstring) b2;
  *  fill is used is pos > b1->slen.
  */
 int breplace (bstring b1, int pos, int len, const_bstring b2, 
-			  unsigned char fill) {
+              unsigned char fill) {
 int pl, ret;
 ptrdiff_t pd;
 bstring aux = (bstring) b2;
 
-	if (pos < 0 || len < 0 || (pl = pos + len) < 0 || b1 == NULL || 
-	    b2 == NULL || b1->data == NULL || b2->data == NULL || 
-	    b1->slen < 0 || b2->slen < 0 || b1->mlen < b1->slen ||
-	    b1->mlen <= 0) return BSTR_ERR;
-
-	/* Straddles the end? */
-	if (pl >= b1->slen) {
-		if ((ret = bsetstr (b1, pos, b2, fill)) < 0) return ret;
-		if (pos + b2->slen < b1->slen) {
-			b1->slen = pos + b2->slen;
-			b1->data[b1->slen] = (unsigned char) '\0';
-		}
-		return ret;
-	}
-
-	/* Aliasing case */
-	if ((pd = (ptrdiff_t) (b2->data - b1->data)) >= 0 && pd < (ptrdiff_t) b1->slen) {
-		if (NULL == (aux = bstrcpy (b2))) return BSTR_ERR;
-	}
-
-	if (aux->slen > len) {
-		if (balloc (b1, b1->slen + aux->slen - len) != BSTR_OK) {
-			if (aux != b2) bdestroy (aux);
-			return BSTR_ERR;
-		}
-	}
-
-	if (aux->slen != len) bstr__memmove (b1->data + pos + aux->slen, b1->data + pos + len, b1->slen - (pos + len));
-	bstr__memcpy (b1->data + pos, aux->data, aux->slen);
-	b1->slen += aux->slen - len;
-	b1->data[b1->slen] = (unsigned char) '\0';
-	if (aux != b2) bdestroy (aux);
-	return BSTR_OK;
+    if (pos < 0 || len < 0 || (pl = pos + len) < 0 || b1 == NULL || 
+        b2 == NULL || b1->data == NULL || b2->data == NULL || 
+        b1->slen < 0 || b2->slen < 0 || b1->mlen < b1->slen ||
+        b1->mlen <= 0) return BSTR_ERR;
+
+    /* Straddles the end? */
+    if (pl >= b1->slen) {
+        if ((ret = bsetstr (b1, pos, b2, fill)) < 0) return ret;
+        if (pos + b2->slen < b1->slen) {
+            b1->slen = pos + b2->slen;
+            b1->data[b1->slen] = (unsigned char) '\0';
+        }
+        return ret;
+    }
+
+    /* Aliasing case */
+    if ((pd = (ptrdiff_t) (b2->data - b1->data)) >= 0 && pd < (ptrdiff_t) b1->slen) {
+        if (NULL == (aux = bstrcpy (b2))) return BSTR_ERR;
+    }
+
+    if (aux->slen > len) {
+        if (balloc (b1, b1->slen + aux->slen - len) != BSTR_OK) {
+            if (aux != b2) bdestroy (aux);
+            return BSTR_ERR;
+        }
+    }
+
+    if (aux->slen != len) bstr__memmove (b1->data + pos + aux->slen, b1->data + pos + len, b1->slen - (pos + len));
+    bstr__memcpy (b1->data + pos, aux->data, aux->slen);
+    b1->slen += aux->slen - len;
+    b1->data[b1->slen] = (unsigned char) '\0';
+    if (aux != b2) bdestroy (aux);
+    return BSTR_OK;
 }
 
 /*  int bfindreplace (bstring b, const_bstring find, const_bstring repl, 
@@ -1552,123 +1552,123 @@ ptrdiff_t pd;
 bstring auxf = (bstring) find;
 bstring auxr = (bstring) repl;
 
-	if (b == NULL || b->data == NULL || find == NULL ||
-	    find->data == NULL || repl == NULL || repl->data == NULL || 
-	    pos < 0 || find->slen <= 0 || b->mlen < 0 || b->slen > b->mlen || 
-	    b->mlen <= 0 || b->slen < 0 || repl->slen < 0) return BSTR_ERR;
-	if (pos > b->slen - find->slen) return BSTR_OK;
-
-	/* Alias with find string */
-	pd = (ptrdiff_t) (find->data - b->data);
-	if ((ptrdiff_t) (pos - find->slen) < pd && pd < (ptrdiff_t) b->slen) {
-		if (NULL == (auxf = bstrcpy (find))) return BSTR_ERR;
-	}
-
-	/* Alias with repl string */
-	pd = (ptrdiff_t) (repl->data - b->data);
-	if ((ptrdiff_t) (pos - repl->slen) < pd && pd < (ptrdiff_t) b->slen) {
-		if (NULL == (auxr = bstrcpy (repl))) {
-			if (auxf != find) bdestroy (auxf);
-			return BSTR_ERR;
-		}
-	}
-
-	delta = auxf->slen - auxr->slen;
-
-	/* in-place replacement since find and replace strings are of equal 
-	   length */
-	if (delta == 0) {
-		while ((pos = instr (b, pos, auxf)) >= 0) {
-			bstr__memcpy (b->data + pos, auxr->data, auxr->slen);
-			pos += auxf->slen;
-		}
-		if (auxf != find) bdestroy (auxf);
-		if (auxr != repl) bdestroy (auxr);
-		return BSTR_OK;
-	}
-
-	/* shrinking replacement since auxf->slen > auxr->slen */
-	if (delta > 0) {
-		acc = 0;
-
-		while ((i = instr (b, pos, auxf)) >= 0) {
-			if (acc && i > pos)
-				bstr__memmove (b->data + pos - acc, b->data + pos, i - pos);
-			if (auxr->slen)
-				bstr__memcpy (b->data + i - acc, auxr->data, auxr->slen);
-			acc += delta;
-			pos = i + auxf->slen;
-		}
-
-		if (acc) {
-			i = b->slen;
-			if (i > pos)
-				bstr__memmove (b->data + pos - acc, b->data + pos, i - pos);
-			b->slen -= acc;
-			b->data[b->slen] = (unsigned char) '\0';
-		}
-
-		if (auxf != find) bdestroy (auxf);
-		if (auxr != repl) bdestroy (auxr);
-		return BSTR_OK;
-	}
-
-	/* expanding replacement since find->slen < repl->slen.  Its a lot 
-	   more complicated. */
-
-	mlen = 32;
-	d = (int *) static_d; /* Avoid malloc for trivial cases */
-	acc = slen = 0;
-
-	while ((pos = instr (b, pos, auxf)) >= 0) {
-		if (slen + 1 >= mlen) {
-			int sl;
-			int * t;
-			mlen += mlen;
-			sl = sizeof (int *) * mlen;
-			if (static_d == d) d = NULL;
-			if (sl < mlen || NULL == (t = (int *) bstr__realloc (d, sl))) {
-				ret = BSTR_ERR;
-				goto done;
-			}
-			if (NULL == d) bstr__memcpy (t, static_d, sizeof (static_d));
-			d = t;
-		}
-		d[slen] = pos;
-		slen++;
-		acc -= delta;
-		pos += auxf->slen;
-		if (pos < 0 || acc < 0) {
-			ret = BSTR_ERR;
-			goto done;
-		}
-	}
-	d[slen] = b->slen;
-
-	if (BSTR_OK == (ret = balloc (b, b->slen + acc + 1))) {
-		b->slen += acc;
-		for (i = slen-1; i >= 0; i--) {
-			int s, l;
-			s = d[i] + auxf->slen;
-			l = d[i+1] - s;
-			if (l) {
-				bstr__memmove (b->data + s + acc, b->data + s, l);
-			}
-			if (auxr->slen) {
-				bstr__memmove (b->data + s + acc - auxr->slen, 
-				         auxr->data, auxr->slen);
-			}
-			acc += delta;		
-		}
-		b->data[b->slen] = (unsigned char) '\0';
-	}
-
-	done:;
-	if (static_d == d) d = NULL;
-	bstr__free (d);
-	if (auxf != find) bdestroy (auxf);
-	if (auxr != repl) bdestroy (auxr);
-	return ret;
+    if (b == NULL || b->data == NULL || find == NULL ||
+        find->data == NULL || repl == NULL || repl->data == NULL || 
+        pos < 0 || find->slen <= 0 || b->mlen < 0 || b->slen > b->mlen || 
+        b->mlen <= 0 || b->slen < 0 || repl->slen < 0) return BSTR_ERR;
+    if (pos > b->slen - find->slen) return BSTR_OK;
+
+    /* Alias with find string */
+    pd = (ptrdiff_t) (find->data - b->data);
+    if ((ptrdiff_t) (pos - find->slen) < pd && pd < (ptrdiff_t) b->slen) {
+        if (NULL == (auxf = bstrcpy (find))) return BSTR_ERR;
+    }
+
+    /* Alias with repl string */
+    pd = (ptrdiff_t) (repl->data - b->data);
+    if ((ptrdiff_t) (pos - repl->slen) < pd && pd < (ptrdiff_t) b->slen) {
+        if (NULL == (auxr = bstrcpy (repl))) {
+            if (auxf != find) bdestroy (auxf);
+            return BSTR_ERR;
+        }
+    }
+
+    delta = auxf->slen - auxr->slen;
+
+    /* in-place replacement since find and replace strings are of equal 
+       length */
+    if (delta == 0) {
+        while ((pos = instr (b, pos, auxf)) >= 0) {
+            bstr__memcpy (b->data + pos, auxr->data, auxr->slen);
+            pos += auxf->slen;
+        }
+        if (auxf != find) bdestroy (auxf);
+        if (auxr != repl) bdestroy (auxr);
+        return BSTR_OK;
+    }
+
+    /* shrinking replacement since auxf->slen > auxr->slen */
+    if (delta > 0) {
+        acc = 0;
+
+        while ((i = instr (b, pos, auxf)) >= 0) {
+            if (acc && i > pos)
+                bstr__memmove (b->data + pos - acc, b->data + pos, i - pos);
+            if (auxr->slen)
+                bstr__memcpy (b->data + i - acc, auxr->data, auxr->slen);
+            acc += delta;
+            pos = i + auxf->slen;
+        }
+
+        if (acc) {
+            i = b->slen;
+            if (i > pos)
+                bstr__memmove (b->data + pos - acc, b->data + pos, i - pos);
+            b->slen -= acc;
+            b->data[b->slen] = (unsigned char) '\0';
+        }
+
+        if (auxf != find) bdestroy (auxf);
+        if (auxr != repl) bdestroy (auxr);
+        return BSTR_OK;
+    }
+
+    /* expanding replacement since find->slen < repl->slen.  It's a lot 
+       more complicated. */
+
+    mlen = 32;
+    d = (int *) static_d; /* Avoid malloc for trivial cases */
+    acc = slen = 0;
+
+    while ((pos = instr (b, pos, auxf)) >= 0) {
+        if (slen + 1 >= mlen) {
+            int sl;
+            int * t;
+            mlen += mlen;
+            sl = sizeof (int *) * mlen;
+            if (static_d == d) d = NULL;
+            if (sl < mlen || NULL == (t = (int *) bstr__realloc (d, sl))) {
+                ret = BSTR_ERR;
+                goto done;
+            }
+            if (NULL == d) bstr__memcpy (t, static_d, sizeof (static_d));
+            d = t;
+        }
+        d[slen] = pos;
+        slen++;
+        acc -= delta;
+        pos += auxf->slen;
+        if (pos < 0 || acc < 0) {
+            ret = BSTR_ERR;
+            goto done;
+        }
+    }
+    d[slen] = b->slen;
+
+    if (BSTR_OK == (ret = balloc (b, b->slen + acc + 1))) {
+        b->slen += acc;
+        for (i = slen-1; i >= 0; i--) {
+            int s, l;
+            s = d[i] + auxf->slen;
+            l = d[i+1] - s;
+            if (l) {
+                bstr__memmove (b->data + s + acc, b->data + s, l);
+            }
+            if (auxr->slen) {
+                bstr__memmove (b->data + s + acc - auxr->slen, 
+                         auxr->data, auxr->slen);
+            }
+            acc += delta;        
+        }
+        b->data[b->slen] = (unsigned char) '\0';
+    }
+
+    done:;
+    if (static_d == d) d = NULL;
+    bstr__free (d);
+    if (auxf != find) bdestroy (auxf);
+    if (auxr != repl) bdestroy (auxr);
+    return ret;
 }
 
 /*  int bfindreplace (bstring b, const_bstring find, const_bstring repl, 
@@ -1678,7 +1678,7 @@ bstring auxr = (bstring) repl;
  *  given point in a bstring.
  */
 int bfindreplace (bstring b, const_bstring find, const_bstring repl, int pos) {
-	return findreplaceengine (b, find, repl, pos, binstr);
+    return findreplaceengine (b, find, repl, pos, binstr);
 }
 
 /*  int bfindreplacecaseless (bstring b, const_bstring find, const_bstring repl, 
@@ -1688,7 +1688,7 @@ int bfindreplace (bstring b, const_bstring find, const_bstring repl, int pos) {
  *  string after a given point in a bstring.
  */
 int bfindreplacecaseless (bstring b, const_bstring find, const_bstring repl, int pos) {
-	return findreplaceengine (b, find, repl, pos, binstrcaseless);
+    return findreplaceengine (b, find, repl, pos, binstrcaseless);
 }
 
 /*  int binsertch (bstring b, int pos, int len, unsigned char fill)
@@ -1701,31 +1701,31 @@ int bfindreplacecaseless (bstring b, const_bstring find, const_bstring repl, int
 int binsertch (bstring b, int pos, int len, unsigned char fill) {
 int d, l, i;
 
-	if (pos < 0 || b == NULL || b->slen < 0 || b->mlen < b->slen ||
-	    b->mlen <= 0 || len < 0) return BSTR_ERR;
-
-	/* Compute the two possible end pointers */
-	d = b->slen + len;
-	l = pos + len;
-	if ((d|l) < 0) return BSTR_ERR;
-
-	if (l > d) {
-		/* Inserting past the end of the string */
-		if (balloc (b, l + 1) != BSTR_OK) return BSTR_ERR;
-		pos = b->slen;
-		b->slen = l;
-	} else {
-		/* Inserting in the middle of the string */
-		if (balloc (b, d + 1) != BSTR_OK) return BSTR_ERR;
-		for (i = d - 1; i >= l; i--) {
-			b->data[i] = b->data[i - len];
-		}
-		b->slen = d;
-	}
-
-	for (i=pos; i < l; i++) b->data[i] = fill;
-	b->data[b->slen] = (unsigned char) '\0';
-	return BSTR_OK;
+    if (pos < 0 || b == NULL || b->slen < 0 || b->mlen < b->slen ||
+        b->mlen <= 0 || len < 0) return BSTR_ERR;
+
+    /* Compute the two possible end pointers */
+    d = b->slen + len;
+    l = pos + len;
+    if ((d|l) < 0) return BSTR_ERR;
+
+    if (l > d) {
+        /* Inserting past the end of the string */
+        if (balloc (b, l + 1) != BSTR_OK) return BSTR_ERR;
+        pos = b->slen;
+        b->slen = l;
+    } else {
+        /* Inserting in the middle of the string */
+        if (balloc (b, d + 1) != BSTR_OK) return BSTR_ERR;
+        for (i = d - 1; i >= l; i--) {
+            b->data[i] = b->data[i - len];
+        }
+        b->slen = d;
+    }
+
+    for (i=pos; i < l; i++) b->data[i] = fill;
+    b->data[b->slen] = (unsigned char) '\0';
+    return BSTR_OK;
 }
 
 /*  int bpattern (bstring b, int len)
@@ -1738,15 +1738,15 @@ int d, l, i;
 int bpattern (bstring b, int len) {
 int i, d;
 
-	d = blength (b);
-	if (d <= 0 || len < 0 || balloc (b, len + 1) != BSTR_OK) return BSTR_ERR;
-	if (len > 0) {
-		if (d == 1) return bsetstr (b, len, NULL, b->data[0]);
-		for (i = d; i < len; i++) b->data[i] = b->data[i - d];
-	}
-	b->data[len] = (unsigned char) '\0';
-	b->slen = len;
-	return BSTR_OK;
+    d = blength (b);
+    if (d <= 0 || len < 0 || balloc (b, len + 1) != BSTR_OK) return BSTR_ERR;
+    if (len > 0) {
+        if (d == 1) return bsetstr (b, len, NULL, b->data[0]);
+        for (i = d; i < len; i++) b->data[i] = b->data[i - d];
+    }
+    b->data[len] = (unsigned char) '\0';
+    b->slen = len;
+    return BSTR_OK;
 }
 
 #define BS_BUFF_SZ (1024)
@@ -1760,20 +1760,20 @@ int i, d;
 int breada (bstring b, bNread readPtr, void * parm) {
 int i, l, n;
 
-	if (b == NULL || b->mlen <= 0 || b->slen < 0 || b->mlen < b->slen ||
-	    b->mlen <= 0 || readPtr == NULL) return BSTR_ERR;
+    if (b == NULL || b->mlen <= 0 || b->slen < 0 || b->mlen < b->slen ||
+        b->mlen <= 0 || readPtr == NULL) return BSTR_ERR;
 
-	i = b->slen;
-	for (n=i+16; ; n += ((n < BS_BUFF_SZ) ? n : BS_BUFF_SZ)) {
-		if (BSTR_OK != balloc (b, n + 1)) return BSTR_ERR;
-		l = (int) readPtr ((void *) (b->data + i), 1, n - i, parm);
-		i += l;
-		b->slen = i;
-		if (i < n) break;
-	}
+    i = b->slen;
+    for (n=i+16; ; n += ((n < BS_BUFF_SZ) ? n : BS_BUFF_SZ)) {
+        if (BSTR_OK != balloc (b, n + 1)) return BSTR_ERR;
+        l = (int) readPtr ((void *) (b->data + i), 1, n - i, parm);
+        i += l;
+        b->slen = i;
+        if (i < n) break;
+    }
 
-	b->data[i] = (unsigned char) '\0';
-	return BSTR_OK;
+    b->data[i] = (unsigned char) '\0';
+    return BSTR_OK;
 }
 
 /*  bstring bread (bNread readPtr, void * parm)
@@ -1785,11 +1785,11 @@ int i, l, n;
 bstring bread (bNread readPtr, void * parm) {
 bstring buff;
 
-	if (0 > breada (buff = bfromcstr (""), readPtr, parm)) {
-		bdestroy (buff);
-		return NULL;
-	}
-	return buff;
+    if (0 > breada (buff = bfromcstr (""), readPtr, parm)) {
+        bdestroy (buff);
+        return NULL;
+    }
+    return buff;
 }
 
 /*  int bassigngets (bstring b, bNgetc getcPtr, void * parm, char terminator)
@@ -1808,26 +1808,26 @@ bstring buff;
 int bassigngets (bstring b, bNgetc getcPtr, void * parm, char terminator) {
 int c, d, e;
 
-	if (b == NULL || b->mlen <= 0 || b->slen < 0 || b->mlen < b->slen ||
-	    b->mlen <= 0 || getcPtr == NULL) return BSTR_ERR;
-	d = 0;
-	e = b->mlen - 2;
+    if (b == NULL || b->mlen <= 0 || b->slen < 0 || b->mlen < b->slen ||
+        b->mlen <= 0 || getcPtr == NULL) return BSTR_ERR;
+    d = 0;
+    e = b->mlen - 2;
 
-	while ((c = getcPtr (parm)) >= 0) {
-		if (d > e) {
-			b->slen = d;
-			if (balloc (b, d + 2) != BSTR_OK) return BSTR_ERR;
-			e = b->mlen - 2;
-		}
-		b->data[d] = (unsigned char) c;
-		d++;
-		if (c == terminator) break;
-	}
+    while ((c = getcPtr (parm)) >= 0) {
+        if (d > e) {
+            b->slen = d;
+            if (balloc (b, d + 2) != BSTR_OK) return BSTR_ERR;
+            e = b->mlen - 2;
+        }
+        b->data[d] = (unsigned char) c;
+        d++;
+        if (c == terminator) break;
+    }
 
-	b->data[d] = (unsigned char) '\0';
-	b->slen = d;
+    b->data[d] = (unsigned char) '\0';
+    b->slen = d;
 
-	return d == 0 && c < 0;
+    return d == 0 && c < 0;
 }
 
 /*  int bgetsa (bstring b, bNgetc getcPtr, void * parm, char terminator)
@@ -1846,26 +1846,26 @@ int c, d, e;
 int bgetsa (bstring b, bNgetc getcPtr, void * parm, char terminator) {
 int c, d, e;
 
-	if (b == NULL || b->mlen <= 0 || b->slen < 0 || b->mlen < b->slen ||
-	    b->mlen <= 0 || getcPtr == NULL) return BSTR_ERR;
-	d = b->slen;
-	e = b->mlen - 2;
+    if (b == NULL || b->mlen <= 0 || b->slen < 0 || b->mlen < b->slen ||
+        b->mlen <= 0 || getcPtr == NULL) return BSTR_ERR;
+    d = b->slen;
+    e = b->mlen - 2;
 
-	while ((c = getcPtr (parm)) >= 0) {
-		if (d > e) {
-			b->slen = d;
-			if (balloc (b, d + 2) != BSTR_OK) return BSTR_ERR;
-			e = b->mlen - 2;
-		}
-		b->data[d] = (unsigned char) c;
-		d++;
-		if (c == terminator) break;
-	}
+    while ((c = getcPtr (parm)) >= 0) {
+        if (d > e) {
+            b->slen = d;
+            if (balloc (b, d + 2) != BSTR_OK) return BSTR_ERR;
+            e = b->mlen - 2;
+        }
+        b->data[d] = (unsigned char) c;
+        d++;
+        if (c == terminator) break;
+    }
 
-	b->data[d] = (unsigned char) '\0';
-	b->slen = d;
+    b->data[d] = (unsigned char) '\0';
+    b->slen = d;
 
-	return d == 0 && c < 0;
+    return d == 0 && c < 0;
 }
 
 /*  bstring bgets (bNgetc getcPtr, void * parm, char terminator)
@@ -1882,19 +1882,19 @@ int c, d, e;
 bstring bgets (bNgetc getcPtr, void * parm, char terminator) {
 bstring buff;
 
-	if (0 > bgetsa (buff = bfromcstr (""), getcPtr, parm, terminator) || 0 >= buff->slen) {
-		bdestroy (buff);
-		buff = NULL;
-	}
-	return buff;
+    if (0 > bgetsa (buff = bfromcstr (""), getcPtr, parm, terminator) || 0 >= buff->slen) {
+        bdestroy (buff);
+        buff = NULL;
+    }
+    return buff;
 }
 
 struct bStream {
-	bstring buff;		/* Buffer for over-reads */
-	void * parm;		/* The stream handle for core stream */
-	bNread readFnPtr;	/* fread compatible fnptr for core stream */
-	int isEOF;		/* track file's EOF state */
-	int maxBuffSz;
+    bstring buff;        /* Buffer for over-reads */
+    void * parm;        /* The stream handle for core stream */
+    bNread readFnPtr;    /* fread compatible fnptr for core stream */
+    int isEOF;        /* track file's EOF state */
+    int maxBuffSz;
 };
 
 /*  struct bStream * bsopen (bNread readPtr, void * parm)
@@ -1906,15 +1906,15 @@ struct bStream {
 struct bStream * bsopen (bNread readPtr, void * parm) {
 struct bStream * s;
 
-	if (readPtr == NULL) return NULL;
-	s = (struct bStream *) bstr__alloc (sizeof (struct bStream));
-	if (s == NULL) return NULL;
-	s->parm = parm;
-	s->buff = bfromcstr ("");
-	s->readFnPtr = readPtr;
-	s->maxBuffSz = BS_BUFF_SZ;
-	s->isEOF = 0;
-	return s;
+    if (readPtr == NULL) return NULL;
+    s = (struct bStream *) bstr__alloc (sizeof (struct bStream));
+    if (s == NULL) return NULL;
+    s->parm = parm;
+    s->buff = bfromcstr ("");
+    s->readFnPtr = readPtr;
+    s->maxBuffSz = BS_BUFF_SZ;
+    s->isEOF = 0;
+    return s;
 }
 
 /*  int bsbufflength (struct bStream * s, int sz)
@@ -1924,15 +1924,15 @@ struct bStream * s;
  */
 int bsbufflength (struct bStream * s, int sz) {
 int oldSz;
-	if (s == NULL || sz < 0) return BSTR_ERR;
-	oldSz = s->maxBuffSz;
-	if (sz > 0) s->maxBuffSz = sz;
-	return oldSz;
+    if (s == NULL || sz < 0) return BSTR_ERR;
+    oldSz = s->maxBuffSz;
+    if (sz > 0) s->maxBuffSz = sz;
+    return oldSz;
 }
 
 int bseof (const struct bStream * s) {
-	if (s == NULL || s->readFnPtr == NULL) return BSTR_ERR;
-	return s->isEOF && (s->buff->slen == 0);
+    if (s == NULL || s->readFnPtr == NULL) return BSTR_ERR;
+    return s->isEOF && (s->buff->slen == 0);
 }
 
 /*  void * bsclose (struct bStream * s)
@@ -1942,15 +1942,15 @@ int bseof (const struct bStream * s) {
  */
 void * bsclose (struct bStream * s) {
 void * parm;
-	if (s == NULL) return NULL;
-	s->readFnPtr = NULL;
-	if (s->buff) bdestroy (s->buff);
-	s->buff = NULL;
-	parm = s->parm;
-	s->parm = NULL;
-	s->isEOF = 1;
-	bstr__free (s);
-	return parm;
+    if (s == NULL) return NULL;
+    s->readFnPtr = NULL;
+    if (s->buff) bdestroy (s->buff);
+    s->buff = NULL;
+    parm = s->parm;
+    s->parm = NULL;
+    s->isEOF = 1;
+    bstr__free (s);
+    return parm;
 }
 
 /*  int bsreadlna (bstring r, struct bStream * s, char terminator)
@@ -1965,56 +1965,56 @@ int i, l, ret, rlo;
 char * b;
 struct tagbstring x;
 
-	if (s == NULL || s->buff == NULL || r == NULL || r->mlen <= 0 ||
-	    r->slen < 0 || r->mlen < r->slen) return BSTR_ERR;
-	l = s->buff->slen;
-	if (BSTR_OK != balloc (s->buff, s->maxBuffSz + 1)) return BSTR_ERR;
-	b = (char *) s->buff->data;
-	x.data = (unsigned char *) b;
-
-	/* First check if the current buffer holds the terminator */
-	b[l] = terminator; /* Set sentinel */
-	for (i=0; b[i] != terminator; i++) ;
-	if (i < l) {
-		x.slen = i + 1;
-		ret = bconcat (r, &x);
-		s->buff->slen = l;
-		if (BSTR_OK == ret) bdelete (s->buff, 0, i + 1);
-		return BSTR_OK;
-	}
-
-	rlo = r->slen;
-
-	/* If not then just concatenate the entire buffer to the output */
-	x.slen = l;
-	if (BSTR_OK != bconcat (r, &x)) return BSTR_ERR;
-
-	/* Perform direct in-place reads into the destination to allow for
-	   the minimum of data-copies */
-	for (;;) {
-		if (BSTR_OK != balloc (r, r->slen + s->maxBuffSz + 1)) return BSTR_ERR;
-		b = (char *) (r->data + r->slen);
-		l = (int) s->readFnPtr (b, 1, s->maxBuffSz, s->parm);
-		if (l <= 0) {
-			r->data[r->slen] = (unsigned char) '\0';
-			s->buff->slen = 0;
-			s->isEOF = 1;
-			/* If nothing was read return with an error message */
-			return BSTR_ERR & -(r->slen == rlo);
-		}
-		b[l] = terminator; /* Set sentinel */
-		for (i=0; b[i] != terminator; i++) ;
-		if (i < l) break;
-		r->slen += l;
-	}
-
-	/* Terminator found, push over-read back to buffer */
-	i++;
-	r->slen += i;
-	s->buff->slen = l - i;
-	bstr__memcpy (s->buff->data, b + i, l - i);
-	r->data[r->slen] = (unsigned char) '\0';
-	return BSTR_OK;
+    if (s == NULL || s->buff == NULL || r == NULL || r->mlen <= 0 ||
+        r->slen < 0 || r->mlen < r->slen) return BSTR_ERR;
+    l = s->buff->slen;
+    if (BSTR_OK != balloc (s->buff, s->maxBuffSz + 1)) return BSTR_ERR;
+    b = (char *) s->buff->data;
+    x.data = (unsigned char *) b;
+
+    /* First check if the current buffer holds the terminator */
+    b[l] = terminator; /* Set sentinel */
+    for (i=0; b[i] != terminator; i++) ;
+    if (i < l) {
+        x.slen = i + 1;
+        ret = bconcat (r, &x);
+        s->buff->slen = l;
+        if (BSTR_OK == ret) bdelete (s->buff, 0, i + 1);
+        return BSTR_OK;
+    }
+
+    rlo = r->slen;
+
+    /* If not then just concatenate the entire buffer to the output */
+    x.slen = l;
+    if (BSTR_OK != bconcat (r, &x)) return BSTR_ERR;
+
+    /* Perform direct in-place reads into the destination to allow for
+       the minimum of data-copies */
+    for (;;) {
+        if (BSTR_OK != balloc (r, r->slen + s->maxBuffSz + 1)) return BSTR_ERR;
+        b = (char *) (r->data + r->slen);
+        l = (int) s->readFnPtr (b, 1, s->maxBuffSz, s->parm);
+        if (l <= 0) {
+            r->data[r->slen] = (unsigned char) '\0';
+            s->buff->slen = 0;
+            s->isEOF = 1;
+            /* If nothing was read return with an error message */
+            return BSTR_ERR & -(r->slen == rlo);
+        }
+        b[l] = terminator; /* Set sentinel */
+        for (i=0; b[i] != terminator; i++) ;
+        if (i < l) break;
+        r->slen += l;
+    }
+
+    /* Terminator found, push over-read back to buffer */
+    i++;
+    r->slen += i;
+    s->buff->slen = l - i;
+    bstr__memcpy (s->buff->data, b + i, l - i);
+    r->data[r->slen] = (unsigned char) '\0';
+    return BSTR_OK;
 }
 
 /*  int bsreadlnsa (bstring r, struct bStream * s, bstring term)
@@ -2030,61 +2030,61 @@ unsigned char * b;
 struct tagbstring x;
 struct charField cf;
 
-	if (s == NULL || s->buff == NULL || r == NULL || term == NULL ||
-	    term->data == NULL || r->mlen <= 0 || r->slen < 0 ||
-	    r->mlen < r->slen) return BSTR_ERR;
-	if (term->slen == 1) return bsreadlna (r, s, term->data[0]);
-	if (term->slen < 1 || buildCharField (&cf, term)) return BSTR_ERR;
-
-	l = s->buff->slen;
-	if (BSTR_OK != balloc (s->buff, s->maxBuffSz + 1)) return BSTR_ERR;
-	b = (unsigned char *) s->buff->data;
-	x.data = b;
-
-	/* First check if the current buffer holds the terminator */
-	b[l] = term->data[0]; /* Set sentinel */
-	for (i=0; !testInCharField (&cf, b[i]); i++) ;
-	if (i < l) {
-		x.slen = i + 1;
-		ret = bconcat (r, &x);
-		s->buff->slen = l;
-		if (BSTR_OK == ret) bdelete (s->buff, 0, i + 1);
-		return BSTR_OK;
-	}
-
-	rlo = r->slen;
-
-	/* If not then just concatenate the entire buffer to the output */
-	x.slen = l;
-	if (BSTR_OK != bconcat (r, &x)) return BSTR_ERR;
-
-	/* Perform direct in-place reads into the destination to allow for
-	   the minimum of data-copies */
-	for (;;) {
-		if (BSTR_OK != balloc (r, r->slen + s->maxBuffSz + 1)) return BSTR_ERR;
-		b = (unsigned char *) (r->data + r->slen);
-		l = (int) s->readFnPtr (b, 1, s->maxBuffSz, s->parm);
-		if (l <= 0) {
-			r->data[r->slen] = (unsigned char) '\0';
-			s->buff->slen = 0;
-			s->isEOF = 1;
-			/* If nothing was read return with an error message */
-			return BSTR_ERR & -(r->slen == rlo);
-		}
-
-		b[l] = term->data[0]; /* Set sentinel */
-		for (i=0; !testInCharField (&cf, b[i]); i++) ;
-		if (i < l) break;
-		r->slen += l;
-	}
-
-	/* Terminator found, push over-read back to buffer */
-	i++;
-	r->slen += i;
-	s->buff->slen = l - i;
-	bstr__memcpy (s->buff->data, b + i, l - i);
-	r->data[r->slen] = (unsigned char) '\0';
-	return BSTR_OK;
+    if (s == NULL || s->buff == NULL || r == NULL || term == NULL ||
+        term->data == NULL || r->mlen <= 0 || r->slen < 0 ||
+        r->mlen < r->slen) return BSTR_ERR;
+    if (term->slen == 1) return bsreadlna (r, s, term->data[0]);
+    if (term->slen < 1 || buildCharField (&cf, term)) return BSTR_ERR;
+
+    l = s->buff->slen;
+    if (BSTR_OK != balloc (s->buff, s->maxBuffSz + 1)) return BSTR_ERR;
+    b = (unsigned char *) s->buff->data;
+    x.data = b;
+
+    /* First check if the current buffer holds the terminator */
+    b[l] = term->data[0]; /* Set sentinel */
+    for (i=0; !testInCharField (&cf, b[i]); i++) ;
+    if (i < l) {
+        x.slen = i + 1;
+        ret = bconcat (r, &x);
+        s->buff->slen = l;
+        if (BSTR_OK == ret) bdelete (s->buff, 0, i + 1);
+        return BSTR_OK;
+    }
+
+    rlo = r->slen;
+
+    /* If not then just concatenate the entire buffer to the output */
+    x.slen = l;
+    if (BSTR_OK != bconcat (r, &x)) return BSTR_ERR;
+
+    /* Perform direct in-place reads into the destination to allow for
+       the minimum of data-copies */
+    for (;;) {
+        if (BSTR_OK != balloc (r, r->slen + s->maxBuffSz + 1)) return BSTR_ERR;
+        b = (unsigned char *) (r->data + r->slen);
+        l = (int) s->readFnPtr (b, 1, s->maxBuffSz, s->parm);
+        if (l <= 0) {
+            r->data[r->slen] = (unsigned char) '\0';
+            s->buff->slen = 0;
+            s->isEOF = 1;
+            /* If nothing was read return with an error message */
+            return BSTR_ERR & -(r->slen == rlo);
+        }
+
+        b[l] = term->data[0]; /* Set sentinel */
+        for (i=0; !testInCharField (&cf, b[i]); i++) ;
+        if (i < l) break;
+        r->slen += l;
+    }
+
+    /* Terminator found, push over-read back to buffer */
+    i++;
+    r->slen += i;
+    s->buff->slen = l - i;
+    bstr__memcpy (s->buff->data, b + i, l - i);
+    r->data[r->slen] = (unsigned char) '\0';
+    return BSTR_OK;
 }
 
 /*  int bsreada (bstring r, struct bStream * s, int n)
@@ -2100,56 +2100,56 @@ int l, ret, orslen;
 char * b;
 struct tagbstring x;
 
-	if (s == NULL || s->buff == NULL || r == NULL || r->mlen <= 0
-	 || r->slen < 0 || r->mlen < r->slen || n <= 0) return BSTR_ERR;
+    if (s == NULL || s->buff == NULL || r == NULL || r->mlen <= 0
+     || r->slen < 0 || r->mlen < r->slen || n <= 0) return BSTR_ERR;
 
-	n += r->slen;
-	if (n <= 0) return BSTR_ERR;
+    n += r->slen;
+    if (n <= 0) return BSTR_ERR;
 
-	l = s->buff->slen;
+    l = s->buff->slen;
 
-	orslen = r->slen;
+    orslen = r->slen;
 
-	if (0 == l) {
-		if (s->isEOF) return BSTR_ERR;
-		if (r->mlen > n) {
-			l = (int) s->readFnPtr (r->data + r->slen, 1, n - r->slen, s->parm);
-			if (0 >= l || l > n - r->slen) {
-				s->isEOF = 1;
-				return BSTR_ERR;
-			}
-			r->slen += l;
-			r->data[r->slen] = (unsigned char) '\0';
-			return 0;
-		}
-	}
+    if (0 == l) {
+        if (s->isEOF) return BSTR_ERR;
+        if (r->mlen > n) {
+            l = (int) s->readFnPtr (r->data + r->slen, 1, n - r->slen, s->parm);
+            if (0 >= l || l > n - r->slen) {
+                s->isEOF = 1;
+                return BSTR_ERR;
+            }
+            r->slen += l;
+            r->data[r->slen] = (unsigned char) '\0';
+            return 0;
+        }
+    }
 
-	if (BSTR_OK != balloc (s->buff, s->maxBuffSz + 1)) return BSTR_ERR;
-	b = (char *) s->buff->data;
-	x.data = (unsigned char *) b;
+    if (BSTR_OK != balloc (s->buff, s->maxBuffSz + 1)) return BSTR_ERR;
+    b = (char *) s->buff->data;
+    x.data = (unsigned char *) b;
 
-	do {
-		if (l + r->slen >= n) {
-			x.slen = n - r->slen;
-			ret = bconcat (r, &x);
-			s->buff->slen = l;
-			if (BSTR_OK == ret) bdelete (s->buff, 0, x.slen);
-			return BSTR_ERR & -(r->slen == orslen);
-		}
+    do {
+        if (l + r->slen >= n) {
+            x.slen = n - r->slen;
+            ret = bconcat (r, &x);
+            s->buff->slen = l;
+            if (BSTR_OK == ret) bdelete (s->buff, 0, x.slen);
+            return BSTR_ERR & -(r->slen == orslen);
+        }
 
-		x.slen = l;
-		if (BSTR_OK != bconcat (r, &x)) break;
+        x.slen = l;
+        if (BSTR_OK != bconcat (r, &x)) break;
 
-		l = n - r->slen;
-		if (l > s->maxBuffSz) l = s->maxBuffSz;
+        l = n - r->slen;
+        if (l > s->maxBuffSz) l = s->maxBuffSz;
 
-		l = (int) s->readFnPtr (b, 1, l, s->parm);
+        l = (int) s->readFnPtr (b, 1, l, s->parm);
 
-	} while (l > 0);
-	if (l < 0) l = 0;
-	if (l == 0) s->isEOF = 1;
-	s->buff->slen = l;
-	return BSTR_ERR & -(r->slen == orslen);
+    } while (l > 0);
+    if (l < 0) l = 0;
+    if (l == 0) s->isEOF = 1;
+    s->buff->slen = l;
+    return BSTR_ERR & -(r->slen == orslen);
 }
 
 /*  int bsreadln (bstring r, struct bStream * s, char terminator)
@@ -2160,11 +2160,11 @@ struct tagbstring x;
  *  returned, but will be retained for subsequent read operations.
  */
 int bsreadln (bstring r, struct bStream * s, char terminator) {
-	if (s == NULL || s->buff == NULL || r == NULL || r->mlen <= 0)
-		return BSTR_ERR;
-	if (BSTR_OK != balloc (s->buff, s->maxBuffSz + 1)) return BSTR_ERR;
-	r->slen = 0;
-	return bsreadlna (r, s, terminator);
+    if (s == NULL || s->buff == NULL || r == NULL || r->mlen <= 0)
+        return BSTR_ERR;
+    if (BSTR_OK != balloc (s->buff, s->maxBuffSz + 1)) return BSTR_ERR;
+    r->slen = 0;
+    return bsreadlna (r, s, terminator);
 }
 
 /*  int bsreadlns (bstring r, struct bStream * s, bstring term)
@@ -2175,13 +2175,13 @@ int bsreadln (bstring r, struct bStream * s, char terminator) {
  *  are not returned, but will be retained for subsequent read operations.
  */
 int bsreadlns (bstring r, struct bStream * s, const_bstring term) {
-	if (s == NULL || s->buff == NULL || r == NULL || term == NULL 
-	 || term->data == NULL || r->mlen <= 0) return BSTR_ERR;
-	if (term->slen == 1) return bsreadln (r, s, term->data[0]);
-	if (term->slen < 1) return BSTR_ERR;
-	if (BSTR_OK != balloc (s->buff, s->maxBuffSz + 1)) return BSTR_ERR;
-	r->slen = 0;
-	return bsreadlnsa (r, s, term);
+    if (s == NULL || s->buff == NULL || r == NULL || term == NULL 
+     || term->data == NULL || r->mlen <= 0) return BSTR_ERR;
+    if (term->slen == 1) return bsreadln (r, s, term->data[0]);
+    if (term->slen < 1) return BSTR_ERR;
+    if (BSTR_OK != balloc (s->buff, s->maxBuffSz + 1)) return BSTR_ERR;
+    r->slen = 0;
+    return bsreadlnsa (r, s, term);
 }
 
 /*  int bsread (bstring r, struct bStream * s, int n)
@@ -2193,11 +2193,11 @@ int bsreadlns (bstring r, struct bStream * s, const_bstring term) {
  *  additional characters from the core stream beyond virtual stream pointer.
  */
 int bsread (bstring r, struct bStream * s, int n) {
-	if (s == NULL || s->buff == NULL || r == NULL || r->mlen <= 0
-	 || n <= 0) return BSTR_ERR;
-	if (BSTR_OK != balloc (s->buff, s->maxBuffSz + 1)) return BSTR_ERR;
-	r->slen = 0;
-	return bsreada (r, s, n);
+    if (s == NULL || s->buff == NULL || r == NULL || r->mlen <= 0
+     || n <= 0) return BSTR_ERR;
+    if (BSTR_OK != balloc (s->buff, s->maxBuffSz + 1)) return BSTR_ERR;
+    r->slen = 0;
+    return bsreada (r, s, n);
 }
 
 /*  int bsunread (struct bStream * s, const_bstring b)
@@ -2207,8 +2207,8 @@ int bsread (bstring r, struct bStream * s, int n) {
  *  stream.
  */
 int bsunread (struct bStream * s, const_bstring b) {
-	if (s == NULL || s->buff == NULL) return BSTR_ERR;
-	return binsert (s->buff, 0, b, (unsigned char) '?');
+    if (s == NULL || s->buff == NULL) return BSTR_ERR;
+    return binsert (s->buff, 0, b, (unsigned char) '?');
 }
 
 /*  int bspeek (bstring r, const struct bStream * s)
@@ -2217,8 +2217,8 @@ int bsunread (struct bStream * s, const_bstring b) {
  *  read prior to reads from the core stream.
  */
 int bspeek (bstring r, const struct bStream * s) {
-	if (s == NULL || s->buff == NULL) return BSTR_ERR;
-	return bassign (r, s->buff);
+    if (s == NULL || s->buff == NULL) return BSTR_ERR;
+    return bassign (r, s->buff);
 }
 
 /*  bstring bjoin (const struct bstrList * bl, const_bstring sep);
@@ -2231,46 +2231,46 @@ bstring bjoin (const struct bstrList * bl, const_bstring sep) {
 bstring b;
 int i, c, v;
 
-	if (bl == NULL || bl->qty < 0) return NULL;
-	if (sep != NULL && (sep->slen < 0 || sep->data == NULL)) return NULL;
-
-	for (i = 0, c = 1; i < bl->qty; i++) {
-		v = bl->entry[i]->slen;
-		if (v < 0) return NULL;	/* Invalid input */
-		c += v;
-		if (c < 0) return NULL;	/* Wrap around ?? */
-	}
-
-	if (sep != NULL) c += (bl->qty - 1) * sep->slen;
-
-	b = (bstring) bstr__alloc (sizeof (struct tagbstring));
-	if (NULL == b) return NULL; /* Out of memory */
-	b->data = (unsigned char *) bstr__alloc (c);
-	if (b->data == NULL) {
-		bstr__free (b);
-		return NULL;
-	}
-
-	b->mlen = c;
-	b->slen = c-1;
-
-	for (i = 0, c = 0; i < bl->qty; i++) {
-		if (i > 0 && sep != NULL) {
-			bstr__memcpy (b->data + c, sep->data, sep->slen);
-			c += sep->slen;
-		}
-		v = bl->entry[i]->slen;
-		bstr__memcpy (b->data + c, bl->entry[i]->data, v);
-		c += v;
-	}
-	b->data[c] = (unsigned char) '\0';
-	return b;
+    if (bl == NULL || bl->qty < 0) return NULL;
+    if (sep != NULL && (sep->slen < 0 || sep->data == NULL)) return NULL;
+
+    for (i = 0, c = 1; i < bl->qty; i++) {
+        v = bl->entry[i]->slen;
+        if (v < 0) return NULL;    /* Invalid input */
+        c += v;
+        if (c < 0) return NULL;    /* Wrap around ?? */
+    }
+
+    if (sep != NULL) c += (bl->qty - 1) * sep->slen;
+
+    b = (bstring) bstr__alloc (sizeof (struct tagbstring));
+    if (NULL == b) return NULL; /* Out of memory */
+    b->data = (unsigned char *) bstr__alloc (c);
+    if (b->data == NULL) {
+        bstr__free (b);
+        return NULL;
+    }
+
+    b->mlen = c;
+    b->slen = c-1;
+
+    for (i = 0, c = 0; i < bl->qty; i++) {
+        if (i > 0 && sep != NULL) {
+            bstr__memcpy (b->data + c, sep->data, sep->slen);
+            c += sep->slen;
+        }
+        v = bl->entry[i]->slen;
+        bstr__memcpy (b->data + c, bl->entry[i]->data, v);
+        c += v;
+    }
+    b->data[c] = (unsigned char) '\0';
+    return b;
 }
 
 #define BSSSC_BUFF_LEN (256)
 
 /*  int bssplitscb (struct bStream * s, const_bstring splitStr, 
- *	int (* cb) (void * parm, int ofs, const_bstring entry), void * parm)
+ *    int (* cb) (void * parm, int ofs, const_bstring entry), void * parm)
  *
  *  Iterate the set of disjoint sequential substrings read from a stream 
  *  divided by any of the characters in splitStr.  An empty splitStr causes 
@@ -2287,56 +2287,56 @@ int i, c, v;
  *  undefined manner.
  */
 int bssplitscb (struct bStream * s, const_bstring splitStr, 
-	int (* cb) (void * parm, int ofs, const_bstring entry), void * parm) {
+    int (* cb) (void * parm, int ofs, const_bstring entry), void * parm) {
 struct charField chrs;
 bstring buff;
 int i, p, ret;
 
-	if (cb == NULL || s == NULL || s->readFnPtr == NULL 
-	 || splitStr == NULL || splitStr->slen < 0) return BSTR_ERR;
-
-	if (NULL == (buff = bfromcstr (""))) return BSTR_ERR;
-
-	if (splitStr->slen == 0) {
-		while (bsreada (buff, s, BSSSC_BUFF_LEN) >= 0) ;
-		if ((ret = cb (parm, 0, buff)) > 0) 
-			ret = 0;
-	} else {
-		buildCharField (&chrs, splitStr);
-		ret = p = i = 0;
-		for (;;) {
-			if (i >= buff->slen) {
-				bsreada (buff, s, BSSSC_BUFF_LEN);
-				if (i >= buff->slen) {
-					if (0 < (ret = cb (parm, p, buff))) ret = 0;
-					break;
-				}
-			}
-			if (testInCharField (&chrs, buff->data[i])) {
-				struct tagbstring t;
-				unsigned char c;
-
-				blk2tbstr (t, buff->data + i + 1, buff->slen - (i + 1));
-				if ((ret = bsunread (s, &t)) < 0) break;
-				buff->slen = i;
-				c = buff->data[i];
-				buff->data[i] = (unsigned char) '\0';
-				if ((ret = cb (parm, p, buff)) < 0) break;
-				buff->data[i] = c;
-				buff->slen = 0;
-				p += i + 1;
-				i = -1;
-			}
-			i++;
-		}
-	}
-
-	bdestroy (buff);
-	return ret;
+    if (cb == NULL || s == NULL || s->readFnPtr == NULL 
+     || splitStr == NULL || splitStr->slen < 0) return BSTR_ERR;
+
+    if (NULL == (buff = bfromcstr (""))) return BSTR_ERR;
+
+    if (splitStr->slen == 0) {
+        while (bsreada (buff, s, BSSSC_BUFF_LEN) >= 0) ;
+        if ((ret = cb (parm, 0, buff)) > 0) 
+            ret = 0;
+    } else {
+        buildCharField (&chrs, splitStr);
+        ret = p = i = 0;
+        for (;;) {
+            if (i >= buff->slen) {
+                bsreada (buff, s, BSSSC_BUFF_LEN);
+                if (i >= buff->slen) {
+                    if (0 < (ret = cb (parm, p, buff))) ret = 0;
+                    break;
+                }
+            }
+            if (testInCharField (&chrs, buff->data[i])) {
+                struct tagbstring t;
+                unsigned char c;
+
+                blk2tbstr (t, buff->data + i + 1, buff->slen - (i + 1));
+                if ((ret = bsunread (s, &t)) < 0) break;
+                buff->slen = i;
+                c = buff->data[i];
+                buff->data[i] = (unsigned char) '\0';
+                if ((ret = cb (parm, p, buff)) < 0) break;
+                buff->data[i] = c;
+                buff->slen = 0;
+                p += i + 1;
+                i = -1;
+            }
+            i++;
+        }
+    }
+
+    bdestroy (buff);
+    return ret;
 }
 
 /*  int bssplitstrcb (struct bStream * s, const_bstring splitStr, 
- *	int (* cb) (void * parm, int ofs, const_bstring entry), void * parm)
+ *    int (* cb) (void * parm, int ofs, const_bstring entry), void * parm)
  *
  *  Iterate the set of disjoint sequential substrings read from a stream 
  *  divided by the entire substring splitStr.  An empty splitStr causes 
@@ -2353,48 +2353,48 @@ int i, p, ret;
  *  undefined manner.
  */
 int bssplitstrcb (struct bStream * s, const_bstring splitStr, 
-	int (* cb) (void * parm, int ofs, const_bstring entry), void * parm) {
+    int (* cb) (void * parm, int ofs, const_bstring entry), void * parm) {
 bstring buff;
 int i, p, ret;
 
-	if (cb == NULL || s == NULL || s->readFnPtr == NULL 
-	 || splitStr == NULL || splitStr->slen < 0) return BSTR_ERR;
-
-	if (splitStr->slen == 1) return bssplitscb (s, splitStr, cb, parm);
-
-	if (NULL == (buff = bfromcstr (""))) return BSTR_ERR;
-
-	if (splitStr->slen == 0) {
-		for (i=0; bsreada (buff, s, BSSSC_BUFF_LEN) >= 0; i++) {
-			if ((ret = cb (parm, 0, buff)) < 0) {
-				bdestroy (buff);
-				return ret;
-			}
-			buff->slen = 0;
-		}
-		return BSTR_OK;
-	} else {
-		ret = p = i = 0;
-		for (i=p=0;;) {
-			if ((ret = binstr (buff, 0, splitStr)) >= 0) {
-				struct tagbstring t;
-				blk2tbstr (t, buff->data, ret);
-				i = ret + splitStr->slen;
-				if ((ret = cb (parm, p, &t)) < 0) break;
-				p += i;
-				bdelete (buff, 0, i);
-			} else {
-				bsreada (buff, s, BSSSC_BUFF_LEN);
-				if (bseof (s)) {
-					if ((ret = cb (parm, p, buff)) > 0) ret = 0;
-					break;
-				}
-			}
-		}
-	}
-
-	bdestroy (buff);
-	return ret;
+    if (cb == NULL || s == NULL || s->readFnPtr == NULL 
+     || splitStr == NULL || splitStr->slen < 0) return BSTR_ERR;
+
+    if (splitStr->slen == 1) return bssplitscb (s, splitStr, cb, parm);
+
+    if (NULL == (buff = bfromcstr (""))) return BSTR_ERR;
+
+    if (splitStr->slen == 0) {
+        for (i=0; bsreada (buff, s, BSSSC_BUFF_LEN) >= 0; i++) {
+            if ((ret = cb (parm, 0, buff)) < 0) {
+                bdestroy (buff);
+                return ret;
+            }
+            buff->slen = 0;
+        }
+        return BSTR_OK;
+    } else {
+        ret = p = i = 0;
+        for (i=p=0;;) {
+            if ((ret = binstr (buff, 0, splitStr)) >= 0) {
+                struct tagbstring t;
+                blk2tbstr (t, buff->data, ret);
+                i = ret + splitStr->slen;
+                if ((ret = cb (parm, p, &t)) < 0) break;
+                p += i;
+                bdelete (buff, 0, i);
+            } else {
+                bsreada (buff, s, BSSSC_BUFF_LEN);
+                if (bseof (s)) {
+                    if ((ret = cb (parm, p, buff)) > 0) ret = 0;
+                    break;
+                }
+            }
+        }
+    }
+
+    bdestroy (buff);
+    return ret;
 }
 
 /*  int bstrListCreate (void)
@@ -2403,17 +2403,17 @@ int i, p, ret;
  */
 struct bstrList * bstrListCreate (void) {
 struct bstrList * sl = (struct bstrList *) bstr__alloc (sizeof (struct bstrList));
-	if (sl) {
-		sl->entry = (bstring *) bstr__alloc (1*sizeof (bstring));
-		if (!sl->entry) {
-			bstr__free (sl);
-			sl = NULL;
-		} else {
-			sl->qty = 0;
-			sl->mlen = 1;
-		}
-	}
-	return sl;
+    if (sl) {
+        sl->entry = (bstring *) bstr__alloc (1*sizeof (bstring));
+        if (!sl->entry) {
+            bstr__free (sl);
+            sl = NULL;
+        } else {
+            sl->qty = 0;
+            sl->mlen = 1;
+        }
+    }
+    return sl;
 }
 
 /*  int bstrListDestroy (struct bstrList * sl)
@@ -2422,19 +2422,19 @@ struct bstrList * sl = (struct bstrList *) bstr__alloc (sizeof (struct bstrList)
  */
 int bstrListDestroy (struct bstrList * sl) {
 int i;
-	if (sl == NULL || sl->qty < 0) return BSTR_ERR;
-	for (i=0; i < sl->qty; i++) {
-		if (sl->entry[i]) {
-			bdestroy (sl->entry[i]);
-			sl->entry[i] = NULL;
-		}
-	}
-	sl->qty  = -1;
-	sl->mlen = -1;
-	bstr__free (sl->entry);
-	sl->entry = NULL;
-	bstr__free (sl);
-	return BSTR_OK;
+    if (sl == NULL || sl->qty < 0) return BSTR_ERR;
+    for (i=0; i < sl->qty; i++) {
+        if (sl->entry[i]) {
+            bdestroy (sl->entry[i]);
+            sl->entry[i] = NULL;
+        }
+    }
+    sl->qty  = -1;
+    sl->mlen = -1;
+    bstr__free (sl->entry);
+    sl->entry = NULL;
+    bstr__free (sl);
+    return BSTR_OK;
 }
 
 /*  int bstrListAlloc (struct bstrList * sl, int msz)
@@ -2446,21 +2446,21 @@ int bstrListAlloc (struct bstrList * sl, int msz) {
 bstring * l;
 int smsz;
 size_t nsz;
-	if (!sl || msz <= 0 || !sl->entry || sl->qty < 0 || sl->mlen <= 0 || sl->qty > sl->mlen) return BSTR_ERR;
-	if (sl->mlen >= msz) return BSTR_OK;
-	smsz = snapUpSize (msz);
-	nsz = ((size_t) smsz) * sizeof (bstring);
-	if (nsz < (size_t) smsz) return BSTR_ERR;
-	l = (bstring *) bstr__realloc (sl->entry, nsz);
-	if (!l) {
-		smsz = msz;
-		nsz = ((size_t) smsz) * sizeof (bstring);
-		l = (bstring *) bstr__realloc (sl->entry, nsz);
-		if (!l) return BSTR_ERR;
-	}
-	sl->mlen = smsz;
-	sl->entry = l;
-	return BSTR_OK;
+    if (!sl || msz <= 0 || !sl->entry || sl->qty < 0 || sl->mlen <= 0 || sl->qty > sl->mlen) return BSTR_ERR;
+    if (sl->mlen >= msz) return BSTR_OK;
+    smsz = snapUpSize (msz);
+    nsz = ((size_t) smsz) * sizeof (bstring);
+    if (nsz < (size_t) smsz) return BSTR_ERR;
+    l = (bstring *) bstr__realloc (sl->entry, nsz);
+    if (!l) {
+        smsz = msz;
+        nsz = ((size_t) smsz) * sizeof (bstring);
+        l = (bstring *) bstr__realloc (sl->entry, nsz);
+        if (!l) return BSTR_ERR;
+    }
+    sl->mlen = smsz;
+    sl->entry = l;
+    return BSTR_OK;
 }
 
 /*  int bstrListAllocMin (struct bstrList * sl, int msz)
@@ -2471,20 +2471,20 @@ size_t nsz;
 int bstrListAllocMin (struct bstrList * sl, int msz) {
 bstring * l;
 size_t nsz;
-	if (!sl || msz <= 0 || !sl->entry || sl->qty < 0 || sl->mlen <= 0 || sl->qty > sl->mlen) return BSTR_ERR;
-	if (msz < sl->qty) msz = sl->qty;
-	if (sl->mlen == msz) return BSTR_OK;
-	nsz = ((size_t) msz) * sizeof (bstring);
-	if (nsz < (size_t) msz) return BSTR_ERR;
-	l = (bstring *) bstr__realloc (sl->entry, nsz);
-	if (!l) return BSTR_ERR;
-	sl->mlen = msz;
-	sl->entry = l;
-	return BSTR_OK;
+    if (!sl || msz <= 0 || !sl->entry || sl->qty < 0 || sl->mlen <= 0 || sl->qty > sl->mlen) return BSTR_ERR;
+    if (msz < sl->qty) msz = sl->qty;
+    if (sl->mlen == msz) return BSTR_OK;
+    nsz = ((size_t) msz) * sizeof (bstring);
+    if (nsz < (size_t) msz) return BSTR_ERR;
+    l = (bstring *) bstr__realloc (sl->entry, nsz);
+    if (!l) return BSTR_ERR;
+    sl->mlen = msz;
+    sl->entry = l;
+    return BSTR_OK;
 }
 
 /*  int bsplitcb (const_bstring str, unsigned char splitChar, int pos,
- *	int (* cb) (void * parm, int ofs, int len), void * parm)
+ *    int (* cb) (void * parm, int ofs, int len), void * parm)
  *
  *  Iterate the set of disjoint sequential substrings over str divided by the
  *  character in splitChar.
@@ -2499,25 +2499,25 @@ size_t nsz;
  *  otherwise bsplitcb will continue in an undefined manner.
  */
 int bsplitcb (const_bstring str, unsigned char splitChar, int pos,
-	int (* cb) (void * parm, int ofs, int len), void * parm) {
+    int (* cb) (void * parm, int ofs, int len), void * parm) {
 int i, p, ret;
 
-	if (cb == NULL || str == NULL || pos < 0 || pos > str->slen) 
-		return BSTR_ERR;
+    if (cb == NULL || str == NULL || pos < 0 || pos > str->slen) 
+        return BSTR_ERR;
 
-	p = pos;
-	do {
-		for (i=p; i < str->slen; i++) {
-			if (str->data[i] == splitChar) break;
-		}
-		if ((ret = cb (parm, p, i - p)) < 0) return ret;
-		p = i + 1;
-	} while (p <= str->slen);
-	return BSTR_OK;
+    p = pos;
+    do {
+        for (i=p; i < str->slen; i++) {
+            if (str->data[i] == splitChar) break;
+        }
+        if ((ret = cb (parm, p, i - p)) < 0) return ret;
+        p = i + 1;
+    } while (p <= str->slen);
+    return BSTR_OK;
 }
 
 /*  int bsplitscb (const_bstring str, const_bstring splitStr, int pos,
- *	int (* cb) (void * parm, int ofs, int len), void * parm)
+ *    int (* cb) (void * parm, int ofs, int len), void * parm)
  *
  *  Iterate the set of disjoint sequential substrings over str divided by any 
  *  of the characters in splitStr.  An empty splitStr causes the whole str to
@@ -2533,35 +2533,35 @@ int i, p, ret;
  *  otherwise bsplitscb will continue in an undefined manner.
  */
 int bsplitscb (const_bstring str, const_bstring splitStr, int pos,
-	int (* cb) (void * parm, int ofs, int len), void * parm) {
+    int (* cb) (void * parm, int ofs, int len), void * parm) {
 struct charField chrs;
 int i, p, ret;
 
-	if (cb == NULL || str == NULL || pos < 0 || pos > str->slen 
-	 || splitStr == NULL || splitStr->slen < 0) return BSTR_ERR;
-	if (splitStr->slen == 0) {
-		if ((ret = cb (parm, 0, str->slen)) > 0) ret = 0;
-		return ret;
-	}
+    if (cb == NULL || str == NULL || pos < 0 || pos > str->slen 
+     || splitStr == NULL || splitStr->slen < 0) return BSTR_ERR;
+    if (splitStr->slen == 0) {
+        if ((ret = cb (parm, 0, str->slen)) > 0) ret = 0;
+        return ret;
+    }
 
-	if (splitStr->slen == 1) 
-		return bsplitcb (str, splitStr->data[0], pos, cb, parm);
+    if (splitStr->slen == 1) 
+        return bsplitcb (str, splitStr->data[0], pos, cb, parm);
 
-	buildCharField (&chrs, splitStr);
+    buildCharField (&chrs, splitStr);
 
-	p = pos;
-	do {
-		for (i=p; i < str->slen; i++) {
-			if (testInCharField (&chrs, str->data[i])) break;
-		}
-		if ((ret = cb (parm, p, i - p)) < 0) return ret;
-		p = i + 1;
-	} while (p <= str->slen);
-	return BSTR_OK;
+    p = pos;
+    do {
+        for (i=p; i < str->slen; i++) {
+            if (testInCharField (&chrs, str->data[i])) break;
+        }
+        if ((ret = cb (parm, p, i - p)) < 0) return ret;
+        p = i + 1;
+    } while (p <= str->slen);
+    return BSTR_OK;
 }
 
 /*  int bsplitstrcb (const_bstring str, const_bstring splitStr, int pos,
- *	int (* cb) (void * parm, int ofs, int len), void * parm)
+ *    int (* cb) (void * parm, int ofs, int len), void * parm)
  *
  *  Iterate the set of disjoint sequential substrings over str divided by the 
  *  substring splitStr.  An empty splitStr causes the whole str to be 
@@ -2577,59 +2577,59 @@ int i, p, ret;
  *  otherwise bsplitscb will continue in an undefined manner.
  */
 int bsplitstrcb (const_bstring str, const_bstring splitStr, int pos,
-	int (* cb) (void * parm, int ofs, int len), void * parm) {
+    int (* cb) (void * parm, int ofs, int len), void * parm) {
 int i, p, ret;
 
-	if (cb == NULL || str == NULL || pos < 0 || pos > str->slen 
-	 || splitStr == NULL || splitStr->slen < 0) return BSTR_ERR;
+    if (cb == NULL || str == NULL || pos < 0 || pos > str->slen 
+     || splitStr == NULL || splitStr->slen < 0) return BSTR_ERR;
 
-	if (0 == splitStr->slen) {
-		for (i=pos; i < str->slen; i++) {
-			if ((ret = cb (parm, i, 1)) < 0) return ret;
-		}
-		return BSTR_OK;
-	}
+    if (0 == splitStr->slen) {
+        for (i=pos; i < str->slen; i++) {
+            if ((ret = cb (parm, i, 1)) < 0) return ret;
+        }
+        return BSTR_OK;
+    }
 
-	if (splitStr->slen == 1) 
-		return bsplitcb (str, splitStr->data[0], pos, cb, parm);
+    if (splitStr->slen == 1) 
+        return bsplitcb (str, splitStr->data[0], pos, cb, parm);
 
-	for (i=p=pos; i <= str->slen - splitStr->slen; i++) {
-		if (0 == bstr__memcmp (splitStr->data, str->data + i, splitStr->slen)) {
-			if ((ret = cb (parm, p, i - p)) < 0) return ret;
-			i += splitStr->slen;
-			p = i;
-		}
-	}
-	if ((ret = cb (parm, p, str->slen - p)) < 0) return ret;
-	return BSTR_OK;
+    for (i=p=pos; i <= str->slen - splitStr->slen; i++) {
+        if (0 == bstr__memcmp (splitStr->data, str->data + i, splitStr->slen)) {
+            if ((ret = cb (parm, p, i - p)) < 0) return ret;
+            i += splitStr->slen;
+            p = i;
+        }
+    }
+    if ((ret = cb (parm, p, str->slen - p)) < 0) return ret;
+    return BSTR_OK;
 }
 
 struct genBstrList {
-	bstring b;
-	struct bstrList * bl;
+    bstring b;
+    struct bstrList * bl;
 };
 
 static int bscb (void * parm, int ofs, int len) {
 struct genBstrList * g = (struct genBstrList *) parm;
-	if (g->bl->qty >= g->bl->mlen) {
-		int mlen = g->bl->mlen * 2;
-		bstring * tbl;
+    if (g->bl->qty >= g->bl->mlen) {
+        int mlen = g->bl->mlen * 2;
+        bstring * tbl;
 
-		while (g->bl->qty >= mlen) {
-			if (mlen < g->bl->mlen) return BSTR_ERR;
-			mlen += mlen;
-		}
+        while (g->bl->qty >= mlen) {
+            if (mlen < g->bl->mlen) return BSTR_ERR;
+            mlen += mlen;
+        }
 
-		tbl = (bstring *) bstr__realloc (g->bl->entry, sizeof (bstring) * mlen);
-		if (tbl == NULL) return BSTR_ERR;
+        tbl = (bstring *) bstr__realloc (g->bl->entry, sizeof (bstring) * mlen);
+        if (tbl == NULL) return BSTR_ERR;
 
-		g->bl->entry = tbl;
-		g->bl->mlen = mlen;
-	}
+        g->bl->entry = tbl;
+        g->bl->mlen = mlen;
+    }
 
-	g->bl->entry[g->bl->qty] = bmidstr (g->b, ofs, len);
-	g->bl->qty++;
-	return BSTR_OK;
+    g->bl->entry[g->bl->qty] = bmidstr (g->b, ofs, len);
+    g->bl->qty++;
+    return BSTR_OK;
 }
 
 /*  struct bstrList * bsplit (const_bstring str, unsigned char splitChar)
@@ -2640,24 +2640,24 @@ struct genBstrList * g = (struct genBstrList *) parm;
 struct bstrList * bsplit (const_bstring str, unsigned char splitChar) {
 struct genBstrList g;
 
-	if (str == NULL || str->data == NULL || str->slen < 0) return NULL;
+    if (str == NULL || str->data == NULL || str->slen < 0) return NULL;
 
-	g.bl = (struct bstrList *) bstr__alloc (sizeof (struct bstrList));
-	if (g.bl == NULL) return NULL;
-	g.bl->mlen = 4;
-	g.bl->entry = (bstring *) bstr__alloc (g.bl->mlen * sizeof (bstring));
-	if (NULL == g.bl->entry) {
-		bstr__free (g.bl);
-		return NULL;
-	}
+    g.bl = (struct bstrList *) bstr__alloc (sizeof (struct bstrList));
+    if (g.bl == NULL) return NULL;
+    g.bl->mlen = 4;
+    g.bl->entry = (bstring *) bstr__alloc (g.bl->mlen * sizeof (bstring));
+    if (NULL == g.bl->entry) {
+        bstr__free (g.bl);
+        return NULL;
+    }
 
-	g.b = (bstring) str;
-	g.bl->qty = 0;
-	if (bsplitcb (str, splitChar, 0, bscb, &g) < 0) {
-		bstrListDestroy (g.bl);
-		return NULL;
-	}
-	return g.bl;
+    g.b = (bstring) str;
+    g.bl->qty = 0;
+    if (bsplitcb (str, splitChar, 0, bscb, &g) < 0) {
+        bstrListDestroy (g.bl);
+        return NULL;
+    }
+    return g.bl;
 }
 
 /*  struct bstrList * bsplitstr (const_bstring str, const_bstring splitStr)
@@ -2668,24 +2668,24 @@ struct genBstrList g;
 struct bstrList * bsplitstr (const_bstring str, const_bstring splitStr) {
 struct genBstrList g;
 
-	if (str == NULL || str->data == NULL || str->slen < 0) return NULL;
+    if (str == NULL || str->data == NULL || str->slen < 0) return NULL;
 
-	g.bl = (struct bstrList *) bstr__alloc (sizeof (struct bstrList));
-	if (g.bl == NULL) return NULL;
-	g.bl->mlen = 4;
-	g.bl->entry = (bstring *) bstr__alloc (g.bl->mlen * sizeof (bstring));
-	if (NULL == g.bl->entry) {
-		bstr__free (g.bl);
-		return NULL;
-	}
+    g.bl = (struct bstrList *) bstr__alloc (sizeof (struct bstrList));
+    if (g.bl == NULL) return NULL;
+    g.bl->mlen = 4;
+    g.bl->entry = (bstring *) bstr__alloc (g.bl->mlen * sizeof (bstring));
+    if (NULL == g.bl->entry) {
+        bstr__free (g.bl);
+        return NULL;
+    }
 
-	g.b = (bstring) str;
-	g.bl->qty = 0;
-	if (bsplitstrcb (str, splitStr, 0, bscb, &g) < 0) {
-		bstrListDestroy (g.bl);
-		return NULL;
-	}
-	return g.bl;
+    g.b = (bstring) str;
+    g.bl->qty = 0;
+    if (bsplitstrcb (str, splitStr, 0, bscb, &g) < 0) {
+        bstrListDestroy (g.bl);
+        return NULL;
+    }
+    return g.bl;
 }
 
 /*  struct bstrList * bsplits (const_bstring str, bstring splitStr)
@@ -2697,26 +2697,26 @@ struct genBstrList g;
 struct bstrList * bsplits (const_bstring str, const_bstring splitStr) {
 struct genBstrList g;
 
-	if (     str == NULL ||      str->slen < 0 ||      str->data == NULL ||
-	    splitStr == NULL || splitStr->slen < 0 || splitStr->data == NULL)
-		return NULL;
+    if (     str == NULL ||      str->slen < 0 ||      str->data == NULL ||
+        splitStr == NULL || splitStr->slen < 0 || splitStr->data == NULL)
+        return NULL;
 
-	g.bl = (struct bstrList *) bstr__alloc (sizeof (struct bstrList));
-	if (g.bl == NULL) return NULL;
-	g.bl->mlen = 4;
-	g.bl->entry = (bstring *) bstr__alloc (g.bl->mlen * sizeof (bstring));
-	if (NULL == g.bl->entry) {
-		bstr__free (g.bl);
-		return NULL;
-	}
-	g.b = (bstring) str;
-	g.bl->qty = 0;
+    g.bl = (struct bstrList *) bstr__alloc (sizeof (struct bstrList));
+    if (g.bl == NULL) return NULL;
+    g.bl->mlen = 4;
+    g.bl->entry = (bstring *) bstr__alloc (g.bl->mlen * sizeof (bstring));
+    if (NULL == g.bl->entry) {
+        bstr__free (g.bl);
+        return NULL;
+    }
+    g.b = (bstring) str;
+    g.bl->qty = 0;
 
-	if (bsplitscb (str, splitStr, 0, bscb, &g) < 0) {
-		bstrListDestroy (g.bl);
-		return NULL;
-	}
-	return g.bl;
+    if (bsplitscb (str, splitStr, 0, bscb, &g) < 0) {
+        bstrListDestroy (g.bl);
+        return NULL;
+    }
+    return g.bl;
 }
 
 #if defined (__TURBOC__) && !defined (__BORLANDC__)
@@ -2772,40 +2772,40 @@ va_list arglist;
 bstring buff;
 int n, r;
 
-	if (b == NULL || fmt == NULL || b->data == NULL || b->mlen <= 0 
-	 || b->slen < 0 || b->slen > b->mlen) return BSTR_ERR;
+    if (b == NULL || fmt == NULL || b->data == NULL || b->mlen <= 0 
+     || b->slen < 0 || b->slen > b->mlen) return BSTR_ERR;
 
-	/* Since the length is not determinable beforehand, a search is
-	   performed using the truncating "vsnprintf" call (to avoid buffer
-	   overflows) on increasing potential sizes for the output result. */
+    /* Since the length is not determinable beforehand, a search is
+       performed using the truncating "vsnprintf" call (to avoid buffer
+       overflows) on increasing potential sizes for the output result. */
 
-	if ((n = (int) (2*strlen (fmt))) < START_VSNBUFF) n = START_VSNBUFF;
-	if (NULL == (buff = bfromcstralloc (n + 2, ""))) {
-		n = 1;
-		if (NULL == (buff = bfromcstralloc (n + 2, ""))) return BSTR_ERR;
-	}
+    if ((n = (int) (2*strlen (fmt))) < START_VSNBUFF) n = START_VSNBUFF;
+    if (NULL == (buff = bfromcstralloc (n + 2, ""))) {
+        n = 1;
+        if (NULL == (buff = bfromcstralloc (n + 2, ""))) return BSTR_ERR;
+    }
 
-	for (;;) {
-		va_start (arglist, fmt);
-		exvsnprintf (r, (char *) buff->data, n + 1, fmt, arglist);
-		va_end (arglist);
+    for (;;) {
+        va_start (arglist, fmt);
+        exvsnprintf (r, (char *) buff->data, n + 1, fmt, arglist);
+        va_end (arglist);
 
-		buff->data[n] = (unsigned char) '\0';
-		buff->slen = (int) (strlen) ((char *) buff->data);
+        buff->data[n] = (unsigned char) '\0';
+        buff->slen = (int) (strlen) ((char *) buff->data);
 
-		if (buff->slen < n) break;
+        if (buff->slen < n) break;
 
-		if (r > n) n = r; else n += n;
+        if (r > n) n = r; else n += n;
 
-		if (BSTR_OK != balloc (buff, n + 2)) {
-			bdestroy (buff);
-			return BSTR_ERR;
-		}
-	}
+        if (BSTR_OK != balloc (buff, n + 2)) {
+            bdestroy (buff);
+            return BSTR_ERR;
+        }
+    }
 
-	r = bconcat (b, buff);
-	bdestroy (buff);
-	return r;
+    r = bconcat (b, buff);
+    bdestroy (buff);
+    return r;
 }
 
 /*  int bassignformat (bstring b, const char * fmt, ...)
@@ -2820,40 +2820,40 @@ va_list arglist;
 bstring buff;
 int n, r;
 
-	if (b == NULL || fmt == NULL || b->data == NULL || b->mlen <= 0 
-	 || b->slen < 0 || b->slen > b->mlen) return BSTR_ERR;
+    if (b == NULL || fmt == NULL || b->data == NULL || b->mlen <= 0 
+     || b->slen < 0 || b->slen > b->mlen) return BSTR_ERR;
 
-	/* Since the length is not determinable beforehand, a search is
-	   performed using the truncating "vsnprintf" call (to avoid buffer
-	   overflows) on increasing potential sizes for the output result. */
+    /* Since the length is not determinable beforehand, a search is
+       performed using the truncating "vsnprintf" call (to avoid buffer
+       overflows) on increasing potential sizes for the output result. */
 
-	if ((n = (int) (2*strlen (fmt))) < START_VSNBUFF) n = START_VSNBUFF;
-	if (NULL == (buff = bfromcstralloc (n + 2, ""))) {
-		n = 1;
-		if (NULL == (buff = bfromcstralloc (n + 2, ""))) return BSTR_ERR;
-	}
+    if ((n = (int) (2*strlen (fmt))) < START_VSNBUFF) n = START_VSNBUFF;
+    if (NULL == (buff = bfromcstralloc (n + 2, ""))) {
+        n = 1;
+        if (NULL == (buff = bfromcstralloc (n + 2, ""))) return BSTR_ERR;
+    }
 
-	for (;;) {
-		va_start (arglist, fmt);
-		exvsnprintf (r, (char *) buff->data, n + 1, fmt, arglist);
-		va_end (arglist);
+    for (;;) {
+        va_start (arglist, fmt);
+        exvsnprintf (r, (char *) buff->data, n + 1, fmt, arglist);
+        va_end (arglist);
 
-		buff->data[n] = (unsigned char) '\0';
-		buff->slen = (int) (strlen) ((char *) buff->data);
+        buff->data[n] = (unsigned char) '\0';
+        buff->slen = (int) (strlen) ((char *) buff->data);
 
-		if (buff->slen < n) break;
+        if (buff->slen < n) break;
 
-		if (r > n) n = r; else n += n;
+        if (r > n) n = r; else n += n;
 
-		if (BSTR_OK != balloc (buff, n + 2)) {
-			bdestroy (buff);
-			return BSTR_ERR;
-		}
-	}
+        if (BSTR_OK != balloc (buff, n + 2)) {
+            bdestroy (buff);
+            return BSTR_ERR;
+        }
+    }
 
-	r = bassign (b, buff);
-	bdestroy (buff);
-	return r;
+    r = bassign (b, buff);
+    bdestroy (buff);
+    return r;
 }
 
 /*  bstring bformat (const char * fmt, ...)
@@ -2868,37 +2868,37 @@ va_list arglist;
 bstring buff;
 int n, r;
 
-	if (fmt == NULL) return NULL;
+    if (fmt == NULL) return NULL;
 
-	/* Since the length is not determinable beforehand, a search is
-	   performed using the truncating "vsnprintf" call (to avoid buffer
-	   overflows) on increasing potential sizes for the output result. */
+    /* Since the length is not determinable beforehand, a search is
+       performed using the truncating "vsnprintf" call (to avoid buffer
+       overflows) on increasing potential sizes for the output result. */
 
-	if ((n = (int) (2*strlen (fmt))) < START_VSNBUFF) n = START_VSNBUFF;
-	if (NULL == (buff = bfromcstralloc (n + 2, ""))) {
-		n = 1;
-		if (NULL == (buff = bfromcstralloc (n + 2, ""))) return NULL;
-	}
+    if ((n = (int) (2*strlen (fmt))) < START_VSNBUFF) n = START_VSNBUFF;
+    if (NULL == (buff = bfromcstralloc (n + 2, ""))) {
+        n = 1;
+        if (NULL == (buff = bfromcstralloc (n + 2, ""))) return NULL;
+    }
 
-	for (;;) {
-		va_start (arglist, fmt);
-		exvsnprintf (r, (char *) buff->data, n + 1, fmt, arglist);
-		va_end (arglist);
+    for (;;) {
+        va_start (arglist, fmt);
+        exvsnprintf (r, (char *) buff->data, n + 1, fmt, arglist);
+        va_end (arglist);
 
-		buff->data[n] = (unsigned char) '\0';
-		buff->slen = (int) (strlen) ((char *) buff->data);
+        buff->data[n] = (unsigned char) '\0';
+        buff->slen = (int) (strlen) ((char *) buff->data);
 
-		if (buff->slen < n) break;
+        if (buff->slen < n) break;
 
-		if (r > n) n = r; else n += n;
+        if (r > n) n = r; else n += n;
 
-		if (BSTR_OK != balloc (buff, n + 2)) {
-			bdestroy (buff);
-			return NULL;
-		}
-	}
+        if (BSTR_OK != balloc (buff, n + 2)) {
+            bdestroy (buff);
+            return NULL;
+        }
+    }
 
-	return buff;
+    return buff;
 }
 
 /*  int bvcformata (bstring b, int count, const char * fmt, va_list arglist)
@@ -2924,32 +2924,32 @@ int n, r;
 int bvcformata (bstring b, int count, const char * fmt, va_list arg) {
 int n, r, l;
 
-	if (b == NULL || fmt == NULL || count <= 0 || b->data == NULL
-	 || b->mlen <= 0 || b->slen < 0 || b->slen > b->mlen) return BSTR_ERR;
+    if (b == NULL || fmt == NULL || count <= 0 || b->data == NULL
+     || b->mlen <= 0 || b->slen < 0 || b->slen > b->mlen) return BSTR_ERR;
 
-	if (count > (n = b->slen + count) + 2) return BSTR_ERR;
-	if (BSTR_OK != balloc (b, n + 2)) return BSTR_ERR;
+    if (count > (n = b->slen + count) + 2) return BSTR_ERR;
+    if (BSTR_OK != balloc (b, n + 2)) return BSTR_ERR;
 
-	exvsnprintf (r, (char *) b->data + b->slen, count + 2, fmt, arg);
+    exvsnprintf (r, (char *) b->data + b->slen, count + 2, fmt, arg);
 
-	/* Did the operation complete successfully within bounds? */
+    /* Did the operation complete successfully within bounds? */
 
-	if (n >= (l = b->slen + (int) (strlen) ((const char *) b->data + b->slen))) {
-		b->slen = l;
-		return BSTR_OK;
-	}
+    if (n >= (l = b->slen + (int) (strlen) ((const char *) b->data + b->slen))) {
+        b->slen = l;
+        return BSTR_OK;
+    }
 
-	/* Abort, since the buffer was not large enough.  The return value 
-	   tries to help set what the retry length should be. */
+    /* Abort, since the buffer was not large enough.  The return value 
+       tries to help set what the retry length should be. */
 
-	b->data[b->slen] = '\0';
-	if (r > count+1) l = r; else {
-		l = count+count;
-		if (count > l) l = INT_MAX;
-	}
-	n = -l;
-	if (n > BSTR_ERR-1) n = BSTR_ERR-1;
-	return n;
+    b->data[b->slen] = '\0';
+    if (r > count+1) l = r; else {
+        l = count+count;
+        if (count > l) l = INT_MAX;
+    }
+    n = -l;
+    if (n > BSTR_ERR-1) n = BSTR_ERR-1;
+    return n;
 }
 
 #endif
diff --git a/src/configuration.c b/src/configuration.c
new file mode 100644
index 0000000..1ac682b
--- /dev/null
+++ b/src/configuration.c
@@ -0,0 +1,183 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  configuration.c
+ *
+ *      Description:  Configuration file module.
+ *
+ *      Version:   4.0
+ *      Released:  16.6.2015
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <string.h>
+
+
+
+#include <configuration.h>
+
+Configuration config = {NULL,NULL,NULL,-1,MAX_NUM_THREADS,MAX_NUM_NODES}; /* global settings filled by init_configuration(); NOTE(review): initializer order follows the struct in configuration.h - confirm */
+int init_config = 0; /* set to 1 at the end of init_configuration(), reset to 0 by destroy_configuration() */
+
+/* Store a heap copy of text in *slot; returns 0, or -ENOMEM leaving *slot untouched. */
+static int config_copy_string(char** slot, const char* text)
+{
+    char* copy = malloc(strlen(text) + 1);
+    if (copy == NULL)
+    {
+        return -ENOMEM;
+    }
+    strcpy(copy, text);
+    *slot = copy;
+    return 0;
+}
+
+/* Locate and parse the LIKWID configuration file into the global config.
+ *
+ * Search order: INSTALL_PREFIX/etc/likwid.cfg, CFGFILE, /etc/likwid.cfg. If none
+ * is readable and no topology file is set, /etc/likwid_topo.cfg or
+ * INSTALL_PREFIX/etc/likwid_topo.cfg is stored in config.topologyCfgFileName.
+ * Recognized "<name> = <value>" keys: topology_file, daemon_path, daemon_mode
+ * (daemon|direct), max_threads, max_nodes.
+ *
+ * Returns 0 on success or when already initialized, -EFAULT when no config file
+ * can be read, -ENOMEM when a string copy cannot be allocated. */
+int init_configuration(void)
+{
+    FILE* fp;
+    int err = 0;
+    char line[512];
+    char name[128];
+    char value[256];
+    char filename[1024] = "";
+    char preconfigured[1024] = "";
+    if (init_config == 1)
+    {
+        return 0;
+    }
+    snprintf(preconfigured, sizeof(preconfigured), "%s%s", TOSTRING(INSTALL_PREFIX), "/etc/likwid.cfg");
+    if (access(preconfigured, R_OK) == 0)
+    {
+        snprintf(filename, sizeof(filename), "%s", preconfigured);
+    }
+    else if (access(TOSTRING(CFGFILE), R_OK) == 0)
+    {
+        snprintf(filename, sizeof(filename), "%s", TOSTRING(CFGFILE));
+    }
+    else if (access("/etc/likwid.cfg", R_OK) == 0)
+    {
+        snprintf(filename, sizeof(filename), "%s", "/etc/likwid.cfg");
+    }
+    /* Without a config file, fall back to a pre-generated topology file. */
+    if ((config.topologyCfgFileName == NULL) && (filename[0] == '\0'))
+    {
+        snprintf(preconfigured, sizeof(preconfigured), "%s", "/etc/likwid_topo.cfg");
+        if (access(preconfigured, R_OK) != 0)
+        {
+            snprintf(preconfigured, sizeof(preconfigured), "%s%s", TOSTRING(INSTALL_PREFIX), "/etc/likwid_topo.cfg");
+        }
+        if ((access(preconfigured, R_OK) == 0) &&
+            (config_copy_string(&config.topologyCfgFileName, preconfigured) != 0))
+        {
+            return -ENOMEM;
+        }
+    }
+    /* filename is only set for a readable file; re-check in case it vanished meanwhile. */
+    if ((filename[0] == '\0') || (access(filename, R_OK) != 0))
+    {
+        return -EFAULT;
+    }
+    DEBUG_PRINT(DEBUGLEV_INFO, Reading configuration from %s, filename);
+    fp = fopen(filename, "r");
+    if (fp == NULL)
+    {
+        return -EFAULT;
+    }
+    if (config_copy_string(&config.configFileName, filename) != 0)
+    {
+        fclose(fp);
+        return -ENOMEM;
+    }
+    while ((err == 0) && (fgets(line, sizeof(line), fp) != NULL))
+    {
+        /* Field widths keep sscanf inside name[128] and value[256]. */
+        if (sscanf(line, "%127s = %255s", name, value) != 2)
+        {
+            continue;
+        }
+        if (strcmp(name, "topology_file") == 0)
+        {
+            err = config_copy_string(&config.topologyCfgFileName, value);
+        }
+        else if (strcmp(name, "daemon_path") == 0)
+        {
+            err = config_copy_string(&config.daemonPath, value);
+        }
+        else if (strcmp(name, "daemon_mode") == 0)
+        {
+            if (strcmp(value, "daemon") == 0)
+            {
+                config.daemonMode = ACCESSMODE_DAEMON;
+            }
+            else if (strcmp(value, "direct") == 0)
+            {
+                config.daemonMode = ACCESSMODE_DIRECT;
+            }
+        }
+        else if (strcmp(name, "max_threads") == 0)
+        {
+            config.maxNumThreads = atoi(value);
+        }
+        else if (strcmp(name, "max_nodes") == 0)
+        {
+            config.maxNumNodes = atoi(value);
+        }
+    }
+    fclose(fp);
+    init_config = (err == 0) ? 1 : 0;
+    return err;
+}
+
+/* Return the global configuration, or NULL while it is not initialized. */
+Configuration_t get_configuration(void)
+{
+    /* init_config becomes 1 only at the end of init_configuration()
+       and is cleared again by destroy_configuration(). */
+    Configuration_t handle = (init_config == 1) ? &config : NULL;
+    return handle;
+}
+
+/* Release the strings held by the global config; returns -EFAULT if it was never initialized. */
+int destroy_configuration(void)
+{
+    if (init_config == 0) return -EFAULT;
+    /* Clear each pointer after freeing it: init_configuration() tests topologyCfgFileName
+       against NULL, and a later init that leaves a field unset must not keep a stale pointer. */
+    free(config.configFileName);      config.configFileName = NULL;
+    free(config.topologyCfgFileName); config.topologyCfgFileName = NULL;
+    free(config.daemonPath);          config.daemonPath = NULL;
+    init_config = 0;
+    return 0;
+}
diff --git a/src/cpuFeatures.c b/src/cpuFeatures.c
index 4733a82..546a17a 100644
--- a/src/cpuFeatures.c
+++ b/src/cpuFeatures.c
@@ -9,13 +9,13 @@
  *                  Allows to turn on and off the Hardware prefetcher
  *                  available.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -39,8 +39,8 @@
 #include <stdint.h>
 
 #include <types.h>
-#include <msr.h>
-#include <cpuid.h>
+#include <access.h>
+#include <topology.h>
 #include <registers.h>
 #include <textcolor.h>
 #include <cpuFeatures.h>
@@ -73,7 +73,9 @@ CpuFeatureFlags cpuFeatureFlags;
 void
 cpuFeatures_init(int cpu)
 {
-    uint64_t flags = msr_read(cpu, MSR_IA32_MISC_ENABLE);
+    int ret;
+    uint64_t flags;
+    ret = HPMread(cpu, MSR_DEV, MSR_IA32_MISC_ENABLE, &flags);
 
     TEST_FLAG(fastStrings,0);
     TEST_FLAG(thermalControl,3);
@@ -120,7 +122,9 @@ cpuFeatures_init(int cpu)
 void
 cpuFeatures_print(int cpu)
 {
-    uint64_t flags = msr_read(cpu, MSR_IA32_MISC_ENABLE);
+    int ret;
+    uint64_t flags;
+    ret = HPMread(cpu, MSR_DEV, MSR_IA32_MISC_ENABLE, &flags);
 
     printf(HLINE);
     printf("Fast-Strings: \t\t\t");
@@ -154,7 +158,7 @@ cpuFeatures_print(int cpu)
     }
     printf("Branch Trace Storage: \t\t");
 
-    if (flags & (1ULL<<11))
+    if (flags & (1ULL<<11)) 
     {
         PRINT_VALUE(RED,notsupported);
     }
@@ -164,7 +168,7 @@ cpuFeatures_print(int cpu)
     }
 
     printf("PEBS: \t\t\t\t");
-    if (flags & (1ULL<<12))
+    if (flags & (1ULL<<12)) 
     {
         PRINT_VALUE(RED,notsupported);
     }
@@ -174,7 +178,7 @@ cpuFeatures_print(int cpu)
     }
 
     printf("Intel Enhanced SpeedStep: \t");
-    if (flags & (1ULL<<16))
+    if (flags & (1ULL<<16)) 
     {
         PRINT_VALUE(GREEN,enabled);
     }
@@ -184,7 +188,7 @@ cpuFeatures_print(int cpu)
     }
 
     printf("MONITOR/MWAIT: \t\t\t");
-    if (flags & (1ULL<<18))
+    if (flags & (1ULL<<18)) 
     {
         PRINT_VALUE(GREEN,supported);
     }
@@ -194,7 +198,7 @@ cpuFeatures_print(int cpu)
     }
 
     printf("Limit CPUID Maxval: \t\t");
-    if (flags & (1ULL<<22))
+    if (flags & (1ULL<<22)) 
     {
         PRINT_VALUE(RED,enabled);
     }
@@ -204,7 +208,7 @@ cpuFeatures_print(int cpu)
     }
 
     printf("XD Bit Disable: \t\t");
-    if (flags & (1ULL<<34))
+    if (flags & (1ULL<<34)) 
     {
         PRINT_VALUE(RED,disabled);
     }
@@ -212,53 +216,45 @@ cpuFeatures_print(int cpu)
     {
         PRINT_VALUE(GREEN,enabled);
     }
-    if ((cpuid_info.model == NEHALEM) ||
-            (cpuid_info.model == NEHALEM_BLOOMFIELD) ||
-            (cpuid_info.model == NEHALEM_LYNNFIELD) ||
-            (cpuid_info.model == NEHALEM_WESTMERE) ||
-            (cpuid_info.model == NEHALEM_WESTMERE_M) ||
-            (cpuid_info.model == NEHALEM_EX) ||
-            (cpuid_info.model == CORE2_45) ||
-            (cpuid_info.model == CORE2_65))
+
+    printf("IP Prefetcher: \t\t\t");
+    if (flags & (1ULL<<39)) 
     {
-        printf("IP Prefetcher: \t\t\t");
-        if (flags & (1ULL<<39))
-        {
-            PRINT_VALUE(RED,disabled);
-        }
-        else
-        {
-            PRINT_VALUE(GREEN,enabled);
-        }
+        PRINT_VALUE(RED,disabled);
+    }
+    else
+    {
+        PRINT_VALUE(GREEN,enabled);
+    }
 
-        printf("Hardware Prefetcher: \t\t");
-        if (flags & (1ULL<<9))
-        {
-            PRINT_VALUE(RED,disabled);
-        }
-        else
-        {
-            PRINT_VALUE(GREEN,enabled);
-        }
-        printf("Adjacent Cache Line Prefetch: \t");
-        if (flags & (1ULL<<19))
-        {
-            PRINT_VALUE(RED,disabled);
-        }
-        else
-        {
-            PRINT_VALUE(GREEN,enabled);
-        }
+    printf("Hardware Prefetcher: \t\t");
+    if (flags & (1ULL<<9)) 
+    {
+        PRINT_VALUE(RED,disabled);
+    }
+    else
+    {
+        PRINT_VALUE(GREEN,enabled);
+    }
 
-        printf("DCU Prefetcher: \t\t");
-        if (flags & (1ULL<<37))
-        {
-            PRINT_VALUE(RED,disabled);
-        }
-        else
-        {
-            PRINT_VALUE(GREEN,enabled);
-        }
+    printf("Adjacent Cache Line Prefetch: \t");
+    if (flags & (1ULL<<19)) 
+    {
+        PRINT_VALUE(RED,disabled);
+    }
+    else
+    {
+        PRINT_VALUE(GREEN,enabled);
+    }
+
+    printf("DCU Prefetcher: \t\t");
+    if (flags & (1ULL<<37)) 
+    {
+        PRINT_VALUE(RED,disabled);
+    }
+    else
+    {
+        PRINT_VALUE(GREEN,enabled);
     }
 
     if ((cpuid_info.model == NEHALEM) ||
@@ -268,12 +264,12 @@ cpuFeatures_print(int cpu)
             (cpuid_info.model == NEHALEM_WESTMERE_M) ||
             (cpuid_info.model == NEHALEM_EX))
     {
-        printf("Intel Turbo Mode: \t\t");
-        if (flags & (1ULL<<38))
+        printf("Intel Turbo Mode: \t");
+        if (flags & (1ULL<<38)) 
         {
             PRINT_VALUE(RED,disabled);
         }
-        else
+        else 
         {
             PRINT_VALUE(GREEN,enabled);
         }
@@ -283,11 +279,11 @@ cpuFeatures_print(int cpu)
     {
 
         printf("Intel Dynamic Acceleration: \t");
-        if (flags & (1ULL<<38))
+        if (flags & (1ULL<<38)) 
         {
             PRINT_VALUE(RED,disabled);
         }
-        else
+        else 
         {
             PRINT_VALUE(GREEN,enabled);
         }
@@ -296,104 +292,82 @@ cpuFeatures_print(int cpu)
     printf(HLINE);
 }
 
-void
+void 
 cpuFeatures_enable(int cpu, CpuFeature type)
 {
-    if ((cpuid_info.model == NEHALEM) ||
-            (cpuid_info.model == NEHALEM_BLOOMFIELD) ||
-            (cpuid_info.model == NEHALEM_LYNNFIELD) ||
-            (cpuid_info.model == NEHALEM_WESTMERE) ||
-            (cpuid_info.model == NEHALEM_WESTMERE_M) ||
-            (cpuid_info.model == NEHALEM_EX) ||
-            (cpuid_info.model == CORE2_45) ||
-            (cpuid_info.model == CORE2_65))
-    {
-        uint64_t flags = msr_read(cpu, MSR_IA32_MISC_ENABLE);
-        switch ( type )
-        {
-            case HW_PREFETCHER:
-                printf("HW_PREFETCHER:\t");
-                flags &= ~(1ULL<<9);
-                break;
-
-            case CL_PREFETCHER:
-                printf("CL_PREFETCHER:\t");
-                flags &= ~(1ULL<<19);
-                break;
-
-            case DCU_PREFETCHER:
-                printf("DCU_PREFETCHER:\t");
-                flags &= ~(1ULL<<37);
-                break;
-
-            case IP_PREFETCHER:
-                printf("IP_PREFETCHER:\t");
-                flags &= ~(1ULL<<39);
-                break;
-
-            default:
-                printf("ERROR: CpuFeature not supported!\n");
-                break;
-        }
-        PRINT_VALUE(GREEN,enabled);
-        printf("\n");
-        msr_write(cpu, MSR_IA32_MISC_ENABLE, flags);
-    }
-    else
+    /* Enable a prefetcher on 'cpu' by clearing its disable bit in
+       IA32_MISC_ENABLE; the register is read, modified and written back. */
+    uint64_t flags = 0x0ULL;
+    /* NOTE(review): assumes HPMread returns 0 on success - confirm in access.h. */
+    if (HPMread(cpu, MSR_DEV, MSR_IA32_MISC_ENABLE, &flags) != 0)
+    {
+        printf("ERROR: Cannot read MSR_IA32_MISC_ENABLE\n");
+        return;
+    }
+    switch ( type )
     {
-        printf("ERROR: Architecture does not support the manipulation of prefetchers\n");
+        case HW_PREFETCHER:
+            printf("HW_PREFETCHER:\t");
+            flags &= ~(1ULL<<9);
+            break;
+        case CL_PREFETCHER:
+            printf("CL_PREFETCHER:\t");
+            flags &= ~(1ULL<<19);
+            break;
+        case DCU_PREFETCHER:
+            printf("DCU_PREFETCHER:\t");
+            flags &= ~(1ULL<<37);
+            break;
+        case IP_PREFETCHER:
+            printf("IP_PREFETCHER:\t");
+            flags &= ~(1ULL<<39);
+            break;
+        default:
+            printf("ERROR: CpuFeature not supported!\n");
+            return; /* unknown feature: do not report success or touch the MSR */
     }
+    PRINT_VALUE(GREEN,enabled);
+    printf("\n");
+    HPMwrite(cpu, MSR_DEV, MSR_IA32_MISC_ENABLE, flags);
 }
 
 
 void
 cpuFeatures_disable(int cpu, CpuFeature type)
 {
-    if ((cpuid_info.model == NEHALEM) ||
-            (cpuid_info.model == NEHALEM_BLOOMFIELD) ||
-            (cpuid_info.model == NEHALEM_LYNNFIELD) ||
-            (cpuid_info.model == NEHALEM_WESTMERE) ||
-            (cpuid_info.model == NEHALEM_WESTMERE_M) ||
-            (cpuid_info.model == NEHALEM_EX) ||
-            (cpuid_info.model == CORE2_45) ||
-            (cpuid_info.model == CORE2_65))
-    {
-        uint64_t flags = msr_read(cpu, MSR_IA32_MISC_ENABLE);
-
-        switch ( type )
-        {
-            case HW_PREFETCHER:
-                printf("HW_PREFETCHER:\t");
-                flags |= (1ULL<<9);
-                break;
-
-            case CL_PREFETCHER:
-                printf("CL_PREFETCHER:\t");
-                flags |= (1ULL<<19);
-                break;
-
-            case DCU_PREFETCHER:
-                printf("DCU_PREFETCHER:\t");
-                flags |= (1ULL<<37);
-                break;
-
-            case IP_PREFETCHER:
-                printf("IP_PREFETCHER:\t");
-                flags |= (1ULL<<39);
-                break;
-
-            default:
-                printf("ERROR: CpuFeature not supported!\n");
-                break;
-        }
-        PRINT_VALUE(RED,disabled);
-        printf("\n");
+    /* Disable a prefetcher on 'cpu': set its disable bit in IA32_MISC_ENABLE. */
+    uint64_t flags = 0x0ULL;
 
-        msr_write(cpu, MSR_IA32_MISC_ENABLE, flags);
-    }
-    else
+    /* NOTE(review): assumes HPMread returns 0 on success - confirm in access.h. */
+    if (HPMread(cpu, MSR_DEV, MSR_IA32_MISC_ENABLE, &flags) != 0)
+    {
+        printf("ERROR: Cannot read MSR_IA32_MISC_ENABLE\n");
+        return;
+    }
+    switch ( type )
     {
-        printf("ERROR: Architecture does not support the manipulation of prefetchers\n");
+        case HW_PREFETCHER:
+            printf("HW_PREFETCHER:\t");
+            flags |= (1ULL<<9);
+            break;
+        case CL_PREFETCHER:
+            printf("CL_PREFETCHER:\t");
+            flags |= (1ULL<<19);
+            break;
+        case DCU_PREFETCHER:
+            printf("DCU_PREFETCHER:\t");
+            flags |= (1ULL<<37);
+            break;
+        case IP_PREFETCHER:
+            printf("IP_PREFETCHER:\t");
+            flags |= (1ULL<<39);
+            break;
+        default:
+            printf("ERROR: CpuFeature not supported!\n");
+            return; /* unknown feature: do not report success or touch the MSR */
     }
+    PRINT_VALUE(RED,disabled);
+    printf("\n");
+    HPMwrite(cpu, MSR_DEV, MSR_IA32_MISC_ENABLE, flags);
 }
 
diff --git a/src/cpuid.c b/src/cpuid.c
deleted file mode 100644
index 6a9ac47..0000000
--- a/src/cpuid.c
+++ /dev/null
@@ -1,1244 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  cpuid.c
- *
- *      Description:  Implementation of cpuid module.
- *                  Provides API to extract cpuid info on x86 processors.
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-/* #####   HEADER FILE INCLUDES   ######################################### */
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <sys/types.h>
-#include <sys/time.h>
-#include <sys/wait.h>
-#include <unistd.h>
-#include <sched.h>
-#include <time.h>
-#include <math.h>
-
-#include <error.h>
-#include <cpuid.h>
-#include <tree.h>
-#include <bitUtil.h>
-#include <strUtil.h>
-
-/* #####   EXPORTED VARIABLES   ########################################### */
-
-CpuInfo cpuid_info;
-CpuTopology cpuid_topology;
-
-
-/* #####   VARIABLES  -  LOCAL TO THIS SOURCE FILE   ###################### */
-
-static int largest_function = 0;
-
-/* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
-
-/* this was taken from the linux kernel */
-#define CPUID                              \
-    __asm__ volatile ("cpuid"                             \
-            : "=a" (eax),     \
-            "=b" (ebx),     \
-            "=c" (ecx),     \
-            "=d" (edx)      \
-            : "0" (eax), "2" (ecx))
-
-
-/* #####   VARIABLES  -  LOCAL TO THIS SOURCE FILE   ###################### */
-
-static char* pentium_m_b_str = "Intel Pentium M Banias processor";
-static char* pentium_m_d_str = "Intel Pentium M Dothan processor";
-static char* core_duo_str = "Intel Core Duo processor";
-static char* core_2a_str = "Intel Core 2 65nm processor";
-static char* core_2b_str = "Intel Core 2 45nm processor";
-static char* atom_45_str = "Intel Atom 45nm processor";
-static char* atom_32_str = "Intel Atom 32nm processor";
-static char* atom_22_str = "Intel Atom 22nm processor";
-static char* atom_silvermont_str = "Intel Atom (Silvermont) 22nm processor";
-static char* atom_saltwell_str = "Intel Atom (Saltwell) 32nm processor";
-static char* nehalem_bloom_str = "Intel Core Bloomfield processor";
-static char* nehalem_lynn_str = "Intel Core Lynnfield processor";
-static char* nehalem_west_str = "Intel Core Westmere processor";
-static char* sandybridge_str = "Intel Core SandyBridge processor";
-static char* ivybridge_str = "Intel Core IvyBridge processor";
-static char* ivybridge_ep_str = "Intel Core IvyBridge EP processor";
-static char* sandybridge_ep_str = "Intel Core SandyBridge EP processor";
-static char* haswell_str = "Intel Core Haswell processor";
-static char* haswell_ex_str = "Intel Core Haswell EX processor";
-static char* nehalem_ex_str = "Intel Nehalem EX processor";
-static char* westmere_ex_str = "Intel Westmere EX processor";
-static char* xeon_mp_string = "Intel Xeon MP processor";
-static char* xeon_phi_string = "Intel Xeon Phi Coprocessor";
-static char* barcelona_str = "AMD Barcelona processor";
-static char* shanghai_str = "AMD Shanghai processor";
-static char* istanbul_str = "AMD Istanbul processor";
-static char* magnycours_str = "AMD Magny Cours processor";
-static char* interlagos_str = "AMD Interlagos processor";
-static char* kabini_str = "AMD Family 16 model - Kabini processor";
-static char* opteron_sc_str = "AMD Opteron single core 130nm processor";
-static char* opteron_dc_e_str = "AMD Opteron Dual Core Rev E 90nm processor";
-static char* opteron_dc_f_str = "AMD Opteron Dual Core Rev F 90nm processor";
-static char* athlon64_str = "AMD Athlon64 X2 (AM2) Rev F 90nm processor";
-static char* athlon64_f_str = "AMD Athlon64 (AM2) Rev F 90nm processor";
-static char* athlon64_X2_g_str = "AMD Athlon64 X2 (AM2) Rev G 65nm processor";
-static char* athlon64_g_str = "AMD Athlon64 (AM2) Rev G 65nm processor";
-static char* amd_k8_str = "AMD K8 architecture";
-static char* unknown_intel_str = "Unknown Intel Processor";
-static char* unknown_amd_str = "Unknown AMD Processor";
-
-static volatile int init = 0;
-static uint32_t eax, ebx, ecx, edx;
-
-/* #####   FUNCTION DEFINITIONS  -  LOCAL TO THIS SOURCE FILE   ########### */
-
-static void initTopology(FILE* file)
-{
-    size_t items;
-    HWThread* hwThreadPool;
-    CacheLevel* cacheLevels;
-    TreeNode* currentNode;
-
-    items = fread((void*) &cpuid_topology, sizeof(CpuTopology), 1, file);
-
-    hwThreadPool = (HWThread*) malloc(cpuid_topology.numHWThreads * sizeof(HWThread));
-    items = fread((void*) hwThreadPool, sizeof(HWThread), cpuid_topology.numHWThreads, file);
-    cpuid_topology.threadPool = hwThreadPool;
-
-    cacheLevels = (CacheLevel*) malloc(cpuid_topology.numCacheLevels * sizeof(CacheLevel));
-    items = fread((void*) cacheLevels, sizeof(CacheLevel), cpuid_topology.numCacheLevels, file);
-    cpuid_topology.cacheLevels = cacheLevels;
-    cpuid_topology.topologyTree = NULL;
-
-    tree_init(&cpuid_topology.topologyTree, 0);
-
-    for (uint32_t i=0; i<  cpuid_topology.numHWThreads; i++)
-    {
-        if (!tree_nodeExists(cpuid_topology.topologyTree,
-                    hwThreadPool[i].packageId))
-        {
-            tree_insertNode(cpuid_topology.topologyTree,
-                    hwThreadPool[i].packageId);
-        }
-        currentNode = tree_getNode(cpuid_topology.topologyTree,
-                hwThreadPool[i].packageId);
-
-        if (!tree_nodeExists(currentNode, hwThreadPool[i].coreId))
-        {
-            tree_insertNode(currentNode, hwThreadPool[i].coreId);
-        }
-        currentNode = tree_getNode(currentNode, hwThreadPool[i].coreId);
-
-        if (!tree_nodeExists(currentNode, i))
-        {
-            tree_insertNode(currentNode, i);
-        }
-    }
-}
-
-static uint32_t amdGetAssociativity(uint32_t flag)
-{
-    uint32_t asso= 0;
-
-    switch ( flag )
-    {
-        case 0x0:
-            asso = 0;
-            break;
-
-        case 0x1:
-            asso = 1;
-            break;
-
-        case 0x2:
-            asso = 2;
-            break;
-
-        case 0x4:
-            asso = 4;
-            break;
-
-        case 0x6:
-            asso = 8;
-            break;
-
-        case 0x8:
-            asso = 16;
-            break;
-
-        case 0xA:
-            asso = 32;
-            break;
-
-        case 0xB:
-            asso = 48;
-            break;
-
-        case 0xC:
-            asso = 64;
-            break;
-
-        case 0xD:
-            asso = 96;
-            break;
-
-        case 0xE:
-            asso = 128;
-            break;
-
-        case 0xF:
-            asso = 0;
-            break;
-
-        default:
-            break;
-    }
-    return asso;
-
-}
-
-static int intelCpuidFunc_4(CacheLevel** cachePool)
-{
-    int i;
-    int level=0;
-    int maxNumLevels=0;
-    uint32_t valid=1;
-    CacheLevel* pool;
-    int threadsPerCpu = 0;
-    int numThreadsPerSocket = cpuid_topology.numCoresPerSocket *
-                              cpuid_topology.numThreadsPerCore;
-
-    while (valid)
-    {
-        eax = 0x04;
-        ecx = level;
-        CPUID;
-        valid = extractBitField(eax,5,0);
-        if (!valid)
-        {
-            break;
-        }
-        level++;
-    }
-
-    maxNumLevels = level;
-    *cachePool = (CacheLevel*) malloc(maxNumLevels * sizeof(CacheLevel));
-    pool = *cachePool;
-
-    for (i=0; i < maxNumLevels; i++) 
-    {
-        eax = 0x04;
-        ecx = i;
-        CPUID;
-
-        pool[i].level = extractBitField(eax,3,5);
-        pool[i].type = (CacheType) extractBitField(eax,5,0);
-        pool[i].associativity = extractBitField(ebx,8,22)+1;
-        pool[i].sets = ecx+1;
-        pool[i].lineSize = extractBitField(ebx,12,0)+1;
-        pool[i].size = pool[i].sets *
-            pool[i].associativity *
-            pool[i].lineSize;
-        pool[i].threads = extractBitField(eax,10,14)+1;
-        pool[i].inclusive = edx&0x2;
-
-        /* WORKAROUND cpuid reports wrong number of threads on SMT processor with SMT
-         * turned off */
-        if (i < 3)
-        {
-            if ((cpuid_info.model == NEHALEM_BLOOMFIELD) ||
-                    (cpuid_info.model == NEHALEM_LYNNFIELD) ||
-                    (cpuid_info.model == NEHALEM_WESTMERE) ||
-                    (cpuid_info.model == NEHALEM_WESTMERE_M) ||
-                    (cpuid_info.model == SANDYBRIDGE) ||
-                    (cpuid_info.model == SANDYBRIDGE_EP) ||
-                    (cpuid_info.model == IVYBRIDGE) ||
-                    (cpuid_info.model == IVYBRIDGE_EP) ||
-                    (cpuid_info.model == HASWELL) ||
-                    (cpuid_info.model == HASWELL_EX) ||
-                    (cpuid_info.model == HASWELL_M1) ||
-                    (cpuid_info.model == HASWELL_M2) ||
-                    (cpuid_info.model == WESTMERE_EX) ||
-                    (cpuid_info.model == NEHALEM_EX))
-            {
-                if (cpuid_topology.numThreadsPerCore == 1)
-                {
-                    pool[i].threads = 1;
-                }
-            }
-        }
-
-        /* :WORKAROUND:08/13/2009 08:34:15 AM:jt: For L3 caches the value is sometimes 
-         * too large in here. 
-         * See Documentation: Threads contains maximum number of threads supported
-         * by the cache.
-         * Limit threads per Socket then to the maximum possible value. If the number
-         * of threads supported by the cache does not divide the threads on the socket
-         * without remainder, the threads are adjusted to fit the multiple caches.
-         */
-        if(pool[i].threads > numThreadsPerSocket)
-        {
-            pool[i].threads = numThreadsPerSocket;
-        }
-        else if (((double)numThreadsPerSocket)/((double)pool[i].threads) != 
-                  (double)(numThreadsPerSocket/pool[i].threads))
-        {
-            pool[i].threads = numThreadsPerSocket/
-                (int)ceil(((double)numThreadsPerSocket)/((double)pool[i].threads));
-        }
-        /* For Intel Silvermont this is not enough. It returns 4 threads and 8 cores
-         * for the L2 cache. But according to the data sheet, each 1MB L2 cache slice 
-         * is shared by 2 threads/cores.
-         */
-        else if (pool[i].level == 2 && 
-                ((cpuid_info.model == ATOM_SILVERMONT_C) ||
-                 (cpuid_info.model == ATOM_SILVERMONT_E) ||
-                 (cpuid_info.model == ATOM_SILVERMONT_F1) ||
-                 (cpuid_info.model == ATOM_SILVERMONT_F2) ||
-                 (cpuid_info.model == ATOM_SILVERMONT_F3)))
-        {
-            pool[i].threads = 2;
-        }
-    }
-
-    
-
-    return maxNumLevels;
-}
-
-static int recheck_numHWThreads()
-{
-    int cpucount = 0;
-    char line[1024];
-    FILE* fp = fopen("/proc/cpuinfo","r");
-    if (fp != NULL)
-    {
-        while( fgets(line,1024,fp) )
-        {
-            if (strncmp(line, "processor", 9) == 0)
-            {
-                cpucount++;
-            }
-        }
-    }
-    return cpucount;
-}
-
-/* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
-
-int cpuid_init (void)
-{
-    int isIntel = 1;
-
-    /* FIXME: Race condition??? */
-    if (init) return EXIT_SUCCESS;
-    init =1;
-
-    eax = 0x00;
-    CPUID;
-
-    largest_function = eax;
-    if (ebx == 0x68747541U)
-    {
-        isIntel = 0;
-    }
-
-    eax = 0x01;
-    CPUID;
-    cpuid_info.family = ((eax>>8)&0xFU) + ((eax>>20)&0xFFU);
-    cpuid_info.model = (((eax>>16)&0xFU)<<4) + ((eax>>4)&0xFU);
-    cpuid_info.stepping =  (eax&0xFU);
-
-    switch ( cpuid_info.family )
-    {
-        case P6_FAMILY:
-            switch ( cpuid_info.model )
-            {
-                case PENTIUM_M_BANIAS:
-                    cpuid_info.name = pentium_m_b_str;
-                    break;
-
-                case PENTIUM_M_DOTHAN:
-                    cpuid_info.name = pentium_m_d_str;
-                    break;
-
-                case CORE_DUO:
-                    cpuid_info.name = core_duo_str;
-                    break;
-
-                case CORE2_65:
-                    cpuid_info.name = core_2a_str;
-                    break;
-
-                case CORE2_45:
-                    cpuid_info.name = core_2b_str;
-                    break;
-
-                case NEHALEM_BLOOMFIELD:
-                    cpuid_info.name = nehalem_bloom_str;
-                    break;
-
-                case NEHALEM_LYNNFIELD:
-                    cpuid_info.name = nehalem_lynn_str;
-                    break;
-
-                case NEHALEM_WESTMERE_M:
-
-                case NEHALEM_WESTMERE:
-                    cpuid_info.name = nehalem_west_str;
-                    break;
-
-                case SANDYBRIDGE:
-                    cpuid_info.name = sandybridge_str;
-                    break;
-
-                case SANDYBRIDGE_EP:
-                    cpuid_info.name = sandybridge_ep_str;
-                    break;
-
-                case IVYBRIDGE:
-                    cpuid_info.name = ivybridge_str;
-                    break;
-
-                case IVYBRIDGE_EP:
-                    cpuid_info.name = ivybridge_ep_str;
-                    break;
-
-                case HASWELL:
-
-                case HASWELL_M1:
-
-                case HASWELL_M2:
-                    cpuid_info.name = haswell_str;
-                    break;
-
-                case HASWELL_EX:
-                    cpuid_info.name = haswell_ex_str;
-                    break;
-
-                case NEHALEM_EX:
-                    cpuid_info.name = nehalem_ex_str;
-                    break;
-
-                case WESTMERE_EX:
-                    cpuid_info.name = westmere_ex_str;
-                    break;
-
-                case XEON_MP:
-                    cpuid_info.name = xeon_mp_string;
-                    break;
-
-                case ATOM_45:
-
-                case ATOM:
-                    cpuid_info.name = atom_45_str;
-                    break;
-
-                case ATOM_32:
-                    cpuid_info.name = atom_32_str;
-                    break;
-
-                case ATOM_22:
-                    cpuid_info.name = atom_22_str;
-                    break;
-
-                case ATOM_SILVERMONT_C:
-                case ATOM_SILVERMONT_E:
-                case ATOM_SILVERMONT_F1:
-                case ATOM_SILVERMONT_F2:
-                case ATOM_SILVERMONT_F3:
-                    cpuid_info.name = atom_silvermont_str;
-                    break;
-
-                default:
-                    cpuid_info.name = unknown_intel_str;
-                    break;
-            }
-            break;
-
-        case MIC_FAMILY:
-            switch ( cpuid_info.model ) 
-            {
-                case XEON_PHI:
-                    cpuid_info.name = xeon_phi_string;
-                    break;
-
-            }
-            break;
-
-        case K8_FAMILY:
-
-            if (isIntel)
-            {
-                ERROR_PLAIN_PRINT(Netburst architecture is not supported);
-            }
-
-            switch ( cpuid_info.model )
-            {
-                case OPTERON_DC_E:
-                    cpuid_info.name = opteron_dc_e_str;
-                    break;
-
-                case OPTERON_DC_F:
-                    cpuid_info.name = opteron_dc_f_str;
-                    break;
-
-                case ATHLON64_X2:
-
-                case ATHLON64_X2_F:
-                    cpuid_info.name = athlon64_str;
-                    break;
-
-                case ATHLON64_F1:
-
-                case ATHLON64_F2:
-                    cpuid_info.name = athlon64_f_str;
-                    break;
-
-                case ATHLON64_X2_G:
-                    cpuid_info.name = athlon64_X2_g_str;
-                    break;
-
-                case ATHLON64_G1:
-
-                case ATHLON64_G2:
-                    cpuid_info.name = athlon64_g_str;
-                    break;
-
-                case OPTERON_SC_1MB:
-                    cpuid_info.name = opteron_sc_str;
-                    break;
-
-                default:
-                    cpuid_info.name = amd_k8_str;
-                    break;
-            }
-
-            break;
-
-        case K10_FAMILY:
-            switch ( cpuid_info.model )
-            {
-                case BARCELONA:
-                    cpuid_info.name = barcelona_str;
-                    break;
-
-                case SHANGHAI:
-                    cpuid_info.name = shanghai_str;
-                    break;
-
-                case ISTANBUL:
-                    cpuid_info.name = istanbul_str;
-                    break;
-
-                case MAGNYCOURS:
-                    cpuid_info.name = magnycours_str;
-                    break;
-
-                default:
-                    cpuid_info.name = unknown_amd_str;
-                    break;
-            }
-            break;
-
-        case K15_FAMILY:
-            cpuid_info.name = interlagos_str;
-            break;
-
-        case K16_FAMILY:
-            cpuid_info.name = kabini_str;
-            break;
-            
-        default:
-            return EXIT_FAILURE;
-            break;
-    }
-
-    cpuid_info.featureFlags = 0;
-    cpuid_info.features = (char*) malloc(200*sizeof(char));
-    cpuid_info.features[0] = 0;
-    if (ecx & (1<<0))
-    {
-        strcat(cpuid_info.features, "SSE3 ");
-        cpuid_info.featureFlags |= (1<<SSE3);
-    }
-    if (ecx & (1<<3))
-    {
-        strcat(cpuid_info.features, "MONITOR ");
-        cpuid_info.featureFlags |= (1<<MONITOR);
-    }
-    if (ecx & (1<<5))
-    {
-        strcat(cpuid_info.features, "VMX ");
-        cpuid_info.featureFlags |= (1<<VMX);
-    }
-    if (ecx & (1<<7))
-    {
-        strcat(cpuid_info.features, "EIST ");
-        cpuid_info.featureFlags |= (1<<EIST);
-    }
-    if (ecx & (1<<8))
-    {
-        strcat(cpuid_info.features, "TM2 ");
-        cpuid_info.featureFlags |= (1<<TM2);
-    }
-    if (ecx & (1<<9))
-    {
-        strcat(cpuid_info.features, "SSSE3 ");
-        cpuid_info.featureFlags |= (1<<SSSE3);
-    }
-    if (ecx & (1<<12))
-    {
-        strcat(cpuid_info.features, "FMA ");
-        cpuid_info.featureFlags |= (1<<FMA);
-    }
-    if (ecx & (1<<19))
-    {
-        strcat(cpuid_info.features, "SSE4.1 ");
-        cpuid_info.featureFlags |= (1<<SSE41);
-    }
-    if (ecx & (1<<20))
-    {
-        strcat(cpuid_info.features, "SSE4.2 ");
-        cpuid_info.featureFlags |= (1<<SSE42);
-    }
-    if (ecx & (1<<25))
-    {
-        strcat(cpuid_info.features, "AES ");
-        cpuid_info.featureFlags |= (1<<AES);
-    }
-    if (ecx & (1<<28))
-    {
-        strcat(cpuid_info.features, "AVX ");
-        cpuid_info.featureFlags |= (1<<AVX);
-    }
-    if (ecx & (1<<30))
-    {
-        strcat(cpuid_info.features, "RDRAND ");
-        cpuid_info.featureFlags |= (1<<RDRAND);
-    }
-    if (edx & (1<<22))
-    {
-        strcat(cpuid_info.features, "ACPI ");
-        cpuid_info.featureFlags |= (1<<ACPI);
-    }
-    if (edx & (1<<23))
-    {
-        strcat(cpuid_info.features, "MMX ");
-        cpuid_info.featureFlags |= (1<<MMX);
-    }
-    if (edx & (1<<25))
-    {
-        strcat(cpuid_info.features, "SSE ");
-        cpuid_info.featureFlags |= (1<<SSE);
-    }
-    if (edx & (1<<26))
-    {
-        strcat(cpuid_info.features, "SSE2 ");
-        cpuid_info.featureFlags |= (1<<SSE2);
-    }
-    if (edx & (1<<29))
-    {
-        strcat(cpuid_info.features, "TM ");
-        cpuid_info.featureFlags |= (1<<TM);
-    }
-
-    eax = 0x80000001;
-    CPUID;
-    if (edx & (1<<27))
-    {
-        strcat(cpuid_info.features, "RDTSCP ");
-        cpuid_info.featureFlags |= (1<<RDTSCP);
-    }
-
-    cpuid_info.perf_version   =  0;
-    if( cpuid_info.family == P6_FAMILY && 0x0A <= largest_function)
-    {
-        eax = 0x0A;
-        CPUID;
-        cpuid_info.perf_version   =  (eax&0xFFU);
-        cpuid_info.perf_num_ctr   =   ((eax>>8)&0xFFU);
-        cpuid_info.perf_width_ctr =  ((eax>>16)&0xFFU);
-        cpuid_info.perf_num_fixed_ctr =  (edx&0xFU);
-
-        eax = 0x06;
-        CPUID;
-        if (eax & (1<<1))
-        {
-            cpuid_info.turbo = 1;
-        }
-        else
-        {
-            cpuid_info.turbo = 0;
-        }
-    }
-
-    FILE *file;
-    char *filepath = TOSTRING(CFGFILE);
-
-    if ((file = fopen(filepath, "rb")) != NULL) 
-    {
-        //printf("Read config from file\n");
-        initTopology(file);
-        fclose(file);
-    }
-    else
-    {
-        cpuid_topology.numHWThreads = sysconf(_SC_NPROCESSORS_CONF);
-        if (recheck_numHWThreads() != cpuid_topology.numHWThreads)
-        {
-            cpuid_topology.numHWThreads = recheck_numHWThreads();
-        }
-        cpu_set_t cpuSet;
-        CPU_ZERO(&cpuSet);
-        sched_getaffinity(0,sizeof(cpu_set_t), &cpuSet);
-        cpuid_initTopology();
-        cpuid_initCacheTopology();
-
-        /* restore affinity mask of process */
-        sched_setaffinity(0, sizeof(cpu_set_t), &cpuSet);
-    }
-
-    return EXIT_SUCCESS;
-}
-
-void cpuid_print (void)
-{
-    printf("\nSupported Intel processors:\n");
-    printf("\t%s\n",core_2a_str);
-    printf("\t%s\n",core_2b_str);
-    printf("\t%s\n",xeon_mp_string);
-    printf("\t%s\n",atom_45_str);
-    printf("\t%s\n",atom_32_str);
-    printf("\t%s\n",atom_22_str);
-    printf("\t%s\n",nehalem_bloom_str);
-    printf("\t%s\n",nehalem_lynn_str);
-    printf("\t%s\n",nehalem_west_str);
-    printf("\t%s (with Uncore support)\n",nehalem_ex_str);
-    printf("\t%s (with Uncore support)\n",westmere_ex_str);
-    printf("\t%s\n",sandybridge_str);
-    printf("\t%s (with Uncore support)\n",sandybridge_ep_str);
-    printf("\t%s\n",ivybridge_str);
-    printf("\t%s (with Uncore support)\n",ivybridge_ep_str);
-    printf("\t%s (with Uncore support)\n",haswell_str);
-    printf("\t%s (no Uncore support)\n",haswell_ex_str);
-    printf("\t%s\n",atom_silvermont_str);
-    printf("\t%s\n",atom_saltwell_str);
-    printf("\t%s\n\n",xeon_phi_string);
-
-    printf("Supported AMD processors:\n");
-    printf("\t%s\n",opteron_sc_str);
-    printf("\t%s\n",opteron_dc_e_str);
-    printf("\t%s\n",opteron_dc_f_str);
-    printf("\t%s\n",barcelona_str);
-    printf("\t%s\n",shanghai_str);
-    printf("\t%s\n",istanbul_str);
-    printf("\t%s\n",magnycours_str);
-    printf("\t%s\n",interlagos_str);
-    printf("\t%s\n\n",kabini_str);
-}
-
-
-
-
-
-#define freeStrings  \
-    bdestroy(filename);  \
-bdestroy(grepString); \
-bdestroy(cpulist)
-
-
-int cpuid_isInCpuset(void)
-{
-    int pos = 0;
-    bstring grepString = bformat("Cpus_allowed_list:");
-    bstring filename = bformat("/proc/%d/status",getpid());
-    FILE* fp = fopen(bdata(filename),"r");
-
-    if (fp == NULL)
-    {
-        bdestroy(filename);
-        bdestroy(grepString);
-        return 0;
-    } 
-    else
-    {
-        bstring  cpulist;
-        uint32_t tmpThreads[MAX_NUM_THREADS];
-        bstring src = bread ((bNread) fread, fp);
-        if ((pos = binstr(src,0,grepString)) != BSTR_ERR)
-        {
-            int end = bstrchrp(src, 10, pos);
-            pos = pos+blength(grepString);
-            cpulist = bmidstr(src,pos, end-pos);
-            btrimws(cpulist);
-
-            if (bstr_to_cpuset_physical(tmpThreads, cpulist) < cpuid_topology.numHWThreads)
-            {
-                freeStrings;
-                return 1;
-            }
-            else
-            {
-                freeStrings;
-                return 0;
-            }
-        }
-        return 0;
-    }
-}
-
-void cpuid_initTopology(void)
-{
-    uint32_t apicId;
-    uint32_t bitField;
-    int level;
-    int prevOffset = 0;
-    int currOffset = 0;
-    cpu_set_t set;
-    HWThread* hwThreadPool;
-    int hasBLeaf = 0;
-    int maxNumLogicalProcs;
-    int maxNumLogicalProcsPerCore;
-    int maxNumCores;
-    TreeNode* currentNode;
-    int width;
-
-    /* check if 0x0B cpuid leaf is supported */
-    if (largest_function >= 0x0B)
-    {
-        eax = 0x0B;
-        ecx = 0;
-        CPUID;
-
-        if (ebx)
-        {
-            hasBLeaf = 1;
-        }
-    }
-
-    hwThreadPool = (HWThread*) malloc(cpuid_topology.numHWThreads * sizeof(HWThread));
-    tree_init(&cpuid_topology.topologyTree, 0);
-
-    if (hasBLeaf)
-    {
-        for (uint32_t i=0; i < cpuid_topology.numHWThreads; i++)
-        {
-
-            CPU_ZERO(&set);
-            CPU_SET(i,&set);
-            sched_setaffinity(0, sizeof(cpu_set_t), &set);
-            eax = 0x0B;
-            ecx = 0;
-            CPUID;
-            apicId = edx;
-            hwThreadPool[i].apicId = apicId;
-
-            for (level=0; level < 3; level++)
-            {
-                eax = 0x0B;
-                ecx = level;
-                CPUID;
-                currOffset = eax&0xFU;
-
-                switch ( level ) {
-                    case 0:  /* SMT thread */
-                        bitField = extractBitField(apicId,
-                                currOffset,
-                                0);
-                        hwThreadPool[i].threadId = bitField;
-                        break;
-
-                    case 1:  /* Core */
-                        bitField = extractBitField(apicId,
-                                currOffset-prevOffset,
-                                prevOffset);
-                        hwThreadPool[i].coreId = bitField;
-                        break;
-
-                    case 2:  /* Package */
-                        bitField = extractBitField(apicId,
-                                32-prevOffset,
-                                prevOffset);
-                        hwThreadPool[i].packageId = bitField;
-                        break;
-
-                }
-                prevOffset = currOffset;
-            }
-        }
-    }
-    else
-    {
-        switch ( cpuid_info.family )
-        {
-
-            case MIC_FAMILY:
-
-            case P6_FAMILY:
-                eax = 0x01;
-                CPUID;
-                maxNumLogicalProcs = extractBitField(ebx,8,16);
-
-                /* Check number of cores per package */
-                eax = 0x04;
-                ecx = 0;
-                CPUID;
-                maxNumCores = extractBitField(eax,6,26)+1;
-
-                maxNumLogicalProcsPerCore = maxNumLogicalProcs/maxNumCores;
-
-                for (uint32_t i=0; i<  cpuid_topology.numHWThreads; i++)
-                {
-                    CPU_ZERO(&set);
-                    CPU_SET(i,&set);
-                    sched_setaffinity(0, sizeof(cpu_set_t), &set);
-
-                    eax = 0x01;
-                    CPUID;
-                    hwThreadPool[i].apicId = extractBitField(ebx,8,24);
-
-                    /* ThreadId is extracted from th apicId using the bit width
-                     * of the number of logical processors
-                     * */
-                    hwThreadPool[i].threadId =
-                        extractBitField(hwThreadPool[i].apicId,
-                                getBitFieldWidth(maxNumLogicalProcsPerCore),0); 
-
-                    /* CoreId is extracted from th apicId using the bitWidth 
-                     * of the number of logical processors as offset and the
-                     * bit width of the number of cores as width
-                     * */
-                    hwThreadPool[i].coreId =
-                        extractBitField(hwThreadPool[i].apicId,
-                                getBitFieldWidth(maxNumCores),
-                                getBitFieldWidth(maxNumLogicalProcsPerCore)); 
-
-                    hwThreadPool[i].packageId =
-                        extractBitField(hwThreadPool[i].apicId,
-                                8-getBitFieldWidth(maxNumLogicalProcs),
-                                getBitFieldWidth(maxNumLogicalProcs)); 
-                }
-                break;
-
-            case K8_FAMILY:
-                /* AMD Bios manual Rev. 2.28 section 3.1
-                 * Legacy method */
-                /*FIXME: This is a bit of a hack */
-
-                maxNumLogicalProcsPerCore = 1;
-                maxNumLogicalProcs = 1;
-
-                eax = 0x80000008;
-                CPUID;
-
-                maxNumCores =  extractBitField(ecx,8,0)+1;
-
-                for (uint32_t i=0; i<  cpuid_topology.numHWThreads; i++)
-                {
-                    CPU_ZERO(&set);
-                    CPU_SET(i,&set);
-                    sched_setaffinity(0, sizeof(cpu_set_t), &set);
-
-                    eax = 0x01;
-                    CPUID;
-                    hwThreadPool[i].apicId = extractBitField(ebx,8,24);
-
-                    /* ThreadId is extracted from th apicId using the bit width
-                     * of the number of logical processors
-                     * */
-                    hwThreadPool[i].threadId =
-                        extractBitField(hwThreadPool[i].apicId,
-                                getBitFieldWidth(maxNumLogicalProcsPerCore),0); 
-
-                    /* CoreId is extracted from th apicId using the bitWidth 
-                     * of the number of logical processors as offset and the
-                     * bit width of the number of cores as width
-                     * */
-                    hwThreadPool[i].coreId =
-                        extractBitField(hwThreadPool[i].apicId,
-                                getBitFieldWidth(maxNumCores),
-                                0); 
-
-                    hwThreadPool[i].packageId =
-                        extractBitField(hwThreadPool[i].apicId,
-                                8-getBitFieldWidth(maxNumCores),
-                                getBitFieldWidth(maxNumCores)); 
-                }
-                break;
-
-            case K16_FAMILY:
-
-            case K15_FAMILY:
-
-            case K10_FAMILY:
-                /* AMD Bios manual Rev. 2.28 section 3.2
-                 * Extended method */
-                eax = 0x80000008;
-                CPUID;
-
-                width =  extractBitField(ecx,4,12);
-
-                if (width == 0)
-                {
-                    width =  extractBitField(ecx,8,0)+1;
-                }
-
-                eax = 0x01;
-                CPUID;
-                maxNumLogicalProcs =  extractBitField(ebx,8,16);
-                maxNumCores = extractBitField(ecx,8,0)+1;
-
-
-                for (uint32_t i=0; i<  cpuid_topology.numHWThreads; i++)
-                {
-                    CPU_ZERO(&set);
-                    CPU_SET(i,&set);
-                    sched_setaffinity(0, sizeof(cpu_set_t), &set);
-
-                    eax = 0x01;
-                    CPUID;
-                    hwThreadPool[i].apicId = extractBitField(ebx,8,24);
-                    /* AMD only knows cores */
-                    hwThreadPool[i].threadId = 0;
-
-                    hwThreadPool[i].coreId =
-                        extractBitField(hwThreadPool[i].apicId,
-                                width, 0); 
-                    hwThreadPool[i].packageId =
-                        extractBitField(hwThreadPool[i].apicId,
-                                (8-width), width); 
-                }
-
-                break;
-        }
-    }
-
-    for (uint32_t i=0; i<  cpuid_topology.numHWThreads; i++)
-    {
-        /* Add node to Topology tree */
-        if (!tree_nodeExists(cpuid_topology.topologyTree,
-                    hwThreadPool[i].packageId))
-        {
-            tree_insertNode(cpuid_topology.topologyTree,
-                    hwThreadPool[i].packageId);
-        }
-        currentNode = tree_getNode(cpuid_topology.topologyTree,
-                hwThreadPool[i].packageId);
-
-        if (!tree_nodeExists(currentNode, hwThreadPool[i].coreId))
-        {
-            tree_insertNode(currentNode, hwThreadPool[i].coreId);
-        }
-        currentNode = tree_getNode(currentNode, hwThreadPool[i].coreId);
-
-        if (!tree_nodeExists(currentNode, i))
-        {
-            /*
-               printf("WARNING: Thread already exists!\n");
-               */
-            tree_insertNode(currentNode, i);
-        }
-
-    }
-
-    cpuid_topology.threadPool = hwThreadPool;
-    cpuid_topology.numSockets = tree_countChildren(cpuid_topology.topologyTree);
-    currentNode = tree_getChildNode(cpuid_topology.topologyTree);
-    cpuid_topology.numCoresPerSocket = tree_countChildren(currentNode);
-    currentNode = tree_getChildNode(currentNode);
-    cpuid_topology.numThreadsPerCore = tree_countChildren(currentNode);
-}
-
-void cpuid_initCacheTopology()
-{
-    int maxNumLevels=0;
-    int id=0;
-    CacheLevel* cachePool = NULL;
-    CacheType type = DATACACHE;
-
-    switch ( cpuid_info.family ) 
-    {
-        case MIC_FAMILY:
-
-        case P6_FAMILY:
-
-            if (largest_function >= 4)
-            {
-                maxNumLevels = intelCpuidFunc_4(&cachePool);
-            }
-            else
-            {
-                //				intelCpuidFunc_2(&cachePool);
-            }
-
-            break;
-
-        case K8_FAMILY:
-            maxNumLevels = 2;
-            cachePool = (CacheLevel*) malloc(maxNumLevels * sizeof(CacheLevel));
-
-            eax = 0x80000005;
-            CPUID;
-            cachePool[0].level = 1;
-            cachePool[0].type = DATACACHE;
-            cachePool[0].associativity = extractBitField(ecx,8,16);
-            cachePool[0].lineSize = extractBitField(ecx,8,0);
-            cachePool[0].size =  extractBitField(ecx,8,24) * 1024;
-            if ((cachePool[0].associativity * cachePool[0].lineSize) != 0)
-            {
-                cachePool[0].sets = cachePool[0].size/
-                    (cachePool[0].associativity * cachePool[0].lineSize);
-            }
-            cachePool[0].threads = 1;
-            cachePool[0].inclusive = 1;
-
-            eax = 0x80000006;
-            CPUID;
-            cachePool[1].level = 2;
-            cachePool[1].type = UNIFIEDCACHE;
-            cachePool[1].associativity = 
-                amdGetAssociativity(extractBitField(ecx,4,12));
-            cachePool[1].lineSize = extractBitField(ecx,8,0);
-            cachePool[1].size =  extractBitField(ecx,16,16) * 1024;
-            if ((cachePool[0].associativity * cachePool[0].lineSize) != 0)
-            {
-                cachePool[1].sets = cachePool[1].size/
-                    (cachePool[1].associativity * cachePool[1].lineSize);
-            }
-            cachePool[1].threads = 1;
-            cachePool[1].inclusive = 1;
-
-            break;
-
-
-        case K10_FAMILY:
-            /* FIXME: Adds one level for the instruction cache on Intel
-             * This fixes the level for the cores
-             */
-            maxNumLevels = 3;
-            cachePool = (CacheLevel*) malloc(maxNumLevels * sizeof(CacheLevel));
-
-            eax = 0x80000005;
-            CPUID;
-            cachePool[0].level = 1;
-            cachePool[0].type = DATACACHE;
-            cachePool[0].associativity = extractBitField(ecx,8,16);
-            cachePool[0].lineSize = extractBitField(ecx,8,0);
-            cachePool[0].size =  extractBitField(ecx,8,24) * 1024;
-            if ((cachePool[0].associativity * cachePool[0].lineSize) != 0)
-            {
-                cachePool[0].sets = cachePool[0].size/
-                    (cachePool[0].associativity * cachePool[0].lineSize);
-            }
-            cachePool[0].threads = 1;
-            cachePool[0].inclusive = 1;
-
-            eax = 0x80000006;
-            CPUID;
-            cachePool[1].level = 2;
-            cachePool[1].type = UNIFIEDCACHE;
-            cachePool[1].associativity = 
-                amdGetAssociativity(extractBitField(ecx,4,12));
-            cachePool[1].lineSize = extractBitField(ecx,8,0);
-            cachePool[1].size =  extractBitField(ecx,16,16) * 1024;
-            if ((cachePool[0].associativity * cachePool[0].lineSize) != 0)
-            {
-                cachePool[1].sets = cachePool[1].size/
-                    (cachePool[1].associativity * cachePool[1].lineSize);
-            }
-            cachePool[1].threads = 1;
-            cachePool[1].inclusive = 1;
-
-            cachePool[2].level = 3;
-            cachePool[2].type = UNIFIEDCACHE;
-            cachePool[2].associativity =
-                amdGetAssociativity(extractBitField(edx,4,12));
-            cachePool[2].lineSize = extractBitField(edx,8,0);
-            cachePool[2].size =  (extractBitField(edx,14,18)+1) * 524288;
-            if ((cachePool[0].associativity * cachePool[0].lineSize) != 0)
-            {
-                cachePool[2].sets = cachePool[1].size/
-                    (cachePool[1].associativity * cachePool[1].lineSize);
-            }
-
-            if (cpuid_info.model != MAGNYCOURS)
-            {
-                cachePool[2].threads = cpuid_topology.numCoresPerSocket;
-            }
-            else
-            {
-                cachePool[2].threads = cpuid_topology.numCoresPerSocket/2;
-                cachePool[2].size /= 2 ;
-            }
-
-            cachePool[2].inclusive = 1;
-
-            break;
-
-        case K16_FAMILY:
-
-        case K15_FAMILY:
-
-            maxNumLevels = 0;
-            cachePool = (CacheLevel*) malloc(3 * sizeof(CacheLevel));
-
-            while (type)
-            {
-                ecx = id;
-                eax = 0x8000001D;
-                CPUID;
-                type = (CacheType) extractBitField(eax,4,0);
-
-                if ((type == DATACACHE) || (type == UNIFIEDCACHE))
-                {
-                    cachePool[maxNumLevels].level =   extractBitField(eax,3,5);
-                    cachePool[maxNumLevels].type = type;
-                    cachePool[maxNumLevels].associativity = extractBitField(ebx,10,22)+1;
-                    cachePool[maxNumLevels].lineSize = extractBitField(ebx,12,0)+1;
-                    cachePool[maxNumLevels].sets =  extractBitField(ecx,32,0)+1;
-                    cachePool[maxNumLevels].size = cachePool[maxNumLevels].associativity *
-                        cachePool[maxNumLevels].lineSize * cachePool[maxNumLevels].sets;
-                    cachePool[maxNumLevels].threads =  extractBitField(eax,12,14)+1;
-                    cachePool[maxNumLevels].inclusive =  (edx & (0x1<<1));
-                    maxNumLevels++;
-                }
-                id++;
-            }
-            break;
-
-        default:
-            ERROR_PLAIN_PRINT(Processor is not supported);
-            break;
-    }
-
-    cpuid_topology.numCacheLevels = maxNumLevels;
-    cpuid_topology.cacheLevels = cachePool;
-}
-
-
-
diff --git a/src/daemon.c b/src/daemon.c
deleted file mode 100644
index de5bfa5..0000000
--- a/src/daemon.c
+++ /dev/null
@@ -1,123 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  daemon.c
- *
- *      Description:  C Module implementing a daemon time loop
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <errno.h>
-#include <sys/types.h>
-#include <unistd.h>
-#include <signal.h>
-#include <sys/time.h>
-#include <time.h>
-
-#include <timer.h>
-#include <perfmon.h>
-#include <daemon.h>
-
-static volatile int daemon_run = 0;
-static bstring eventString;
-static TimerData timeData;
-static pid_t daemonpid = 0;
-
-
-void
-daemon_start(bstring str, struct timespec interval)
-{
-    daemonpid = fork();
-    if (daemonpid == 0)
-    {
-        eventString = bstrcpy(str);
-        signal(SIGINT, daemon_interrupt);
-        signal(SIGUSR1, daemon_interrupt);
-        daemon_run = 1;
-        perfmon_setupEventSet(eventString, NULL);
-        perfmon_startCounters();
-        timer_start(&timeData);
-
-        while (1)
-        {
-            if (daemon_run)
-            {
-                timer_stop(&timeData);
-                perfmon_readCounters();
-                perfmon_logCounterResults( timer_print(&timeData) );
-                timer_start(&timeData);
-            }
-            else
-            {
-                break;
-            }
-            nanosleep( &interval, NULL);
-        }
-        signal(SIGINT, SIG_DFL);
-        signal(SIGUSR1, SIG_DFL);
-        exit(EXIT_SUCCESS);
-    }
-}
-
-void
-daemon_stop(int sig)
-{
-    if (daemonpid > 0)
-    {
-        printf("PARENT: KILL daemon with signal %d\n", sig);
-        kill(daemonpid, sig);
-        //perfmon_stopCounters();
-    }
-}
-
-void
-daemon_interrupt(int sig)
-{
-    if (sig == SIGUSR1)
-    {
-        if (daemon_run)
-        {
-            perfmon_stopCounters();
-            daemon_run = 0;
-            printf("DAEMON: STOP on %d\n",sig);
-            exit(EXIT_SUCCESS);
-        }
-        else
-        {
-            perfmon_setupEventSet(eventString, NULL);
-            perfmon_startCounters();
-            daemon_run = 1;
-            printf("DAEMON: START with events %s\n",bdata(eventString));
-        }
-    } else
-    {
-        printf("DAEMON: EXIT on %d\n", sig);
-        daemon_run = 0;
-        exit(EXIT_SUCCESS);
-    }
-}
-
-
diff --git a/src/ghash.c b/src/ghash.c
index 87e0ed0..a6b5821 100644
--- a/src/ghash.c
+++ b/src/ghash.c
@@ -1,19 +1,20 @@
-/*
- * =======================================================================================
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
+/* GLIB - Library of useful routines for C programming
+ * Copyright (C) 1995-1997  Peter Mattis, Spencer Kimball and Josh MacDonald
  *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
  *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
  *
- * =======================================================================================
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
  */
 
 /*
@@ -42,30 +43,30 @@
 #define HASH_IS_TOMBSTONE(h_) ((h_) == TOMBSTONE_HASH_VALUE)
 #define HASH_IS_REAL(h_) ((h_) >= 2)
 
-#ifndef	FALSE
-#define	FALSE	(0)
+#ifndef    FALSE
+#define    FALSE    (0)
 #endif
 
-#ifndef	TRUE
-#define	TRUE	(!FALSE)
+#ifndef    TRUE
+#define    TRUE    (!FALSE)
 #endif
 
-#undef	MAX
+#undef    MAX
 #define MAX(a, b)  (((a) > (b)) ? (a) : (b))
 
-#undef	MIN
+#undef    MIN
 #define MIN(a, b)  (((a) < (b)) ? (a) : (b))
 
-#undef	ABS
-#define ABS(a)	   (((a) < 0) ? -(a) : (a))
+#undef    ABS
+#define ABS(a)       (((a) < 0) ? -(a) : (a))
 #define G_LIKELY(expr) (expr)
 #define G_UNLIKELY(expr) (expr)
 
 #define _G_NEW(struct_type, n_structs, func) \
         ((struct_type *) g_##func##_n ((n_structs), sizeof (struct_type)))
 
-#define g_new(struct_type, n_structs)			_G_NEW (struct_type, n_structs, malloc)
-#define g_new0(struct_type, n_structs)			_G_NEW (struct_type, n_structs, malloc0)
+#define g_new(struct_type, n_structs)            _G_NEW (struct_type, n_structs, malloc)
+#define g_new0(struct_type, n_structs)            _G_NEW (struct_type, n_structs, malloc0)
 
 struct _GHashTable
 {
diff --git a/src/hashTable.c b/src/hashTable.c
index bf6c3d8..b6cdfa7 100644
--- a/src/hashTable.c
+++ b/src/hashTable.c
@@ -6,13 +6,13 @@
  *      Description: Hashtable implementation based on SGLIB.
  *                   Used for Marker API result handling.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -60,6 +60,20 @@ void hashTable_init()
     }
 }
 
+void hashTable_initThread(int coreID)
+{
+    if (threadList[coreID] != NULL) return;    /* set up once per core: repeated calls are no-ops */
+    ThreadList* resPtr = (ThreadList*) malloc(sizeof(ThreadList));
+    if (resPtr == NULL)
+    {
+        fprintf(stderr, "Failed to allocate %lu bytes for the thread list entry of core %d\n", sizeof(ThreadList), coreID);
+        return;    /* slot stays NULL instead of writing through a NULL pointer */
+    }
+    resPtr->tid = pthread_self();    /* remember the calling thread */
+    resPtr->coreId = coreID;
+    resPtr->hashTable = g_hash_table_new(g_str_hash, g_str_equal);    /* same callbacks as regionLookup in hashTable_finalize */
+    threadList[coreID] = resPtr;    /* store the entry only once it is fully initialized */
+}
 
 int hashTable_get(bstring label, LikwidThreadResults** resEntry)
 {
@@ -86,7 +100,7 @@ int hashTable_get(bstring label, LikwidThreadResults** resEntry)
         (*resEntry)->label = bstrcpy (label);
         (*resEntry)->time = 0.0;
         (*resEntry)->count = 0;
-        for (int i=0; i< NUM_PMC; i++) 
+        for (int i=0; i< NUM_PMC; i++)
         {
             (*resEntry)->PMcounters[i] = 0.0;
             (*resEntry)->StartPMcounters[i] = 0.0;
@@ -109,7 +123,6 @@ void hashTable_finalize(int* numThreads, int* numRegions, LikwidResults** result
     GHashTable* regionLookup;
 
     regionLookup = g_hash_table_new(g_str_hash, g_str_equal);
-
     /* determine number of active threads */
     for (int i=0; i<MAX_NUM_THREADS; i++)
     {
@@ -128,22 +141,50 @@ void hashTable_finalize(int* numThreads, int* numRegions, LikwidResults** result
 
     /* allocate data structures */
     (*results) = (LikwidResults*) malloc(numberOfRegions * sizeof(LikwidResults));
-
-    for ( uint32_t i=0; i < numberOfRegions; i++ )
+    if (!(*results))
     {
-        (*results)[i].time = (double*) malloc(numberOfThreads * sizeof(double));
-        (*results)[i].count = (uint32_t*) malloc(numberOfThreads * sizeof(uint32_t));
-        (*results)[i].counters = (double**) malloc(numberOfThreads * sizeof(double*));
-
-        for ( uint32_t j=0; j < numberOfThreads; j++ )
+        fprintf(stderr, "Failed to allocate %lu bytes for the results\n", numberOfRegions * sizeof(LikwidResults));
+    }
+    else
+    {
+        for ( uint32_t i=0; i < numberOfRegions; i++ )
         {
-            (*results)[i].time[j] = 0.0;
-            (*results)[i].count[j] = 0;
-            (*results)[i].counters[j] = (double*) malloc(NUM_PMC * sizeof(double));
+            (*results)[i].time = (double*) malloc(numberOfThreads * sizeof(double));
+            if (!(*results)[i].time)
+            {
+                fprintf(stderr, "Failed to allocate %lu bytes for the time storage\n", numberOfThreads * sizeof(double));
+                break;
+            }
+            (*results)[i].count = (uint32_t*) malloc(numberOfThreads * sizeof(uint32_t));
+            if (!(*results)[i].count)
+            {
+                fprintf(stderr, "Failed to allocate %lu bytes for the count storage\n", numberOfThreads * sizeof(uint32_t));
+                break;
+            }
+            (*results)[i].counters = (double**) malloc(numberOfThreads * sizeof(double*));
+            if (!(*results)[i].counters)
+            {
+                fprintf(stderr, "Failed to allocate %lu bytes for the counter result storage\n", numberOfThreads * sizeof(double*));
+                break;
+            }
 
-            for ( uint32_t k=0; k < NUM_PMC; k++ )
+            for ( uint32_t j=0; j < numberOfThreads; j++ )
             {
-                (*results)[i].counters[j][k] = 0.0;
+                (*results)[i].time[j] = 0.0;
+                (*results)[i].count[j] = 0;
+                (*results)[i].counters[j] = (double*) malloc(NUM_PMC * sizeof(double));
+                if (!(*results)[i].counters[j])    /* test the row just allocated, not the row-pointer array */
+                {
+                    fprintf(stderr, "Failed to allocate %lu bytes for the counter result storage for thread %u\n", NUM_PMC * sizeof(double), j);
+                    break;
+                }
+                else
+                {
+                    for ( uint32_t k=0; k < NUM_PMC; k++ )
+                    {
+                        (*results)[i].counters[j][k] = 0.0;
+                    }
+                }
             }
         }
     }
@@ -174,6 +215,7 @@ void hashTable_finalize(int* numThreads, int* numRegions, LikwidResults** result
                 if ( regionId == NULL )
                 {
                     (*results)[currentRegion].tag = bstrcpy (threadResult->label);
+                    (*results)[currentRegion].groupID = threadResult->groupID;
                     regionIds[currentRegion] = currentRegion;
                     regionId = regionIds + currentRegion;
                     g_hash_table_insert(regionLookup, g_strdup(key), (regionIds+currentRegion));
diff --git a/src/includes/access.h b/src/includes/access.h
new file mode 100644
index 0000000..1e30674
--- /dev/null
+++ b/src/includes/access.h
@@ -0,0 +1,41 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  access.h
+ *
+ *      Description:  Header File HPM access Module
+ *
+ *      Version:   4.0
+ *      Released:  16.6.2015
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at gmail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#ifndef ACCESS_H
+#define ACCESS_H
+
+int HPMinit(void);
+int HPMinitialized(void);
+int HPMaddThread(int cpu_id);
+void HPMfinalize(void);
+int HPMread(int cpu_id, PciDeviceIndex dev, uint32_t reg, uint64_t* data);
+int HPMwrite(int cpu_id, PciDeviceIndex dev, uint32_t reg, uint64_t data);
+
+#endif
diff --git a/src/includes/accessClient.h b/src/includes/accessClient.h
index 0058182..b88c583 100644
--- a/src/includes/accessClient.h
+++ b/src/includes/accessClient.h
@@ -3,15 +3,15 @@
  *
  *      Filename:  accessClient.h
  *
- *      Description:  Header File accessClient Module. 
+ *      Description:  Header File accessClient Module.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -33,11 +33,10 @@
 
 #include <types.h>
 
-extern int accessClient_mode;
 
 /* This needs to be called BEFORE msr_init and
  * sets how the module tries to access the MSR registers. */
-extern void accessClient_setaccessmode(int mode);
+
 
 /* This needs to be called BEFORE msr_init and
  * sets the priority the module reports to the daemon.
@@ -46,10 +45,10 @@ extern void accessClient_setlowaccesspriority(void);
 
 /* Initializes the MSR module, trying to open either the MSR files or
  * the connection to the msr daemon. */
-extern void accessClient_init(int* socket_fd);
+
 extern void accessClient_initThread(int* socket_fd);
-extern void accessClient_finalize(int socket_fd);
-extern uint64_t accessClient_read(int socket_fd, int cpu, int device, uint32_t reg);
-extern void accessClient_write(int socket_fd, int cpu, int device, uint32_t reg, uint64_t data);
+
+extern int accessClient_read(int socket_fd, int cpu, int device, uint32_t reg, uint64_t *data);
+extern int accessClient_write(int socket_fd, int cpu, int device, uint32_t reg, uint64_t data);
 
 #endif /* ACCESSCLIENT_H */
diff --git a/src/includes/accessClient_types.h b/src/includes/accessClient_types.h
index a0c7a84..1cc8605 100644
--- a/src/includes/accessClient_types.h
+++ b/src/includes/accessClient_types.h
@@ -5,13 +5,13 @@
  *
  *      Description:  Types file for accessClient module.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -33,12 +33,6 @@
 
 #include <stdint.h>
 
-/* This naming with AccessType and AccessMode is admittedly a bit confusing */
-typedef enum {
-    DAEMON_AM_DIRECT = 0,
-    DAEMON_AM_ACCESS_D
-} AccessMode;
-
 typedef enum {
     DAEMON_READ = 0,
     DAEMON_WRITE,
@@ -70,9 +64,7 @@ typedef enum {
     ERR_OPENFAIL,     /* failure to open msr files */
     ERR_RWFAIL,       /* failure to read/write msr */
     ERR_DAEMONBUSY,   /* daemon already has another client */
-    ERR_LOCKED,       /* access to HPM is locked */
-    ERR_UNSUPPORTED,   /* unsupported processor */
-    ERR_NODEV	/* No such device */
+    ERR_NODEV         /* No such device */
 } AccessErrorType;
 
 typedef struct {
@@ -84,4 +76,6 @@ typedef struct {
     AccessErrorType errorcode; /* Only in replies - 0 if no error. */
 } AccessDataRecord;
 
+extern int accessClient_mode;
+
 #endif /*ACCESSCLIENT_TYPES_H*/
diff --git a/src/includes/affinity.h b/src/includes/affinity.h
index f347e64..bc7a7fd 100644
--- a/src/includes/affinity.h
+++ b/src/includes/affinity.h
@@ -5,13 +5,14 @@
  *
  *      Description:  Header File affinity Module
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at gmail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -32,17 +33,18 @@
 #define AFFINITY_H
 
 #include <types.h>
+#include <likwid.h>
+
+int socket_lock[MAX_NUM_NODES];    /* NOTE(review): object definition in a header; if several .c files include it, linking relies on -fcommon. Consider extern here plus one definition in a .c file */
+int tile_lock[MAX_NUM_THREADS];    /* NOTE(review): same header-definition concern as socket_lock */
+extern AffinityDomains affinityDomains;
 
 extern int affinity_core2node_lookup[MAX_NUM_THREADS];
 
-extern void affinity_init();
-extern void affinity_finalize();
-extern int  affinity_processGetProcessorId();
-extern int  affinity_threadGetProcessorId();
-extern void  affinity_pinProcess(int processorId);
-extern void  affinity_pinThread(int processorId);
+extern int affinity_processGetProcessorId();
+extern int affinity_threadGetProcessorId();
 extern const AffinityDomain* affinity_getDomain(bstring domain);
-extern void affinity_printDomains(FILE* OUTSTREAM);
+
 
 #endif /*AFFINITY_H*/
 
diff --git a/src/includes/affinity_types.h b/src/includes/affinity_types.h
deleted file mode 100644
index 2b08bfe..0000000
--- a/src/includes/affinity_types.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  affinity_types.h
- *
- *      Description:  Type Definitions for affinity Module
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-#ifndef AFFINITY_TYPES_H
-#define AFFINITY_TYPES_H
-
-typedef struct {
-    bstring tag;
-    uint32_t numberOfProcessors;
-    uint32_t numberOfCores;
-    int* processorList;
-} AffinityDomain;
-
-
-#endif /*AFFINITY_TYPES_H*/
diff --git a/src/includes/allocator.h b/src/includes/allocator.h
deleted file mode 100644
index a21555c..0000000
--- a/src/includes/allocator.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  allocator.h
- *
- *      Description:  Header File allocator Module. 
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  none
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-#ifndef ALLOCATOR_H
-#define ALLOCATOR_H
-
-#include <types.h>
-#include <bstrlib.h>
-
-extern void allocator_init(int numVectors);
-extern void allocator_finalize();
-extern void allocator_allocateVector(FILE* OUTSTREAM,
-                                     void** ptr,
-                                     int alignment,
-                                     uint64_t size,
-                                     int offset,
-                                     DataType type,
-                                     bstring domain);
-
-#endif /*ALLOCATOR_H*/
-
diff --git a/src/includes/asciiBoxes.h b/src/includes/asciiBoxes.h
deleted file mode 100644
index dd37a05..0000000
--- a/src/includes/asciiBoxes.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  asciiBoxes.h
- *
- *      Description:  Module to draw nested ascii art boxes.
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-#ifndef ASCIIBOXES_H
-#define ASCIIBOXES_H
-
-#include <types.h>
-#include <bstrlib.h>
-
-extern BoxContainer* asciiBoxes_allocateContainer(int numLines,int numColumns);
-extern void asciiBoxes_addBox(BoxContainer* container, int line, int column, bstring label);
-extern void asciiBoxes_addJoinedBox(BoxContainer* container, int line, int startColumn, int endColumn, bstring label);
-extern void asciiBoxes_print(FILE* OUTSTREAM, BoxContainer* container);
-
-#endif /*ASCIIBOXES_H*/
diff --git a/src/includes/asciiBoxes_types.h b/src/includes/asciiBoxes_types.h
deleted file mode 100644
index f09c4b3..0000000
--- a/src/includes/asciiBoxes_types.h
+++ /dev/null
@@ -1,47 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  asciiBoxes_types.h
- *
- *      Description:  Types file for asciiBoxes module.
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-#ifndef ASCIIBOXES_TYPES_H
-#define ASCIIBOXES_TYPES_H
-
-#include  <bstrlib.h>
-
-typedef struct box {
-    int width;
-    bstring label;
-} Box;
-
-typedef struct boxContainer {
-    int numLines;
-    int numColumns;
-    Box** boxes;
-} BoxContainer;
-
-#endif /*ASCIIBOXES_TYPES_H*/
diff --git a/src/includes/asciiTable.h b/src/includes/asciiTable.h
deleted file mode 100644
index 6096c4a..0000000
--- a/src/includes/asciiTable.h
+++ /dev/null
@@ -1,45 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  asciiTable.h
- *
- *      Description:  Module to create and print a ascii table
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-#ifndef ASCIITABLE_H
-#define ASCIITABLE_H
-
-#include <types.h>
-#include <bstrlib.h>
-
-extern TableContainer* asciiTable_allocate(int numRows,int numColumns, bstrList* headerLabels);
-extern void asciiTable_free(TableContainer* container);
-extern void asciiTable_insertRow(TableContainer* container, int row,  bstrList* fields);
-extern void asciiTable_appendRow(TableContainer* container, bstrList* fields);
-extern void asciiTable_setCurrentRow(TableContainer* container, int row);
-extern void asciiTable_print(TableContainer* container);
-extern void asciiTable_setOutput(FILE* stream);
-
-#endif /*ASCIITABLE_H*/
diff --git a/src/includes/asciiTable_types.h b/src/includes/asciiTable_types.h
deleted file mode 100644
index 986a8a2..0000000
--- a/src/includes/asciiTable_types.h
+++ /dev/null
@@ -1,48 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  asciiTable_types.h
- *
- *      Description:  Types file for asciiTable module.
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-#ifndef ASCIITABLE_TYPES_H
-#define ASCIITABLE_TYPES_H
-
-#include  <bstrlib.h>
-
-typedef struct bstrList bstrList; 
-
-typedef struct {
-    int numRows;
-    int numColumns;
-    int currentRow;
-    int printed;
-    bstrList*  header;
-    bstrList** rows;
-} TableContainer;
-
-
-#endif /*ASCIITABLE_TYPES_H*/
diff --git a/src/includes/barrier.h b/src/includes/barrier.h
deleted file mode 100644
index 5f4142d..0000000
--- a/src/includes/barrier.h
+++ /dev/null
@@ -1,62 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  barrier.h
- *
- *      Description:  Header File barrier Module
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-#ifndef BARRIER_H
-#define BARRIER_H
-
-#include <types.h>
-
-/**
- * @brief  Initialize the barrier module
- * @param  numberOfThreads The total number of threads in the barrier
- */
-extern void barrier_init(int numberOfGroups);
-
-/**
- * @brief  Destroy data structures of the barrier module
- */
-extern void barrier_destroy(void);
-
-/**
- * @brief  Register a thread for a barrier
- * @param  threadId The id of the thread to register
- */
-extern int barrier_registerGroup(int numThreads);
-extern void barrier_registerThread(BarrierData* barr, int groupsId, int threadId);
-
-/**
- * @brief  Synchronize threads
- * @param  threadId The id of the calling thread
- * @param  numberOfThreads Total number of threads in the barrier
- */
-extern void  barrier_synchronize(BarrierData* barr);
-
-
-#endif /*BARRIER_H*/
diff --git a/src/includes/barrier_types.h b/src/includes/barrier_types.h
deleted file mode 100644
index d0abb55..0000000
--- a/src/includes/barrier_types.h
+++ /dev/null
@@ -1,49 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  barrier_types.h
- *
- *      Description:  Type Definitions for barrier Module
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-#ifndef BARRIER_TYPES_H
-#define BARRIER_TYPES_H
-
-#include <stdint.h>
-
-typedef struct {
-    int        numberOfThreads;
-    int        offset;
-    int        val;
-    int*       index;
-    volatile int*  bval;
-} BarrierData;
-
-typedef struct {
-    int*       groupBval;
-    int        numberOfThreads;
-} BarrierGroup;
-
-#endif /*BARRIER_TYPES_H*/
diff --git a/src/includes/bitUtil.h b/src/includes/bitUtil.h
index c876eea..5ebcb43 100644
--- a/src/includes/bitUtil.h
+++ b/src/includes/bitUtil.h
@@ -6,13 +6,13 @@
  *      Description:  Header File bitUtil Module. 
  *                    Helper routines for dealing with bit manipulations
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
diff --git a/src/includes/bstrlib.h b/src/includes/bstrlib.h
index abdbef3..a1160b6 100644
--- a/src/includes/bstrlib.h
+++ b/src/includes/bstrlib.h
@@ -113,11 +113,11 @@ extern struct bstrList * bsplits (const_bstring str, const_bstring splitStr);
 extern struct bstrList * bsplitstr (const_bstring str, const_bstring splitStr);
 extern bstring bjoin (const struct bstrList * bl, const_bstring sep);
 extern int bsplitcb (const_bstring str, unsigned char splitChar, int pos,
-	int (* cb) (void * parm, int ofs, int len), void * parm);
+    int (* cb) (void * parm, int ofs, int len), void * parm);
 extern int bsplitscb (const_bstring str, const_bstring splitStr, int pos,
-	int (* cb) (void * parm, int ofs, int len), void * parm);
+    int (* cb) (void * parm, int ofs, int len), void * parm);
 extern int bsplitstrcb (const_bstring str, const_bstring splitStr, int pos,
-	int (* cb) (void * parm, int ofs, int len), void * parm);
+    int (* cb) (void * parm, int ofs, int len), void * parm);
 
 /* Miscellaneous functions */
 extern int bpattern (bstring b, int len);
@@ -137,21 +137,21 @@ extern int bvcformata (bstring b, int count, const char * fmt, va_list arglist);
 bstring bstrtmp_b = (b); \
 const char * bstrtmp_fmt = (fmt); \
 int bstrtmp_r = BSTR_ERR, bstrtmp_sz = 16; \
-	for (;;) { \
-		va_list bstrtmp_arglist; \
-		va_start (bstrtmp_arglist, lastarg); \
-		bstrtmp_r = bvcformata (bstrtmp_b, bstrtmp_sz, bstrtmp_fmt, bstrtmp_arglist); \
-		va_end (bstrtmp_arglist); \
-		if (bstrtmp_r >= 0) { /* Everything went ok */ \
-			bstrtmp_r = BSTR_OK; \
-			break; \
-		} else if (-bstrtmp_r <= bstrtmp_sz) { /* A real error? */ \
-			bstrtmp_r = BSTR_ERR; \
-			break; \
-		} \
-		bstrtmp_sz = -bstrtmp_r; /* Doubled or target size */ \
-	} \
-	ret = bstrtmp_r; \
+    for (;;) { \
+        va_list bstrtmp_arglist; \
+        va_start (bstrtmp_arglist, lastarg); \
+        bstrtmp_r = bvcformata (bstrtmp_b, bstrtmp_sz, bstrtmp_fmt, bstrtmp_arglist); \
+        va_end (bstrtmp_arglist); \
+        if (bstrtmp_r >= 0) { /* Everything went ok */ \
+            bstrtmp_r = BSTR_OK; \
+            break; \
+        } else if (-bstrtmp_r <= bstrtmp_sz) { /* A real error? */ \
+            bstrtmp_r = BSTR_ERR; \
+            break; \
+        } \
+        bstrtmp_sz = -bstrtmp_r; /* Doubled or target size */ \
+    } \
+    ret = bstrtmp_r; \
 }
 
 #endif
@@ -179,15 +179,15 @@ extern int bsreada (bstring b, struct bStream * s, int n);
 extern int bsunread (struct bStream * s, const_bstring b);
 extern int bspeek (bstring r, const struct bStream * s);
 extern int bssplitscb (struct bStream * s, const_bstring splitStr, 
-	int (* cb) (void * parm, int ofs, const_bstring entry), void * parm);
+    int (* cb) (void * parm, int ofs, const_bstring entry), void * parm);
 extern int bssplitstrcb (struct bStream * s, const_bstring splitStr, 
-	int (* cb) (void * parm, int ofs, const_bstring entry), void * parm);
+    int (* cb) (void * parm, int ofs, const_bstring entry), void * parm);
 extern int bseof (const struct bStream * s);
 
 struct tagbstring {
-	int mlen;
-	int slen;
-	unsigned char * data;
+    int mlen;
+    int slen;
+    unsigned char * data;
 };
 
 /* Accessor macros */
diff --git a/src/includes/configuration.h b/src/includes/configuration.h
new file mode 100644
index 0000000..da8b896
--- /dev/null
+++ b/src/includes/configuration.h
@@ -0,0 +1,46 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  configuration.h
+ *
+ *      Description:  Header File of Module configuration.
+ *
+ *      Version:   4.0
+ *      Released:  16.6.2015
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at gmail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#ifndef CONFIGURATION_H
+#define CONFIGURATION_H
+
+#include <types.h>
+#include <likwid.h>
+#include <error.h>
+
+
+extern Configuration config;
+extern int init_config;
+
+
+
+
+
+#endif
diff --git a/src/includes/cpuFeatures.h b/src/includes/cpuFeatures.h
index 9274e40..d1f87d5 100644
--- a/src/includes/cpuFeatures.h
+++ b/src/includes/cpuFeatures.h
@@ -5,13 +5,13 @@
  *
  *      Description:  Header File of Module cpuFeatures.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
diff --git a/src/includes/cpuFeatures_types.h b/src/includes/cpuFeatures_types.h
index 3e7ec5d..7593220 100644
--- a/src/includes/cpuFeatures_types.h
+++ b/src/includes/cpuFeatures_types.h
@@ -5,13 +5,13 @@
  *
  *      Description:  Types file for CpuFeature module.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -38,23 +38,23 @@ typedef enum {
     IP_PREFETCHER} CpuFeature;
 
 typedef struct {
-    unsigned int fastStrings:1;
-    unsigned int thermalControl:1;
-    unsigned int perfMonitoring:1;
-    unsigned int hardwarePrefetcher:1;
-    unsigned int ferrMultiplex:1;
-    unsigned int branchTraceStorage:1;
-    unsigned int pebs:1;
-    unsigned int speedstep:1;
-    unsigned int monitor:1;
-    unsigned int clPrefetcher:1;
-    unsigned int speedstepLock:1;
-    unsigned int cpuidMaxVal:1;
-    unsigned int xdBit:1;
-    unsigned int dcuPrefetcher:1;
-    unsigned int dynamicAcceleration:1;
-    unsigned int turboMode:1;
-    unsigned int ipPrefetcher:1;
+	unsigned int fastStrings:1;
+	unsigned int thermalControl:1;
+	unsigned int perfMonitoring:1;
+	unsigned int hardwarePrefetcher:1;
+	unsigned int ferrMultiplex:1;
+	unsigned int branchTraceStorage:1;
+	unsigned int pebs:1;
+	unsigned int speedstep:1;
+	unsigned int monitor:1;
+	unsigned int clPrefetcher:1;
+	unsigned int speedstepLock:1;
+	unsigned int cpuidMaxVal:1;
+	unsigned int xdBit:1;
+	unsigned int dcuPrefetcher:1;
+	unsigned int dynamicAcceleration:1;
+	unsigned int turboMode:1;
+	unsigned int ipPrefetcher:1;
     } CpuFeatureFlags;
 
 
diff --git a/src/includes/cpuid.h b/src/includes/cpuid.h
deleted file mode 100644
index 80c426a..0000000
--- a/src/includes/cpuid.h
+++ /dev/null
@@ -1,130 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  cpuid.h
- *
- *      Description:  Header File cpuid Module. 
- *                    Reads out cpuid information and initilaizes a global 
- *                    data structure cpuid_info.
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-#ifndef CPUID_H
-#define CPUID_H
-
-#include <types.h>
-
-/* Intel P6 */
-#define PENTIUM_M_BANIAS     0x09U
-#define PENTIUM_M_DOTHAN     0x0DU
-#define CORE_DUO             0x0EU
-#define CORE2_65             0x0FU
-#define CORE2_45             0x17U
-#define ATOM                 0x1CU
-#define ATOM_45              0x26U
-#define ATOM_32              0x36U
-#define ATOM_22              0x27U
-#define ATOM_SILVERMONT_E    0x37U
-#define ATOM_SILVERMONT_C    0x4DU
-#define ATOM_SILVERMONT_F1   0x4AU
-#define ATOM_SILVERMONT_F2   0x5AU
-#define ATOM_SILVERMONT_F3   0x5DU
-#define NEHALEM              0x1AU
-#define NEHALEM_BLOOMFIELD   0x1AU
-#define NEHALEM_LYNNFIELD    0x1EU
-#define NEHALEM_LYNNFIELD_M  0x1FU
-#define NEHALEM_WESTMERE     0x2CU
-#define NEHALEM_WESTMERE_M   0x25U
-#define SANDYBRIDGE          0x2AU
-#define SANDYBRIDGE_EP       0x2DU
-#define HASWELL              0x3CU
-#define HASWELL_EX           0x3FU
-#define HASWELL_M1           0x45U
-#define HASWELL_M2           0x46U
-#define IVYBRIDGE            0x3AU
-#define IVYBRIDGE_EP         0x3EU
-#define NEHALEM_EX           0x2EU
-#define WESTMERE_EX          0x2FU
-#define XEON_MP              0x1DU
-
-/* Intel MIC */
-#define XEON_PHI           0x01U
-
-/* AMD K10 */
-#define BARCELONA      0x02U
-#define SHANGHAI       0x04U
-#define ISTANBUL       0x08U
-#define MAGNYCOURS     0x09U
-
-/* AMD K8 */
-#define OPTERON_SC_1MB  0x05U
-#define OPTERON_DC_E    0x21U
-#define OPTERON_DC_F    0x41U
-#define ATHLON64_X2     0x43U
-#define ATHLON64_X2_F   0x4BU
-#define ATHLON64_F1     0x4FU
-#define ATHLON64_F2     0x5FU
-#define ATHLON64_X2_G   0x6BU
-#define ATHLON64_G1     0x6FU
-#define ATHLON64_G2     0x7FU
-
-
-#define  P6_FAMILY        0x6U
-#define  MIC_FAMILY       0xBU
-#define  NETBURST_FAMILY  0xFFU
-#define  K15_FAMILY       0x15U
-#define  K16_FAMILY       0x16U
-#define  K10_FAMILY       0x10U
-#define  K8_FAMILY        0xFU
-
-/** Structure holding cpuid information
- *
- */
-extern CpuInfo cpuid_info;
-extern CpuTopology cpuid_topology;
-
-/** Init routine to intialize global structure.
- *
- *  Determines: 
- *  - cpu family
- *  - cpu model
- *  - cpu stepping
- *  - cpu clock
- *  - Instruction Set Extension Flags
- *  - Performance counter features (Intel P6 only)
- *
- */
-extern int cpuid_init (void);
-extern void cpuid_print (void);
-extern void cpuid_initTopology (void);
-extern void cpuid_initCacheTopology (void);
-extern int  cpuid_isInCpuset(void);
-
-static inline int cpuid_hasFeature(FeatureBit bit)
-{
-    return (cpuid_info.featureFlags & (1<<bit));
-}
-
-
-#endif /*CPUID_H*/
diff --git a/src/includes/cpuid_types.h b/src/includes/cpuid_types.h
deleted file mode 100644
index cccc22d..0000000
--- a/src/includes/cpuid_types.h
+++ /dev/null
@@ -1,115 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  cpuid_types.h
- *
- *      Description:  Types file for cpuid module.
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-#ifndef CPUID_TYPES_H
-#define CPUID_TYPES_H
-
-typedef enum {
-    NOCACHE=0,
-    DATACACHE,
-    INSTRUCTIONCACHE,
-    UNIFIEDCACHE,
-    ITLB,
-    DTLB} CacheType;
-
-typedef enum {
-    NODE=0,
-    SOCKET,
-    CORE,
-    THREAD} NodeLevel;
-
-typedef enum {
-    SSE3=0,
-    VSX,
-    MMX,
-    SSE,
-    SSE2,
-    MONITOR,
-    ACPI,
-    RDTSCP,
-    VMX,
-    EIST,
-    TM,
-    TM2,
-    AES,
-    RDRAND,
-    SSSE3,
-    SSE41,
-    SSE42,
-    AVX,
-    FMA} FeatureBit;
-
-typedef struct {
-    uint32_t family;
-    uint32_t model;
-    uint32_t stepping;
-    uint64_t clock;
-    int      turbo;
-    char*  name;
-    char*  features;
-    uint32_t featureFlags;
-    uint32_t perf_version;
-    uint32_t perf_num_ctr;
-    uint32_t perf_width_ctr;
-    uint32_t perf_num_fixed_ctr;
-    int supportUncore;
-} CpuInfo;
-
-typedef struct {
-    uint32_t threadId;
-    uint32_t coreId;
-    uint32_t packageId;
-    uint32_t apicId;
-} HWThread;
-
-typedef struct {
-    int level;
-    CacheType type;
-    int associativity;
-    int sets;
-    int lineSize;
-    int size;
-    int threads;
-    int inclusive;
-} CacheLevel;
-
-typedef struct {
-    uint32_t numHWThreads;
-    uint32_t numSockets;
-    uint32_t numCoresPerSocket;
-    uint32_t numThreadsPerCore;
-    uint32_t numCacheLevels;
-    HWThread* threadPool;
-    CacheLevel*  cacheLevels;
-    TreeNode* topologyTree;
-} CpuTopology;
-
-
-#endif /*CPUID_TYPES_H*/
diff --git a/src/includes/daemon.h b/src/includes/daemon.h
deleted file mode 100644
index 3272636..0000000
--- a/src/includes/daemon.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  daemon.h
- *
- *      Description:  Header File daemon Module. 
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-#ifndef DAEMON_H
-#define DAEMON_H
-
-#include <types.h>
-#include <time.h>
-
-extern void daemon_init();
-extern void daemon_start(bstring str, struct timespec interval);
-extern void daemon_stop(int sig);
-extern void daemon_interrupt(int sig);
-
-#endif /* DAEMON_H */
diff --git a/src/includes/error.h b/src/includes/error.h
index 3c1526f..1dea94d 100644
--- a/src/includes/error.h
+++ b/src/includes/error.h
@@ -5,13 +5,14 @@
  *
  *      Description:  Central error handling macros
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at gmail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -31,60 +32,81 @@
 #ifndef ERROR_H
 #define ERROR_H
 
-#include <errno.h>
-#include <string.h>
-#include <msr.h>
+
+#include <likwid.h>
+
+
 
 #define str(x) #x
 
-#define FINALIZE  msr_finalize()
 
 #define ERRNO_PRINT fprintf(stderr, "ERROR - [%s:%d] %s\n", __FILE__, __LINE__, strerror(errno))
 
 #define ERROR  \
     ERRNO_PRINT; \
-    FINALIZE; \
     exit(EXIT_FAILURE)
 
 #define ERROR_PLAIN_PRINT(msg) \
-   fprintf(stderr,  "ERROR - [%s:%d] " str(msg) "\n", __FILE__, __LINE__);  \
-   FINALIZE; \
-   exit(EXIT_FAILURE)
+   fprintf(stderr,  "ERROR - [%s:%s:%d] " str(msg) "\n", __FILE__, __func__,__LINE__);
 
 
 #define ERROR_PRINT(fmt, ...) \
-   fprintf(stderr,  "ERROR - [%s:%d] " str(fmt) "\n", __FILE__, __LINE__, __VA_ARGS__);  \
-   FINALIZE; \
-   exit(EXIT_FAILURE)
+   fprintf(stderr,  "ERROR - [%s:%s:%d] %s.\n" str(fmt) "\n", __FILE__,  __func__,__LINE__, strerror(errno), __VA_ARGS__);
 
 #define CHECK_ERROR(func, msg)  \
     if ((func) < 0) { \
         fprintf(stderr, "ERROR - [%s:%d] " str(msg) " - %s \n", __FILE__, __LINE__, strerror(errno));  \
     }
 
+#define CHECK_AND_RETURN_ERROR(func, msg)  \
+    if ((func) < 0) { \
+        fprintf(stderr, "ERROR - [%s:%d] " str(msg) " - %s \n", __FILE__, __LINE__, strerror(errno));  \
+        return errno; \
+    }
+
 #define EXIT_IF_ERROR(func, msg)  \
     if ((func) < 0) {  \
         fprintf(stderr,"ERROR - [%s:%d] " str(msg) " - %s \n", __FILE__, __LINE__, strerror(errno)); \
-        FINALIZE; \
         exit(EXIT_FAILURE); \
     }
 
-#ifndef DEBUGLEV
-#define DEBUGLEV 0
-#endif
+
 
 #define VERBOSEPRINTREG(cpuid,reg,flags,msg) \
-    if (perfmon_verbose) {  \
+    if (perfmon_verbosity >= DEBUGLEV_DETAIL) \
+    { \
         printf("DEBUG - [%s:%d] "  str(msg) " [%d] Register 0x%llX , Flags: 0x%llX \n",  \
-                __FILE__, __LINE__,  (cpuid), LLU_CAST (reg), LLU_CAST (flags)); \
+                __func__, __LINE__,  (cpuid), LLU_CAST (reg), LLU_CAST (flags)); \
         fflush(stdout);  \
-    } 
+    }
+    
+#define VERBOSEPRINTPCIREG(cpuid,dev,reg,flags,msg) \
+    if (perfmon_verbosity >= DEBUGLEV_DETAIL) \
+    { \
+        printf("DEBUG - [%s:%d] "  str(msg) " [%d] Device %d Register 0x%llX , Flags: 0x%llX \n",  \
+                __func__, __LINE__,  (cpuid), dev, LLU_CAST (reg), LLU_CAST (flags)); \
+        fflush(stdout);  \
+    }
 
 
 #define DEBUG_PRINT(lev, fmt, ...) \
-    if (DEBUGLEV > lev) { \
-        printf(fmt, __VA_ARGS__); \
+    if ((lev >= 0) && (lev <= perfmon_verbosity)) { \
+        fprintf(stdout, "DEBUG - [%s:%d] " str(fmt) "\n", __func__, __LINE__,__VA_ARGS__); \
+        fflush(stdout); \
+    }
+
+#define DEBUG_PLAIN_PRINT(lev, msg) \
+    if ((lev >= 0) && (lev <= perfmon_verbosity)) { \
+        fprintf(stdout, "DEBUG - [%s:%d] " str(msg) "\n",__func__, __LINE__);  \
         fflush(stdout); \
     }
 
+
+#define CHECK_MSR_WRITE_ERROR(func) CHECK_AND_RETURN_ERROR(func, MSR write operation failed);
+#define CHECK_MSR_READ_ERROR(func) CHECK_AND_RETURN_ERROR(func, MSR read operation failed);
+#define CHECK_PCI_WRITE_ERROR(func) CHECK_AND_RETURN_ERROR(func, PCI write operation failed);
+#define CHECK_PCI_READ_ERROR(func) CHECK_AND_RETURN_ERROR(func, PCI read operation failed);
+#define CHECK_POWER_READ_ERROR(func) CHECK_AND_RETURN_ERROR(func, Power register read operation failed);
+#define CHECK_TEMP_READ_ERROR(func) CHECK_AND_RETURN_ERROR(func, Temperature register read operation failed);
+
 #endif /*ERROR_H*/
diff --git a/src/includes/ghash.h b/src/includes/ghash.h
index f33e9fb..75a17fd 100644
--- a/src/includes/ghash.h
+++ b/src/includes/ghash.h
@@ -1,20 +1,20 @@
-/*
- * =======================================================================================
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
+/* GLIB - Library of useful routines for C programming
+ * Copyright (C) 1995-1997  Peter Mattis, Spencer Kimball and Josh MacDonald
  *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2 of the License, or (at your option) any later version.
  *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
  *
- * =======================================================================================
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
  */
 
 /*
@@ -59,13 +59,13 @@ typedef struct _GHashTableIter GHashTableIter;
 
 struct _GHashTableIter
 {
-    /*< private >*/
-    gpointer      dummy1;
-    gpointer      dummy2;
-    gpointer      dummy3;
-    int           dummy4;
-    gboolean      dummy5;
-    gpointer      dummy6;
+  /*< private >*/
+  gpointer      dummy1;
+  gpointer      dummy2;
+  gpointer      dummy3;
+  int           dummy4;
+  gboolean      dummy5;
+  gpointer      dummy6;
 };
 
 char* g_strdup (const char *str);
diff --git a/src/includes/hashTable.h b/src/includes/hashTable.h
index 078fff9..23a5dff 100644
--- a/src/includes/hashTable.h
+++ b/src/includes/hashTable.h
@@ -7,13 +7,13 @@
  *                    Wrapper for HAshTable data structure holding thread
  *                    specific region information.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -37,6 +37,7 @@
 #include <types.h>
 
 extern void hashTable_init();
+void hashTable_initThread(int coreID);
 extern int hashTable_get(bstring regionTag, LikwidThreadResults** result);
 extern void hashTable_finalize(int* numberOfThreads, int* numberOfRegions, LikwidResults** results);
 
diff --git a/src/includes/libperfctr_types.h b/src/includes/libperfctr_types.h
index 99a38dc..91f123b 100644
--- a/src/includes/libperfctr_types.h
+++ b/src/includes/libperfctr_types.h
@@ -5,13 +5,13 @@
  *
  *      Description:  Types file for libperfctr module.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -37,6 +37,7 @@ typedef struct LikwidThreadResults{
     bstring  label;
     double time;
     TimerData startTime;
+    int groupID;
     uint32_t count;
     double StartPMcounters[NUM_PMC];
     double PMcounters[NUM_PMC];
@@ -44,6 +45,7 @@ typedef struct LikwidThreadResults{
 
 typedef struct {
     bstring  tag;
+    int groupID;
     double*  time;
     uint32_t*  count;
     double** counters;
diff --git a/src/includes/likwid.h b/src/includes/likwid.h
index dd4cdfd..82d3b4e 100644
--- a/src/includes/likwid.h
+++ b/src/includes/likwid.h
@@ -3,15 +3,16 @@
  *
  *      Filename:  likwid.h
  *
- *      Description:  Header File of likwid marker API
+ *      Description:  Header File of likwid API
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
+ *
+ *      Authors:  Thomas Roehl (tr), thomas.roehl at googlemail.com
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -31,36 +32,1036 @@
 #ifndef LIKWID_H
 #define LIKWID_H
 
+#include <stdint.h>
+#include <errno.h>
+#include <string.h>
+
+#include <bstrlib.h>
+
+#define DEBUGLEV_ONLY_ERROR 0
+#define DEBUGLEV_INFO 1
+#define DEBUGLEV_DETAIL 2
+#define DEBUGLEV_DEVELOP 3
+
+extern int perfmon_verbosity;
+
+/** \addtogroup MarkerAPI Marker API module
+*  @{
+*/
+/*!
+\def LIKWID_MARKER_INIT
+Shortcut for likwid_markerInit() if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_MARKER_THREADINIT
+Shortcut for likwid_markerThreadInit() if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_MARKER_REGISTER(regionTag)
+Shortcut for likwid_markerRegisterRegion() with \a regionTag if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_MARKER_START(regionTag)
+Shortcut for likwid_markerStartRegion() with \a regionTag if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_MARKER_STOP(regionTag)
+Shortcut for likwid_markerStopRegion() with \a regionTag if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_MARKER_GET(regionTag, nevents, events, time, count)
+Shortcut for likwid_markerGetResults() for \a regionTag if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_MARKER_SWITCH
+Shortcut for likwid_markerNextGroup() if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/*!
+\def LIKWID_MARKER_CLOSE
+Shortcut for likwid_markerClose() if compiled with -DLIKWID_PERFMON. Otherwise no operation is performed
+*/
+/** @}*/
+
 #ifdef LIKWID_PERFMON
 #define LIKWID_MARKER_INIT likwid_markerInit()
 #define LIKWID_MARKER_THREADINIT likwid_markerThreadInit()
-#define LIKWID_MARKER_START(reg) likwid_markerStartRegion(reg)
-#define LIKWID_MARKER_STOP(reg) likwid_markerStopRegion(reg)
+#define LIKWID_MARKER_SWITCH likwid_markerNextGroup()
+#define LIKWID_MARKER_REGISTER(regionTag) likwid_markerRegisterRegion(regionTag)
+#define LIKWID_MARKER_START(regionTag) likwid_markerStartRegion(regionTag)
+#define LIKWID_MARKER_STOP(regionTag) likwid_markerStopRegion(regionTag)
 #define LIKWID_MARKER_CLOSE likwid_markerClose()
+#define LIKWID_MARKER_GET(regionTag, nevents, events, time, count) likwid_markerGetRegion(regionTag, nevents, events, time, count)
 #else
 #define LIKWID_MARKER_INIT
 #define LIKWID_MARKER_THREADINIT
-#define LIKWID_MARKER_START(reg)
-#define LIKWID_MARKER_STOP(reg)
+#define LIKWID_MARKER_SWITCH
+#define LIKWID_MARKER_REGISTER(regionTag)
+#define LIKWID_MARKER_START(regionTag)
+#define LIKWID_MARKER_STOP(regionTag)
 #define LIKWID_MARKER_CLOSE
+#define LIKWID_MARKER_GET(regionTag, nevents, events, time, count)
 #endif
 
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-/* marker API routines */
+
+
+/*
+################################################################################
+# Marker API related functions
+################################################################################
+*/
+/** \addtogroup MarkerAPI Marker API module
+*  @{
+*/
+/*! \brief Initialize LIKWID's marker API
+
+Must be called in serial region of the application to set up basic data structures
+of LIKWID. 
+Reads environment variables: 
+- LIKWID_MODE (access mode)
+- LIKWID_MASK (event bitmask)
+- LIKWID_EVENTS (event string)
+- LIKWID_THREADS (cpu list separated by ,)
+- LIKWID_GROUPS (amount of groups)
+*/
 extern void likwid_markerInit(void);
+/*! \brief Initialize LIKWID's marker API for the current thread
+
+Must be called in parallel region of the application to set up basic data structures
+of LIKWID. Before you can call likwid_markerThreadInit() you have to call likwid_markerInit().
+
+*/
 extern void likwid_markerThreadInit(void);
+/*! \brief Select next group to measure
+
+Must be called in parallel region of the application to switch group on every CPU.
+*/
+extern void likwid_markerNextGroup(void);
+/*! \brief Close LIKWID's marker API
+
+Must be called in serial region of the application. It gathers all data of regions and
+writes them out to a file (filepath in env variable LIKWID_FILEPATH).
+*/
 extern void likwid_markerClose(void);
-extern void likwid_markerStartRegion(const char* regionTag);
-extern void likwid_markerStopRegion(const char* regionTag);
+/*! \brief Register a measurement region
+
+Initializes the hashTable entry in order to reduce execution time of likwid_markerStartRegion()
+@param regionTag [in] Initialize data using this string
+@return Error code
+*/
+extern int likwid_markerRegisterRegion(const char* regionTag);
+/*! \brief Start a measurement region
 
+Reads the values of all configured counters and saves the results under the name given
+in regionTag.
+@param regionTag [in] Store data using this string
+@return Error code of start operation
+*/
+extern int likwid_markerStartRegion(const char* regionTag);
+/*! \brief Stop a measurement region
+
+Reads the values of all configured counters and saves the results under the name given
+in regionTag. The measurement data of the stopped region gets summed up in global region counters.
+@param regionTag [in] Store data using this string
+@return Error code of stop operation
+*/
+extern int likwid_markerStopRegion(const char* regionTag);
+
+/*! \brief Get accumulated data of a code region
+
+Get the accumulated data of the current thread for the given regionTag.
+@param regionTag [in] Print data using this string
+@param nr_events [in,out] Length of events array
+@param events [out] Events array for the intermediate results
+@param time [out] Accumulated measurement time
+@param count [out] Call count of the code region
+*/
+extern void likwid_markerGetRegion(const char* regionTag, int* nr_events, double* events, double *time, int *count);
 /* utility routines */
+/*! \brief Get CPU ID of the current process/thread
+
+Returns the ID of the CPU the current process or thread is running on.
+@return current CPU ID
+*/
 extern int  likwid_getProcessorId();
+/*! \brief Pin the current process to given CPU
+
+Pin the current process to the given CPU ID. The process cannot be scheduled to 
+another CPU after pinning but the pinning can be changed anytime with this function.
+@param [in] processorId CPU ID to pin the current process to
+@return error code (1 for success, 0 for error)
+*/
 extern int  likwid_pinProcess(int processorId);
+/*! \brief Pin the current thread to given CPU
+
+Pin the current thread to the given CPU ID. The thread cannot be scheduled to 
+another CPU after pinning but the pinning can be changed anytime with this function
+@param [in] processorId CPU ID to pin the current thread to
+@return error code (1 for success, 0 for error)
+*/
 extern int  likwid_pinThread(int processorId);
+/** @}*/
+
+/* 
+################################################################################
+# Access client related functions
+################################################################################
+*/
+/** \addtogroup AccessClient Access client module
+ *  @{
+ */
+
+/*! \brief Enum for the access modes
+
+LIKWID supports multiple access modes to the MSR and PCI performance monitoring
+registers. For direct access the user must have enough privileges to access the
+MSR and PCI devices. The daemon mode forwards the operations to a daemon with
+higher privileges.
+*/
+typedef enum {
+    ACCESSMODE_DIRECT = 0, /*!< \brief Access performance monitoring registers directly */
+    ACCESSMODE_DAEMON = 1 /*!< \brief Use the access daemon to access the registers */
+} AccessMode;
+
+/*! \brief Set accessClient mode
+
+Sets the mode in which the MSR and PCI registers are accessed: 0 for direct access (probably requires root privileges) and 1 for access through the access daemon. It must be called before accessClient_init().
+@param [in] mode (0=direct, 1=daemon)
+*/
+extern void accessClient_setaccessmode(int mode);
+/*! \brief Initialize socket for communication with the access daemon
+
+Initializes the file descriptor by starting and connecting to a new access daemon.
+@param [out] socket_fd Pointer to socket file descriptor
+*/
+extern void accessClient_init(int* socket_fd);
+/*! \brief Destroy socket for communication with the access daemon
+
+Destroys the file descriptor by disconnecting and shutting down the access daemon
+@param [in] socket_fd socket file descriptor
+*/
+extern void accessClient_finalize(int socket_fd);
+/** @}*/
+
+/*
+################################################################################
+# Config file related functions
+################################################################################
+*/
+/** \addtogroup Config Config file module
+*  @{
+*/
+/*! \brief Structure holding values of the configuration file
+
+LIKWID supports the definition of runtime values in a configuration file. The
+most important configurations in most cases are the path to the access daemon and
+the corresponding access mode. In order to avoid reading in the system topology
+at each start, a path to a topology file can be set. The other values are mostly
+used internally.
+*/
+typedef struct {
+    char* configFileName; /*!< \brief Path to the configuration file */
+    char* topologyCfgFileName; /*!< \brief Path to the topology file */
+    char* daemonPath; /*!< \brief Path of the access daemon */
+    AccessMode daemonMode; /*!< \brief Access mode to the MSR and PCI registers */
+    int maxNumThreads; /*!< \brief Maximum number of HW threads */
+    int maxNumNodes; /*!< \brief Maximum number of NUMA nodes */
+} Configuration;
+
+/** \brief Pointer for exporting the Configuration data structure */
+typedef Configuration* Configuration_t;
+/*! \brief Read the config file of LIKWID, if it exists
+
+Search for LIKWID config file and read the values in
+Currently the paths /usr/local/etc/likwid.cfg, /etc/likwid.cfg and the path
+defined in config.mk are checked.
+ at return error code (0 for success, -EFAULT if no file can be found)
+*/
+extern int init_configuration(void);
+/*! \brief Destroy the config structure
+
+Destroys the current config structure and frees all allocated memory for path names
+ at return error code (0 for success, -EFAULT if config structure not initialized)
+*/
+extern int destroy_configuration(void);
+
+
+/*! \brief Retrieve the config structure
+
+Get the initialized configuration
+\sa Configuration_t
+ at return Configuration_t (pointer to internal Configuration structure)
+*/
+extern Configuration_t get_configuration(void);
+/** @}*/
+/* 
+################################################################################
+# CPU topology related functions
+################################################################################
+*/
+/** \addtogroup CPUTopology CPU information module
+*  @{
+*/
+/*! \brief Structure with general CPU information
+
+General information covers CPU family, model, name and current clock and vendor 
+specific information like the version of Intel's performance monitoring facility.
+*/
+typedef struct {
+    uint32_t family; /*!< \brief CPU family ID*/
+    uint32_t model; /*!< \brief CPU model ID */
+    uint32_t stepping; /*!< \brief Stepping (version) of the CPU */
+    uint64_t clock; /*!< \brief Current clock frequency of the executing CPU*/
+    int      turbo; /*!< \brief Flag if CPU has a turbo mode */
+    char*  osname; /*!< \brief Name of the CPU reported by OS */
+    char*  name; /*!< \brief Name of the CPU as identified by LIKWID */
+    char*  short_name; /*!< \brief Short name of the CPU*/
+    char*  features; /*!< \brief String with all features supported by the CPU*/
+    int         isIntel; /*!< \brief Flag if it is an Intel CPU*/
+    int     supportUncore; /*!< \brief Flag if system has Uncore performance monitors */
+    uint32_t featureFlags; /*!< \brief Mask of all features supported by the CPU*/
+    uint32_t perf_version; /*!< \brief Version of Intel's performance monitoring facility */
+    uint32_t perf_num_ctr; /*!< \brief Number of general purpose core-local performance monitoring counters */
+    uint32_t perf_width_ctr; /*!< \brief Bit width of fixed and general purpose counters */
+    uint32_t perf_num_fixed_ctr; /*!< \brief Number of fixed purpose core-local performance monitoring counters */
+} CpuInfo;
+
+/*! \brief Structure with IDs of a HW thread
+
+For each HW thread this structure stores the ID of the thread inside a CPU, the
+CPU core ID of the HW thread and the CPU socket ID.
+\extends CpuTopology
+*/
+typedef struct {
+    uint32_t threadId; /*!< \brief ID of HW thread inside the CPU core */
+    uint32_t coreId; /*!< \brief ID of CPU core that executes the HW thread */
+    uint32_t packageId; /*!< \brief ID of CPU socket containing the HW thread */
+    uint32_t apicId; /*!< \brief ID of HW thread retrieved through the Advanced Programmable Interrupt Controller */
+    uint32_t inCpuSet; /*!< \brief Presumably a flag if the HW thread is part of the current CPU set (inferred from the member name, TODO confirm) */
+} HWThread;
+
+/*! \brief Enum of possible caches
+
+CPU caches can have different tasks and hold different kind of data. This enum lists all shapes used in all supported CPUs
+\extends CacheLevel
+*/
+typedef enum {
+    NOCACHE=0, /*!< \brief No cache used as undef value */
+    DATACACHE, /*!< \brief Cache holding data cache lines */
+    INSTRUCTIONCACHE, /*!< \brief Cache holding instruction cache lines */
+    UNIFIEDCACHE, /*!< \brief Cache holding both instruction and data cache lines */
+    ITLB, /*!< \brief Translation Lookaside Buffer cache for instruction pages */
+    DTLB /*!< \brief Translation Lookaside Buffer cache for data pages */
+} CacheType;
+
+/*! \brief Structure describing a cache level
+
+CPUs are connected to a cache hierarchy with different amount of caches at each level. The CacheLevel structure holds general information about the cache.
+\extends CpuTopology
+*/
+typedef struct {
+    uint32_t level; /*!< \brief Level of the cache in the hierarchy */
+    CacheType type; /*!< \brief Type of the cache */
+    uint32_t associativity; /*!< \brief Amount of cache lines held by each set */
+    uint32_t sets; /*!< \brief Amount of sets */
+    uint32_t lineSize; /*!< \brief Size in bytes of one cache line */
+    uint32_t size; /*!< \brief Size in bytes of the cache */
+    uint32_t threads; /*!< \brief Number of HW threads connected to the cache */
+    uint32_t inclusive; /*!< \brief Flag if cache is inclusive (holds also cache lines available in caches nearer to the CPU) or exclusive */
+} CacheLevel;
+
+/*! \brief Structure describing the topology of the HW threads in the system
+
+This structure describes the topology at HW thread level like the amount of HW threads, how they are distributed over the CPU sockets/packages and how the caching hierarchy is assembled.
+*/
+typedef struct {
+    uint32_t numHWThreads; /*!< \brief Amount of HW threads in the system and length of \a threadPool */
+    uint32_t activeHWThreads; /*!< \brief Amount of active HW threads in the system (what counts as active is not visible here, presumably the HW threads usable by the process; TODO confirm) */
+    uint32_t numSockets; /*!< \brief Amount of CPU sockets/packages in the system */
+    uint32_t numCoresPerSocket; /*!< \brief Amount of physical cores in one CPU socket/package */
+    uint32_t numThreadsPerCore; /*!< \brief Amount of HW threads in one physical CPU core */
+    uint32_t numCacheLevels; /*!< \brief Amount of caches for each HW thread and length of \a cacheLevels */
+    HWThread* threadPool; /*!< \brief List of all HW thread descriptions */
+    CacheLevel*  cacheLevels; /*!< \brief List of all caches in the hierarchy */
+    struct treeNode* topologyTree; /*!< \brief Anchor for a tree structure describing the system topology */
+} CpuTopology;
+
+/*! \brief Variable holding the global cpu information structure */
+extern CpuInfo cpuid_info;
+/*! \brief Variable holding the global cpu topology structure */
+extern CpuTopology cpuid_topology;
+
+/** \brief Pointer for exporting the CpuInfo data structure */
+typedef CpuInfo* CpuInfo_t;
+/** \brief Pointer for exporting the CpuTopology data structure */
+typedef CpuTopology* CpuTopology_t;
+/*! \brief Initialize topology information
+
+CpuInfo_t and CpuTopology_t are initialized by either HWLOC, CPUID/ProcFS or topology file if present. The topology file name can be configured in the configuration file. Furthermore, the paths /etc/likwid_topo.cfg and <PREFIX>/etc/likwid_topo.cfg are checked.
+\sa CpuInfo_t and CpuTopology_t
+ at return always 0
+*/
+extern int topology_init(void);
+/*! \brief Retrieve CPU topology of the current machine
+
+\sa CpuTopology_t
+ at return CpuTopology_t (pointer to internal cpuid_topology structure)
+*/
+extern CpuTopology_t get_cpuTopology(void);
+/*! \brief Retrieve CPU information of the current machine
+
+Get the previously initialized CPU info structure containing number of CPUs/Threads
+\sa CpuInfo_t
+ at return CpuInfo_t (pointer to internal cpuid_info structure)
+*/
+extern CpuInfo_t get_cpuInfo(void);
+/*! \brief Destroy topology structures CpuInfo_t and CpuTopology_t.
+
+Retrieved pointers to the structures are not valid anymore after this function call
+\sa CpuInfo_t and CpuTopology_t
+*/
+extern void topology_finalize(void);
+/*! \brief Print all supported architectures
+*/
+extern void print_supportedCPUs(void);
+/** @}*/
+/* 
+################################################################################
+# NUMA related functions
+################################################################################
+*/
+/** \addtogroup NumaTopology NUMA memory topology module
+ *  @{
+ */
+/*! \brief CPUs in NUMA node and general information about a NUMA domain
+
+The NumaNode structure describes the topology and holds general information of a
+NUMA node. The structure is filled by calling numa_init() by either the HWLOC 
+library or by evaluating the /proc filesystem.
+\extends NumaTopology
+*/
+typedef struct {
+    uint32_t id; /*!< \brief ID of the NUMA node */
+    uint64_t totalMemory; /*!< \brief Amount of memory in the NUMA node */
+    uint64_t freeMemory; /*!< \brief Amount of free memory in the NUMA node */
+    uint32_t numberOfProcessors; /*!< \brief Number of processors covered by the NUMA node and length of \a processors */
+    uint32_t*  processors; /*!< \brief List of HW threads in the NUMA node */
+    uint32_t*  processorsCompact; /*!< \brief Currently unused */
+    uint32_t numberOfDistances; /*!< \brief Amount of distances to the other NUMA nodes in the system and self  */
+    uint32_t*  distances; /*!< \brief List of distances to the other NUMA nodes and self */
+} NumaNode;
+
+
+/*! \brief  The NumaTopology structure describes all NUMA nodes in the current system.
+*/
+typedef struct {
+    uint32_t numberOfNodes; /*!< \brief Number of NUMA nodes in the system and length of \a nodes  */
+    NumaNode* nodes; /*!< \brief List of NUMA nodes */
+} NumaTopology;
+
+/*! \brief Variable holding the global NUMA information structure */
+extern NumaTopology numa_info;
+
+/** \brief Pointer for exporting the NumaTopology data structure */
+typedef NumaTopology* NumaTopology_t;
+
+/*! \brief Initialize NUMA information
+
+Initialize NUMA information NumaTopology_t using either HWLOC or CPUID/ProcFS. If
+a topology config file is present it is read at topology_init() and fills \a NumaTopology_t
+\sa NumaTopology_t
+ at return error code (0 for success, -1 if initialization failed)
+*/
+extern int numa_init(void);
+/*! \brief Retrieve NUMA information of the current machine
+
+Get the previously initialized NUMA info structure
+\sa NumaTopology_t
+ at return NumaTopology_t (pointer to internal numa_info structure)
+*/
+extern NumaTopology_t get_numaTopology(void);
+/*! \brief Set memory allocation policy to interleaved
+
+Set the memory allocation policy to interleaved for given list of CPUs
+ at param [in] processorList List of processors
+ at param [in] numberOfProcessors Length of processor list
+*/
+extern void numa_setInterleaved(int* processorList, int numberOfProcessors);
+/*! \brief Allocate memory from a specific NUMA node
+@param [in,out] ptr Start pointer of memory
+@param [in] size Size for the allocation
+@param [in] domainId ID of NUMA node for the allocation
+*/
+extern void numa_membind(void* ptr, size_t size, int domainId);
+/*! \brief Destroy NUMA information structure
+
+Destroys the NUMA information structure NumaTopology_t. Retrieved pointers
+to the structures are not valid anymore after this function call
+\sa NumaTopology_t
+*/
+extern void numa_finalize(void);
+/*! \brief Retrieve the number of NUMA nodes
+
+Returns the number of NUMA nodes of the current machine. Can also be read out of
+NumaTopology_t
+\sa NumaTopology_t
+ at return Number of NUMA nodes
+*/
+extern int likwid_getNumberOfNodes(void);
+/** @}*/
+/* 
+################################################################################
+# Affinity domains related functions
+################################################################################
+*/
+/** \addtogroup AffinityDomains Thread affinity module
+ *  @{
+ */
+
+/*! \brief The AffinityDomain data structure describes a single domain in the current system
+
+The AffinityDomain data structure describes a single domain in the current system. Example domains are NUMA nodes, CPU sockets/packages or LLC (Last Level Cache) cache domains.
+\extends AffinityDomains
+*/
+typedef struct {
+    bstring tag; /*!< \brief Bstring with the ID for the affinity domain. Currently possible values: N (node), SX (socket/package X), CX (LLC cache domain X) and MX (memory domain X) */
+    uint32_t numberOfProcessors; /*!< \brief Number of HW threads in the domain and length of \a processorList */
+    uint32_t numberOfCores; /*!< \brief Number of CPU cores in the domain */
+    int*  processorList; /*!< \brief List of HW thread IDs in the domain */
+} AffinityDomain;
+
+/*! \brief The AffinityDomains data structure holds different count variables describing the
+various system layers
+
+Affinity domains are for example the amount of NUMA domains, CPU sockets/packages or LLC 
+(Last Level Cache) cache domains of the current machine. Moreover a list of
+\a domains holds the processor lists for each domain that are used for
+scheduling processes to domain specific HW threads. Some amounts are duplicates
+or derivation of values in \a CpuInfo, \a CpuTopology and \a NumaTopology.
+*/
+typedef struct {
+    uint32_t numberOfSocketDomains; /*!< \brief Number of CPU sockets/packages in the system */
+    uint32_t numberOfNumaDomains; /*!< \brief Number of NUMA nodes in the system */
+    uint32_t numberOfProcessorsPerSocket; /*!< \brief Number of HW threads per socket/package in the system */
+    uint32_t numberOfCacheDomains; /*!< \brief Number of LLC caches in the system */
+    uint32_t numberOfCoresPerCache; /*!< \brief Number of CPU cores per LLC cache in the system */
+    uint32_t numberOfProcessorsPerCache; /*!< \brief Number of HW threads per LLC cache in the system */
+    uint32_t numberOfAffinityDomains; /*!< \brief Number of affinity domains in the current system  and length of \a domains array */
+    AffinityDomain* domains; /*!< \brief List of all domains in the system */
+} AffinityDomains;
+
+/** \brief Pointer for exporting the AffinityDomains data structure */
+typedef AffinityDomains* AffinityDomains_t;
+
+/*! \brief Initialize affinity information
+
+Initialize affinity information AffinityDomains_t using the data of the structures
+\a CpuInfo_t, CpuTopology_t and NumaTopology_t
+\sa AffinityDomains_t
+*/
+extern void affinity_init(void);
+/*! \brief Retrieve affinity structure
+
+Get the previously initialized affinity info structure
+\sa AffinityDomains_t
+ at return AffinityDomains_t (pointer to internal affinityDomains structure)
+*/
+extern AffinityDomains_t get_affinityDomains(void);
+/*! \brief Pin process to a CPU
+
+Pin process to a CPU. Duplicate of likwid_pinProcess()
+ at param [in] processorId CPU ID for pinning
+*/
+extern void affinity_pinProcess(int processorId);
+/*! \brief Pin thread to a CPU
+
+Pin thread to a CPU. Duplicate of likwid_pinThread()
+ at param [in] processorId CPU ID for pinning
+*/
+extern void affinity_pinThread(int processorId);
+/*! \brief Return the CPU ID where the current process runs.
+
+@return CPU ID
+*/
+extern int affinity_processGetProcessorId(void);
+/*! \brief Return the CPU ID where the current thread runs.
+
+@return CPU ID
+*/
+extern int affinity_threadGetProcessorId(void);
+/*! \brief Destroy affinity information structure
+
+Destroys the affinity information structure AffinityDomains_t. Retrieved pointers
+to the structures are not valid anymore after this function call
+\sa AffinityDomains_t
+*/
+extern void affinity_finalize(void);
+/** @}*/
+
+/*
+################################################################################
+# Performance monitoring related functions
+################################################################################
+*/
+/** \addtogroup PerfMon Performance monitoring module
+ *  @{
+ */
+/*! \brief Initialize performance monitoring facility
+
+Initialize the performance monitoring feature by creating basic data structures.
+The access mode must already be set when calling perfmon_init()
+ at param [in] nrThreads Amount of threads
+ at param [in] threadsToCpu List of CPUs
+ at return error code (0 on success, -ERRORCODE on failure)
+*/
+extern int perfmon_init(int nrThreads, int threadsToCpu[]);
+
+/*! \brief Initialize performance monitoring maps
+
+Initialize the performance monitoring maps for counters, events and Uncore boxes
+for the current architecture. topology_init() and numa_init() must be called before calling
+perfmon_init_maps()
+\sa RegisterMap list, PerfmonEvent list and BoxMap list
+*/
+extern void perfmon_init_maps(void);
+/*! \brief Add an event string to LIKWID
+
+An event string looks like Eventname:Countername(:Option1:Option2:...),...
+The eventname, countername and options are checked if they are available.
+ at param [in] eventCString Event string
+ at return Returns the ID of the new eventSet
+*/
+extern int perfmon_addEventSet(char* eventCString);
+/*! \brief Setup all performance monitoring counters of an eventSet
+
+An event string looks like Eventname:Countername(:Option1:Option2:...),...
+The eventname, countername and options are checked if they are available.
+@param [in] groupId ID of the eventSet (returned from perfmon_addEventSet())
+@return error code (-ENOENT if groupId is invalid and -1 if the counters of one CPU cannot be set up)
+*/
+extern int perfmon_setupCounters(int groupId);
+/*! \brief Start performance monitoring counters
+
+Start the counters that have been previously set up by perfmon_setupCounters().
+The counter registers are zeroed before enabling the counters
+ at return 0 on success and -(thread_id+1) for error
+*/
+extern int perfmon_startCounters(void);
+/*! \brief Stop performance monitoring counters 
+
+Stop the counters that have been previously started by perfmon_startCounters().
+All config registers get zeroed before reading the counter register.
+ at return 0 on success and -(thread_id+1) for error
+*/
+extern int perfmon_stopCounters(void);
+/*! \brief Read the performance monitoring counters on all CPUs
+
+Read the counters that have been previously started by perfmon_startCounters().
+The counters are stopped directly to avoid interference of LIKWID with the measured
+code. Before returning, the counters are started again.
+ at return 0 on success and -(thread_id+1) for error
+*/
+extern int perfmon_readCounters(void);
+/*! \brief Read the performance monitoring counters on one CPU
+
+Read the counters that have been previously started by perfmon_startCounters().
+The counters are stopped directly to avoid interference of LIKWID with the measured
+code. Before returning, the counters are started again. Only one CPU is read.
+ at param [in] cpu_id CPU ID of the CPU that should be read
+ at return 0 on success and -(thread_id+1) for error
+*/
+extern int perfmon_readCountersCpu(int cpu_id);
+/*! \brief Switch the active eventSet to a new one
+
+Stops the currently running counters, switches the eventSet by setting up the
+counters and start the counters.
+ at param [in] new_group ID of group that should be switched to.
+ at return 0 on success and -(thread_id+1) for error
+*/
+extern int perfmon_switchActiveGroup(int new_group);
+/*! \brief Close the performance monitoring facility of LIKWID
+
+Deallocates all internal data that is used during performance monitoring. Also
+the counter values are not accessible after this function.
+*/
+extern void perfmon_finalize(void);
+/*! \brief Get the results of the specified group, counter and thread
+
+Get the result of the last measurement cycle. The function accounts for counter
+overflows and applies multipliers to the counter values where required.
+ at param [in] groupId ID of the group that should be read
+ at param [in] eventId ID of the event that should be read
+ at param [in] threadId ID of the thread/cpu that should be read
+ at return The counter result
+*/
+extern double perfmon_getResult(int groupId, int eventId, int threadId);
+/*! \brief Get the number of configured event groups
+
+ at return Number of groups
+*/
+extern int perfmon_getNumberOfGroups(void);
+/*! \brief Get the number of configured events in a group
+
+@param [in] groupId ID of group
+@return Number of events
+*/
+extern int perfmon_getNumberOfEvents(int groupId);
+/*! \brief Get the measurement time of a group
+
+ at param [in] groupId ID of group
+ at return Time in seconds the event group was measured
+*/
+extern double perfmon_getTimeOfGroup(int groupId);
+/*! \brief Get the ID of the currently set up event group
+
+ at return Number of active group
+*/
+extern int perfmon_getIdOfActiveGroup(void);
+/*! \brief Get the number of threads specified at perfmon_init()
+
+ at return Number of threads
+*/
+extern int perfmon_getNumberOfThreads(void);
+/** @}*/
+
+/*
+################################################################################
+# Time measurements related functions
+################################################################################
+*/
+
+/** \addtogroup TimerMon Time measurement module
+ *  @{
+ */
+
+/*! \brief Union holding a cycle count, accessible as one 64 bit value or as two 32 bit halves
+\extends TimerData
+*/
+typedef union
+{
+    uint64_t int64; /*!< \brief Cycle count in 64 bit */
+    struct {uint32_t lo, hi;} int32; /*!< \brief Cycle count stored in two 32 bit fields */
+} TscCounter;
+
+/*! \brief Struct defining the start and stop time of a time interval
+*/
+typedef struct {
+    TscCounter start; /*!< \brief Cycles at start */
+    TscCounter stop; /*!< \brief Cycles at stop */
+} TimerData;
+
+/*! \brief Initialize timer by retrieving baseline frequency and cpu clock
+*/
+extern void timer_init( void );
+/*! \brief Return the measured interval in seconds
+
+ at param [in] time Structure holding the cycle count at start and stop
+ at return Time in seconds
+*/
+extern double timer_print( TimerData* time);
+/*! \brief Return the measured interval in cycles
+
+ at param [in] time Structure holding the cycle count at start and stop
+ at return Time in cycles
+*/
+extern uint64_t timer_printCycles( TimerData* time);
+/*! \brief Return the CPU clock determined at timer_init
+
+ at return CPU clock
+*/
+extern uint64_t timer_getCpuClock( void );
+/*! \brief Return the baseline CPU clock determined at timer_init
+
+ at return Baseline CPU clock
+*/
+extern uint64_t timer_getBaseline( void );
+/*! \brief Start time measurement
+
+ at param [in,out] time Structure holding the cycle count at start
+*/
+extern void timer_start( TimerData* time );
+/*! \brief Stop time measurement
+
+ at param [in,out] time Structure holding the cycle count at stop
+*/
+extern void timer_stop ( TimerData* time);
+
+/** @}*/
+
+/* 
+################################################################################
+# Power measurements related functions
+################################################################################
+*/
+/** \addtogroup PowerMon Power and Energy monitoring module
+ *  @{
+ */
+
+/*!
+\def NUM_POWER_DOMAINS
+Amount of currently supported RAPL domains
+*/
+#define NUM_POWER_DOMAINS 4
+/*! \brief List of all RAPL domain names
+*/
+extern const char* power_names[NUM_POWER_DOMAINS];
+
+/*!
+\def POWER_DOMAIN_SUPPORT_STATUS
+Flag to check in PowerDomain's supportFlag if the status msr registers are available
+*/
+#define POWER_DOMAIN_SUPPORT_STATUS (1ULL<<0)
+/*!
+\def POWER_DOMAIN_SUPPORT_LIMIT
+Flag to check in PowerDomain's supportFlag if the limit msr registers are available
+*/
+#define POWER_DOMAIN_SUPPORT_LIMIT (1ULL<<1)
+/*!
+\def POWER_DOMAIN_SUPPORT_POLICY
+Flag to check in PowerDomain's supportFlag if the policy msr registers are available
+*/
+#define POWER_DOMAIN_SUPPORT_POLICY (1ULL<<2)
+/*!
+\def POWER_DOMAIN_SUPPORT_PERF
+Flag to check in PowerDomain's supportFlag if the perf msr registers are available
+*/
+#define POWER_DOMAIN_SUPPORT_PERF (1ULL<<3)
+/*!
+\def POWER_DOMAIN_SUPPORT_INFO
+Flag to check in PowerDomain's supportFlag if the info msr registers are available
+*/
+#define POWER_DOMAIN_SUPPORT_INFO (1ULL<<4)
+
+
+/*! \brief Information structure of CPU's turbo mode
+\extends PowerInfo
+*/
+typedef struct {
+    int numSteps; /*!< \brief Amount of turbo mode steps/frequencies */
+    double* steps; /*!< \brief List of turbo mode steps */
+} TurboBoost;
+
+/*! \brief Enum for all supported RAPL domains
+\extends PowerDomain
+*/
+typedef enum {
+    PKG = 0, /*!< \brief PKG domain, mostly one CPU socket/package */
+    PP0 = 1, /*!< \brief PP0 domain, not clearly defined by Intel */
+    PP1 = 2, /*!< \brief PP1 domain, not clearly defined by Intel */
+    DRAM = 3 /*!< \brief DRAM domain, the memory modules */
+} PowerType;
+
+/*! \brief Structure describing an RAPL power domain
+\extends PowerInfo
+*/
+typedef struct {
+    PowerType type; /*!< \brief Identifier which RAPL domain is managed by this struct */
+    uint32_t supportFlags; /*!< \brief Bitmask which features are supported by the power domain */
+    double energyUnit; /*!< \brief Multiplier for energy measurements */
+    double tdp; /*!< \brief Thermal Design Power (maximum amount of heat generated by the CPU) */
+    double minPower; /*!< \brief Minimal power consumption of the CPU */
+    double maxPower; /*!< \brief Maximal power consumption of the CPU */
+    double maxTimeWindow; /*!< \brief Minimal power measurement interval */
+} PowerDomain;
+
+/*! \brief Information structure of CPU's power measurement facility
+*/
+typedef struct {
+    double baseFrequency; /*!< \brief Base frequency of the CPU */
+    double minFrequency; /*!< \brief Minimal frequency of the CPU */
+    TurboBoost turbo; /*!< \brief Turbo boost information */
+    int hasRAPL; /*!< \brief RAPL support flag */
+    double powerUnit; /*!< \brief Multiplier for power measurements */
+    double timeUnit; /*!< \brief Multiplier for time information */
+    PowerDomain domains[NUM_POWER_DOMAINS]; /*!< \brief List of power domains */
+} PowerInfo;
+
+/*! \brief Power measurement data for start/stop measurements
+*/
+typedef struct {
+    int domain; /*!< \brief RAPL domain identifier */
+    uint32_t before; /*!< \brief Counter state at start */
+    uint32_t after; /*!< \brief Counter state at stop */
+} PowerData;
+
+/*! \brief Variable holding the global power information structure */
+extern PowerInfo power_info;
+
+/** \brief Pointer for exporting the PowerInfo data structure */
+typedef PowerInfo* PowerInfo_t;
+/** \brief Pointer for exporting the PowerData data structure */
+typedef PowerData* PowerData_t;
+
+/*! \brief Initialize power measurements on specific CPU
+
+Additionally, it reads basic information about the power measurements like 
+minimal measurement time.
+ at param [in] cpuId Initialize power facility for this CPU
+ at return error code
+*/
+extern int power_init(int cpuId);
+/*! \brief Get a pointer to the power facility information
+
+ at return PowerInfo_t pointer
+\sa PowerInfo_t
+*/
+extern PowerInfo_t get_powerInfo(void);
+/*! \brief Read the current power value
+
+ at param [in] cpuId Read power facility for this CPU
+ at param [in] reg Power register
+ at param [out] data Power data
+*/
+extern int power_read(int cpuId, uint64_t reg, uint32_t *data);
+/*! \brief Read the current power value using a specific communication socket
+
+ at param [in] socket_fd Communication socket for the read operation
+ at param [in] cpuId Read power facility for this CPU
+ at param [in] reg Power register
+ at param [out] data Power data
+*/
+extern int power_tread(int socket_fd, int cpuId, uint64_t reg, uint32_t *data);
+/*! \brief Start power measurements
+
+ at param [in,out] data Data structure holding start and stop values for power measurements
+ at param [in] cpuId Start power facility for this CPU
+ at param [in] type Which type should be measured
+ at return error code
+*/
+extern int power_start(PowerData_t data, int cpuId, PowerType type);
+/*! \brief Stop power measurements
+
+@param [in,out] data Data structure holding start and stop values for power measurements
+@param [in] cpuId Stop power facility for this CPU
+@param [in] type Which type should be measured
+@return error code
+*/
+extern int power_stop(PowerData_t data, int cpuId, PowerType type);
+/*! \brief Print power measurements gathered by power_start() and power_stop()
+
+ at param [in] data Data structure holding start and stop values for power measurements
+ at return Consumed energy in Joules
+*/
+extern double power_printEnergy(PowerData* data);
+/*! \brief Get the energy unit of a RAPL domain
+
+@param [in] domain RAPL domain ID
+@return Energy unit of the given RAPL domain
+*/
+extern double power_getEnergyUnit(int domain);
+
+/*! \brief Get the values of the limit register of a domain
+
+ at param [in] cpuId CPU ID
+ at param [in] domain RAPL domain ID
+ at param [out] power Power limit
+ at param [out] time Time limit
+ at return error code
+*/
+int power_limitGet(int cpuId, PowerType domain, double* power, double* time);
+
+/*! \brief Set the values of the limit register of a domain
+
+ at param [in] cpuId CPU ID
+ at param [in] domain RAPL domain ID
+ at param [in] power Power limit
+ at param [in] time Time limit
+ at param [in] doClamping Activate clamping (going below OS-requested power level)
+ at return error code
+*/
+int power_limitSet(int cpuId, PowerType domain, double power, double time, int doClamping);
+
+/*! \brief Get the state of a power limit, activated or deactivated
+
+ at param [in] cpuId CPU ID
+ at param [in] domain RAPL domain ID
+ at return state, 1 for active, 0 for inactive
+*/
+int power_limitState(int cpuId, PowerType domain);
+
+/*! \brief Free space of power_unit
+*/
+extern void power_finalize(void);
+/** @}*/
+
+/* 
+################################################################################
+# Thermal measurements related functions
+################################################################################
+*/
+/** \addtogroup ThermalMon Thermal monitoring module
+ *  @{
+ */
+/*! \brief Initialize thermal measurements on specific CPU
+
+ at param [in] cpuId Initialize thermal facility for this CPU
+*/
+extern void thermal_init(int cpuId);
+/*! \brief Read the current thermal value
+
+ at param [in] cpuId Read thermal facility for this CPU
+ at param [out] data Thermal data
+*/
+extern int thermal_read(int cpuId, uint32_t *data);
+/*! \brief Read the current thermal value using a specific communication socket
+
+ at param [in] socket_fd Communication socket for the read operation
+ at param [in] cpuId Read thermal facility for this CPU
+ at param [out] data Thermal data
+*/
+extern int thermal_tread(int socket_fd, int cpuId, uint32_t *data);
+/** @}*/
+
+/* 
+################################################################################
+# Timeline daemon related functions
+################################################################################
+*/
+/** \addtogroup Daemon Timeline daemon module
+ *  @{
+ */
+/*! \brief Start timeline daemon
+
+Starts the timeline daemon which reads and prints the counter values after each \a duration time
+ at param [in] duration Time interval in ns
+ at param [in] outfile File to write the intermediate readings or NULL to write to stderr
+ at return 0 on success and -EFAULT if counters cannot be started
+*/
+extern int daemon_start(uint64_t duration, const char* outfile);
+/*! \brief Stop timeline daemon
+
+Stop the timeline daemon using the signal \a sig
+@param [in] sig Signal code to kill the daemon (see signal.h for signal codes)
+@return 0 on success and the negative error code on failure
+*/
+extern int daemon_stop(int sig);
+/** @}*/
+
+/* 
+################################################################################
+# Memory sweeping related functions
+################################################################################
+*/
+/** \addtogroup MemSweep Memory sweeping module
+ *  @{
+ */
+/*! \brief Sweeping the memory of a NUMA node
+
+Sweeps (zeros) the memory of NUMA node with ID \a domainId
+ at param [in] domainId NUMA node ID
+*/
+extern void memsweep_domain(int domainId);
+/*! \brief Sweeping the memory of all NUMA nodes covered by CPU list
 
+Sweeps (zeros) the memory of all NUMA nodes containing the CPUs in \a processorList
+ at param [in] processorList List of CPU IDs
+ at param [in] numberOfProcessors Number of CPUs in list
+*/
+extern void memsweep_threadGroup(int* processorList, int numberOfProcessors);
+/** @}*/
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/includes/lock.h b/src/includes/lock.h
index 87d1593..b47ed73 100644
--- a/src/includes/lock.h
+++ b/src/includes/lock.h
@@ -5,13 +5,13 @@
  *
  *      Description:  Header File Locking primitive Module
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -41,7 +41,7 @@
 #define LOCK_INIT -1
 #define STRINGIFY(x) #x
 #define TOSTRING(x) STRINGIFY(x)
-#define LIKWIDLOCK  /var/run/likwid.lock
+#define LIKWIDLOCK  /tmp/likwid.lock
 
 static inline int lock_acquire(int* var, int newval)
 {
diff --git a/src/includes/memsweep.h b/src/includes/memsweep.h
index e29d4d8..1e714b7 100644
--- a/src/includes/memsweep.h
+++ b/src/includes/memsweep.h
@@ -3,15 +3,16 @@
  *
  *      Filename:  memsweep.h
  *
- *      Description:  Header File memsweep Module. 
+ *      Description:  Header File memsweep module for internal use. External functions are
+ *                    defined in likwid.h
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -34,9 +35,7 @@
 #include <types.h>
 
 extern void memsweep_setMemoryFraction(uint64_t fraction);
-extern void memsweep_node(FILE* OUTSTREAM);
-extern void memsweep_domain(FILE* OUTSTREAM, int domainId);
-extern void memsweep_threadGroup(FILE* OUTSTREAM, int* processorList, int numberOfProcessors);
+extern void memsweep_node(void);
 
 #endif /* MEMSWEEP_H */
 
diff --git a/src/includes/msr.h b/src/includes/msr.h
index 45f8069..e93e0a5 100644
--- a/src/includes/msr.h
+++ b/src/includes/msr.h
@@ -3,15 +3,15 @@
  *
  *      Filename:  msr.h
  *
- *      Description:  Header File msr Module. 
+ *      Description:  Header File msr Module.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -35,13 +35,13 @@
 
 /* Initializes the MSR module, trying to open either the MSR files or
  * the connection to the msr daemon. */
-extern void msr_init(int socket_fd);
+extern int msr_init(int socket_fd);
 extern void msr_finalize(void);
-extern uint64_t msr_read(int cpu, uint32_t reg);
-extern void msr_write(int cpu, uint32_t reg, uint64_t data);
+extern int msr_read(int cpu, uint32_t reg, uint64_t *data);
+extern int msr_write(int cpu, uint32_t reg, uint64_t data);
 
 /* variants for thread safe execution with a per thread socket */
-extern uint64_t msr_tread(int socket_fd, int cpu, uint32_t reg);
-extern void msr_twrite(int socket_fd, int cpu, uint32_t reg, uint64_t data);
+extern int msr_tread(int socket_fd, int cpu, uint32_t reg, uint64_t *data);
+extern int msr_twrite(int socket_fd, int cpu, uint32_t reg, uint64_t data);
 
 #endif /* MSR_H */
diff --git a/src/includes/multiplex.h b/src/includes/multiplex.h
deleted file mode 100644
index c34cac8..0000000
--- a/src/includes/multiplex.h
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  multiplex.h
- *
- *      Description:  Header File multiplex Module. 
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-#ifndef MULTIPLEX_H
-#define MULTIPLEX_H
-
-#include <types.h>
-
-extern void multiplex_init(MultiplexCollections* set);
-extern void multiplex_start();
-extern void multiplex_stop();
-
-#endif /* MULTIPLEX_H */
diff --git a/src/includes/multiplex_types.h b/src/includes/multiplex_types.h
deleted file mode 100644
index 8578a8f..0000000
--- a/src/includes/multiplex_types.h
+++ /dev/null
@@ -1,42 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  multiplex_types.h
- *
- *      Description:  Types file for multiplex  module.
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-#ifndef MULTIPLEX_TYPES_H
-#define MULTIPLEX_TYPES_H
-
-typedef struct {
-    PerfmonEventSet* collections;
-    int numberOfCollections;
-    double time;
-} MultiplexCollections;
-
-
-
-#endif /* MULTIPLEX_TYPES_H */
diff --git a/src/includes/numa.h b/src/includes/numa.h
index 3a2d0f1..9cf0458 100644
--- a/src/includes/numa.h
+++ b/src/includes/numa.h
@@ -3,15 +3,16 @@
  *
  *      Filename:  numa.h
  *
- *      Description:  Header File numa Module. 
+ *      Description:  Header File NUMA module for internal use. External functions are
+ *                    defined in likwid.h
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -28,18 +29,30 @@
  * =======================================================================================
  */
 
-#ifndef NUMA_H
-#define NUMA_H
+#ifndef LIKWID_NUMA
+#define LIKWID_NUMA
+
+#include <stdlib.h>
+#include <stdio.h>
 
 #include <types.h>
+#include <likwid.h>
+#include <numa_hwloc.h>
+#include <numa_proc.h>
+
+
+
+
+extern int str2int(const char* str);
+
+struct numa_functions {
+    int (*numa_init) (void);
+    void (*numa_setInterleaved) (int*, int);
+    void (*numa_membind) (void*, size_t, int);
+};
+
+
 
-/** Structure holding numa information
- *
- */
-extern NumaTopology numa_info;
 
-extern int numa_init (void);
-extern void numa_setInterleaved(int* processorList, int numberOfProcessors);
-extern void numa_membind(void* ptr, size_t size, int domainId);
 
-#endif /*NUMA_H*/
+#endif
diff --git a/src/includes/numa_hwloc.h b/src/includes/numa_hwloc.h
new file mode 100644
index 0000000..1545c08
--- /dev/null
+++ b/src/includes/numa_hwloc.h
@@ -0,0 +1,40 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  numa_hwloc.h
+ *
+ *      Description:  Header File hwloc NUMA backend
+ *
+ *      Version:   4.0
+ *      Released:  16.6.2015
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+
+#ifndef LIKWID_NUMA_HWLOC
+#define LIKWID_NUMA_HWLOC
+
+extern int hwloc_numa_init(void);
+extern void hwloc_numa_membind(void* ptr, size_t size, int domainId);
+extern void hwloc_numa_setInterleaved(int* processorList, int numberOfProcessors);
+
+
+#endif
diff --git a/src/includes/numa_proc.h b/src/includes/numa_proc.h
new file mode 100644
index 0000000..e52ed86
--- /dev/null
+++ b/src/includes/numa_proc.h
@@ -0,0 +1,39 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  numa_proc.h
+ *
+ *      Description:  Header File procfs/sysfs NUMA backend
+ *
+ *      Version:   4.0
+ *      Released:  16.6.2015
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#ifndef LIKWID_NUMA_PROC
+#define LIKWID_NUMA_PROC
+
+extern int proc_numa_init(void);
+extern void proc_numa_membind(void* ptr, size_t size, int domainId);
+extern void proc_numa_setInterleaved(int* processorList, int numberOfProcessors);
+
+
+#endif
diff --git a/src/includes/numa_types.h b/src/includes/numa_types.h
deleted file mode 100644
index bd4afda..0000000
--- a/src/includes/numa_types.h
+++ /dev/null
@@ -1,52 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  numa_types.h
- *
- *      Description:  Types file for numa module.
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-#ifndef NUMA_TYPES_H
-#define NUMA_TYPES_H
-
-
-typedef struct {
-    int id;
-    uint64_t totalMemory;
-    uint64_t freeMemory;
-    int numberOfProcessors;
-    uint32_t* processors;
-    uint32_t* processorsCompact;
-    int numberOfDistances;
-    uint32_t* distances;
-} NumaNode;
-
-typedef struct {
-    uint32_t numberOfNodes;
-    NumaNode* nodes;
-} NumaTopology;
-
-
-#endif /*NUMA_TYPES_H*/
diff --git a/src/includes/pci.h b/src/includes/pci.h
index 1672f1c..166609a 100644
--- a/src/includes/pci.h
+++ b/src/includes/pci.h
@@ -3,15 +3,15 @@
  *
  *      Filename:  pci.h
  *
- *      Description:  Header File pci Module. 
+ *      Description:  Header File pci Module.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -39,11 +39,12 @@
  * Listing for Uncore devices DEVICE.FUNCTION
  */
 
-extern void pci_init();
+extern int pci_init(int socket_fd);
 extern void pci_finalize();
-extern uint32_t pci_read(int cpu, PciDeviceIndex index, uint32_t reg);
-extern void pci_write(int cpu, PciDeviceIndex index, uint32_t reg, uint32_t data);
-extern uint32_t pci_tread(int socket_fd, int cpu, PciDeviceIndex index, uint32_t reg);
-extern void pci_twrite(int socket_fd, int cpu, PciDeviceIndex index, uint32_t reg, uint32_t data);
+extern int pci_checkDevice(PciDeviceIndex index, int cpu);
+extern int pci_read(int cpu, PciDeviceIndex index, uint32_t reg, uint32_t *data);
+extern int pci_write(int cpu, PciDeviceIndex index, uint32_t reg, uint32_t data);
+extern int pci_tread(int socket_fd, int cpu, PciDeviceIndex index, uint32_t reg, uint32_t *data);
+extern int pci_twrite(int socket_fd, int cpu, PciDeviceIndex index, uint32_t reg, uint32_t data);
 
 #endif /* PCI_H */
diff --git a/src/includes/pci_hwloc.h b/src/includes/pci_hwloc.h
new file mode 100644
index 0000000..c8a66c6
--- /dev/null
+++ b/src/includes/pci_hwloc.h
@@ -0,0 +1,37 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  pci_hwloc.h
+ *
+ *      Description:  Header File hwloc based PCI lookup backend
+ *
+ *      Version:   4.0
+ *      Released:  16.6.2015
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#ifndef PCI_HWLOC_H
+#define PCI_HWLOC_H
+
+extern int hwloc_pci_init(uint16_t testDevice, char** socket_bus, int* nrSockets);
+
+
+#endif
diff --git a/src/includes/pci_proc.h b/src/includes/pci_proc.h
new file mode 100644
index 0000000..e680b7a
--- /dev/null
+++ b/src/includes/pci_proc.h
@@ -0,0 +1,37 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  pci_proc.h
+ *
+ *      Description:  Header File procfs based PCI lookup backend
+ *
+ *      Version:   4.0
+ *      Released:  16.6.2015
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#ifndef PCI_PROC_H
+#define PCI_PROC_H
+
+extern int proc_pci_init(uint16_t testDevice, char** socket_bus, int* nrSockets);
+
+
+#endif
diff --git a/src/includes/pci_types.h b/src/includes/pci_types.h
index cfb9657..1fb1fec 100644
--- a/src/includes/pci_types.h
+++ b/src/includes/pci_types.h
@@ -5,13 +5,14 @@
  *
  *      Description:  Types file for pci module.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -34,23 +35,69 @@
 
 #include <stdint.h>
 
+
+
+typedef enum {
+    NONE = 0,
+    R3QPI,
+    R2PCIE,
+    IMC,
+    HA,
+    QPI,
+    IRP,
+    MAX_NUM_PCI_TYPES
+} PciDeviceType;
+
 typedef enum {
-    PCI_R3QPI_DEVICE_LINK_0 = 0,
+    MSR_DEV = 0,
+    PCI_R3QPI_DEVICE_LINK_0,
     PCI_R3QPI_DEVICE_LINK_1,
+    PCI_R3QPI_DEVICE_LINK_2,
     PCI_R2PCIE_DEVICE,
-    PCI_IMC_DEVICE_CH_0,
-    PCI_IMC_DEVICE_CH_1,
-    PCI_IMC_DEVICE_CH_2,
-    PCI_IMC_DEVICE_CH_3,
-    PCI_HA_DEVICE,
+    PCI_IMC_DEVICE_0_CH_0,
+    PCI_IMC_DEVICE_0_CH_1,
+    PCI_IMC_DEVICE_0_CH_2,
+    PCI_IMC_DEVICE_0_CH_3,
+    PCI_HA_DEVICE_0,
+    PCI_HA_DEVICE_1,
     PCI_QPI_DEVICE_PORT_0,
     PCI_QPI_DEVICE_PORT_1,
+    PCI_QPI_DEVICE_PORT_2,
     PCI_QPI_MASK_DEVICE_PORT_0,
     PCI_QPI_MASK_DEVICE_PORT_1,
+    PCI_QPI_MASK_DEVICE_PORT_2,
     PCI_QPI_MISC_DEVICE_PORT_0,
     PCI_QPI_MISC_DEVICE_PORT_1,
-    MAX_NUM_DEVICES
+    PCI_QPI_MISC_DEVICE_PORT_2,
+    PCI_IMC_DEVICE_1_CH_0,
+    PCI_IMC_DEVICE_1_CH_1,
+    PCI_IMC_DEVICE_1_CH_2,
+    PCI_IMC_DEVICE_1_CH_3,
+    PCI_IRP_DEVICE,
+    MAX_NUM_PCI_DEVICES
 } PciDeviceIndex;
 
+typedef struct {
+    PciDeviceType type;
+    char *path;
+    char *name;
+    char *likwid_name;
+    uint32_t devid;
+    int  online;
+} PciDevice;
+
+typedef struct {
+    char* name;
+    char* desc;
+} PciType;
+
 
+static PciType pci_types[MAX_NUM_PCI_TYPES] = {
+    [R3QPI] = {"R3QPI", "R3QPI is the interface between the Intel QPI Link Layer and the Ring."},
+    [R2PCIE] = {"R2PCIE", "R2PCIe represents the interface between the Ring and IIO traffic to/from PCIe."},
+    [IMC] = {"IMC", "The integrated Memory Controller provides the interface to DRAM and communicates to the rest of the uncore through the Home Agent."},
+    [HA] = {"HA", "The HA is responsible for the protocol side of memory interactions."},
+    [QPI] = {"QPI", "The Intel QPI Link Layer is responsible for packetizing requests from the caching agent on the way out to the system interface."},
+    [IRP] = {"IRP", "IRP is responsible for maintaining coherency for IIO traffic e.g. crosssocket P2P."}
+};
 #endif /*PCI_TYPES_H*/
diff --git a/src/includes/perfmon.h b/src/includes/perfmon.h
index 6e9d9f9..34b75e2 100644
--- a/src/includes/perfmon.h
+++ b/src/includes/perfmon.h
@@ -7,13 +7,14 @@
  *                    Configures and reads out performance counters
  *                    on x86 based architectures. Supports multi threading.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -33,76 +34,25 @@
 #ifndef PERFMON_H
 #define PERFMON_H
 
-#include <bstrlib.h>
-#include <types.h>
-
-extern int perfmon_verbose;
-
-extern void (*perfmon_startCountersThread) (int thread_id);
-extern void (*perfmon_stopCountersThread) (int thread_id);
-extern int  (*perfmon_getIndex) (bstring reg, PerfmonCounterIndex* index);
-extern void (*perfmon_setupCounterThread) (int thread_id, PerfmonEvent* event , PerfmonCounterIndex index);
-
-extern void perfmon_initEventSet(StrUtilEventSet* eventSetConfig, PerfmonEventSet* set);
-extern void perfmon_setCSVMode(int v);
-extern void perfmon_printAvailableGroups(void);
-extern void perfmon_printGroupHelp(bstring group);
-extern void perfmon_init(int numThreads, int threads[],FILE* outstream);
-extern void perfmon_finalize(void);
-extern void perfmon_setupEventSet(bstring eventString, BitMask* mask);
-extern double perfmon_getEventResult(int thread, int index);
-extern int perfmon_setupEventSetC(char* eventCString, const char*** eventnames);
-
-
-/*
-The following structure and set of functions provide an efficient and easy interface to
-access counters from different groups and switch between them.
-
-TODO: The internals need some cleanup, but the interface should remain rather stable.
-
-Usage:
-setup = perfmon_prepareEventSetup("VIEW"), etc..
-Whenever you want to use one of the prepared setups call:
-perfmon_setupCountersForEventSet(setup)
-
-then you can startCounters, stopCounters and then
-perfmon_getEventCounterValues() and/or
-perfmon_getDerivedCounterValues()
- */
-typedef struct {
-    const char* groupName;
-    int numberOfEvents;
-    const char** eventNames;
-    int numberOfDerivedCounters;
-    const char** derivedNames;    
-
-    // Internal structures DO NOT ACCESS THEM, they need cleanup.
-    StrUtilEventSet* eventSetConfig;
-    PerfmonEventSet* perfmon_set;
-    PerfmonGroup groupSet;
-    int groupIndex;
-} EventSetup;
 
+#include <types.h>
+#include <likwid.h>
 
-extern EventSetup perfmon_prepareEventSetup(char* eventGroupString);
-extern void perfmon_setupCountersForEventSet(EventSetup * setup);
+#define FREEZE_FLAG_ONLYFREEZE 0x0ULL
+#define FREEZE_FLAG_CLEAR_CTR (1ULL<<1)
+#define FREEZE_FLAG_CLEAR_CTL (1ULL<<0)
 
-// obtain values for all cores, average, min and max for the cores.
-extern void perfmon_getEventCounterValues(uint64_t* avg_values, uint64_t* max, uint64_t* min);
-extern void perfmon_getDerivedCounterValues(float* avg_values, float* max, float* min);
-/////////////////////////
+extern int (*perfmon_startCountersThread) (int thread_id, PerfmonEventSet* eventSet);
+extern int (*perfmon_stopCountersThread) (int thread_id, PerfmonEventSet* eventSet);
+extern int (*perfmon_setupCountersThread) (int thread_id, PerfmonEventSet* eventSet);
+extern int (*perfmon_readCountersThread) (int thread_id, PerfmonEventSet* eventSet);
+extern int (*perfmon_finalizeCountersThread) (int thread_id, PerfmonEventSet* eventSet);
+extern int (*initThreadArch) (int cpu_id);
 
-extern void perfmon_setupCounters(void);
-extern void perfmon_startCounters(void);
-extern void perfmon_stopCounters(void);
-extern void perfmon_readCounters(void);
-extern double perfmon_getResult(int threadId, char* counterString);
-extern void perfmon_printMarkerResults(bstring filepath);
-extern void perfmon_logCounterResults(double time);
-extern void perfmon_printCounterResults(void);
 
+/* Internal helpers */
+extern int getCounterTypeOffset(int index);
+extern uint64_t perfmon_getMaxCounterValue(RegisterType type);
 
-extern void perfmon_printCounters(void);
-extern void perfmon_printEvents(void);
 
 #endif /*PERFMON_H*/
diff --git a/src/includes/perfmon_atom.h b/src/includes/perfmon_atom.h
index 201cea6..38330a9 100644
--- a/src/includes/perfmon_atom.h
+++ b/src/includes/perfmon_atom.h
@@ -5,13 +5,13 @@
  *
  *      Description:  Header file of perfmon module for Atom
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -29,8 +29,7 @@
  */
 
 #include <perfmon_atom_events.h>
-#include <perfmon_atom_groups.h>
+#include <error.h>
 
-static int perfmon_numGroupsAtom = NUM_GROUPS_ATOM;
 static int perfmon_numArchEventsAtom = NUM_ARCH_EVENTS_ATOM;
 
diff --git a/src/includes/perfmon_atom_events.txt b/src/includes/perfmon_atom_events.txt
index 4ca18e4..336bb4a 100644
--- a/src/includes/perfmon_atom_events.txt
+++ b/src/includes/perfmon_atom_events.txt
@@ -1,16 +1,17 @@
 # =======================================================================================
-#  
+#
 #      Filename:  perfmon_atom_events.txt
-# 
+#
 #      Description:  Event list for Intel Atom
-# 
-#      Version:   3.1.3
-#      Released:  4.11.2014
-# 
-#      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+#
+#      Version:   4.0
+#      Released:  16.6.2015
+#
+#      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+#                Thomas Roehl (tr), thomas.roehl at googlemail.com
 #      Project:  likwid
 #
-#      Copyright (C) 2014 Jan Treibig
+#      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
 #
 #      This program is free software: you can redistribute it and/or modify it under
 #      the terms of the GNU General Public License as published by the Free Software
diff --git a/src/includes/perfmon_broadwell.h b/src/includes/perfmon_broadwell.h
new file mode 100644
index 0000000..a1cdbcd
--- /dev/null
+++ b/src/includes/perfmon_broadwell.h
@@ -0,0 +1,506 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  perfmon_broadwell.h
+ *
+ *      Description:  Header File of perfmon module for Intel Broadwell.
+ *
+ *      Version:   4.0
+ *      Released:  16.6.2015
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#include <perfmon_broadwell_events.h>
+#include <perfmon_broadwell_counters.h>
+#include <error.h>
+#include <affinity.h>
+#include <limits.h>
+#include <topology.h>
+#include <access.h>
+
+/* Broadwell counter and architectural-event counts, captured from the NUM_* macros as file-local ints. */
+static int perfmon_numCountersBroadwell = NUM_COUNTERS_BROADWELL;
+static int perfmon_numCoreCountersBroadwell = NUM_COUNTERS_CORE_BROADWELL;
+static int perfmon_numArchEventsBroadwell = NUM_ARCH_EVENTS_BROADWELL;
+
+/* Try to take the tile and socket locks on behalf of cpu_id, then write 0 to MSR_PEBS_ENABLE. */
+int perfmon_init_broadwell(int cpu_id)
+{
+    lock_acquire((int*) &tile_lock[affinity_thread2tile_lookup[cpu_id]], cpu_id);
+    lock_acquire((int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id);
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PEBS_ENABLE, 0x0ULL));
+    return 0;   /* NOTE(review): CHECK_MSR_WRITE_ERROR is defined elsewhere and may return earlier on failure */
+}
+
+/* Control bits of fixed counter `index`: bit 1 of its 4-bit field is always set, COUNT_KERNEL adds bit 0, ANYTHREAD adds bit 2. */
+uint32_t bdw_fixed_setup(RegisterIndex index, PerfmonEvent *event)
+{
+    uint32_t flags = (1ULL<<(1+(index*4)));
+    for (int j = 0; j < event->numberOfOptions; j++)
+    {
+        switch (event->options[j].type)
+        {
+            case EVENT_OPTION_COUNT_KERNEL:
+                flags |= (1ULL<<(index*4));
+                break;
+            case EVENT_OPTION_ANYTHREAD:
+                flags |= (1ULL<<(2+(index*4)));
+                break;   /* explicit break instead of silently falling into default */
+            default:
+                break;
+        }
+    }
+    return flags;
+}
+/* Program the event select register of PMC `index` for `event`; events 0xB7/0xBB also get an offcore response MSR value. */
+int bdw_pmc_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+    int j;
+    uint64_t flags = 0x0ULL;
+    uint64_t offcore_flags = 0x0ULL;
+    /* Bits 22 and 16 are set for every event (enable and user-mode bits in the Intel PERFEVTSEL layout). */
+    flags = (1ULL<<22)|(1ULL<<16);
+    /* Event select in bits [7:0], unit mask shifted to bit 8 and above */
+    flags |= (event->umask<<8) + event->eventId;
+
+    /* Custom cfgBits start at bit 16 and cmask at bit 24; skipped for 0xB7/0xBB, which reuse both fields below */
+    if ((event->cfgBits != 0) &&
+        (event->eventId != 0xB7) &&
+        (event->eventId != 0xBB))
+    {
+        flags |= ((event->cmask<<8) + event->cfgBits)<<16;
+    }
+    /* Apply the event options; MATCH0/MATCH1 accumulate into the offcore response value instead of flags. */
+    if (event->numberOfOptions > 0)
+    {
+        for(j = 0; j < event->numberOfOptions; j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_COUNT_KERNEL:
+                    flags |= (1ULL<<17);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_ANYTHREAD:
+                    flags |= (1ULL<<21);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= (event->options[j].value & 0xFFULL) << 24;
+                    break;
+                case EVENT_OPTION_IN_TRANS:
+                    flags |= (1ULL<<32);
+                    break;
+                case EVENT_OPTION_IN_TRANS_ABORT:
+                    flags |= (1ULL<<33);
+                    break;
+                case EVENT_OPTION_MATCH0:
+                    /* NOTE(review): 0x8FFF keeps bits 0-11 and bit 15 -- confirm against the offcore response MSR layout */
+                    offcore_flags |= (event->options[j].value & 0x8FFFULL);
+                    break;
+                case EVENT_OPTION_MATCH1:
+                    offcore_flags |= (event->options[j].value<<16);
+                    break;
+                default:
+                    break;
+            }
+        }
+    }
+    /* Offcore events: 0xB7 uses MSR_OFFCORE_RESP0, 0xBB uses MSR_OFFCORE_RESP1; written before the event select register. */
+    if (event->eventId == 0xB7)
+    {
+        /* Unless either field is 0xFF, the two bit positions from the event table replace any MATCH0/MATCH1 value. */
+        if ((event->cfgBits != 0xFF) && (event->cmask != 0xFF))
+        {
+            offcore_flags = (1ULL<<event->cfgBits)|(1ULL<<event->cmask);
+        }
+        VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP0, LLU_CAST offcore_flags, SETUP_PMC_OFFCORE);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP0, offcore_flags));
+    }
+    else if (event->eventId == 0xBB)
+    {
+        /* Same rule as for 0xB7, applied to the second offcore response register. */
+        if ((event->cfgBits != 0xFF) && (event->cmask != 0xFF))
+        {
+            offcore_flags = (1ULL<<event->cfgBits)|(1ULL<<event->cmask);
+        }
+        VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP1, LLU_CAST offcore_flags, SETUP_PMC_OFFCORE);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP1, offcore_flags));
+    }
+    VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_PMC)
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister , flags));
+    return 0;   /* NOTE(review): CHECK_MSR_WRITE_ERROR is defined elsewhere and may return earlier on failure */
+}
+
+/*
+ * Configure the events of eventSet on the CPU that thread_id is mapped to.
+ * Core counters are frozen first; PMC events are programmed one by one through
+ * bdw_pmc_setup(), FIXED events are merged into a single control register write.
+ * Events whose type is not in eventSet->regTypeMask are skipped.
+ * Every event handled here is marked init = TRUE for the start/stop/read functions.
+ * Returns 0 when the end of the function is reached.
+ */
+int perfmon_setupCounterThread_broadwell(
+        int thread_id,
+        PerfmonEventSet* eventSet)
+{
+    uint64_t fixed_flags = 0x0ULL;
+    int cpu_id = groupSet->threads[thread_id].processorId;
+
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)))
+    {
+        /* 0xC00000070000000F = overflow bits 0-3, 32-34, 62 and 63 */
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL, FREEZE_PMC_AND_FIXED)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, 0xC00000070000000F));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PEBS_ENABLE, 0x0ULL));
+    }
+
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        RegisterType type = eventSet->events[i].type;
+        if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+        {
+            continue;
+        }
+        RegisterIndex index = eventSet->events[i].index;
+        PerfmonEvent *event = &(eventSet->events[i].event);
+        eventSet->events[i].threadCounter[thread_id].init = TRUE;
+        switch (type)
+        {
+            case PMC:
+                /* NOTE(review): the return value of bdw_pmc_setup() is not checked here. */
+                bdw_pmc_setup(cpu_id, index, event);
+                break;
+
+            case FIXED:
+                fixed_flags |= bdw_fixed_setup(index, event);
+                break;
+
+            case POWER:
+            case THERMAL:
+                /* no per-event configuration is written for these types */
+                break;
+
+            default:
+                break;
+        }
+    }
+    if (fixed_flags > 0x0ULL)
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_FIXED_CTR_CTRL, LLU_CAST fixed_flags, SETUP_FIXED)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_FIXED_CTR_CTRL, fixed_flags));
+    }
+    return 0;
+}
+/* Zero the selected PMC/FIXED counters, record POWER start values, then clear overflow bits and enable counting. */
+int perfmon_startCountersThread_broadwell(int thread_id, PerfmonEventSet* eventSet)
+{
+    int haveLock = 0;
+    uint64_t flags = 0x0ULL;
+    uint64_t tmp = 0x0ULL;
+    int cpu_id = groupSet->threads[thread_id].processorId;
+    /* POWER counters are only read by the CPU whose id is stored in its socket lock. */
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+    {
+        haveLock = 1;
+    }
+
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
+        {
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+            {
+                continue;
+            }
+            RegisterIndex index = eventSet->events[i].index;
+            uint64_t counter1 = counter_map[index].counterRegister;
+            PciDeviceIndex dev = counter_map[index].device;
+            switch (type)
+            {
+                case PMC:
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, counter1, 0x0ULL));
+                    flags |= (1ULL<<(index-cpuid_info.perf_num_fixed_ctr));  /* enable bit = map index minus fixed-counter count */
+                    break;
+
+                case FIXED:
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, counter1, 0x0ULL));
+                    flags |= (1ULL<<(index+32));  /* fixed counter enable bits start at bit 32 */
+                    break;
+
+                case POWER:
+                    if (haveLock)
+                    {
+                        tmp = 0x0ULL;
+                        CHECK_POWER_READ_ERROR(power_read(cpu_id, counter1,(uint32_t*)&tmp));
+                        VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST tmp, START_POWER)
+                        eventSet->events[i].threadCounter[thread_id].startData = field64(tmp, 0, box_map[type].regWidth);
+                    }
+                    break;
+
+                default:
+                    break;
+            }
+        }
+    }
+    /* Clear the overflow bits of the counters about to run (plus bits 62/63), then enable them globally. */
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, LLU_CAST (1ULL<<63)|(1ULL<<62)|flags, CLEAR_PMC_AND_FIXED_OVERFLOW)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, (1ULL<<63)|(1ULL<<62)|flags));
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST flags, UNFREEZE_PMC_AND_FIXED)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, flags));
+    }
+
+    return 0;
+}
+
+/* Freeze the core counters, store the final value of every initialized event in counterData
+ * (cut to the register width of its type) and clear the events' init flags. */
+int perfmon_stopCountersThread_broadwell(int thread_id, PerfmonEventSet* eventSet)
+{
+    int haveLock = 0;
+    uint64_t counter_result = 0x0ULL;
+    int cpu_id = groupSet->threads[thread_id].processorId;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+    {
+        haveLock = 1;
+    }
+
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL, FREEZE_PMC_AND_FIXED)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+    }
+    /* counter_result is reset per event, so an event that is not read (e.g. POWER without the lock) stores 0. */
+    /* NOTE(review): HASEP_CHECK_CORE_OVERFLOW is defined elsewhere and may rely on the locals declared below. */
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
+        {
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+            {
+                continue;
+            }
+            counter_result = 0x0ULL;
+            RegisterIndex index = eventSet->events[i].index;
+            PerfmonEvent *event = &(eventSet->events[i].event);
+            PciDeviceIndex dev = counter_map[index].device;
+            uint64_t counter1 = counter_map[index].counterRegister;
+            uint64_t* current = &(eventSet->events[i].threadCounter[thread_id].counterData);
+            int* overflows = &(eventSet->events[i].threadCounter[thread_id].overflows);
+            switch (type)
+            {
+                case PMC:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                    HASEP_CHECK_CORE_OVERFLOW(index-cpuid_info.perf_num_fixed_ctr);
+                    VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_PMC)
+                    break;
+
+                case FIXED:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                    HASEP_CHECK_CORE_OVERFLOW(index+32);
+                    VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_FIXED)
+                    break;
+
+                case POWER:
+                    if (haveLock && (eventSet->regTypeMask & REG_TYPE_MASK(POWER)))
+                    {
+                        CHECK_POWER_READ_ERROR(power_read(cpu_id, counter1, (uint32_t*)&counter_result));
+                        VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, STOP_POWER)
+                        /* a reading below the previously stored value is counted as one wrap-around */
+                        if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData)
+                        {
+                            eventSet->events[i].threadCounter[thread_id].overflows++;
+                        }
+                    }
+                    break;
+
+                case THERMAL:
+                    CHECK_TEMP_READ_ERROR(thermal_read(cpu_id,(uint32_t*)&counter_result));
+                    break;
+
+                default:
+                    break;
+            }
+            *current = field64(counter_result, 0, box_map[type].regWidth);
+        }
+        eventSet->events[i].threadCounter[thread_id].init = FALSE;   /* not reached when the type filter above hits continue */
+    }
+
+    return 0;
+}
+
+/* Snapshot all initialized counters into counterData while counting is paused, then restore the enable bits. */
+int perfmon_readCountersThread_broadwell(int thread_id, PerfmonEventSet* eventSet)
+{
+    uint64_t flags = 0x0ULL;
+    int haveLock = 0;
+    uint64_t counter_result = 0x0ULL;
+    int cpu_id = groupSet->threads[thread_id].processorId;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+    {
+        haveLock = 1;
+    }
+    /* Save the current global enable bits in flags, then write 0 to pause the core counters. */
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)))
+    {
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, &flags));
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST flags, SAFE_PMC_FLAGS)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL, RESET_PMC_FLAGS)
+    }
+    HASEP_FREEZE_UNCORE;
+    /* NOTE(review): HASEP_* macros are defined elsewhere; the Broadwell counter map in this patch has no uncore entries. */
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
+        {
+            counter_result= 0x0ULL;
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+            {
+                continue;
+            }
+            RegisterIndex index = eventSet->events[i].index;
+            PerfmonEvent *event = &(eventSet->events[i].event);
+            PciDeviceIndex dev = counter_map[index].device;
+            uint64_t counter1 = counter_map[index].counterRegister;
+            uint64_t* current = &(eventSet->events[i].threadCounter[thread_id].counterData);
+            int* overflows = &(eventSet->events[i].threadCounter[thread_id].overflows);
+            switch (type)
+            {
+                case PMC:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                    HASEP_CHECK_CORE_OVERFLOW(index-cpuid_info.perf_num_fixed_ctr);
+                    VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_PMC)
+                    break;
+
+                case FIXED:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                    HASEP_CHECK_CORE_OVERFLOW(index+32);
+                    VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_FIXED)
+                    break;
+
+                case POWER:
+                    if (haveLock && (eventSet->regTypeMask & REG_TYPE_MASK(POWER)))
+                    {
+                        CHECK_POWER_READ_ERROR(power_read(cpu_id, counter1, (uint32_t*)&counter_result));
+                        VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, STOP_POWER)
+                        /* a reading below the previously stored value is counted as one wrap-around */
+                        if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData)
+                        {
+                            eventSet->events[i].threadCounter[thread_id].overflows++;
+                        }
+                    }
+                    break;
+
+                case THERMAL:
+                    CHECK_TEMP_READ_ERROR(thermal_read(cpu_id,(uint32_t*)&counter_result));
+                    break;
+                default:
+                    break;
+            }
+            eventSet->events[i].threadCounter[thread_id].counterData = field64(counter_result, 0, box_map[type].regWidth);
+        }
+    }
+    /* Write the saved enable bits back so counting continues. */
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)))
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST flags, RESTORE_PMC_FLAGS)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, flags));
+    }
+
+    return 0;
+}
+/* Reset what eventSet configured: config registers, offcore response MSRs, overflow bits and the global enable bits. */
+int perfmon_finalizeCountersThread_broadwell(int thread_id, PerfmonEventSet* eventSet)
+{
+    int haveLock = 0;   /* NOTE(review): set below but not read by any statement visible in this function */
+    int haveTileLock = 0;
+    uint64_t ovf_values_core = (1ULL<<63)|(1ULL<<62);
+    uint64_t ovf_values_uncore = 0x0ULL;   /* only used as scratch for the verbose print of the old config value */
+    int cpu_id = groupSet->threads[thread_id].processorId;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+    {
+        haveLock = 1;
+    }
+    if (tile_lock[affinity_thread2tile_lookup[cpu_id]] == cpu_id)
+    {
+        haveTileLock = 1;
+    }
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        RegisterIndex index = eventSet->events[i].index;
+        PciDeviceIndex dev = counter_map[index].device;
+        uint64_t reg = counter_map[index].configRegister;
+        RegisterType type = eventSet->events[i].type;
+        if (type == NOTYPE)
+        {
+            continue;
+        }
+        switch (type)
+        {
+            case PMC:
+                ovf_values_core |= (1ULL<<(index-cpuid_info.perf_num_fixed_ctr));
+                /* the offcore response MSRs are cleared only by the CPU stored in the tile lock */
+                if ((haveTileLock) && (eventSet->events[i].event.eventId == 0xB7))
+                {
+                    VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP0, 0x0ULL, CLEAR_OFFCORE_RESP0);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP0, 0x0ULL));
+                }
+                else if ((haveTileLock) && (eventSet->events[i].event.eventId == 0xBB))
+                {
+                    VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP1, 0x0ULL, CLEAR_OFFCORE_RESP1);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP1, 0x0ULL));
+                }
+                break;
+            case FIXED:
+                ovf_values_core |= (1ULL<<(index+32));
+                break;
+            default:
+                break;
+        }
+        /* PMC/FIXED events with a non-zero config register: print the current value, then write 0 to it */
+        if ((reg) && (((type == PMC)||(type == FIXED))))
+        {
+            CHECK_MSR_READ_ERROR(HPMread(cpu_id, dev, reg, &ovf_values_uncore));
+            VERBOSEPRINTPCIREG(cpu_id, dev, reg, ovf_values_uncore, SHOW_CTL);
+            ovf_values_uncore = 0x0ULL;
+            VERBOSEPRINTPCIREG(cpu_id, dev, reg, 0x0ULL, CLEAR_CTL);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, reg, 0x0ULL));
+        }
+        eventSet->events[i].threadCounter[thread_id].init = FALSE;
+    }
+    /* Clear the collected overflow bits (bits 62/63 are always included) and disable all core counters. */
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)))
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, LLU_CAST ovf_values_core, CLEAR_GLOBAL_OVF)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, ovf_values_core));
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST 0x0ULL, CLEAR_GLOBAL_CTRL)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+    }
+    return 0;
+}
diff --git a/src/includes/perfmon_broadwell_counters.h b/src/includes/perfmon_broadwell_counters.h
new file mode 100644
index 0000000..93e63c7
--- /dev/null
+++ b/src/includes/perfmon_broadwell_counters.h
@@ -0,0 +1,64 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  perfmon_broadwell_counters.h
+ *
+ *      Description:  Counter Header File of perfmon module for Broadwell.
+ *
+ *      Version:   4.0
+ *      Released:  16.6.2015
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#define NUM_COUNTERS_BROADWELL 12 /* number of entries in broadwell_counter_map */
+#define NUM_COUNTERS_CORE_BROADWELL 8 /* presumably FIXC0-2, PMC0-3 and TMP0 -- confirm */
+#define NUM_COUNTERS_UNCORE_BROADWELL 12 /* NOTE(review): no uncore counters are mapped in this header */
+/* Event options accepted per counter type; parenthesized so the masks expand safely next to any operator. */
+#define BDW_VALID_OPTIONS_FIXED (EVENT_OPTION_ANYTHREAD_MASK|EVENT_OPTION_COUNT_KERNEL_MASK)
+#define BDW_VALID_OPTIONS_PMC (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_INVERT_MASK| \
+            EVENT_OPTION_ANYTHREAD_MASK|EVENT_OPTION_IN_TRANS_MASK|EVENT_OPTION_THRESHOLD_MASK)
+/* Broadwell counter table. Entries presumably read {name, index, type, config reg, counter reg, 0, 0, option mask} -- confirm against RegisterMap. */
+static RegisterMap broadwell_counter_map[NUM_COUNTERS_BROADWELL] = {
+    /* Fixed counters: instructions retired, core cycles unhalted, reference cycles unhalted; one shared control register */
+    {"FIXC0", PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0, BDW_VALID_OPTIONS_FIXED},
+    {"FIXC1", PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0, BDW_VALID_OPTIONS_FIXED},
+    {"FIXC2", PMC2, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR2, 0, 0, BDW_VALID_OPTIONS_FIXED},
+    /* General-purpose PMC counters: 4, each 48 bit wide; only PMC2 additionally accepts IN_TRANS_ABORT */
+    {"PMC0", PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0, BDW_VALID_OPTIONS_PMC},
+    {"PMC1", PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0, BDW_VALID_OPTIONS_PMC},
+    {"PMC2", PMC5, PMC, MSR_PERFEVTSEL2, MSR_PMC2, 0, 0, BDW_VALID_OPTIONS_PMC|EVENT_OPTION_IN_TRANS_ABORT_MASK},
+    {"PMC3", PMC6, PMC, MSR_PERFEVTSEL3, MSR_PMC3, 0, 0, BDW_VALID_OPTIONS_PMC},
+    /* Temperature sensor (no config register, no options) */
+    {"TMP0", PMC7, THERMAL, 0, IA32_THERM_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+    /* RAPL energy counters: package, PP0, PP1, DRAM (no config register, no options) */
+    {"PWR0", PMC8, POWER, 0, MSR_PKG_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"PWR1", PMC9, POWER, 0, MSR_PP0_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"PWR2", PMC10, POWER, 0, MSR_PP1_ENERGY_STATUS,  0, 0, EVENT_OPTION_NONE_MASK},
+    {"PWR3", PMC11, POWER, 0, MSR_DRAM_ENERGY_STATUS,  0, 0, EVENT_OPTION_NONE_MASK},
+};
+
+/* Register set per counter type; the last initializer is presumably the counter width in bits (regWidth) -- confirm against BoxMap. */
+static BoxMap broadwell_box_map[NUM_UNITS] = {
+    [PMC] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, 0, 0, 0, 48},
+    [THERMAL] = {0, 0, 0, 0, 0, 0, 8},
+    [FIXED] =  {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, 0, 0, 0, 48},
+    [POWER] = {0, 0, 0, 0, 0, 0, 32},
+};
diff --git a/src/includes/perfmon_broadwell_events.txt b/src/includes/perfmon_broadwell_events.txt
new file mode 100644
index 0000000..073520b
--- /dev/null
+++ b/src/includes/perfmon_broadwell_events.txt
@@ -0,0 +1,393 @@
+# =======================================================================================
+#
+#      Filename:  perfmon_broadwell_events.txt
+#
+#      Description:  Event list for Intel Broadwell
+#
+#      Version:   4.0
+#      Released:  16.6.2015
+#
+#      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
+#      Project:  likwid
+#
+#      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
+#
+#      This program is free software: you can redistribute it and/or modify it under
+#      the terms of the GNU General Public License as published by the Free Software
+#      Foundation, either version 3 of the License, or (at your option) any later
+#      version.
+#
+#      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+#      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+#      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+#
+#      You should have received a copy of the GNU General Public License along with
+#      this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+# =======================================================================================
+
+EVENT_TEMP_CORE          0x00   TMP0
+UMASK_TEMP_CORE          0x00
+
+EVENT_PWR_PKG_ENERGY          0x00   PWR0
+UMASK_PWR_PKG_ENERGY          0x00
+
+EVENT_PWR_PP0_ENERGY          0x00   PWR1
+UMASK_PWR_PP0_ENERGY          0x00
+
+EVENT_PWR_PP1_ENERGY          0x00   PWR2
+UMASK_PWR_PP1_ENERGY          0x00
+
+EVENT_PWR_DRAM_ENERGY          0x00   PWR3
+UMASK_PWR_DRAM_ENERGY          0x00
+
+EVENT_INSTR_RETIRED              0x00   FIXC0
+UMASK_INSTR_RETIRED_ANY          0x00
+
+EVENT_CPU_CLK_UNHALTED           0x00   FIXC1
+UMASK_CPU_CLK_UNHALTED_CORE      0x00
+
+EVENT_CPU_CLK_UNHALTED           0x00   FIXC2
+UMASK_CPU_CLK_UNHALTED_REF       0x00
+
+EVENT_LD_BLOCKS                 0x03  PMC
+UMASK_LD_BLOCKS_STORE_FORWARD   0x02
+UMASK_LD_BLOCKS_NO_SR           0x08
+
+EVENT_MISALIGN_MEM_REF            0x05  PMC
+UMASK_MISALIGN_MEM_REF_LOADS      0x01
+UMASK_MISALIGN_MEM_REF_STORES     0x02
+UMASK_MISALIGN_MEM_REF_ANY        0x03
+
+EVENT_LD_BLOCKS_PARTIAL                 0x07  PMC
+UMASK_LD_BLOCKS_PARTIAL_ADDRESS_ALIAS   0x01
+
+EVENT_DTLB_LOAD_MISSES                       0x08  PMC
+UMASK_DTLB_LOAD_MISSES_CAUSES_A_WALK         0x01
+UMASK_DTLB_LOAD_MISSES_WALK_COMPLETED_4K     0x02
+UMASK_DTLB_LOAD_MISSES_WALK_DURATION         0x10
+UMASK_DTLB_LOAD_MISSES_STLB_HIT_4K           0x20
+
+EVENT_INT_MISC                      0x0D  PMC
+DEFAULT_OPTIONS_INT_MISC_RECOVERY_CYCLES EVENT_OPTION_THRESHOLD=0x01
+UMASK_INT_MISC_RECOVERY_CYCLES      0x03
+DEFAULT_OPTIONS_INT_MISC_RECOVERY_OCCURRENCES EVENT_OPTION_EDGE=1,EVENT_OPTION_THRESHOLD=0x01
+UMASK_INT_MISC_RECOVERY_OCCURRENCES 0x03
+
+EVENT_UOPS_ISSUED                0x0E  PMC
+UMASK_UOPS_ISSUED_ANY            0x01
+UMASK_UOPS_ISSUED_FLAGS_MERGE    0x10
+UMASK_UOPS_ISSUED_SLOW_LEA       0x20
+UMASK_UOPS_ISSUED_SINGLE_MUL     0x40
+DEFAULT_OPTIONS_UOPS_ISSUED_STALLED      EVENT_OPTION_INVERT=1,EVENT_OPTION_ANYTHREAD=1,EVENT_OPTION_THRESHOLD=0x01
+UMASK_UOPS_ISSUED_STALLED        0x01
+
+EVENT_ARITH_FPU_DIV_ACTIVE       0x14  PMC
+UMASK_ARITH_FPU_DIV_ACTIVE       0x01
+
+EVENT_L2_RQSTS                     0x24   PMC
+UMASK_L2_RQSTS_DEMAND_DATA_RD_MISS 0x21
+UMASK_L2_RQSTS_DEMAND_DATA_RD_HIT  0x41
+UMASK_L2_RQSTS_L2_PF_HIT           0x50
+UMASK_L2_RQSTS_L2_PF_MISS          0x30
+UMASK_L2_RQSTS_ALL_DEMAND_DATA_RD  0xE1
+UMASK_L2_RQSTS_ALL_RFO             0xE2
+UMASK_L2_RQSTS_ALL_CODE_RD         0xE4
+UMASK_L2_RQSTS_ALL_PF              0xF8
+
+EVENT_L2_DEMAND_RQST_WB_HIT        0x27   PMC
+UMASK_L2_DEMAND_RQST_WB_HIT        0x50
+
+EVENT_LONGEST_LAT_CACHE            0x2E   PMC
+UMASK_LONGEST_LAT_CACHE_REFERENCE  0x4F
+UMASK_LONGEST_LAT_CACHE_MISS       0x41
+
+EVENT_CPU_CLOCK_UNHALTED           0x3C   PMC
+UMASK_CPU_CLOCK_UNHALTED_THREAD_P  0x00
+UMASK_CPU_CLOCK_UNHALTED_REF_XCLK  0x01
+
+EVENT_L1D_PEND_MISS                  0x48   PMC1
+UMASK_L1D_PEND_MISS_PENDING          0x01
+DEFAULT_OPTIONS_L1D_PEND_MISS_PENDING_CYCLES   EVENT_OPTION_THRESHOLD=0x01
+UMASK_L1D_PEND_MISS_PENDING_CYCLES   0x01
+DEFAULT_OPTIONS_L1D_PEND_MISS_OCCURRENCES EVENT_OPTION_EDGE=1,EVENT_OPTION_THRESHOLD=0x01
+UMASK_L1D_PEND_MISS_OCCURRENCES      0x01
+
+EVENT_DTLB_STORE_MISSES                    0x49   PMC
+UMASK_DTLB_STORE_MISSES_CAUSES_A_WALK      0x01
+UMASK_DTLB_STORE_MISSES_WALK_COMPLETED_4K  0x02
+UMASK_DTLB_STORE_MISSES_WALK_DURATION      0x10
+UMASK_DTLB_STORE_MISSES_STLB_HIT_4K        0x20
+
+EVENT_LOAD_HIT_PRE               0x4C    PMC
+UMASK_LOAD_HIT_PRE_HW_PF         0x02
+
+EVENT_L1D                        0x51   PMC
+UMASK_L1D_REPLACEMENT            0x01
+
+EVENT_MOVE_ELIMINATION                        0x58   PMC
+UMASK_MOVE_ELIMINATION_INT_NOT_ELIMINATED     0x04
+UMASK_MOVE_ELIMINATION_SIMD_NOT_ELIMINATED    0x08
+UMASK_MOVE_ELIMINATION_INT_ELIMINATED         0x01
+UMASK_MOVE_ELIMINATION_SIMD_ELIMINATED        0x02
+
+EVENT_CPL_CYCLES                   0x5C    PMC
+UMASK_CPL_CYCLES_RING0             0x01
+UMASK_CPL_CYCLES_RING123           0x02
+
+EVENT_RS_EVENTS                 0x5E    PMC
+UMASK_RS_EVENTS_EMPTY_CYCLES    0x01
+
+EVENT_OFFCORE_REQUESTS_OUTSTANDING                  0x60   PMC
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_DATA_RD   0x01
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_CODE_RD   0x02
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_RFO       0x04
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_ALL_DATA_RD      0x08
+
+EVENT_CACHE_LOCK_CYCLES                             0x63   PMC
+UMASK_CACHE_LOCK_CYCLES_SPLIT_LOCK_UC_LOCK_DURATION 0x01
+UMASK_CACHE_LOCK_CYCLES_CACHE_LOCK_DURATION         0x02
+
+EVENT_IDQ                              0x79   PMC
+UMASK_IDQ_EMPTY                        0x02
+UMASK_IDQ_MITE_UOPS                    0x04
+DEFAULT_OPTIONS_IDQ_MITE_UOPS_CYCLES   EVENT_OPTION_THRESHOLD=0x01
+UMASK_IDQ_MITE_UOPS_CYCLES             0x04
+UMASK_IDQ_DSB_UOPS                     0x08
+DEFAULT_OPTIONS_IDQ_DSB_UOPS_CYCLES    EVENT_OPTION_THRESHOLD=0x01
+UMASK_IDQ_DSB_UOPS_CYCLES              0x08
+UMASK_IDQ_MS_DSB_UOPS                  0x10
+DEFAULT_OPTIONS_IDQ_MS_DSB_UOPS_CYCLES EVENT_OPTION_THRESHOLD=0x01
+UMASK_IDQ_MS_DSB_UOPS_CYCLES           0x10
+UMASK_IDQ_MS_MITE_UOPS                 0x20
+DEFAULT_OPTIONS_IDQ_MS_MITE_UOPS_CYCLES EVENT_OPTION_THRESHOLD=0x01
+UMASK_IDQ_MS_MITE_UOPS_CYCLES          0x20
+UMASK_IDQ_MS_UOPS                      0x30
+DEFAULT_OPTIONS_IDQ_MS_UOPS_CYCLES     EVENT_OPTION_THRESHOLD=0x01
+UMASK_IDQ_MS_UOPS_CYCLES               0x30
+DEFAULT_OPTIONS_IDQ_ALL_DSB_CYCLES_ANY_UOPS EVENT_OPTION_THRESHOLD=0x01
+UMASK_IDQ_ALL_DSB_CYCLES_ANY_UOPS      0x18
+DEFAULT_OPTIONS_IDQ_ALL_DSB_CYCLES_4_UOPS EVENT_OPTION_THRESHOLD=0x04
+UMASK_IDQ_ALL_DSB_CYCLES_4_UOPS        0x18
+DEFAULT_OPTIONS_IDQ_ALL_MITE_CYCLES_ANY_UOPS EVENT_OPTION_THRESHOLD=0x01
+UMASK_IDQ_ALL_MITE_CYCLES_ANY_UOPS     0x24
+DEFAULT_OPTIONS_IDQ_ALL_MITE_CYCLES_4_UOPS EVENT_OPTION_THRESHOLD=0x04
+UMASK_IDQ_ALL_MITE_CYCLES_4_UOPS       0x24
+UMASK_IDQ_MITE_ALL_UOPS       0x3C
+
+EVENT_ICACHE                  0x80   PMC
+UMASK_ICACHE_MISSES           0x02
+
+EVENT_ITLB_MISSES                   0x85      PMC
+UMASK_ITLB_MISSES_CAUSES_A_WALK     0x01
+UMASK_ITLB_MISSES_WALK_COMPLETED_4K 0x02
+UMASK_ITLB_MISSES_WALK_DURATION     0x10
+UMASK_ITLB_MISSES_STLB_HIT_4K       0x20
+
+EVENT_ILD_STALL                 0x87      PMC
+UMASK_ILD_STALL_LCP             0x01
+
+EVENT_BR_INST_EXEC                                      0x88   PMC
+UMASK_BR_INST_EXEC_COND_TAKEN                           0x81
+UMASK_BR_INST_EXEC_COND_NON_TAKEN                       0x41
+UMASK_BR_INST_EXEC_DIRECT_JMP_TAKEN                     0x82
+UMASK_BR_INST_EXEC_INDIRECT_JMP_NON_CALL_RET_TAKEN      0x84
+UMASK_BR_INST_EXEC_RETURN_NEAR_TAKEN                    0x88
+UMASK_BR_INST_EXEC_DIRECT_NEAR_CALL_TAKEN               0x90
+UMASK_BR_INST_EXEC_INDIRECT_NEAR_CALL_TAKEN             0xA0
+UMASK_BR_INST_EXEC_ALL_BRANCHES                         0xFF
+
+EVENT_BR_MISP_EXEC                                      0x89   PMC
+UMASK_BR_MISP_EXEC_COND_TAKEN                           0x81
+UMASK_BR_MISP_EXEC_COND_NON_TAKEN                       0x41
+UMASK_BR_MISP_EXEC_INDIRECT_JMP_NON_CALL_RET_TAKEN      0x84
+UMASK_BR_MISP_EXEC_RETURN_NEAR_TAKEN                    0x88
+UMASK_BR_MISP_EXEC_DIRECT_NEAR_CALL_TAKEN               0x90
+UMASK_BR_MISP_EXEC_INDIRECT_NEAR_CALL_TAKEN             0xA0
+UMASK_BR_MISP_EXEC_ALL_BRANCHES                         0xFF
+
+EVENT_IDQ_UOPS_NOT_DELIVERED                    0x9C   PMC
+UMASK_IDQ_UOPS_NOT_DELIVERED_CORE               0x01
+
+EVENT_UOPS_EXECUTED_PORT                  0xA1   PMC
+UMASK_UOPS_EXECUTED_PORT_PORT_0           0x01
+UMASK_UOPS_EXECUTED_PORT_PORT_1           0x02
+UMASK_UOPS_EXECUTED_PORT_PORT_2           0x04
+UMASK_UOPS_EXECUTED_PORT_PORT_3           0x08
+UMASK_UOPS_EXECUTED_PORT_PORT_4           0x10
+UMASK_UOPS_EXECUTED_PORT_PORT_5           0x20
+UMASK_UOPS_EXECUTED_PORT_PORT_6           0x40
+UMASK_UOPS_EXECUTED_PORT_PORT_7           0x80
+
+EVENT_RESOURCE_STALLS                 0xA2   PMC
+UMASK_RESOURCE_STALLS_ANY             0x01
+UMASK_RESOURCE_STALLS_RS              0x04
+UMASK_RESOURCE_STALLS_SB              0x08
+UMASK_RESOURCE_STALLS_ROB             0x10
+
+EVENT_LSD_UOPS                 0xA8   PMC
+UMASK_LSD_UOPS                 0x01
+
+EVENT_ITLB                       0xAE   PMC
+UMASK_ITLB_ITLB_FLUSH            0x01
+
+EVENT_OFFCORE_REQUESTS     0xB0   PMC
+UMASK_OFFCORE_REQUESTS_DEMAND_DATA_RD   0x01
+UMASK_OFFCORE_REQUESTS_DEMAND_CODE_RD   0x02
+UMASK_OFFCORE_REQUESTS_DEMAND_RFO       0x04
+UMASK_OFFCORE_REQUESTS_ALL_DATA_RD      0x08
+
+EVENT_UOPS_EXECUTED               0xB1   PMC
+UMASK_UOPS_EXECUTED_THREAD        0x01
+UMASK_UOPS_EXECUTED_CORE          0x02
+
+EVENT_PAGE_WALKER_LOADS             0xBC  PMC
+UMASK_PAGE_WALKER_LOADS_DTLB_L1     0x11
+UMASK_PAGE_WALKER_LOADS_ITLB_L1     0x21
+UMASK_PAGE_WALKER_LOADS_DTLB_L2     0x12
+UMASK_PAGE_WALKER_LOADS_ITLB_L2     0x22
+UMASK_PAGE_WALKER_LOADS_DTLB_L3     0x14
+UMASK_PAGE_WALKER_LOADS_ITLB_L3     0x24
+UMASK_PAGE_WALKER_LOADS_DTLB_MEMORY 0x18
+
+EVENT_INST_RETIRED                  0xC0  PMC1
+UMASK_INST_RETIRED_ANY_P            0x00
+UMASK_INST_RETIRED_X87              0x02
+EVENT_INST_RETIRED_PREC             0xC0  PMC1
+UMASK_INST_RETIRED_PREC_DIST        0x01
+
+EVENT_OTHER_ASSISTS                  0xC1  PMC
+UMASK_OTHER_ASSISTS_AVX_TO_SSE       0x08
+UMASK_OTHER_ASSISTS_SSE_TO_AVX       0x10
+UMASK_OTHER_ASSISTS_ANY_WB_ASSIST    0x40
+
+EVENT_UOPS_RETIRED                  0xC2  PMC
+UMASK_UOPS_RETIRED_ALL              0x01
+UMASK_UOPS_RETIRED_RETIRE_SLOTS     0x02
+DEFAULT_OPTIONS_UOPS_RETIRED_ALL_ACTIVE EVENT_OPTION_THRESHOLD=0x01
+UMASK_UOPS_RETIRED_ALL_ACTIVE       0x01
+DEFAULT_OPTIONS_UOPS_RETIRED_ALL_INACTIVE EVENT_OPTION_INVERT=1,EVENT_OPTION_THRESHOLD=0x01
+UMASK_UOPS_RETIRED_ALL_INACTIVE     0x01
+
+EVENT_MACHINE_CLEARS                    0xC3  PMC
+UMASK_MACHINE_CLEARS_CYCLES             0x01
+UMASK_MACHINE_CLEARS_MEMORY_ORDERING    0x02
+UMASK_MACHINE_CLEARS_SMC                0x04
+UMASK_MACHINE_CLEARS_MASKMOV            0x20
+
+EVENT_BR_INST_RETIRED               0xC4  PMC
+UMASK_BR_INST_RETIRED_ALL_BRANCHES  0x00
+UMASK_BR_INST_RETIRED_CONDITIONAL   0x01
+UMASK_BR_INST_RETIRED_NEAR_CALL     0x02
+UMASK_BR_INST_RETIRED_ALL_BRANCHES  0x04
+UMASK_BR_INST_RETIRED_NEAR_RETURN   0x08
+UMASK_BR_INST_RETIRED_NOT_TAKEN     0x10
+UMASK_BR_INST_RETIRED_NEAR_TAKEN    0x20
+UMASK_BR_INST_RETIRED_FAR_BRANCH    0x40
+
+EVENT_BR_MISP_RETIRED                0xC5  PMC
+UMASK_BR_MISP_RETIRED_ALL_BRANCHES_1 0x00
+UMASK_BR_MISP_RETIRED_CONDITIONAL    0x01
+UMASK_BR_MISP_RETIRED_ALL_BRANCHES_2 0x04
+
+EVENT_FP_ARITH_INST_RETIRED               0xC7 PMC
+UMASK_FP_ARITH_INST_RETIRED_SCALAR_DOUBLE      0x01
+UMASK_FP_ARITH_INST_RETIRED_SCALAR_SINGLE      0x02
+UMASK_FP_ARITH_INST_RETIRED_128B_PACKED_DOUBLE 0x04
+UMASK_FP_ARITH_INST_RETIRED_128B_PACKED_SINGLE 0x08
+UMASK_FP_ARITH_INST_RETIRED_256B_PACKED_DOUBLE 0x10
+UMASK_FP_ARITH_INST_RETIRED_256B_PACKED_SINGLE 0x20
+
+EVENT_FP_ASSIST                      0xCA  PMC
+UMASK_FP_ASSIST_X87_OUTPUT           0x02
+UMASK_FP_ASSIST_X87_INPUT            0x04
+UMASK_FP_ASSIST_SIMD_OUTPUT          0x08
+UMASK_FP_ASSIST_SIMD_INPUT           0x10
+UMASK_FP_ASSIST_ANY                  0x1E
+
+EVENT_ROB_MISC_EVENT_LBR_INSERTS     0xCC  PMC
+UMASK_ROB_MISC_EVENT_LBR_INSERTS     0x20
+
+EVENT_MEM_UOP_RETIRED                  0xD0    PMC
+UMASK_MEM_UOP_RETIRED_LOADS_ALL        0x81
+UMASK_MEM_UOP_RETIRED_STORES_ALL       0x82
+UMASK_MEM_UOP_RETIRED_LOADS_STLB_MISS  0x11
+UMASK_MEM_UOP_RETIRED_STORES_STLB_MISS 0x12
+UMASK_MEM_UOP_RETIRED_LOADS_SPLIT      0x41
+UMASK_MEM_UOP_RETIRED_STORES_SPLIT     0x42
+
+EVENT_MEM_LOAD_UOPS_RETIRED              0xD1   PMC
+UMASK_MEM_LOAD_UOPS_RETIRED_L1_HIT       0x01
+UMASK_MEM_LOAD_UOPS_RETIRED_L1_MISS      0x08
+UMASK_MEM_LOAD_UOPS_RETIRED_L1_ALL       0x09
+UMASK_MEM_LOAD_UOPS_RETIRED_L2_HIT       0x02
+UMASK_MEM_LOAD_UOPS_RETIRED_L2_MISS      0x10
+UMASK_MEM_LOAD_UOPS_RETIRED_L2_ALL       0x12
+UMASK_MEM_LOAD_UOPS_RETIRED_L3_HIT       0x04
+UMASK_MEM_LOAD_UOPS_RETIRED_L3_MISS      0x20
+UMASK_MEM_LOAD_UOPS_RETIRED_L3_ALL       0x24
+UMASK_MEM_LOAD_UOPS_RETIRED_HIT_LFB      0x40
+
+EVENT_MEM_LOAD_UOPS_L3_HIT_RETIRED           0xD2   PMC
+UMASK_MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_MISS 0x01
+UMASK_MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HIT  0x02
+UMASK_MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_HITM 0x04
+UMASK_MEM_LOAD_UOPS_L3_HIT_RETIRED_XSNP_NONE 0x08
+
+EVENT_MEM_LOAD_UOPS_LLC_MISS_RETIRED            0xD3   PMC
+UMASK_MEM_LOAD_UOPS_LLC_MISS_RETIRED_LOCAL_DRAM 0x01
+
+EVENT_L2_TRANS                0xF0  PMC
+UMASK_L2_TRANS_DEMAND_DATA_RD 0x01
+UMASK_L2_TRANS_RFO            0x02
+UMASK_L2_TRANS_CODE_RD        0x04
+UMASK_L2_TRANS_ALL_PF         0x08
+UMASK_L2_TRANS_L1D_WB         0x10
+UMASK_L2_TRANS_L2_FILL        0x20
+UMASK_L2_TRANS_L2_WB          0x40
+UMASK_L2_TRANS_ALL_REQUESTS   0x80
+
+EVENT_L2_LINES_IN             0xF1   PMC
+UMASK_L2_LINES_IN_I           0x01
+UMASK_L2_LINES_IN_S           0x02
+UMASK_L2_LINES_IN_E           0x04
+UMASK_L2_LINES_IN_ALL         0x07
+
+EVENT_L2_LINES_OUT                  0xF2   PMC
+UMASK_L2_LINES_OUT_DEMAND_CLEAN     0x05
+UMASK_L2_LINES_OUT_DEMAND_DIRTY     0x06
+
+EVENT_OFFCORE_RESPONSE_0                            0xB7 PMC
+OPTIONS_OFFCORE_RESPONSE_0_OPTIONS                  EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_0_OPTIONS                    0x01 0xFF 0xFF
+UMASK_OFFCORE_RESPONSE_0_DMND_DATA_RD_ANY           0x01 0x00 0x10
+UMASK_OFFCORE_RESPONSE_0_DMND_RFO_ANY               0x01 0x01 0x10
+UMASK_OFFCORE_RESPONSE_0_DMND_CODE_RD_ANY           0x01 0x02 0x10
+UMASK_OFFCORE_RESPONSE_0_WB_ANY                     0x01 0x03 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_DATA_RD_ANY          0x01 0x04 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_RFO_ANY              0x01 0x05 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_CODE_RD_ANY          0x01 0x06 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_DATA_RD_ANY          0x01 0x07 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_RFO_ANY              0x01 0x08 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_CODE_RD_ANY          0x01 0x09 0x10
+UMASK_OFFCORE_RESPONSE_0_SPLIT_LOCK_UC_LOCK_ANY     0x01 0x0A 0x10
+UMASK_OFFCORE_RESPONSE_0_STREAMING_STORES_ANY       0x01 0x0B 0x10
+UMASK_OFFCORE_RESPONSE_0_OTHER_ANY                  0x01 0x0F 0x10
+
+EVENT_OFFCORE_RESPONSE_1                            0xBB PMC
+OPTIONS_OFFCORE_RESPONSE_1_OPTIONS                  EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_1_OPTIONS                    0x01 0xFF 0xFF
+UMASK_OFFCORE_RESPONSE_1_DMND_DATA_RD_ANY           0x01 0x00 0x10
+UMASK_OFFCORE_RESPONSE_1_DMND_RFO_ANY               0x01 0x01 0x10
+UMASK_OFFCORE_RESPONSE_1_DMND_CODE_RD_ANY           0x01 0x02 0x10
+UMASK_OFFCORE_RESPONSE_1_WB_ANY                     0x01 0x03 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_DATA_RD_ANY          0x01 0x04 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_RFO_ANY              0x01 0x05 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_CODE_RD_ANY          0x01 0x06 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_DATA_RD_ANY          0x01 0x07 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_RFO_ANY              0x01 0x08 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_CODE_RD_ANY          0x01 0x09 0x10
+UMASK_OFFCORE_RESPONSE_1_SPLIT_LOCK_UC_LOCK_ANY     0x01 0x0A 0x10
+UMASK_OFFCORE_RESPONSE_1_STREAMING_STORES_ANY       0x01 0x0B 0x10
+UMASK_OFFCORE_RESPONSE_1_OTHER_ANY                  0x01 0x0F 0x10
+
diff --git a/src/includes/perfmon_core2.h b/src/includes/perfmon_core2.h
index f737dda..4bdee27 100644
--- a/src/includes/perfmon_core2.h
+++ b/src/includes/perfmon_core2.h
@@ -3,15 +3,16 @@
  *
  *      Filename:  perfmon_core2.h
  *
- *      Description:  Header file of perfmon module for Core 2
+ *      Description:  Header file of perfmon module for Intel Core 2
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -29,155 +30,301 @@
  */
 
 #include <perfmon_core2_events.h>
-#include <perfmon_core2_groups.h>
 #include <perfmon_core2_counters.h>
+#include <error.h>
+
 
 static int perfmon_numCountersCore2 = NUM_COUNTERS_CORE2;
-static int perfmon_numGroupsCore2 = NUM_GROUPS_CORE2;
 static int perfmon_numArchEventsCore2 = NUM_ARCH_EVENTS_CORE2;
 
-void perfmon_init_core2(PerfmonThread *thread)
+int perfmon_init_core2(int cpu_id)
 {
-    uint64_t flags = 0x0ULL;
-    int cpu_id = thread->processorId;
-
-    /* Initialize registers */
-    msr_write(cpu_id, MSR_PERFEVTSEL0, 0x0ULL);
-    msr_write(cpu_id, MSR_PERFEVTSEL1, 0x0ULL);
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PEBS_ENABLE, 0x0ULL));
+    return 0;
+}
 
-    msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, 0x0ULL);
-    msr_write(cpu_id, MSR_PEBS_ENABLE, 0x0ULL);
 
-    /* always initialize fixed counters
-     * FIXED 0: Instructions retired
-     * FIXED 1: Clocks unhalted */
-    msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, 0x22ULL);
+uint32_t core2_fixed_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+    int j;
+    uint32_t flags = (1ULL<<(1+(index*4)));
+    for(j=0;j<event->numberOfOptions;j++)
+    {
+        switch (event->options[j].type)
+        {
+            case EVENT_OPTION_COUNT_KERNEL:
+                flags |= (1ULL<<(index*4));
+                break;
+            default:
+                break;
+        }
+    }
+    return flags;
+}
 
-    /* Preinit of PMC counters */
-    flags |= (1<<16);  /* user mode flag */
-    flags |= (1<<19);  /* pin control flag */
-    flags |= (1<<22);  /* enable flag */
+int core2_pmc_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+    int j;
+    uint64_t flags = 0x0ULL;
 
-    msr_write(cpu_id, MSR_PERFEVTSEL0, flags);
-    msr_write(cpu_id, MSR_PERFEVTSEL1, flags);
+    flags = (1ULL<<22)|(1ULL<<16)|(1ULL<<19);
+    flags |= (event->umask<<8) + event->eventId;
+    if ( event->cfgBits != 0 ) /* set custom cfg and cmask */
+    {
+        flags |= ((event->cmask<<8) + event->cfgBits)<<16;
+    }
+    if (event->numberOfOptions > 0)
+    {
+        for(j=0;j<event->numberOfOptions;j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_COUNT_KERNEL:
+                    flags |= (1ULL<<17);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= ((event->options[j].value & 0xFFULL)<<24);
+                    break;
+                default:
+                    break;
+            }
+        }
+    }
+    VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_PMC);
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+    return 0;
 }
 
-
-void perfmon_setupCounterThread_core2(
-        int thread_id,
-        PerfmonEvent* event,
-        PerfmonCounterIndex index)
+int perfmon_setupCounterThread_core2( int thread_id, PerfmonEventSet* eventSet)
 {
-    uint64_t flags;
-    uint64_t reg = core2_counter_map[index].configRegister;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
-    uint64_t fixed_flags = msr_read(cpu_id, MSR_PERF_FIXED_CTR_CTRL);
-    perfmon_threadData[thread_id].counters[index].init = TRUE;
+    uint64_t fixed_flags = 0x0ULL;
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
-    if ( core2_counter_map[index].type == PMC )
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)))
     {
-        flags = (1<<16)|(1<<19)|(1<<22);
-
-        /* Intel with standard 8 bit event mask: [7:0] */
-        flags |= (event->umask<<8) + event->eventId;
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+    }
 
-        if ( event->cfgBits != 0 ) /* set custom cfg and cmask */
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        RegisterType type = eventSet->events[i].type;
+        if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
         {
-            flags &= ~(0xFFFFU<<16);  /* clear upper 16bits */
-            flags |= ((event->cmask<<8) + event->cfgBits)<<16;
+            continue;
         }
-
-        msr_write(cpu_id, reg , flags);
-
-        if (perfmon_verbose)
+        RegisterIndex index = eventSet->events[i].index;
+        PerfmonEvent *event = &(eventSet->events[i].event);
+        eventSet->events[i].threadCounter[thread_id].init = TRUE;
+        switch (type)
         {
-            printf("[%d] perfmon_setup_counter: Write Register 0x%llX , Flags: 0x%llX \n",
-                    cpu_id,
-                    LLU_CAST reg,
-                    LLU_CAST flags);
+            case PMC:
+                core2_pmc_setup(cpu_id, index, event);
+                break;
+            case FIXED:
+                fixed_flags |= core2_fixed_setup(cpu_id, index, event);
+                break;
+            default:
+                break;
         }
     }
-    else if (core2_counter_map[index].type == FIXED)
+    if (fixed_flags > 0x0ULL)
     {
-        fixed_flags |= (0x2 << (index*4));
-        msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, fixed_flags);
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_FIXED_CTR_CTRL, LLU_CAST fixed_flags, SETUP_FIXED)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_FIXED_CTR_CTRL, fixed_flags));
     }
+    return 0;
 }
 
-void perfmon_startCountersThread_core2(int thread_id)
+int perfmon_startCountersThread_core2(int thread_id, PerfmonEventSet* eventSet)
 {
     uint64_t flags = 0ULL;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
-    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
-
-    for ( int i=0; i<NUM_COUNTERS_CORE2; i++ )
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        if (perfmon_threadData[thread_id].counters[i].init == TRUE)
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
         {
-            msr_write(cpu_id, core2_counter_map[i].counterRegister , 0x0ULL);
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+            {
+                continue;
+            }
+            RegisterIndex index = eventSet->events[i].index;
+            uint64_t counter = counter_map[index].counterRegister;
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter, 0x0ULL));
 
-            if (core2_counter_map[i].type == PMC)
+            if (type == PMC)
             {
-                flags |= (1<<(i-2));  /* enable counter */
+                flags |= (1ULL<<(index - cpuid_info.perf_num_fixed_ctr));  /* enable counter */
             }
-            else if (core2_counter_map[i].type == FIXED)
+            else if (type == FIXED)
             {
-                flags |= (1ULL<<(i+32));  /* enable fixed counter */
+                flags |= (1ULL<<(index + 32));  /* enable fixed counter */
             }
         }
     }
 
-    if (perfmon_verbose)
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
     {
-        printf("perfmon_start_counters: Write Register 0x%X , Flags: 0x%llX \n",
-                MSR_PERF_GLOBAL_CTRL, LLU_CAST flags);
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST flags, UNFREEZE_PMC_AND_FIXED)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, flags));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, (1ULL<<63)|(1ULL<<62)|flags));
     }
-
-    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, flags);
-    msr_write(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, 0x300000003ULL);
+    return 0;
 }
 
-void perfmon_stopCountersThread_core2(int thread_id)
+#define CORE2_CHECK_OVERFLOW(offset) /* offset: bit index tested in MSR_PERF_GLOBAL_STATUS and cleared via MSR_PERF_GLOBAL_OVF_CTRL; expects i, thread_id, cpu_id, eventSet and counter_result in scope */ \
+    if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData) /* freshly read value is below the stored one */ \
+    { \
+        uint64_t ovf_values = 0x0ULL; \
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_STATUS, &ovf_values)); \
+        if (ovf_values & (1ULL<<(offset))) /* argument parenthesized so any expression is shifted as a whole */ \
+        { \
+            eventSet->events[i].threadCounter[thread_id].overflows++; \
+        } \
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, (1ULL<<(offset)))); /* written whenever the value decreased, even if the status bit was not set */ \
+    }
+
+int perfmon_stopCountersThread_core2(int thread_id, PerfmonEventSet* eventSet)
 {
-    uint64_t flags;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
+    uint64_t counter_result;
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
     /* stop counters */
-    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL, FREEZE_PMC_AND_FIXED);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+    }
 
     /* read out counter results */
-    for ( int i=0; i<NUM_COUNTERS_CORE2; i++)
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        if ( perfmon_threadData[thread_id].counters[i].init == TRUE )
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
         {
-            perfmon_threadData[thread_id].counters[i].counterData =
-                msr_read(cpu_id, core2_counter_map[i].counterRegister);
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+            {
+                continue;
+            }
+            counter_result = 0x0ULL;
+            RegisterIndex index = eventSet->events[i].index;
+            uint64_t counter = counter_map[index].counterRegister;
+            switch (type)
+            {
+                case PMC:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &counter_result));
+                    CORE2_CHECK_OVERFLOW(index - cpuid_info.perf_num_fixed_ctr);
+                    VERBOSEPRINTREG(cpu_id, counter, LLU_CAST counter_result, READ_PMC)
+                    break;
+                case FIXED:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &counter_result));
+                    CORE2_CHECK_OVERFLOW(index + 32);
+                    VERBOSEPRINTREG(cpu_id, counter, LLU_CAST counter_result, READ_FIXED)
+                    break;
+                default:
+                    break;
+            }
+            eventSet->events[i].threadCounter[thread_id].counterData = field64(counter_result, 0, box_map[type].regWidth);
         }
     }
 
-    /* check overflow status */
-    flags = msr_read(cpu_id,MSR_PERF_GLOBAL_STATUS);
-    if ( (flags & 0x3) || (flags & (0x3ULL<<32)) )
+    return 0;
+}
+
+int perfmon_readCountersThread_core2(int thread_id, PerfmonEventSet* eventSet) /* reads all initialized PMC/FIXED counters of eventSet for this thread while counting is paused; returns 0 at the end */
+{
+    int cpu_id = groupSet->threads[thread_id].processorId;
+    uint64_t counter_result;
+    uint64_t flags = 0x0ULL; /* saved MSR_PERF_GLOBAL_CTRL value, restored after the read-out */
+
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)))
     {
-        printf ("Overflow occured \n");
-        printf ("Status: 0x%llX \n", LLU_CAST flags);
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, &flags)); /* remember the current enable bits */
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST flags, SAFE_PMC_FLAGS)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL)); /* zero the global control register for the duration of the read */
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL, RESET_PMC_FLAGS)
     }
+
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        counter_result = 0x0ULL;
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
+        {
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type)))) /* skip register types not part of this event set */
+            {
+                continue;
+            }
+            RegisterIndex index = eventSet->events[i].index;
+            uint64_t counter = counter_map[index].counterRegister;
+            switch (type)
+            {
+                case PMC:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &counter_result));
+                    CORE2_CHECK_OVERFLOW(index - cpuid_info.perf_num_fixed_ctr); /* PMC bits start at bit 0, after subtracting the fixed counters */
+                    VERBOSEPRINTREG(cpu_id, counter, LLU_CAST counter_result, READ_PMC)
+                    break;
+                case FIXED:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &counter_result));
+                    CORE2_CHECK_OVERFLOW(index + 32); /* fixed counter bits start at bit 32, as in the start/stop/finalize paths; was "index - 32" (negative shift) */
+                    VERBOSEPRINTREG(cpu_id, counter, LLU_CAST counter_result, READ_FIXED)
+                    break;
+                default:
+                    break;
+            }
+            eventSet->events[i].threadCounter[thread_id].counterData = field64(counter_result, 0, box_map[type].regWidth); /* store only the low regWidth bits */
+        }
+    }
+
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)))
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST flags, RESTORE_PMC_FLAGS)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, flags)); /* write back the enable bits saved above */
+    }
+
+    return 0;
 }
 
-void perfmon_readCountersThread_core2(int thread_id)
+
+int perfmon_finalizeCountersThread_core2(int thread_id, PerfmonEventSet* eventSet)
 {
-    int cpu_id = perfmon_threadData[thread_id].processorId;
+    int cpu_id = groupSet->threads[thread_id].processorId;
+    uint64_t ovf_values_core = (1ULL<<63)|(1ULL<<62);
 
-    for ( int i=0; i<NUM_COUNTERS_CORE2; i++ )
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        if ( perfmon_threadData[thread_id].counters[i].init == TRUE )
+        RegisterType type = eventSet->events[i].type;
+        if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+        {
+            continue;
+        }
+        RegisterIndex index = eventSet->events[i].index;
+        uint64_t reg = counter_map[index].configRegister;
+        eventSet->events[i].threadCounter[thread_id].init = FALSE;
+        if (type == PMC)
+        {
+            ovf_values_core |= (1ULL<<(index - cpuid_info.perf_num_fixed_ctr));
+        }
+        else if (type == FIXED)
+        {
+            ovf_values_core |= (1ULL<<(index + 32));
+        }
+        if ((reg) && ((type == PMC)||(type == FIXED)))
         {
-            perfmon_threadData[thread_id].counters[i].counterData =
-                msr_read(cpu_id, core2_counter_map[i].counterRegister);
+            VERBOSEPRINTREG(cpu_id, reg, LLU_CAST 0x0ULL, CLEAR_CTRL);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg, 0x0ULL));
         }
     }
+    VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, LLU_CAST ovf_values_core, CLEAR_GLOBAL_OVF)
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, ovf_values_core));
+    VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST 0x0ULL, CLEAR_GLOBAL_CTRL)
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+    return 0;
 }
-
diff --git a/src/includes/perfmon_core2_counters.h b/src/includes/perfmon_core2_counters.h
index d6c33fb..9acffd8 100644
--- a/src/includes/perfmon_core2_counters.h
+++ b/src/includes/perfmon_core2_counters.h
@@ -3,15 +3,16 @@
  *
  *      Filename:  perfmon_core2_counters.h
  *
- *      Description:  Counter header file of perfmon module for Core 2
+ *      Description:  Counter header file of perfmon module for Intel Core 2
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -31,13 +32,21 @@
 #define NUM_COUNTERS_CORE2 5
 #define NUM_COUNTERS_CORE_CORE2 5
 
-static PerfmonCounterMap core2_counter_map[NUM_COUNTERS_CORE2] = {
+#define CORE2_VALID_OPTIONS_FIXED EVENT_OPTION_COUNT_KERNEL_MASK
+#define CORE2_VALID_OPTIONS_PMC EVENT_OPTION_EDGE_MASK|EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK
+
+static RegisterMap core2_counter_map[NUM_COUNTERS_CORE2] = {
     /* Fixed Counters: instructions retired, cycles unhalted core */
-    {"FIXC0", PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0},
-    {"FIXC1", PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0},
-    {"FIXC2", PMC2, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR2, 0, 0},
+    {"FIXC0", PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0, CORE2_VALID_OPTIONS_FIXED},
+    {"FIXC1", PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0, CORE2_VALID_OPTIONS_FIXED},
+    {"FIXC2", PMC2, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR2, 0, 0, CORE2_VALID_OPTIONS_FIXED},
     /* PMC Counters: 2 40bit wide */
-    {"PMC0", PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0},
-    {"PMC1", PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0}
+    {"PMC0", PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0, CORE2_VALID_OPTIONS_PMC},
+    {"PMC1", PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0, CORE2_VALID_OPTIONS_PMC},
 };
 
+
+static BoxMap core2_box_map[NUM_UNITS] = {
+    [PMC] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, -1, 0, 0, 40},
+    [FIXED] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, -1, 0, 0, 48},
+};
diff --git a/src/includes/perfmon_core2_events.txt b/src/includes/perfmon_core2_events.txt
index 60c6211..3aa49a2 100644
--- a/src/includes/perfmon_core2_events.txt
+++ b/src/includes/perfmon_core2_events.txt
@@ -1,16 +1,17 @@
 # =======================================================================================
-#  
+#
 #      Filename:  perfmon_core2_events.txt
-# 
+#
 #      Description:  Event list for Intel Core 2
-# 
-#      Version:   3.1.3
-#      Released:  4.11.2014
-# 
-#      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+#
+#      Version:   4.0
+#      Released:  16.6.2015
+#
+#      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+#                Thomas Roehl (tr), thomas.roehl at googlemail.com
 #      Project:  likwid
 #
-#      Copyright (C) 2014 Jan Treibig
+#      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
 #
 #      This program is free software: you can redistribute it and/or modify it under
 #      the terms of the GNU General Public License as published by the Free Software
@@ -29,10 +30,10 @@
 EVENT_INSTR_RETIRED              0x00   FIXC0
 UMASK_INSTR_RETIRED_ANY          0x00
 
-EVENT_CPU_CLK_UNHALTED_CORE      0x00   FIXC1
+EVENT_CPU_CLK_UNHALTED           0x00   FIXC1
 UMASK_CPU_CLK_UNHALTED_CORE      0x00
 
-EVENT_CPU_CLK_UNHALTED_REF       0x00   FIXC2
+EVENT_CPU_CLK_UNHALTED           0x00   FIXC2
 UMASK_CPU_CLK_UNHALTED_REF       0x00
 
 EVENT_LOAD_BLOCK                 0x03   PMC
@@ -47,6 +48,9 @@ UMASK_SB_DRAIN_CYCLES            0x01
 UMASK_STORE_BLOCK_ORDER          0x02
 UMASK_STORE_BLOCK_SNOOP          0x08
 
+EVENT_MISALIGN_MEM_REF           0x05   PMC
+UMASK_MISALIGN_MEM_REF           0x00
+
 EVENT_SEGMENT_REG_LOADS          0x06   PMC
 UMASK_SEGMENT_REG_LOADS          0x00
 
@@ -97,6 +101,10 @@ EVENT_L2_ADS                     0x21   PMC
 UMASK_L2_ADS_ALL_CORES           0xC0
 UMASK_L2_ADS_THIS_CORE           0x40
 
+EVENT_L2_DBUS_BUSY               0x22  PMC
+UMASK_L2_DBUS_BUSY_ALL_CORES     0xC0
+UMASK_L2_DBUS_BUSY_THIS_CORE     0x40
+
 EVENT_L2_DBUS_BUSY_RD            0x23  PMC
 UMASK_L2_DBUS_BUSY_RD_ALL_CORES  0xC0
 UMASK_L2_DBUS_BUSY_RD_THIS_CORE  0x40
@@ -266,7 +274,8 @@ UMASK_L2_NO_REQ_ALL_CORES        0xC0
 UMASK_L2_NO_REQ_THIS_CORE        0x40
 
 EVENT_EIST_TRANS               0x3A  PMC
-UMASK_EIST_TRANS               0x00
+UMASK_EIST_TRANS_ANY           0x00
+UMASK_EIST_TRANS_FREQ          0x01
 
 EVENT_THERMAL_TRIP               0x3B  PMC
 UMASK_THERMAL_TRIP               0xC0
@@ -298,9 +307,11 @@ UMASK_L1D_CACHE_LOCK_INVALID      0x01
 UMASK_L1D_CACHE_LOCK_MESI         0x0F
 UMASK_L1D_CACHE_LOCK_DURATION     0x10
 
-EVENT_L1D                        0x43  PMC
+EVENT_L1D_ALL                    0x43  PMC
 UMASK_L1D_ALL_REF                0x01
-UMASK_L1D_ALL_CACHE_REF          0x02
+
+EVENT_L1D_CACHE                  0x44  PMC
+UMASK_L1D_CACHE_REF              0x02
 
 EVENT_L1D_REPL                   0x45  PMC
 UMASK_L1D_REPL                   0x0F
@@ -322,6 +333,7 @@ EVENT_SSE_PRE_MISS               0x4B  PMC
 UMASK_SSE_PRE_MISS_NTA           0x00
 UMASK_SSE_PRE_MISS_L1            0x01
 UMASK_SSE_PRE_MISS_L2            0x02
+UMASK_SSE_PRE_MISS_ALL_CACHES    0x03
 
 EVENT_LOAD_HIT_PRE              0x4C  PMC
 UMASK_LOAD_HIT_PRE              0x00
@@ -329,6 +341,9 @@ UMASK_LOAD_HIT_PRE              0x00
 EVENT_L1D_PREFETCH_REQUESTS     0x4E  PMC
 UMASK_L1D_PREFETCH_REQUESTS     0x10
 
+EVENT_L1D_PREFETCH_DCU_MISSES   0x4F  PMC
+UMASK_L1D_PREFETCH_DCU_MISSES   0x00
+
 EVENT_BUS_REQUEST_OUTSTANDING    0x60  PMC
 UMASK_BUS_REQUEST_OUTSTANDING_ALL_CORES_THIS_A    0xC0
 UMASK_BUS_REQUEST_OUTSTANDING_ALL_CORES_ALL_A     0xE0
@@ -425,6 +440,21 @@ UMASK_BUS_TRANS_ANY_ALL_CORES_ALL_A     0xE0
 UMASK_BUS_TRANS_ANY_THIS_CORE_THIS_A    0x40
 UMASK_BUS_TRANS_ANY_THIS_CORE_ALL_A     0x60
 
+EVENT_BUS_DCU_SNOOP_TO_SHARE     0x78  PMC
+UMASK_BUS_DCU_SNOOP_TO_SHARE_ALL_CORES_THIS_A       0xC1
+UMASK_BUS_DCU_SNOOP_TO_SHARE_ALL_CORES_ALL_A        0xE1
+UMASK_BUS_DCU_SNOOP_TO_SHARE_THIS_CORE_THIS_A       0x41
+UMASK_BUS_DCU_SNOOP_TO_SHARE_THIS_CORE_ALL_A        0x61
+
+EVENT_BUS_DCU_SNOOP_TO_SHARE     0x7D  PMC
+UMASK_BUS_DCU_SNOOP_TO_SHARE_ALL_CORES_THIS_A       0xC0
+UMASK_BUS_DCU_SNOOP_TO_SHARE_ALL_CORES_ALL_A        0xE0
+UMASK_BUS_DCU_SNOOP_TO_SHARE_THIS_CORE_THIS_A       0x40
+UMASK_BUS_DCU_SNOOP_TO_SHARE_THIS_CORE_ALL_A        0x60
+
+EVENT_BUS_SNOOP_STALLED          0x7E  PMC
+UMASK_BUS_SNOOP_STALLED          0x00
+
 EVENT_L1I_READS                  0x80  PMC
 UMASK_L1I_READS                  0x00
 
diff --git a/src/includes/perfmon_haswell.h b/src/includes/perfmon_haswell.h
index 57f12af..4935639 100644
--- a/src/includes/perfmon_haswell.h
+++ b/src/includes/perfmon_haswell.h
@@ -3,15 +3,16 @@
  *
  *      Filename:  perfmon_haswell.h
  *
- *      Description:  Header File of perfmon module for Haswell.
+ *      Description:  Header File of perfmon module for Intel Haswell.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -28,382 +29,1732 @@
  * =======================================================================================
  */
 
+#include <perfmon_haswellEP_events.h>
 #include <perfmon_haswell_events.h>
-#include <perfmon_haswell_groups.h>
+#include <perfmon_haswellEP_counters.h>
 #include <perfmon_haswell_counters.h>
+#include <error.h>
+#include <affinity.h>
+#include <limits.h>
+#include <topology.h>
+#include <access.h>
 
+
+static int perfmon_numCountersHaswellEP = NUM_COUNTERS_HASWELL_EP;
+static int perfmon_numCoreCountersHaswellEP = NUM_COUNTERS_CORE_HASWELL_EP;
+static int perfmon_numArchEventsHaswellEP = NUM_ARCH_EVENTS_HASWELLEP;
 static int perfmon_numCountersHaswell = NUM_COUNTERS_HASWELL;
-static int perfmon_numGroupsHaswell = NUM_GROUPS_HASWELL;
+static int perfmon_numCoreCountersHaswell = NUM_COUNTERS_CORE_HASWELL;
 static int perfmon_numArchEventsHaswell = NUM_ARCH_EVENTS_HASWELL;
 
 
-#define OFFSET_PMC 3
+int perfmon_init_haswell(int cpu_id) /* Per-CPU init: calls lock_acquire on this CPU's tile and socket lock entries, then clears MSR_PEBS_ENABLE; returns 0 at the end. */
+{
+    lock_acquire((int*) &tile_lock[affinity_thread2tile_lookup[cpu_id]], cpu_id); /* lock entry is selected by the tile this CPU maps to */
+    lock_acquire((int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id); /* the box setup functions in this file compare this socket_lock entry against cpu_id */
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PEBS_ENABLE, 0x0ULL)); /* write 0 to MSR_PEBS_ENABLE */
+    return 0;
+}
+
+
+uint32_t hasep_fixed_setup(RegisterIndex index, PerfmonEvent *event) /* Builds and returns the control bits for fixed counter 'index'; every bit is placed at an offset of index*4. */
+{
+    int j;
+    uint32_t flags = (1ULL<<(1+(index*4))); /* bit 1 of the counter's field is always set (presumably the user-mode enable -- confirm against the SDM) */
+    for(j=0;j<event->numberOfOptions;j++)
+    {
+        switch (event->options[j].type)
+        {
+            case EVENT_OPTION_COUNT_KERNEL:
+                flags |= (1ULL<<(index*4)); /* bit 0 of the field */
+                break;
+            case EVENT_OPTION_ANYTHREAD:
+                flags |= (1ULL<<(2+(index*4))); /* bit 2 of the field; no break here, falls through to default which only breaks */
+            default:
+                break;
+        }
+    }
+    return flags;
+}
+
+int hasep_pmc_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event) /* Writes the config register of counter 'index' for 'event' on cpu_id, plus MSR_OFFCORE_RESP0/1 for event ids 0xB7/0xBB; returns 0 at the end. */
+{
+    int j;
+    uint64_t flags = 0x0ULL;
+    uint64_t offcore_flags = 0x0ULL;
+    uint64_t latency_flags = 0x0ULL; /* not used anywhere in this function */
+
+    flags = (1ULL<<22)|(1ULL<<16); /* bits 22 and 16 are always set (bit 22 is called "enable" elsewhere in this file; bit 16 presumably user-mode counting -- confirm) */
+    /* Intel with standard 8 bit event mask: [7:0] */
+    flags |= (event->umask<<8) + event->eventId;
+
+    /* set custom cfg and cmask */
+    if ((event->cfgBits != 0) &&
+        (event->eventId != 0xB7) &&
+        (event->eventId != 0xBB)) /* for 0xB7/0xBB cfgBits and cmask are used as bit positions for the offcore MSR below */
+    {
+        flags |= ((event->cmask<<8) + event->cfgBits)<<16;
+    }
+
+    if (event->numberOfOptions > 0)
+    {
+        for(j = 0; j < event->numberOfOptions; j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_COUNT_KERNEL:
+                    flags |= (1ULL<<17);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_ANYTHREAD:
+                    flags |= (1ULL<<21);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= (event->options[j].value & 0xFFULL) << 24; /* 8-bit threshold at bits 31:24 */
+                    break;
+                case EVENT_OPTION_IN_TRANS:
+                    flags |= (1ULL<<32);
+                    break;
+                case EVENT_OPTION_IN_TRANS_ABORT:
+                    flags |= (1ULL<<33);
+                    break;
+                case EVENT_OPTION_MATCH0:
+                    offcore_flags |= (event->options[j].value & 0x8FFFULL); /* collected for the offcore MSR, not for the config register */
+                    break;
+                case EVENT_OPTION_MATCH1:
+                    offcore_flags |= (event->options[j].value<< 16); /* collected for the offcore MSR, not for the config register */
+                    break;
+                default:
+                    break;
+            }
+        }
+    }
+
+    if (event->eventId == 0xB7)
+    {
+        if ((event->cfgBits != 0xFF) && (event->cmask != 0xFF))
+        {
+            offcore_flags = (1ULL<<event->cfgBits)|(1ULL<<event->cmask); /* replaces anything gathered from MATCH0/MATCH1 options */
+        }
+        VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP0, LLU_CAST offcore_flags, SETUP_PMC_OFFCORE);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP0, offcore_flags));
+    }
+    else if (event->eventId == 0xBB)
+    {
+        if ((event->cfgBits != 0xFF) && (event->cmask != 0xFF))
+        {
+            offcore_flags = (1ULL<<event->cfgBits)|(1ULL<<event->cmask); /* replaces anything gathered from MATCH0/MATCH1 options */
+        }
+        VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP1, LLU_CAST offcore_flags, SETUP_PMC_OFFCORE);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP1, offcore_flags));
+    }
+    VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_PMC)
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister , flags)); /* the config register is written last, after any offcore MSR */
+    return 0;
+}
+
+int hasep_cbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event) /* Programs the config register and the two filter registers of the box that counter 'index' belongs to; returns 0 at the end. */
+{
+    int j;
+    uint64_t flags = 0x0ULL;
+    uint64_t filter_flags;
+    uint32_t filter0 = box_map[counter_map[index].type].filterRegister1;
+    uint32_t filter1 = box_map[counter_map[index].type].filterRegister2;
+    int set_state_all = 0;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id) /* only the CPU recorded in the socket lock entry programs this box */
+    {
+        return 0;
+    }
+
+    flags = (1ULL<<22); /* bit 22 is called "enable" elsewhere in this file */
+    flags |= (event->umask<<8) + event->eventId;
+    if (event->eventId == 0x34) /* event 0x34 gets a default state filter unless a STATE option is given */
+    {
+        set_state_all = 1;
+    }
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, filter0, 0x0ULL)); /* start from cleared filters; the options below read-modify-write them */
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, filter1, 0x0ULL));
+    if (event->numberOfOptions > 0)
+    {
+        for(j = 0; j < event->numberOfOptions; j++)
+        {
+            filter_flags = 0x0ULL;
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= (event->options[j].value & 0xFFULL) << 24;
+                    break;
+                case EVENT_OPTION_OPCODE:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, filter1, &filter_flags));
+                    filter_flags |= (0x3<<27); /* bits 28:27 are set together with the opcode field at bit 20 */
+                    filter_flags |= (extractBitField(event->options[j].value,5,0) << 20);
+                    VERBOSEPRINTREG(cpu_id, filter1, filter_flags, SETUP_CBOX_FILTER_OPCODE);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, filter1, filter_flags));
+                    break;
+                case EVENT_OPTION_NID:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, filter1, &filter_flags));
+                    filter_flags |= (extractBitField(event->options[j].value,16,0));
+                    VERBOSEPRINTREG(cpu_id, filter1, filter_flags, SETUP_CBOX_FILTER_NID);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, filter1, filter_flags));
+                    break;
+                case EVENT_OPTION_STATE:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, filter0, &filter_flags));
+                    filter_flags |= (extractBitField(event->options[j].value,6,0) << 17);
+                    VERBOSEPRINTREG(cpu_id, filter0, filter_flags, SETUP_CBOX_FILTER_STATE);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, filter0, filter_flags));
+                    set_state_all = 0; /* an explicit state filter suppresses the default for event 0x34 */
+                    break;
+                case EVENT_OPTION_TID:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, filter0, &filter_flags));
+                    filter_flags |= (extractBitField(event->options[j].value,6,0));
+                    VERBOSEPRINTREG(cpu_id, filter0, filter_flags, SETUP_CBOX_FILTER_TID);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, filter0, filter_flags));
+                    flags |= (1ULL<<19); /* the TID option also sets bit 19 of the config register */
+                    break;
+                default:
+                    break;
+            }
+        }
+    }
+    else
+    {
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, filter0, 0x0ULL)); /* repeats the clearing already done before the option check */
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, filter1, 0x0ULL));
+    }
+    if (set_state_all)
+    {
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, filter0, &filter_flags));
+        filter_flags |= (0x1F << 17); /* default state filter: five bits starting at bit 17 */
+        VERBOSEPRINTREG(cpu_id, filter0, filter_flags, SETUP_CBOX_DEF_FILTER_STATE);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, filter0, filter_flags));
+    }
+    VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_CBOX);
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags)); /* config register is written after all filter updates */
+    return 0;
+}
+
+int hasep_ubox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event) /* Builds and writes the config register of counter 'index' via MSR_DEV; returns 0 at the end. */
+{
+    int j;
+    uint64_t flags = 0x0ULL;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id) /* only the CPU recorded in the socket lock entry programs this box */
+    {
+        return 0;
+    }
+
+    flags = (1ULL<<22)|(1ULL<<20); /* bit 22 is called "enable" elsewhere in this file; bit 20 is always set as well */
+    flags |= (event->umask<<8) + event->eventId;
+    if (event->numberOfOptions > 0)
+    {
+        for(j = 0; j < event->numberOfOptions; j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= (event->options[j].value & 0x1FULL) << 24; /* threshold is masked to 5 bits here */
+                    break;
+                default:
+                    break;
+            }
+        }
+    }
+    VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_UBOX);
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+    return 0;
+}
+
+int hasep_wbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event) /* Builds and writes the config register of counter 'index' via MSR_DEV, plus the box filter register for OCCUPANCY_FILTER; returns 0 at the end. */
+{
+    int j;
+    uint64_t flags = 0x0ULL;
+    uint64_t filter = box_map[counter_map[index].type].filterRegister1;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id) /* only the CPU recorded in the socket lock entry programs this box */
+    {
+        return 0;
+    }
+
+    flags = (1ULL<<22)|(1ULL<<20); /* bit 22 is called "enable" elsewhere in this file; bit 20 is always set as well */
+    flags |= event->eventId;
+    if ((event->umask > 0x00) && (event->umask <= 0x3)) /* umask 1..3 goes to bits 15:14, the field the OCCUPANCY option also uses */
+    {
+        flags |= (event->umask << 14);
+    }
+    else if (event->umask == 0xFF) /* umask 0xFF is a marker that requests bit 21 */
+    {
+        flags |= (1ULL<<21); /* OR the bit in: a plain assignment would drop the enable bit and event id set above */
+    }
+    if (event->numberOfOptions > 0)
+    {
+        for(j = 0; j < event->numberOfOptions; j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= (event->options[j].value & 0x1FULL) << 24; /* threshold is masked to 5 bits here */
+                    break;
+                case EVENT_OPTION_OCCUPANCY:
+                    flags |= ((event->options[j].value & 0x3ULL)<<14);
+                    break;
+                case EVENT_OPTION_OCCUPANCY_FILTER:
+                    VERBOSEPRINTREG(cpu_id, filter, (event->options[j].value & 0xFFFFFFFFULL), SETUP_WBOX_FILTER);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, filter, (event->options[j].value & 0xFFFFFFFFULL))); /* low 32 bits of the option value go to the filter register */
+                    break;
+                case EVENT_OPTION_OCCUPANCY_EDGE:
+                    flags |= (1ULL<<31);
+                    break;
+                case EVENT_OPTION_OCCUPANCY_INVERT:
+                    flags |= (1ULL<<30);
+                    break;
+                default:
+                    break;
+            }
+        }
+    }
+    VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_WBOX);
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+    return 0;
+}
+
+
+int hasep_bbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event) /* Programs the config register of counter 'index' and the opcode/address match registers on its PCI device; returns -ENODEV if the device check fails, 0 at the end. */
+{
+    int j;
+    uint64_t flags = 0x0ULL;
+    uint64_t filter = 0x0ULL;
+    int opcode_flag = 0;
+    int match_flag = 0;
+    PciDeviceIndex dev = counter_map[index].device;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id) /* only the CPU recorded in the socket lock entry programs this box */
+    {
+        return 0;
+    }
+    if (!pci_checkDevice(dev, cpu_id))
+    {
+        return -ENODEV;
+    }
+
+    flags |= (1ULL<<20);
+    flags |= (event->umask<<8) + event->eventId;
+    if (event->numberOfOptions > 0)
+    {
+        for(j = 0; j < event->numberOfOptions; j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= (event->options[j].value & 0xFFULL) << 24;
+                    break;
+                case EVENT_OPTION_OPCODE:
+                    VERBOSEPRINTPCIREG(cpu_id, dev, PCI_UNC_HA_PMON_OPCODEMATCH,
+                                        (event->options[j].value & 0x3FULL), SETUP_BBOX_OPCODE);
+                    CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, PCI_UNC_HA_PMON_OPCODEMATCH,
+                                        (event->options[j].value & 0x3FULL)));
+                    opcode_flag = 1;
+                    break;
+                case EVENT_OPTION_MATCH0: /* one 64-bit option value is split across the two address match registers */
+                    filter = ((event->options[j].value & 0xFFFFFFC0ULL));
+                    VERBOSEPRINTPCIREG(cpu_id, dev, PCI_UNC_HA_PMON_ADDRMATCH0, filter, SETUP_ADDR0_FILTER);
+                    CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, PCI_UNC_HA_PMON_ADDRMATCH0, filter)); /* PCI write, so use the PCI error check like every other write here */
+                    filter = (((event->options[j].value>>32) & 0x3FFFULL));
+                    VERBOSEPRINTPCIREG(cpu_id, dev, PCI_UNC_HA_PMON_ADDRMATCH1, filter, SETUP_ADDR1_FILTER);
+                    CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, PCI_UNC_HA_PMON_ADDRMATCH1, filter)); /* PCI write, so use the PCI error check like every other write here */
+                    match_flag = 1;
+                    break;
+                default:
+                    break;
+            }
+        }
+    }
+    if (!opcode_flag) /* no OPCODE option given: clear the opcode match register */
+    {
+        VERBOSEPRINTPCIREG(cpu_id, dev, PCI_UNC_HA_PMON_OPCODEMATCH, 0x0ULL, CLEAR_BBOX_OPCODE);
+        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, PCI_UNC_HA_PMON_OPCODEMATCH, 0x0ULL));
+    }
+    if (!match_flag) /* no MATCH0 option given: clear both address match registers */
+    {
+        VERBOSEPRINTPCIREG(cpu_id, dev, PCI_UNC_HA_PMON_ADDRMATCH0, 0x0ULL, CLEAR_BBOX_MATCH0);
+        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, PCI_UNC_HA_PMON_ADDRMATCH0, 0x0ULL));
+        VERBOSEPRINTPCIREG(cpu_id, dev, PCI_UNC_HA_PMON_ADDRMATCH1, 0x0ULL, CLEAR_BBOX_MATCH1);
+        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, PCI_UNC_HA_PMON_ADDRMATCH1, 0x0ULL));
+    }
+    VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].configRegister, flags, SETUP_BBOX);
+    CHECK_PCI_WRITE_ERROR(HPMwrite( cpu_id, dev, counter_map[index].configRegister, flags));
+    /* Intel notes the registers must be written twice to hold, once without enable and again with enable.
+     * Not mentioned for the BBOX but we do it to be sure.
+     */
+    flags |= (1ULL<<22);
+    VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_BBOX_TWICE);
+    CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, flags));
+    return 0;
+}
+
+int hasep_sbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event) /* Writes the config register of counter 'index' twice (without, then with bit 22) on its device; returns -ENODEV if the device check fails, 0 at the end. */
+{
+    int j;
+    uint64_t flags = 0x0ULL;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id) /* only the CPU recorded in the socket lock entry programs this box */
+    {
+        return 0;
+    }
+    if (!pci_checkDevice(counter_map[index].device, cpu_id))
+    {
+        return -ENODEV;
+    }
+
+    flags |= (event->umask<<8) + event->eventId;
+    if (event->numberOfOptions > 0)
+    {
+        for(j = 0; j < event->numberOfOptions; j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_TID:
+                    flags |= (1ULL<<19); /* only the bit is set; the option value itself is not used here */
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= ((event->options[j].value & 0xFFULL)<<24);
+                    break;
+
+                default:
+                    break;
+            }
+        }
+    }
+    VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_SBOX);
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, counter_map[index].device, counter_map[index].configRegister, flags)); /* first write, bit 22 still clear */
+    flags |= (1ULL<<22); /* bit 22 is called "enable" in the sibling setup functions */
+    VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_SBOX_TWICE);
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, counter_map[index].device, counter_map[index].configRegister, flags)); /* second write, now with bit 22 */
+    return 0;
+}
+
+
+int hasep_mbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event) /* Writes the config register of counter 'index' twice (without, then with the enable bit) on its PCI device; returns -ENODEV if the device check fails, 0 at the end. */
+{
+    int j;
+    uint64_t flags = 0x0ULL;
+    PciDeviceIndex dev = counter_map[index].device;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id) /* only the CPU recorded in the socket lock entry programs this box */
+    {
+        return 0;
+    }
+    if (!pci_checkDevice(dev, cpu_id))
+    {
+        return -ENODEV;
+    }
+
+    flags = (1ULL<<20); /* bit 20 is always set; the enable bit 22 is added only for the second write */
+    flags |= (event->umask<<8) + event->eventId;
+    if (event->numberOfOptions > 0)
+    {
+        for(j = 0; j < event->numberOfOptions; j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= (event->options[j].value & 0xFFULL) << 24;
+                    break;
+                default:
+                    break;
+            }
+        }
+    }
+    VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].configRegister, flags, SETUP_MBOX);
+    CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, flags));
+    /* Intel notes the registers must be written twice to hold, once without enable and again with enable */
+    flags |= (1ULL<<22);
+    VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_MBOX_TWICE);
+    CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, flags));
+    return 0;
+}
+
+int hasep_ibox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event) /* Writes the config register of counter 'index' twice (without, then with the enable bit) on its PCI device; returns -ENODEV if the device check fails, 0 at the end. */
+{
+    int j;
+    uint64_t flags = 0x0ULL;
+    PciDeviceIndex dev = counter_map[index].device;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id) /* only the CPU recorded in the socket lock entry programs this box */
+    {
+        return 0;
+    }
+    if (!pci_checkDevice(counter_map[index].device, cpu_id)) /* same device as 'dev' */
+    {
+        return -ENODEV;
+    }
+
+    flags = (1ULL<<20); /* bit 20 is always set; the enable bit 22 is added only for the second write */
+    flags |= (event->umask<<8) + event->eventId;
+    if (event->numberOfOptions > 0)
+    {
+        for(j = 0; j < event->numberOfOptions; j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= (event->options[j].value & 0xFFULL) << 24;
+                    break;
+                default:
+                    break;
+            }
+        }
+    }
+    VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].configRegister, flags, SETUP_IBOX);
+    CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, flags));
+    /* Intel notes the registers must be written twice to hold, once without enable and again with enable */
+    flags |= (1ULL<<22);
+    VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_IBOX_TWICE);
+    CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, flags));
+    return 0;
+}
+
+
+int hasep_pbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event) /* Writes the config register of counter 'index' twice (without, then with the enable bit) on its PCI device; returns -ENODEV if the device check fails, 0 at the end. */
+{
+    int j;
+    uint64_t flags = 0x0ULL;
+    PciDeviceIndex dev = counter_map[index].device;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id) /* only the CPU recorded in the socket lock entry programs this box */
+    {
+        return 0;
+    }
+    if (!pci_checkDevice(counter_map[index].device, cpu_id)) /* same device as 'dev' */
+    {
+        return -ENODEV;
+    }
+
+    flags = (1ULL<<20); /* bit 20 is always set; the enable bit 22 is added only for the second write */
+    flags |= (event->umask<<8) + event->eventId;
+    if (event->numberOfOptions > 0)
+    {
+        for(j = 0; j < event->numberOfOptions; j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= (event->options[j].value & 0xFFULL) << 24;
+                    break;
+                default:
+                    break;
+            }
+        }
+    }
+    VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].configRegister, flags, SETUP_PBOX);
+    CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, flags));
+    /* Intel notes the registers must be written twice to hold, once without enable and again with enable */
+    flags |= (1ULL<<22);
+    VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_PBOX_TWICE);
+    CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, flags));
+    return 0;
+}
+
+int hasep_rbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event) /* Writes the config register of counter 'index' twice (without, then with the enable bit) on its PCI device; returns -ENODEV if the device check fails, 0 at the end. */
+{
+    int j;
+    uint64_t flags = 0x0ULL;
+    PciDeviceIndex dev = counter_map[index].device;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id) /* only the CPU recorded in the socket lock entry programs this box */
+    {
+        return 0;
+    }
+    if (!pci_checkDevice(counter_map[index].device, cpu_id)) /* same device as 'dev' */
+    {
+        return -ENODEV;
+    }
+
+    flags = (1ULL<<20); /* bit 20 is always set; the enable bit 22 is added only for the second write */
+    flags |= (event->umask<<8) + event->eventId;
+    if (event->numberOfOptions > 0)
+    {
+        for(j = 0; j < event->numberOfOptions; j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= (event->options[j].value & 0xFFULL) << 24;
+                    break;
+                default:
+                    break;
+            }
+        }
+    }
+    VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].configRegister, flags, SETUP_RBOX); /* label names this box, so debug output is not attributed to the PBOX */
+    CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, flags));
+    /* Intel notes the registers must be written twice to hold, once without enable and again with enable */
+    flags |= (1ULL<<22);
+    VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_RBOX_TWICE);
+    CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, flags));
+    return 0;
+}
+
+int hasep_qbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event, PciDeviceIndex filterdev) /* Programs the config register of counter 'index' on its PCI device and the RX/TX match and mask registers on 'filterdev'; returns -ENODEV if the counter device check fails, 0 at the end. */
+{
+    int j;
+    uint64_t flags = 0x0ULL;
+    uint64_t filterreg;
+    uint64_t filterval = 0x0ULL;
+    PciDeviceIndex dev = counter_map[index].device;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id) /* only the CPU recorded in the socket lock entry programs this box */
+    {
+        return 0;
+    }
+    if (!pci_checkDevice(counter_map[index].device, cpu_id)) /* same device as 'dev' */
+    {
+        return -ENODEV;
+    }
+
+    flags = (1ULL<<20); /* bit 20 is always set; the enable bit 22 is added only for the second write */
+    flags |= (event->umask<<8) + event->eventId;
+    if (event->cfgBits == 0x01) /* cfgBits 0x01 requests bit 21 of the config register */
+    {
+        flags |= (1ULL<<21);
+    }
+    if (event->numberOfOptions > 0)
+    {
+        for(j = 0; j < event->numberOfOptions; j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= (event->options[j].value & 0xFFULL) << 24;
+                    break;
+                case EVENT_OPTION_MATCH0: /* all filter writes below target filterdev, the device that was checked and logged */
+                    if (pci_checkDevice(filterdev, cpu_id))
+                    {
+                        filterreg = PCI_UNC_V3_QPI_PMON_RX_MATCH_0;
+                        filterval = event->options[j].value & 0x8003FFF8ULL;
+                        VERBOSEPRINTPCIREG(cpu_id, filterdev, filterreg, filterval, SETUP_SBOX_RX_MATCH0);
+                        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, filterdev, filterreg, filterval));
+                    }
+                    else
+                    {
+                        DEBUG_PRINT(DEBUGLEV_ONLY_ERROR, Filtering for counter %s cannot be applied. PCI device not available, counter_map[index].key);
+                    }
+                    break;
+                case EVENT_OPTION_MATCH1:
+                    if (pci_checkDevice(filterdev, cpu_id))
+                    {
+                        filterreg = PCI_UNC_V3_QPI_PMON_RX_MATCH_1;
+                        filterval = event->options[j].value & 0x000F000FULL;
+                        VERBOSEPRINTPCIREG(cpu_id, filterdev, filterreg, filterval, SETUP_SBOX_RX_MATCH1);
+                        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, filterdev, filterreg, filterval));
+                    }
+                    else
+                    {
+                        DEBUG_PRINT(DEBUGLEV_ONLY_ERROR, Filtering for counter %s cannot be applied. PCI device not available, counter_map[index].key);
+                    }
+                    break;
+                case EVENT_OPTION_MATCH2:
+                    if (pci_checkDevice(filterdev, cpu_id))
+                    {
+                        filterreg = PCI_UNC_V3_QPI_PMON_TX_MATCH_0;
+                        filterval = event->options[j].value & 0x8003FFF8ULL;
+                        VERBOSEPRINTPCIREG(cpu_id, filterdev, filterreg, filterval, SETUP_SBOX_TX_MATCH0);
+                        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, filterdev, filterreg, filterval));
+                    }
+                    else
+                    {
+                        DEBUG_PRINT(DEBUGLEV_ONLY_ERROR, Filtering for counter %s cannot be applied. PCI device not available, counter_map[index].key);
+                    }
+                    break;
+                case EVENT_OPTION_MATCH3:
+                    if (pci_checkDevice(filterdev, cpu_id))
+                    {
+                        filterreg = PCI_UNC_V3_QPI_PMON_TX_MATCH_1;
+                        filterval = event->options[j].value & 0x000F000FULL;
+                        VERBOSEPRINTPCIREG(cpu_id, filterdev, filterreg, filterval, SETUP_SBOX_TX_MATCH1);
+                        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, filterdev, filterreg, filterval));
+                    }
+                    else
+                    {
+                        DEBUG_PRINT(DEBUGLEV_ONLY_ERROR, Filtering for counter %s cannot be applied. PCI device not available, counter_map[index].key);
+                    }
+                    break;
+                case EVENT_OPTION_MASK0:
+                    if (pci_checkDevice(filterdev, cpu_id))
+                    {
+                        filterreg = PCI_UNC_V3_QPI_PMON_RX_MASK_0;
+                        filterval = event->options[j].value & 0x8003FFF8ULL;
+                        VERBOSEPRINTPCIREG(cpu_id, filterdev, filterreg, filterval, SETUP_SBOX_RX_MASK0);
+                        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, filterdev, filterreg, filterval));
+                    }
+                    else
+                    {
+                        DEBUG_PRINT(DEBUGLEV_ONLY_ERROR, Filtering for counter %s cannot be applied. PCI device not available, counter_map[index].key);
+                    }
+                    break;
+                case EVENT_OPTION_MASK1:
+                    if (pci_checkDevice(filterdev, cpu_id))
+                    {
+                        filterreg = PCI_UNC_V3_QPI_PMON_RX_MASK_1;
+                        filterval = event->options[j].value & 0x000F000FULL;
+                        VERBOSEPRINTPCIREG(cpu_id, filterdev, filterreg, filterval, SETUP_SBOX_RX_MASK1);
+                        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, filterdev, filterreg, filterval));
+                    }
+                    else
+                    {
+                        DEBUG_PRINT(DEBUGLEV_ONLY_ERROR, Filtering for counter %s cannot be applied. PCI device not available, counter_map[index].key);
+                    }
+                    break;
+                case EVENT_OPTION_MASK2:
+                    if (pci_checkDevice(filterdev, cpu_id))
+                    {
+                        filterreg = PCI_UNC_V3_QPI_PMON_TX_MASK_0;
+                        filterval = event->options[j].value & 0x8003FFF8ULL;
+                        VERBOSEPRINTPCIREG(cpu_id, filterdev, filterreg, filterval, SETUP_SBOX_TX_MASK0);
+                        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, filterdev, filterreg, filterval));
+                    }
+                    else
+                    {
+                        DEBUG_PRINT(DEBUGLEV_ONLY_ERROR, Filtering for counter %s cannot be applied. PCI device not available, counter_map[index].key);
+                    }
+                    break;
+                case EVENT_OPTION_MASK3:
+                    if (pci_checkDevice(filterdev, cpu_id))
+                    {
+                        filterreg = PCI_UNC_V3_QPI_PMON_TX_MASK_1; /* MASK3 has its own register; TX_MASK_0 belongs to MASK2 */
+                        filterval = event->options[j].value & 0x000F000FULL;
+                        VERBOSEPRINTPCIREG(cpu_id, filterdev, filterreg, filterval, SETUP_SBOX_TX_MASK1);
+                        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, filterdev, filterreg, filterval));
+                    }
+                    else
+                    {
+                        DEBUG_PRINT(DEBUGLEV_ONLY_ERROR, Filtering for counter %s cannot be applied. PCI device not available, counter_map[index].key);
+                    }
+                    break;
+                default:
+                    break;
+            }
+        }
+    }
+    VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].configRegister, flags, SETUP_QBOX);
+    CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, flags));
+    /* Intel notes the registers must be written twice to hold, once without enable and again with enable */
+    flags |= (1ULL<<22);
+    VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_QBOX_TWICE);
+    CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, flags));
+    return 0;
+}
+
+#define HASEP_FREEZE_UNCORE /* expects haveLock, eventSet and cpu_id in the expanding scope; expands to a bare if-block, so avoid using it directly before an else */ \
+    if (haveLock && eventSet->regTypeMask & ~(0xFULL)) /* any register type bit above the low four */ \
+    { \
+        VERBOSEPRINTREG(cpu_id, MSR_UNC_V3_U_PMON_GLOBAL_CTL, LLU_CAST (1ULL<<31), FREEZE_UNCORE); \
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNC_V3_U_PMON_GLOBAL_CTL, (1ULL<<31))); /* bit 31 of the global control MSR is the freeze request */ \
+    }
+
+#define HASEP_UNFREEZE_UNCORE /* expects haveLock, eventSet and cpu_id in the expanding scope; expands to a bare if-block, so avoid using it directly before an else */ \
+    if (haveLock && eventSet->regTypeMask & ~(0xFULL)) /* any register type bit above the low four */ \
+    { \
+        VERBOSEPRINTREG(cpu_id, MSR_UNC_V3_U_PMON_GLOBAL_CTL, LLU_CAST (1ULL<<29), UNFREEZE_UNCORE); \
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNC_V3_U_PMON_GLOBAL_CTL, (1ULL<<29))); /* bit 29 of the global control MSR is the unfreeze request */ \
+    }
+
+#define HASEP_UNFREEZE_UNCORE_AND_RESET_CTR /* zeroes the counter registers of the selected events in the set, then unfreezes; expects haveLock, eventSet and cpu_id in the expanding scope */ \
+    if (haveLock && (eventSet->regTypeMask & ~(0xFULL))) \
+    { \
+        for (int i=0;i < eventSet->numberOfEvents;i++) \
+        { \
+            RegisterIndex index = eventSet->events[i].index; \
+            RegisterType type = counter_map[index].type; \
+            if ((type < UNCORE) || (type == WBOX0FIX)) /* skip types below UNCORE and the WBOX0FIX counters */ \
+            { \
+                continue; \
+            } \
+            PciDeviceIndex dev = counter_map[index].device; \
+            if (pci_checkDevice(dev, cpu_id)) { \
+                VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].counterRegister, 0x0ULL, CLEAR_CTR_MANUAL); \
+                CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].counterRegister, 0x0ULL)); \
+                if (counter_map[index].counterRegister2 != 0x0) /* a second counter register, when present, is cleared too */ \
+                { \
+                    VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].counterRegister2, 0x0ULL, CLEAR_CTR_MANUAL); \
+                    CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].counterRegister2, 0x0ULL)); \
+                } \
+            } \
+        } \
+        VERBOSEPRINTREG(cpu_id, MSR_UNC_V3_U_PMON_GLOBAL_CTL, LLU_CAST (1ULL<<29), UNFREEZE_UNCORE); \
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNC_V3_U_PMON_GLOBAL_CTL, (1ULL<<29))); /* unfreeze only after all counters were cleared */ \
+    }
+
+#define HASEP_FREEZE_UNCORE_AND_RESET_CTL /* Freeze the uncore, then zero the config registers of the uncore events; expects haveLock, eventSet and cpu_id in the expanding scope. */ \
+    if (haveLock && (eventSet->regTypeMask & ~(REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)|REG_TYPE_MASK(THERMAL)|REG_TYPE_MASK(POWER)))) \
+    { \
+        VERBOSEPRINTREG(cpu_id, MSR_UNC_V3_U_PMON_GLOBAL_CTL, LLU_CAST (1ULL<<31), FREEZE_UNCORE); \
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNC_V3_U_PMON_GLOBAL_CTL, (1ULL<<31))); /* same HPMwrite path as HASEP_FREEZE_UNCORE; no read_fd needed in the caller */ \
+        for (int i=0;i < eventSet->numberOfEvents;i++) \
+        { \
+            RegisterIndex index = eventSet->events[i].index; \
+            RegisterType type = counter_map[index].type; \
+            if ((type < UNCORE) || (type == WBOX0FIX)) /* types below UNCORE and WBOX0FIX are not reset here */ \
+            { \
+                continue; \
+            } \
+            PciDeviceIndex dev = counter_map[index].device; \
+            if (pci_checkDevice(dev, cpu_id)) { /* registers are written only when pci_checkDevice() returns non-zero */ \
+                VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].configRegister, 0x0ULL, CLEAR_CTL_MANUAL); \
+                CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, 0x0ULL)); \
+                if ((type >= SBOX0) && (type <= SBOX3)) { /* SBOX config registers get a second identical write -- presumably the write-twice requirement; TODO confirm */ \
+                    CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, 0x0ULL)); \
+                } \
+            } \
+        } \
+    }
+
+
 
-void perfmon_init_haswell(PerfmonThread *thread)
+
+int perfmon_setupCounterThread_haswell( /* Programs the config registers for the events of eventSet on the CPU bound to thread_id; returns 0 at the end. */
+        int thread_id,
+        PerfmonEventSet* eventSet)
 {
-    uint64_t flags = 0x0ULL;
-    int cpu_id = thread->processorId;
-
-    /* Initialize registers */
-    msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, 0x0ULL);
-    msr_write(cpu_id, MSR_PERFEVTSEL0, 0x0ULL);
-    msr_write(cpu_id, MSR_PERFEVTSEL1, 0x0ULL);
-    msr_write(cpu_id, MSR_PERFEVTSEL2, 0x0ULL);
-    msr_write(cpu_id, MSR_PERFEVTSEL3, 0x0ULL);
-    msr_write(cpu_id, MSR_PMC0, 0x0ULL);
-    msr_write(cpu_id, MSR_PMC1, 0x0ULL);
-    msr_write(cpu_id, MSR_PMC2, 0x0ULL);
-    msr_write(cpu_id, MSR_PMC3, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_FIXED_CTR0, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_FIXED_CTR1, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_FIXED_CTR2, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, 0x0ULL);
-    msr_write(cpu_id, MSR_PEBS_ENABLE, 0x0ULL);
+    int haveLock = 0;
+    uint64_t flags;
+    uint64_t fixed_flags = 0x0ULL; /* accumulated from hasep_fixed_setup() and written once after the loop */
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
-    lock_acquire((int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id);
-    if (cpuid_info.model != HASWELL_EX && cpuid_info.supportUncore)
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id) /* this CPU is the one recorded in socket_lock for its node */
+    {
+        haveLock = 1;
+    }
+
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)))
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL, FREEZE_PMC_AND_FIXED)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL)); /* global control is zeroed before any event is programmed */
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, 0xC00000070000000F));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PEBS_ENABLE, 0x0ULL));
+    }
+    HASEP_FREEZE_UNCORE;
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        msr_write(cpu_id, MSR_UNC_CBO_0_PERFEVTSEL0, 0xAA);
-        flags = msr_read(cpu_id, MSR_UNC_CBO_0_PERFEVTSEL0);
-        if (flags != 0xAA)
+        RegisterType type = eventSet->events[i].type;
+        if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type)))) /* skip events whose type bit is not set in regTypeMask */
         {
-            fprintf(stdout, "The current system does not support Uncore MSRs, deactivating Uncore support\n");
-            cpuid_info.supportUncore = 0;
+            continue;
         }
-    }
+        RegisterIndex index = eventSet->events[i].index;
+        PerfmonEvent *event = &(eventSet->events[i].event);
+        uint64_t reg = counter_map[index].configRegister;
+        eventSet->events[i].threadCounter[thread_id].init = TRUE; /* start/stop only process events with init == TRUE */
+        flags = 0x0ULL;
+        switch (type)
+        {
+            case PMC:
+                hasep_pmc_setup(cpu_id, index, event);
+                break;
+
+            case FIXED:
+                fixed_flags |= hasep_fixed_setup(index, event);
+                break;
+
+            case POWER: /* nothing is programmed for POWER events here */
+                break;
+
+            case CBOX0:
+            case CBOX1:
+            case CBOX2:
+            case CBOX3:
+            case CBOX4:
+            case CBOX5:
+            case CBOX6:
+            case CBOX7:
+            case CBOX8:
+            case CBOX9:
+            case CBOX10:
+            case CBOX11:
+            case CBOX12:
+            case CBOX13:
+            case CBOX14:
+            case CBOX15:
+            case CBOX16:
+            case CBOX17:
+                hasep_cbox_setup(cpu_id, index, event);
+                break;
+
+            case UBOX:
+                hasep_ubox_setup(cpu_id, index, event);
+                break;
+            case UBOXFIX:
+                flags = (1ULL<<22)|(1ULL<<20); /* UBOXFIX is configured inline with bits 22 and 20 only */
+                VERBOSEPRINTREG(cpu_id, reg, flags, SETUP_UBOXFIX);
+                CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg, flags));
+                break;
+
+            case SBOX0:
+            case SBOX1:
+            case SBOX2:
+            case SBOX3:
+                hasep_sbox_setup(cpu_id, index, event);
+                break;
+
+            case BBOX0:
+            case BBOX1:
+                hasep_bbox_setup(cpu_id, index, event);
+                break;
+
+            case WBOX:
+                hasep_wbox_setup(cpu_id, index, event);
+                break;
+            case WBOX0FIX: /* nothing is programmed for WBOX0FIX events here */
+                break;
+
+            case MBOX0:
+            case MBOX1:
+            case MBOX2:
+            case MBOX3:
+            case MBOX4:
+            case MBOX5:
+            case MBOX6:
+            case MBOX7:
+                hasep_mbox_setup(cpu_id, index, event);
+                break;
+
+            case PBOX:
+                hasep_pbox_setup(cpu_id, index, event);
+                break;
 
+            case RBOX0:
+            case RBOX1:
+                hasep_rbox_setup(cpu_id, index, event);
+                break;
 
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id) && (cpuid_info.supportUncore))
+            case QBOX0:
+                hasep_qbox_setup(cpu_id, index, event, PCI_QPI_MASK_DEVICE_PORT_0); /* the last argument is the filter device for this port */
+                break;
+            case QBOX1:
+                hasep_qbox_setup(cpu_id, index, event, PCI_QPI_MASK_DEVICE_PORT_1);
+                break;
+
+            case IBOX0:
+            case IBOX1:
+                hasep_ibox_setup(cpu_id, index, event);
+                break;
+
+            default:
+                break;
+        }
+    }
+    if (fixed_flags > 0x0ULL) /* write the fixed-counter control register only if a FIXED event contributed flags */
     {
-        flags = 0x0ULL;
-        flags = (1ULL<<22)|(1ULL<<20);
-        msr_write(cpu_id, MSR_UNC_CBO_0_PERFEVTSEL0, flags);
-        msr_write(cpu_id, MSR_UNC_CBO_0_PERFEVTSEL1, flags);
-        msr_write(cpu_id, MSR_UNC_CBO_1_PERFEVTSEL0, flags);
-        msr_write(cpu_id, MSR_UNC_CBO_1_PERFEVTSEL1, flags);
-        msr_write(cpu_id, MSR_UNC_CBO_2_PERFEVTSEL0, flags);
-        msr_write(cpu_id, MSR_UNC_CBO_2_PERFEVTSEL1, flags);
-        msr_write(cpu_id, MSR_UNC_CBO_3_PERFEVTSEL0, flags);
-        msr_write(cpu_id, MSR_UNC_CBO_3_PERFEVTSEL1, flags);
-
-        msr_write(cpu_id, MSR_UNC_ARB_PERFEVTSEL0, flags);
-        msr_write(cpu_id, MSR_UNC_ARB_PERFEVTSEL1, flags);
-
-        msr_write(cpu_id, MSR_UNC_PERF_FIXED_CTRL, flags);
-
-        msr_write(cpu_id, MSR_UNC_CBO_0_CTR0, 0x0ULL);
-        msr_write(cpu_id, MSR_UNC_CBO_0_CTR1, 0x0ULL);
-        msr_write(cpu_id, MSR_UNC_CBO_1_CTR0, 0x0ULL);
-        msr_write(cpu_id, MSR_UNC_CBO_1_CTR1, 0x0ULL);
-        msr_write(cpu_id, MSR_UNC_CBO_2_CTR0, 0x0ULL);
-        msr_write(cpu_id, MSR_UNC_CBO_2_CTR1, 0x0ULL);
-        msr_write(cpu_id, MSR_UNC_CBO_3_CTR0, 0x0ULL);
-        msr_write(cpu_id, MSR_UNC_CBO_3_CTR1, 0x0ULL);
-
-        msr_write(cpu_id, MSR_UNC_ARB_CTR0, 0x0ULL);
-        msr_write(cpu_id, MSR_UNC_ARB_CTR1, 0x0ULL);
-
-        msr_write(cpu_id, MSR_UNC_PERF_FIXED_CTR, 0x0ULL);
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_FIXED_CTR_CTRL, LLU_CAST fixed_flags, SETUP_FIXED)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_FIXED_CTR_CTRL, fixed_flags));
     }
+    return 0;
 }
 
-#define HAS_SETUP_BOX \
-    if (haveLock) \
-    { \
-        flags = (1ULL<<22)|(1ULL<<20); \
-        flags |= (event->umask<<8) + event->eventId; \
-        if (event->cfgBits != 0) /* set custom cfg and cmask */ \
-        { \
-            flags &= ~(0xFFFFU<<16);  /* clear upper 16bits */ \
-            flags |= ((event->cmask<<8) + event->cfgBits)<<16; \
-        } \
-        msr_write(cpu_id, reg , flags); \
-    }
-
-void perfmon_setupCounterThread_haswell(
-        int thread_id,
-        PerfmonEvent* event,
-        PerfmonCounterIndex index)
+int perfmon_startCountersThread_haswell(int thread_id, PerfmonEventSet* eventSet) /* Zeroes or snapshots the counters of the initialised events for this thread, then enables counting; returns 0 at the end. */
 {
     int haveLock = 0;
     uint64_t flags = 0x0ULL;
-    uint32_t uflags;
-    uint64_t reg = haswell_counter_map[index].configRegister;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
-    uint64_t fixed_flags = msr_read(cpu_id, MSR_PERF_FIXED_CTR_CTRL);
-    uint64_t orig_fixed_flags = fixed_flags;
-    perfmon_threadData[thread_id].counters[index].init = TRUE;
+    uint64_t tmp = 0x0ULL;
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id) /* this CPU is the one recorded in socket_lock for its node */
     {
         haveLock = 1;
     }
 
-    switch (haswell_counter_map[index].type)
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        case PMC:
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
+        {
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type)))) /* skip events whose type bit is not set in regTypeMask */
+            {
+                continue;
+            }
+            tmp = 0x0ULL;
+            RegisterIndex index = eventSet->events[i].index;
+            uint64_t counter1 = counter_map[index].counterRegister;
+            PciDeviceIndex dev = counter_map[index].device;
+            switch (type)
+            {
+                case PMC:
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, counter1, 0x0ULL)); /* zero the counter before it is enabled */
+                    flags |= (1ULL<<(index-cpuid_info.perf_num_fixed_ctr));  /* enable counter */
+                    break;
 
-            flags = (1<<22)|(1<<16);
+                case FIXED:
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, counter1, 0x0ULL));
+                    flags |= (1ULL<<(index+32));  /* enable fixed counter */
+                    break;
 
-            /* Intel with standard 8 bit event mask: [7:0] */
-            flags |= (event->umask<<8) + event->eventId;
+                case POWER:
+                    if (haveLock)
+                    {
+                        tmp = 0x0ULL;
+                        CHECK_POWER_READ_ERROR(power_read(cpu_id, counter1,(uint32_t*)&tmp));
+                        VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST tmp, START_POWER)
+                        eventSet->events[i].threadCounter[thread_id].startData = field64(tmp, 0, box_map[type].regWidth); /* the current reading is stored as startData instead of zeroing the register */
+                    }
+                    break;
+                case WBOX0FIX:
+                    if (haveLock)
+                    {
+                        tmp = 0x0ULL;
+                        CHECK_MSR_READ_ERROR(HPMread(cpu_id, dev, counter1, &tmp));
+                        VERBOSEPRINTPCIREG(cpu_id, dev, counter1, LLU_CAST tmp, START_WBOXFIX);
+                        eventSet->events[i].threadCounter[thread_id].startData = field64(tmp, 0, box_map[type].regWidth); /* same snapshot approach as POWER */
+                    }
+                    break;
 
-            if (event->cfgBits != 0) /* set custom cfg and cmask */
-            {
-                flags &= ~(0xFFFFU<<16);  /* clear upper 16bits */
-                flags |= ((event->cmask<<8) + event->cfgBits)<<16;
+                default:
+                    break;
             }
+        }
+    }
+
+    HASEP_UNFREEZE_UNCORE_AND_RESET_CTR;
+    
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, LLU_CAST (1ULL<<63)|(1ULL<<62)|flags, CLEAR_PMC_AND_FIXED_OVERFLOW)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, (1ULL<<63)|(1ULL<<62)|flags)); /* overflow control gets the enable bits plus bits 63 and 62 */
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST flags, UNFREEZE_PMC_AND_FIXED)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, flags)); /* writes the enable bits collected in the loop above */
+    }
+
+    return 0;
+}
+
+int has_uncore_read(int cpu_id, RegisterIndex index, PerfmonEvent *event, /* Read one uncore counter (counterRegister, plus counterRegister2 if set) into *cur_result. */
+                     uint64_t* cur_result, int* overflows, int flags, /* flags: FREEZE_FLAG_CLEAR_CTR zeroes each register after reading; *overflows is incremented on a confirmed wrap */
+                     int global_offset, int box_offset) /* box_offset: bit tested in the box status register; returns 0 at the end */
+{
+    uint64_t result = 0x0ULL;
+    uint64_t tmp = 0x0ULL;
+    RegisterType type = counter_map[index].type;
+    PciDeviceIndex dev = counter_map[index].device;
+    uint64_t counter1 = counter_map[index].counterRegister;
+    uint64_t counter2 = counter_map[index].counterRegister2;
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id) /* only the CPU recorded in socket_lock reads; others return 0 without touching *cur_result */
+    {
+        return 0;
+    }
 
-            if (perfmon_verbose)
+    CHECK_PCI_READ_ERROR(HPMread(cpu_id, dev, counter1, &result));
+    VERBOSEPRINTPCIREG(cpu_id, dev, counter1, LLU_CAST result, READ_REG_1);
+    if (flags & FREEZE_FLAG_CLEAR_CTR)
+    {
+        VERBOSEPRINTPCIREG(cpu_id, dev, counter1, LLU_CAST 0x0U, CLEAR_PCI_REG_1);
+        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter1, 0x0U));
+    }
+    if (counter2 != 0x0)
+    {
+        result <<= 32; /* the first register supplies the upper 32 bits, the second is added below */
+        CHECK_PCI_READ_ERROR(HPMread(cpu_id, dev, counter2, &tmp));
+        VERBOSEPRINTPCIREG(cpu_id, dev, counter2, LLU_CAST tmp, READ_REG_2);
+        result += tmp;
+        if (flags & FREEZE_FLAG_CLEAR_CTR)
+        {
+            VERBOSEPRINTPCIREG(cpu_id, dev, counter2, LLU_CAST 0x0U, CLEAR_PCI_REG_2);
+            CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter2, 0x0U));
+        }
+    }
+    result = field64(result, 0, box_map[type].regWidth);
+
+    if (result < *cur_result) /* a smaller value than the previous one is treated as a possible overflow */
+    {
+        uint64_t ovf_values = 0x0ULL;
+        int global_offset = box_map[type].ovflOffset; /* NOTE(review): shadows the global_offset parameter, so the caller-supplied value is never used */
+        int test_local = 0;
+        if (global_offset != -1)
+        {
+            CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV,
+                                           MSR_UNC_V3_U_PMON_GLOBAL_STATUS,
+                                           &ovf_values));
+            VERBOSEPRINTREG(cpu_id, MSR_UNC_V3_U_PMON_GLOBAL_STATUS, LLU_CAST ovf_values, READ_GLOBAL_OVFL);
+            if (ovf_values & (1ULL<<global_offset)) /* 64-bit mask: a plain int shift is undefined for offsets of 31 or more */
             {
-                printf("[%d] perfmon_setup_counter PMC: Write Register 0x%llX , Flags: 0x%llX \n",
-                        cpu_id,
-                        LLU_CAST reg,
-                        LLU_CAST flags);
+                VERBOSEPRINTREG(cpu_id, MSR_UNC_V3_U_PMON_GLOBAL_STATUS, LLU_CAST (1ULL<<global_offset), CLEAR_GLOBAL_OVFL);
+                CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV,
+                                                 MSR_UNC_V3_U_PMON_GLOBAL_STATUS,
+                                                 (1ULL<<global_offset)));
+                test_local = 1;
             }
-            msr_write(cpu_id, reg , flags);
-            break;
-
-        case FIXED:
-            fixed_flags |= (0x2 << (index*4));
-            break;
-
-        case POWER:
-            break;
-
-        case CBOX0:
-        case CBOX1:
-        case CBOX2:
-        case CBOX3:
-        case UBOX:
-            if (cpuid_info.supportUncore)
+        }
+        else
+        {
+            test_local = 1; /* no global overflow bit for this box type: go straight to the box status register */
+        }
+
+        if (test_local)
+        {
+            ovf_values = 0x0ULL;
+            CHECK_PCI_READ_ERROR(HPMread(cpu_id, dev,
+                                              box_map[type].statusRegister,
+                                              &ovf_values));
+            VERBOSEPRINTPCIREG(cpu_id, dev, box_map[type].statusRegister, LLU_CAST ovf_values, READ_BOX_OVFL);
+            if (ovf_values & (1ULL<<box_offset)) /* 64-bit mask to match the uint64_t status word */
             {
-                HAS_SETUP_BOX;
+                (*overflows)++;
+                VERBOSEPRINTPCIREG(cpu_id, dev, box_map[type].statusRegister, LLU_CAST (1ULL<<box_offset), RESET_BOX_OVFL);
+                CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev,
+                                                    box_map[type].statusRegister,
+                                                    (1ULL<<box_offset)));
             }
-            break;
+        }
+    }
+    *cur_result = result;
+    return 0;
+}
 
-        default:
-            /* should never be reached */
-            break;
+#define HASEP_CHECK_CORE_OVERFLOW(offset) /* Count an overflow when counter_result is below the stored counterData and bit `offset` is set in MSR_PERF_GLOBAL_STATUS. */ \
+    if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData) \
+    { \
+        uint64_t ovf_values = 0x0ULL; \
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_STATUS, &ovf_values)); \
+        if (ovf_values & (1ULL<<(offset))) /* argument parenthesised: callers pass expressions such as index+32 */ \
+        { \
+            eventSet->events[i].threadCounter[thread_id].overflows++; \
+        } \
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, (1ULL<<(offset)))); /* written whether or not the status bit was set */ \
     }
-    if (fixed_flags != orig_fixed_flags)
-    {
-        msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, fixed_flags);
+
+
+#define HASEP_CHECK_LOCAL_OVERFLOW /* Count an overflow when counter_result is below the stored counterData and the counter's bit is set in its box status register. */ \
+    if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData) \
+    { \
+        uint64_t ovf_values = 0x0ULL; \
+        uint64_t offset = getCounterTypeOffset(eventSet->events[i].index); /* bit position comes from getCounterTypeOffset() */ \
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, box_map[eventSet->events[i].type].statusRegister, &ovf_values)); \
+        if (ovf_values & (1ULL<<offset)) \
+        { \
+            eventSet->events[i].threadCounter[thread_id].overflows++; \
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[eventSet->events[i].type].statusRegister, (1ULL<<offset))); /* the status bit is written back only when it was found set */ \
+        } \
     }
-}
 
-void perfmon_startCountersThread_haswell(int thread_id)
+int perfmon_stopCountersThread_haswell(int thread_id, PerfmonEventSet* eventSet) /* Freezes counting, reads the initialised events of eventSet for this thread into counterData and tracks overflows; returns 0 at the end. */
 {
     int haveLock = 0;
-    uint64_t flags = 0x0ULL;
-    uint32_t uflags = 0x10000UL; /* Clear freeze bit */
-    int cpu_id = perfmon_threadData[thread_id].processorId;
-    int start_uncore = 0;
+    uint64_t counter_result = 0x0ULL;
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id) /* this CPU is the one recorded in socket_lock for its node */
     {
         haveLock = 1;
     }
 
-    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL, FREEZE_PMC_AND_FIXED)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL)); /* global control is zeroed before any counter is read */
+    }
+    
+    HASEP_FREEZE_UNCORE;
+
 
-    for ( int i=0; i<perfmon_numCountersHaswell; i++ )
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        if (perfmon_threadData[thread_id].counters[i].init == TRUE)
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
         {
-            switch (haswell_counter_map[i].type)
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type)))) /* skip events whose type bit is not set in regTypeMask */
+            {
+                continue;
+            }
+            counter_result = 0x0ULL;
+            RegisterIndex index = eventSet->events[i].index;
+            PerfmonEvent *event = &(eventSet->events[i].event);
+            PciDeviceIndex dev = counter_map[index].device;
+            uint64_t counter1 = counter_map[index].counterRegister;
+            uint64_t* current = &(eventSet->events[i].threadCounter[thread_id].counterData);
+            int* overflows = &(eventSet->events[i].threadCounter[thread_id].overflows);
+            switch (type)
             {
                 case PMC:
-                    msr_write(cpu_id, haswell_counter_map[i].counterRegister, 0x0ULL);
-                    flags |= (1<<(i-OFFSET_PMC));  /* enable counter */
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                    HASEP_CHECK_CORE_OVERFLOW(index-cpuid_info.perf_num_fixed_ctr); /* overflow bit position: index minus the number of fixed counters */
+                    VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_PMC)
+                    *current = field64(counter_result, 0, box_map[type].regWidth);
                     break;
 
                 case FIXED:
-                    msr_write(cpu_id, haswell_counter_map[i].counterRegister, 0x0ULL);
-                    flags |= (1ULL<<(i+32));  /* enable fixed counter */
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                    HASEP_CHECK_CORE_OVERFLOW(index+32); /* overflow bit position: index plus 32 */
+                    VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_FIXED)
+                    *current = field64(counter_result, 0, box_map[type].regWidth);
                     break;
 
                 case POWER:
-                    if(haveLock)
+                    if (haveLock && (eventSet->regTypeMask & REG_TYPE_MASK(POWER)))
                     {
-                        perfmon_threadData[thread_id].counters[i].counterData =
-                            power_read(cpu_id, haswell_counter_map[i].counterRegister);
+                        CHECK_POWER_READ_ERROR(power_read(cpu_id, counter1, (uint32_t*)&counter_result));
+                        VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, STOP_POWER)
+                        if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData) /* a smaller reading than the stored one counts as an overflow; no status register is consulted */
+                        {
+                            eventSet->events[i].threadCounter[thread_id].overflows++;
+                        }
+                        *current = field64(counter_result, 0, box_map[type].regWidth);
                     }
                     break;
 
+                case THERMAL:
+                    CHECK_TEMP_READ_ERROR(thermal_read(cpu_id,(uint32_t*)&counter_result));
+                    *current = field64(counter_result, 0, box_map[type].regWidth);
+                    break;
+
                 case CBOX0:
                 case CBOX1:
                 case CBOX2:
                 case CBOX3:
+                case CBOX4:
+                case CBOX5:
+                case CBOX6:
+                case CBOX7:
+                case CBOX8:
+                case CBOX9:
+                case CBOX10:
+                case CBOX11:
+                case CBOX12:
+                case CBOX13:
+                case CBOX14:
+                case CBOX15:
+                case CBOX16:
+                case CBOX17:
+                    has_uncore_read(cpu_id, index, event, current, overflows, /* uncore reads pass FREEZE_FLAG_CLEAR_CTR, so the register is zeroed after reading */
+                                    FREEZE_FLAG_CLEAR_CTR, -1, getCounterTypeOffset(index));
+                    break;
+
                 case UBOX:
-                    start_uncore = 1;
+                    has_uncore_read(cpu_id, index, event, current, overflows,
+                                    FREEZE_FLAG_CLEAR_CTR, 1, getCounterTypeOffset(index));
+                    break;
+                case UBOXFIX:
+                    has_uncore_read(cpu_id, index, event, current, overflows,
+                                    FREEZE_FLAG_CLEAR_CTR, 0, getCounterTypeOffset(index));
+                    break;
+
+                case SBOX0:
+                case SBOX1:
+                case SBOX2:
+                case SBOX3:
+                    has_uncore_read(cpu_id, index, event, current, overflows,
+                                    FREEZE_FLAG_CLEAR_CTR, -1, getCounterTypeOffset(index));
+                    break;
+
+                case WBOX:
+                    has_uncore_read(cpu_id, index, event, current, overflows,
+                                    FREEZE_FLAG_CLEAR_CTR, 2, getCounterTypeOffset(index));
+                    break;
+                case WBOX0FIX:
+                    if (haveLock)
+                    {
+                        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                        if (counter_result < *current) /* a smaller reading than the stored one counts as an overflow */
+                        {
+                            (*overflows)++;
+                        }
+                        *current = counter_result;
+                    }
+                    break;
+
+                case BBOX0:
+                    has_uncore_read(cpu_id, index, event, current, overflows,
+                                    FREEZE_FLAG_CLEAR_CTR, 21, getCounterTypeOffset(index));
+                    break;
+                case BBOX1:
+                    has_uncore_read(cpu_id, index, event, current, overflows,
+                                    FREEZE_FLAG_CLEAR_CTR, 22, getCounterTypeOffset(index));
+                    break;
+
+                case MBOX0:
+                case MBOX1:
+                case MBOX2:
+                case MBOX3:
+                    has_uncore_read(cpu_id, index, event, current, overflows,
+                                    FREEZE_FLAG_CLEAR_CTR, 23, getCounterTypeOffset(index)+1); /* MBOX box offset is shifted by one */
+                    break;
+
+                case MBOX4:
+                case MBOX5:
+                case MBOX6:
+                case MBOX7:
+                    has_uncore_read(cpu_id, index, event, current, overflows,
+                                    FREEZE_FLAG_CLEAR_CTR, 24, getCounterTypeOffset(index)+1);
+                    break;
+
+                case PBOX:
+                    has_uncore_read(cpu_id, index, event, current, overflows,
+                                    FREEZE_FLAG_CLEAR_CTR, 29, getCounterTypeOffset(index));
+                    break;
+
+                case IBOX0:
+                    has_uncore_read(cpu_id, index, event, current, overflows,
+                                    FREEZE_FLAG_CLEAR_CTR, 34, getCounterTypeOffset(index));
+                    break;
+                case IBOX1:
+                    has_uncore_read(cpu_id, index, event, current, overflows,
+                                    FREEZE_FLAG_CLEAR_CTR, 34, getCounterTypeOffset(index)+2); /* IBOX1 uses the same global offset as IBOX0 with the box offset shifted by two */
+                    break;
+
+                case RBOX0:
+                    has_uncore_read(cpu_id, index, event, current, overflows,
+                                    FREEZE_FLAG_CLEAR_CTR, 27, getCounterTypeOffset(index));
+                    break;
+                case RBOX1:
+                    has_uncore_read(cpu_id, index, event, current, overflows,
+                                    FREEZE_FLAG_CLEAR_CTR, 28, getCounterTypeOffset(index));
+                    break;
+
+                case QBOX0:
+                    has_uncore_read(cpu_id, index, event, current, overflows,
+                                    FREEZE_FLAG_CLEAR_CTR, 25, getCounterTypeOffset(index));
+                    break;
+                case QBOX1:
+                    has_uncore_read(cpu_id, index, event, current, overflows,
+                                    FREEZE_FLAG_CLEAR_CTR, 26, getCounterTypeOffset(index));
+                    break;
+
+                case QBOX0FIX:
+                case QBOX1FIX:
+                    if (eventSet->events[i].event.eventId == 0x00)
+                    {
+                        HPMread(cpu_id, dev, counter1, &counter_result);
+                        switch(extractBitField(counter_result, 3, 0)) /* the extracted field selects a fixed constant; unlisted codes yield 0 */
+                        {
+                            case 0x2:
+                                counter_result = 5.6E9;
+                                break;
+                            case 0x3:
+                                counter_result = 6.4E9;
+                                break;
+                            case 0x4:
+                                counter_result = 7.2E9;
+                                break;
+                            case 0x5:
+                                counter_result = 8.0E9;
+                                break;
+                            case 0x6:
+                                counter_result = 8.8E9;
+                                break;
+                            case 0x7:
+                                counter_result = 9.6E9;
+                                break;
+                            default:
+                                counter_result = 0;
+                                break;
+                        }
+                        
+                    }
+                    else if ((eventSet->events[i].event.eventId == 0x01) ||
+                             (eventSet->events[i].event.eventId == 0x02))
+                    {
+                        HPMread(cpu_id, dev, counter1, &counter_result);
+                        counter_result = field64(counter_result, 0, box_map[type].regWidth);
+                    }
+                    eventSet->events[i].threadCounter[thread_id].counterData = counter_result; /* for other eventIds counter_result is still the 0 assigned at the top of the iteration */
                     break;
 
                 default:
-                    /* should never be reached */
                     break;
             }
         }
+        eventSet->events[i].threadCounter[thread_id].init = FALSE; /* not reached when the loop continues early above */
     }
 
-    if (haveLock && start_uncore && cpuid_info.supportUncore)
-    {
-        msr_write(cpu_id, MSR_UNC_PERF_GLOBAL_CTRL, (1ULL<<29));
-    }
 
-    if (perfmon_verbose)
-    {
-        printf("perfmon_start_counters: Write Register 0x%X , \
-                Flags: 0x%llX \n",MSR_PERF_GLOBAL_CTRL, LLU_CAST flags);
-        printf("perfmon_start_counters: Write Register 0x%X , \
-                Flags: 0x%llX \n",MSR_UNCORE_PERF_GLOBAL_CTRL, LLU_CAST uflags);
-    }
-    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, flags);
-    msr_write(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, 0x30000000FULL);
+    return 0;
 }
 
-void perfmon_stopCountersThread_haswell(int thread_id)
+
+int perfmon_readCountersThread_haswell(int thread_id, PerfmonEventSet* eventSet)
 {
-    uint64_t flags;
-    uint64_t tmp;
-    uint32_t uflags = 0x10100UL; /* Set freeze bit */
-    uint64_t counter_result = 0x0ULL;
+    uint64_t flags = 0x0ULL;
     int haveLock = 0;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
+    uint64_t counter_result = 0x0ULL;
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
     {
         haveLock = 1;
     }
 
-    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
-    if (haveLock && cpuid_info.supportUncore)
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)))
     {
-        msr_write(cpu_id, MSR_UNC_PERF_GLOBAL_CTRL, 0x0ULL);
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, &flags));
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST flags, SAFE_PMC_FLAGS)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL, RESET_PMC_FLAGS)
     }
+    HASEP_FREEZE_UNCORE;
 
-    for ( int i=0; i < perfmon_numCountersHaswell; i++ ) 
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        if (perfmon_threadData[thread_id].counters[i].init == TRUE) 
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
         {
-            switch (haswell_counter_map[i].type)
+            counter_result= 0x0ULL;
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+            {
+                continue;
+            }
+            RegisterIndex index = eventSet->events[i].index;
+            PerfmonEvent *event = &(eventSet->events[i].event);
+            PciDeviceIndex dev = counter_map[index].device;
+            uint64_t counter1 = counter_map[index].counterRegister;
+            uint64_t* current = &(eventSet->events[i].threadCounter[thread_id].counterData);
+            int* overflows = &(eventSet->events[i].threadCounter[thread_id].overflows);
+            switch (type)
             {
                 case PMC:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                    HASEP_CHECK_CORE_OVERFLOW(index-cpuid_info.perf_num_fixed_ctr);
+                    VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_PMC)
+                    eventSet->events[i].threadCounter[thread_id].counterData = field64(counter_result, 0, box_map[type].regWidth);
+                    break;
 
                 case FIXED:
-                    perfmon_threadData[thread_id].counters[i].counterData =
-                        msr_read(cpu_id, haswell_counter_map[i].counterRegister);
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                    HASEP_CHECK_CORE_OVERFLOW(index+32);
+                    VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_FIXED)
+                    eventSet->events[i].threadCounter[thread_id].counterData = field64(counter_result, 0, box_map[type].regWidth);
                     break;
 
                 case POWER:
-                    if(haveLock)
+                    if (haveLock && (eventSet->regTypeMask & REG_TYPE_MASK(POWER)))
                     {
-                        perfmon_threadData[thread_id].counters[i].counterData =
-                            power_info.energyUnit *
-                            ( power_read(cpu_id, haswell_counter_map[i].counterRegister) -
-                              perfmon_threadData[thread_id].counters[i].counterData);
+                        CHECK_POWER_READ_ERROR(power_read(cpu_id, counter1, (uint32_t*)&counter_result));
+                        VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, STOP_POWER)
+                        if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData)
+                        {
+                            eventSet->events[i].threadCounter[thread_id].overflows++;
+                        }
+                        eventSet->events[i].threadCounter[thread_id].counterData = field64(counter_result, 0, box_map[type].regWidth);
                     }
                     break;
 
                 case THERMAL:
-                        perfmon_threadData[thread_id].counters[i].counterData =
-                             thermal_read(cpu_id);
+                    CHECK_TEMP_READ_ERROR(thermal_read(cpu_id,(uint32_t*)&counter_result));
+                    eventSet->events[i].threadCounter[thread_id].counterData = field64(counter_result, 0, box_map[type].regWidth);
                     break;
 
                 case CBOX0:
                 case CBOX1:
                 case CBOX2:
                 case CBOX3:
+                case CBOX4:
+                case CBOX5:
+                case CBOX6:
+                case CBOX7:
+                case CBOX8:
+                case CBOX9:
+                case CBOX10:
+                case CBOX11:
+                case CBOX12:
+                case CBOX13:
+                case CBOX14:
+                case CBOX15:
+                case CBOX16:
+                case CBOX17:
+                    has_uncore_read(cpu_id, index, event, current, overflows,
+                                    0, -1, getCounterTypeOffset(index));
+                    break;
+
                 case UBOX:
-                    if(haveLock && cpuid_info.supportUncore)
+                    has_uncore_read(cpu_id, index, event, current, overflows,
+                                    0, 1, getCounterTypeOffset(index));
+                    break;
+                case UBOXFIX:
+                    has_uncore_read(cpu_id, index, event, current, overflows,
+                                    0, 0, getCounterTypeOffset(index));
+                    break;
+
+                case SBOX0:
+                case SBOX1:
+                case SBOX2:
+                case SBOX3:
+                    has_uncore_read(cpu_id, index, event, current, overflows,
+                                    0, -1, getCounterTypeOffset(index));
+                    break;
+
+                case WBOX:
+                    has_uncore_read(cpu_id, index, event, current, overflows,
+                                    0, 2, getCounterTypeOffset(index));
+                    break;
+                case WBOX0FIX:
+                    if (haveLock)
+                    {
+                        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                        if (counter_result < *current)
+                        {
+                            (*overflows)++;
+                        }
+                        *current = counter_result;
+                    }
+                    break;
+
+                case BBOX0:
+                    has_uncore_read(cpu_id, index, event, current, overflows,
+                                    0, 21, getCounterTypeOffset(index));
+                    break;
+                case BBOX1:
+                    has_uncore_read(cpu_id, index, event, current, overflows,
+                                    0, 22, getCounterTypeOffset(index));
+                    break;
+
+                case MBOX0:
+                case MBOX1:
+                case MBOX2:
+                case MBOX3:
+                    has_uncore_read(cpu_id, index, event, current, overflows,
+                                    0, 23, getCounterTypeOffset(index)+1);
+                    break;
+
+                case MBOX4:
+                case MBOX5:
+                case MBOX6:
+                case MBOX7:
+                    has_uncore_read(cpu_id, index, event, current, overflows,
+                                    0, 24, getCounterTypeOffset(index)+1);
+                    break;
+
+                case PBOX:
+                    has_uncore_read(cpu_id, index, event, current, overflows,
+                                    0, 29, getCounterTypeOffset(index));
+                    break;
+
+                case IBOX0:
+                    has_uncore_read(cpu_id, index, event, current, overflows,
+                                    0, 34, getCounterTypeOffset(index));
+                    break;
+                case IBOX1:
+                    has_uncore_read(cpu_id, index, event, current, overflows,
+                                    0, 34, getCounterTypeOffset(index)+2);
+                    break;
+
+                case RBOX0:
+                    has_uncore_read(cpu_id, index, event, current, overflows,
+                                    0, 27, getCounterTypeOffset(index));
+                    break;
+                case RBOX1:
+                    has_uncore_read(cpu_id, index, event, current, overflows,
+                                    0, 28, getCounterTypeOffset(index));
+                    break;
+
+                case QBOX0:
+                    has_uncore_read(cpu_id, index, event, current, overflows,
+                                    0, 25, getCounterTypeOffset(index));
+                    break;
+                case QBOX1:
+                    has_uncore_read(cpu_id, index, event, current, overflows,
+                                    0, 26, getCounterTypeOffset(index));
+                    break;
+
+                case QBOX0FIX:
+                case QBOX1FIX:
+                    if (eventSet->events[i].event.eventId == 0x00)
+                    {
+                        HPMread(cpu_id, dev, counter1, &counter_result);
+                        switch(extractBitField(counter_result, 3, 0))
+                        {
+                            case 0x2:
+                                counter_result = 5.6E9;
+                                break;
+                            case 0x3:
+                                counter_result = 6.4E9;
+                                break;
+                            case 0x4:
+                                counter_result = 7.2E9;
+                                break;
+                            case 0x5:
+                                counter_result = 8.0E9;
+                                break;
+                            case 0x6:
+                                counter_result = 8.8E9;
+                                break;
+                            case 0x7:
+                                counter_result = 9.6E9;
+                                break;
+                            default:
+                                counter_result = 0;
+                                break;
+                        }
+                        
+                    }
+                    else if ((eventSet->events[i].event.eventId == 0x01) ||
+                             (eventSet->events[i].event.eventId == 0x02))
                     {
-                        perfmon_threadData[thread_id].counters[i].counterData =
-                            msr_read(cpu_id, haswell_counter_map[i].counterRegister);
+                        HPMread(cpu_id, dev, counter1, &counter_result);
+                        counter_result = field64(counter_result, 0, box_map[type].regWidth);
                     }
+                    eventSet->events[i].threadCounter[thread_id].counterData = counter_result;
                     break;
 
                 default:
-                    /* should never be reached */
                     break;
             }
         }
     }
 
-    flags = msr_read(cpu_id,MSR_PERF_GLOBAL_STATUS);
-    //    printf ("Status: 0x%llX \n", LLU_CAST flags);
-    if ( (flags & 0x3) || (flags & (0x3ULL<<32)) ) 
+    HASEP_UNFREEZE_UNCORE;
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)))
     {
-        printf ("Overflow occured \n");
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST flags, RESTORE_PMC_FLAGS)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, flags));
     }
+
+    return 0;
 }
 
-void perfmon_readCountersThread_haswell(int thread_id)
+int perfmon_finalizeCountersThread_haswell(int thread_id, PerfmonEventSet* eventSet)
 {
-    uint64_t counter_result = 0x0ULL;
     int haveLock = 0;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
-    uint64_t core_flags = 0x0ULL;
-    uint64_t uncore_flags = 0x0ULL;
+    int haveTileLock = 0;
+    int clearPBS = 0;
+    uint64_t ovf_values_core = (1ULL<<63)|(1ULL<<62);
+    uint64_t ovf_values_uncore = 0x0ULL;
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
     {
         haveLock = 1;
     }
-
-    core_flags = msr_read(cpu_id, MSR_PERF_GLOBAL_CTRL);
-    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
-    if (cpuid_info.supportUncore)
+    if (tile_lock[affinity_thread2tile_lookup[cpu_id]] == cpu_id)
     {
-        uncore_flags = msr_read(cpu_id, MSR_UNC_PERF_GLOBAL_CTRL);
-        msr_write(cpu_id, MSR_UNC_PERF_GLOBAL_CTRL, 0x0ULL);
+        haveTileLock = 1;
     }
-
-    for ( int i=0; i<perfmon_numCountersHaswell; i++ )
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        if (perfmon_threadData[thread_id].counters[i].init == TRUE)
+        RegisterIndex index = eventSet->events[i].index;
+        PciDeviceIndex dev = counter_map[index].device;
+        uint64_t reg = counter_map[index].configRegister;
+        RegisterType type = eventSet->events[i].type;
+        if (type == NOTYPE)
         {
-            if ((haswell_counter_map[i].type == PMC) ||
-                    (haswell_counter_map[i].type == FIXED))
-            {
-                perfmon_threadData[thread_id].counters[i].counterData =
-                    msr_read(cpu_id, haswell_counter_map[i].counterRegister);
-            }
-            else
-            {
-                if(haveLock)
+            continue;
+        }
+        switch (type)
+        {
+            case PMC:
+                ovf_values_core |= (1ULL<<(index-cpuid_info.perf_num_fixed_ctr));
+                if ((haveTileLock) && (eventSet->events[i].event.eventId == 0xB7))
+                {
+                    VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP0, 0x0ULL, CLEAR_OFFCORE_RESP0);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP0, 0x0ULL));
+                }
+                else if ((haveTileLock) && (eventSet->events[i].event.eventId == 0xBB))
+                {
+                    VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP1, 0x0ULL, CLEAR_OFFCORE_RESP1);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP1, 0x0ULL));
+                }
+                break;
+            case FIXED:
+                ovf_values_core |= (1ULL<<(index+32));
+                break;
+            default:
+                /*if (counter_map[index].type > UNCORE)
                 {
-                    switch (haswell_counter_map[i].type)
+                    if (box_map[counter_map[index].type].ovflOffset >= 0)
                     {
-                        case POWER:
-                            perfmon_threadData[thread_id].counters[i].counterData =
-                                power_info.energyUnit *
-                                power_read(cpu_id, haswell_counter_map[i].counterRegister);
-                            break;
-
-                        case CBOX0:
-                        case CBOX1:
-                        case CBOX2:
-                        case CBOX3:
-                        case UBOX:
-                            if(haveLock)
-                            {
-                                perfmon_threadData[thread_id].counters[i].counterData =
-                                    msr_read(cpu_id, haswell_counter_map[i].counterRegister);
-                            }
-                            break;
-                        default:
-                            /* should never be reached */
-                            break;
+                        ovf_values_uncore |= (1ULL<<box_map[counter_map[index].type].ovflOffset);
                     }
-                }
+                }*/
+                break;
+        }
+        if ((reg) && (((type == PMC)||(type == FIXED))||((type >= UNCORE) && (haveLock))))
+        {
+            CHECK_MSR_READ_ERROR(HPMread(cpu_id, dev, reg, &ovf_values_uncore));
+            VERBOSEPRINTPCIREG(cpu_id, dev, reg, ovf_values_uncore, SHOW_CTL);
+            ovf_values_uncore = 0x0ULL;
+            VERBOSEPRINTPCIREG(cpu_id, dev, reg, 0x0ULL, CLEAR_CTL);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, reg, 0x0ULL));
+            if ((type >= SBOX0) && (type <= SBOX3))
+            {
+                CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, reg, 0x0ULL));
             }
         }
+        eventSet->events[i].threadCounter[thread_id].init = FALSE;
     }
-    if (cpuid_info.supportUncore && uncore_flags > 0x0ULL)
+    if (haveLock && eventSet->regTypeMask & ~(0xFULL))
     {
-        msr_write(cpu_id, MSR_UNC_PERF_GLOBAL_CTRL, uncore_flags);
+        VERBOSEPRINTREG(cpu_id, MSR_UNC_V3_U_PMON_GLOBAL_STATUS, LLU_CAST ovf_values_uncore, CLEAR_UNCORE_OVF)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNC_V3_U_PMON_GLOBAL_STATUS, ovf_values_uncore));
+        VERBOSEPRINTREG(cpu_id, MSR_UNC_V3_U_PMON_GLOBAL_CTL, LLU_CAST 0x0ULL, CLEAR_UNCORE_CTRL)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNC_V3_U_PMON_GLOBAL_CTL, 0x0ULL));
     }
-    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, core_flags);
-}
 
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)))
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, LLU_CAST ovf_values_core, CLEAR_GLOBAL_OVF)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, ovf_values_core));
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST 0x0ULL, CLEAR_GLOBAL_CTRL)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+    }
+    return 0;
+}
diff --git a/src/includes/perfmon_haswellEP_counters.h b/src/includes/perfmon_haswellEP_counters.h
new file mode 100644
index 0000000..1fd312c
--- /dev/null
+++ b/src/includes/perfmon_haswellEP_counters.h
@@ -0,0 +1,316 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  perfmon_haswellEP_counters.h
+ *
+ *      Description:  Counter Header File of perfmon module for Intel Haswell EP/EN/EX.
+ *
+ *      Version:   4.0
+ *      Released:  16.6.2015
+ *
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#define NUM_COUNTERS_HASWELL_EP 179 /* number of rows in haswellEP_counter_map (indices PMC0..PMC178) */
+#define NUM_COUNTERS_CORE_HASWELL_EP 8 /* presumably the leading per-core rows (FIXC0-2, PMC0-3, TMP0) - confirm against users */
+#define NUM_COUNTERS_UNCORE_HASWELL_EP 111 /* NOTE(review): 111 equals the number of rows before the first PCI-based counter - confirm intended meaning */
+/* Event options accepted per counter type; each list is parenthesized so it expands as one operand next to any operator. */
+#define HAS_EP_VALID_OPTIONS_FIXED (EVENT_OPTION_ANYTHREAD_MASK|EVENT_OPTION_COUNT_KERNEL_MASK)
+#define HAS_EP_VALID_OPTIONS_PMC (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_INVERT_MASK| \
+            EVENT_OPTION_ANYTHREAD_MASK|EVENT_OPTION_IN_TRANS_MASK|EVENT_OPTION_THRESHOLD_MASK)
+#define HAS_EP_VALID_OPTIONS_CBOX (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_TID_MASK)
+#define HAS_EP_VALID_OPTIONS_UBOX (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK)
+#define HAS_EP_VALID_OPTIONS_SBOX (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_TID_MASK)
+#define HAS_EP_VALID_OPTIONS_BBOX (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK)
+#define HAS_EP_VALID_OPTIONS_WBOX (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK|\
+            EVENT_OPTION_OCCUPANCY_MASK|EVENT_OPTION_OCCUPANCY_FILTER_MASK|EVENT_OPTION_OCCUPANCY_EDGE_MASK|\
+            EVENT_OPTION_OCCUPANCY_INVERT_MASK)
+#define HAS_EP_VALID_OPTIONS_MBOX (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK)
+#define HAS_EP_VALID_OPTIONS_IBOX (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK)
+#define HAS_EP_VALID_OPTIONS_PBOX (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK)
+#define HAS_EP_VALID_OPTIONS_RBOX (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK)
+#define HAS_EP_VALID_OPTIONS_QBOX (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK)
+
+static RegisterMap haswellEP_counter_map[NUM_COUNTERS_HASWELL_EP] = { /* one row per counter, in index order; columns look like: name, index, unit type, control reg, counter reg, second counter reg, PCI device, allowed options - confirm against RegisterMap */
+    /* Fixed counters: instructions retired (FIXC0), core cycles unhalted (FIXC1), reference cycles unhalted (FIXC2) */
+    {"FIXC0", PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0, HAS_EP_VALID_OPTIONS_FIXED},
+    {"FIXC1", PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0, HAS_EP_VALID_OPTIONS_FIXED},
+    {"FIXC2", PMC2, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR2, 0, 0, HAS_EP_VALID_OPTIONS_FIXED},
+    /* General-purpose PMC counters: four, 48 bit wide; only PMC2 additionally accepts the IN_TRANS_ABORT option */
+    {"PMC0", PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0, HAS_EP_VALID_OPTIONS_PMC},
+    {"PMC1", PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0, HAS_EP_VALID_OPTIONS_PMC},
+    {"PMC2", PMC5, PMC, MSR_PERFEVTSEL2, MSR_PMC2, 0, 0, HAS_EP_VALID_OPTIONS_PMC|EVENT_OPTION_IN_TRANS_ABORT_MASK},
+    {"PMC3", PMC6, PMC, MSR_PERFEVTSEL3, MSR_PMC3, 0, 0, HAS_EP_VALID_OPTIONS_PMC},
+    /* Temperature sensor: read from IA32_THERM_STATUS, no control register, no options */
+    {"TMP0", PMC7, THERMAL, 0, IA32_THERM_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+    /* RAPL counters: package, PP0, PP1 and DRAM energy status registers; no control register, no options */
+    {"PWR0", PMC8, POWER, 0, MSR_PKG_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"PWR1", PMC9, POWER, 0, MSR_PP0_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"PWR2", PMC10, POWER, 0, MSR_PP1_ENERGY_STATUS,  0, 0, EVENT_OPTION_NONE_MASK},
+    {"PWR3", PMC11, POWER, 0, MSR_DRAM_ENERGY_STATUS,  0, 0, EVENT_OPTION_NONE_MASK},
+    {"CBOX0C0", PMC12, CBOX0, MSR_UNC_V3_C0_PMON_CTL0, MSR_UNC_V3_C0_PMON_CTR0, 0, 0, HAS_EP_VALID_OPTIONS_CBOX}, /* CBOX0..CBOX17: four MSR counters per box */
+    {"CBOX0C1", PMC13, CBOX0, MSR_UNC_V3_C0_PMON_CTL1, MSR_UNC_V3_C0_PMON_CTR1, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX0C2", PMC14, CBOX0, MSR_UNC_V3_C0_PMON_CTL2, MSR_UNC_V3_C0_PMON_CTR2, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX0C3", PMC15, CBOX0, MSR_UNC_V3_C0_PMON_CTL3, MSR_UNC_V3_C0_PMON_CTR3, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX1C0", PMC16, CBOX1, MSR_UNC_V3_C1_PMON_CTL0, MSR_UNC_V3_C1_PMON_CTR0, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX1C1", PMC17, CBOX1, MSR_UNC_V3_C1_PMON_CTL1, MSR_UNC_V3_C1_PMON_CTR1, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX1C2", PMC18, CBOX1, MSR_UNC_V3_C1_PMON_CTL2, MSR_UNC_V3_C1_PMON_CTR2, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX1C3", PMC19, CBOX1, MSR_UNC_V3_C1_PMON_CTL3, MSR_UNC_V3_C1_PMON_CTR3, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX2C0", PMC20, CBOX2, MSR_UNC_V3_C2_PMON_CTL0, MSR_UNC_V3_C2_PMON_CTR0, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX2C1", PMC21, CBOX2, MSR_UNC_V3_C2_PMON_CTL1, MSR_UNC_V3_C2_PMON_CTR1, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX2C2", PMC22, CBOX2, MSR_UNC_V3_C2_PMON_CTL2, MSR_UNC_V3_C2_PMON_CTR2, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX2C3", PMC23, CBOX2, MSR_UNC_V3_C2_PMON_CTL3, MSR_UNC_V3_C2_PMON_CTR3, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX3C0", PMC24, CBOX3, MSR_UNC_V3_C3_PMON_CTL0, MSR_UNC_V3_C3_PMON_CTR0, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX3C1", PMC25, CBOX3, MSR_UNC_V3_C3_PMON_CTL1, MSR_UNC_V3_C3_PMON_CTR1, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX3C2", PMC26, CBOX3, MSR_UNC_V3_C3_PMON_CTL2, MSR_UNC_V3_C3_PMON_CTR2, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX3C3", PMC27, CBOX3, MSR_UNC_V3_C3_PMON_CTL3, MSR_UNC_V3_C3_PMON_CTR3, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX4C0", PMC28, CBOX4, MSR_UNC_V3_C4_PMON_CTL0, MSR_UNC_V3_C4_PMON_CTR0, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX4C1", PMC29, CBOX4, MSR_UNC_V3_C4_PMON_CTL1, MSR_UNC_V3_C4_PMON_CTR1, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX4C2", PMC30, CBOX4, MSR_UNC_V3_C4_PMON_CTL2, MSR_UNC_V3_C4_PMON_CTR2, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX4C3", PMC31, CBOX4, MSR_UNC_V3_C4_PMON_CTL3, MSR_UNC_V3_C4_PMON_CTR3, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX5C0", PMC32, CBOX5, MSR_UNC_V3_C5_PMON_CTL0, MSR_UNC_V3_C5_PMON_CTR0, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX5C1", PMC33, CBOX5, MSR_UNC_V3_C5_PMON_CTL1, MSR_UNC_V3_C5_PMON_CTR1, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX5C2", PMC34, CBOX5, MSR_UNC_V3_C5_PMON_CTL2, MSR_UNC_V3_C5_PMON_CTR2, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX5C3", PMC35, CBOX5, MSR_UNC_V3_C5_PMON_CTL3, MSR_UNC_V3_C5_PMON_CTR3, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX6C0", PMC36, CBOX6, MSR_UNC_V3_C6_PMON_CTL0, MSR_UNC_V3_C6_PMON_CTR0, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX6C1", PMC37, CBOX6, MSR_UNC_V3_C6_PMON_CTL1, MSR_UNC_V3_C6_PMON_CTR1, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX6C2", PMC38, CBOX6, MSR_UNC_V3_C6_PMON_CTL2, MSR_UNC_V3_C6_PMON_CTR2, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX6C3", PMC39, CBOX6, MSR_UNC_V3_C6_PMON_CTL3, MSR_UNC_V3_C6_PMON_CTR3, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX7C0", PMC40, CBOX7, MSR_UNC_V3_C7_PMON_CTL0, MSR_UNC_V3_C7_PMON_CTR0, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX7C1", PMC41, CBOX7, MSR_UNC_V3_C7_PMON_CTL1, MSR_UNC_V3_C7_PMON_CTR1, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX7C2", PMC42, CBOX7, MSR_UNC_V3_C7_PMON_CTL2, MSR_UNC_V3_C7_PMON_CTR2, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX7C3", PMC43, CBOX7, MSR_UNC_V3_C7_PMON_CTL3, MSR_UNC_V3_C7_PMON_CTR3, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX8C0", PMC44, CBOX8, MSR_UNC_V3_C8_PMON_CTL0, MSR_UNC_V3_C8_PMON_CTR0, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX8C1", PMC45, CBOX8, MSR_UNC_V3_C8_PMON_CTL1, MSR_UNC_V3_C8_PMON_CTR1, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX8C2", PMC46, CBOX8, MSR_UNC_V3_C8_PMON_CTL2, MSR_UNC_V3_C8_PMON_CTR2, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX8C3", PMC47, CBOX8, MSR_UNC_V3_C8_PMON_CTL3, MSR_UNC_V3_C8_PMON_CTR3, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX9C0", PMC48, CBOX9, MSR_UNC_V3_C9_PMON_CTL0, MSR_UNC_V3_C9_PMON_CTR0, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX9C1", PMC49, CBOX9, MSR_UNC_V3_C9_PMON_CTL1, MSR_UNC_V3_C9_PMON_CTR1, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX9C2", PMC50, CBOX9, MSR_UNC_V3_C9_PMON_CTL2, MSR_UNC_V3_C9_PMON_CTR2, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX9C3", PMC51, CBOX9, MSR_UNC_V3_C9_PMON_CTL3, MSR_UNC_V3_C9_PMON_CTR3, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX10C0", PMC52, CBOX10, MSR_UNC_V3_C10_PMON_CTL0, MSR_UNC_V3_C10_PMON_CTR0, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX10C1", PMC53, CBOX10, MSR_UNC_V3_C10_PMON_CTL1, MSR_UNC_V3_C10_PMON_CTR1, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX10C2", PMC54, CBOX10, MSR_UNC_V3_C10_PMON_CTL2, MSR_UNC_V3_C10_PMON_CTR2, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX10C3", PMC55, CBOX10, MSR_UNC_V3_C10_PMON_CTL3, MSR_UNC_V3_C10_PMON_CTR3, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX11C0", PMC56, CBOX11, MSR_UNC_V3_C11_PMON_CTL0, MSR_UNC_V3_C11_PMON_CTR0, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX11C1", PMC57, CBOX11, MSR_UNC_V3_C11_PMON_CTL1, MSR_UNC_V3_C11_PMON_CTR1, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX11C2", PMC58, CBOX11, MSR_UNC_V3_C11_PMON_CTL2, MSR_UNC_V3_C11_PMON_CTR2, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX11C3", PMC59, CBOX11, MSR_UNC_V3_C11_PMON_CTL3, MSR_UNC_V3_C11_PMON_CTR3, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX12C0", PMC60, CBOX12, MSR_UNC_V3_C12_PMON_CTL0, MSR_UNC_V3_C12_PMON_CTR0, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX12C1", PMC61, CBOX12, MSR_UNC_V3_C12_PMON_CTL1, MSR_UNC_V3_C12_PMON_CTR1, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX12C2", PMC62, CBOX12, MSR_UNC_V3_C12_PMON_CTL2, MSR_UNC_V3_C12_PMON_CTR2, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX12C3", PMC63, CBOX12, MSR_UNC_V3_C12_PMON_CTL3, MSR_UNC_V3_C12_PMON_CTR3, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX13C0", PMC64, CBOX13, MSR_UNC_V3_C13_PMON_CTL0, MSR_UNC_V3_C13_PMON_CTR0, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX13C1", PMC65, CBOX13, MSR_UNC_V3_C13_PMON_CTL1, MSR_UNC_V3_C13_PMON_CTR1, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX13C2", PMC66, CBOX13, MSR_UNC_V3_C13_PMON_CTL2, MSR_UNC_V3_C13_PMON_CTR2, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX13C3", PMC67, CBOX13, MSR_UNC_V3_C13_PMON_CTL3, MSR_UNC_V3_C13_PMON_CTR3, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX14C0", PMC68, CBOX14, MSR_UNC_V3_C14_PMON_CTL0, MSR_UNC_V3_C14_PMON_CTR0, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX14C1", PMC69, CBOX14, MSR_UNC_V3_C14_PMON_CTL1, MSR_UNC_V3_C14_PMON_CTR1, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX14C2", PMC70, CBOX14, MSR_UNC_V3_C14_PMON_CTL2, MSR_UNC_V3_C14_PMON_CTR2, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX14C3", PMC71, CBOX14, MSR_UNC_V3_C14_PMON_CTL3, MSR_UNC_V3_C14_PMON_CTR3, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX15C0", PMC72, CBOX15, MSR_UNC_V3_C15_PMON_CTL0, MSR_UNC_V3_C15_PMON_CTR0, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX15C1", PMC73, CBOX15, MSR_UNC_V3_C15_PMON_CTL1, MSR_UNC_V3_C15_PMON_CTR1, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX15C2", PMC74, CBOX15, MSR_UNC_V3_C15_PMON_CTL2, MSR_UNC_V3_C15_PMON_CTR2, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX15C3", PMC75, CBOX15, MSR_UNC_V3_C15_PMON_CTL3, MSR_UNC_V3_C15_PMON_CTR3, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX16C0", PMC76, CBOX16, MSR_UNC_V3_C16_PMON_CTL0, MSR_UNC_V3_C16_PMON_CTR0, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX16C1", PMC77, CBOX16, MSR_UNC_V3_C16_PMON_CTL1, MSR_UNC_V3_C16_PMON_CTR1, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX16C2", PMC78, CBOX16, MSR_UNC_V3_C16_PMON_CTL2, MSR_UNC_V3_C16_PMON_CTR2, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX16C3", PMC79, CBOX16, MSR_UNC_V3_C16_PMON_CTL3, MSR_UNC_V3_C16_PMON_CTR3, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX17C0", PMC80, CBOX17, MSR_UNC_V3_C17_PMON_CTL0, MSR_UNC_V3_C17_PMON_CTR0, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX17C1", PMC81, CBOX17, MSR_UNC_V3_C17_PMON_CTL1, MSR_UNC_V3_C17_PMON_CTR1, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX17C2", PMC82, CBOX17, MSR_UNC_V3_C17_PMON_CTL2, MSR_UNC_V3_C17_PMON_CTR2, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"CBOX17C3", PMC83, CBOX17, MSR_UNC_V3_C17_PMON_CTL3, MSR_UNC_V3_C17_PMON_CTR3, 0, 0, HAS_EP_VALID_OPTIONS_CBOX},
+    {"UBOX0", PMC84, UBOX, MSR_UNC_V3_U_PMON_CTL0, MSR_UNC_V3_U_PMON_CTR0, 0, 0, HAS_EP_VALID_OPTIONS_UBOX}, /* UBOX: two counters plus the UCLK fixed counter below */
+    {"UBOX1", PMC85, UBOX, MSR_UNC_V3_U_PMON_CTL1, MSR_UNC_V3_U_PMON_CTR1, 0, 0, HAS_EP_VALID_OPTIONS_UBOX},
+    {"UBOXFIX", PMC86, UBOXFIX, MSR_UNC_V3_U_UCLK_FIXED_CTL, MSR_UNC_V3_U_UCLK_FIXED_CTR, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"SBOX0C0", PMC87, SBOX0, MSR_UNC_V3_S0_PMON_CTL_0, MSR_UNC_V3_S0_PMON_CTR_0, 0, 0, HAS_EP_VALID_OPTIONS_SBOX}, /* SBOX0..SBOX3: four MSR counters per box */
+    {"SBOX0C1", PMC88, SBOX0, MSR_UNC_V3_S0_PMON_CTL_1, MSR_UNC_V3_S0_PMON_CTR_1, 0, 0, HAS_EP_VALID_OPTIONS_SBOX},
+    {"SBOX0C2", PMC89, SBOX0, MSR_UNC_V3_S0_PMON_CTL_2, MSR_UNC_V3_S0_PMON_CTR_2, 0, 0, HAS_EP_VALID_OPTIONS_SBOX},
+    {"SBOX0C3", PMC90, SBOX0, MSR_UNC_V3_S0_PMON_CTL_3, MSR_UNC_V3_S0_PMON_CTR_3, 0, 0, HAS_EP_VALID_OPTIONS_SBOX},
+    {"SBOX1C0", PMC91, SBOX1, MSR_UNC_V3_S1_PMON_CTL_0, MSR_UNC_V3_S1_PMON_CTR_0, 0, 0, HAS_EP_VALID_OPTIONS_SBOX},
+    {"SBOX1C1", PMC92, SBOX1, MSR_UNC_V3_S1_PMON_CTL_1, MSR_UNC_V3_S1_PMON_CTR_1, 0, 0, HAS_EP_VALID_OPTIONS_SBOX},
+    {"SBOX1C2", PMC93, SBOX1, MSR_UNC_V3_S1_PMON_CTL_2, MSR_UNC_V3_S1_PMON_CTR_2, 0, 0, HAS_EP_VALID_OPTIONS_SBOX},
+    {"SBOX1C3", PMC94, SBOX1, MSR_UNC_V3_S1_PMON_CTL_3, MSR_UNC_V3_S1_PMON_CTR_3, 0, 0, HAS_EP_VALID_OPTIONS_SBOX},
+    {"SBOX2C0", PMC95, SBOX2, MSR_UNC_V3_S2_PMON_CTL_0, MSR_UNC_V3_S2_PMON_CTR_0, 0, 0, HAS_EP_VALID_OPTIONS_SBOX},
+    {"SBOX2C1", PMC96, SBOX2, MSR_UNC_V3_S2_PMON_CTL_1, MSR_UNC_V3_S2_PMON_CTR_1, 0, 0, HAS_EP_VALID_OPTIONS_SBOX},
+    {"SBOX2C2", PMC97, SBOX2, MSR_UNC_V3_S2_PMON_CTL_2, MSR_UNC_V3_S2_PMON_CTR_2, 0, 0, HAS_EP_VALID_OPTIONS_SBOX},
+    {"SBOX2C3", PMC98, SBOX2, MSR_UNC_V3_S2_PMON_CTL_3, MSR_UNC_V3_S2_PMON_CTR_3, 0, 0, HAS_EP_VALID_OPTIONS_SBOX},
+    {"SBOX3C0", PMC99, SBOX3, MSR_UNC_V3_S3_PMON_CTL_0, MSR_UNC_V3_S3_PMON_CTR_0, 0, 0, HAS_EP_VALID_OPTIONS_SBOX},
+    {"SBOX3C1", PMC100, SBOX3, MSR_UNC_V3_S3_PMON_CTL_1, MSR_UNC_V3_S3_PMON_CTR_1, 0, 0, HAS_EP_VALID_OPTIONS_SBOX},
+    {"SBOX3C2", PMC101, SBOX3, MSR_UNC_V3_S3_PMON_CTL_2, MSR_UNC_V3_S3_PMON_CTR_2, 0, 0, HAS_EP_VALID_OPTIONS_SBOX},
+    {"SBOX3C3", PMC102, SBOX3, MSR_UNC_V3_S3_PMON_CTL_3, MSR_UNC_V3_S3_PMON_CTR_3, 0, 0, HAS_EP_VALID_OPTIONS_SBOX},
+    {"WBOX0", PMC103, WBOX, MSR_UNC_V3_PCU_PMON_CTL0, MSR_UNC_V3_PCU_PMON_CTR0, 0, 0, HAS_EP_VALID_OPTIONS_WBOX}, /* WBOX: four counters on the PCU_PMON registers */
+    {"WBOX1", PMC104, WBOX, MSR_UNC_V3_PCU_PMON_CTL1, MSR_UNC_V3_PCU_PMON_CTR1, 0, 0, HAS_EP_VALID_OPTIONS_WBOX},
+    {"WBOX2", PMC105, WBOX, MSR_UNC_V3_PCU_PMON_CTL2, MSR_UNC_V3_PCU_PMON_CTR2, 0, 0, HAS_EP_VALID_OPTIONS_WBOX},
+    {"WBOX3", PMC106, WBOX, MSR_UNC_V3_PCU_PMON_CTL3, MSR_UNC_V3_PCU_PMON_CTR3, 0, 0, HAS_EP_VALID_OPTIONS_WBOX},
+    {"WBOX0FIX", PMC107, WBOX0FIX, 0, MSR_UNC_V3_PCU_CC6_CTR, 0, 0, EVENT_OPTION_NONE_MASK}, /* NOTE(review): all four WBOXnFIX rows use type WBOX0FIX - confirm this is intended */
+    {"WBOX1FIX", PMC108, WBOX0FIX, 0, MSR_UNC_V3_PCU_CC3_CTR, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"WBOX2FIX", PMC109, WBOX0FIX, 0, MSR_UNC_V3_PCU_PC2_CTR, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"WBOX3FIX", PMC110, WBOX0FIX, 0, MSR_UNC_V3_PCU_PC3_CTR , 0, 0, EVENT_OPTION_NONE_MASK},
+    {"BBOX0C0", PMC111, BBOX0, PCI_UNC_HA_PMON_CTL_0, PCI_UNC_HA_PMON_CTR_0_A, PCI_UNC_HA_PMON_CTR_0_B, PCI_HA_DEVICE_0, HAS_EP_VALID_OPTIONS_BBOX}, /* first PCI-based row: from here on rows name a PCI device, most with an _A/_B counter register pair */
+    {"BBOX0C1", PMC112, BBOX0, PCI_UNC_HA_PMON_CTL_1, PCI_UNC_HA_PMON_CTR_1_A, PCI_UNC_HA_PMON_CTR_1_B, PCI_HA_DEVICE_0, HAS_EP_VALID_OPTIONS_BBOX},
+    {"BBOX0C2", PMC113, BBOX0, PCI_UNC_HA_PMON_CTL_2, PCI_UNC_HA_PMON_CTR_2_A, PCI_UNC_HA_PMON_CTR_2_B, PCI_HA_DEVICE_0, HAS_EP_VALID_OPTIONS_BBOX},
+    {"BBOX0C3", PMC114, BBOX0, PCI_UNC_HA_PMON_CTL_3, PCI_UNC_HA_PMON_CTR_3_A, PCI_UNC_HA_PMON_CTR_3_B, PCI_HA_DEVICE_0, HAS_EP_VALID_OPTIONS_BBOX},
+    {"BBOX1C0", PMC115, BBOX1, PCI_UNC_HA_PMON_CTL_0, PCI_UNC_HA_PMON_CTR_0_A, PCI_UNC_HA_PMON_CTR_0_B, PCI_HA_DEVICE_1, HAS_EP_VALID_OPTIONS_BBOX},
+    {"BBOX1C1", PMC116, BBOX1, PCI_UNC_HA_PMON_CTL_1, PCI_UNC_HA_PMON_CTR_1_A, PCI_UNC_HA_PMON_CTR_1_B, PCI_HA_DEVICE_1, HAS_EP_VALID_OPTIONS_BBOX},
+    {"BBOX1C2", PMC117, BBOX1, PCI_UNC_HA_PMON_CTL_2, PCI_UNC_HA_PMON_CTR_2_A, PCI_UNC_HA_PMON_CTR_2_B, PCI_HA_DEVICE_1, HAS_EP_VALID_OPTIONS_BBOX},
+    {"BBOX1C3", PMC118, BBOX1, PCI_UNC_HA_PMON_CTL_3, PCI_UNC_HA_PMON_CTR_3_A, PCI_UNC_HA_PMON_CTR_3_B, PCI_HA_DEVICE_1, HAS_EP_VALID_OPTIONS_BBOX},
+    {"MBOX0C0", PMC119, MBOX0, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_0_CH_0, HAS_EP_VALID_OPTIONS_MBOX}, /* MBOX0..MBOX7: same register offsets, one IMC device/channel per box */
+    {"MBOX0C1", PMC120, MBOX0, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_0_CH_0, HAS_EP_VALID_OPTIONS_MBOX},
+    {"MBOX0C2", PMC121, MBOX0, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_0_CH_0, HAS_EP_VALID_OPTIONS_MBOX},
+    {"MBOX0C3", PMC122, MBOX0, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_0_CH_0, HAS_EP_VALID_OPTIONS_MBOX},
+    {"MBOX1C0", PMC123, MBOX1, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_0_CH_1, HAS_EP_VALID_OPTIONS_MBOX},
+    {"MBOX1C1", PMC124, MBOX1, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_0_CH_1, HAS_EP_VALID_OPTIONS_MBOX},
+    {"MBOX1C2", PMC125, MBOX1, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_0_CH_1, HAS_EP_VALID_OPTIONS_MBOX},
+    {"MBOX1C3", PMC126, MBOX1, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_0_CH_1, HAS_EP_VALID_OPTIONS_MBOX},
+    {"MBOX2C0", PMC127, MBOX2, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_0_CH_2, HAS_EP_VALID_OPTIONS_MBOX},
+    {"MBOX2C1", PMC128, MBOX2, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_0_CH_2, HAS_EP_VALID_OPTIONS_MBOX},
+    {"MBOX2C2", PMC129, MBOX2, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_0_CH_2, HAS_EP_VALID_OPTIONS_MBOX},
+    {"MBOX2C3", PMC130, MBOX2, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_0_CH_2, HAS_EP_VALID_OPTIONS_MBOX},
+    {"MBOX3C0", PMC131, MBOX3, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_0_CH_3, HAS_EP_VALID_OPTIONS_MBOX},
+    {"MBOX3C1", PMC132, MBOX3, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_0_CH_3, HAS_EP_VALID_OPTIONS_MBOX},
+    {"MBOX3C2", PMC133, MBOX3, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_0_CH_3, HAS_EP_VALID_OPTIONS_MBOX},
+    {"MBOX3C3", PMC134, MBOX3, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_0_CH_3, HAS_EP_VALID_OPTIONS_MBOX},
+    {"MBOX4C0", PMC135, MBOX4, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_1_CH_0, HAS_EP_VALID_OPTIONS_MBOX},
+    {"MBOX4C1", PMC136, MBOX4, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_1_CH_0, HAS_EP_VALID_OPTIONS_MBOX},
+    {"MBOX4C2", PMC137, MBOX4, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_1_CH_0, HAS_EP_VALID_OPTIONS_MBOX},
+    {"MBOX4C3", PMC138, MBOX4, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_1_CH_0, HAS_EP_VALID_OPTIONS_MBOX},
+    {"MBOX5C0", PMC139, MBOX5, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_1_CH_1, HAS_EP_VALID_OPTIONS_MBOX},
+    {"MBOX5C1", PMC140, MBOX5, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_1_CH_1, HAS_EP_VALID_OPTIONS_MBOX},
+    {"MBOX5C2", PMC141, MBOX5, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_1_CH_1, HAS_EP_VALID_OPTIONS_MBOX},
+    {"MBOX5C3", PMC142, MBOX5, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_1_CH_1, HAS_EP_VALID_OPTIONS_MBOX},
+    {"MBOX6C0", PMC143, MBOX6, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_1_CH_2, HAS_EP_VALID_OPTIONS_MBOX},
+    {"MBOX6C1", PMC144, MBOX6, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_1_CH_2, HAS_EP_VALID_OPTIONS_MBOX},
+    {"MBOX6C2", PMC145, MBOX6, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_1_CH_2, HAS_EP_VALID_OPTIONS_MBOX},
+    {"MBOX6C3", PMC146, MBOX6, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_1_CH_2, HAS_EP_VALID_OPTIONS_MBOX},
+    {"MBOX7C0", PMC147, MBOX7, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_1_CH_3, HAS_EP_VALID_OPTIONS_MBOX},
+    {"MBOX7C1", PMC148, MBOX7, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_1_CH_3, HAS_EP_VALID_OPTIONS_MBOX},
+    {"MBOX7C2", PMC149, MBOX7, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_1_CH_3, HAS_EP_VALID_OPTIONS_MBOX},
+    {"MBOX7C3", PMC150, MBOX7, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_1_CH_3, HAS_EP_VALID_OPTIONS_MBOX},
+    {"IBOX0C0", PMC151, IBOX0, PCI_UNC_IRP0_PMON_CTL_0, PCI_UNC_IRP0_PMON_CTR_0, 0, PCI_IRP_DEVICE, HAS_EP_VALID_OPTIONS_IBOX}, /* IBOX0/IBOX1 share PCI_IRP_DEVICE and have a single counter register per row */
+    {"IBOX0C1", PMC152, IBOX0, PCI_UNC_IRP0_PMON_CTL_1, PCI_UNC_IRP0_PMON_CTR_1, 0, PCI_IRP_DEVICE, HAS_EP_VALID_OPTIONS_IBOX},
+    {"IBOX1C0", PMC153, IBOX1, PCI_UNC_IRP1_PMON_CTL_0, PCI_UNC_IRP1_PMON_CTR_0, 0, PCI_IRP_DEVICE, HAS_EP_VALID_OPTIONS_IBOX},
+    {"IBOX1C1", PMC154, IBOX1, PCI_UNC_IRP1_PMON_CTL_1, PCI_UNC_IRP1_PMON_CTR_1, 0, PCI_IRP_DEVICE, HAS_EP_VALID_OPTIONS_IBOX},
+    {"PBOX0", PMC155, PBOX, PCI_UNC_R2PCIE_PMON_CTL_0, PCI_UNC_R2PCIE_PMON_CTR_0_A, PCI_UNC_R2PCIE_PMON_CTR_0_B, PCI_R2PCIE_DEVICE, HAS_EP_VALID_OPTIONS_PBOX},
+    {"PBOX1", PMC156, PBOX, PCI_UNC_R2PCIE_PMON_CTL_1, PCI_UNC_R2PCIE_PMON_CTR_1_A, PCI_UNC_R2PCIE_PMON_CTR_1_B, PCI_R2PCIE_DEVICE, HAS_EP_VALID_OPTIONS_PBOX},
+    {"PBOX2", PMC157, PBOX, PCI_UNC_R2PCIE_PMON_CTL_2, PCI_UNC_R2PCIE_PMON_CTR_2_A, PCI_UNC_R2PCIE_PMON_CTR_2_B, PCI_R2PCIE_DEVICE, HAS_EP_VALID_OPTIONS_PBOX},
+    {"PBOX3", PMC158, PBOX, PCI_UNC_R2PCIE_PMON_CTL_3, PCI_UNC_R2PCIE_PMON_CTR_3_A, PCI_UNC_R2PCIE_PMON_CTR_3_B, PCI_R2PCIE_DEVICE, HAS_EP_VALID_OPTIONS_PBOX},
+    {"RBOX0C0", PMC159, RBOX0, PCI_UNC_R3QPI_PMON_CTL_0, PCI_UNC_R3QPI_PMON_CTR_0_A, PCI_UNC_R3QPI_PMON_CTR_0_B, PCI_R3QPI_DEVICE_LINK_0, HAS_EP_VALID_OPTIONS_RBOX}, /* RBOX0/RBOX1: three counters per box */
+    {"RBOX0C1", PMC160, RBOX0, PCI_UNC_R3QPI_PMON_CTL_1, PCI_UNC_R3QPI_PMON_CTR_1_A, PCI_UNC_R3QPI_PMON_CTR_1_B, PCI_R3QPI_DEVICE_LINK_0, HAS_EP_VALID_OPTIONS_RBOX},
+    {"RBOX0C2", PMC161, RBOX0, PCI_UNC_R3QPI_PMON_CTL_2, PCI_UNC_R3QPI_PMON_CTR_2_A, PCI_UNC_R3QPI_PMON_CTR_2_B, PCI_R3QPI_DEVICE_LINK_0, HAS_EP_VALID_OPTIONS_RBOX},
+    {"RBOX1C0", PMC162, RBOX1, PCI_UNC_R3QPI_PMON_CTL_0, PCI_UNC_R3QPI_PMON_CTR_0_A, PCI_UNC_R3QPI_PMON_CTR_0_B, PCI_R3QPI_DEVICE_LINK_1, HAS_EP_VALID_OPTIONS_RBOX},
+    {"RBOX1C1", PMC163, RBOX1, PCI_UNC_R3QPI_PMON_CTL_1, PCI_UNC_R3QPI_PMON_CTR_1_A, PCI_UNC_R3QPI_PMON_CTR_1_B, PCI_R3QPI_DEVICE_LINK_1, HAS_EP_VALID_OPTIONS_RBOX},
+    {"RBOX1C2", PMC164, RBOX1, PCI_UNC_R3QPI_PMON_CTL_2, PCI_UNC_R3QPI_PMON_CTR_2_A, PCI_UNC_R3QPI_PMON_CTR_2_B, PCI_R3QPI_DEVICE_LINK_1, HAS_EP_VALID_OPTIONS_RBOX},
+    {"QBOX0C0", PMC165, QBOX0, PCI_UNC_V3_QPI_PMON_CTL_0, PCI_UNC_V3_QPI_PMON_CTR_0_A, PCI_UNC_V3_QPI_PMON_CTR_0_B, PCI_QPI_DEVICE_PORT_0, HAS_EP_VALID_OPTIONS_QBOX},
+    {"QBOX0C1", PMC166, QBOX0, PCI_UNC_V3_QPI_PMON_CTL_1, PCI_UNC_V3_QPI_PMON_CTR_1_A, PCI_UNC_V3_QPI_PMON_CTR_1_B, PCI_QPI_DEVICE_PORT_0, HAS_EP_VALID_OPTIONS_QBOX},
+    {"QBOX0C2", PMC167, QBOX0, PCI_UNC_V3_QPI_PMON_CTL_2, PCI_UNC_V3_QPI_PMON_CTR_2_A, PCI_UNC_V3_QPI_PMON_CTR_2_B, PCI_QPI_DEVICE_PORT_0, HAS_EP_VALID_OPTIONS_QBOX},
+    {"QBOX0C3", PMC168, QBOX0, PCI_UNC_V3_QPI_PMON_CTL_3, PCI_UNC_V3_QPI_PMON_CTR_3_A, PCI_UNC_V3_QPI_PMON_CTR_3_B, PCI_QPI_DEVICE_PORT_0, HAS_EP_VALID_OPTIONS_QBOX},
+    {"QBOX1C0", PMC169, QBOX1, PCI_UNC_V3_QPI_PMON_CTL_0, PCI_UNC_V3_QPI_PMON_CTR_0_A, PCI_UNC_V3_QPI_PMON_CTR_0_B, PCI_QPI_DEVICE_PORT_1, HAS_EP_VALID_OPTIONS_QBOX},
+    {"QBOX1C1", PMC170, QBOX1, PCI_UNC_V3_QPI_PMON_CTL_1, PCI_UNC_V3_QPI_PMON_CTR_1_A, PCI_UNC_V3_QPI_PMON_CTR_1_B, PCI_QPI_DEVICE_PORT_1, HAS_EP_VALID_OPTIONS_QBOX},
+    {"QBOX1C2", PMC171, QBOX1, PCI_UNC_V3_QPI_PMON_CTL_2, PCI_UNC_V3_QPI_PMON_CTR_2_A, PCI_UNC_V3_QPI_PMON_CTR_2_B, PCI_QPI_DEVICE_PORT_1, HAS_EP_VALID_OPTIONS_QBOX},
+    {"QBOX1C3", PMC172, QBOX1, PCI_UNC_V3_QPI_PMON_CTL_3, PCI_UNC_V3_QPI_PMON_CTR_3_A, PCI_UNC_V3_QPI_PMON_CTR_3_B, PCI_QPI_DEVICE_PORT_1, HAS_EP_VALID_OPTIONS_QBOX},
+    {"QBOX0FIX0", PMC173, QBOX0FIX, 0x0, PCI_UNC_V3_QPI_RATE_STATUS, 0x0, PCI_QPI_MISC_DEVICE_PORT_0, EVENT_OPTION_NONE_MASK}, /* QBOXnFIX rows read the QPI MISC devices; no control register, no options */
+    {"QBOX0FIX1", PMC174, QBOX0FIX, 0x0, PCI_UNC_V3_QPI_LINK_IDLE, 0x0, PCI_QPI_MISC_DEVICE_PORT_0, EVENT_OPTION_NONE_MASK},
+    {"QBOX0FIX2", PMC175, QBOX0FIX, 0x0, PCI_UNC_V3_QPI_LINK_LLR, 0x0, PCI_QPI_MISC_DEVICE_PORT_0, EVENT_OPTION_NONE_MASK},
+    {"QBOX1FIX0", PMC176, QBOX1FIX, 0x0, PCI_UNC_V3_QPI_RATE_STATUS, 0x0, PCI_QPI_MISC_DEVICE_PORT_1, EVENT_OPTION_NONE_MASK},
+    {"QBOX1FIX1", PMC177, QBOX1FIX, 0x0, PCI_UNC_V3_QPI_LINK_IDLE, 0x0, PCI_QPI_MISC_DEVICE_PORT_1, EVENT_OPTION_NONE_MASK},
+    {"QBOX1FIX2", PMC178, QBOX1FIX, 0x0, PCI_UNC_V3_QPI_LINK_LLR, 0x0, PCI_QPI_MISC_DEVICE_PORT_1, EVENT_OPTION_NONE_MASK},
+};
+
+
+static BoxMap haswellEP_box_map[NUM_UNITS] = { /* per-unit registers, indexed by unit type; columns look like: control reg, status reg, overflow-reset reg, integer (-1 where unused; meaning not visible here), PCI flag, PCI device, counter width in bits, optional filter regs - confirm against BoxMap */
+    [PMC] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, -1, 0, 0, 48},
+    [THERMAL] = {0,0,0,-1,0,0,8},
+    [FIXED] =  {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, -1, 0, 0, 48},
+    [POWER] = {0,0,0,-1,0,0,32},
+    [CBOX0] = {MSR_UNC_V3_C0_PMON_BOX_CTL, MSR_UNC_V3_C0_PMON_BOX_STATUS, MSR_UNC_V3_C0_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C0_PMON_BOX_FILTER0, MSR_UNC_V3_C0_PMON_BOX_FILTER1}, /* each CBOX carries two filter registers */
+    [CBOX1] = {MSR_UNC_V3_C1_PMON_BOX_CTL, MSR_UNC_V3_C1_PMON_BOX_STATUS, MSR_UNC_V3_C1_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C1_PMON_BOX_FILTER0, MSR_UNC_V3_C1_PMON_BOX_FILTER1},
+    [CBOX2] = {MSR_UNC_V3_C2_PMON_BOX_CTL, MSR_UNC_V3_C2_PMON_BOX_STATUS, MSR_UNC_V3_C2_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C2_PMON_BOX_FILTER0, MSR_UNC_V3_C2_PMON_BOX_FILTER1},
+    [CBOX3] = {MSR_UNC_V3_C3_PMON_BOX_CTL, MSR_UNC_V3_C3_PMON_BOX_STATUS, MSR_UNC_V3_C3_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C3_PMON_BOX_FILTER0, MSR_UNC_V3_C3_PMON_BOX_FILTER1},
+    [CBOX4] = {MSR_UNC_V3_C4_PMON_BOX_CTL, MSR_UNC_V3_C4_PMON_BOX_STATUS, MSR_UNC_V3_C4_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C4_PMON_BOX_FILTER0, MSR_UNC_V3_C4_PMON_BOX_FILTER1},
+    [CBOX5] = {MSR_UNC_V3_C5_PMON_BOX_CTL, MSR_UNC_V3_C5_PMON_BOX_STATUS, MSR_UNC_V3_C5_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C5_PMON_BOX_FILTER0, MSR_UNC_V3_C5_PMON_BOX_FILTER1},
+    [CBOX6] = {MSR_UNC_V3_C6_PMON_BOX_CTL, MSR_UNC_V3_C6_PMON_BOX_STATUS, MSR_UNC_V3_C6_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C6_PMON_BOX_FILTER0, MSR_UNC_V3_C6_PMON_BOX_FILTER1},
+    [CBOX7] = {MSR_UNC_V3_C7_PMON_BOX_CTL, MSR_UNC_V3_C7_PMON_BOX_STATUS, MSR_UNC_V3_C7_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C7_PMON_BOX_FILTER0, MSR_UNC_V3_C7_PMON_BOX_FILTER1},
+    [CBOX8] = {MSR_UNC_V3_C8_PMON_BOX_CTL, MSR_UNC_V3_C8_PMON_BOX_STATUS, MSR_UNC_V3_C8_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C8_PMON_BOX_FILTER0, MSR_UNC_V3_C8_PMON_BOX_FILTER1},
+    [CBOX9] = {MSR_UNC_V3_C9_PMON_BOX_CTL, MSR_UNC_V3_C9_PMON_BOX_STATUS, MSR_UNC_V3_C9_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C9_PMON_BOX_FILTER0, MSR_UNC_V3_C9_PMON_BOX_FILTER1},
+    [CBOX10] = {MSR_UNC_V3_C10_PMON_BOX_CTL, MSR_UNC_V3_C10_PMON_BOX_STATUS, MSR_UNC_V3_C10_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C10_PMON_BOX_FILTER0, MSR_UNC_V3_C10_PMON_BOX_FILTER1},
+    [CBOX11] = {MSR_UNC_V3_C11_PMON_BOX_CTL, MSR_UNC_V3_C11_PMON_BOX_STATUS, MSR_UNC_V3_C11_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C11_PMON_BOX_FILTER0, MSR_UNC_V3_C11_PMON_BOX_FILTER1},
+    [CBOX12] = {MSR_UNC_V3_C12_PMON_BOX_CTL, MSR_UNC_V3_C12_PMON_BOX_STATUS, MSR_UNC_V3_C12_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C12_PMON_BOX_FILTER0, MSR_UNC_V3_C12_PMON_BOX_FILTER1},
+    [CBOX13] = {MSR_UNC_V3_C13_PMON_BOX_CTL, MSR_UNC_V3_C13_PMON_BOX_STATUS, MSR_UNC_V3_C13_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C13_PMON_BOX_FILTER0, MSR_UNC_V3_C13_PMON_BOX_FILTER1},
+    [CBOX14] = {MSR_UNC_V3_C14_PMON_BOX_CTL, MSR_UNC_V3_C14_PMON_BOX_STATUS, MSR_UNC_V3_C14_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C14_PMON_BOX_FILTER0, MSR_UNC_V3_C14_PMON_BOX_FILTER1},
+    [CBOX15] = {MSR_UNC_V3_C15_PMON_BOX_CTL, MSR_UNC_V3_C15_PMON_BOX_STATUS, MSR_UNC_V3_C15_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C15_PMON_BOX_FILTER0, MSR_UNC_V3_C15_PMON_BOX_FILTER1},
+    [CBOX16] = {MSR_UNC_V3_C16_PMON_BOX_CTL, MSR_UNC_V3_C16_PMON_BOX_STATUS, MSR_UNC_V3_C16_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C16_PMON_BOX_FILTER0, MSR_UNC_V3_C16_PMON_BOX_FILTER1},
+    [CBOX17] = {MSR_UNC_V3_C17_PMON_BOX_CTL, MSR_UNC_V3_C17_PMON_BOX_STATUS, MSR_UNC_V3_C17_PMON_BOX_STATUS, -1, 0, 0, 48, MSR_UNC_V3_C17_PMON_BOX_FILTER0, MSR_UNC_V3_C17_PMON_BOX_FILTER1},
+    [UBOX] = {0, MSR_UNC_V3_U_PMON_BOX_STATUS, MSR_UNC_V3_U_PMON_BOX_STATUS, 1, 0, 0, 48}, /* UBOX and UBOXFIX have no box control register and share one status register */
+    [UBOXFIX] = {0, MSR_UNC_V3_U_PMON_BOX_STATUS, MSR_UNC_V3_U_PMON_BOX_STATUS, 0, 0, 0, 48},
+    [SBOX0] = {MSR_UNC_V3_S0_PMON_BOX_CTL, MSR_UNC_V3_S0_PMON_BOX_STATUS, MSR_UNC_V3_S0_PMON_BOX_STATUS, -1, 0, 0, 48},
+    [SBOX1] = {MSR_UNC_V3_S1_PMON_BOX_CTL, MSR_UNC_V3_S1_PMON_BOX_STATUS, MSR_UNC_V3_S1_PMON_BOX_STATUS, -1, 0, 0, 48},
+    [SBOX2] = {MSR_UNC_V3_S2_PMON_BOX_CTL, MSR_UNC_V3_S2_PMON_BOX_STATUS, MSR_UNC_V3_S2_PMON_BOX_STATUS, -1, 0, 0, 48},
+    [SBOX3] = {MSR_UNC_V3_S3_PMON_BOX_CTL, MSR_UNC_V3_S3_PMON_BOX_STATUS, MSR_UNC_V3_S3_PMON_BOX_STATUS, -1, 0, 0, 48},
+    [WBOX] = {MSR_UNC_V3_PCU_PMON_BOX_CTL, MSR_UNC_V3_PCU_PMON_BOX_STATUS,MSR_UNC_V3_PCU_PMON_BOX_STATUS, 2, 0, 0, 48, MSR_UNC_V3_PCU_PMON_BOX_FILTER},
+    [WBOX0FIX] = {0,0,0,-1,0,0,64},
+    [BBOX0] = {PCI_UNC_HA_PMON_BOX_CTL, PCI_UNC_HA_PMON_BOX_STATUS, PCI_UNC_HA_PMON_BOX_STATUS, 21, 1, PCI_HA_DEVICE_0, 48}, /* from here on the units are PCI-based: fifth column is 1 and a PCI device is named */
+    [BBOX1] = {PCI_UNC_HA_PMON_BOX_CTL, PCI_UNC_HA_PMON_BOX_STATUS, PCI_UNC_HA_PMON_BOX_STATUS, 22, 1, PCI_HA_DEVICE_1, 48},
+    [MBOX0] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 23, 1, PCI_IMC_DEVICE_0_CH_0, 48},
+    [MBOX1] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 23, 1, PCI_IMC_DEVICE_0_CH_1, 48},
+    [MBOX2] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 23, 1, PCI_IMC_DEVICE_0_CH_2, 48},
+    [MBOX3] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 23, 1, PCI_IMC_DEVICE_0_CH_3, 48},
+    [MBOX4] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 24, 1, PCI_IMC_DEVICE_1_CH_0, 48},
+    [MBOX5] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 24, 1, PCI_IMC_DEVICE_1_CH_1, 48},
+    [MBOX6] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 24, 1, PCI_IMC_DEVICE_1_CH_2, 48},
+    [MBOX7] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 24, 1, PCI_IMC_DEVICE_1_CH_3, 48},
+    [IBOX0] = {PCI_UNC_IRP_PMON_BOX_CTL, PCI_UNC_IRP_PMON_BOX_STATUS, PCI_UNC_IRP_PMON_BOX_STATUS, 34, 1, PCI_IRP_DEVICE, 48},
+    [IBOX1] = {PCI_UNC_IRP_PMON_BOX_CTL, PCI_UNC_IRP_PMON_BOX_STATUS, PCI_UNC_IRP_PMON_BOX_STATUS, 34, 1, PCI_IRP_DEVICE, 48},
+    [PBOX] = {PCI_UNC_R2PCIE_PMON_BOX_CTL, PCI_UNC_R2PCIE_PMON_BOX_STATUS, PCI_UNC_R2PCIE_PMON_BOX_STATUS, 29, 1, PCI_R2PCIE_DEVICE, 48},
+    [RBOX0] = {PCI_UNC_R3QPI_PMON_BOX_CTL, PCI_UNC_R3QPI_PMON_BOX_STATUS, PCI_UNC_R3QPI_PMON_BOX_STATUS, 27, 1, PCI_R3QPI_DEVICE_LINK_0, 44}, /* RBOX rows use 44 where the other PMON boxes use 48 */
+    [RBOX1] = {PCI_UNC_R3QPI_PMON_BOX_CTL, PCI_UNC_R3QPI_PMON_BOX_STATUS, PCI_UNC_R3QPI_PMON_BOX_STATUS, 28, 1, PCI_R3QPI_DEVICE_LINK_1, 44},
+    [QBOX0] = {PCI_UNC_V3_QPI_PMON_BOX_CTL, PCI_UNC_V3_QPI_PMON_BOX_STATUS, PCI_UNC_V3_QPI_PMON_BOX_STATUS, 25, 1, PCI_QPI_DEVICE_PORT_0, 48},
+    [QBOX1] = {PCI_UNC_V3_QPI_PMON_BOX_CTL, PCI_UNC_V3_QPI_PMON_BOX_STATUS, PCI_UNC_V3_QPI_PMON_BOX_STATUS, 26, 1, PCI_QPI_DEVICE_PORT_1, 48},
+    [QBOX0FIX] = {0x0, 0x0, 0x0, -1, 1, PCI_QPI_MISC_DEVICE_PORT_0, 32},
+    [QBOX1FIX] = {0x0, 0x0, 0x0, -1, 1, PCI_QPI_MISC_DEVICE_PORT_1, 32},
+};
+
+static PciDevice haswellEP_pci_devices[MAX_NUM_PCI_DEVICES] = { /* PCI device descriptions, indexed by device enum; columns look like: device class, "device.function" string, device name, unit name (NULL if none), hex id - confirm against PciDevice */
+ [MSR_DEV] = {NONE, "", "MSR", ""},
+ [PCI_R3QPI_DEVICE_LINK_0] = {R3QPI, "0b.1", "PCI_R3QPI_DEVICE_LINK_0", "RBOX0", 0x2F36},
+ [PCI_R3QPI_DEVICE_LINK_1] = {R3QPI, "0b.2", "PCI_R3QPI_DEVICE_LINK_1", "RBOX1", 0x2F37},
+ [PCI_R2PCIE_DEVICE] = {R2PCIE, "10.1", "PCI_R2PCIE_DEVICE", "PBOX0", 0x2F34},
+ [PCI_IMC_DEVICE_0_CH_0] = {IMC, "14.0", "PCI_IMC_DEVICE_0_CH_0", "MBOX0", 0x2FB4},
+ [PCI_IMC_DEVICE_0_CH_1] = {IMC, "14.1", "PCI_IMC_DEVICE_0_CH_1", "MBOX1", 0x2FB5},
+ [PCI_IMC_DEVICE_0_CH_2] = {IMC, "15.0", "PCI_IMC_DEVICE_0_CH_2", "MBOX2", 0x2FB0},
+ [PCI_IMC_DEVICE_0_CH_3] = {IMC, "15.1", "PCI_IMC_DEVICE_0_CH_3", "MBOX3", 0x2FB1},
+ [PCI_HA_DEVICE_0] = {HA, "12.1", "PCI_HA_DEVICE_0", "BBOX0", 0x2F30},
+ [PCI_HA_DEVICE_1] = {HA, "12.5", "PCI_HA_DEVICE_1", "BBOX1", 0x2F38},
+ [PCI_IMC_DEVICE_1_CH_0] = {IMC, "17.0", "PCI_IMC_DEVICE_1_CH_0", "MBOX4", 0x2FD4},
+ [PCI_IMC_DEVICE_1_CH_1] = {IMC, "17.1", "PCI_IMC_DEVICE_1_CH_1", "MBOX5", 0x2FD5},
+ [PCI_IMC_DEVICE_1_CH_2] = {IMC, "18.0", "PCI_IMC_DEVICE_1_CH_2", "MBOX6", 0x2FD0},
+ [PCI_IMC_DEVICE_1_CH_3] = {IMC, "18.1", "PCI_IMC_DEVICE_1_CH_3", "MBOX7", 0x2FD1},
+ [PCI_IRP_DEVICE] = {IRP, "05.6", "PCI_IRP_DEVICE", NULL, 0x2F39}, /* no unit name although IBOX0/IBOX1 counters use this device */
+ [PCI_QPI_DEVICE_PORT_0] = {QPI, "08.2", "PCI_QPI_DEVICE_PORT_0", "QBOX0", 0x2F32},
+ [PCI_QPI_DEVICE_PORT_1] = {QPI, "09.2", "PCI_QPI_DEVICE_PORT_1", "QBOX1", 0x2F33},
+ [PCI_QPI_DEVICE_PORT_2] = {QPI, "0a.2", "PCI_QPI_DEVICE_PORT_2", "QBOX2", 0x2F3A},
+ [PCI_QPI_MASK_DEVICE_PORT_0] = {QPI, "08.6", "PCI_QPI_MASK_DEVICE_PORT_0", NULL, 0x2F86},
+ [PCI_QPI_MASK_DEVICE_PORT_1] = {QPI, "09.6", "PCI_QPI_MASK_DEVICE_PORT_1", NULL, 0x2F96},
+ [PCI_QPI_MASK_DEVICE_PORT_2] = {QPI, "0a.6", "PCI_QPI_MASK_DEVICE_PORT_2", NULL, 0x2F46},
+ [PCI_QPI_MISC_DEVICE_PORT_0] = {QPI, "08.0", "PCI_QPI_MISC_DEVICE_PORT_01", "QBOX01FIX", 0x2F80}, /* NOTE(review): name strings say PORT_01/QBOX01FIX while the index is PORT_0 - presumably one device serves ports 0 and 1; confirm */
+ [PCI_QPI_MISC_DEVICE_PORT_1] = {QPI, "0a.0", "PCI_QPI_MISC_DEVICE_PORT_2", "QBOX2FIX", 0x2F40}, /* NOTE(review): index is PORT_1 but the strings and the 0a.x address match port 2 - confirm the QBOX1FIX counters should read this device */
+};
+
diff --git a/src/includes/perfmon_haswellEP_events.txt b/src/includes/perfmon_haswellEP_events.txt
new file mode 100644
index 0000000..ba9ae1f
--- /dev/null
+++ b/src/includes/perfmon_haswellEP_events.txt
@@ -0,0 +1,2391 @@
+# =======================================================================================
+#
+#      Filename:  perfmon_haswellEP_events.txt
+#
+#      Description:  Event list for Intel Haswell EP/EN/EX
+#
+#      Version:   4.0
+#      Released:  16.6.2015
+#
+#      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+#                Thomas Roehl (tr), thomas.roehl at googlemail.com
+#      Project:  likwid
+#
+#      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
+#
+#      This program is free software: you can redistribute it and/or modify it under
+#      the terms of the GNU General Public License as published by the Free Software
+#      Foundation, either version 3 of the License, or (at your option) any later
+#      version.
+#
+#      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+#      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+#      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+#
+#      You should have received a copy of the GNU General Public License along with
+#      this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+# =======================================================================================
+
+EVENT_TEMP_CORE          0x00   TMP0
+UMASK_TEMP_CORE          0x00
+
+EVENT_PWR_PKG_ENERGY          0x00   PWR0
+UMASK_PWR_PKG_ENERGY          0x00
+
+EVENT_PWR_PP0_ENERGY          0x00   PWR1
+UMASK_PWR_PP0_ENERGY          0x00
+
+EVENT_PWR_PP1_ENERGY          0x00   PWR2
+UMASK_PWR_PP1_ENERGY          0x00
+
+EVENT_PWR_DRAM_ENERGY          0x00   PWR3
+UMASK_PWR_DRAM_ENERGY          0x00
+
+EVENT_INSTR_RETIRED              0x00   FIXC0
+UMASK_INSTR_RETIRED_ANY          0x00
+
+EVENT_CPU_CLK_UNHALTED           0x00   FIXC1
+UMASK_CPU_CLK_UNHALTED_CORE      0x00
+
+EVENT_CPU_CLK_UNHALTED           0x00   FIXC2
+UMASK_CPU_CLK_UNHALTED_REF       0x00
+
+EVENT_LD_BLOCKS                 0x03  PMC
+UMASK_LD_BLOCKS_STORE_FORWARD   0x02
+
+EVENT_MISALIGN_MEM_REF           0x05  PMC
+UMASK_MISALIGN_MEM_REF_LOADS      0x01
+UMASK_MISALIGN_MEM_REF_STORES     0x02
+UMASK_MISALIGN_MEM_REF_ANY        0x03
+
+EVENT_LD_BLOCKS_PARTIAL      0x07  PMC
+UMASK_LD_BLOCKS_PARTIAL_ADDRESS_ALIAS   0x01
+
+EVENT_DTLB_LOAD_MISSES                 0x08  PMC
+UMASK_DTLB_LOAD_MISSES_CAUSES_A_WALK         0x01
+UMASK_DTLB_LOAD_MISSES_WALK_COMPLETED_4K     0x02
+UMASK_DTLB_LOAD_MISSES_WALK_COMPLETED_LARGE  0x04
+UMASK_DTLB_LOAD_MISSES_WALK_COMPLETED        0x0E
+UMASK_DTLB_LOAD_MISSES_WALK_DURATION         0x10
+UMASK_DTLB_LOAD_MISSES_STLB_HIT_4K           0x20
+UMASK_DTLB_LOAD_MISSES_STLB_HIT_2M           0x40
+UMASK_DTLB_LOAD_MISSES_STLB_HIT              0x60
+UMASK_DTLB_LOAD_MISSES_PDE_CACHE_MISS        0x80
+
+EVENT_INT_MISC            0x0D  PMC
+UMASK_INT_MISC_RECOVERY_CYCLES  0x03 0x00 0x01
+
+EVENT_UOPS_ISSUED                0x0E  PMC
+UMASK_UOPS_ISSUED_ANY            0x01
+UMASK_UOPS_ISSUED_FLAGS_MERGE    0x10
+UMASK_UOPS_ISSUED_SLOW_LEA       0x20
+UMASK_UOPS_ISSUED_SINGLE_MUL     0x40
+
+EVENT_L2_RQSTS                   0x24   PMC
+UMASK_L2_RQSTS_ALL_DEMAND_DATA_RD_MISS 0x21
+UMASK_L2_RQSTS_ALL_DEMAND_DATA_RD_HIT 0x41
+UMASK_L2_RQSTS_ALL_DEMAND_DATA_RD     0xE1
+UMASK_L2_RQSTS_RFO_HIT           0x42
+UMASK_L2_RQSTS_RFO_MISS          0x22
+UMASK_L2_RQSTS_RFO_ANY           0xE2
+UMASK_L2_RQSTS_CODE_RD_HIT        0x44
+UMASK_L2_RQSTS_CODE_RD_MISS       0x24
+UMASK_L2_RQSTS_ALL_DEMAND_MISS   0x27
+UMASK_L2_RQSTS_ALL_DEMAND_REFERENCES   0xE7
+UMASK_L2_RQSTS_ALL_CODE_RD   0xE4
+UMASK_L2_RQSTS_L2_PF_HIT      0x50
+UMASK_L2_RQSTS_L2_PF_MISS     0x30
+UMASK_L2_RQSTS_ALL_PF        0xF8
+UMASK_L2_RQSTS_MISS              0x3F
+UMASK_L2_RQSTS_REFERENCES        0xFF
+
+EVENT_L2_DEMAND_RQST_WB_HIT            0x27   PMC
+UMASK_L2_DEMAND_RQST_WB_HIT       0x50
+
+EVENT_LONGEST_LAT_CACHE               0x2E   PMC
+UMASK_LONGEST_LAT_CACHE_REFERENCE     0x4F
+UMASK_LONGEST_LAT_CACHE_MISS          0x41
+
+EVENT_CPU_CLOCK_UNHALTED         0x3C   PMC
+UMASK_CPU_CLOCK_UNHALTED_THREAD_P  0x00
+UMASK_CPU_CLOCK_UNHALTED_REF_P     0x01
+
+EVENT_L1D_PEND_MISS              0x48   PMC2
+UMASK_L1D_PEND_MISS_PENDING      0x01
+
+EVENT_DTLB_STORE_MISSES                0x49   PMC
+UMASK_DTLB_STORE_MISSES_CAUSES_A_WALK   0x01
+UMASK_DTLB_STORE_MISSES_WALK_COMPLETED_4K       0x02
+UMASK_DTLB_STORE_MISSES_WALK_COMPLETED_LARGE    0x04
+UMASK_DTLB_STORE_MISSES_WALK_COMPLETED          0x0E
+UMASK_DTLB_STORE_MISSES_WALK_DURATION       0x10
+UMASK_DTLB_STORE_MISSES_STLB_HIT_4K             0x20
+UMASK_DTLB_STORE_MISSES_STLB_HIT_LARGE             0x40
+UMASK_DTLB_STORE_MISSES_STLB_HIT              0x60
+UMASK_DTLB_STORE_MISSES_PDE_CACHE_MISS              0x80
+
+EVENT_LOAD_HIT_PRE               0x4C    PMC
+UMASK_LOAD_HIT_PRE_SW_PF               0x01
+UMASK_LOAD_HIT_PRE_HW_PF               0x02
+
+EVENT_L1D                        0x51   PMC
+UMASK_L1D_REPLACEMENT             0x01
+UMASK_L1D_ALLOCATED_IN_M          0x02
+UMASK_L1D_M_EVICT                 0x04
+UMASK_L1D_ALL_M_REPLACEMENT       0x08
+
+EVENT_MOVE_ELIMINATION                        0x58   PMC
+UMASK_MOVE_ELIMINATION_INT_NOT_ELIMINATED     0x04
+UMASK_MOVE_ELIMINATION_SIMD_NOT_ELIMINATED    0x08
+UMASK_MOVE_ELIMINATION_INT_ELIMINATED         0x01
+UMASK_MOVE_ELIMINATION_SIMD_ELIMINATED        0x02
+
+EVENT_CPL_CYCLES               0x5C    PMC
+UMASK_CPL_CYCLES_RING0             0x01
+UMASK_CPL_CYCLES_RING123             0x02
+
+EVENT_RS_EVENTS               0x5E    PMC
+UMASK_RS_EVENTS_EMPTY_CYCLES    0x01
+
+EVENT_OFFCORE_REQUESTS_OUTSTANDING          0x60   PMC
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_DATA_RD   0x01
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_CODE_RD   0x02
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_RFO   0x04
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_ALL_DATA_RD   0x08
+
+EVENT_CACHE_LOCK_CYCLES          0x63   PMC
+UMASK_CACHE_LOCK_CYCLES_SPLIT_LOCK_UC_LOCK_DURATION      0x01
+UMASK_CACHE_LOCK_CYCLES_CACHE_LOCK_DURATION       0x02
+
+EVENT_IDQ               0x79   PMC
+UMASK_IDQ_EMPTY         0x02
+UMASK_IDQ_MITE_UOPS     0x04
+UMASK_IDQ_DSB_UOPS      0x08
+UMASK_IDQ_MS_DSB_UOPS   0x10
+UMASK_IDQ_MS_MITE_UOPS  0x20
+UMASK_IDQ_MS_UOPS       0x30
+UMASK_IDQ_ALL_DSB_CYCLES_ANY_UOPS      0x18 0x00 0x01
+UMASK_IDQ_ALL_DSB_CYCLES_4_UOPS        0x18 0x00 0x04
+UMASK_IDQ_ALL_MITE_CYCLES_ANY_UOPS       0x24 0x00 0x01
+UMASK_IDQ_ALL_MITE_CYCLES_4_UOPS       0x24 0x00 0x04
+UMASK_IDQ_MITE_ALL_UOPS       0x3C
+
+EVENT_ICACHE                  0x80   PMC
+UMASK_ICACHE_HITS             0x01
+UMASK_ICACHE_MISSES             0x02
+UMASK_ICACHE_ACCESSES           0x03
+UMASK_ICACHE_IFETCH_STALL       0x04
+
+EVENT_ITLB_MISSES                 0x85      PMC
+UMASK_ITLB_MISSES_CAUSES_A_WALK   0x01
+UMASK_ITLB_MISSES_WALK_COMPLETED_4K  0x02
+UMASK_ITLB_MISSES_WALK_COMPLETED_LARGE  0x04
+UMASK_ITLB_MISSES_WALK_COMPLETED     0x0E
+UMASK_ITLB_MISSES_WALK_DURATION   0x10
+UMASK_ITLB_MISSES_STLB_HIT_4K   0x20
+UMASK_ITLB_MISSES_STLB_HIT_2M   0x40
+UMASK_ITLB_MISSES_STLB_HIT   0x60
+
+EVENT_ILD_STALL                 0x87      PMC
+UMASK_ILD_STALL_LCP             0x01
+UMASK_ILD_STALL_IQ_FULL         0x04
+
+EVENT_BR_INST_EXEC                                      0x88   PMC
+UMASK_BR_INST_EXEC_COND_TAKEN                           0x81
+UMASK_BR_INST_EXEC_COND_NON_TAKEN                       0x41
+UMASK_BR_INST_EXEC_DIRECT_JMP_TAKEN                     0x82
+UMASK_BR_INST_EXEC_DIRECT_JMP_NON_TAKEN                 0x42
+UMASK_BR_INST_EXEC_INDIRECT_JMP_NON_CALL_RET_TAKEN      0x84
+UMASK_BR_INST_EXEC_INDIRECT_JMP_NON_CALL_RET_NON_TAKEN  0x44
+UMASK_BR_INST_EXEC_RETURN_NEAR_TAKEN                    0x88
+UMASK_BR_INST_EXEC_RETURN_NEAR_NON_TAKEN                0x48
+UMASK_BR_INST_EXEC_DIRECT_NEAR_CALL_TAKEN               0x90
+UMASK_BR_INST_EXEC_DIRECT_NEAR_CALL_NON_TAKEN           0x50
+UMASK_BR_INST_EXEC_INDIRECT_NEAR_CALL_TAKEN             0xA0 
+UMASK_BR_INST_EXEC_INDIRECT_NEAR_CALL_NON_TAKEN         0x60 
+UMASK_BR_INST_EXEC_ALL_BRANCHES                         0xFF 
+
+EVENT_BR_MISP_EXEC                                      0x89   PMC
+UMASK_BR_MISP_EXEC_COND_TAKEN                           0x81
+UMASK_BR_MISP_EXEC_COND_NON_TAKEN                       0x41
+UMASK_BR_MISP_EXEC_INDIRECT_JMP_NON_CALL_RET_TAKEN      0x84
+UMASK_BR_MISP_EXEC_INDIRECT_JMP_NON_CALL_RET_NON_TAKEN  0x44
+UMASK_BR_MISP_EXEC_RETURN_NEAR_TAKEN                    0x88
+UMASK_BR_MISP_EXEC_RETURN_NEAR_NON_TAKEN                0x48
+UMASK_BR_MISP_EXEC_DIRECT_NEAR_CALL_TAKEN               0x90
+UMASK_BR_MISP_EXEC_DIRECT_NEAR_CALL_NON_TAKEN           0x50
+UMASK_BR_MISP_EXEC_INDIRECT_NEAR_CALL_TAKEN             0xA0
+UMASK_BR_MISP_EXEC_INDIRECT_NEAR_CALL_NON_TAKEN         0x60
+UMASK_BR_MISP_EXEC_ALL_BRANCHES                         0xFF
+
+EVENT_IDQ_UOPS_NOT_DELIVERED                    0x9C   PMC
+UMASK_IDQ_UOPS_NOT_DELIVERED_CORE               0x01
+
+EVENT_UOPS_EXECUTED_PORT                 0xA1   PMC
+UMASK_UOPS_EXECUTED_PORT_PORT_0           0x01
+UMASK_UOPS_EXECUTED_PORT_PORT_1           0x02
+UMASK_UOPS_EXECUTED_PORT_PORT_2           0x04
+UMASK_UOPS_EXECUTED_PORT_PORT_3           0x08
+UMASK_UOPS_EXECUTED_PORT_PORT_4           0x10
+UMASK_UOPS_EXECUTED_PORT_PORT_5           0x20
+UMASK_UOPS_EXECUTED_PORT_PORT_6           0x40
+UMASK_UOPS_EXECUTED_PORT_PORT_7           0x80
+
+EVENT_RESOURCE_STALLS                 0xA2   PMC
+UMASK_RESOURCE_STALLS_ANY             0x01
+UMASK_RESOURCE_STALLS_RS              0x04
+UMASK_RESOURCE_STALLS_SB               0x08
+UMASK_RESOURCE_STALLS_ROB             0x10
+
+EVENT_CYCLE_ACTIVITY                 0xA3   PMC
+UMASK_CYCLE_ACTIVITY_CYCLES_L2_PENDING             0x01
+UMASK_CYCLE_ACTIVITY_CYCLES_LDM_PENDING              0x02
+UMASK_CYCLE_ACTIVITY_STALL_L2_PENDING            0x05
+UMASK_CYCLE_ACTIVITY_L1D_PENDING               0x08
+
+EVENT_LSD_UOPS                 0xA8   PMC
+UMASK_LSD_UOPS             0x01
+
+EVENT_ITLB                         0xAE   PMC
+UMASK_ITLB_ITLB_FLUSH            0x01
+
+EVENT_OFFCORE_REQUESTS     0xB0   PMC
+UMASK_OFFCORE_REQUESTS_DEMAND_DATA_RD   0x01
+UMASK_OFFCORE_REQUESTS_DEMAND_CODE_RD   0x02
+UMASK_OFFCORE_REQUESTS_DEMAND_RFO       0x04
+UMASK_OFFCORE_REQUESTS_ALL_DATA_RD      0x08
+
+EVENT_UOPS_EXECUTED               0xB1   PMC
+UMASK_UOPS_EXECUTED_CORE              0x02
+
+EVENT_PAGE_WALKER_LOADS          0xBC  PMC
+UMASK_PAGE_WALKER_LOADS_DTLB_L1     0x11
+UMASK_PAGE_WALKER_LOADS_ITLB_L1     0x21
+UMASK_PAGE_WALKER_LOADS_DTLB_L2     0x12
+UMASK_PAGE_WALKER_LOADS_ITLB_L2     0x22
+UMASK_PAGE_WALKER_LOADS_DTLB_L3     0x14
+UMASK_PAGE_WALKER_LOADS_ITLB_L3     0x24
+UMASK_PAGE_WALKER_LOADS_DTLB_MEMORY     0x18
+UMASK_PAGE_WALKER_LOADS_ITLB_MEMORY     0x28
+
+EVENT_TLB_FLUSH          0xBD  PMC
+UMASK_TLB_FLUSH_DTLB_THREAD     0x01
+UMASK_TLB_FLUSH_STLB_ANY        0x20
+
+EVENT_INST_RETIRED                  0xC0  PMC1
+UMASK_INST_RETIRED_ANY_P            0x00
+UMASK_INST_RETIRED_ALL              0x01
+
+EVENT_OTHER_ASSISTS                  0xC1  PMC
+UMASK_OTHER_ASSISTS_AVX_TO_SSE            0x08
+UMASK_OTHER_ASSISTS_SSE_TO_AVX            0x10
+UMASK_OTHER_ASSISTS_ANY_WB_ASSIST         0x40
+
+EVENT_UOPS_RETIRED                  0xC2  PMC
+UMASK_UOPS_RETIRED_ALL              0x01
+UMASK_UOPS_RETIRED_RETIRE_SLOTS     0x02
+
+EVENT_MACHINE_CLEARS              0xC3  PMC
+UMASK_MACHINE_CLEARS_MEMORY_ORDERING    0x02
+UMASK_MACHINE_CLEARS_SMC                0x04
+UMASK_MACHINE_CLEARS_MASKMOV            0x20
+
+EVENT_BR_INST_RETIRED               0xC4  PMC
+UMASK_BR_INST_RETIRED_ALL_BRANCHES  0x00
+UMASK_BR_INST_RETIRED_CONDITIONAL   0x01
+UMASK_BR_INST_RETIRED_NEAR_CALL     0x02
+UMASK_BR_INST_RETIRED_ALL_BRANCHES_PEBS  0x04
+UMASK_BR_INST_RETIRED_NEAR_RETURN   0x08
+UMASK_BR_INST_RETIRED_NOT_TAKEN     0x10
+UMASK_BR_INST_RETIRED_NEAR_TAKEN    0x20
+UMASK_BR_INST_RETIRED_FAR_BRANCH    0x40
+
+EVENT_BR_MISP_RETIRED               0xC5  PMC
+UMASK_BR_MISP_RETIRED_ALL_BRANCHES  0x00
+UMASK_BR_MISP_RETIRED_CONDITIONAL  0x01
+UMASK_BR_MISP_RETIRED_ALL_BRANCHES_PEBS     0x04
+UMASK_BR_MISP_RETIRED_NOT_TAKEN      0x10
+UMASK_BR_MISP_RETIRED_TAKEN      0x20
+
+EVENT_FP_ASSIST               0xCA  PMC
+UMASK_FP_ASSIST_X87_OUTPUT               0x02
+UMASK_FP_ASSIST_X87_INPUT                0x04
+UMASK_FP_ASSIST_SIMD_OUTPUT               0x08
+UMASK_FP_ASSIST_SIMD_INPUT               0x10
+UMASK_FP_ASSIST_ANY               0x1E
+
+EVENT_ROB_MISC_EVENT_LBR_INSERTS               0xCC  PMC
+UMASK_ROB_MISC_EVENT_LBR_INSERTS               0x20
+
+
+EVENT_MEM_UOP_RETIRED            0xD0    PMC
+UMASK_MEM_UOP_RETIRED_LOADS            0x81
+UMASK_MEM_UOP_RETIRED_STORES           0x82
+UMASK_MEM_UOP_RETIRED_LOADS_STLB_MISS         0x11
+UMASK_MEM_UOP_RETIRED_STORES_STLB_MISS        0x12
+UMASK_MEM_UOP_RETIRED_LOADS_LOCK              0x21
+UMASK_MEM_UOP_RETIRED_STORES_LOCK             0x22
+UMASK_MEM_UOP_RETIRED_LOADS_SPLIT             0x41
+UMASK_MEM_UOP_RETIRED_STORES_SPLIT            0x42
+
+EVENT_MEM_LOAD_UOPS_RETIRED               0xD1   PMC
+UMASK_MEM_LOAD_UOPS_RETIRED_L1_HIT       0x01
+UMASK_MEM_LOAD_UOPS_RETIRED_L1_MISS      0x08
+UMASK_MEM_LOAD_UOPS_RETIRED_L1_ALL       0x09
+UMASK_MEM_LOAD_UOPS_RETIRED_L2_HIT       0x02
+UMASK_MEM_LOAD_UOPS_RETIRED_L2_MISS      0x10
+UMASK_MEM_LOAD_UOPS_RETIRED_L2_ALL       0x12
+UMASK_MEM_LOAD_UOPS_RETIRED_L3_HIT       0x04
+UMASK_MEM_LOAD_UOPS_RETIRED_L3_MISS      0x20
+UMASK_MEM_LOAD_UOPS_RETIRED_L3_ALL       0x24
+UMASK_MEM_LOAD_UOPS_RETIRED_HIT_LFB      0x40
+
+EVENT_MEM_LOAD_UOPS_LLC_HIT_RETIRED               0xD2   PMC
+UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_MISS         0x01
+UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HIT          0x02
+UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM         0x04
+UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_NONE         0x08
+
+EVENT_MEM_LOAD_UOPS_LLC_MISS_RETIRED               0xD3   PMC
+UMASK_MEM_LOAD_UOPS_LLC_MISS_RETIRED_LOCAL_DRAM      0x01
+
+EVENT_BACLEARS               0xE6   PMC
+UMASK_BACLEARS_ANY           0x1F
+
+EVENT_L2_TRANS               0xF0  PMC
+UMASK_L2_TRANS_DEMAND_DATA_RD          0x01
+UMASK_L2_TRANS_RFO           0x02
+UMASK_L2_TRANS_CODE_RD       0x04
+UMASK_L2_TRANS_ALL_PREF      0x08
+UMASK_L2_TRANS_L1D_WB        0x10
+UMASK_L2_TRANS_L2_FILL       0x20
+UMASK_L2_TRANS_L2_WB         0x40
+UMASK_L2_TRANS_ALL_REQUESTS  0x80
+
+EVENT_L2_LINES_IN                   0xF1   PMC
+UMASK_L2_LINES_IN_I           0x01
+UMASK_L2_LINES_IN_S            0x02
+UMASK_L2_LINES_IN_E           0x04
+UMASK_L2_LINES_IN_ALL               0x07
+
+EVENT_L2_LINES_OUT                  0xF2   PMC
+UMASK_L2_LINES_OUT_DEMAND_CLEAN     0x05
+UMASK_L2_LINES_OUT_DEMAND_DIRTY     0x06
+
+EVENT_TX_MEM          0x54   PMC
+UMASK_TX_MEM_ABORT_CONFLICT     0x01
+UMASK_TX_MEM_ABORT_CAPACITY     0x02
+UMASK_TX_MEM_ABORT_HLE_STORE_TO_ELIDED_LOCK     0x04
+UMASK_TX_MEM_ABORT_HLE_ELISION_BUFFER_NOT_EMPTY     0x08
+UMASK_TX_MEM_ABORT_HLE_ELISION_BUFFER_MISMATCH     0x10
+UMASK_TX_MEM_ABORT_HLE_ELISION_BUFFER_UNSUPP_ALIGNMENT     0x20
+UMASK_TX_MEM_ABORT_HLE_ELISION_BUFFER_FULL     0x40
+
+EVENT_TX_EXEC          0x5D   PMC
+UMASK_TX_EXEC_MISC1     0x01
+UMASK_TX_EXEC_MISC2     0x02
+UMASK_TX_EXEC_MISC3     0x04
+UMASK_TX_EXEC_MISC4     0x08
+
+EVENT_HLE_RETIRED                  0xC8   PMC
+UMASK_HLE_RETIRED_START            0x01
+UMASK_HLE_RETIRED_COMMIT           0x02
+UMASK_HLE_RETIRED_ABORTED           0x04
+UMASK_HLE_RETIRED_ABORTED_MISC1     0x08
+UMASK_HLE_RETIRED_ABORTED_MISC2     0x10
+UMASK_HLE_RETIRED_ABORTED_MISC3     0x20
+UMASK_HLE_RETIRED_ABORTED_MISC4     0x40
+UMASK_HLE_RETIRED_ABORTED_MISC5     0x80
+
+EVENT_RTM_RETIRED                  0xC9   PMC
+UMASK_RTM_RETIRED_START            0x01
+UMASK_RTM_RETIRED_COMMIT           0x02
+UMASK_RTM_RETIRED_ABORTED           0x04
+UMASK_RTM_RETIRED_ABORTED_MISC1     0x08
+UMASK_RTM_RETIRED_ABORTED_MISC2     0x10
+UMASK_RTM_RETIRED_ABORTED_MISC3     0x20
+UMASK_RTM_RETIRED_ABORTED_MISC4     0x40
+UMASK_RTM_RETIRED_ABORTED_MISC5     0x80
+
+EVENT_OFFCORE_RESPONSE_0                            0xB7 PMC
+OPTIONS_OFFCORE_RESPONSE_0_OPTIONS                  EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_0_OPTIONS                    0x01 0xFF 0xFF
+OPTIONS_OFFCORE_RESPONSE_0_LOCAL_DRAM               EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+DEFAULT_OPTIONS_OFFCORE_RESPONSE_0_LOCAL_DRAM       EVENT_OPTION_MATCH0=0x8FFF,EVENT_OPTION_MATCH1=0x60040
+UMASK_OFFCORE_RESPONSE_0_LOCAL_DRAM                 0x01 0xFF 0xFF
+OPTIONS_OFFCORE_RESPONSE_0_REMOTE_DRAM              EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+DEFAULT_OPTIONS_OFFCORE_RESPONSE_0_REMOTE_DRAM      EVENT_OPTION_MATCH0=0x8FFF,EVENT_OPTION_MATCH1=0x67F80
+UMASK_OFFCORE_RESPONSE_0_REMOTE_DRAM                0x01 0xFF 0xFF
+UMASK_OFFCORE_RESPONSE_0_DMND_DATA_RD_ANY           0x01 0x00 0x10
+UMASK_OFFCORE_RESPONSE_0_DMND_RFO_ANY               0x01 0x01 0x10
+UMASK_OFFCORE_RESPONSE_0_DMND_CODE_RD_ANY           0x01 0x02 0x10
+UMASK_OFFCORE_RESPONSE_0_WB_ANY                     0x01 0x03 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_DATA_RD_ANY          0x01 0x04 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_RFO_ANY              0x01 0x05 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_CODE_RD_ANY          0x01 0x06 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_DATA_RD_ANY          0x01 0x07 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_RFO_ANY              0x01 0x08 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_CODE_RD_ANY          0x01 0x09 0x10
+UMASK_OFFCORE_RESPONSE_0_SPLIT_LOCK_UC_LOCK_ANY     0x01 0x0A 0x10
+UMASK_OFFCORE_RESPONSE_0_STREAMING_STORES_ANY       0x01 0x0B 0x10
+UMASK_OFFCORE_RESPONSE_0_OTHER_ANY                  0x01 0x0F 0x10
+
+EVENT_OFFCORE_RESPONSE_1                            0xBB PMC
+OPTIONS_OFFCORE_RESPONSE_1_OPTIONS                  EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_1_OPTIONS                    0x01 0xFF 0xFF
+OPTIONS_OFFCORE_RESPONSE_1_LOCAL_DRAM               EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+DEFAULT_OPTIONS_OFFCORE_RESPONSE_1_LOCAL_DRAM       EVENT_OPTION_MATCH0=0x08FFF,EVENT_OPTION_MATCH1=0x60040
+UMASK_OFFCORE_RESPONSE_1_LOCAL_DRAM                 0x01 0xFF 0xFF
+OPTIONS_OFFCORE_RESPONSE_1_REMOTE_DRAM              EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+DEFAULT_OPTIONS_OFFCORE_RESPONSE_1_REMOTE_DRAM      EVENT_OPTION_MATCH0=0x08FFF,EVENT_OPTION_MATCH1=0x67F80
+UMASK_OFFCORE_RESPONSE_1_REMOTE_DRAM                0x01 0xFF 0xFF
+UMASK_OFFCORE_RESPONSE_1_DMND_DATA_RD_ANY           0x01 0x00 0x10
+UMASK_OFFCORE_RESPONSE_1_DMND_RFO_ANY               0x01 0x01 0x10
+UMASK_OFFCORE_RESPONSE_1_DMND_CODE_RD_ANY           0x01 0x02 0x10
+UMASK_OFFCORE_RESPONSE_1_WB_ANY                     0x01 0x03 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_DATA_RD_ANY          0x01 0x04 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_RFO_ANY              0x01 0x05 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_CODE_RD_ANY          0x01 0x06 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_DATA_RD_ANY          0x01 0x07 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_RFO_ANY              0x01 0x08 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_CODE_RD_ANY          0x01 0x09 0x10
+UMASK_OFFCORE_RESPONSE_1_SPLIT_LOCK_UC_LOCK_ANY     0x01 0x0A 0x10
+UMASK_OFFCORE_RESPONSE_1_STREAMING_STORES_ANY       0x01 0x0B 0x10
+UMASK_OFFCORE_RESPONSE_1_OTHER_ANY                  0x01 0x0F 0x10
+
+EVENT_CBOX_CLOCKTICKS               0x00 CBOX
+UMASK_CBOX_CLOCKTICKS               0x00
+
+EVENT_TXR_INSERTS                   0x02 CBOX
+UMASK_TXR_INSERTS_AD_CACHE          0x01
+UMASK_TXR_INSERTS_AK_CACHE          0x02
+UMASK_TXR_INSERTS_BL_CACHE          0x04
+UMASK_TXR_INSERTS_IV_CACHE          0x08
+UMASK_TXR_INSERTS_AD_CORE           0x10
+UMASK_TXR_INSERTS_AK_CORE           0x20
+UMASK_TXR_INSERTS_BL_CORE           0x40
+
+EVENT_TXR_ADS_USED                  0x04 CBOX
+UMASK_TXR_ADS_USED_AD               0x01
+UMASK_TXR_ADS_USED_AK               0x02
+UMASK_TXR_ADS_USED_BL               0x04
+
+EVENT_RING_BOUNCES                  0x05 CBOX
+UMASK_RING_BOUNCES_AD               0x01
+UMASK_RING_BOUNCES_AK               0x02
+UMASK_RING_BOUNCES_BL               0x04
+UMASK_RING_BOUNCES_IV               0x08
+
+EVENT_RING_SRC_THRTL                0x07 CBOX
+UMASK_RING_SRC_THRTL                0x00
+
+EVENT_FAST_ASSERTED                 0x09 CBOX0C0|CBOX0C1|CBOX1C0|CBOX1C1|CBOX2C0|CBOX2C1|CBOX3C0|CBOX3C1|CBOX4C0|CBOX4C1|CBOX5C0|CBOX5C1|CBOX6C0|CBOX6C1|CBOX7C0|CBOX7C1|CBOX8C0|CBOX8C1|CBOX9C0|CBOX9C1|CBOX10C0|CBOX10C1|CBOX11C0|CBOX11C1|CBOX12C0|CBOX12C1|CBOX13C0|CBOX13C1|CBOX14C0|CBOX14C1|CBOX15C0|CBOX15C1|CBOX16C0|CBOX16C1|CBOX17C0|CBOX17C1
+UMASK_FAST_ASSERTED                 0x00
+
+EVENT_BOUNCE_CONTROL                0x0A CBOX
+UMASK_BOUNCE_CONTROL                0x00
+
+EVENT_RXR_OCCUPANCY                 0x11 CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX8C0|CBOX9C0|CBOX10C0|CBOX11C0|CBOX12C0|CBOX13C0|CBOX14C0|CBOX15C0|CBOX16C0|CBOX17C0
+UMASK_RXR_OCCUPANCY_IRQ             0x01
+UMASK_RXR_OCCUPANCY_IRQ_REJ         0x02
+UMASK_RXR_OCCUPANCY_IPQ             0x04
+UMASK_RXR_OCCUPANCY_PRQ_REJ         0x20
+UMASK_RXR_OCCUPANCY_IRQ_IPQ         0x05
+UMASK_RXR_OCCUPANCY_IRQ_PRQ_REJ     0x21
+UMASK_RXR_OCCUPANCY_IPQ_PRQ_REJ     0x24
+
+EVENT_RXR_EXT_STARVED               0x12 CBOX
+UMASK_RXR_EXT_STARVED_IRQ           0x01
+UMASK_RXR_EXT_STARVED_IPQ           0x02
+UMASK_RXR_EXT_STARVED_PRQ           0x04
+UMASK_RXR_EXT_STARVED_ISMQ_BIDS     0x08
+
+EVENT_RXR_INSERTS                   0x13 CBOX
+UMASK_RXR_INSERTS_IRQ               0x01
+UMASK_RXR_INSERTS_IRQ_REJ           0x02
+UMASK_RXR_INSERTS_IPQ               0x04
+UMASK_RXR_INSERTS_PRQ               0x10
+UMASK_RXR_INSERTS_PRQ_REJ           0x20
+
+EVENT_RING_AD_USED                  0x1B CBOX
+UMASK_RING_AD_USED_UP_EVEN          0x01
+UMASK_RING_AD_USED_UP_ODD           0x02
+UMASK_RING_AD_USED_UP               0x03
+UMASK_RING_AD_USED_DOWN_EVEN        0x04
+UMASK_RING_AD_USED_DOWN_ODD         0x08
+UMASK_RING_AD_USED_DOWN             0x0C
+UMASK_RING_AD_USED_ANY              0x0F
+
+EVENT_RING_AK_USED                  0x1C CBOX
+UMASK_RING_AK_USED_UP_EVEN          0x01
+UMASK_RING_AK_USED_UP_ODD           0x02
+UMASK_RING_AK_USED_UP               0x03
+UMASK_RING_AK_USED_DOWN_EVEN        0x04
+UMASK_RING_AK_USED_DOWN_ODD         0x08
+UMASK_RING_AK_USED_DOWN             0x0C
+UMASK_RING_AK_USED_ANY              0x0F
+
+EVENT_RING_BL_USED                  0x1D CBOX
+UMASK_RING_BL_USED_UP_EVEN          0x01
+UMASK_RING_BL_USED_UP_ODD           0x02
+UMASK_RING_BL_USED_UP               0x03
+UMASK_RING_BL_USED_DOWN_EVEN        0x04
+UMASK_RING_BL_USED_DOWN_ODD         0x08
+UMASK_RING_BL_USED_DOWN             0x0C
+UMASK_RING_BL_USED_ANY              0x0F
+
+EVENT_RING_IV_USED                  0x1E CBOX
+UMASK_RING_IV_USED_UP               0x03
+UMASK_RING_IV_USED_DN               0x0C
+UMASK_RING_IV_USED_ANY              0x0F
+
+
+EVENT_COUNTER0_OCCUPANCY            0x1F CBOX
+UMASK_COUNTER0_OCCUPANCY            0x00
+
+EVENT_RXR_IPQ_RETRY2                0x28 CBOX
+UMASK_RXR_IPQ_RETRY2_AD_SBO         0x01
+OPTIONS_RXR_IPQ_RETRY2_TARGET       EVENT_OPTION_NID_MASK
+UMASK_RXR_IPQ_RETRY2_TARGET         0x40
+
+EVENT_RXR_IRQ_RETRY2                0x29 CBOX
+UMASK_RXR_IRQ_RETRY2_AD_SBO         0x01
+UMASK_RXR_IRQ_RETRY2_BL_SBO         0x02
+OPTIONS_RXR_IRQ_RETRY2_TARGET       EVENT_OPTION_NID_MASK
+UMASK_RXR_IRQ_RETRY2_TARGET         0x40
+
+EVENT_RXR_ISMQ_RETRY2               0x2A CBOX
+UMASK_RXR_ISMQ_RETRY2_AD_SBO         0x01
+UMASK_RXR_ISMQ_RETRY2_BL_SBO         0x02
+OPTIONS_RXR_ISMQ_RETRY2_TARGET       EVENT_OPTION_NID_MASK
+UMASK_RXR_ISMQ_RETRY2_TARGET         0x40
+
+EVENT_RXR_IPQ_RETRY                 0x31 CBOX
+UMASK_RXR_IPQ_RETRY_ANY             0x01
+UMASK_RXR_IPQ_RETRY_FULL            0x02
+UMASK_RXR_IPQ_RETRY_ADDR_CONFLICT   0x04
+UMASK_RXR_IPQ_RETRY_QPI_CREDITS     0x10
+
+EVENT_RXR_IRQ_RETRY                 0x32 CBOX
+UMASK_RXR_IRQ_RETRY_ANY             0x01
+UMASK_RXR_IRQ_RETRY_FULL            0x02
+UMASK_RXR_IRQ_RETRY_ADDR_CONFLICT   0x04
+UMASK_RXR_IRQ_RETRY_RTID            0x08
+UMASK_RXR_IRQ_RETRY_QPI_CREDITS     0x10
+UMASK_RXR_IRQ_RETRY_IIO_CREDITS     0x20
+OPTIONS_RXR_IRQ_RETRY_NID           EVENT_OPTION_NID_MASK
+UMASK_RXR_IRQ_RETRY_NID             0x40
+
+EVENT_RXR_ISMQ_RETRY                0x33 CBOX
+UMASK_RXR_ISMQ_RETRY_ANY            0x01
+UMASK_RXR_ISMQ_RETRY_FULL           0x02
+UMASK_RXR_ISMQ_RETRY_RTID           0x08
+UMASK_RXR_ISMQ_RETRY_QPI_CREDITS    0x10
+UMASK_RXR_ISMQ_RETRY_IIO_CREDITS    0x20
+OPTIONS_RXR_ISMQ_RETRY_NID          EVENT_OPTION_NID_MASK
+UMASK_RXR_ISMQ_RETRY_NID            0x40
+OPTIONS_RXR_ISMQ_RETRY_WB_CREDITS   EVENT_OPTION_NID_MASK
+UMASK_RXR_ISMQ_RETRY_WB_CREDITS     0x80
+
+EVENT_LLC_LOOKUP                    0x34 CBOX
+OPTIONS_LLC_LOOKUP_DATA_READ        EVENT_OPTION_STATE_MASK
+UMASK_LLC_LOOKUP_DATA_READ          0x03
+OPTIONS_LLC_LOOKUP_WRITE            EVENT_OPTION_STATE_MASK
+UMASK_LLC_LOOKUP_WRITE              0x05
+OPTIONS_LLC_LOOKUP_REMOTE_SNOOP     EVENT_OPTION_STATE_MASK
+UMASK_LLC_LOOKUP_REMOTE_SNOOP       0x09
+OPTIONS_LLC_LOOKUP_ANY              EVENT_OPTION_STATE_MASK
+UMASK_LLC_LOOKUP_ANY                0x11
+OPTIONS_LLC_LOOKUP_READ             EVENT_OPTION_STATE_MASK
+UMASK_LLC_LOOKUP_READ               0x21
+OPTIONS_LLC_LOOKUP_NID              EVENT_OPTION_STATE_MASK|EVENT_OPTION_NID_MASK
+UMASK_LLC_LOOKUP_NID                0x41
+
+EVENT_TOR_INSERTS                   0x35 CBOX
+UMASK_TOR_INSERTS_ALL               0x08
+UMASK_TOR_INSERTS_WB                0x10
+OPTIONS_TOR_INSERTS_LOCAL_OPCODE    EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_INSERTS_LOCAL_OPCODE      0x21
+OPTIONS_TOR_INSERTS_MISS_LOCAL_OPCODE EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_INSERTS_MISS_LOCAL_OPCODE 0x23
+UMASK_TOR_INSERTS_LOCAL             0x28
+UMASK_TOR_INSERTS_MISS_LOCAL        0x2A
+OPTIONS_TOR_INSERTS_NID_OPCODE      EVENT_OPTION_NID_MASK|EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_INSERTS_NID_OPCODE        0x41
+OPTIONS_TOR_INSERTS_NID_MISS_OPCODE EVENT_OPTION_NID_MASK|EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_INSERTS_NID_MISS_OPCODE   0x43
+OPTIONS_TOR_INSERTS_NID_EVICTION    EVENT_OPTION_NID_MASK
+UMASK_TOR_INSERTS_NID_EVICTION      0x44
+OPTIONS_TOR_INSERTS_NID_ALL         EVENT_OPTION_NID_MASK
+UMASK_TOR_INSERTS_NID_ALL           0x48
+OPTIONS_TOR_INSERTS_NID_MISS_ALL    EVENT_OPTION_NID_MASK
+UMASK_TOR_INSERTS_NID_MISS_ALL      0x4A
+OPTIONS_TOR_INSERTS_NID_WB          EVENT_OPTION_NID_MASK
+UMASK_TOR_INSERTS_NID_WB            0x50
+OPTIONS_TOR_INSERTS_REMOTE_OPCODE   EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_INSERTS_REMOTE_OPCODE     0x81
+OPTIONS_TOR_INSERTS_MISS_REMOTE_OPCODE EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_INSERTS_MISS_REMOTE_OPCODE 0x83
+UMASK_TOR_INSERTS_REMOTE            0x88
+UMASK_TOR_INSERTS_MISS_REMOTE       0x8A
+
+EVENT_TOR_OCCUPANCY                 0x36 CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX8C0|CBOX9C0|CBOX10C0|CBOX11C0|CBOX12C0|CBOX13C0|CBOX14C0|CBOX15C0|CBOX16C0|CBOX17C0
+OPTIONS_TOR_OCCUPANCY_OPCODE        EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_OCCUPANCY_OPCODE          0x01
+OPTIONS_TOR_OCCUPANCY_MISS_OPCODE   EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_OCCUPANCY_MISS_OPCODE     0x03
+UMASK_TOR_OCCUPANCY_EVICTION        0x04
+UMASK_TOR_OCCUPANCY_ALL             0x08
+UMASK_TOR_OCCUPANCY_MISS_ALL        0x0A
+UMASK_TOR_OCCUPANCY_WB              0x10
+UMASK_TOR_OCCUPANCY_LOCAL_OPCODE    0x21
+UMASK_TOR_OCCUPANCY_MISS_LOCAL_OPCODE 0x23
+UMASK_TOR_OCCUPANCY_LOCAL           0x28
+UMASK_TOR_OCCUPANCY_MISS_LOCAL      0x2A
+OPTIONS_TOR_OCCUPANCY_NID_OPCODE    EVENT_OPTION_NID_MASK|EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_OCCUPANCY_NID_OPCODE      0x41
+OPTIONS_TOR_OCCUPANCY_NID_MISS_OPCODE EVENT_OPTION_NID_MASK|EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_OCCUPANCY_NID_MISS_OPCODE 0x43
+OPTIONS_TOR_OCCUPANCY_NID_EVICTION  EVENT_OPTION_NID_MASK
+UMASK_TOR_OCCUPANCY_NID_EVICTION    0x44
+OPTIONS_TOR_OCCUPANCY_NID_ALL       EVENT_OPTION_NID_MASK
+UMASK_TOR_OCCUPANCY_NID_ALL         0x48
+OPTIONS_TOR_OCCUPANCY_NID_MISS_ALL  EVENT_OPTION_NID_MASK
+UMASK_TOR_OCCUPANCY_NID_MISS_ALL    0x4A
+OPTIONS_TOR_OCCUPANCY_NID_WB        EVENT_OPTION_NID_MASK
+UMASK_TOR_OCCUPANCY_NID_WB          0x50
+OPTIONS_TOR_OCCUPANCY_REMOTE_OPCODE   EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_OCCUPANCY_REMOTE_OPCODE   0x81
+OPTIONS_TOR_OCCUPANCY_MISS_REMOTE_OPCODE   EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_OCCUPANCY_MISS_REMOTE_OPCODE 0x83
+UMASK_TOR_OCCUPANCY_REMOTE          0x88
+UMASK_TOR_OCCUPANCY_MISS_REMOTE     0x8A
+
+EVENT_LLC_VICTIMS                   0x37 CBOX
+UMASK_LLC_VICTIMS_M                 0x01
+UMASK_LLC_VICTIMS_E                 0x02
+UMASK_LLC_VICTIMS_I                 0x04
+UMASK_LLC_VICTIMS_F                 0x08
+UMASK_LLC_VICTIMS_MEIF              0x0F
+UMASK_LLC_VICTIMS_MISS              0x10
+OPTIONS_LLC_VICTIMS_NID             EVENT_OPTION_NID_MASK
+UMASK_LLC_VICTIMS_NID               0x40
+
+EVENT_MISC                          0x39 CBOX
+UMASK_MISC_RSPI_WAS_FSE             0x01
+UMASK_MISC_WC_ALIASING              0x02
+UMASK_MISC_STARTED                  0x04
+UMASK_MISC_RFO_HIT_S                0x08
+UMASK_MISC_CVZERO_PREFETCH_VICTIM   0x10
+UMASK_MISC_CVZERO_PREFETCH_MISS     0x20
+
+EVENT_SBO_CREDITS_ACQUIRED          0x3D CBOX
+UMASK_SBO_CREDITS_ACQUIRED_AD       0x01
+UMASK_SBO_CREDITS_ACQUIRED_BL       0x02
+
+EVENT_SBO_CREDIT_OCCUPANCY          0x3E CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX8C0|CBOX9C0|CBOX10C0|CBOX11C0|CBOX12C0|CBOX13C0|CBOX14C0|CBOX15C0|CBOX16C0|CBOX17C0
+UMASK_SBO_CREDIT_OCCUPANCY_AD       0x01
+UMASK_SBO_CREDIT_OCCUPANCY_BL       0x02
+
+EVENT_EVENT_MSG                     0x42 UBOX
+UMASK_EVENT_MSG_DOORBELL_RCVD       0x08
+
+EVENT_PHOLD_CYCLES                  0x45 UBOX
+UMASK_PHOLD_CYCLES_ASSERT_TO_ACK    0x01
+
+EVENT_RACU_REQUESTS                 0x46 UBOX
+UMASK_RACU_REQUESTS                 0x00
+
+EVENT_UNCORE_CLOCK                  0x00 UBOXFIX
+UMASK_UNCORE_CLOCK                  0x00
+
+EVENT_SBOX_CLOCKTICKS               0x00 SBOX
+UMASK_SBOX_CLOCKTICKS               0x00
+
+EVENT_TXR_OCCUPANCY                 0x01 SBOX
+UMASK_TXR_OCCUPANCY_AD_CRD          0x01
+UMASK_TXR_OCCUPANCY_AD_BNC          0x02
+UMASK_TXR_OCCUPANCY_BL_CRD          0x04
+UMASK_TXR_OCCUPANCY_BL_BNC          0x08
+UMASK_TXR_OCCUPANCY_AK              0x10
+UMASK_TXR_OCCUPANCY_IV              0x20
+
+EVENT_TXR_INSERTS                   0x02 SBOX
+UMASK_TXR_INSERTS_AD_CRD            0x01
+UMASK_TXR_INSERTS_AD_BNC            0x02
+UMASK_TXR_INSERTS_BL_CRD            0x04
+UMASK_TXR_INSERTS_BL_BNC            0x08
+UMASK_TXR_INSERTS_AK                0x10
+UMASK_TXR_INSERTS_IV                0x20
+
+EVENT_TXR_ADS_USED                  0x04 SBOX
+UMASK_TXR_ADS_USED_AD               0x01
+UMASK_TXR_ADS_USED_AK               0x02
+UMASK_TXR_ADS_USED_BL               0x04
+
+EVENT_RING_BOUNCES                  0x05 SBOX
+UMASK_RING_BOUNCES_AD_CACHE         0x01
+UMASK_RING_BOUNCES_AK_CORE          0x02
+UMASK_RING_BOUNCES_BL_CORE          0x04
+UMASK_RING_BOUNCES_IV_CORE          0x08
+
+EVENT_FAST_ASSERTED                 0x09 SBOX
+UMASK_FAST_ASSERTED                 0x00
+
+EVENT_BOUNCE_CONTROL                0x0A SBOX
+UMASK_BOUNCE_CONTROL                0x00
+
+EVENT_RXR_OCCUPANCY                 0x11 SBOX
+UMASK_RXR_OCCUPANCY_AD_CRD          0x01
+UMASK_RXR_OCCUPANCY_AD_BNC          0x02
+UMASK_RXR_OCCUPANCY_BL_CRD          0x04
+UMASK_RXR_OCCUPANCY_BL_BNC          0x08
+UMASK_RXR_OCCUPANCY_AK              0x10
+UMASK_RXR_OCCUPANCY_IV              0x20
+
+EVENT_RXR_BYPASS                    0x12 SBOX
+UMASK_RXR_BYPASS_AD_CRD             0x01
+UMASK_RXR_BYPASS_AD_BNC             0x02
+UMASK_RXR_BYPASS_BL_CRD             0x04
+UMASK_RXR_BYPASS_BL_BNC             0x08
+UMASK_RXR_BYPASS_AK                 0x10
+UMASK_RXR_BYPASS_IV                 0x20
+
+EVENT_RXR_INSERTS                   0x13 SBOX
+UMASK_RXR_INSERTS_AD_CRD            0x01
+UMASK_RXR_INSERTS_AD_BNC            0x02
+UMASK_RXR_INSERTS_BL_CRD            0x04
+UMASK_RXR_INSERTS_BL_BNC            0x08
+UMASK_RXR_INSERTS_AK                0x10
+UMASK_RXR_INSERTS_IV                0x20
+
+EVENT_RING_AD_USED                  0x1B SBOX
+UMASK_RING_AD_USED_ANY              0x0F
+UMASK_RING_AD_USED_UP_EVEN          0x01
+UMASK_RING_AD_USED_UP_ODD           0x02
+UMASK_RING_AD_USED_UP               0x03
+UMASK_RING_AD_USED_DOWN_EVEN        0x04
+UMASK_RING_AD_USED_DOWN_ODD         0x08
+UMASK_RING_AD_USED_DOWN             0x0C
+
+EVENT_RING_AK_USED                  0x1C SBOX
+UMASK_RING_AK_USED_ANY              0x0F
+UMASK_RING_AK_USED_UP_EVEN          0x01
+UMASK_RING_AK_USED_UP_ODD           0x02
+UMASK_RING_AK_USED_UP               0x03
+UMASK_RING_AK_USED_DOWN_EVEN        0x04
+UMASK_RING_AK_USED_DOWN_ODD         0x08
+UMASK_RING_AK_USED_DOWN             0x0C
+
+EVENT_RING_BL_USED                  0x1D SBOX
+UMASK_RING_BL_USED_ANY              0x0F
+UMASK_RING_BL_USED_UP_EVEN          0x01
+UMASK_RING_BL_USED_UP_ODD           0x02
+UMASK_RING_BL_USED_UP               0x03
+UMASK_RING_BL_USED_DOWN_EVEN        0x04
+UMASK_RING_BL_USED_DOWN_ODD         0x08
+UMASK_RING_BL_USED_DOWN             0x0C
+
+EVENT_RING_IV_USED                  0x1E SBOX
+UMASK_RING_IV_USED_ANY              0x0F
+UMASK_RING_IV_USED_UP               0x03
+UMASK_RING_IV_USED_DOWN             0x0C
+
+EVENT_WBOX_CLOCKTICKS               0x00 WBOX
+UMASK_WBOX_CLOCKTICKS               0x00
+
+EVENT_CORE0_TRANSITION_CYCLES       0x60 WBOX
+UMASK_CORE0_TRANSITION_CYCLES       0x00
+
+EVENT_CORE1_TRANSITION_CYCLES       0x61 WBOX
+UMASK_CORE1_TRANSITION_CYCLES       0x00
+
+EVENT_CORE2_TRANSITION_CYCLES       0x62 WBOX
+UMASK_CORE2_TRANSITION_CYCLES       0x00
+
+EVENT_CORE3_TRANSITION_CYCLES       0x63 WBOX
+UMASK_CORE3_TRANSITION_CYCLES       0x00
+
+EVENT_CORE4_TRANSITION_CYCLES       0x64 WBOX
+UMASK_CORE4_TRANSITION_CYCLES       0x00
+
+EVENT_CORE5_TRANSITION_CYCLES       0x65 WBOX
+UMASK_CORE5_TRANSITION_CYCLES       0x00
+
+EVENT_CORE6_TRANSITION_CYCLES       0x66 WBOX
+UMASK_CORE6_TRANSITION_CYCLES       0x00
+
+EVENT_CORE7_TRANSITION_CYCLES       0x67 WBOX
+UMASK_CORE7_TRANSITION_CYCLES       0x00
+
+EVENT_CORE8_TRANSITION_CYCLES       0x68 WBOX
+UMASK_CORE8_TRANSITION_CYCLES       0x00
+
+EVENT_CORE9_TRANSITION_CYCLES       0x69 WBOX
+UMASK_CORE9_TRANSITION_CYCLES       0x00
+
+EVENT_CORE10_TRANSITION_CYCLES       0x6A WBOX
+UMASK_CORE10_TRANSITION_CYCLES       0x00
+
+EVENT_CORE11_TRANSITION_CYCLES       0x6B WBOX
+UMASK_CORE11_TRANSITION_CYCLES       0x00
+
+EVENT_CORE12_TRANSITION_CYCLES       0x6C WBOX
+UMASK_CORE12_TRANSITION_CYCLES       0x00
+
+EVENT_CORE13_TRANSITION_CYCLES       0x6D WBOX
+UMASK_CORE13_TRANSITION_CYCLES       0x00
+
+EVENT_CORE14_TRANSITION_CYCLES       0x6E WBOX
+UMASK_CORE14_TRANSITION_CYCLES       0x00
+
+EVENT_CORE15_TRANSITION_CYCLES       0x6F WBOX
+UMASK_CORE15_TRANSITION_CYCLES       0x00
+
+EVENT_CORE16_TRANSITION_CYCLES       0x70 WBOX
+UMASK_CORE16_TRANSITION_CYCLES       0x00
+
+EVENT_CORE17_TRANSITION_CYCLES       0x71 WBOX
+UMASK_CORE17_TRANSITION_CYCLES       0x00
+
+EVENT_DEMOTIONS_CORE0                0x30 WBOX
+UMASK_DEMOTIONS_CORE0                0x00
+
+EVENT_DEMOTIONS_CORE1                0x31 WBOX
+UMASK_DEMOTIONS_CORE1                0x00
+
+EVENT_DEMOTIONS_CORE2                0x32 WBOX
+UMASK_DEMOTIONS_CORE2                0x00
+
+EVENT_DEMOTIONS_CORE3                0x33 WBOX
+UMASK_DEMOTIONS_CORE3                0x00
+
+EVENT_DEMOTIONS_CORE4                0x34 WBOX
+UMASK_DEMOTIONS_CORE4                0x00
+
+EVENT_DEMOTIONS_CORE5                0x35 WBOX
+UMASK_DEMOTIONS_CORE5                0x00
+
+EVENT_DEMOTIONS_CORE6                0x36 WBOX
+UMASK_DEMOTIONS_CORE6                0x00
+
+EVENT_DEMOTIONS_CORE7                0x37 WBOX
+UMASK_DEMOTIONS_CORE7                0x00
+
+EVENT_DEMOTIONS_CORE8                0x38 WBOX
+UMASK_DEMOTIONS_CORE8                0x00
+
+EVENT_DEMOTIONS_CORE9                0x39 WBOX
+UMASK_DEMOTIONS_CORE9                0x00
+
+EVENT_DEMOTIONS_CORE10                0x3A WBOX
+UMASK_DEMOTIONS_CORE10                0x00
+
+EVENT_DEMOTIONS_CORE11                0x3B WBOX
+UMASK_DEMOTIONS_CORE11                0x00
+
+EVENT_DEMOTIONS_CORE12                0x3C WBOX
+UMASK_DEMOTIONS_CORE12                0x00
+
+EVENT_DEMOTIONS_CORE13                0x3D WBOX
+UMASK_DEMOTIONS_CORE13                0x00
+
+EVENT_DEMOTIONS_CORE14                0x3E WBOX
+UMASK_DEMOTIONS_CORE14                0x00
+
+EVENT_DEMOTIONS_CORE15                0x3F WBOX
+UMASK_DEMOTIONS_CORE15                0x00
+
+EVENT_DEMOTIONS_CORE16                0x40 WBOX
+UMASK_DEMOTIONS_CORE16                0x00
+
+EVENT_DEMOTIONS_CORE17                0x41 WBOX
+UMASK_DEMOTIONS_CORE17                0x00
+
+EVENT_FREQ_BAND0_CYCLES                 0x0B WBOX
+OPTIONS_FREQ_BAND0_CYCLES               EVENT_OPTION_OCCUPANCY_FILTER_MASK
+UMASK_FREQ_BAND0_CYCLES                 0x00
+
+EVENT_FREQ_BAND1_CYCLES                 0x0C WBOX
+OPTIONS_FREQ_BAND1_CYCLES               EVENT_OPTION_OCCUPANCY_FILTER_MASK
+UMASK_FREQ_BAND1_CYCLES                 0x00
+
+EVENT_FREQ_BAND2_CYCLES                 0x0D WBOX
+OPTIONS_FREQ_BAND2_CYCLES               EVENT_OPTION_OCCUPANCY_FILTER_MASK
+UMASK_FREQ_BAND2_CYCLES                 0x00
+
+EVENT_FREQ_BAND3_CYCLES                 0x0E WBOX
+OPTIONS_FREQ_BAND3_CYCLES               EVENT_OPTION_OCCUPANCY_FILTER_MASK
+UMASK_FREQ_BAND3_CYCLES                 0x00
+
+EVENT_FREQ_MAX_LIMIT_THERMAL_CYCLES     0x04 WBOX
+UMASK_FREQ_MAX_LIMIT_THERMAL_CYCLES     0x00
+
+EVENT_FREQ_MAX_OS_CYCLES                0x06 WBOX
+UMASK_FREQ_MAX_OS_CYCLES                0x00
+
+EVENT_FREQ_MAX_POWER_CYCLES             0x05 WBOX
+UMASK_FREQ_MAX_POWER_CYCLES             0x00
+
+EVENT_FREQ_MIN_IO_P_CYCLES              0x73 WBOX
+UMASK_FREQ_MIN_IO_P_CYCLES              0x00
+
+EVENT_FREQ_TRANS_CYCLES                 0x74 WBOX
+UMASK_FREQ_TRANS_CYCLES                 0x00
+
+EVENT_MEMORY_PHASE_SHEDDING_CYCLES      0x2F WBOX
+UMASK_MEMORY_PHASE_SHEDDING_CYCLES      0x00
+
+EVENT_PKG_RESIDENCY_C0_CYCLES           0x2A WBOX
+UMASK_PKG_RESIDENCY_C0_CYCLES           0x00
+
+EVENT_PKG_RESIDENCY_C1E_CYCLES          0x4E WBOX
+UMASK_PKG_RESIDENCY_C1E_CYCLES          0x00
+
+EVENT_PKG_RESIDENCY_C2E_CYCLES          0x2B WBOX
+UMASK_PKG_RESIDENCY_C2E_CYCLES          0x00
+
+EVENT_PKG_RESIDENCY_C3_CYCLES           0x2C WBOX
+UMASK_PKG_RESIDENCY_C3_CYCLES           0x00
+
+EVENT_PKG_RESIDENCY_C6_CYCLES           0x2D WBOX
+UMASK_PKG_RESIDENCY_C6_CYCLES           0x00
+
+EVENT_PKG_RESIDENCY_C7_CYCLES           0x2E WBOX
+UMASK_PKG_RESIDENCY_C7_CYCLES           0x00
+
+EVENT_POWER_STATE_OCCUPANCY             0x80 WBOX
+UMASK_POWER_STATE_OCCUPANCY_CORES_C0    0x40
+UMASK_POWER_STATE_OCCUPANCY_CORES_C3    0x80
+UMASK_POWER_STATE_OCCUPANCY_CORES_C6    0xC0
+
+EVENT_PROCHOT_EXTERNAL_CYCLES           0x0A WBOX
+UMASK_PROCHOT_EXTERNAL_CYCLES           0x00
+
+EVENT_PROCHOT_INTERNAL_CYCLES           0x09 WBOX
+UMASK_PROCHOT_INTERNAL_CYCLES           0x00
+
+EVENT_TOTAL_TRANSITION_CYCLES           0x72 WBOX
+UMASK_TOTAL_TRANSITION_CYCLES           0x00
+
+EVENT_UFS_TRANSITIONS_RING_GV           0x79 WBOX
+UMASK_UFS_TRANSITIONS_RING_GV           0x00
+
+EVENT_VR_HOT_CYCLES                     0x42 WBOX
+UMASK_VR_HOT_CYCLES                     0x00
+
+EVENT_CORE_CORE_C6_RESIDENCY             0x00 WBOX0FIX
+UMASK_CORE_CORE_C6_RESIDENCY             0x00
+
+EVENT_CORE_CORE_C3_RESIDENCY             0x00 WBOX1FIX
+UMASK_CORE_CORE_C3_RESIDENCY             0x00
+
+EVENT_CORE_PKG_C2_RESIDENCY              0x00 WBOX2FIX
+UMASK_CORE_PKG_C2_RESIDENCY              0x00
+
+EVENT_CORE_PKG_C3_RESIDENCY              0x00 WBOX3FIX
+UMASK_CORE_PKG_C3_RESIDENCY              0x00
+
+EVENT_BBOX_CLOCKTICKS                   0x00 BBOX
+UMASK_BBOX_CLOCKTICKS                   0x00
+
+EVENT_ADDR_OPC_MATCH                    0x20 BBOX
+OPTIONS_ADDR_OPC_MATCH_ADDR             EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_ADDR_OPC_MATCH_ADDR               0x01
+OPTIONS_ADDR_OPC_MATCH_OPC              EVENT_OPTION_OPCODE_MASK
+UMASK_ADDR_OPC_MATCH_OPC                0x02
+OPTIONS_ADDR_OPC_MATCH_FILT             EVENT_OPTION_OPCODE_MASK|EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_ADDR_OPC_MATCH_FILT               0x03
+OPTIONS_ADDR_OPC_MATCH_AD               EVENT_OPTION_OPCODE_MASK
+UMASK_ADDR_OPC_MATCH_AD                 0x04
+OPTIONS_ADDR_OPC_MATCH_BL               EVENT_OPTION_OPCODE_MASK
+UMASK_ADDR_OPC_MATCH_BL                 0x08
+OPTIONS_ADDR_OPC_MATCH_AK               EVENT_OPTION_OPCODE_MASK
+UMASK_ADDR_OPC_MATCH_AK                 0x10
+
+EVENT_BT_CYCLES_NE                      0x42 BBOX
+UMASK_BT_CYCLES_NE                      0x00
+
+EVENT_BT_OCCUPANCY                      0x43 BBOX
+UMASK_BT_OCCUPANCY                      0x00
+
+EVENT_BYPASS_IMC                        0x14 BBOX
+UMASK_BYPASS_IMC_TAKEN                  0x01
+UMASK_BYPASS_IMC_NOT_TAKEN              0x02
+
+EVENT_CONFLICT_CYCLES                   0x0B BBOX0C1|BBOX1C1
+UMASK_CONFLICT_CYCLES                   0x00
+
+EVENT_DIRECT2CORE_COUNT                 0x11 BBOX
+UMASK_DIRECT2CORE_COUNT                 0x00
+
+EVENT_DIRECT2CORE_CYCLES_DISABLED       0x12 BBOX
+UMASK_DIRECT2CORE_CYCLES_DISABLED       0x00
+
+EVENT_DIRECT2CORE_TXN_OVERRIDE          0x13 BBOX
+UMASK_DIRECT2CORE_TXN_OVERRIDE          0x00
+
+EVENT_DIRECTORY_LAT_OPT                 0x41 BBOX
+UMASK_DIRECTORY_LAT_OPT                 0x00
+
+EVENT_DIRECTORY_LOOKUP                  0x0C BBOX
+UMASK_DIRECTORY_LOOKUP_SNP              0x01
+UMASK_DIRECTORY_LOOKUP_NO_SNP           0x02
+
+EVENT_DIRECTORY_UPDATE                  0x0D BBOX
+UMASK_DIRECTORY_UPDATE_SET              0x01
+UMASK_DIRECTORY_UPDATE_CLEAR            0x02
+UMASK_DIRECTORY_UPDATE_ANY              0x03
+
+EVENT_HITME_LOOKUP                      0x70 BBOX
+UMASK_HITME_LOOKUP_READ_OR_INVITOE         0x01
+UMASK_HITME_LOOKUP_WBMTOI                  0x02
+UMASK_HITME_LOOKUP_ACKCNFLTWBI             0x04
+UMASK_HITME_LOOKUP_WBMTOE_OR_S             0x08
+UMASK_HITME_LOOKUP_HOM                     0x0F
+UMASK_HITME_LOOKUP_RSPFWDI_REMOTE          0x10
+UMASK_HITME_LOOKUP_RSPFWDI_LOCAL           0x20
+UMASK_HITME_LOOKUP_INVALS                  0x26
+UMASK_HITME_LOOKUP_RSPFWDS                 0x40
+UMASK_HITME_LOOKUP_EVICTS                  0x42
+UMASK_HITME_LOOKUP_ALLOCS                  0x70
+UMASK_HITME_LOOKUP_RSP                     0x80
+UMASK_HITME_LOOKUP_ALL                     0xFF
+
+EVENT_HITME_HIT                         0x71 BBOX
+UMASK_HITME_HIT_READ_OR_INVITOE         0x01
+UMASK_HITME_HIT_WBMTOI                  0x02
+UMASK_HITME_HIT_ACKCNFLTWBI             0x04
+UMASK_HITME_HIT_WBMTOE_OR_S             0x08
+UMASK_HITME_HIT_HOM                     0x0F
+UMASK_HITME_HIT_RSPFWDI_REMOTE          0x10
+UMASK_HITME_HIT_RSPFWDI_LOCAL           0x20
+UMASK_HITME_HIT_INVALS                  0x26
+UMASK_HITME_HIT_RSPFWDS                 0x40
+UMASK_HITME_HIT_EVICTS                  0x42
+UMASK_HITME_HIT_ALLOCS                  0x70
+UMASK_HITME_HIT_RSP                     0x80
+UMASK_HITME_HIT_ALL                     0xFF
+
+EVENT_HITME_HIT_PV_BITS_SET             0x72 BBOX
+UMASK_HITME_HIT_PV_BITS_SET_READ_OR_INVITOE 0x01
+UMASK_HITME_HIT_PV_BITS_SET_WBMTOI          0x02
+UMASK_HITME_HIT_PV_BITS_SET_ACKCNFLTWBI     0x04
+UMASK_HITME_HIT_PV_BITS_SET_WBMTOE_OR_S     0x08
+UMASK_HITME_HIT_PV_BITS_SET_HOM             0x0F
+UMASK_HITME_HIT_PV_BITS_SET_RSPFWDI_REMOTE  0x10
+UMASK_HITME_HIT_PV_BITS_SET_RSPFWDI_LOCAL   0x20
+UMASK_HITME_HIT_PV_BITS_SET_RSPFWDS         0x40
+UMASK_HITME_HIT_PV_BITS_SET_RSP             0x80
+UMASK_HITME_HIT_PV_BITS_SET_ALL             0xFF
+
+EVENT_IGR_NO_CREDIT_CYCLES              0x22 BBOX
+UMASK_IGR_NO_CREDIT_CYCLES_AD_QPI0      0x01
+UMASK_IGR_NO_CREDIT_CYCLES_AD_QPI1      0x02
+UMASK_IGR_NO_CREDIT_CYCLES_BL_QPI0      0x04
+UMASK_IGR_NO_CREDIT_CYCLES_BL_QPI1      0x08
+UMASK_IGR_NO_CREDIT_CYCLES_AD_QPI2      0x10
+UMASK_IGR_NO_CREDIT_CYCLES_BL_QPI2      0x20
+
+EVENT_IMC_READS                         0x17 BBOX
+UMASK_IMC_READS_NORMAL                  0x01
+
+EVENT_IMC_RETRY                         0x1E BBOX
+UMASK_IMC_RETRY                         0x00
+
+EVENT_IMC_WRITES                        0x1A BBOX
+UMASK_IMC_WRITES_FULL                   0x01
+UMASK_IMC_WRITES_PARTIAL                0x02
+UMASK_IMC_WRITES_FULL_ISOCH             0x04
+UMASK_IMC_WRITES_PARTIAL_ISOCH          0x08
+UMASK_IMC_WRITES_ALL                    0x0F
+
+EVENT_OSB                               0x53 BBOX
+UMASK_OSB_READS_LOCAL                   0x02
+UMASK_OSB_INVITOE_LOCAL                 0x04
+UMASK_OSB_REMOTE                        0x08
+UMASK_OSB_CANCELLED                     0x10
+UMASK_OSB_READS_LOCAL_USEFUL            0x20
+UMASK_OSB_REMOTE_USEFUL                 0x40
+
+EVENT_OSB_EDR                           0x54 BBOX
+UMASK_OSB_EDR_ALL                       0x01
+UMASK_OSB_EDR_READS_LOCAL_I             0x02
+UMASK_OSB_EDR_READS_REMOTE_I            0x04
+UMASK_OSB_EDR_READS_LOCAL_S             0x08
+UMASK_OSB_EDR_READS_REMOTE_S            0x10
+
+EVENT_REQUESTS                          0x01 BBOX
+UMASK_REQUESTS_READS_LOCAL              0x01
+UMASK_REQUESTS_READS_REMOTE             0x02
+UMASK_REQUESTS_READS                    0x03
+UMASK_REQUESTS_WRITES_LOCAL             0x04
+UMASK_REQUESTS_WRITES_REMOTE            0x08
+UMASK_REQUESTS_WRITES                   0x0C
+UMASK_REQUESTS_INVITOE_LOCAL            0x10
+UMASK_REQUESTS_INVITOE_REMOTE           0x20
+
+EVENT_RING_AD_USED                      0x3E BBOX
+UMASK_RING_AD_USED_CW_EVEN              0x01
+UMASK_RING_AD_USED_CW_ODD               0x02
+UMASK_RING_AD_USED_CW                   0x03
+UMASK_RING_AD_USED_CCW_EVEN             0x04
+UMASK_RING_AD_USED_CCW_ODD              0x08
+UMASK_RING_AD_USED_CCW                  0x0C
+
+EVENT_RING_AK_USED                      0x3F BBOX
+UMASK_RING_AK_USED_CW_EVEN              0x01
+UMASK_RING_AK_USED_CW_ODD               0x02
+UMASK_RING_AK_USED_CW                   0x03
+UMASK_RING_AK_USED_CCW_EVEN             0x04
+UMASK_RING_AK_USED_CCW_ODD              0x08
+UMASK_RING_AK_USED_CCW                  0x0C
+
+EVENT_RING_BL_USED                      0x40 BBOX
+UMASK_RING_BL_USED_CW_EVEN              0x01
+UMASK_RING_BL_USED_CW_ODD               0x02
+UMASK_RING_BL_USED_CW                   0x03
+UMASK_RING_BL_USED_CCW_EVEN             0x04
+UMASK_RING_BL_USED_CCW_ODD              0x08
+UMASK_RING_BL_USED_CCW                  0x0C
+
+EVENT_RPQ_CYCLES_NO_REG_CREDITS         0x15 BBOX
+UMASK_RPQ_CYCLES_NO_REG_CREDITS_CHN0    0x01
+UMASK_RPQ_CYCLES_NO_REG_CREDITS_CHN1    0x02
+UMASK_RPQ_CYCLES_NO_REG_CREDITS_CHN2    0x04
+UMASK_RPQ_CYCLES_NO_REG_CREDITS_CHN3    0x08
+
+EVENT_WPQ_CYCLES_NO_REG_CREDITS         0x18 BBOX
+UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN0    0x01
+UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN1    0x02
+UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN2    0x04
+UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN3    0x08
+
+EVENT_SBO0_CREDITS_ACQUIRED             0x68 BBOX
+UMASK_SBO0_CREDITS_ACQUIRED_AD          0x01
+UMASK_SBO0_CREDITS_ACQUIRED_BL          0x02
+
+EVENT_SBO1_CREDITS_ACQUIRED             0x69 BBOX
+UMASK_SBO1_CREDITS_ACQUIRED_AD          0x01
+UMASK_SBO1_CREDITS_ACQUIRED_BL          0x02
+
+EVENT_SBO0_CREDITS_OCCUPANCY            0x6A BBOX
+UMASK_SBO0_CREDITS_OCCUPANCY_AD         0x01
+UMASK_SBO0_CREDITS_OCCUPANCY_BL         0x02
+
+EVENT_SBO1_CREDITS_OCCUPANCY            0x6B BBOX
+UMASK_SBO1_CREDITS_OCCUPANCY_AD         0x01
+UMASK_SBO1_CREDITS_OCCUPANCY_BL         0x02
+
+EVENT_SNOOPS_RSP_AFTER_DATA             0x0A BBOX
+UMASK_SNOOPS_RSP_AFTER_DATA_LOCAL       0x01
+UMASK_SNOOPS_RSP_AFTER_DATA_REMOTE      0x02
+
+EVENT_SNOOP_CYCLES_NE                   0x08 BBOX
+UMASK_SNOOP_CYCLES_NE_LOCAL             0x01
+UMASK_SNOOP_CYCLES_NE_REMOTE            0x02
+UMASK_SNOOP_CYCLES_NE_ALL               0x03
+
+EVENT_SNOOP_OCCUPANCY                   0x09 BBOX
+UMASK_SNOOP_OCCUPANCY_LOCAL             0x01
+UMASK_SNOOP_OCCUPANCY_REMOTE            0x02
+
+EVENT_SNOOP_RESP                        0x21 BBOX
+UMASK_SNOOP_RESP_RSPI                   0x01
+UMASK_SNOOP_RESP_RSPS                   0x02
+UMASK_SNOOP_RESP_RSPIFWD                0x04
+UMASK_SNOOP_RESP_RSPSFWD                0x08
+UMASK_SNOOP_RESP_RSP_WB                 0x10
+UMASK_SNOOP_RESP_RSP_FWD_WB             0x20
+UMASK_SNOOP_RESP_RSPCNFLCT              0x40
+
+EVENT_SNP_RESP_RECV_LOCAL               0x60 BBOX
+UMASK_SNP_RESP_RECV_LOCAL_RSPI          0x01
+UMASK_SNP_RESP_RECV_LOCAL_RSPS          0x02
+UMASK_SNP_RESP_RECV_LOCAL_RSPIFWD       0x04
+UMASK_SNP_RESP_RECV_LOCAL_RSPSFWD       0x08
+UMASK_SNP_RESP_RECV_LOCAL_RSPXWB        0x10
+UMASK_SNP_RESP_RECV_LOCAL_RSPXFWDXWB    0x20
+UMASK_SNP_RESP_RECV_LOCAL_RSPCNFLCT     0x40
+UMASK_SNP_RESP_RECV_LOCAL_OTHER         0x80
+
+EVENT_STALL_NO_SBO_CREDIT               0x6C BBOX
+UMASK_STALL_NO_SBO_CREDIT_SBO0_AD       0x01
+UMASK_STALL_NO_SBO_CREDIT_SBO1_AD       0x02
+UMASK_STALL_NO_SBO_CREDIT_SBO0_BL       0x04
+UMASK_STALL_NO_SBO_CREDIT_SBO1_BL       0x08
+
+EVENT_TAD_REQUESTS_G0                   0x1B BBOX
+UMASK_TAD_REQUESTS_G0_REGION0           0x01
+UMASK_TAD_REQUESTS_G0_REGION1           0x02
+UMASK_TAD_REQUESTS_G0_REGION2           0x04
+UMASK_TAD_REQUESTS_G0_REGION3           0x08
+UMASK_TAD_REQUESTS_G0_REGION4           0x10
+UMASK_TAD_REQUESTS_G0_REGION5           0x20
+UMASK_TAD_REQUESTS_G0_REGION6           0x40
+UMASK_TAD_REQUESTS_G0_REGION7           0x80
+
+EVENT_TAD_REQUESTS_G1                   0x1C BBOX
+UMASK_TAD_REQUESTS_G1_REGION8           0x01
+UMASK_TAD_REQUESTS_G1_REGION9           0x02
+UMASK_TAD_REQUESTS_G1_REGION10          0x04
+UMASK_TAD_REQUESTS_G1_REGION11          0x08
+
+EVENT_TRACKER_CYCLES_FULL               0x02 BBOX
+UMASK_TRACKER_CYCLES_FULL_GP            0x01
+UMASK_TRACKER_CYCLES_FULL_ALL           0x02
+
+EVENT_TRACKER_CYCLES_NE                 0x03 BBOX
+UMASK_TRACKER_CYCLES_NE_LOCAL           0x01
+UMASK_TRACKER_CYCLES_NE_REMOTE          0x02
+UMASK_TRACKER_CYCLES_NE_ALL             0x03
+
+EVENT_TRACKER_OCCUPANCY                 0x04 BBOX
+UMASK_TRACKER_OCCUPANCY_READS_LOCAL     0x04
+UMASK_TRACKER_OCCUPANCY_READS_REMOTE    0x08
+UMASK_TRACKER_OCCUPANCY_WRITES_LOCAL    0x10
+UMASK_TRACKER_OCCUPANCY_WRITES_REMOTE   0x20
+UMASK_TRACKER_OCCUPANCY_INVITOE_LOCAL   0x40
+UMASK_TRACKER_OCCUPANCY_INVITOE_REMOTE  0x80
+
+EVENT_TRACKER_PENDING_OCCUPANCY         0x05 BBOX
+UMASK_TRACKER_PENDING_OCCUPANCY_LOCAL   0x01
+UMASK_TRACKER_PENDING_OCCUPANCY_REMOTE  0x02
+
+EVENT_TXR_AD_CYCLES_FULL                0x2A BBOX
+UMASK_TXR_AD_CYCLES_FULL_SCHED0         0x01
+UMASK_TXR_AD_CYCLES_FULL_SCHED1         0x02
+UMASK_TXR_AD_CYCLES_FULL_ALL            0x03
+
+EVENT_TXR_AK                            0x0E BBOX
+UMASK_TXR_AK                            0x00
+
+EVENT_TXR_AK_CYCLES_FULL                0x32 BBOX
+UMASK_TXR_AK_CYCLES_FULL_SCHED0         0x01
+UMASK_TXR_AK_CYCLES_FULL_SCHED1         0x02
+UMASK_TXR_AK_CYCLES_FULL_ALL            0x03
+
+EVENT_TXR_BL                            0x10 BBOX
+UMASK_TXR_BL_DRS_CACHE                  0x01
+UMASK_TXR_BL_DRS_CORE                   0x02
+UMASK_TXR_BL_DRS_QPI                    0x04
+
+EVENT_TXR_BL_CYCLES_FULL                0x36 BBOX
+UMASK_TXR_BL_CYCLES_FULL_SCHED0         0x01
+UMASK_TXR_BL_CYCLES_FULL_SCHED1         0x02
+UMASK_TXR_BL_CYCLES_FULL_ALL            0x03
+
+EVENT_TXR_BL_OCCUPANCY                  0x34 BBOX
+UMASK_TXR_BL_OCCUPANCY                  0x00
+
+EVENT_TXR_STARVED                       0x6D BBOX
+UMASK_TXR_STARVED_AK                    0x01
+UMASK_TXR_STARVED_BL                    0x02
+
+EVENT_DRAM_CLOCKTICKS                   0x00 MBOX
+UMASK_DRAM_CLOCKTICKS                   0x00
+
+EVENT_ACT_COUNT                         0x01 MBOX
+UMASK_ACT_COUNT_RD                      0x01
+UMASK_ACT_COUNT_WR                      0x02
+UMASK_ACT_COUNT_BYP                     0x08
+
+EVENT_BYP_CMDS                          0xA1 MBOX
+UMASK_BYP_CMDS_ACT                      0x01
+UMASK_BYP_CMDS_CAS                      0x02
+UMASK_BYP_CMDS_PRE                      0x04
+
+EVENT_CAS_COUNT                         0x04 MBOX
+UMASK_CAS_COUNT_RD_REG                  0x01
+UMASK_CAS_COUNT_RD_UNDERFILL            0x02
+UMASK_CAS_COUNT_RD                      0x03
+UMASK_CAS_COUNT_RD_WMM                  0x10
+UMASK_CAS_COUNT_RD_RMM                  0x20
+UMASK_CAS_COUNT_WR_WMM                  0x04
+UMASK_CAS_COUNT_WR_RMM                  0x08
+UMASK_CAS_COUNT_WR                      0x0C
+UMASK_CAS_COUNT_ALL                     0x0F
+
+EVENT_DRAM_PRE_ALL                      0x06 MBOX
+UMASK_DRAM_PRE_ALL                      0x00
+
+EVENT_DRAM_REFRESH                      0x05 MBOX
+UMASK_DRAM_REFRESH_PANIC                0x02
+UMASK_DRAM_REFRESH_HIGH                 0x04
+
+EVENT_ECC_CORRECTABLE_ERRORS            0x09 MBOX
+UMASK_ECC_CORRECTABLE_ERRORS            0x00
+
+EVENT_MAJOR_MODES                       0x07 MBOX
+UMASK_MAJOR_MODES_READ                  0x01
+UMASK_MAJOR_MODES_WRITE                 0x02
+UMASK_MAJOR_MODES_PARTIAL               0x04
+UMASK_MAJOR_MODES_ISOCH                 0x08
+
+EVENT_POWER_CHANNEL_DLLOFF              0x84 MBOX
+UMASK_POWER_CHANNEL_DLLOFF              0x00
+
+EVENT_POWER_CHANNEL_PPD                 0x85 MBOX
+UMASK_POWER_CHANNEL_PPD                 0x00
+
+EVENT_POWER_CKE_CYCLES                  0x83 MBOX
+UMASK_POWER_CKE_CYCLES_RANK0            0x01
+UMASK_POWER_CKE_CYCLES_RANK1            0x02
+UMASK_POWER_CKE_CYCLES_RANK2            0x04
+UMASK_POWER_CKE_CYCLES_RANK3            0x08
+UMASK_POWER_CKE_CYCLES_RANK4            0x10
+UMASK_POWER_CKE_CYCLES_RANK5            0x20
+UMASK_POWER_CKE_CYCLES_RANK6            0x40
+UMASK_POWER_CKE_CYCLES_RANK7            0x80
+
+EVENT_POWER_CRITICAL_THROTTLE_CYCLES    0x86 MBOX
+UMASK_POWER_CRITICAL_THROTTLE_CYCLES    0x00
+
+EVENT_POWER_PCU_THROTTLING              0x42 MBOX
+UMASK_POWER_PCU_THROTTLING              0x00
+
+EVENT_POWER_SELF_REFRESH                0x43 MBOX
+UMASK_POWER_SELF_REFRESH                0x00
+
+EVENT_POWER_THROTTLE_CYCLES             0x41 MBOX
+UMASK_POWER_THROTTLE_CYCLES_RANK0       0x01
+UMASK_POWER_THROTTLE_CYCLES_RANK1       0x02
+UMASK_POWER_THROTTLE_CYCLES_RANK2       0x04
+UMASK_POWER_THROTTLE_CYCLES_RANK3       0x08
+UMASK_POWER_THROTTLE_CYCLES_RANK4       0x10
+UMASK_POWER_THROTTLE_CYCLES_RANK5       0x20
+UMASK_POWER_THROTTLE_CYCLES_RANK6       0x40
+UMASK_POWER_THROTTLE_CYCLES_RANK7       0x80
+
+EVENT_PREEMPTION                        0x08 MBOX
+UMASK_PREEMPTION_RD_PREEMPT_RD          0x01
+UMASK_PREEMPTION_RD_PREEMPT_WR          0x02
+
+EVENT_PRE_COUNT                         0x02 MBOX
+UMASK_PRE_COUNT_PAGE_MISS               0x01
+UMASK_PRE_COUNT_PAGE_CLOSE              0x02
+UMASK_PRE_COUNT_RD                      0x04
+UMASK_PRE_COUNT_WR                      0x08
+UMASK_PRE_COUNT_BYP                     0x10
+
+EVENT_RD_CAS_PRIO                       0xA0 MBOX
+UMASK_RD_CAS_PRIO_LOW                   0x01
+UMASK_RD_CAS_PRIO_MED                   0x02
+UMASK_RD_CAS_PRIO_HIGH                  0x04
+UMASK_RD_CAS_PRIO_PANIC                 0x08
+
+EVENT_RD_CAS_RANK0                      0xB0 MBOX
+UMASK_RD_CAS_RANK0_BANK0                0x00
+UMASK_RD_CAS_RANK0_BANK1                0x01
+UMASK_RD_CAS_RANK0_BANK2                0x02
+UMASK_RD_CAS_RANK0_BANK3                0x03
+UMASK_RD_CAS_RANK0_BANK4                0x04
+UMASK_RD_CAS_RANK0_BANK5                0x05
+UMASK_RD_CAS_RANK0_BANK6                0x06
+UMASK_RD_CAS_RANK0_BANK7                0x07
+UMASK_RD_CAS_RANK0_BANK8                0x08
+UMASK_RD_CAS_RANK0_BANK9                0x09
+UMASK_RD_CAS_RANK0_BANK10               0x0A
+UMASK_RD_CAS_RANK0_BANK11               0x0B
+UMASK_RD_CAS_RANK0_BANK12               0x0C
+UMASK_RD_CAS_RANK0_BANK13               0x0D
+UMASK_RD_CAS_RANK0_BANK14               0x0E
+UMASK_RD_CAS_RANK0_BANK15               0x0F
+UMASK_RD_CAS_RANK0_ALLBANKS             0x10
+UMASK_RD_CAS_RANK0_BANKG0               0x11
+UMASK_RD_CAS_RANK0_BANKG1               0x12
+UMASK_RD_CAS_RANK0_BANKG2               0x13
+UMASK_RD_CAS_RANK0_BANKG3               0x14
+
+EVENT_RD_CAS_RANK1                      0xB1 MBOX
+UMASK_RD_CAS_RANK1_BANK0                0x00
+UMASK_RD_CAS_RANK1_BANK1                0x01
+UMASK_RD_CAS_RANK1_BANK2                0x02
+UMASK_RD_CAS_RANK1_BANK3                0x03
+UMASK_RD_CAS_RANK1_BANK4                0x04
+UMASK_RD_CAS_RANK1_BANK5                0x05
+UMASK_RD_CAS_RANK1_BANK6                0x06
+UMASK_RD_CAS_RANK1_BANK7                0x07
+UMASK_RD_CAS_RANK1_BANK8                0x08
+UMASK_RD_CAS_RANK1_BANK9                0x09
+UMASK_RD_CAS_RANK1_BANK10               0x0A
+UMASK_RD_CAS_RANK1_BANK11               0x0B
+UMASK_RD_CAS_RANK1_BANK12               0x0C
+UMASK_RD_CAS_RANK1_BANK13               0x0D
+UMASK_RD_CAS_RANK1_BANK14               0x0E
+UMASK_RD_CAS_RANK1_BANK15               0x0F
+UMASK_RD_CAS_RANK1_ALLBANKS             0x10
+UMASK_RD_CAS_RANK1_BANKG0               0x11
+UMASK_RD_CAS_RANK1_BANKG1               0x12
+UMASK_RD_CAS_RANK1_BANKG2               0x13
+UMASK_RD_CAS_RANK1_BANKG3               0x14
+
+EVENT_RD_CAS_RANK2                      0xB2 MBOX
+UMASK_RD_CAS_RANK2_BANK0                0x00
+UMASK_RD_CAS_RANK2_BANK1                0x01
+UMASK_RD_CAS_RANK2_BANK2                0x02
+UMASK_RD_CAS_RANK2_BANK3                0x03
+UMASK_RD_CAS_RANK2_BANK4                0x04
+UMASK_RD_CAS_RANK2_BANK5                0x05
+UMASK_RD_CAS_RANK2_BANK6                0x06
+UMASK_RD_CAS_RANK2_BANK7                0x07
+UMASK_RD_CAS_RANK2_BANK8                0x08
+UMASK_RD_CAS_RANK2_BANK9                0x09
+UMASK_RD_CAS_RANK2_BANK10               0x0A
+UMASK_RD_CAS_RANK2_BANK11               0x0B
+UMASK_RD_CAS_RANK2_BANK12               0x0C
+UMASK_RD_CAS_RANK2_BANK13               0x0D
+UMASK_RD_CAS_RANK2_BANK14               0x0E
+UMASK_RD_CAS_RANK2_BANK15               0x0F
+UMASK_RD_CAS_RANK2_ALLBANKS             0x10
+UMASK_RD_CAS_RANK2_BANKG0               0x11
+UMASK_RD_CAS_RANK2_BANKG1               0x12
+UMASK_RD_CAS_RANK2_BANKG2               0x13
+UMASK_RD_CAS_RANK2_BANKG3               0x14
+
+EVENT_RD_CAS_RANK3                      0xB3 MBOX
+UMASK_RD_CAS_RANK3_BANK0                0x00
+UMASK_RD_CAS_RANK3_BANK1                0x01
+UMASK_RD_CAS_RANK3_BANK2                0x02
+UMASK_RD_CAS_RANK3_BANK3                0x03
+UMASK_RD_CAS_RANK3_BANK4                0x04
+UMASK_RD_CAS_RANK3_BANK5                0x05
+UMASK_RD_CAS_RANK3_BANK6                0x06
+UMASK_RD_CAS_RANK3_BANK7                0x07
+UMASK_RD_CAS_RANK3_BANK8                0x08
+UMASK_RD_CAS_RANK3_BANK9                0x09
+UMASK_RD_CAS_RANK3_BANK10               0x0A
+UMASK_RD_CAS_RANK3_BANK11               0x0B
+UMASK_RD_CAS_RANK3_BANK12               0x0C
+UMASK_RD_CAS_RANK3_BANK13               0x0D
+UMASK_RD_CAS_RANK3_BANK14               0x0E
+UMASK_RD_CAS_RANK3_BANK15               0x0F
+UMASK_RD_CAS_RANK3_ALLBANKS             0x10
+UMASK_RD_CAS_RANK3_BANKG0               0x11
+UMASK_RD_CAS_RANK3_BANKG1               0x12
+UMASK_RD_CAS_RANK3_BANKG2               0x13
+UMASK_RD_CAS_RANK3_BANKG3               0x14
+
+EVENT_RD_CAS_RANK4                      0xB4 MBOX
+UMASK_RD_CAS_RANK4_BANK0                0x00
+UMASK_RD_CAS_RANK4_BANK1                0x01
+UMASK_RD_CAS_RANK4_BANK2                0x02
+UMASK_RD_CAS_RANK4_BANK3                0x03
+UMASK_RD_CAS_RANK4_BANK4                0x04
+UMASK_RD_CAS_RANK4_BANK5                0x05
+UMASK_RD_CAS_RANK4_BANK6                0x06
+UMASK_RD_CAS_RANK4_BANK7                0x07
+UMASK_RD_CAS_RANK4_BANK8                0x08
+UMASK_RD_CAS_RANK4_BANK9                0x09
+UMASK_RD_CAS_RANK4_BANK10               0x0A
+UMASK_RD_CAS_RANK4_BANK11               0x0B
+UMASK_RD_CAS_RANK4_BANK12               0x0C
+UMASK_RD_CAS_RANK4_BANK13               0x0D
+UMASK_RD_CAS_RANK4_BANK14               0x0E
+UMASK_RD_CAS_RANK4_BANK15               0x0F
+UMASK_RD_CAS_RANK4_ALLBANKS             0x10
+UMASK_RD_CAS_RANK4_BANKG0               0x11
+UMASK_RD_CAS_RANK4_BANKG1               0x12
+UMASK_RD_CAS_RANK4_BANKG2               0x13
+UMASK_RD_CAS_RANK4_BANKG3               0x14
+
+EVENT_RD_CAS_RANK5                      0xB5 MBOX
+UMASK_RD_CAS_RANK5_BANK0                0x00
+UMASK_RD_CAS_RANK5_BANK1                0x01
+UMASK_RD_CAS_RANK5_BANK2                0x02
+UMASK_RD_CAS_RANK5_BANK3                0x03
+UMASK_RD_CAS_RANK5_BANK4                0x04
+UMASK_RD_CAS_RANK5_BANK5                0x05
+UMASK_RD_CAS_RANK5_BANK6                0x06
+UMASK_RD_CAS_RANK5_BANK7                0x07
+UMASK_RD_CAS_RANK5_BANK8                0x08
+UMASK_RD_CAS_RANK5_BANK9                0x09
+UMASK_RD_CAS_RANK5_BANK10               0x0A
+UMASK_RD_CAS_RANK5_BANK11               0x0B
+UMASK_RD_CAS_RANK5_BANK12               0x0C
+UMASK_RD_CAS_RANK5_BANK13               0x0D
+UMASK_RD_CAS_RANK5_BANK14               0x0E
+UMASK_RD_CAS_RANK5_BANK15               0x0F
+UMASK_RD_CAS_RANK5_ALLBANKS             0x10
+UMASK_RD_CAS_RANK5_BANKG0               0x11
+UMASK_RD_CAS_RANK5_BANKG1               0x12
+UMASK_RD_CAS_RANK5_BANKG2               0x13
+UMASK_RD_CAS_RANK5_BANKG3               0x14
+
+EVENT_RD_CAS_RANK6                      0xB6 MBOX
+UMASK_RD_CAS_RANK6_BANK0                0x00
+UMASK_RD_CAS_RANK6_BANK1                0x01
+UMASK_RD_CAS_RANK6_BANK2                0x02
+UMASK_RD_CAS_RANK6_BANK3                0x03
+UMASK_RD_CAS_RANK6_BANK4                0x04
+UMASK_RD_CAS_RANK6_BANK5                0x05
+UMASK_RD_CAS_RANK6_BANK6                0x06
+UMASK_RD_CAS_RANK6_BANK7                0x07
+UMASK_RD_CAS_RANK6_BANK8                0x08
+UMASK_RD_CAS_RANK6_BANK9                0x09
+UMASK_RD_CAS_RANK6_BANK10               0x0A
+UMASK_RD_CAS_RANK6_BANK11               0x0B
+UMASK_RD_CAS_RANK6_BANK12               0x0C
+UMASK_RD_CAS_RANK6_BANK13               0x0D
+UMASK_RD_CAS_RANK6_BANK14               0x0E
+UMASK_RD_CAS_RANK6_BANK15               0x0F
+UMASK_RD_CAS_RANK6_ALLBANKS             0x10
+UMASK_RD_CAS_RANK6_BANKG0               0x11
+UMASK_RD_CAS_RANK6_BANKG1               0x12
+UMASK_RD_CAS_RANK6_BANKG2               0x13
+UMASK_RD_CAS_RANK6_BANKG3               0x14
+
+EVENT_RD_CAS_RANK7                      0xB7 MBOX
+UMASK_RD_CAS_RANK7_BANK0                0x00
+UMASK_RD_CAS_RANK7_BANK1                0x01
+UMASK_RD_CAS_RANK7_BANK2                0x02
+UMASK_RD_CAS_RANK7_BANK3                0x03
+UMASK_RD_CAS_RANK7_BANK4                0x04
+UMASK_RD_CAS_RANK7_BANK5                0x05
+UMASK_RD_CAS_RANK7_BANK6                0x06
+UMASK_RD_CAS_RANK7_BANK7                0x07
+UMASK_RD_CAS_RANK7_BANK8                0x08
+UMASK_RD_CAS_RANK7_BANK9                0x09
+UMASK_RD_CAS_RANK7_BANK10               0x0A
+UMASK_RD_CAS_RANK7_BANK11               0x0B
+UMASK_RD_CAS_RANK7_BANK12               0x0C
+UMASK_RD_CAS_RANK7_BANK13               0x0D
+UMASK_RD_CAS_RANK7_BANK14               0x0E
+UMASK_RD_CAS_RANK7_BANK15               0x0F
+UMASK_RD_CAS_RANK7_ALLBANKS             0x10
+UMASK_RD_CAS_RANK7_BANKG0               0x11
+UMASK_RD_CAS_RANK7_BANKG1               0x12
+UMASK_RD_CAS_RANK7_BANKG2               0x13
+UMASK_RD_CAS_RANK7_BANKG3               0x14
+
+EVENT_RPQ_CYCLES_NE                     0x11 MBOX
+UMASK_RPQ_CYCLES_NE                     0x00
+
+EVENT_RPQ_INSERTS                       0x10 MBOX
+UMASK_RPQ_INSERTS                       0x00
+
+EVENT_VMSE_MXB_WR_OCCUPANCY             0x91 MBOX
+UMASK_VMSE_MXB_WR_OCCUPANCY             0x00
+
+EVENT_VMSE_WR_PUSH                      0x90 MBOX
+UMASK_VMSE_WR_PUSH_WMM                  0x01
+UMASK_VMSE_WR_PUSH_RMM                  0x02
+
+EVENT_WMM_TO_RMM                        0xC0 MBOX
+UMASK_WMM_TO_RMM_LOW_THRESH             0x01
+UMASK_WMM_TO_RMM_STARVE                 0x02
+UMASK_WMM_TO_RMM_VMSE_RETRY             0x04
+
+# Undocumented event, mentioned in metrics table but not defined
+EVENT_WPQ_INSERTS                       0x20 MBOX
+UMASK_WPQ_INSERTS                       0x00
+
+EVENT_WPQ_CYCLES_FULL                   0x22 MBOX
+UMASK_WPQ_CYCLES_FULL                   0x00
+
+EVENT_WPQ_CYCLES_NE                     0x21 MBOX
+UMASK_WPQ_CYCLES_NE                     0x00
+
+EVENT_WPQ_READ_HIT                      0x23 MBOX
+UMASK_WPQ_READ_HIT                      0x00
+
+EVENT_WPQ_WRITE_HIT                     0x24 MBOX
+UMASK_WPQ_WRITE_HIT                     0x00
+
+EVENT_WRONG_MM                          0xC1 MBOX
+UMASK_WRONG_MM                          0x00
+
+EVENT_WR_CAS_RANK0                      0xB8 MBOX
+UMASK_WR_CAS_RANK0_BANK0                0x00
+UMASK_WR_CAS_RANK0_BANK1                0x01
+UMASK_WR_CAS_RANK0_BANK2                0x02
+UMASK_WR_CAS_RANK0_BANK3                0x03
+UMASK_WR_CAS_RANK0_BANK4                0x04
+UMASK_WR_CAS_RANK0_BANK5                0x05
+UMASK_WR_CAS_RANK0_BANK6                0x06
+UMASK_WR_CAS_RANK0_BANK7                0x07
+UMASK_WR_CAS_RANK0_BANK8                0x08
+UMASK_WR_CAS_RANK0_BANK9                0x09
+UMASK_WR_CAS_RANK0_BANK10               0x0A
+UMASK_WR_CAS_RANK0_BANK11               0x0B
+UMASK_WR_CAS_RANK0_BANK12               0x0C
+UMASK_WR_CAS_RANK0_BANK13               0x0D
+UMASK_WR_CAS_RANK0_BANK14               0x0E
+UMASK_WR_CAS_RANK0_BANK15               0x0F
+UMASK_WR_CAS_RANK0_ALLBANKS             0x10
+UMASK_WR_CAS_RANK0_BANKG0               0x11
+UMASK_WR_CAS_RANK0_BANKG1               0x12
+UMASK_WR_CAS_RANK0_BANKG2               0x13
+UMASK_WR_CAS_RANK0_BANKG3               0x14
+
+EVENT_WR_CAS_RANK1                      0xB9 MBOX
+UMASK_WR_CAS_RANK1_BANK0                0x00
+UMASK_WR_CAS_RANK1_BANK1                0x01
+UMASK_WR_CAS_RANK1_BANK2                0x02
+UMASK_WR_CAS_RANK1_BANK3                0x03
+UMASK_WR_CAS_RANK1_BANK4                0x04
+UMASK_WR_CAS_RANK1_BANK5                0x05
+UMASK_WR_CAS_RANK1_BANK6                0x06
+UMASK_WR_CAS_RANK1_BANK7                0x07
+UMASK_WR_CAS_RANK1_BANK8                0x08
+UMASK_WR_CAS_RANK1_BANK9                0x09
+UMASK_WR_CAS_RANK1_BANK10               0x0A
+UMASK_WR_CAS_RANK1_BANK11               0x0B
+UMASK_WR_CAS_RANK1_BANK12               0x0C
+UMASK_WR_CAS_RANK1_BANK13               0x0D
+UMASK_WR_CAS_RANK1_BANK14               0x0E
+UMASK_WR_CAS_RANK1_BANK15               0x0F
+UMASK_WR_CAS_RANK1_ALLBANKS             0x10
+UMASK_WR_CAS_RANK1_BANKG0               0x11
+UMASK_WR_CAS_RANK1_BANKG1               0x12
+UMASK_WR_CAS_RANK1_BANKG2               0x13
+UMASK_WR_CAS_RANK1_BANKG3               0x14
+
+EVENT_WR_CAS_RANK2                      0xBA MBOX
+UMASK_WR_CAS_RANK2_BANK0                0x00
+UMASK_WR_CAS_RANK2_BANK1                0x01
+UMASK_WR_CAS_RANK2_BANK2                0x02
+UMASK_WR_CAS_RANK2_BANK3                0x03
+UMASK_WR_CAS_RANK2_BANK4                0x04
+UMASK_WR_CAS_RANK2_BANK5                0x05
+UMASK_WR_CAS_RANK2_BANK6                0x06
+UMASK_WR_CAS_RANK2_BANK7                0x07
+UMASK_WR_CAS_RANK2_BANK8                0x08
+UMASK_WR_CAS_RANK2_BANK9                0x09
+UMASK_WR_CAS_RANK2_BANK10               0x0A
+UMASK_WR_CAS_RANK2_BANK11               0x0B
+UMASK_WR_CAS_RANK2_BANK12               0x0C
+UMASK_WR_CAS_RANK2_BANK13               0x0D
+UMASK_WR_CAS_RANK2_BANK14               0x0E
+UMASK_WR_CAS_RANK2_BANK15               0x0F
+UMASK_WR_CAS_RANK2_ALLBANKS             0x10
+UMASK_WR_CAS_RANK2_BANKG0               0x11
+UMASK_WR_CAS_RANK2_BANKG1               0x12
+UMASK_WR_CAS_RANK2_BANKG2               0x13
+UMASK_WR_CAS_RANK2_BANKG3               0x14
+
+EVENT_WR_CAS_RANK3                      0xBB MBOX
+UMASK_WR_CAS_RANK3_BANK0                0x00
+UMASK_WR_CAS_RANK3_BANK1                0x01
+UMASK_WR_CAS_RANK3_BANK2                0x02
+UMASK_WR_CAS_RANK3_BANK3                0x03
+UMASK_WR_CAS_RANK3_BANK4                0x04
+UMASK_WR_CAS_RANK3_BANK5                0x05
+UMASK_WR_CAS_RANK3_BANK6                0x06
+UMASK_WR_CAS_RANK3_BANK7                0x07
+UMASK_WR_CAS_RANK3_BANK8                0x08
+UMASK_WR_CAS_RANK3_BANK9                0x09
+UMASK_WR_CAS_RANK3_BANK10               0x0A
+UMASK_WR_CAS_RANK3_BANK11               0x0B
+UMASK_WR_CAS_RANK3_BANK12               0x0C
+UMASK_WR_CAS_RANK3_BANK13               0x0D
+UMASK_WR_CAS_RANK3_BANK14               0x0E
+UMASK_WR_CAS_RANK3_BANK15               0x0F
+UMASK_WR_CAS_RANK3_ALLBANKS             0x10
+UMASK_WR_CAS_RANK3_BANKG0               0x11
+UMASK_WR_CAS_RANK3_BANKG1               0x12
+UMASK_WR_CAS_RANK3_BANKG2               0x13
+UMASK_WR_CAS_RANK3_BANKG3               0x14
+
+EVENT_WR_CAS_RANK4                      0xBC MBOX
+UMASK_WR_CAS_RANK4_BANK0                0x00
+UMASK_WR_CAS_RANK4_BANK1                0x01
+UMASK_WR_CAS_RANK4_BANK2                0x02
+UMASK_WR_CAS_RANK4_BANK3                0x03
+UMASK_WR_CAS_RANK4_BANK4                0x04
+UMASK_WR_CAS_RANK4_BANK5                0x05
+UMASK_WR_CAS_RANK4_BANK6                0x06
+UMASK_WR_CAS_RANK4_BANK7                0x07
+UMASK_WR_CAS_RANK4_BANK8                0x08
+UMASK_WR_CAS_RANK4_BANK9                0x09
+UMASK_WR_CAS_RANK4_BANK10               0x0A
+UMASK_WR_CAS_RANK4_BANK11               0x0B
+UMASK_WR_CAS_RANK4_BANK12               0x0C
+UMASK_WR_CAS_RANK4_BANK13               0x0D
+UMASK_WR_CAS_RANK4_BANK14               0x0E
+UMASK_WR_CAS_RANK4_BANK15               0x0F
+UMASK_WR_CAS_RANK4_ALLBANKS             0x10
+UMASK_WR_CAS_RANK4_BANKG0               0x11
+UMASK_WR_CAS_RANK4_BANKG1               0x12
+UMASK_WR_CAS_RANK4_BANKG2               0x13
+UMASK_WR_CAS_RANK4_BANKG3               0x14
+
+EVENT_WR_CAS_RANK5                      0xBD MBOX
+UMASK_WR_CAS_RANK5_BANK0                0x00
+UMASK_WR_CAS_RANK5_BANK1                0x01
+UMASK_WR_CAS_RANK5_BANK2                0x02
+UMASK_WR_CAS_RANK5_BANK3                0x03
+UMASK_WR_CAS_RANK5_BANK4                0x04
+UMASK_WR_CAS_RANK5_BANK5                0x05
+UMASK_WR_CAS_RANK5_BANK6                0x06
+UMASK_WR_CAS_RANK5_BANK7                0x07
+UMASK_WR_CAS_RANK5_BANK8                0x08
+UMASK_WR_CAS_RANK5_BANK9                0x09
+UMASK_WR_CAS_RANK5_BANK10               0x0A
+UMASK_WR_CAS_RANK5_BANK11               0x0B
+UMASK_WR_CAS_RANK5_BANK12               0x0C
+UMASK_WR_CAS_RANK5_BANK13               0x0D
+UMASK_WR_CAS_RANK5_BANK14               0x0E
+UMASK_WR_CAS_RANK5_BANK15               0x0F
+UMASK_WR_CAS_RANK5_ALLBANKS             0x10
+UMASK_WR_CAS_RANK5_BANKG0               0x11
+UMASK_WR_CAS_RANK5_BANKG1               0x12
+UMASK_WR_CAS_RANK5_BANKG2               0x13
+UMASK_WR_CAS_RANK5_BANKG3               0x14
+
+EVENT_WR_CAS_RANK6                      0xBE MBOX
+UMASK_WR_CAS_RANK6_BANK0                0x00
+UMASK_WR_CAS_RANK6_BANK1                0x01
+UMASK_WR_CAS_RANK6_BANK2                0x02
+UMASK_WR_CAS_RANK6_BANK3                0x03
+UMASK_WR_CAS_RANK6_BANK4                0x04
+UMASK_WR_CAS_RANK6_BANK5                0x05
+UMASK_WR_CAS_RANK6_BANK6                0x06
+UMASK_WR_CAS_RANK6_BANK7                0x07
+UMASK_WR_CAS_RANK6_BANK8                0x08
+UMASK_WR_CAS_RANK6_BANK9                0x09
+UMASK_WR_CAS_RANK6_BANK10               0x0A
+UMASK_WR_CAS_RANK6_BANK11               0x0B
+UMASK_WR_CAS_RANK6_BANK12               0x0C
+UMASK_WR_CAS_RANK6_BANK13               0x0D
+UMASK_WR_CAS_RANK6_BANK14               0x0E
+UMASK_WR_CAS_RANK6_BANK15               0x0F
+UMASK_WR_CAS_RANK6_ALLBANKS             0x10
+UMASK_WR_CAS_RANK6_BANKG0               0x11
+UMASK_WR_CAS_RANK6_BANKG1               0x12
+UMASK_WR_CAS_RANK6_BANKG2               0x13
+UMASK_WR_CAS_RANK6_BANKG3               0x14
+
+EVENT_WR_CAS_RANK7                      0xBF MBOX
+UMASK_WR_CAS_RANK7_BANK0                0x00
+UMASK_WR_CAS_RANK7_BANK1                0x01
+UMASK_WR_CAS_RANK7_BANK2                0x02
+UMASK_WR_CAS_RANK7_BANK3                0x03
+UMASK_WR_CAS_RANK7_BANK4                0x04
+UMASK_WR_CAS_RANK7_BANK5                0x05
+UMASK_WR_CAS_RANK7_BANK6                0x06
+UMASK_WR_CAS_RANK7_BANK7                0x07
+UMASK_WR_CAS_RANK7_BANK8                0x08
+UMASK_WR_CAS_RANK7_BANK9                0x09
+UMASK_WR_CAS_RANK7_BANK10               0x0A
+UMASK_WR_CAS_RANK7_BANK11               0x0B
+UMASK_WR_CAS_RANK7_BANK12               0x0C
+UMASK_WR_CAS_RANK7_BANK13               0x0D
+UMASK_WR_CAS_RANK7_BANK14               0x0E
+UMASK_WR_CAS_RANK7_BANK15               0x0F
+UMASK_WR_CAS_RANK7_ALLBANKS             0x10
+UMASK_WR_CAS_RANK7_BANKG0               0x11
+UMASK_WR_CAS_RANK7_BANKG1               0x12
+UMASK_WR_CAS_RANK7_BANKG2               0x13
+UMASK_WR_CAS_RANK7_BANKG3               0x14
+
+EVENT_PBOX_CLOCKTICKS                   0x01 PBOX
+UMASK_PBOX_CLOCKTICKS                   0x00
+
+EVENT_IIO_CREDIT                        0x2D PBOX
+UMASK_IIO_CREDIT_PRQ_QPI0               0x01
+UMASK_IIO_CREDIT_PRQ_QPI1               0x02
+UMASK_IIO_CREDIT_ISOCH_QPI0             0x04
+UMASK_IIO_CREDIT_ISOCH_QPI1             0x08
+
+EVENT_RING_AD_USED                      0x07 PBOX
+UMASK_RING_AD_USED_CW_EVEN              0x01
+UMASK_RING_AD_USED_CW_ODD               0x02
+UMASK_RING_AD_USED_CW                   0x03
+UMASK_RING_AD_USED_CCW_EVEN             0x04
+UMASK_RING_AD_USED_CCW_ODD              0x08
+UMASK_RING_AD_USED_CCW                  0x0C
+
+EVENT_RING_AK_BOUNCES                   0x12 PBOX
+UMASK_RING_AK_BOUNCES_UP                0x01
+UMASK_RING_AK_BOUNCES_DN                0x02
+
+EVENT_RING_AK_USED                      0x08 PBOX
+UMASK_RING_AK_USED_CW_EVEN              0x01
+UMASK_RING_AK_USED_CW_ODD               0x02
+UMASK_RING_AK_USED_CW                   0x03
+UMASK_RING_AK_USED_CCW_EVEN             0x04
+UMASK_RING_AK_USED_CCW_ODD              0x08
+UMASK_RING_AK_USED_CCW                  0x0C
+
+EVENT_RING_BL_USED                      0x09 PBOX
+UMASK_RING_BL_USED_CW_EVEN              0x01
+UMASK_RING_BL_USED_CW_ODD               0x02
+UMASK_RING_BL_USED_CW                   0x03
+UMASK_RING_BL_USED_CCW_EVEN             0x04
+UMASK_RING_BL_USED_CCW_ODD              0x08
+UMASK_RING_BL_USED_CCW                  0x0C
+
+EVENT_RING_IV_USED                      0x0A PBOX
+UMASK_RING_IV_USED_CW                   0x03
+UMASK_RING_IV_USED_CCW                  0x0C
+UMASK_RING_IV_USED_ANY                  0x0F
+
+EVENT_RXR_CYCLES_NE                     0x10 PBOX
+UMASK_RXR_CYCLES_NE_NCB                 0x10
+UMASK_RXR_CYCLES_NE_NCS                 0x20
+
+EVENT_RXR_INSERTS                       0x11 PBOX
+UMASK_RXR_INSERTS_NCB                   0x10
+UMASK_RXR_INSERTS_NCS                   0x20
+
+EVENT_RXR_OCCUPANCY                     0x13 PBOX
+UMASK_RXR_OCCUPANCY_DRS                 0x08
+
+EVENT_SBO0_CREDITS_ACQUIRED             0x28 PBOX
+UMASK_SBO0_CREDITS_ACQUIRED_AD          0x01
+UMASK_SBO0_CREDITS_ACQUIRED_BL          0x02
+
+EVENT_STALL_NO_SBO_CREDIT               0x2C PBOX
+UMASK_STALL_NO_SBO_CREDIT_SBO0_AD       0x01
+UMASK_STALL_NO_SBO_CREDIT_SBO1_AD       0x02
+UMASK_STALL_NO_SBO_CREDIT_SBO0_BL       0x04
+UMASK_STALL_NO_SBO_CREDIT_SBO1_BL       0x08
+
+EVENT_TXR_NACK_CW                       0x26 PBOX
+UMASK_TXR_NACK_CW_DN_AD                 0x01
+UMASK_TXR_NACK_CW_DN_BL                 0x02
+UMASK_TXR_NACK_CW_DN_AK                 0x04
+UMASK_TXR_NACK_CW_UP_AD                 0x08
+UMASK_TXR_NACK_CW_UP_BL                 0x10
+UMASK_TXR_NACK_CW_UP_AK                 0x20
+
+EVENT_CACHE_TOTAL_OCCUPANCY             0x12 IBOX
+UMASK_CACHE_TOTAL_OCCUPANCY_ANY         0x01
+UMASK_CACHE_TOTAL_OCCUPANCY_SOURCE      0x02
+
+EVENT_COHERENT_OPS                      0x13 IBOX
+UMASK_COHERENT_OPS_PCIRDCUR             0x01
+UMASK_COHERENT_OPS_CRD                  0x02
+UMASK_COHERENT_OPS_DRD                  0x04
+UMASK_COHERENT_OPS_RFO                  0x08
+UMASK_COHERENT_OPS_PCITOM               0x10
+UMASK_COHERENT_OPS_PCIDCAHINT           0x20
+UMASK_COHERENT_OPS_WBMTOI               0x40
+UMASK_COHERENT_OPS_CLFLUSH              0x80
+
+EVENT_MISC0                             0x14 IBOX
+UMASK_MISC0_FAST_REQ                    0x01
+UMASK_MISC0_FAST_REJ                    0x02
+UMASK_MISC0_2ND_RD_INSERT               0x04
+UMASK_MISC0_2ND_WR_INSERT               0x08
+UMASK_MISC0_2ND_ATOMIC_INSERT           0x10
+UMASK_MISC0_FAST_XFER                   0x20
+UMASK_MISC0_PF_ACK_HINT                 0x40
+UMASK_MISC0_PF_TIMEOUT                  0x80
+
+EVENT_MISC1                             0x15 IBOX
+UMASK_MISC1_SLOW_I                      0x01
+UMASK_MISC1_SLOW_S                      0x02
+UMASK_MISC1_SLOW_E                      0x04
+UMASK_MISC1_SLOW_M                      0x08
+UMASK_MISC1_LOST_FWD                    0x10
+UMASK_MISC1_SEC_RCVD_INVLD              0x20
+UMASK_MISC1_SEC_RCVD_VLD                0x40
+UMASK_MISC1_DATA_THROTTLE               0x80
+
+EVENT_SNOOP_RESP                        0x17 IBOX
+UMASK_SNOOP_RESP_MISS                   0x01
+UMASK_SNOOP_RESP_HIT_I                  0x02
+UMASK_SNOOP_RESP_HIT_ES                 0x04
+UMASK_SNOOP_RESP_HIT_M                  0x08
+UMASK_SNOOP_RESP_SNPCODE                0x10
+UMASK_SNOOP_RESP_SNPDATA                0x20
+UMASK_SNOOP_RESP_SNPINV                 0x40
+
+EVENT_TRANSACTIONS                      0x16 IBOX
+UMASK_TRANSACTIONS_READS                0x01
+UMASK_TRANSACTIONS_WRITES               0x02
+UMASK_TRANSACTIONS_RD_PREF              0x04
+UMASK_TRANSACTIONS_WR_PREF              0x08
+UMASK_TRANSACTIONS_ATOMIC               0x10
+UMASK_TRANSACTIONS_OTHER                0x20
+UMASK_TRANSACTIONS_ORDERINGQ            0x40
+
+EVENT_RXR_AK_INSERTS                    0x0A IBOX
+UMASK_RXR_AK_INSERTS                    0x00
+
+EVENT_RXR_BL_DRS_CYCLES_FULL            0x04 IBOX
+UMASK_RXR_BL_DRS_CYCLES_FULL            0x00
+
+EVENT_RXR_BL_DRS_INSERTS                0x01 IBOX
+UMASK_RXR_BL_DRS_INSERTS                0x00
+
+EVENT_RXR_BL_DRS_OCCUPANCY              0x07 IBOX
+UMASK_RXR_BL_DRS_OCCUPANCY              0x00
+
+EVENT_RXR_BL_NCB_CYCLES_FULL            0x05 IBOX
+UMASK_RXR_BL_NCB_CYCLES_FULL            0x00
+
+EVENT_RXR_BL_NCB_INSERTS                0x02 IBOX
+UMASK_RXR_BL_NCB_INSERTS                0x00
+
+EVENT_RXR_BL_NCB_OCCUPANCY              0x08 IBOX
+UMASK_RXR_BL_NCB_OCCUPANCY              0x00
+
+EVENT_RXR_BL_NCS_CYCLES_FULL            0x06 IBOX
+UMASK_RXR_BL_NCS_CYCLES_FULL            0x00
+
+EVENT_RXR_BL_NCS_INSERTS                0x03 IBOX
+UMASK_RXR_BL_NCS_INSERTS                0x00
+
+EVENT_RXR_BL_NCS_OCCUPANCY              0x09 IBOX
+UMASK_RXR_BL_NCS_OCCUPANCY              0x00
+
+EVENT_TXR_AD_STALL_CREDIT_CYCLES        0x18 IBOX
+UMASK_TXR_AD_STALL_CREDIT_CYCLES        0x00
+
+EVENT_TXR_BL_STALL_CREDIT_CYCLES        0x19 IBOX
+UMASK_TXR_BL_STALL_CREDIT_CYCLES        0x00
+
+EVENT_TXR_DATA_INSERTS_NCB              0x0E IBOX
+UMASK_TXR_DATA_INSERTS_NCB              0x00
+
+EVENT_TXR_DATA_INSERTS_NCS              0x0F IBOX
+UMASK_TXR_DATA_INSERTS_NCS              0x00
+
+EVENT_TXR_REQUEST_OCCUPANCY             0x0D IBOX
+UMASK_TXR_REQUEST_OCCUPANCY             0x00
+
+EVENT_RBOX_CLOCKTICKS                   0x01 RBOX
+UMASK_RBOX_CLOCKTICKS                   0x00
+
+EVENT_C_LO_AD_CREDITS_EMPTY             0x22 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_C_LO_AD_CREDITS_EMPTY_CBO0        0x01
+UMASK_C_LO_AD_CREDITS_EMPTY_CBO1        0x02
+UMASK_C_LO_AD_CREDITS_EMPTY_CBO2        0x04
+UMASK_C_LO_AD_CREDITS_EMPTY_CBO3        0x08
+UMASK_C_LO_AD_CREDITS_EMPTY_CBO4        0x10
+UMASK_C_LO_AD_CREDITS_EMPTY_CBO5        0x20
+UMASK_C_LO_AD_CREDITS_EMPTY_CBO6        0x40
+UMASK_C_LO_AD_CREDITS_EMPTY_CBO7        0x80
+
+EVENT_C_HI_AD_CREDITS_EMPTY             0x1F RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_C_HI_AD_CREDITS_EMPTY_CBO8        0x01
+UMASK_C_HI_AD_CREDITS_EMPTY_CBO9        0x02
+UMASK_C_HI_AD_CREDITS_EMPTY_CBO10       0x04
+UMASK_C_HI_AD_CREDITS_EMPTY_CBO11       0x08
+UMASK_C_HI_AD_CREDITS_EMPTY_CBO12       0x10
+UMASK_C_HI_AD_CREDITS_EMPTY_CBO13       0x20
+UMASK_C_HI_AD_CREDITS_EMPTY_CBO14_16    0x40
+UMASK_C_HI_AD_CREDITS_EMPTY_CBO15_17    0x80
+
+EVENT_HA_R2_BL_CREDITS_EMPTY_LO         0x2D RBOX0C0|RBOX1C0
+UMASK_HA_R2_BL_CREDITS_EMPTY_LO_HA0     0x01
+UMASK_HA_R2_BL_CREDITS_EMPTY_LO_HA1     0x02
+UMASK_HA_R2_BL_CREDITS_EMPTY_LO_R2_NCB  0x04
+UMASK_HA_R2_BL_CREDITS_EMPTY_LO_R2_NCS  0x08
+
+EVENT_HA_R2_BL_CREDITS_EMPTY_HI         0x2D RBOX0C1|RBOX1C1
+UMASK_HA_R2_BL_CREDITS_EMPTY_HI_HA0     0x01
+UMASK_HA_R2_BL_CREDITS_EMPTY_HI_HA1     0x02
+UMASK_HA_R2_BL_CREDITS_EMPTY_HI_R2_NCB  0x04
+UMASK_HA_R2_BL_CREDITS_EMPTY_HI_R2_NCS  0x08
+
+EVENT_QPI0_AD_CREDITS_EMPTY             0x20 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_QPI0_AD_CREDITS_EMPTY_VNA         0x01
+UMASK_QPI0_AD_CREDITS_EMPTY_VN0_HOM     0x02
+UMASK_QPI0_AD_CREDITS_EMPTY_VN0_SNP     0x04
+UMASK_QPI0_AD_CREDITS_EMPTY_VN0_NDR     0x08
+UMASK_QPI0_AD_CREDITS_EMPTY_VN1_HOM     0x10
+UMASK_QPI0_AD_CREDITS_EMPTY_VN1_SNP     0x20
+UMASK_QPI0_AD_CREDITS_EMPTY_VN1_NDR     0x40
+
+EVENT_QPI1_AD_CREDITS_EMPTY             0x2E RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_QPI1_AD_CREDITS_EMPTY_VNA         0x01
+UMASK_QPI1_AD_CREDITS_EMPTY_VN0_HOM     0x02
+UMASK_QPI1_AD_CREDITS_EMPTY_VN0_SNP     0x04
+UMASK_QPI1_AD_CREDITS_EMPTY_VN0_NDR     0x08
+UMASK_QPI1_AD_CREDITS_EMPTY_VN1_HOM     0x10
+UMASK_QPI1_AD_CREDITS_EMPTY_VN1_SNP     0x20
+UMASK_QPI1_AD_CREDITS_EMPTY_VN1_NDR     0x40
+
+EVENT_QPI0_BL_CREDITS_EMPTY             0x21 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_QPI0_BL_CREDITS_EMPTY_VNA         0x01
+UMASK_QPI0_BL_CREDITS_EMPTY_VN0_HOM     0x02
+UMASK_QPI0_BL_CREDITS_EMPTY_VN0_SNP     0x04
+UMASK_QPI0_BL_CREDITS_EMPTY_VN0_NDR     0x08
+UMASK_QPI0_BL_CREDITS_EMPTY_VN1_HOM     0x10
+UMASK_QPI0_BL_CREDITS_EMPTY_VN1_SNP     0x20
+UMASK_QPI0_BL_CREDITS_EMPTY_VN1_NDR     0x40
+
+EVENT_QPI1_BL_CREDITS_EMPTY             0x2F RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_QPI1_BL_CREDITS_EMPTY_VNA         0x01
+UMASK_QPI1_BL_CREDITS_EMPTY_VN0_HOM     0x02
+UMASK_QPI1_BL_CREDITS_EMPTY_VN0_SNP     0x04
+UMASK_QPI1_BL_CREDITS_EMPTY_VN0_NDR     0x08
+UMASK_QPI1_BL_CREDITS_EMPTY_VN1_HOM     0x10
+UMASK_QPI1_BL_CREDITS_EMPTY_VN1_SNP     0x20
+UMASK_QPI1_BL_CREDITS_EMPTY_VN1_NDR     0x40
+
+EVENT_RING_AD_USED                      0x07 RBOX
+UMASK_RING_AD_USED_CW_EVEN              0x01
+UMASK_RING_AD_USED_CW_ODD               0x02
+UMASK_RING_AD_USED_CW                   0x03
+UMASK_RING_AD_USED_CCW_EVEN             0x04
+UMASK_RING_AD_USED_CCW_ODD              0x08
+UMASK_RING_AD_USED_CCW                  0x0C
+
+EVENT_RING_AK_USED                      0x08 RBOX
+UMASK_RING_AK_USED_CW_EVEN              0x01
+UMASK_RING_AK_USED_CW_ODD               0x02
+UMASK_RING_AK_USED_CW                   0x03
+UMASK_RING_AK_USED_CCW_EVEN             0x04
+UMASK_RING_AK_USED_CCW_ODD              0x08
+UMASK_RING_AK_USED_CCW                  0x0C
+
+EVENT_RING_BL_USED                      0x09 RBOX
+UMASK_RING_BL_USED_CW_EVEN              0x01
+UMASK_RING_BL_USED_CW_ODD               0x02
+UMASK_RING_BL_USED_CW                   0x03
+UMASK_RING_BL_USED_CCW_EVEN             0x04
+UMASK_RING_BL_USED_CCW_ODD              0x08
+UMASK_RING_BL_USED_CCW                  0x0C
+
+EVENT_RING_IV_USED                      0x0A RBOX
+UMASK_RING_IV_USED_CW                   0x03
+UMASK_RING_IV_USED_CCW                  0x0C
+UMASK_RING_IV_USED_ANY                  0x0F
+
+EVENT_RING_SINK_STARVED                 0x0E RBOX
+UMASK_RING_SINK_STARVED_AK              0x02
+
+EVENT_RXR_CYCLES_NE                     0x10 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_RXR_CYCLES_NE_HOM                 0x01
+UMASK_RXR_CYCLES_NE_SNP                 0x02
+UMASK_RXR_CYCLES_NE_NDR                 0x04
+
+EVENT_RXR_CYCLES_NE_VN1                 0x14 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_RXR_CYCLES_NE_VN1_HOM             0x01
+UMASK_RXR_CYCLES_NE_VN1_SNP             0x02
+UMASK_RXR_CYCLES_NE_VN1_NDR             0x04
+UMASK_RXR_CYCLES_NE_VN1_DRS             0x08
+UMASK_RXR_CYCLES_NE_VN1_NCB             0x10
+UMASK_RXR_CYCLES_NE_VN1_NCS             0x20
+
+EVENT_RXR_INSERTS                       0x11 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_RXR_INSERTS_HOM                   0x01
+UMASK_RXR_INSERTS_SNP                   0x02
+UMASK_RXR_INSERTS_NDR                   0x04
+UMASK_RXR_INSERTS_DRS                   0x08
+UMASK_RXR_INSERTS_NCB                   0x10
+UMASK_RXR_INSERTS_NCS                   0x20
+
+EVENT_RXR_INSERTS_VN1                   0x15 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_RXR_INSERTS_VN1_HOM               0x01
+UMASK_RXR_INSERTS_VN1_SNP               0x02
+UMASK_RXR_INSERTS_VN1_NDR               0x04
+UMASK_RXR_INSERTS_VN1_DRS               0x08
+UMASK_RXR_INSERTS_VN1_NCB               0x10
+UMASK_RXR_INSERTS_VN1_NCS               0x20
+
+EVENT_RXR_OCCUPANCY_VN1                 0x13 RBOX0C0|RBOX1C0
+UMASK_RXR_OCCUPANCY_VN1_HOM             0x01
+UMASK_RXR_OCCUPANCY_VN1_SNP             0x02
+UMASK_RXR_OCCUPANCY_VN1_NDR             0x04
+UMASK_RXR_OCCUPANCY_VN1_DRS             0x08
+UMASK_RXR_OCCUPANCY_VN1_NCB             0x10
+UMASK_RXR_OCCUPANCY_VN1_NCS             0x20
+
+EVENT_TXR_NACK                          0x26 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_TXR_NACK_DN_AD                    0x01
+UMASK_TXR_NACK_DN_BL                    0x02
+UMASK_TXR_NACK_DN_AK                    0x04
+UMASK_TXR_NACK_UP_AD                    0x08
+UMASK_TXR_NACK_UP_BL                    0x10
+UMASK_TXR_NACK_UP_AK                    0x20
+
+EVENT_SBO0_CREDITS_ACQUIRED             0x28 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_SBO0_CREDITS_ACQUIRED_AD          0x01
+UMASK_SBO0_CREDITS_ACQUIRED_BL          0x02
+
+EVENT_SBO1_CREDITS_ACQUIRED             0x29 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_SBO1_CREDITS_ACQUIRED_AD          0x01
+UMASK_SBO1_CREDITS_ACQUIRED_BL          0x02
+
+EVENT_STALL_NO_SBO_CREDIT               0x2C RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_STALL_NO_SBO_CREDIT_SBO0_AD       0x01
+UMASK_STALL_NO_SBO_CREDIT_SBO1_AD       0x02
+UMASK_STALL_NO_SBO_CREDIT_SBO0_BL       0x04
+UMASK_STALL_NO_SBO_CREDIT_SBO1_BL       0x08
+
+EVENT_VN0_CREDITS_USED                  0x36 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VN0_CREDITS_USED_HOM              0x01
+UMASK_VN0_CREDITS_USED_SNP              0x02
+UMASK_VN0_CREDITS_USED_NDR              0x04
+UMASK_VN0_CREDITS_USED_DRS              0x08
+UMASK_VN0_CREDITS_USED_NCB              0x10
+UMASK_VN0_CREDITS_USED_NCS              0x20
+
+EVENT_VN0_CREDITS_REJECT                0x37 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VN0_CREDITS_REJECT_HOM            0x01
+UMASK_VN0_CREDITS_REJECT_SNP            0x02
+UMASK_VN0_CREDITS_REJECT_NDR            0x04
+UMASK_VN0_CREDITS_REJECT_DRS            0x08
+UMASK_VN0_CREDITS_REJECT_NCB            0x10
+UMASK_VN0_CREDITS_REJECT_NCS            0x20
+
+EVENT_VN1_CREDITS_USED                  0x38 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VN1_CREDITS_USED_HOM              0x01
+UMASK_VN1_CREDITS_USED_SNP              0x02
+UMASK_VN1_CREDITS_USED_NDR              0x04
+UMASK_VN1_CREDITS_USED_DRS              0x08
+UMASK_VN1_CREDITS_USED_NCB              0x10
+UMASK_VN1_CREDITS_USED_NCS              0x20
+
+EVENT_VN1_CREDITS_REJECT                0x39 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VN1_CREDITS_REJECT_HOM            0x01
+UMASK_VN1_CREDITS_REJECT_SNP            0x02
+UMASK_VN1_CREDITS_REJECT_NDR            0x04
+UMASK_VN1_CREDITS_REJECT_DRS            0x08
+UMASK_VN1_CREDITS_REJECT_NCB            0x10
+UMASK_VN1_CREDITS_REJECT_NCS            0x20
+
+EVENT_VNA_CREDITS_ACQUIRED              0x33 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VNA_CREDITS_ACQUIRED_AD           0x01
+UMASK_VNA_CREDITS_ACQUIRED_BL           0x04
+
+EVENT_VNA_CREDITS_REJECT                0x34 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VNA_CREDITS_REJECT_HOM            0x01
+UMASK_VNA_CREDITS_REJECT_SNP            0x02
+UMASK_VNA_CREDITS_REJECT_NDR            0x04
+UMASK_VNA_CREDITS_REJECT_DRS            0x08
+UMASK_VNA_CREDITS_REJECT_NCB            0x10
+UMASK_VNA_CREDITS_REJECT_NCS            0x20
+
+EVENT_QBOX_CLOCKTICKS                   0x14 QBOX
+UMASK_QBOX_CLOCKTICKS                   0x00
+
+EVENT_CTO_COUNT                         0x38 QBOX
+OPTIONS_CTO_COUNT                       EVENT_OPTION_MASK0_MASK|EVENT_OPTION_MASK1_MASK|EVENT_OPTION_MASK2_MASK|EVENT_OPTION_MASK3_MASK|EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK|EVENT_OPTION_MATCH2_MASK|EVENT_OPTION_MATCH3_MASK
+UMASK_CTO_COUNT                         0x00 0x01
+
+EVENT_DIRECT2CORE                       0x13 QBOX
+UMASK_DIRECT2CORE_SUCCESS_RBT_HIT       0x01
+UMASK_DIRECT2CORE_FAILURE_CREDITS       0x02
+UMASK_DIRECT2CORE_FAILURE_RBT_HIT       0x04
+UMASK_DIRECT2CORE_FAILURE_CREDITS_RBT   0x08
+UMASK_DIRECT2CORE_FAILURE_MISS          0x10
+UMASK_DIRECT2CORE_FAILURE_CREDITS_MISS  0x20
+UMASK_DIRECT2CORE_FAILURE_RBT_MISS      0x40
+UMASK_DIRECT2CORE_FAILURE_CREDITS_RBT_MISS 0x80
+
+EVENT_L1_POWER_CYCLES                   0x12 QBOX
+UMASK_L1_POWER_CYCLES                   0x00
+
+EVENT_RXL0P_POWER_CYCLES                0x10 QBOX
+UMASK_RXL0P_POWER_CYCLES                0x00
+
+EVENT_RXL0_POWER_CYCLES                 0x0F QBOX
+UMASK_RXL0_POWER_CYCLES                 0x00
+
+EVENT_TXL0P_POWER_CYCLES                0x0D QBOX
+UMASK_TXL0P_POWER_CYCLES                0x00
+
+EVENT_TXL0_POWER_CYCLES                 0x0C QBOX
+UMASK_TXL0_POWER_CYCLES                 0x00
+
+EVENT_RXL_BYPASSED                      0x09 QBOX
+UMASK_RXL_BYPASSED                      0x00
+
+EVENT_TXL_BYPASSED                      0x05 QBOX
+UMASK_TXL_BYPASSED                      0x00
+
+EVENT_RXL_CREDITS_CONSUMED_VN0          0x1E QBOX
+UMASK_RXL_CREDITS_CONSUMED_VN0_DRS      0x01 0x01
+UMASK_RXL_CREDITS_CONSUMED_VN0_NCB      0x02 0x01
+UMASK_RXL_CREDITS_CONSUMED_VN0_NCS      0x04 0x01
+UMASK_RXL_CREDITS_CONSUMED_VN0_HOM      0x08 0x01
+UMASK_RXL_CREDITS_CONSUMED_VN0_SNP      0x10 0x01
+UMASK_RXL_CREDITS_CONSUMED_VN0_NDR      0x20 0x01
+
+EVENT_RXL_CREDITS_CONSUMED_VN1          0x39 QBOX
+UMASK_RXL_CREDITS_CONSUMED_VN1_DRS      0x01 0x01
+UMASK_RXL_CREDITS_CONSUMED_VN1_NCB      0x02 0x01
+UMASK_RXL_CREDITS_CONSUMED_VN1_NCS      0x04 0x01
+UMASK_RXL_CREDITS_CONSUMED_VN1_HOM      0x08 0x01
+UMASK_RXL_CREDITS_CONSUMED_VN1_SNP      0x10 0x01
+UMASK_RXL_CREDITS_CONSUMED_VN1_NDR      0x20 0x01
+
+EVENT_RXL_CREDITS_CONSUMED_VNA          0x1D QBOX
+UMASK_RXL_CREDITS_CONSUMED_VNA          0x00 0x01
+
+EVENT_RXL_CYCLES_NE                     0x0A QBOX
+UMASK_RXL_CYCLES_NE                     0x00
+
+EVENT_TXL_CYCLES_NE                     0x06 QBOX
+UMASK_TXL_CYCLES_NE                     0x00
+
+EVENT_RXL_FLITS_G1                      0x02 QBOX
+UMASK_RXL_FLITS_G1_SNP                  0x01 0x01
+UMASK_RXL_FLITS_G1_HOM_REQ              0x02 0x01
+UMASK_RXL_FLITS_G1_HOM_NONREQ           0x04 0x01
+UMASK_RXL_FLITS_G1_HOM                  0x06 0x01
+UMASK_RXL_FLITS_G1_DRS_DATA             0x08 0x01
+UMASK_RXL_FLITS_G1_DRS_NONDATA          0x10 0x01
+UMASK_RXL_FLITS_G1_DRS                  0x18 0x01
+
+EVENT_RXL_FLITS_G2                      0x03 QBOX
+UMASK_RXL_FLITS_G2_NDR_AD               0x01 0x01
+UMASK_RXL_FLITS_G2_NDR_AK               0x02 0x01
+UMASK_RXL_FLITS_G2_NCB_DATA             0x04 0x01
+UMASK_RXL_FLITS_G2_NCB_NONDATA          0x08 0x01
+UMASK_RXL_FLITS_G2_NCB                  0x0C 0x01
+UMASK_RXL_FLITS_G2_NCS                  0x10 0x01
+
+EVENT_RXL_FLITS_G0                      0x01 QBOX
+UMASK_RXL_FLITS_G0_DATA                 0x02
+UMASK_RXL_FLITS_G0_NON_DATA             0x04
+
+EVENT_TXL_FLITS_G0                      0x00 QBOX
+UMASK_TXL_FLITS_G0_DATA                 0x02
+UMASK_TXL_FLITS_G0_NON_DATA             0x04
+
+EVENT_TXL_FLITS_G1                      0x00 QBOX
+UMASK_TXL_FLITS_G1_SNP                  0x01 0x01
+UMASK_TXL_FLITS_G1_HOM_REQ              0x02 0x01
+UMASK_TXL_FLITS_G1_HOM_NONREQ           0x04 0x01
+UMASK_TXL_FLITS_G1_HOM                  0x06 0x01
+UMASK_TXL_FLITS_G1_DRS_DATA             0x08 0x01
+UMASK_TXL_FLITS_G1_DRS_NONDATA          0x10 0x01
+UMASK_TXL_FLITS_G1_DRS                  0x18 0x01
+
+EVENT_TXL_FLITS_G2                      0x01 QBOX
+UMASK_TXL_FLITS_G2_NDR_AD               0x01 0x01
+UMASK_TXL_FLITS_G2_NDR_AK               0x02 0x01
+UMASK_TXL_FLITS_G2_NCB_DATA             0x04 0x01
+UMASK_TXL_FLITS_G2_NCB_NONDATA          0x08 0x01
+UMASK_TXL_FLITS_G2_NCB                  0x0C 0x01
+UMASK_TXL_FLITS_G2_NCS                  0x10 0x01
+
+EVENT_RXL_INSERTS                       0x08 QBOX
+UMASK_RXL_INSERTS                       0x00
+
+EVENT_TXL_INSERTS                       0x04 QBOX
+UMASK_TXL_INSERTS                       0x00
+
+EVENT_RXL_INSERTS_DRS                   0x09 QBOX
+UMASK_RXL_INSERTS_DRS_VN0               0x01 0x01
+UMASK_RXL_INSERTS_DRS_VN1               0x02 0x01
+
+EVENT_RXL_INSERTS_HOM                   0x0C QBOX
+UMASK_RXL_INSERTS_HOM_VN0               0x01 0x01
+UMASK_RXL_INSERTS_HOM_VN1               0x02 0x01
+
+EVENT_RXL_INSERTS_NCB                   0x0A QBOX
+UMASK_RXL_INSERTS_NCB_VN0               0x01 0x01
+UMASK_RXL_INSERTS_NCB_VN1               0x02 0x01
+
+EVENT_RXL_INSERTS_NCS                   0x0B QBOX
+UMASK_RXL_INSERTS_NCS_VN0               0x01 0x01
+UMASK_RXL_INSERTS_NCS_VN1               0x02 0x01
+
+EVENT_RXL_INSERTS_NDR                   0x0E QBOX
+UMASK_RXL_INSERTS_NDR_VN0               0x01 0x01
+UMASK_RXL_INSERTS_NDR_VN1               0x02 0x01
+
+EVENT_RXL_INSERTS_SNP                   0x0D QBOX
+UMASK_RXL_INSERTS_SNP_VN0               0x01 0x01
+UMASK_RXL_INSERTS_SNP_VN1               0x02 0x01
+
+EVENT_RXL_OCCUPANCY                     0x0B QBOX
+UMASK_RXL_OCCUPANCY                     0x00
+
+EVENT_TXL_OCCUPANCY                     0x07 QBOX
+UMASK_TXL_OCCUPANCY                     0x00
+
+EVENT_RXL_OCCUPANCY_DRS                 0x15 QBOX
+UMASK_RXL_OCCUPANCY_DRS_VN0             0x01 0x01
+UMASK_RXL_OCCUPANCY_DRS_VN1             0x02 0x01
+
+EVENT_RXL_OCCUPANCY_HOM                 0x18 QBOX
+UMASK_RXL_OCCUPANCY_HOM_VN0             0x01 0x01
+UMASK_RXL_OCCUPANCY_HOM_VN1             0x02 0x01
+
+EVENT_RXL_OCCUPANCY_NCB                 0x16 QBOX
+UMASK_RXL_OCCUPANCY_NCB_VN0             0x01 0x01
+UMASK_RXL_OCCUPANCY_NCB_VN1             0x02 0x01
+
+EVENT_RXL_OCCUPANCY_NCS                 0x17 QBOX
+UMASK_RXL_OCCUPANCY_NCS_VN0             0x01 0x01
+UMASK_RXL_OCCUPANCY_NCS_VN1             0x02 0x01
+
+EVENT_RXL_OCCUPANCY_NDR                 0x1A QBOX
+UMASK_RXL_OCCUPANCY_NDR_VN0             0x01 0x01
+UMASK_RXL_OCCUPANCY_NDR_VN1             0x02 0x01
+
+EVENT_RXL_OCCUPANCY_SNP                 0x19 QBOX
+UMASK_RXL_OCCUPANCY_SNP_VN0             0x01 0x01
+UMASK_RXL_OCCUPANCY_SNP_VN1             0x02 0x01
+
+EVENT_TXR_AD_HOM_CREDIT_ACQUIRED        0x26 QBOX
+UMASK_TXR_AD_HOM_CREDIT_ACQUIRED_VN0    0x01 0x01
+UMASK_TXR_AD_HOM_CREDIT_ACQUIRED_VN1    0x02 0x01
+
+EVENT_TXR_AD_HOM_CREDIT_OCCUPANCY       0x22 QBOX
+UMASK_TXR_AD_HOM_CREDIT_OCCUPANCY_VN0   0x01 0x01
+UMASK_TXR_AD_HOM_CREDIT_OCCUPANCY_VN1   0x02 0x01
+
+EVENT_TXR_AD_NDR_CREDIT_ACQUIRED        0x28 QBOX
+UMASK_TXR_AD_NDR_CREDIT_ACQUIRED_VN0    0x01 0x01
+UMASK_TXR_AD_NDR_CREDIT_ACQUIRED_VN1    0x02 0x01
+
+EVENT_TXR_AD_NDR_CREDIT_OCCUPANCY       0x24 QBOX
+UMASK_TXR_AD_NDR_CREDIT_OCCUPANCY_VN0   0x01 0x01
+UMASK_TXR_AD_NDR_CREDIT_OCCUPANCY_VN1   0x02 0x01
+
+EVENT_TXR_AD_SNP_CREDIT_ACQUIRED        0x27 QBOX
+UMASK_TXR_AD_SNP_CREDIT_ACQUIRED_VN0    0x01 0x01
+UMASK_TXR_AD_SNP_CREDIT_ACQUIRED_VN1    0x02 0x01
+
+EVENT_TXR_AD_SNP_CREDIT_OCCUPANCY       0x23 QBOX
+UMASK_TXR_AD_SNP_CREDIT_OCCUPANCY_VN0   0x01 0x01
+UMASK_TXR_AD_SNP_CREDIT_OCCUPANCY_VN1   0x02 0x01
+
+EVENT_TXR_AK_NDR_CREDIT_ACQUIRED        0x29 QBOX
+UMASK_TXR_AK_NDR_CREDIT_ACQUIRED        0x00 0x01
+
+EVENT_TXR_AK_NDR_CREDIT_OCCUPANCY       0x25 QBOX
+UMASK_TXR_AK_NDR_CREDIT_OCCUPANCY       0x00 0x01
+
+EVENT_TXR_BL_DRS_CREDIT_ACQUIRED        0x2A QBOX
+UMASK_TXR_BL_DRS_CREDIT_ACQUIRED_VN0    0x01 0x01
+UMASK_TXR_BL_DRS_CREDIT_ACQUIRED_VN1    0x02 0x01
+UMASK_TXR_BL_DRS_CREDIT_ACQUIRED_VN_SHR 0x04 0x01
+
+EVENT_TXR_BL_DRS_CREDIT_OCCUPANCY       0x1F QBOX
+UMASK_TXR_BL_DRS_CREDIT_OCCUPANCY_VN0   0x01 0x01
+UMASK_TXR_BL_DRS_CREDIT_OCCUPANCY_VN1   0x02 0x01
+UMASK_TXR_BL_DRS_CREDIT_OCCUPANCY_VN_SHR 0x04 0x01
+
+EVENT_TXR_BL_NCB_CREDIT_ACQUIRED        0x2B QBOX
+UMASK_TXR_BL_NCB_CREDIT_ACQUIRED_VN0    0x01 0x01
+UMASK_TXR_BL_NCB_CREDIT_ACQUIRED_VN1    0x02 0x01
+
+EVENT_TXR_BL_NCB_CREDIT_OCCUPANCY       0x20 QBOX
+UMASK_TXR_BL_NCB_CREDIT_OCCUPANCY_VN0   0x01 0x01
+UMASK_TXR_BL_NCB_CREDIT_OCCUPANCY_VN1   0x02 0x01
+
+EVENT_TXR_BL_NCS_CREDIT_ACQUIRED        0x2C QBOX
+UMASK_TXR_BL_NCS_CREDIT_ACQUIRED_VN0    0x01 0x01
+UMASK_TXR_BL_NCS_CREDIT_ACQUIRED_VN1    0x02 0x01
+
+EVENT_TXR_BL_NCS_CREDIT_OCCUPANCY       0x21 QBOX
+UMASK_TXR_BL_NCS_CREDIT_OCCUPANCY_VN0   0x01 0x01
+UMASK_TXR_BL_NCS_CREDIT_OCCUPANCY_VN1   0x02 0x01
+
+EVENT_VNA_CREDIT_RETURNS                0x1C QBOX
+UMASK_VNA_CREDIT_RETURNS                0x00 0x01
+
+EVENT_VNA_CREDIT_RETURN_OCCUPANCY       0x1B QBOX
+UMASK_VNA_CREDIT_RETURN_OCCUPANCY       0x00 0x01
+
+EVENT_QPI_RATE                          0x00 QBOX0FIX0|QBOX1FIX0
+UMASK_QPI_RATE                          0x00
+
+EVENT_QPI_RX_IDLE                       0x01 QBOX0FIX1|QBOX1FIX1
+UMASK_QPI_RX_IDLE                       0x00
+
+EVENT_QPI_RX_LLR                        0x02 QBOX0FIX2|QBOX1FIX2
+UMASK_QPI_RX_LLR                        0x00
diff --git a/src/includes/perfmon_haswell_counters.h b/src/includes/perfmon_haswell_counters.h
index 3dc7247..9c2d754 100644
--- a/src/includes/perfmon_haswell_counters.h
+++ b/src/includes/perfmon_haswell_counters.h
@@ -3,15 +3,16 @@
  *
  *      Filename:  perfmon_haswell_counters.h
  *
- *      Description:  Counter Header File of perfmon module for Haswell.
+ *      Description:  Counter Header File of perfmon module for Intel Haswell.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -29,25 +30,51 @@
  */
 
 #define NUM_COUNTERS_HASWELL 12
-#define NUM_COUNTERS_UNCORE_HASWELL 4
 #define NUM_COUNTERS_CORE_HASWELL 8
+#define NUM_COUNTERS_UNCORE_HASWELL 4
+
+#define HAS_VALID_OPTIONS_FIXED EVENT_OPTION_ANYTHREAD_MASK|EVENT_OPTION_COUNT_KERNEL_MASK
+#define HAS_VALID_OPTIONS_PMC EVENT_OPTION_EDGE_MASK|EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_INVERT_MASK| \
+            EVENT_OPTION_ANYTHREAD_MASK|EVENT_OPTION_IN_TRANS_MASK|EVENT_OPTION_THRESHOLD_MASK
 
-static PerfmonCounterMap haswell_counter_map[NUM_COUNTERS_HASWELL] = {
+static RegisterMap haswell_counter_map[NUM_COUNTERS_HASWELL] = {
     /* Fixed Counters: instructions retired, cycles unhalted core */
-    {"FIXC0", PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0},
-    {"FIXC1", PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0},
-    {"FIXC2", PMC2, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR2, 0, 0},
+    {"FIXC0", PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0, HAS_VALID_OPTIONS_FIXED},
+    {"FIXC1", PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0, HAS_VALID_OPTIONS_FIXED},
+    {"FIXC2", PMC2, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR2, 0, 0, HAS_VALID_OPTIONS_FIXED},
     /* PMC Counters: 4 48bit wide */
-    {"PMC0", PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0},
-    {"PMC1", PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0},
-    {"PMC2", PMC5, PMC, MSR_PERFEVTSEL2, MSR_PMC2, 0, 0},
-    {"PMC3", PMC6, PMC, MSR_PERFEVTSEL3, MSR_PMC3, 0, 0},
+    {"PMC0", PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0, HAS_VALID_OPTIONS_PMC},
+    {"PMC1", PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0, HAS_VALID_OPTIONS_PMC},
+    {"PMC2", PMC5, PMC, MSR_PERFEVTSEL2, MSR_PMC2, 0, 0, HAS_VALID_OPTIONS_PMC|EVENT_OPTION_IN_TRANS_ABORT_MASK},
+    {"PMC3", PMC6, PMC, MSR_PERFEVTSEL3, MSR_PMC3, 0, 0, HAS_VALID_OPTIONS_PMC},
     /* Temperature Sensor*/
-    {"TMP0", PMC7, THERMAL, 0, 0, 0, 0},
+    {"TMP0", PMC7, THERMAL, 0, IA32_THERM_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
     /* RAPL counters */
-    {"PWR0", PMC8, POWER, 0, MSR_PKG_ENERGY_STATUS, 0, 0},
-    {"PWR1", PMC9, POWER, 0, MSR_PP0_ENERGY_STATUS, 0, 0},
-    {"PWR2", PMC10, POWER, 0, MSR_PP1_ENERGY_STATUS,  0, 0},
-    {"PWR3", PMC11, POWER, 0, MSR_DRAM_ENERGY_STATUS,  0, 0},
+    {"PWR0", PMC8, POWER, 0, MSR_PKG_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"PWR1", PMC9, POWER, 0, MSR_PP0_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"PWR2", PMC10, POWER, 0, MSR_PP1_ENERGY_STATUS,  0, 0, EVENT_OPTION_NONE_MASK},
+    {"PWR3", PMC11, POWER, 0, MSR_DRAM_ENERGY_STATUS,  0, 0, EVENT_OPTION_NONE_MASK},
+    /*{"CBOX0C0", PMC12, CBOX0, MSR_UNC_C0_PMON_CTL0, MSR_UNC_C0_PMON_CTR0, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"CBOX0C1", PMC13, CBOX0, MSR_UNC_C0_PMON_CTL1, MSR_UNC_C0_PMON_CTR1, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"CBOX1C0", PMC14, CBOX1, MSR_UNC_C1_PMON_CTL0, MSR_UNC_C1_PMON_CTR0, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"CBOX1C1", PMC15, CBOX1, MSR_UNC_C1_PMON_CTL1, MSR_UNC_C1_PMON_CTR1, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"CBOX2C0", PMC16, CBOX2, MSR_UNC_C2_PMON_CTL0, MSR_UNC_C2_PMON_CTR0, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"CBOX2C1", PMC17, CBOX2, MSR_UNC_C2_PMON_CTL1, MSR_UNC_C2_PMON_CTR1, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"CBOX3C0", PMC18, CBOX3, MSR_UNC_C3_PMON_CTL0, MSR_UNC_C3_PMON_CTR0, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"CBOX3C1", PMC19, CBOX3, MSR_UNC_C3_PMON_CTL1, MSR_UNC_C3_PMON_CTR1, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"UBOX0", PMC20, UBOX, MSR_UNC_U_PMON_CTL0, MSR_UNC_U_PMON_CTR0, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"UBOX1", PMC21, UBOX, MSR_UNC_U_PMON_CTL1, MSR_UNC_U_PMON_CTR1, 0, 0, EVENT_OPTION_NONE_MASK},*/
 };
 
+
+static BoxMap haswell_box_map[NUM_UNITS] = {
+    [PMC] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, 0, 0, 0, 48},
+    [THERMAL] = {0, 0, 0, 0, 0, 0, 8},
+    [FIXED] =  {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, 0, 0, 0, 48},
+    [POWER] = {0, 0, 0, 0, 0, 0, 32},
+    /*[CBOX0] = {MSR_UNC_C0_PMON_BOX_CTL, 0, 0, 0, 0, 44},
+    [CBOX1] = {MSR_UNC_C1_PMON_BOX_CTL, 0, 0, 0, 0, 44},
+    [CBOX2] = {MSR_UNC_C2_PMON_BOX_CTL, 0, 0, 0, 0, 44},
+    [CBOX3] = {MSR_UNC_C3_PMON_BOX_CTL, 0, 0, 0, 0, 44},
+    [UBOX] = {0, MSR_UNC_U_PMON_BOX_STATUS, MSR_UNC_U_PMON_BOX_STATUS, 0, 0, 44}*/
+};
diff --git a/src/includes/perfmon_haswell_events.txt b/src/includes/perfmon_haswell_events.txt
index f958a3a..a483d08 100644
--- a/src/includes/perfmon_haswell_events.txt
+++ b/src/includes/perfmon_haswell_events.txt
@@ -1,16 +1,17 @@
 # =======================================================================================
-#  
+#
 #      Filename:  perfmon_haswell_events.txt
-# 
-#      Description:  Event list for Intel Ivy Bridge
-# 
-#      Version:   3.1.3
-#      Released:  4.11.2014
-# 
-#      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+#
+#      Description:  Event list for Intel Haswell
+#
+#      Version:   4.0
+#      Released:  16.6.2015
+#
+#      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+#                Thomas Roehl (tr), thomas.roehl at googlemail.com
 #      Project:  likwid
 #
-#      Copyright (C) 2014 Jan Treibig
+#      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
 #
 #      This program is free software: you can redistribute it and/or modify it under
 #      the terms of the GNU General Public License as published by the Free Software
@@ -103,7 +104,7 @@ UMASK_L2_RQSTS_REFERENCES        0xFF
 EVENT_L2_DEMAND_RQST_WB_HIT            0x27   PMC
 UMASK_L2_DEMAND_RQST_WB_HIT       0x50
 
-EVENT_LONGEST_LAT_CACHE_REFERENCE               0x2E   PMC
+EVENT_LONGEST_LAT_CACHE               0x2E   PMC
 UMASK_LONGEST_LAT_CACHE_REFERENCE     0x4F
 UMASK_LONGEST_LAT_CACHE_MISS          0x41
 
@@ -115,7 +116,7 @@ EVENT_L1D_PEND_MISS              0x48   PMC1
 UMASK_L1D_PEND_MISS_PENDING      0x01
 
 EVENT_DTLB_STORE_MISSES                0x49   PMC
-UMASK_DTLB_STORE_MISSES_MISS_CAUSES_A_WALK   0x01
+UMASK_DTLB_STORE_MISSES_CAUSES_A_WALK   0x01
 UMASK_DTLB_STORE_MISSES_WALK_COMPLETED_4K       0x02
 UMASK_DTLB_STORE_MISSES_WALK_COMPLETED_LARGE    0x04
 UMASK_DTLB_STORE_MISSES_WALK_COMPLETED          0x0E
@@ -141,9 +142,9 @@ UMASK_MOVE_ELIMINATION_SIMD_NOT_ELIMINATED    0x08
 UMASK_MOVE_ELIMINATION_INT_ELIMINATED         0x01
 UMASK_MOVE_ELIMINATION_SIMD_ELIMINATED        0x02
 
-EVENT_CPL_CYCLES               0x5C    PMC
+EVENT_CPL_CYCLES                   0x5C    PMC
 UMASK_CPL_CYCLES_RING0             0x01
-UMASK_CPL_CYCLES_RING123             0x02
+UMASK_CPL_CYCLES_RING123           0x02
 
 EVENT_RS_EVENTS               0x5E    PMC
 UMASK_RS_EVENTS_EMPTY_CYCLES    0x01
@@ -201,19 +202,28 @@ EVENT_BR_INST_EXEC                                      0x88   PMC
 UMASK_BR_INST_EXEC_COND_TAKEN                           0x81
 UMASK_BR_INST_EXEC_COND_NON_TAKEN                       0x41
 UMASK_BR_INST_EXEC_DIRECT_JMP_TAKEN                     0x82
+UMASK_BR_INST_EXEC_DIRECT_JMP_NON_TAKEN                 0x42
 UMASK_BR_INST_EXEC_INDIRECT_JMP_NON_CALL_RET_TAKEN      0x84
+UMASK_BR_INST_EXEC_INDIRECT_JMP_NON_CALL_RET_NON_TAKEN  0x44
 UMASK_BR_INST_EXEC_RETURN_NEAR_TAKEN                    0x88
+UMASK_BR_INST_EXEC_RETURN_NEAR_NON_TAKEN                0x48
 UMASK_BR_INST_EXEC_DIRECT_NEAR_CALL_TAKEN               0x90
+UMASK_BR_INST_EXEC_DIRECT_NEAR_CALL_NON_TAKEN           0x50
 UMASK_BR_INST_EXEC_INDIRECT_NEAR_CALL_TAKEN             0xA0
+UMASK_BR_INST_EXEC_INDIRECT_NEAR_CALL_NON_TAKEN         0x60
 UMASK_BR_INST_EXEC_ALL_BRANCHES                         0xFF
 
 EVENT_BR_MISP_EXEC                                      0x89   PMC
 UMASK_BR_MISP_EXEC_COND_TAKEN                           0x81
 UMASK_BR_MISP_EXEC_COND_NON_TAKEN                       0x41
 UMASK_BR_MISP_EXEC_INDIRECT_JMP_NON_CALL_RET_TAKEN      0x84
+UMASK_BR_MISP_EXEC_INDIRECT_JMP_NON_CALL_RET_NON_TAKEN  0x44
 UMASK_BR_MISP_EXEC_RETURN_NEAR_TAKEN                    0x88
+UMASK_BR_MISP_EXEC_RETURN_NEAR_NON_TAKEN                0x48
 UMASK_BR_MISP_EXEC_DIRECT_NEAR_CALL_TAKEN               0x90
+UMASK_BR_MISP_EXEC_DIRECT_NEAR_CALL_NON_TAKEN           0x50
 UMASK_BR_MISP_EXEC_INDIRECT_NEAR_CALL_TAKEN             0xA0
+UMASK_BR_MISP_EXEC_INDIRECT_NEAR_CALL_NON_TAKEN         0x60
 UMASK_BR_MISP_EXEC_ALL_BRANCHES                         0xFF
 
 EVENT_IDQ_UOPS_NOT_DELIVERED                    0x9C   PMC
@@ -323,9 +333,6 @@ UMASK_FP_ASSIST_ANY               0x1E
 EVENT_ROB_MISC_EVENT_LBR_INSERTS               0xCC  PMC
 UMASK_ROB_MISC_EVENT_LBR_INSERTS               0x20
 
-EVENT_MEM_TRANS_RETIRED_LOAD_LAT               0xCD  PMC
-UMASK_MEM_TRANS_RETIRED_LOAD_LATENCY           0x01
-
 EVENT_MEM_UOP_RETIRED            0xD0    PMC
 UMASK_MEM_UOP_RETIRED_LOADS            0x81
 UMASK_MEM_UOP_RETIRED_STORES           0x82
@@ -380,39 +387,36 @@ EVENT_L2_LINES_OUT                  0xF2   PMC
 UMASK_L2_LINES_OUT_DEMAND_CLEAN     0x05
 UMASK_L2_LINES_OUT_DEMAND_DIRTY     0x06
 
-EVENT_TX_MEM_ABORT_CONFLICT          0x54   PMC
-UMASK_TX_MEM_ABORT_CONFLICT     0x01
-UMASK_TX_MEM_ABORT_CAPACITY     0x02
-UMASK_TX_MEM_ABORT_HLE_STORE_TO_ELIDED_LOCK     0x04
-UMASK_TX_MEM_ABORT_HLE_ELISION_BUFFER_NOT_EMPTY     0x08
-UMASK_TX_MEM_ABORT_HLE_ELISION_BUFFER_MISMATCH     0x10
-UMASK_TX_MEM_ABORT_HLE_ELISION_BUFFER_UNSUPP_ALIGNMENT     0x20
-UMASK_TX_MEM_ABORT_HLE_ELISION_BUFFER_FULL     0x40
-
-EVENT_TX_EXEC          0x5D   PMC
-UMASK_TX_EXEC_MISC1     0x01
-UMASK_TX_EXEC_MISC2     0x02
-UMASK_TX_EXEC_MISC3     0x04
-UMASK_TX_EXEC_MISC4     0x08
-UMASK_TX_EXEC_MISC5     0x10
-
-
-EVENT_HLE_RETIRED                  0xC8   PMC
-UMASK_HLE_RETIRED_START            0x01
-UMASK_HLE_RETIRED_COMMIT           0x02
-UMASK_HLE_RETIRED_ABORTED           0x04
-UMASK_HLE_RETIRED_ABORTED_MISC1     0x08
-UMASK_HLE_RETIRED_ABORTED_MISC2     0x10
-UMASK_HLE_RETIRED_ABORTED_MISC3     0x20
-UMASK_HLE_RETIRED_ABORTED_MISC4     0x40
-UMASK_HLE_RETIRED_ABORTED_MISC5     0x80
-
-EVENT_RTM_RETIRED                  0xC9   PMC
-UMASK_RTM_RETIRED_START            0x01
-UMASK_RTM_RETIRED_COMMIT           0x02
-UMASK_RTM_RETIRED_ABORTED           0x04
-UMASK_RTM_RETIRED_ABORTED_MISC1     0x08
-UMASK_RTM_RETIRED_ABORTED_MISC2     0x10
-UMASK_RTM_RETIRED_ABORTED_MISC3     0x20
-UMASK_RTM_RETIRED_ABORTED_MISC4     0x40
-UMASK_RTM_RETIRED_ABORTED_MISC5     0x80
+EVENT_OFFCORE_RESPONSE_0                            0xB7 PMC
+OPTIONS_OFFCORE_RESPONSE_0_OPTIONS                  EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_0_OPTIONS                    0x01 0xFF 0xFF
+UMASK_OFFCORE_RESPONSE_0_DMND_DATA_RD_ANY           0x01 0x00 0x10
+UMASK_OFFCORE_RESPONSE_0_DMND_RFO_ANY               0x01 0x01 0x10
+UMASK_OFFCORE_RESPONSE_0_DMND_CODE_RD_ANY           0x01 0x02 0x10
+UMASK_OFFCORE_RESPONSE_0_WB_ANY                     0x01 0x03 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_DATA_RD_ANY          0x01 0x04 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_RFO_ANY              0x01 0x05 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_CODE_RD_ANY          0x01 0x06 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_DATA_RD_ANY          0x01 0x07 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_RFO_ANY              0x01 0x08 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_CODE_RD_ANY          0x01 0x09 0x10
+UMASK_OFFCORE_RESPONSE_0_SPLIT_LOCK_UC_LOCK_ANY     0x01 0x0A 0x10
+UMASK_OFFCORE_RESPONSE_0_STREAMING_STORES_ANY       0x01 0x0B 0x10
+UMASK_OFFCORE_RESPONSE_0_OTHER_ANY                  0x01 0x0F 0x10
+
+EVENT_OFFCORE_RESPONSE_1                            0xBB PMC
+OPTIONS_OFFCORE_RESPONSE_1_OPTIONS                  EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_1_OPTIONS                    0x01 0xFF 0xFF
+UMASK_OFFCORE_RESPONSE_1_DMND_DATA_RD_ANY           0x01 0x00 0x10
+UMASK_OFFCORE_RESPONSE_1_DMND_RFO_ANY               0x01 0x01 0x10
+UMASK_OFFCORE_RESPONSE_1_DMND_CODE_RD_ANY           0x01 0x02 0x10
+UMASK_OFFCORE_RESPONSE_1_WB_ANY                     0x01 0x03 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_DATA_RD_ANY          0x01 0x04 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_RFO_ANY              0x01 0x05 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_CODE_RD_ANY          0x01 0x06 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_DATA_RD_ANY          0x01 0x07 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_RFO_ANY              0x01 0x08 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_CODE_RD_ANY          0x01 0x09 0x10
+UMASK_OFFCORE_RESPONSE_1_SPLIT_LOCK_UC_LOCK_ANY     0x01 0x0A 0x10
+UMASK_OFFCORE_RESPONSE_1_STREAMING_STORES_ANY       0x01 0x0B 0x10
+UMASK_OFFCORE_RESPONSE_1_OTHER_ANY                  0x01 0x0F 0x10
diff --git a/src/includes/perfmon_interlagos.h b/src/includes/perfmon_interlagos.h
index d28bb18..bf8eea0 100644
--- a/src/includes/perfmon_interlagos.h
+++ b/src/includes/perfmon_interlagos.h
@@ -5,13 +5,14 @@
  *
  *      Description:  Header file of perfmon module for AMD Interlagos
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -29,219 +30,264 @@
  */
 
 #include <perfmon_interlagos_events.h>
-#include <perfmon_interlagos_groups.h>
 #include <perfmon_interlagos_counters.h>
+#include <error.h>
 
 static int perfmon_numCountersInterlagos = NUM_COUNTERS_INTERLAGOS;
-static int perfmon_numGroupsInterlagos = NUM_GROUPS_INTERLAGOS;
 static int perfmon_numArchEventsInterlagos = NUM_ARCH_EVENTS_INTERLAGOS;
 
 
-void perfmon_init_interlagos(PerfmonThread *thread)
+int perfmon_init_interlagos(int cpu_id)
+{
+    lock_acquire((int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id);
+    return 0;
+}
+
+int ilg_pmc_setup(int cpu_id, RegisterIndex index, PerfmonEvent* event)
 {
     uint64_t flags = 0x0ULL;
-    int cpu_id = thread->processorId;
-
-    msr_write(cpu_id, MSR_AMD15_PERFEVTSEL0, 0x0ULL);
-    msr_write(cpu_id, MSR_AMD15_PERFEVTSEL1, 0x0ULL);
-    msr_write(cpu_id, MSR_AMD15_PERFEVTSEL2, 0x0ULL);
-    msr_write(cpu_id, MSR_AMD15_PERFEVTSEL3, 0x0ULL);
-    msr_write(cpu_id, MSR_AMD15_PERFEVTSEL4, 0x0ULL);
-    msr_write(cpu_id, MSR_AMD15_PERFEVTSEL5, 0x0ULL);
-
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id) ||
-            lock_acquire(
-                (int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id)
-       )
+
+    flags |= (1ULL<<16);
+    flags |= ((uint64_t)(event->eventId>>8)<<32) + (event->umask<<8) + (event->eventId & ~(0xF00U));
+
+    if (event->numberOfOptions > 0)
     {
-        msr_write(cpu_id, MSR_AMD15_NB_PERFEVTSEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_AMD15_NB_PERFEVTSEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_AMD15_NB_PERFEVTSEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_AMD15_NB_PERFEVTSEL3, 0x0ULL);
+        for(int j=0;j<event->numberOfOptions;j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_COUNT_KERNEL:
+                    flags |= (1ULL<<17);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    if ((event->options[j].value & 0xFFULL) < 0x20)
+                    {
+                        flags |= (event->options[j].value & 0xFFULL) << 24;
+                    }
+                    break;
+                default:
+                    break;
+            }
+        }
     }
-
-    //flags |= (1<<16);  /* user mode flag */
-    /*msr_write(cpu_id, MSR_AMD15_PERFEVTSEL0, flags);
-    msr_write(cpu_id, MSR_AMD15_PERFEVTSEL1, flags);
-    msr_write(cpu_id, MSR_AMD15_PERFEVTSEL2, flags);
-    msr_write(cpu_id, MSR_AMD15_PERFEVTSEL3, flags);
-    msr_write(cpu_id, MSR_AMD15_PERFEVTSEL4, flags);
-    msr_write(cpu_id, MSR_AMD15_PERFEVTSEL5, flags);*/
+    VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_PMC);
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+    return 0;
 }
 
-
-void perfmon_setupCounterThread_interlagos(
-        int thread_id,
-        PerfmonEvent* event,
-        PerfmonCounterIndex index)
+int ilg_uncore_setup(int cpu_id, RegisterIndex index, PerfmonEvent* event)
 {
-    uint64_t flags;
-    uint64_t reg = interlagos_counter_map[index].configRegister;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
-    perfmon_threadData[thread_id].counters[index].init = TRUE;
-
-    /* only one thread accesses Uncore */
-    if ( (interlagos_counter_map[index].type == UNCORE) &&
-            !(socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id) )
+    uint64_t flags = 0x0ULL;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
     {
-        return;
+        return 0;
     }
 
-    flags = (1<<16);
-    /* AMD uses a 12 bit Event mask: [35:32][7:0] */
     flags |= ((uint64_t)(event->eventId>>8)<<32) + (event->umask<<8) + (event->eventId & ~(0xF00U));
 
-    if (perfmon_verbose)
+    VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_UNCORE);
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+    return 0;
+}
+
+
+int perfmon_setupCounterThread_interlagos(int thread_id, PerfmonEventSet* eventSet)
+{
+    int haveLock = 0;
+    int cpu_id = groupSet->threads[thread_id].processorId;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
     {
-        printf("[%d] perfmon_setup_counter: Write Register 0x%llX , Flags: 0x%llX \n",
-                cpu_id,
-                LLU_CAST reg,
-                LLU_CAST flags);
+        haveLock = 1;
     }
 
-    msr_write(cpu_id, reg , flags);
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        RegisterType type = eventSet->events[i].type;
+        if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+        {
+            continue;
+        }
+        RegisterIndex index = eventSet->events[i].index;
+        PerfmonEvent *event = &(eventSet->events[i].event);
+        eventSet->events[i].threadCounter[thread_id].init = TRUE;
+        switch(type)
+        {
+            case PMC:
+                ilg_pmc_setup(cpu_id, index, event);
+                break;
+            case UNCORE:
+                ilg_uncore_setup(cpu_id, index, event);
+                break;
+            default:
+                break;
+        }
+    }
+    return 0;
 }
 
 
-void perfmon_startCountersThread_interlagos(int thread_id)
+int perfmon_startCountersThread_interlagos(int thread_id, PerfmonEventSet* eventSet)
 {
     int haveLock = 0;
-    uint64_t flags;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
+    uint64_t flags = 0x0ULL;
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
     {
         haveLock = 1;
     }
 
-    for ( int i=0; i<NUM_COUNTERS_INTERLAGOS; i++ )
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        if (perfmon_threadData[thread_id].counters[i].init == TRUE)
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
         {
-            if (interlagos_counter_map[i].type == PMC)
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
             {
-                msr_write(cpu_id, interlagos_counter_map[i].counterRegister , 0x0ULL);
-                flags = msr_read(cpu_id, interlagos_counter_map[i].configRegister);
-                flags |= (1<<22);  /* enable flag */
-
-                if (perfmon_verbose) 
-                {
-                    printf("perfmon_start_counters: Write Register 0x%llX , Flags: 0x%llX \n",
-                            LLU_CAST interlagos_counter_map[i].configRegister,
-                            LLU_CAST flags);
-                }
-
-                msr_write(cpu_id, interlagos_counter_map[i].configRegister , flags);
-
+                continue;
             }
-            else if ( interlagos_counter_map[i].type == UNCORE )
+            RegisterIndex index = eventSet->events[i].index;
+            uint32_t counter = counter_map[index].counterRegister;
+            uint32_t reg = counter_map[index].configRegister;
+            if (type == PMC || ((type == UNCORE) && (haveLock)))
             {
-                if(haveLock)
-                {
-                    msr_write(cpu_id, interlagos_counter_map[i].counterRegister , 0x0ULL);
-                    flags = msr_read(cpu_id, interlagos_counter_map[i].configRegister);
-                    flags |= (1<<22);  /* enable flag */
-
-                    if (perfmon_verbose)
-                    {
-                        printf("perfmon_start_counters: Write Register 0x%llX , Flags: 0x%llX \n",
-                                LLU_CAST interlagos_counter_map[i].configRegister,
-                                LLU_CAST flags);
-                    }
-
-                    msr_write(cpu_id, interlagos_counter_map[i].configRegister , flags);
-                }
+                CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter, 0x0ULL));
+                CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, reg, &flags));
+                flags |= (1<<22);  /* enable flag */
+                CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg, flags));
             }
         }
     }
+    return 0;
 }
 
-void perfmon_stopCountersThread_interlagos(int thread_id)
+int perfmon_stopCountersThread_interlagos(int thread_id, PerfmonEventSet* eventSet)
 {
-    uint64_t flags;
+    uint64_t flags = 0x0ULL;
     int haveLock = 0;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
+    uint64_t tmp;
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
     {
         haveLock = 1;
     }
 
-    for ( int i=0; i<NUM_COUNTERS_INTERLAGOS; i++ )
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        if ( perfmon_threadData[thread_id].counters[i].init == TRUE )
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
         {
-            if ( interlagos_counter_map[i].type == PMC )
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
             {
-                flags = msr_read(cpu_id,interlagos_counter_map[i].configRegister);
-                flags &= ~(1<<22);  /* clear enable flag */
-                msr_write(cpu_id, interlagos_counter_map[i].configRegister , flags);
-                perfmon_threadData[thread_id].counters[i].counterData =
-                    msr_read(cpu_id, interlagos_counter_map[i].counterRegister);
-
-                if (perfmon_verbose)
-                {
-                    printf("perfmon_stop_counters: Write Register 0x%llX , Flags: 0x%llX \n",
-                            LLU_CAST interlagos_counter_map[i].configRegister,
-                            LLU_CAST flags);
-                    printf("perfmon_stop_counters: Read Register 0x%llX , Flags: 0x%llX \n",
-                            LLU_CAST interlagos_counter_map[i].counterRegister,
-                            LLU_CAST perfmon_threadData[thread_id].counters[i].counterData);
-                }
-
+                continue;
             }
-            else if (interlagos_counter_map[i].type == UNCORE)
+            tmp = 0x0ULL;
+            RegisterIndex index = eventSet->events[i].index;
+            uint32_t counter = counter_map[index].counterRegister;
+            uint32_t reg = counter_map[index].configRegister;
+            switch (type)
             {
-                if(haveLock)
-                {
-                    flags = msr_read(cpu_id, interlagos_counter_map[i].configRegister);
+                case PMC:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, reg, &flags));
                     flags &= ~(1<<22);  /* clear enable flag */
-                    msr_write(cpu_id, interlagos_counter_map[i].configRegister , flags);
-
-                    if (perfmon_verbose)
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg, flags));
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &tmp));
+                    break;
+                case UNCORE:
+                    if (haveLock)
                     {
-                        printf("perfmon_stop_counters: Write Register 0x%llX , Flags: 0x%llX \n",
-                                LLU_CAST interlagos_counter_map[i].configRegister,
-                                LLU_CAST flags);
+                        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, reg, &flags));
+                        flags &= ~(1<<22);  /* clear enable flag */
+                        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg, flags));
+                        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &tmp));
                     }
-                    perfmon_threadData[thread_id].counters[i].counterData =
-                        msr_read(cpu_id, interlagos_counter_map[i].counterRegister);
-                }
+                    break;
+                default:
+                    break;
             }
+            eventSet->events[i].threadCounter[thread_id].counterData = field64(tmp, 0, box_map[type].regWidth);
         }
     }
+    return 0;
 }
 
 
-void perfmon_readCountersThread_interlagos(int thread_id)
+int perfmon_readCountersThread_interlagos(int thread_id, PerfmonEventSet* eventSet)
 {
     int haveLock = 0;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
+    int cpu_id = groupSet->threads[thread_id].processorId;
+    uint64_t tmp;
 
     if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
     {
         haveLock = 1;
     }
 
-
-    for (int i=0;i<NUM_COUNTERS_INTERLAGOS;i++)
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        if ( perfmon_threadData[thread_id].counters[i].init == TRUE )
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
         {
-            if ( interlagos_counter_map[i].type == UNCORE )
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
             {
-                if ( haveLock )
-                {
-                    perfmon_threadData[thread_id].counters[i].counterData =
-                        msr_read(cpu_id, interlagos_counter_map[i].counterRegister);
-                }
+                continue;
             }
-            else
+            tmp = 0x0ULL;
+            RegisterIndex index = eventSet->events[i].index;
+            uint32_t counter = counter_map[index].counterRegister;
+            switch (type)
             {
-                perfmon_threadData[thread_id].counters[i].counterData =
-                    msr_read(cpu_id, interlagos_counter_map[i].counterRegister);
+                case PMC:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &tmp));
+                    VERBOSEPRINTREG(cpu_id, counter, LLU_CAST tmp, READ_PMC);
+                    break;
+                case UNCORE:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &tmp));
+                    VERBOSEPRINTREG(cpu_id, counter, LLU_CAST tmp, READ_UNCORE);
+                    break;
+                default:
+                    break;
             }
+            eventSet->events[i].threadCounter[thread_id].counterData = field64(tmp, 0, box_map[type].regWidth);
         }
     }
+    return 0;
 }
 
+
+int perfmon_finalizeCountersThread_interlagos(int thread_id, PerfmonEventSet* eventSet)
+{
+    int haveLock = 0;
+    int cpu_id = groupSet->threads[thread_id].processorId;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+    {
+        haveLock = 1;
+    }
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        RegisterType type = eventSet->events[i].type;
+        if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+        {
+            continue;
+        }
+        RegisterIndex index = eventSet->events[i].index;
+        uint32_t reg = counter_map[index].configRegister;
+        if ((reg) && (((type == PMC)||(type == FIXED))||((type >= UNCORE) && (haveLock))))
+        {
+            VERBOSEPRINTREG(cpu_id, reg, LLU_CAST 0x0ULL, CLEAR_CTRL);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg, 0x0ULL));
+        }
+        eventSet->events[i].threadCounter[thread_id].init = FALSE;
+    }
+    return 0;
+}
diff --git a/src/includes/perfmon_interlagos_counters.h b/src/includes/perfmon_interlagos_counters.h
index a593f5a..d54caaa 100644
--- a/src/includes/perfmon_interlagos_counters.h
+++ b/src/includes/perfmon_interlagos_counters.h
@@ -5,13 +5,14 @@
  *
  *      Description:  Counter Header File of perfmon module for AMD Interlagos
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -31,18 +32,24 @@
 #define NUM_COUNTERS_INTERLAGOS 10
 #define NUM_COUNTERS_CORE_INTERLAGOS 6
 
-static PerfmonCounterMap interlagos_counter_map[NUM_COUNTERS_INTERLAGOS] = {
+/*
+ * Bit mask of the event options accepted by the Interlagos core counters.
+ * Every term must be a *_MASK value: EVENT_OPTION_THRESHOLD itself is the
+ * option type (it is used as a switch case label), not a bit mask. The
+ * expansion is parenthesised so it stays one operand inside larger expressions.
+ */
+#define ILG_VALID_OPTIONS_PMC (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK)
+
+/*
+ * Counter table for AMD Interlagos: six core counters (PMC0-PMC5) followed by
+ * four northbridge counters (UPMC0-UPMC3). Each entry pairs a counter name
+ * with an event-select MSR and a counter MSR; presumably the field order is
+ * key, index, type, config register, counter register, two unused fields and
+ * the valid-option mask -- TODO confirm against the RegisterMap definition.
+ * The northbridge entries give no option mask initializer.
+ */
+static RegisterMap interlagos_counter_map[NUM_COUNTERS_INTERLAGOS] = {
     /* Core counters */
-    {"PMC0",PMC0, PMC, MSR_AMD15_PERFEVTSEL0, MSR_AMD15_PMC0, 0, 0},
-    {"PMC1",PMC1, PMC, MSR_AMD15_PERFEVTSEL1, MSR_AMD15_PMC1, 0, 0},
-    {"PMC2",PMC2, PMC, MSR_AMD15_PERFEVTSEL2, MSR_AMD15_PMC2, 0, 0},
-    {"PMC3",PMC3, PMC, MSR_AMD15_PERFEVTSEL3, MSR_AMD15_PMC3, 0, 0},
-    {"PMC4",PMC4, PMC, MSR_AMD15_PERFEVTSEL4, MSR_AMD15_PMC4, 0, 0},
-    {"PMC5",PMC5, PMC, MSR_AMD15_PERFEVTSEL5, MSR_AMD15_PMC5, 0, 0},
+    {"PMC0",PMC0, PMC, MSR_AMD15_PERFEVTSEL0, MSR_AMD15_PMC0, 0, 0, ILG_VALID_OPTIONS_PMC},
+    {"PMC1",PMC1, PMC, MSR_AMD15_PERFEVTSEL1, MSR_AMD15_PMC1, 0, 0, ILG_VALID_OPTIONS_PMC},
+    {"PMC2",PMC2, PMC, MSR_AMD15_PERFEVTSEL2, MSR_AMD15_PMC2, 0, 0, ILG_VALID_OPTIONS_PMC},
+    {"PMC3",PMC3, PMC, MSR_AMD15_PERFEVTSEL3, MSR_AMD15_PMC3, 0, 0, ILG_VALID_OPTIONS_PMC},
+    {"PMC4",PMC4, PMC, MSR_AMD15_PERFEVTSEL4, MSR_AMD15_PMC4, 0, 0, ILG_VALID_OPTIONS_PMC},
+    {"PMC5",PMC5, PMC, MSR_AMD15_PERFEVTSEL5, MSR_AMD15_PMC5, 0, 0, ILG_VALID_OPTIONS_PMC},
     /* Northbridge counters */
     {"UPMC0",PMC6, UNCORE, MSR_AMD15_NB_PERFEVTSEL0, MSR_AMD15_NB_PMC0, 0, 0},
-    {"UPMC1",PMC7, UNCORE, MSR_AMD15_NB_PERFEVTSEL0, MSR_AMD15_NB_PMC0, 0, 0},
-    {"UPMC2",PMC8, UNCORE, MSR_AMD15_NB_PERFEVTSEL0, MSR_AMD15_NB_PMC0, 0, 0},
-    {"UPMC3",PMC9, UNCORE, MSR_AMD15_NB_PERFEVTSEL0, MSR_AMD15_NB_PMC0, 0, 0}
+    {"UPMC1",PMC7, UNCORE, MSR_AMD15_NB_PERFEVTSEL1, MSR_AMD15_NB_PMC1, 0, 0},
+    {"UPMC2",PMC8, UNCORE, MSR_AMD15_NB_PERFEVTSEL2, MSR_AMD15_NB_PMC2, 0, 0},
+    {"UPMC3",PMC9, UNCORE, MSR_AMD15_NB_PERFEVTSEL3, MSR_AMD15_NB_PMC3, 0, 0}
 };
 
+/*
+ * Per-unit properties for the two unit types used on Interlagos. Only the
+ * last initializer (48) is non-zero; presumably this is the counter width in
+ * bits that is read back as box_map[type].regWidth -- TODO confirm against
+ * the BoxMap definition.
+ */
+static BoxMap interlagos_box_map[NUM_UNITS] = {
+    [PMC] = {0, 0, 0, 0, 0, 0, 48},
+    [UNCORE] = {0, 0, 0, 0, 0, 0, 48},
+};
diff --git a/src/includes/perfmon_interlagos_events.txt b/src/includes/perfmon_interlagos_events.txt
index 1fa0a44..11fbdb0 100644
--- a/src/includes/perfmon_interlagos_events.txt
+++ b/src/includes/perfmon_interlagos_events.txt
@@ -1,16 +1,17 @@
 # =======================================================================================
-#  
+#
 #      Filename:  perfmon_interlagos_events.txt
-# 
+#
 #      Description:  Event list for AMD Interlagos
-# 
-#      Version:   3.1.3
-#      Released:  4.11.2014
-# 
-#      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+#
+#      Version:   4.0
+#      Released:  16.6.2015
+#
+#      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+#                Thomas Roehl (tr), thomas.roehl at googlemail.com
 #      Project:  likwid
 #
-#      Copyright (C) 2014 Jan Treibig
+#      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
 #
 #      This program is free software: you can redistribute it and/or modify it under
 #      the terms of the GNU General Public License as published by the Free Software
@@ -131,17 +132,23 @@ EVENT_UNIFIED_TLB_HIT       0x45    PMC0|PMC1|PMC2
 UMASK_UNIFIED_TLB_HIT_4KB_DATA       0x01
 UMASK_UNIFIED_TLB_HIT_2MB_DATA       0x02
 UMASK_UNIFIED_TLB_HIT_1GB_DATA       0x04
+UMASK_UNIFIED_TLB_HIT_ANY_DATA       0x07
 UMASK_UNIFIED_TLB_HIT_4KB_INSTR      0x10
 UMASK_UNIFIED_TLB_HIT_2MB_INSTR      0x20
 UMASK_UNIFIED_TLB_HIT_1GB_INSTR      0x40
+UMASK_UNIFIED_TLB_HIT_ANY_INSTR      0x70
+UMASK_UNIFIED_TLB_HIT_ANY            0x77
 
 EVENT_UNIFIED_TLB_MISS       0x46    PMC0|PMC1|PMC2
 UMASK_UNIFIED_TLB_MISS_4KB_DATA       0x01
 UMASK_UNIFIED_TLB_MISS_2MB_DATA       0x02
 UMASK_UNIFIED_TLB_MISS_1GB_DATA       0x04
+UMASK_UNIFIED_TLB_MISS_ANY_DATA       0x07
 UMASK_UNIFIED_TLB_MISS_4KB_INSTR      0x10
 UMASK_UNIFIED_TLB_MISS_2MB_INSTR      0x20
 UMASK_UNIFIED_TLB_MISS_1GB_INSTR      0x40
+UMASK_UNIFIED_TLB_MISS_ANY_INSTR      0x70
+UMASK_UNIFIED_TLB_MISS_ANY            0x77
 
 EVENT_MISALIGNED_ACCESS       0x47    PMC
 UMASK_MISALIGNED_ACCESS       0x00
@@ -230,6 +237,7 @@ EVENT_ITLB_L1_MISS_L2_MISS        0x085     PMC0|PMC1|PMC2
 UMASK_ITLB_L1_MISS_L2_MISS_4KB         0x01
 UMASK_ITLB_L1_MISS_L2_MISS_2MB         0x02
 UMASK_ITLB_L1_MISS_L2_MISS_1GB         0x04
+UMASK_ITLB_L1_MISS_L2_MISS_ANY         0x07
 
 EVENT_PIPELINE_RESTART_DUE_TO_ISB        0x086     PMC0|PMC1|PMC2
 UMASK_PIPELINE_RESTART_DUE_TO_ISB         0x00
@@ -387,6 +395,14 @@ UMASK_UNC_CPU_REQUEST_TO_MEMORY_LOCAL_REMOTE_IO_MEM          0x92
 UMASK_UNC_CPU_REQUEST_TO_MEMORY_LOCAL_REMOTE_IO_IO           0x91
 UMASK_UNC_CPU_REQUEST_TO_MEMORY_REMOTE_LOCAL_CPU_IO          0x64
 UMASK_UNC_CPU_REQUEST_TO_MEMORY_REMOTE_LOCAL_IO_IO           0x61
+UMASK_UNC_CPU_REQUEST_TO_MEMORY_LOCAL_ANY_CPU_MEM            0xB8
+UMASK_UNC_CPU_REQUEST_TO_MEMORY_LOCAL_ANY_CPU_IO             0xB4
+UMASK_UNC_CPU_REQUEST_TO_MEMORY_LOCAL_ANY_IO_MEM             0xB2
+UMASK_UNC_CPU_REQUEST_TO_MEMORY_LOCAL_ANY_IO_IO              0xB1
+UMASK_UNC_CPU_REQUEST_TO_MEMORY_ANY_LOCAL_CPU_IO             0xE4
+UMASK_UNC_CPU_REQUEST_TO_MEMORY_ANY_LOCAL_IO_IO              0xE1
+UMASK_UNC_CPU_REQUEST_TO_MEMORY_ANY_ANY_CPU_IO               0xF4
+UMASK_UNC_CPU_REQUEST_TO_MEMORY_ANY_ANY_IO_IO                0xF1
 
 EVENT_UNC_CACHE_BLOCK_COMMANDS                             0x0EA     UPMC
 UMASK_UNC_CACHE_BLOCK_COMMANDS_VICTIM_BLOCK                0x01
@@ -420,21 +436,97 @@ UMASK_UNC_GART_EVENTS_MISS              0x04
 UMASK_UNC_GART_EVENTS_REQUEST_WALK      0x08
 UMASK_UNC_GART_EVENTS_MULTIPLE_WALK     0x80
 
-EVENT_UNC_LINK_TRANSMIT_BW_L0         0x0F6     UPMC
-UMASK_UNC_LINK_TRANSMIT_BW_L0_USE     0x17
+EVENT_UNC_LINK_TRANSMIT_BW_L0           0x0F6     UPMC
+UMASK_UNC_LINK_TRANSMIT_BW_L0_USE     0x37
 UMASK_UNC_LINK_TRANSMIT_BW_L0_NOP     0x08
-
-EVENT_UNC_LINK_TRANSMIT_BW_L1         0x0F7     UPMC
-UMASK_UNC_LINK_TRANSMIT_BW_L1_USE     0x17
+UMASK_UNC_LINK_TRANSMIT_BW_L0_CMDS    0x01
+UMASK_UNC_LINK_TRANSMIT_BW_L0_DATA    0x02
+UMASK_UNC_LINK_TRANSMIT_BW_L0_BUF_REL 0x04
+UMASK_UNC_LINK_TRANSMIT_BW_L0_ADDR    0x10
+UMASK_UNC_LINK_TRANSMIT_BW_L0_CRC     0x20
+UMASK_UNC_LINK_TRANSMIT_BW_L0_S0_USE     0x37
+UMASK_UNC_LINK_TRANSMIT_BW_L0_S0_NOP     0x08
+UMASK_UNC_LINK_TRANSMIT_BW_L0_S0_CMDS    0x01
+UMASK_UNC_LINK_TRANSMIT_BW_L0_S0_DATA    0x02
+UMASK_UNC_LINK_TRANSMIT_BW_L0_S0_BUF_REL 0x04
+UMASK_UNC_LINK_TRANSMIT_BW_L0_S0_ADDR    0x10
+UMASK_UNC_LINK_TRANSMIT_BW_L0_S0_CRC     0x20
+UMASK_UNC_LINK_TRANSMIT_BW_L0_S1_USE     0xB7
+UMASK_UNC_LINK_TRANSMIT_BW_L0_S1_NOP     0x88
+UMASK_UNC_LINK_TRANSMIT_BW_L0_S1_CMDS    0x81
+UMASK_UNC_LINK_TRANSMIT_BW_L0_S1_DATA    0x82
+UMASK_UNC_LINK_TRANSMIT_BW_L0_S1_BUF_REL 0x84
+UMASK_UNC_LINK_TRANSMIT_BW_L0_S1_ADDR    0x90
+UMASK_UNC_LINK_TRANSMIT_BW_L0_S1_CRC     0xA0
+
+EVENT_UNC_LINK_TRANSMIT_BW_L1           0x0F7     UPMC
+UMASK_UNC_LINK_TRANSMIT_BW_L1_USE     0x37
 UMASK_UNC_LINK_TRANSMIT_BW_L1_NOP     0x08
-
-EVENT_UNC_LINK_TRANSMIT_BW_L2         0x0F8     UPMC
-UMASK_UNC_LINK_TRANSMIT_BW_L2_USE     0x17
+UMASK_UNC_LINK_TRANSMIT_BW_L1_CMDS    0x01
+UMASK_UNC_LINK_TRANSMIT_BW_L1_DATA    0x02
+UMASK_UNC_LINK_TRANSMIT_BW_L1_BUF_REL 0x04
+UMASK_UNC_LINK_TRANSMIT_BW_L1_ADDR    0x10
+UMASK_UNC_LINK_TRANSMIT_BW_L1_CRC     0x20
+UMASK_UNC_LINK_TRANSMIT_BW_L1_S0_USE     0x37
+UMASK_UNC_LINK_TRANSMIT_BW_L1_S0_NOP     0x08
+UMASK_UNC_LINK_TRANSMIT_BW_L1_S0_CMDS    0x01
+UMASK_UNC_LINK_TRANSMIT_BW_L1_S0_DATA    0x02
+UMASK_UNC_LINK_TRANSMIT_BW_L1_S0_BUF_REL 0x04
+UMASK_UNC_LINK_TRANSMIT_BW_L1_S0_ADDR    0x10
+UMASK_UNC_LINK_TRANSMIT_BW_L1_S0_CRC     0x20
+UMASK_UNC_LINK_TRANSMIT_BW_L1_S1_USE     0xB7
+UMASK_UNC_LINK_TRANSMIT_BW_L1_S1_NOP     0x88
+UMASK_UNC_LINK_TRANSMIT_BW_L1_S1_CMDS    0x81
+UMASK_UNC_LINK_TRANSMIT_BW_L1_S1_DATA    0x82
+UMASK_UNC_LINK_TRANSMIT_BW_L1_S1_BUF_REL 0x84
+UMASK_UNC_LINK_TRANSMIT_BW_L1_S1_ADDR    0x90
+UMASK_UNC_LINK_TRANSMIT_BW_L1_S1_CRC     0xA0
+
+EVENT_UNC_LINK_TRANSMIT_BW_L2           0x0F8     UPMC
+UMASK_UNC_LINK_TRANSMIT_BW_L2_USE     0x37
 UMASK_UNC_LINK_TRANSMIT_BW_L2_NOP     0x08
-
-EVENT_UNC_LINK_TRANSMIT_BW_L3         0x1F9     UPMC
-UMASK_UNC_LINK_TRANSMIT_BW_L3_USE     0x17
+UMASK_UNC_LINK_TRANSMIT_BW_L2_CMDS    0x01
+UMASK_UNC_LINK_TRANSMIT_BW_L2_DATA    0x02
+UMASK_UNC_LINK_TRANSMIT_BW_L2_BUF_REL 0x04
+UMASK_UNC_LINK_TRANSMIT_BW_L2_ADDR    0x10
+UMASK_UNC_LINK_TRANSMIT_BW_L2_CRC     0x20
+UMASK_UNC_LINK_TRANSMIT_BW_L2_S0_USE     0x37
+UMASK_UNC_LINK_TRANSMIT_BW_L2_S0_NOP     0x08
+UMASK_UNC_LINK_TRANSMIT_BW_L2_S0_CMDS    0x01
+UMASK_UNC_LINK_TRANSMIT_BW_L2_S0_DATA    0x02
+UMASK_UNC_LINK_TRANSMIT_BW_L2_S0_BUF_REL 0x04
+UMASK_UNC_LINK_TRANSMIT_BW_L2_S0_ADDR    0x10
+UMASK_UNC_LINK_TRANSMIT_BW_L2_S0_CRC     0x20
+UMASK_UNC_LINK_TRANSMIT_BW_L2_S1_USE     0xB7
+UMASK_UNC_LINK_TRANSMIT_BW_L2_S1_NOP     0x88
+UMASK_UNC_LINK_TRANSMIT_BW_L2_S1_CMDS    0x81
+UMASK_UNC_LINK_TRANSMIT_BW_L2_S1_DATA    0x82
+UMASK_UNC_LINK_TRANSMIT_BW_L2_S1_BUF_REL 0x84
+UMASK_UNC_LINK_TRANSMIT_BW_L2_S1_ADDR    0x90
+UMASK_UNC_LINK_TRANSMIT_BW_L2_S1_CRC     0xA0
+
+EVENT_UNC_LINK_TRANSMIT_BW_L3           0x1F9     UPMC
+UMASK_UNC_LINK_TRANSMIT_BW_L3_USE     0x37
 UMASK_UNC_LINK_TRANSMIT_BW_L3_NOP     0x08
+UMASK_UNC_LINK_TRANSMIT_BW_L3_CMDS    0x01
+UMASK_UNC_LINK_TRANSMIT_BW_L3_DATA    0x02
+UMASK_UNC_LINK_TRANSMIT_BW_L3_BUF_REL 0x04
+UMASK_UNC_LINK_TRANSMIT_BW_L3_ADDR    0x10
+UMASK_UNC_LINK_TRANSMIT_BW_L3_CRC     0x20
+UMASK_UNC_LINK_TRANSMIT_BW_L3_S0_USE     0x37
+UMASK_UNC_LINK_TRANSMIT_BW_L3_S0_NOP     0x08
+UMASK_UNC_LINK_TRANSMIT_BW_L3_S0_CMDS    0x01
+UMASK_UNC_LINK_TRANSMIT_BW_L3_S0_DATA    0x02
+UMASK_UNC_LINK_TRANSMIT_BW_L3_S0_BUF_REL 0x04
+UMASK_UNC_LINK_TRANSMIT_BW_L3_S0_ADDR    0x10
+UMASK_UNC_LINK_TRANSMIT_BW_L3_S0_CRC     0x20
+UMASK_UNC_LINK_TRANSMIT_BW_L3_S1_USE     0xB7
+UMASK_UNC_LINK_TRANSMIT_BW_L3_S1_NOP     0x88
+UMASK_UNC_LINK_TRANSMIT_BW_L3_S1_CMDS    0x81
+UMASK_UNC_LINK_TRANSMIT_BW_L3_S1_DATA    0x82
+UMASK_UNC_LINK_TRANSMIT_BW_L3_S1_BUF_REL 0x84
+UMASK_UNC_LINK_TRANSMIT_BW_L3_S1_ADDR    0x90
+UMASK_UNC_LINK_TRANSMIT_BW_L3_S1_CRC     0xA0
 
 EVENT_UNC_CPU_TO_DRAM             0x1E0     UPMC
 UMASK_UNC_CPU_TO_DRAM_LOCAL_TO_0  0x01
diff --git a/src/includes/perfmon_ivybridge.h b/src/includes/perfmon_ivybridge.h
index 0615c27..ec72900 100644
--- a/src/includes/perfmon_ivybridge.h
+++ b/src/includes/perfmon_ivybridge.h
@@ -3,15 +3,16 @@
  *
  *      Filename:  perfmon_ivybridge.h
  *
- *      Description:  Header File of perfmon module for Ivy Bridge.
+ *      Description:  Header File of perfmon module for Intel Ivy Bridge.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -28,777 +29,1377 @@
  * =======================================================================================
  */
 
+
 #include <perfmon_ivybridge_events.h>
-#include <perfmon_ivybridge_groups.h>
+#include <perfmon_ivybridgeEP_events.h>
 #include <perfmon_ivybridge_counters.h>
+#include <error.h>
+#include <affinity.h>
+#include <limits.h>
 
-
+static int perfmon_numCountersIvybridgeEP = NUM_COUNTERS_IVYBRIDGEEP;
+static int perfmon_numCoreCountersIvybridgeEP = NUM_COUNTERS_CORE_IVYBRIDGEEP;
+static int perfmon_numArchEventsIvybridgeEP = NUM_ARCH_EVENTS_IVYBRIDGEEP;
 static int perfmon_numCountersIvybridge = NUM_COUNTERS_IVYBRIDGE;
-static int perfmon_numGroupsIvybridge = NUM_GROUPS_IVYBRIDGE;
+static int perfmon_numCoreCountersIvybridge = NUM_COUNTERS_CORE_IVYBRIDGE;
 static int perfmon_numArchEventsIvybridge = NUM_ARCH_EVENTS_IVYBRIDGE;
 
-#define OFFSET_PMC 3
 
-void perfmon_init_ivybridge(PerfmonThread *thread)
+/*
+ * Per-CPU initialisation for Ivy Bridge.
+ *
+ * Attempts to take the socket lock and the tile lock on behalf of cpu_id
+ * (the results of both lock_acquire calls are not inspected here) and then
+ * writes zero to MSR_PEBS_ENABLE on that CPU.
+ *
+ * Returns 0 on the normal path; the behaviour on a failed write is
+ * determined by CHECK_MSR_WRITE_ERROR (defined elsewhere).
+ */
+int perfmon_init_ivybridge(int cpu_id)
 {
-    uint64_t flags = 0x0ULL;
-    int cpu_id = thread->processorId;
-
-    /* Initialize registers */
-    msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, 0x0ULL);
-    msr_write(cpu_id, MSR_PERFEVTSEL0, 0x0ULL);
-    msr_write(cpu_id, MSR_PERFEVTSEL1, 0x0ULL);
-    msr_write(cpu_id, MSR_PERFEVTSEL2, 0x0ULL);
-    msr_write(cpu_id, MSR_PERFEVTSEL3, 0x0ULL);
-    msr_write(cpu_id, MSR_PMC0, 0x0ULL);
-    msr_write(cpu_id, MSR_PMC1, 0x0ULL);
-    msr_write(cpu_id, MSR_PMC2, 0x0ULL);
-    msr_write(cpu_id, MSR_PMC3, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_FIXED_CTR0, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_FIXED_CTR1, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_FIXED_CTR2, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, 0x0ULL);
-    msr_write(cpu_id, MSR_PEBS_ENABLE, 0x0ULL);
-
-    /* initialize fixed counters
-     * FIXED 0: Instructions retired
-     * FIXED 1: Clocks unhalted core
-     * FIXED 2: Clocks unhalted ref */
-    //msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, 0x222ULL);
-
-    /* Preinit of PERFEVSEL registers */
-    //flags |= (1<<22);  /* enable flag */
-    //flags |= (1<<16);  /* user mode flag */
-
-    /*msr_write(cpu_id, MSR_PERFEVTSEL0, flags);
-    msr_write(cpu_id, MSR_PERFEVTSEL1, flags);
-    msr_write(cpu_id, MSR_PERFEVTSEL2, flags);
-    msr_write(cpu_id, MSR_PERFEVTSEL3, flags);*/
-
-    /* TODO Robust implementation which also works if stuff is not there */
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id) ||
-            lock_acquire((int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id))
-    {
-        if ( cpuid_info.model == IVYBRIDGE_EP )
+    /* One lock per socket and one per tile, each indexed through a lookup table keyed by cpu_id. */
+    lock_acquire((int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id);
+    lock_acquire((int*) &tile_lock[affinity_thread2tile_lookup[cpu_id]], cpu_id);
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PEBS_ENABLE, 0x0ULL));
+    return 0;
+}
+
+
+/*
+ * Compute the control bits of one fixed-purpose counter.
+ *
+ * Each fixed counter owns a 4-bit field at bit offset index*4. Bit 1 of the
+ * field is always set; EVENT_OPTION_COUNT_KERNEL adds bit 0 and
+ * EVENT_OPTION_ANYTHREAD adds bit 2 (presumably the user, OS and any-thread
+ * enables of IA32_FIXED_CTR_CTRL -- confirm against the Intel SDM). All other
+ * options are ignored.
+ *
+ * The function only returns the bits; it does not write any register and
+ * does not use cpu_id.
+ */
+uint32_t ivb_fixed_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+    uint32_t flags = 0x0UL;
+    flags |= (1ULL<<(1+(index*4)));
+    for(int j=0;j<event->numberOfOptions;j++)
+    {
+        switch (event->options[j].type)
         {
-            /* Only root can access pci address space in direct mode */
-            if (accessClient_mode != DAEMON_AM_DIRECT)
-            {
-                uint32_t  uflags = 0x10100U; /* enable freeze (bit 16), freeze (bit 8) */
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_0,  PCI_UNC_MC_PMON_BOX_CTL, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_1,  PCI_UNC_MC_PMON_BOX_CTL, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_2,  PCI_UNC_MC_PMON_BOX_CTL, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_3,  PCI_UNC_MC_PMON_BOX_CTL, uflags);
-
-                uflags = 0x0U;
-                uflags |= (1<<22);  /* enable flag */
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_0,  PCI_UNC_MC_PMON_CTL_0, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_0,  PCI_UNC_MC_PMON_CTL_1, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_0,  PCI_UNC_MC_PMON_CTL_2, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_0,  PCI_UNC_MC_PMON_CTL_3, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_1,  PCI_UNC_MC_PMON_CTL_0, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_1,  PCI_UNC_MC_PMON_CTL_1, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_1,  PCI_UNC_MC_PMON_CTL_2, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_1,  PCI_UNC_MC_PMON_CTL_3, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_2,  PCI_UNC_MC_PMON_CTL_0, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_2,  PCI_UNC_MC_PMON_CTL_1, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_2,  PCI_UNC_MC_PMON_CTL_2, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_2,  PCI_UNC_MC_PMON_CTL_3, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_3,  PCI_UNC_MC_PMON_CTL_0, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_3,  PCI_UNC_MC_PMON_CTL_1, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_3,  PCI_UNC_MC_PMON_CTL_2, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_3,  PCI_UNC_MC_PMON_CTL_3, uflags);
-
-                uflags |= (1<<19);  /* reset fixed counter */
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_0,  PCI_UNC_MC_PMON_FIXED_CTL, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_1,  PCI_UNC_MC_PMON_FIXED_CTL, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_2,  PCI_UNC_MC_PMON_FIXED_CTL, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_3,  PCI_UNC_MC_PMON_FIXED_CTL, uflags);
-
-                /* iMC counters need to be manually reset to zero */
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_0,  PCI_UNC_MC_PMON_CTR_0_A, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_0,  PCI_UNC_MC_PMON_CTR_0_B, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_0,  PCI_UNC_MC_PMON_CTR_1_A, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_0,  PCI_UNC_MC_PMON_CTR_1_B, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_0,  PCI_UNC_MC_PMON_CTR_2_A, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_0,  PCI_UNC_MC_PMON_CTR_2_B, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_0,  PCI_UNC_MC_PMON_CTR_3_A, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_0,  PCI_UNC_MC_PMON_CTR_3_B, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_1,  PCI_UNC_MC_PMON_CTR_0_A, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_1,  PCI_UNC_MC_PMON_CTR_0_B, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_1,  PCI_UNC_MC_PMON_CTR_1_A, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_1,  PCI_UNC_MC_PMON_CTR_1_B, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_1,  PCI_UNC_MC_PMON_CTR_2_A, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_1,  PCI_UNC_MC_PMON_CTR_2_B, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_1,  PCI_UNC_MC_PMON_CTR_3_A, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_1,  PCI_UNC_MC_PMON_CTR_3_B, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_2,  PCI_UNC_MC_PMON_CTR_0_A, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_2,  PCI_UNC_MC_PMON_CTR_0_B, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_2,  PCI_UNC_MC_PMON_CTR_1_A, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_2,  PCI_UNC_MC_PMON_CTR_1_B, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_2,  PCI_UNC_MC_PMON_CTR_2_A, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_2,  PCI_UNC_MC_PMON_CTR_2_B, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_2,  PCI_UNC_MC_PMON_CTR_3_A, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_2,  PCI_UNC_MC_PMON_CTR_3_B, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_3,  PCI_UNC_MC_PMON_CTR_0_A, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_3,  PCI_UNC_MC_PMON_CTR_0_B, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_3,  PCI_UNC_MC_PMON_CTR_1_A, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_3,  PCI_UNC_MC_PMON_CTR_1_B, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_3,  PCI_UNC_MC_PMON_CTR_2_A, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_3,  PCI_UNC_MC_PMON_CTR_2_B, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_3,  PCI_UNC_MC_PMON_CTR_3_A, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_3,  PCI_UNC_MC_PMON_CTR_3_B, 0x0U);
-
-#if 0
-                /* FIXME: Not yet tested/ working due to BIOS issues on test
-                 * machines */
-
-                /* QPI registers can be zeroed with single write */
-                uflags = 0x0103UL; /* freeze (bit 8), reset */
-                pci_write(cpu_id, PCI_QPI_DEVICE_PORT_0,  PCI_UNC_QPI_PMON_BOX_CTL, uflags);
-                pci_write(cpu_id, PCI_QPI_DEVICE_PORT_1,  PCI_UNC_QPI_PMON_BOX_CTL, uflags);
-                uflags = 0x0UL;
-                uflags |= (1UL<<22);  /* enable flag */
-                pci_write(cpu_id, PCI_QPI_DEVICE_PORT_0,  PCI_UNC_QPI_PMON_CTL_0, uflags);
-                pci_write(cpu_id, PCI_QPI_DEVICE_PORT_0,  PCI_UNC_QPI_PMON_CTL_1, uflags);
-                pci_write(cpu_id, PCI_QPI_DEVICE_PORT_0,  PCI_UNC_QPI_PMON_CTL_2, uflags);
-                pci_write(cpu_id, PCI_QPI_DEVICE_PORT_0,  PCI_UNC_QPI_PMON_CTL_3, uflags);
-                pci_write(cpu_id, PCI_QPI_DEVICE_PORT_1,  PCI_UNC_QPI_PMON_CTL_0, uflags);
-                pci_write(cpu_id, PCI_QPI_DEVICE_PORT_1,  PCI_UNC_QPI_PMON_CTL_1, uflags);
-                pci_write(cpu_id, PCI_QPI_DEVICE_PORT_1,  PCI_UNC_QPI_PMON_CTL_2, uflags);
-                pci_write(cpu_id, PCI_QPI_DEVICE_PORT_1,  PCI_UNC_QPI_PMON_CTL_3, uflags);
-
-
-                /* Cbo counters */
-                uflags = 0xF0103UL; /*enable freeze (bit 8), reset */
-                msr_write(cpu_id, MSR_UNC_C0_PMON_BOX_CTL, uflags);
-                msr_write(cpu_id, MSR_UNC_C1_PMON_BOX_CTL, uflags);
-                msr_write(cpu_id, MSR_UNC_C2_PMON_BOX_CTL, uflags);
-                msr_write(cpu_id, MSR_UNC_C3_PMON_BOX_CTL, uflags);
-
-                switch ( cpuid_topology.numCoresPerSocket )
-                {
-                    case 12:
-                        msr_write(cpu_id, MSR_UNC_C11_PMON_BOX_CTL, uflags);
-                        msr_write(cpu_id, MSR_UNC_C10_PMON_BOX_CTL, uflags);
-                    case 10:
-                        msr_write(cpu_id, MSR_UNC_C9_PMON_BOX_CTL, uflags);
-                        msr_write(cpu_id, MSR_UNC_C8_PMON_BOX_CTL, uflags);
-                    case 8:
-                        msr_write(cpu_id, MSR_UNC_C7_PMON_BOX_CTL, uflags);
-                        msr_write(cpu_id, MSR_UNC_C6_PMON_BOX_CTL, uflags);
-                    case 6:
-                        msr_write(cpu_id, MSR_UNC_C5_PMON_BOX_CTL, uflags);
-                        msr_write(cpu_id, MSR_UNC_C4_PMON_BOX_CTL, uflags);
-                }
-#endif
-            }
+            /* lowest bit of this counter's 4-bit field */
+            case EVENT_OPTION_COUNT_KERNEL:
+                flags |= (1ULL<<(index*4));
+                break;
+            /* third bit of this counter's 4-bit field */
+            case EVENT_OPTION_ANYTHREAD:
+                flags |= (1ULL<<(2+(index*4)));
+                break;
+            default:
+                break;
         }
     }
+    return flags;
 }
 
-#define BOX_GATE_IVB(channel,label) \
-    if (perfmon_verbose) { \
-        printf("[%d] perfmon_setup_counter (##label): Write Register 0x%llX , Flags: 0x%llX \n", \
-                cpu_id, \
-                LLU_CAST reg, \
-                LLU_CAST flags); \
-    } \
-    if(haveLock) { \
-        uflags = (1UL<<22);\
-        uflags |= (event->umask<<8) + event->eventId;  \
-        if (event->cfgBits == 0xFF) \
-        { \
-            uflags |= (1<<21); \
-        } \
-        pci_write(cpu_id, channel,  reg, uflags);  \
-    }
-
-
-void perfmon_setupCounterThread_ivybridge(
-        int thread_id,
-        PerfmonEvent* event,
-        PerfmonCounterIndex index)
+
+/*
+ * Program the config register of one general-purpose core counter.
+ *
+ * The config value always has bits 22 and 16 set (presumably the enable and
+ * user-mode bits of IA32_PERFEVTSELx -- confirm against the Intel SDM) plus
+ * the event's umask and event id. For events other than 0xB7/0xBB a non-zero
+ * cfgBits adds cmask/cfgBits to the upper 16 bits. Event options map to
+ * single bits: EDGE=18, COUNT_KERNEL=17, INVERT=23, ANYTHREAD=21.
+ *
+ * Events 0xB7 and 0xBB additionally write MSR_OFFCORE_RESP0 and
+ * MSR_OFFCORE_RESP1 respectively. The value is built from the MATCH0 (masked
+ * with 0x8FFF) and MATCH1 (shifted by 16) options, unless both cfgBits and
+ * cmask differ from 0xFF: then it is replaced by the two bits they select.
+ *
+ * Returns 0 on the normal path; the behaviour on a failed write is
+ * determined by CHECK_MSR_WRITE_ERROR (defined elsewhere).
+ */
+int ivb_pmc_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
 {
-    int haveLock = 0;
-    uint64_t flags;
-    uint32_t uflags;
-    uint64_t reg = ivybridge_counter_map[index].configRegister;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
-    uint64_t fixed_flags = msr_read(cpu_id, MSR_PERF_FIXED_CTR_CTRL);
-    uint64_t orig_fixed_flags = fixed_flags;
-    perfmon_threadData[thread_id].counters[index].init = TRUE;
+    uint32_t flags = 0x0UL;
+    uint64_t offcore_flags = 0x0ULL;
+    flags = (1ULL<<22)|(1ULL<<16);
 
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+    /* Intel with standard 8 bit event mask: [7:0] */
+    flags |= (event->umask<<8) + event->eventId;
+
+    /* set custom cfg and cmask */
+    if ((event->cfgBits != 0) &&
+        (event->eventId != 0xB7) &&
+        (event->eventId != 0xBB))
     {
-        haveLock = 1;
+        flags |= ((event->cmask<<8) + event->cfgBits)<<16;
     }
 
-    switch (ivybridge_counter_map[index].type)
+    if (event->numberOfOptions > 0)
     {
-        case PMC:
-
-
-            //flags = msr_read(cpu_id,reg);
-            //flags &= ~(0xFFFFU);   /* clear lower 16bits */
-            flags = (1<<22)|(1<<16);
+        for(int j=0;j<event->numberOfOptions;j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_COUNT_KERNEL:
+                    flags |= (1ULL<<17);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_ANYTHREAD:
+                    flags |= (1ULL<<21);
+                    break;
 
-            /* Intel with standard 8 bit event mask: [7:0] */
-            flags |= (event->umask<<8) + event->eventId;
+                /* MATCH0/MATCH1 only feed the offcore response value, not the config register */
+                case EVENT_OPTION_MATCH0:
+                    offcore_flags |= (event->options[j].value & 0x8FFF);
+                    break;
+                case EVENT_OPTION_MATCH1:
+                    offcore_flags |= (event->options[j].value<<16);
+                    break;
+                default:
+                    break;
+            }
+        }
+    }
+    if (event->eventId == 0xB7)
+    {
+        /* predefined bit positions from the event definition override the MATCH options */
+        if ((event->cfgBits != 0xFF) && (event->cmask != 0xFF))
+        {
+            offcore_flags = (1ULL<<event->cfgBits)|(1ULL<<event->cmask);
+        }
+        VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP0, LLU_CAST offcore_flags, SETUP_PMC_OFFCORE);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP0, offcore_flags));
+    }
+    else if (event->eventId == 0xBB)
+    {
+        /* same handling as 0xB7, but targeting the second offcore response register */
+        if ((event->cfgBits != 0xFF) && (event->cmask != 0xFF))
+        {
+            offcore_flags = (1ULL<<event->cfgBits)|(1ULL<<event->cmask);
+        }
+        VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP1, LLU_CAST offcore_flags, SETUP_PMC_OFFCORE);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP1, offcore_flags));
+    }
+    /* terminate the trace macro like every other call site does */
+    VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_PMC);
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+    return 0;
+}
 
-            if (event->cfgBits != 0) /* set custom cfg and cmask */
+/*
+ * Program one BBOX counter (the filter registers used here are named
+ * PCI_UNC_HA_PMON_*) through its PCI device.
+ *
+ * Does nothing and returns 0 when cpu_id does not own the socket lock.
+ * Returns -ENODEV when the counter's PCI device is not available.
+ * Otherwise the config value is built from bits 22 and 20, the event's umask
+ * and event id, and the options:
+ *   EDGE       sets bit 18
+ *   THRESHOLD  puts the low 5 bits of the value at bit 24
+ *   OPCODE     writes the low 6 bits of the value to the opcode match register
+ *   MATCH0     writes bits 6..31 of the value to address match register 0
+ *              and bits 32..45 to address match register 1
+ * Finally the config value is written to the counter's config register.
+ *
+ * All writes target a PCI device, so all of them are checked with
+ * CHECK_PCI_WRITE_ERROR; the behaviour on failure is determined by that
+ * macro (defined elsewhere).
+ */
+int ivb_bbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+    uint64_t flags = 0x0UL;
+    uint64_t filter = 0x0UL;
+    uint32_t reg = counter_map[index].configRegister;
+    PciDeviceIndex dev = counter_map[index].device;
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+    {
+        return 0;
+    }
+    if (!pci_checkDevice(dev, cpu_id))
+    {
+        return -ENODEV;
+    }
+    flags = (1ULL<<22)|(1ULL<<20);
+    flags |= (event->umask<<8) + event->eventId;
+    if (event->numberOfOptions > 0)
+    {
+        for (int j=0;j < event->numberOfOptions; j++)
+        {
+            switch (event->options[j].type)
             {
-                flags &= ~(0xFFFFU<<16);  /* clear upper 16bits */
-                flags |= ((event->cmask<<8) + event->cfgBits)<<16;
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= ((event->options[j].value & 0x1FULL) << 24);
+                    break;
+                case EVENT_OPTION_OPCODE:
+                    filter = (event->options[j].value & 0x3FULL);
+                    /* trace the value that is actually written, not the counter config */
+                    VERBOSEPRINTPCIREG(cpu_id, dev, PCI_UNC_HA_PMON_OPCODEMATCH, filter, SETUP_OPCODE_FILTER);
+                    CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, PCI_UNC_HA_PMON_OPCODEMATCH, filter));
+                    break;
+                case EVENT_OPTION_MATCH0:
+                    /* the 64-bit match value is split over two registers */
+                    filter = ((event->options[j].value & 0xFFFFFFC0ULL));
+                    VERBOSEPRINTPCIREG(cpu_id, dev, PCI_UNC_HA_PMON_ADDRMATCH0, filter, SETUP_ADDR0_FILTER);
+                    CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, PCI_UNC_HA_PMON_ADDRMATCH0, filter));
+                    filter = (((event->options[j].value>>32) & 0x3FFFULL));
+                    VERBOSEPRINTPCIREG(cpu_id, dev, PCI_UNC_HA_PMON_ADDRMATCH1, filter, SETUP_ADDR1_FILTER);
+                    CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, PCI_UNC_HA_PMON_ADDRMATCH1, filter));
+                    break;
+                default:
+                    break;
             }
+        }
+    }
+    VERBOSEPRINTPCIREG(cpu_id, dev, reg, flags, SETUP_BBOX);
+    CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, reg, flags));
+    return 0;
+}
 
-            if (perfmon_verbose)
+/*
+ * Generic setup of a counter that is configured through a PCI device.
+ *
+ * Does nothing and returns 0 when cpu_id does not own the socket lock.
+ * Returns -ENODEV when the counter's PCI device is not available.
+ * Otherwise writes a config value consisting of bits 22 and 20, the event's
+ * umask and event id, bit 18 for the EDGE option and the low 5 bits of the
+ * THRESHOLD option value at bit 24. All other options are ignored.
+ *
+ * Returns 0 on the normal path; the behaviour on a failed write is
+ * determined by CHECK_PCI_WRITE_ERROR (defined elsewhere).
+ */
+int ivb_pci_box_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+    uint64_t flags = 0x0UL;
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+    {
+        return 0;
+    }
+    if (!pci_checkDevice(counter_map[index].device, cpu_id))
+    {
+        return -ENODEV;
+    }
+    flags = (1ULL<<22)|(1ULL<<20);
+    flags |= (event->umask<<8) + event->eventId;
+    if (event->numberOfOptions > 0)
+    {
+        for (int j=0;j < event->numberOfOptions; j++)
+        {
+            switch (event->options[j].type)
             {
-                printf("[%d] perfmon_setup_counter PMC: Write Register 0x%llX , Flags: 0x%llX \n",
-                        cpu_id,
-                        LLU_CAST reg,
-                        LLU_CAST flags);
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= ((event->options[j].value & 0x1FULL) << 24);
+                    break;
+                default:
+                    break;
             }
+        }
+    }
+    VERBOSEPRINTPCIREG(cpu_id, counter_map[index].device, counter_map[index].configRegister,
+                        flags, SETUP_BOX);
+    CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, counter_map[index].device,
+                                     counter_map[index].configRegister, flags));
+    return 0;
+}
 
-            msr_write(cpu_id, reg , flags);
-            break;
-
-        case FIXED:
-            fixed_flags |= (0x2ULL<<(index*4));
-            break;
-
-        case POWER:
-            break;
+/*
+ * Set up the fixed counter of a memory controller box.
+ *
+ * Only the CPU that owns the socket lock performs the write; every other CPU
+ * returns 0 immediately. Returns -ENODEV when the counter's PCI device is not
+ * available. Otherwise a config value with only bit 22 set is written to the
+ * counter's config register. The event argument is not used.
+ *
+ * Returns 0 on the normal path; the behaviour on a failed write is
+ * determined by CHECK_PCI_WRITE_ERROR (defined elsewhere).
+ */
+int ivb_mboxfix_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+    uint64_t flags = 0x0ULL;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+    {
+        if (!pci_checkDevice(counter_map[index].device, cpu_id))
+        {
+            return -ENODEV;
+        }
+
+        PciDeviceIndex dev = counter_map[index].device;
+        uint32_t reg = counter_map[index].configRegister;
+
+        flags = (1ULL<<22);
+        VERBOSEPRINTPCIREG(cpu_id, dev, reg, flags, SETUP_MBOXFIX);
+        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, reg, flags));
+    }
+    return 0;
+}
 
-        case MBOX0:
-            BOX_GATE_IVB(PCI_IMC_DEVICE_CH_0,MBOX0);
-            break;
+/*
+ * Program one counter of a QPI box (SBOX) and, if requested through event
+ * options, the MATCH/MASK filter registers of the corresponding port.
+ *
+ * cpu_id     CPU performing the access. Nothing is written and 0 is returned
+ *            unless this CPU holds the socket lock of its socket.
+ * index      entry in counter_map describing the counter to configure.
+ * event      event definition (eventId, umask, cfgBits and options).
+ * filterdev  PCI device that holds the MATCH/MASK filter registers.
+ *
+ * Returns 0 on success (or when the CPU is not the lock owner) and -ENODEV
+ * when the PCI device of the counter is not available. The CHECK_* macros
+ * are defined outside this block and presumably return early when an access
+ * fails.
+ */
+int ivb_sbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event, PciDeviceIndex filterdev)
+{
+    uint64_t flags = 0x0ULL;
+    uint32_t filterreg = 0x0U;
+    uint64_t filterval = 0x0ULL;
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+    {
+        return 0;
+    }
+    if (!pci_checkDevice(counter_map[index].device, cpu_id))
+    {
+        return -ENODEV;
+    }
+    PciDeviceIndex dev = counter_map[index].device;
+    flags = (1ULL<<22)|(1ULL<<20);
+    flags |= (event->umask<<8) + event->eventId;
+    if (event->cfgBits != 0x0)
+    {
+        /* OR bit 21 into the configuration. A plain assignment here would
+         * throw away the bits set above (22, 20, umask and event id). */
+        flags |= (1ULL<<21);
+    }
+    if (event->numberOfOptions > 0)
+    {
+        for (int j=0;j < event->numberOfOptions; j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= ((event->options[j].value & 0x1FULL) << 24);
+                    break;
+                /* The four filter options below are written to 'filterdev',
+                 * the same device that is checked for availability and that
+                 * is reported in the verbose output. */
+                case EVENT_OPTION_MATCH0:
+                    if (pci_checkDevice(filterdev, cpu_id))
+                    {
+                        filterreg = PCI_UNC_QPI_PMON_MATCH_0;
+                        filterval = event->options[j].value & 0x8003FFF8ULL;
+                        VERBOSEPRINTPCIREG(cpu_id, filterdev, filterreg, filterval, SETUP_SBOX_MATCH0);
+                        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, filterdev, filterreg, filterval));
+                    }
+                    else
+                    {
+                        DEBUG_PRINT(DEBUGLEV_ONLY_ERROR, Filtering for counter %s cannot be applied. PCI device not available, counter_map[index].key);
+                    }
+                    break;
+                case EVENT_OPTION_MATCH1:
+                    if (pci_checkDevice(filterdev, cpu_id))
+                    {
+                        filterreg = PCI_UNC_QPI_PMON_MATCH_1;
+                        filterval = event->options[j].value & 0x000F000FULL;
+                        VERBOSEPRINTPCIREG(cpu_id, filterdev, filterreg, filterval, SETUP_SBOX_MATCH1);
+                        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, filterdev, filterreg, filterval));
+                    }
+                    else
+                    {
+                        DEBUG_PRINT(DEBUGLEV_ONLY_ERROR, Filtering for counter %s cannot be applied. PCI device not available, counter_map[index].key);
+                    }
+                    break;
+                case EVENT_OPTION_MASK0:
+                    if (pci_checkDevice(filterdev, cpu_id))
+                    {
+                        filterreg = PCI_UNC_QPI_PMON_MASK_0;
+                        filterval = event->options[j].value & 0x8003FFF8ULL;
+                        VERBOSEPRINTPCIREG(cpu_id, filterdev, filterreg, filterval, SETUP_SBOX_MASK0);
+                        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, filterdev, filterreg, filterval));
+                    }
+                    else
+                    {
+                        DEBUG_PRINT(DEBUGLEV_ONLY_ERROR, Filtering for counter %s cannot be applied. PCI device not available, counter_map[index].key);
+                    }
+                    break;
+                case EVENT_OPTION_MASK1:
+                    if (pci_checkDevice(filterdev, cpu_id))
+                    {
+                        filterreg = PCI_UNC_QPI_PMON_MASK_1;
+                        filterval = event->options[j].value & 0x000F000FULL;
+                        VERBOSEPRINTPCIREG(cpu_id, filterdev, filterreg, filterval, SETUP_SBOX_MASK1);
+                        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, filterdev, filterreg, filterval));
+                    }
+                    else
+                    {
+                        DEBUG_PRINT(DEBUGLEV_ONLY_ERROR, Filtering for counter %s cannot be applied. PCI device not available, counter_map[index].key);
+                    }
+                    break;
+                default:
+                    break;
+            }
+        }
+    }
+    VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].configRegister, flags, SETUP_SBOX);
+    CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, flags));
+    return 0;
+}
 
-        case MBOX1:
-            BOX_GATE_IVB(PCI_IMC_DEVICE_CH_1,MBOX1);
-            break;
+/*
+ * Program one counter of a CBOX and, when event options are present, the
+ * two filter registers of that box.
+ *
+ * Returns 0 on success and also when cpu_id does not hold the socket lock
+ * (in which case nothing is written). CHECK_MSR_WRITE_ERROR is defined
+ * outside this block and presumably returns early on a failed write.
+ */
+int ivb_cbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+    uint32_t flags = 0x0UL;
+    uint64_t mask = 0x0ULL;
+    /* Uncore registers are only touched by the socket lock owner. */
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+    {
+        return 0;
+    }
+    flags = (1ULL<<22);
+    flags |= (event->umask<<8) + event->eventId;
+    /* Filter values are only collected and written when at least one option
+     * is given; this includes the default state filter for event 0x34. */
+    if (event->numberOfOptions > 0)
+    {
+        RegisterType type = counter_map[index].type;
+        uint64_t filter0 = 0x0ULL;
+        uint64_t filter1 = 0x0ULL;
+        int state_set = 0;
+        for (int j=0;j < event->numberOfOptions; j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= ((event->options[j].value & 0x1FULL) << 24);
+                    break;
+                case EVENT_OPTION_TID:
+                    /* Bit 19 goes into the counter config, the id itself
+                     * into the low five bits of filter0. */
+                    flags |= (1<<19);
+                    filter0 |= (event->options[j].value & 0x1FULL);
+                    break;
+                case EVENT_OPTION_STATE:
+                    filter0 |= ((event->options[j].value & 0x3FULL) << 17);
+                    state_set = 1;
+                    break;
+                case EVENT_OPTION_NID:
+                    /* Build a mask with one bit per NUMA domain; the option
+                     * is ignored unless it selects at least one of them. */
+                    mask = 0x0ULL;
+                    for (int i=0; i<affinityDomains.numberOfNumaDomains;i++)
+                        mask |= (1ULL<<i);
 
-        case MBOX2:
-            BOX_GATE_IVB(PCI_IMC_DEVICE_CH_2,MBOX2);
-            break;
+                    if (event->options[j].value & mask)
+                    {
+                        filter1 |= (event->options[j].value & 0xFFFFULL);
+                    }
+                    break;
+                case EVENT_OPTION_OPCODE:
+                    filter1 |= ((event->options[j].value & 0x1FFULL) << 20);
+                    break;
+                case EVENT_OPTION_MATCH0:
+                    filter1 |= ((event->options[j].value & 0x3) << 30);
+                    break;
+                default:
+                    break;
+            }
+        }
+        /* Event 0x34 without an explicit state option gets 0x1F as state
+         * filter. NOTE(review): this default is not applied when the event
+         * has no options at all - confirm that this is intended. */
+        if (state_set == 0 && event->eventId == 0x34)
+        {
+            filter0 |= (0x1FULL<<17);
+        }
+        /* A filter register is left untouched when its value is zero. */
+        if (filter0 != 0x0ULL)
+        {
+            VERBOSEPRINTREG(cpu_id, box_map[type].filterRegister1, filter0, SETUP_CBOX_FILTER0);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[type].filterRegister1, filter0));
+        }
+        if (filter1 != 0x0ULL)
+        {
+            VERBOSEPRINTREG(cpu_id, box_map[type].filterRegister2, filter1, SETUP_CBOX_FILTER1);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[type].filterRegister2, filter1));
+        }
+    }
+    VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_CBOX);
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+    return 0;
+}
 
-        case MBOX3:
-            BOX_GATE_IVB(PCI_IMC_DEVICE_CH_3,MBOX3);
-            break;
+/*
+ * Set up one of the general-purpose UBOX counters.
+ *
+ * The configuration starts from bits 22, 20 and 17, gets the umask and the
+ * event id merged in and is extended by the edge (bit 18) and threshold
+ * (five bits starting at bit 24) options. All other options are ignored.
+ *
+ * Returns 0; nothing is written when cpu_id does not hold the socket lock.
+ * CHECK_MSR_WRITE_ERROR is defined outside this block and presumably
+ * returns early on a failed write.
+ */
+int ivb_ubox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+    uint32_t config = 0x0UL;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+    {
+        return 0;
+    }
+
+    config = (1ULL<<22)|(1ULL<<20)|(1ULL<<17);
+    config |= (event->umask<<8) + event->eventId;
+
+    /* The loop body never runs for events without options. */
+    for (int opt = 0; opt < event->numberOfOptions; opt++)
+    {
+        if (event->options[opt].type == EVENT_OPTION_EDGE)
+        {
+            config |= (1ULL<<18);
+        }
+        else if (event->options[opt].type == EVENT_OPTION_THRESHOLD)
+        {
+            config |= ((event->options[opt].value & 0x1F) << 24);
+        }
+    }
+
+    VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, config, SETUP_UBOX);
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, config));
+    return 0;
+}
 
-        case SBOX0:
+/*
+ * Set up the fixed UBOX counter.
+ *
+ * The configuration is the constant value with bits 22 and 20 set; 'event'
+ * is not read. The register is only written by the CPU holding the socket
+ * lock; every other CPU returns 0 without any access.
+ */
+int ivb_uboxfix_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+    const uint32_t config = (1ULL<<22)|(1ULL<<20);
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+    {
+        VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST config, SETUP_UBOXFIX)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, config));
+    }
+    return 0;
+}
 
-            /* CTO_COUNT event requires programming of MATCH/MASK registers */
-            if (event->eventId == 0x38)
+/*
+ * Program one WBOX counter and, for the MATCH0 option, the filter register
+ * of the box.
+ *
+ * Returns 0; nothing is written when cpu_id does not hold the socket lock.
+ * CHECK_MSR_WRITE_ERROR is defined outside this block and presumably
+ * returns early on a failed write.
+ */
+int ivb_wbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+    uint32_t flags = 0x0UL;
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+    {
+        return 0;
+    }
+    flags = (1ULL<<22)|(1ULL<<20);
+    /* Only the event id is merged in; no umask is used for this box. */
+    flags |= event->eventId;
+    if (event->cfgBits != 0x0)
+    {
+        /* Only the lowest bit of cfgBits is used; it ends up in bit 21. */
+        flags |= ((event->cfgBits & 0x1) << 21);
+    }
+    if (event->numberOfOptions > 0)
+    {
+        RegisterType type = counter_map[index].type;
+        for (int j=0;j < event->numberOfOptions; j++)
+        {
+            switch (event->options[j].type)
             {
-                if(haveLock)
-                {
-                    //uflags = pci_read(cpu_id, PCI_QPI_DEVICE_PORT_0, reg);
-                    //uflags &= ~(0xFFFFU);
-                    uflags = (1UL<<22);
-                    uflags |= (1UL<<21) + event->eventId; /* Set extension bit */
-                    printf("UFLAGS 0x%x \n",uflags);
-                    pci_write(cpu_id, PCI_QPI_DEVICE_PORT_0,  reg, uflags);
-
-                    /* program MATCH0 */
-                    uflags = 0x0UL;
-                    uflags = (event->cmask<<13) + (event->umask<<8);
-                    printf("MATCH UFLAGS 0x%x \n",uflags);
-                    pci_write(cpu_id, PCI_QPI_MASK_DEVICE_PORT_0, PCI_UNC_QPI_PMON_MATCH_0, uflags);
-
-                    /* program MASK0 */
-                    uflags = 0x0UL;
-                    uflags = (0x3F<<12) + (event->cfgBits<<4);
-                    printf("MASK UFLAGS 0x%x \n",uflags);
-                    pci_write(cpu_id, PCI_QPI_MASK_DEVICE_PORT_0, PCI_UNC_QPI_PMON_MASK_0, uflags);
-                }
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= ((event->options[j].value & 0x1F) << 24);
+                    break;
+                case EVENT_OPTION_OCCUPANCY:
+                    flags |= ((event->options[j].value & 0x3) << 14);
+                    break;
+                case EVENT_OPTION_OCCUPANCY_INVERT:
+                    flags |= (1ULL<<30);
+                    break;
+                case EVENT_OPTION_OCCUPANCY_EDGE:
+                    flags |= (1ULL<<31);
+                    break;
+                /* The filter value (low 32 bits of the option) is written
+                 * right away, before the counter configuration itself. */
+                case EVENT_OPTION_MATCH0:
+                    VERBOSEPRINTREG(cpu_id, box_map[type].filterRegister1,
+                                    event->options[j].value & 0xFFFFFFFFULL, SETUP_WBOX_FILTER);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV,
+                                    box_map[type].filterRegister1,
+                                    event->options[j].value & 0xFFFFFFFFULL));
+                    break;
+                default:
+                    break;
             }
-            else
+        }
+    }
+    VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_WBOX);
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+    return 0;
+}
+
+/*
+ * Program one IBOX counter through its PCI device.
+ *
+ * Returns 0 on success and when cpu_id does not hold the socket lock, and
+ * -ENODEV when the PCI device of the counter is not available.
+ * CHECK_PCI_WRITE_ERROR is defined outside this block and presumably
+ * returns early on a failed write.
+ */
+int ivb_ibox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+    uint32_t flags = 0x0UL;
+    PciDeviceIndex dev = counter_map[index].device;
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+    {
+        return 0;
+    }
+    if (!pci_checkDevice(dev, cpu_id))
+    {
+        return -ENODEV;
+    }
+    flags = (1ULL<<22);
+    flags |= (event->umask<<8) + event->eventId;
+    if (event->numberOfOptions > 0)
+    {
+        for (int j=0;j < event->numberOfOptions; j++)
+        {
+            switch (event->options[j].type)
             {
-                BOX_GATE_IVB(PCI_QPI_DEVICE_PORT_0,SBOX0);
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                /* The threshold is eight bits wide here (mask 0xFF), while
+                 * the other box setup functions in this file use 0x1F. */
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= ((event->options[j].value & 0xFFULL) << 24);
+                    break;
+                default:
+                    break;
             }
+        }
+    }
+    VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].configRegister, flags, SETUP_IBOX);
+    CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, flags));
+    return 0;
+}
 
-            break;
-
-        case SBOX1:
 
-            /* CTO_COUNT event requires programming of MATCH/MASK registers */
-            if (event->eventId == 0x38)
+/*
+ * Freeze the uncore counters of the socket and, unless 'flags' equals
+ * FREEZE_FLAG_ONLYFREEZE, write 'flags' to the control register of every
+ * box type used by the event set.
+ *
+ * Both steps only happen when the event set uses a register type beyond the
+ * lowest four bits of regTypeMask, and only on the CPU that holds the socket
+ * lock. Returns 0; the CHECK_* macros are defined outside this block and
+ * presumably return early on a failed write.
+ */
+int ivb_uncore_freeze(int cpu_id, PerfmonEventSet* eventSet, int flags)
+{
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+    {
+        return 0;
+    }
+    /* Global freeze: bit 31 of the uncore global control register. */
+    if (eventSet->regTypeMask & ~(0xF))
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_UNC_U_PMON_GLOBAL_CTL, LLU_CAST (1ULL<<31), FREEZE_UNCORE);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNC_U_PMON_GLOBAL_CTL, (1ULL<<31)));
+    }
+    if ((flags != FREEZE_FLAG_ONLYFREEZE) && (eventSet->regTypeMask & ~(0xF)))
+    {
+        /* Visit every unit type from UNCORE upwards that is part of the
+         * event set; units without a control register are skipped. */
+        for (int j=UNCORE; j<NUM_UNITS; j++)
+        {
+            if (eventSet->regTypeMask & REG_TYPE_MASK(j))
             {
-                if(haveLock)
+                if ((box_map[j].ctrlRegister != 0x0) && (box_map[j].isPci))
                 {
-                    //uflags = pci_read(cpu_id, PCI_QPI_DEVICE_PORT_1, reg);
-                    //uflags &= ~(0xFFFFU);
-                    uflags = (1UL<<22);
-                    uflags |= (1UL<<21) + event->eventId; /* Set extension bit */
-                    pci_write(cpu_id, PCI_QPI_DEVICE_PORT_1,  reg, uflags);
-
-                    /* program MATCH0 */
-                    uflags = 0x0UL;
-                    uflags = (event->cmask<<13) + (event->umask<<8);
-                    pci_write(cpu_id, PCI_QPI_MASK_DEVICE_PORT_1, PCI_UNC_QPI_PMON_MATCH_0, uflags);
-
-                    /* program MASK0 */
-                    uflags = 0x0UL;
-                    uflags = (0x3F<<12) + (event->cfgBits<<4);
-                    pci_write(cpu_id, PCI_QPI_MASK_DEVICE_PORT_1, PCI_UNC_QPI_PMON_MASK_0, uflags);
+                    /* PCI based box. NOTE(review): the device is not
+                     * checked with pci_checkDevice here - confirm. */
+                    DEBUG_PRINT(DEBUGLEV_DETAIL, Clearing %s registers of %s, (flags == FREEZE_FLAG_CLEAR_CTL ? "control" : "counter"), RegisterTypeNames[j]);
+                    CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, box_map[j].device,
+                                                    box_map[j].ctrlRegister, flags));
+                }
+                else if (box_map[j].ctrlRegister != 0x0)
+                {
+                    /* MSR based box. */
+                    DEBUG_PRINT(DEBUGLEV_DETAIL, Clearing %s registers of %s, (flags == FREEZE_FLAG_CLEAR_CTL ? "control" : "counter"), RegisterTypeNames[j]);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV,
+                                                     box_map[j].ctrlRegister, flags));
                 }
             }
-            else
-            {
-                BOX_GATE_IVB(PCI_QPI_DEVICE_PORT_0,SBOX0);
-            }
-            break;
-
-        case CBOX0:
-        case CBOX1:
-        case CBOX2:
-        case CBOX3:
-        case CBOX4:
-        case CBOX5:
-        case CBOX6:
-        case CBOX7:
-        case CBOX8:
-        case CBOX9:
-        case CBOX10:
-        case CBOX11:
-
-            if(haveLock)
-            {
-                perfmon_threadData[thread_id].counters[index].init = TRUE;
-                uflags = 0x0U;
-
-                /* set local enable flag */
-                uflags |= 1<<22;
-                /* Intel with standard 8 bit event mask: [7:0] */
-                uflags |= (event->umask<<8) + event->eventId;
-                msr_write(cpu_id, reg , uflags);
+        }
+    }
+    return 0;
+}
 
-                if (perfmon_verbose)
+/*
+ * Counterpart of ivb_uncore_freeze: optionally write 'flags' to the control
+ * register of every used box type, then clear the uncore global status
+ * register and write bit 29 to the uncore global control register.
+ *
+ * Nothing is done unless cpu_id holds the socket lock and the event set
+ * uses a register type beyond the lowest four bits of regTypeMask.
+ * Returns 0; the CHECK_* macros are defined outside this block and
+ * presumably return early on a failed write.
+ */
+int ivb_uncore_unfreeze(int cpu_id, PerfmonEventSet* eventSet, int flags)
+{
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+    {
+        return 0;
+    }
+    /* The per-box writes come first, the global unfreeze last. */
+    if ((flags != FREEZE_FLAG_ONLYFREEZE) && (eventSet->regTypeMask & ~(0xF)))
+    {
+        for (int j=UNCORE; j<NUM_UNITS; j++)
+        {
+            if (eventSet->regTypeMask & REG_TYPE_MASK(j))
+            {
+                if ((box_map[j].ctrlRegister != 0x0) && (box_map[j].isPci))
                 {
-                    printf("[%d] perfmon_setup_counter: Write Register 0x%llX , uFlags: 0x%lX \n",
-                            cpu_id,
-                            LLU_CAST reg,
-                            (unsigned long) uflags);
+                    DEBUG_PRINT(DEBUGLEV_DETAIL, Clearing %s registers of %s, (flags == FREEZE_FLAG_CLEAR_CTL ? "control" : "counter"), RegisterTypeNames[j]);
+                    CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, box_map[j].device,
+                                                    box_map[j].ctrlRegister, flags));
+                }
+                else if (box_map[j].ctrlRegister != 0x0)
+                {
+                    DEBUG_PRINT(DEBUGLEV_DETAIL, Clearing %s registers of %s, (flags == FREEZE_FLAG_CLEAR_CTL ? "control" : "counter"), RegisterTypeNames[j]);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV,
+                                                     box_map[j].ctrlRegister, flags));
                 }
             }
-            break;
-
-        default:
-            /* should never be reached */
-            break;
+        }
     }
-    if (fixed_flags != orig_fixed_flags)
+    if (eventSet->regTypeMask & ~(0xF))
     {
-        msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, fixed_flags);
+        /* Zero the global status register, then write bit 29 to the global
+         * control register (logged as UNFREEZE_UNCORE). */
+        VERBOSEPRINTREG(cpu_id, MSR_UNC_U_PMON_GLOBAL_STATUS, LLU_CAST 0x0ULL, CLEAR_UNCORE_OVF)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNC_U_PMON_GLOBAL_STATUS, 0x0ULL));
+        VERBOSEPRINTREG(cpu_id, MSR_UNC_U_PMON_GLOBAL_CTL, LLU_CAST (1ULL<<29), UNFREEZE_UNCORE);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNC_U_PMON_GLOBAL_CTL, (1ULL<<29)));
     }
+    return 0;
 }
 
-#define CBOX_START(NUM) \
-if(haveLock) { \
-    msr_write(cpu_id, MSR_UNC_C##NUM##_PMON_BOX_CTL, uflags);  \
-}
 
-#define MBOX_START(NUM) \
-if(haveLock) { \
-    pci_write(cpu_id, PCI_IMC_DEVICE_CH_##NUM,  PCI_UNC_MC_PMON_BOX_CTL, uflags); \
-}
+/*
+ * Configure all counters of an event set for one thread.
+ *
+ * Core counters (PMC/FIXED) are stopped through MSR_PERF_GLOBAL_CTRL and
+ * the uncore is frozen before each event is handed to the setup function
+ * of its register type. The bits returned for FIXED events are collected
+ * and written to MSR_PERF_FIXED_CTR_CTRL in one go at the end.
+ *
+ * Returns 0. The CHECK_* macros are defined outside this block and
+ * presumably return early on a failed write.
+ */
+int perfmon_setupCounterThread_ivybridge(
+        int thread_id,
+        PerfmonEventSet* eventSet)
+{
+    int haveLock = 0;
+    uint64_t fixed_flags = 0x0ULL;
+    int cpu_id = groupSet->threads[thread_id].processorId;
+
+    /* NOTE(review): haveLock is set here but never read in this function;
+     * the setup helpers do their own socket lock check. */
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+    {
+        haveLock = 1;
+    }
+
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)))
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL, FREEZE_PMC_AND_FIXED)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PEBS_ENABLE, 0x0ULL));
+    }
+
+    ivb_uncore_freeze(cpu_id, eventSet, FREEZE_FLAG_ONLYFREEZE);
 
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        RegisterType type = eventSet->events[i].type;
+        /* Skip events whose register type is not enabled in the mask. */
+        if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+        {
+            continue;
+        }
+        RegisterIndex index = eventSet->events[i].index;
+        PerfmonEvent *event = &(eventSet->events[i].event);
+        /* The counter is marked as initialized before the setup call.
+         * NOTE(review): the return values of the ivb_*_setup helpers
+         * (e.g. -ENODEV) are not evaluated below - confirm this is a
+         * deliberate best-effort. */
+        eventSet->events[i].threadCounter[thread_id].init = TRUE;
+        switch (eventSet->events[i].type)
+        {
+            case PMC:
+                ivb_pmc_setup(cpu_id, index, event);
+                break;
+
+            case FIXED:
+                fixed_flags |= ivb_fixed_setup(cpu_id, index, event);
+                break;
+
+            case POWER:
+                break;
+
+            case MBOX0:
+            case MBOX1:
+            case MBOX2:
+            case MBOX3:
+            case MBOX4:
+            case MBOX5:
+            case MBOX6:
+            case MBOX7:
+            case PBOX:
+            case RBOX0:
+            case RBOX1:
+                ivb_pci_box_setup(cpu_id, index, event);
+                break;
+
+            case BBOX0:
+            case BBOX1:
+                ivb_bbox_setup(cpu_id, index, event);
+                break;
+
+            case MBOX0FIX:
+            case MBOX1FIX:
+            case MBOX2FIX:
+            case MBOX3FIX:
+            case MBOX4FIX:
+            case MBOX5FIX:
+            case MBOX6FIX:
+            case MBOX7FIX:
+                ivb_mboxfix_setup(cpu_id, index, event);
+                break;
+
+            /* Each SBOX passes the filter device of its own port. */
+            case SBOX0:
+                ivb_sbox_setup(cpu_id, index, event, PCI_QPI_MASK_DEVICE_PORT_0);
+                break;
+            case SBOX1:
+                ivb_sbox_setup(cpu_id, index, event, PCI_QPI_MASK_DEVICE_PORT_1);
+                break;
+            case SBOX2:
+                ivb_sbox_setup(cpu_id, index, event, PCI_QPI_MASK_DEVICE_PORT_2);
+                break;
+
+            case CBOX0:
+            case CBOX1:
+            case CBOX2:
+            case CBOX3:
+            case CBOX4:
+            case CBOX5:
+            case CBOX6:
+            case CBOX7:
+            case CBOX8:
+            case CBOX9:
+            case CBOX10:
+            case CBOX11:
+            case CBOX12:
+            case CBOX13:
+            case CBOX14:
+                ivb_cbox_setup(cpu_id, index, event);
+                break;
+
+            case UBOX:
+                ivb_ubox_setup(cpu_id, index, event);
+                break;
+            case UBOXFIX:
+                ivb_uboxfix_setup(cpu_id, index, event);
+                break;
+
+            case WBOX:
+                ivb_wbox_setup(cpu_id, index, event);
+                break;
+
+            case IBOX0:
+            case IBOX1:
+                ivb_ibox_setup(cpu_id, index, event);
+                break;
+
+            default:
+                break;
+        }
+    }
+    /* Written only when at least one FIXED event contributed bits. */
+    if (fixed_flags > 0x0)
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_FIXED_CTR_CTRL, LLU_CAST fixed_flags, SETUP_FIXED)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_FIXED_CTR_CTRL, fixed_flags));
+    }
+    return 0;
+}
 
 
-void perfmon_startCountersThread_ivybridge(int thread_id)
+/*
+ * Start counting for one thread.
+ *
+ * For every initialized event of the set: PMC and FIXED counters are
+ * zeroed and their enable bit is collected, POWER counters get their
+ * start value recorded. Afterwards the uncore is unfrozen and the
+ * collected enable bits are written to MSR_PERF_GLOBAL_CTRL.
+ *
+ * Returns 0. The CHECK_* macros are defined outside this block and
+ * presumably return early on a failed access.
+ */
+int perfmon_startCountersThread_ivybridge(int thread_id, PerfmonEventSet* eventSet)
 {
     int haveLock = 0;
-    uint64_t flags = 0x0ULL;
-    uint32_t uflags = 0x10000UL; /* Clear freeze bit */
-    int cpu_id = perfmon_threadData[thread_id].processorId;
+    /* Despite its name, fixed_flags collects the enable bits of both the
+     * PMC and the FIXED counters. */
+    uint64_t fixed_flags = 0x0ULL;
+    uint64_t tmp = 0x0ULL;
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
     {
         haveLock = 1;
     }
 
-    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
-
-    for ( int i=0; i<perfmon_numCountersIvybridge; i++ )
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        if (perfmon_threadData[thread_id].counters[i].init == TRUE) 
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
         {
-            switch (ivybridge_counter_map[i].type)
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+            {
+                continue;
+            }
+            tmp = 0x0ULL;
+            RegisterIndex index = eventSet->events[i].index;
+            uint64_t counter1 = counter_map[index].counterRegister;
+            switch (type)
             {
                 case PMC:
-                    msr_write(cpu_id, ivybridge_counter_map[i].counterRegister, 0x0ULL);
-                    flags |= (1<<(i-OFFSET_PMC));  /* enable counter */
+                    if (eventSet->regTypeMask & REG_TYPE_MASK(PMC))
+                    {
+                        /* The enable bit position is the counter index
+                         * reduced by the number of fixed counters. */
+                        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter1, 0x0ULL));
+                        fixed_flags |= (1ULL<<(index-cpuid_info.perf_num_fixed_ctr));  /* enable counter */
+                    }
                     break;
 
                 case FIXED:
-                    msr_write(cpu_id, ivybridge_counter_map[i].counterRegister, 0x0ULL);
-                    flags |= (1ULL<<(i+32));  /* enable fixed counter */
+                    if (eventSet->regTypeMask & REG_TYPE_MASK(FIXED))
+                    {
+                        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter1, 0x0ULL));
+                        fixed_flags |= (1ULL<<(index+32));  /* enable fixed counter */
+                    }
                     break;
 
                 case POWER:
-                    if(haveLock)
+                    if (haveLock && (eventSet->regTypeMask & REG_TYPE_MASK(POWER)))
                     {
-                        perfmon_threadData[thread_id].counters[i].counterData =
-                            power_read(cpu_id, ivybridge_counter_map[i].counterRegister);
+                        /* power_read gets a uint32_t pointer into the 64 bit
+                         * tmp, which was zeroed above. The value is cut to
+                         * the register width of the box before it is stored
+                         * as start value. */
+                        CHECK_POWER_READ_ERROR(power_read(cpu_id, counter1, (uint32_t*)&tmp));
+                        VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST field64(tmp, 0, box_map[type].regWidth), START_POWER)
+                        eventSet->events[i].threadCounter[thread_id].startData = field64(tmp, 0, box_map[type].regWidth);
                     }
-
                     break;
 
-                case MBOX0:
-                    MBOX_START(0);
+                default:
                     break;
+            }
+        }
+    }
 
-                case MBOX1:
-                    MBOX_START(1);
-                    break;
+    ivb_uncore_unfreeze(cpu_id, eventSet, FREEZE_FLAG_CLEAR_CTR);
 
-                case MBOX2:
-                    MBOX_START(2);
-                    break;
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
+    {
+        /* Enable the collected counters, then write the same bits plus
+         * bits 63 and 62 to the overflow control register. */
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST fixed_flags, UNFREEZE_PMC_AND_FIXED)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, fixed_flags));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, (1ULL<<63)|(1ULL<<62)|fixed_flags));
+    }
+    return 0;
+}
 
-                case MBOX3:
-                    MBOX_START(3);
-                    break;
 
-                case MBOXFIX:
-                    break;
 
-                case SBOX0:
-                    if(haveLock)
+/*
+ * Read the current value of an uncore counter.
+ *
+ * Counters that consist of two registers (counterRegister2 != 0) are
+ * combined: the first register provides the upper 32 bits, the second one
+ * the lower 32 bits. If FREEZE_FLAG_CLEAR_CTR is set in 'flags', every
+ * register is zeroed right after it was read.
+ *
+ * Returns the counter value, or 0 when cpu_id does not hold the socket lock
+ * or the PCI device of a PCI based box is not available. The CHECK_* macros
+ * are defined outside this block and presumably return early on a failed
+ * access. 'event' is not used.
+ */
+uint64_t ivb_uncore_read(int cpu_id, RegisterIndex index, PerfmonEvent *event, int flags)
+{
+    uint64_t value = 0x0ULL;
+    uint64_t raw = 0x0ULL;
+    RegisterType type = counter_map[index].type;
+    PciDeviceIndex dev = counter_map[index].device;
+    uint64_t first_reg = counter_map[index].counterRegister;
+    uint64_t second_reg = counter_map[index].counterRegister2;
+    const int clear_after_read = ((flags & FREEZE_FLAG_CLEAR_CTR) != 0);
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+    {
+        return value;
+    }
+    if (box_map[type].isPci && !pci_checkDevice(dev, cpu_id))
+    {
+        return value;
+    }
+
+    CHECK_PCI_READ_ERROR(HPMread(cpu_id, dev, first_reg, &raw));
+    VERBOSEPRINTPCIREG(cpu_id, dev, first_reg, raw, UNCORE_READ);
+    if (clear_after_read)
+    {
+        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, first_reg, 0x0U));
+    }
+
+    /* Single-register counter: the value just read is the result. */
+    if (second_reg == 0x0)
+    {
+        value = raw;
+        return value;
+    }
+
+    /* Two-register counter: first read is the high word. */
+    value = (raw<<32);
+    raw = 0x0ULL;
+    CHECK_PCI_READ_ERROR(HPMread(cpu_id, dev, second_reg, &raw));
+    VERBOSEPRINTPCIREG(cpu_id, dev, second_reg, raw, UNCORE_READ);
+    value |= (raw & 0xFFFFFFFF);
+    if (clear_after_read)
+    {
+        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, second_reg, 0x0U));
+    }
+    return value;
+}
+
+/*
+ * Detect and acknowledge an overflow of an uncore counter.
+ *
+ * An overflow is only considered when the freshly read value 'result' is
+ * smaller than the previously stored value 'cur_result'. In that case the
+ * bit 'global_offset' of the uncore global status register is tested (and
+ * cleared by writing it back) unless global_offset is -1, followed by bit
+ * 'box_offset' of the status register of the box. If the box bit is set,
+ * *overflows is incremented and the bit is written back to acknowledge it.
+ *
+ * All status masks are built as 64 bit values (1ULL << offset) because they
+ * are compared against and written as uint64_t; a plain int shift is
+ * undefined for offsets of 31 and above.
+ *
+ * Returns 0. The CHECK_* macros are defined outside this block and
+ * presumably return early on a failed access. 'event' is not used.
+ */
+int ivb_uncore_overflow(int cpu_id, RegisterIndex index, PerfmonEvent *event,
+                         int* overflows, uint64_t result, uint64_t cur_result,
+                         int global_offset, int box_offset)
+{
+    int test_local = 0;
+    uint64_t ovf_values = 0x0ULL;
+    RegisterType type = counter_map[index].type;
+    PciDeviceIndex dev = counter_map[index].device;
+    if (result < cur_result)
+    {
+        if (global_offset != -1)
+        {
+            CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV,
+                                           MSR_UNC_U_PMON_GLOBAL_STATUS,
+                                           &ovf_values));
+            if (ovf_values & (1ULL<<global_offset))
+            {
+                CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV,
+                                                 MSR_UNC_U_PMON_GLOBAL_STATUS,
+                                                 (1ULL<<global_offset)));
+                test_local = 1;
+            }
+        }
+        else
+        {
+            /* No global status bit for this box: go straight to the
+             * status register of the box. */
+            test_local = 1;
+        }
+
+        if (test_local)
+        {
+            ovf_values = 0x0ULL;
+            if (ivybridge_box_map[type].isPci)
+            {
+                CHECK_PCI_READ_ERROR(HPMread(cpu_id, dev,
+                                              box_map[type].statusRegister,
+                                              &ovf_values));
+            }
+            else
+            {
+                CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV,
+                                              box_map[type].statusRegister,
+                                              &ovf_values));
+            }
+            if (ovf_values & (1ULL<<box_offset))
+            {
+                (*overflows)++;
+                if (ivybridge_box_map[type].isPci)
+                {
+                    CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev,
+                                                    box_map[type].statusRegister,
+                                                    (1ULL<<box_offset)));
+                }
+                else
+                {
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV,
+                                                     box_map[type].statusRegister,
+                                                     (1ULL<<box_offset)));
+                }
+            }
+        }
+    }
+    return 0;
+}
+
+int perfmon_stopCountersThread_ivybridge(int thread_id, PerfmonEventSet* eventSet)
+{ /* Stops counting on the CPU of thread_id: freezes core and uncore counters, reads every initialized event of eventSet into its counterData, counts overflows and marks the events uninitialized. Returns 0 at the end; the CHECK_* macros presumably return early on access errors (confirm). */
+    uint64_t counter_result = 0x0ULL;
+    int haveLock = 0; /* becomes 1 when cpu_id is the entry stored in socket_lock for its node; gates the POWER and SBOXnFIX reads below */
+    int cpu_id = groupSet->threads[thread_id].processorId;
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+    {
+        haveLock = 1;
+    }
+
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL, FREEZE_PMC_AND_FIXED)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL)); /* write 0 to the global control register before reading PMC/FIXED values */
+    }
+    ivb_uncore_freeze(cpu_id, eventSet, FREEZE_FLAG_CLEAR_CTL);
+
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
+        {
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type)))) /* skip events whose register type is not part of this event set's mask */
+            {
+                continue;
+            }
+            counter_result= 0x0ULL;
+            RegisterIndex index = eventSet->events[i].index;
+            PerfmonEvent *event = &(eventSet->events[i].event);
+            PciDeviceIndex dev = counter_map[index].device;
+            uint64_t counter1 = counter_map[index].counterRegister;
+            uint64_t* current = &(eventSet->events[i].threadCounter[thread_id].counterData); /* previously stored value; overwritten after the switch */
+            int* overflows = &(eventSet->events[i].threadCounter[thread_id].overflows);
+            switch (type)
+            {
+                case PMC:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, dev, counter1, &counter_result));
+                    if (counter_result < *current) /* new value is below the stored one: check the overflow status bit */
                     {
-                        pci_write(cpu_id, PCI_QPI_DEVICE_PORT_0,  PCI_UNC_QPI_PMON_BOX_CTL, 0x0ULL);
+                        uint64_t ovf_values = 0x0ULL;
+                        CHECK_MSR_READ_ERROR(HPMread(cpu_id, dev, MSR_PERF_GLOBAL_STATUS, &ovf_values));
+                        if (ovf_values & (1ULL<<(index-cpuid_info.perf_num_fixed_ctr))) /* 64-bit mask with explicit parentheses; bit number is index minus the number of fixed counters */
+                        {
+                            (*overflows)++;
+                        }
                     }
+                    VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_PMC)
                     break;
-
-                case SBOX1:
-                    if(haveLock)
+                case FIXED:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, dev, counter1, &counter_result));
+                    if (counter_result < *current) /* new value is below the stored one: check the overflow status bit */
                     {
-                        pci_write(cpu_id, PCI_QPI_DEVICE_PORT_1,  PCI_UNC_QPI_PMON_BOX_CTL, 0x0ULL);
+                        uint64_t ovf_values = 0x0ULL;
+                        CHECK_MSR_READ_ERROR(HPMread(cpu_id, dev, MSR_PERF_GLOBAL_STATUS, &ovf_values));
+                        if (ovf_values & (1ULL<<(index+32))) /* must be a 64-bit shift: the bit number is >= 32, which is undefined for a plain int 1 */
+                        {
+                            (*overflows)++;
+                        }
                     }
+                    VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_FIXED)
                     break;
 
-                case CBOX0:
-                    CBOX_START(0);
+                case POWER:
+                    if (haveLock && (eventSet->regTypeMask & REG_TYPE_MASK(POWER)))
+                    {
+                        CHECK_POWER_READ_ERROR(power_read(cpu_id, counter1, (uint32_t*)&counter_result)); /* power_read fills a 32-bit value; counter_result was zeroed above */
+                        VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, STOP_POWER)
+                        if (counter_result < *current) /* no status register is consulted here: a smaller value alone counts as an overflow */
+                        {
+                            (*overflows)++;
+                        }
+                    }
                     break;
 
-                case CBOX1:
-                    CBOX_START(1);
+                case THERMAL:
+                    CHECK_TEMP_READ_ERROR(thermal_read(cpu_id, (uint32_t*)&counter_result));
                     break;
 
-                case CBOX2:
-                    CBOX_START(2);
+                case SBOX0FIX:
+                case SBOX1FIX:
+                case SBOX2FIX:
+                    if (haveLock && pci_checkDevice(dev, cpu_id))
+                    {
+                        CHECK_PCI_READ_ERROR(HPMread(cpu_id, dev, counter1, &counter_result));
+                        VERBOSEPRINTPCIREG(cpu_id, dev, counter1, LLU_CAST counter_result, READ_SBOX_FIXED)
+                        switch (extractBitField(counter_result,3,0)) /* translate the encoded field into a fixed value; unknown encodings yield 0 */
+                        {
+                            case 0x2:
+                                counter_result = 5600000000ULL;
+                                break;
+                            case 0x3:
+                                counter_result = 6400000000ULL;
+                                break;
+                            case 0x4:
+                                counter_result = 7200000000ULL;
+                                break;
+                            case 0x5:
+                                counter_result = 8000000000ULL;
+                                break;
+                            case 0x6:
+                                counter_result = 8800000000ULL;
+                                break;
+                            case 0x7:
+                                counter_result = 9600000000ULL;
+                                break;
+                            default:
+                                counter_result = 0x0ULL;
+                                break;
+                        }
+                        VERBOSEPRINTPCIREG(cpu_id, dev, counter1, LLU_CAST counter_result, READ_SBOX_FIXED_REAL)
+                    }
                     break;
 
-                case CBOX3:
-                    CBOX_START(3);
+                case MBOX0:
+                case MBOX1:
+                case MBOX2:
+                case MBOX3:
+                case MBOX4:
+                case MBOX5:
+                case MBOX6:
+                case MBOX7:
+                    counter_result = ivb_uncore_read(cpu_id, index, event, FREEZE_FLAG_CLEAR_CTR);
+                    ivb_uncore_overflow(cpu_id, index, event, overflows, counter_result,
+                                        *current, box_map[type].ovflOffset, getCounterTypeOffset(index)+1); /* last argument is shifted by one compared with the generic uncore case below */
                     break;
 
-                case CBOX4:
-                    CBOX_START(4);
-                    break;
 
-                case CBOX5:
-                    CBOX_START(5);
+                case MBOX0FIX:
+                case MBOX1FIX:
+                case MBOX2FIX:
+                case MBOX3FIX:
+                case MBOX4FIX:
+                case MBOX5FIX:
+                case MBOX6FIX:
+                case MBOX7FIX:
+                    counter_result = ivb_uncore_read(cpu_id, index, event, FREEZE_FLAG_CLEAR_CTR);
+                    ivb_uncore_overflow(cpu_id, index, event, overflows, counter_result,
+                                        *current, box_map[type].ovflOffset, 0);
                     break;
 
-                case CBOX6:
-                    CBOX_START(6);
+                case IBOX1:
+                    counter_result = ivb_uncore_read(cpu_id, index, event, FREEZE_FLAG_CLEAR_CTR);
+                    ivb_uncore_overflow(cpu_id, index, event, overflows, counter_result,
+                                        *current, -1, getCounterTypeOffset(index)+2); /* IBOX1 passes -1 instead of box_map[type].ovflOffset */
                     break;
 
+                case SBOX0:
+                case SBOX1:
+                case SBOX2:
+                case CBOX0:
+                case CBOX1:
+                case CBOX2:
+                case CBOX3:
+                case CBOX4:
+                case CBOX5:
+                case CBOX6:
                 case CBOX7:
-                    CBOX_START(7);
-                    break;
-
                 case CBOX8:
-                    CBOX_START(8);
-                    break;
-
                 case CBOX9:
-                    CBOX_START(9);
-                    break;
-
                 case CBOX10:
-                    CBOX_START(10);
-                    break;
-
                 case CBOX11:
-                    CBOX_START(11);
+                case CBOX12:
+                case CBOX13:
+                case CBOX14:
+                case UBOX:
+                case UBOXFIX:
+                case BBOX0:
+                case BBOX1:
+                case WBOX:
+                case PBOX:
+                case RBOX0:
+                case RBOX1:
+                case RBOX2:
+                case IBOX0:
+                    counter_result = ivb_uncore_read(cpu_id, index, event, FREEZE_FLAG_CLEAR_CTR);
+                    ivb_uncore_overflow(cpu_id, index, event, overflows, counter_result,
+                                        *current, box_map[type].ovflOffset, getCounterTypeOffset(index));
                     break;
 
                 default:
-                    /* should never be reached */
                     break;
             }
+            *current = field64(counter_result, 0, box_map[type].regWidth); /* stored for every handled event, also when a guarded read was skipped (counter_result is then 0); presumably masks to the register width -- confirm */
         }
+        eventSet->events[i].threadCounter[thread_id].init = FALSE; /* cleared for every event of the set, whether or not it was read above */
     }
 
-    if (perfmon_verbose)
-    {
-        printf("perfmon_start_counters: Write Register 0x%X , \
-                Flags: 0x%llX \n",MSR_PERF_GLOBAL_CTRL, LLU_CAST flags);
-        printf("perfmon_start_counters: Write Register 0x%X , \
-                Flags: 0x%llX \n",MSR_UNCORE_PERF_GLOBAL_CTRL, LLU_CAST uflags);
-    }
-
-    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, flags);
-    msr_write(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, 0x30000000FULL);
+    return 0;
 }
 
-#define CBOX_STOP(NUM) \
-if(haveLock) { \
-    msr_write(cpu_id, MSR_UNC_C##NUM##_PMON_BOX_CTL, uflags);  \
-    perfmon_threadData[thread_id].counters[i].counterData =   \
-    msr_read(cpu_id, westmereEX_counter_map[i].counterRegister);    \
-}
-
-#define MBOX_STOP(NUM) \
-if(haveLock) { \
-    pci_write(cpu_id, PCI_IMC_DEVICE_CH_##NUM ,  PCI_UNC_MC_PMON_BOX_CTL, uflags); \
-    counter_result = pci_read(cpu_id, PCI_IMC_DEVICE_CH_##NUM , ivybridge_counter_map[i].counterRegister); \
-    counter_result = (counter_result<<32) + pci_read(cpu_id, PCI_IMC_DEVICE_CH_##NUM , ivybridge_counter_map[i].counterRegister2);  \
-    perfmon_threadData[thread_id].counters[i].counterData = counter_result; \
-}
-
-#define SBOX_STOP(NUM) \
-if(haveLock) { \
-    pci_write(cpu_id, PCI_QPI_DEVICE_PORT_##NUM ,  PCI_UNC_QPI_PMON_BOX_CTL, (1<<8)); \
-    counter_result = pci_read(cpu_id, PCI_QPI_DEVICE_PORT_##NUM , ivybridge_counter_map[i].counterRegister); \
-    counter_result = (counter_result<<32) + pci_read(cpu_id, PCI_QPI_DEVICE_PORT_##NUM , ivybridge_counter_map[i].counterRegister2);  \
-    perfmon_threadData[thread_id].counters[i].counterData = counter_result; \
-}
-
-
-void perfmon_stopCountersThread_ivybridge(int thread_id)
+int perfmon_readCountersThread_ivybridge(int thread_id, PerfmonEventSet* eventSet) /* Intermediate read: pauses core and uncore counters, stores the current value of every initialized event in its counterData, counts overflows, then restores the previous enable state. Returns 0 at the end; the CHECK_* macros presumably return early on access errors (confirm). */
 {
-    uint64_t flags;
-    uint32_t uflags = 0x10100UL; /* Set freeze bit */
     uint64_t counter_result = 0x0ULL;
+    uint64_t pmc_flags = 0x0ULL; /* holds the MSR_PERF_GLOBAL_CTRL value read below so it can be written back after the loop */
     int haveLock = 0;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
     {
         haveLock = 1;
     }
 
-    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
+    {
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, &pmc_flags)); /* save the current control value first */
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL)); /* then write 0 while the values are read */
+    }
+    ivb_uncore_freeze(cpu_id, eventSet, FREEZE_FLAG_ONLYFREEZE);
 
-    for ( int i=0; i < NUM_COUNTERS_IVYBRIDGE; i++ )
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        if (perfmon_threadData[thread_id].counters[i].init == TRUE)
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
         {
-            switch (ivybridge_counter_map[i].type)
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type)))) /* skip events whose register type is not part of this event set's mask */
+            {
+                continue;
+            }
+            counter_result = 0x0ULL;
+            RegisterIndex index = eventSet->events[i].index;
+            PerfmonEvent *event = &(eventSet->events[i].event);
+            PciDeviceIndex dev = counter_map[index].device;
+            uint64_t counter1 = counter_map[index].counterRegister;
+            uint64_t* current = &(eventSet->events[i].threadCounter[thread_id].counterData); /* previously stored value; overwritten after the switch */
+            int* overflows = &(eventSet->events[i].threadCounter[thread_id].overflows);
+            switch (type)
             {
                 case PMC:
-
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                    if (counter_result < *current) /* new value is below the stored one: check the overflow status bit */
+                    {
+                        uint64_t ovf_values = 0x0ULL;
+                        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_STATUS, &ovf_values));
+                        if (ovf_values & (1ULL<<(index-cpuid_info.perf_num_fixed_ctr))) /* 64-bit mask with explicit parentheses; bit number is index minus the number of fixed counters */
+                        {
+                            (*overflows)++;
+                        }
+                    }
+                    VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_PMC)
+                    break;
                 case FIXED:
-                    perfmon_threadData[thread_id].counters[i].counterData =
-                        msr_read(cpu_id, ivybridge_counter_map[i].counterRegister);
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                    if (counter_result < *current) /* new value is below the stored one: check the overflow status bit */
+                    {
+                        uint64_t ovf_values = 0x0ULL;
+                        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_STATUS, &ovf_values));
+                        if (ovf_values & (1ULL<<(index+32))) /* must be a 64-bit shift: the bit number is >= 32, which is undefined for a plain int 1 */
+                        {
+                            (*overflows)++;
+                        }
+                    }
+                    VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_FIXED)
                     break;
 
                 case POWER:
-                    if(haveLock)
+                    if (haveLock && (eventSet->regTypeMask & REG_TYPE_MASK(POWER)))
                     {
-                        perfmon_threadData[thread_id].counters[i].counterData =
-                            power_info.energyUnit *
-                            ( power_read(cpu_id, ivybridge_counter_map[i].counterRegister) -
-                              perfmon_threadData[thread_id].counters[i].counterData);
+                        CHECK_POWER_READ_ERROR(power_read(cpu_id, counter1, (uint32_t*)&counter_result)); /* power_read fills a 32-bit value; counter_result was zeroed above */
+                        VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, STOP_POWER)
+                        if (counter_result < *current) /* no status register is consulted here: a smaller value alone counts as an overflow */
+                        {
+                            (*overflows)++;
+                        }
                     }
                     break;
 
                 case THERMAL:
-                        perfmon_threadData[thread_id].counters[i].counterData =
-                             thermal_read(cpu_id);
+                    CHECK_TEMP_READ_ERROR(thermal_read(cpu_id, (uint32_t*)&counter_result));
                     break;
 
-                case MBOX0:
-                    MBOX_STOP(0);
+                case SBOX0FIX:
+                case SBOX1FIX:
+                case SBOX2FIX:
+                    if (haveLock && pci_checkDevice(dev, cpu_id))
+                    {
+                        CHECK_PCI_READ_ERROR(HPMread(cpu_id, dev, counter1, &counter_result));
+                        VERBOSEPRINTPCIREG(cpu_id, dev, counter1, LLU_CAST counter_result, READ_SBOX_FIXED)
+                        switch (extractBitField(counter_result,3,0)) /* translate the encoded field into a fixed value; unknown encodings yield 0 */
+                        {
+                            case 0x2:
+                                counter_result = 5600000000ULL;
+                                break;
+                            case 0x3:
+                                counter_result = 6400000000ULL;
+                                break;
+                            case 0x4:
+                                counter_result = 7200000000ULL;
+                                break;
+                            case 0x5:
+                                counter_result = 8000000000ULL;
+                                break;
+                            case 0x6:
+                                counter_result = 8800000000ULL;
+                                break;
+                            case 0x7:
+                                counter_result = 9600000000ULL;
+                                break;
+                            default:
+                                counter_result = 0x0ULL;
+                                break;
+                        }
+                        VERBOSEPRINTPCIREG(cpu_id, dev, counter1, LLU_CAST counter_result, READ_SBOX_FIXED_REAL)
+                        eventSet->events[i].threadCounter[thread_id].startData = 0;
+                    }
                     break;
 
+                case MBOX0:
                 case MBOX1:
-                    MBOX_STOP(1);
-                    break;
-
                 case MBOX2:
-                    MBOX_STOP(2);
-                    break;
-
                 case MBOX3:
-                    MBOX_STOP(3);
+                case MBOX4:
+                case MBOX5:
+                case MBOX6:
+                case MBOX7:
+                    counter_result = ivb_uncore_read(cpu_id, index, event, FREEZE_FLAG_ONLYFREEZE);
+                    ivb_uncore_overflow(cpu_id, index, event, overflows, counter_result,
+                                        *current, box_map[type].ovflOffset, getCounterTypeOffset(index)+1); /* last argument is shifted by one compared with the generic uncore case below */
                     break;
 
-                case MBOXFIX:
-                    if(haveLock)
-                    {
-                        pci_write(cpu_id, PCI_IMC_DEVICE_CH_0,  PCI_UNC_MC_PMON_FIXED_CTL, uflags);
 
-                        counter_result = pci_read(cpu_id, PCI_IMC_DEVICE_CH_0,
-                                ivybridge_counter_map[i].counterRegister);
-
-                        counter_result = (counter_result<<32) +
-                            pci_read(cpu_id, PCI_IMC_DEVICE_CH_0,
-                                    ivybridge_counter_map[i].counterRegister2);
-
-                        perfmon_threadData[thread_id].counters[i].counterData = counter_result;
-                    }
+                case MBOX0FIX:
+                case MBOX1FIX:
+                case MBOX2FIX:
+                case MBOX3FIX:
+                case MBOX4FIX:
+                case MBOX5FIX:
+                case MBOX6FIX:
+                case MBOX7FIX:
+                    counter_result = ivb_uncore_read(cpu_id, index, event, FREEZE_FLAG_ONLYFREEZE);
+                    ivb_uncore_overflow(cpu_id, index, event, overflows, counter_result,
+                                        *current, box_map[type].ovflOffset, 0);
                     break;
 
-                case SBOX0:
-                    SBOX_STOP(0);
+                case IBOX1:
+                    counter_result = ivb_uncore_read(cpu_id, index, event, FREEZE_FLAG_ONLYFREEZE);
+                    ivb_uncore_overflow(cpu_id, index, event, overflows, counter_result,
+                                        *current, -1, getCounterTypeOffset(index)+2); /* IBOX1 passes -1 instead of box_map[type].ovflOffset */
                     break;
 
+                case SBOX0:
                 case SBOX1:
-                    SBOX_STOP(1);
-                    break;
-
+                case SBOX2:
                 case CBOX0:
-                    CBOX_STOP(0);
-                    break;
-
                 case CBOX1:
-                    CBOX_STOP(1);
-                    break;
-
                 case CBOX2:
-                    CBOX_STOP(2);
-                    break;
-
                 case CBOX3:
-                    CBOX_STOP(3);
-                    break;
-
                 case CBOX4:
-                    CBOX_STOP(4);
-                    break;
-
                 case CBOX5:
-                    CBOX_STOP(5);
-                    break;
-
                 case CBOX6:
-                    CBOX_STOP(6);
-                    break;
-
                 case CBOX7:
-                    CBOX_STOP(7);
-                    break;
-
                 case CBOX8:
-                    CBOX_STOP(8);
-                    break;
-
                 case CBOX9:
-                    CBOX_STOP(9);
-                    break;
-
                 case CBOX10:
-                    CBOX_STOP(10);
-                    break;
-
                 case CBOX11:
-                    CBOX_STOP(11);
+                case CBOX12:
+                case CBOX13:
+                case CBOX14:
+                case UBOX:
+                case UBOXFIX:
+                case BBOX0:
+                case BBOX1:
+                case WBOX:
+                case PBOX:
+                case RBOX0:
+                case RBOX1:
+                case RBOX2:
+                case IBOX0:
+                    counter_result = ivb_uncore_read(cpu_id, index, event, FREEZE_FLAG_ONLYFREEZE);
+                    ivb_uncore_overflow(cpu_id, index, event, overflows, counter_result,
+                                        *current, box_map[type].ovflOffset, getCounterTypeOffset(index));
                     break;
 
-
                 default:
                     /* should never be reached */
                     break;
             }
+            *current = field64(counter_result, 0, box_map[type].regWidth); /* stored for every handled event, also when a guarded read was skipped (counter_result is then 0); presumably masks to the register width -- confirm */
         }
     }
 
-    flags = msr_read(cpu_id,MSR_PERF_GLOBAL_STATUS);
-    //    printf ("Status: 0x%llX \n", LLU_CAST flags);
-    if ( (flags & 0x3) || (flags & (0x3ULL<<32)) ) 
+    ivb_uncore_unfreeze(cpu_id, eventSet, FREEZE_FLAG_ONLYFREEZE); /* counterpart of the ivb_uncore_freeze call before the loop */
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
     {
-        printf ("Overflow occured \n");
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, pmc_flags)); /* write back the control value saved before the loop */
     }
+    return 0;
 }
 
-void perfmon_readCountersThread_ivybridge(int thread_id)
+
+int perfmon_finalizeCountersThread_ivybridge(int thread_id, PerfmonEventSet* eventSet) /* Cleanup for the CPU of thread_id: zeroes config and counter registers of all initialized events, clears OFFCORE_RESP registers where used, resets the uncore and core global status/control registers, and marks the events uninitialized. Returns 0 at the end; the CHECK_* macros presumably return early on access errors (confirm). */
 {
-    uint64_t counter_result = 0x0ULL;
     int haveLock = 0;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
+    int haveTileLock = 0; /* becomes 1 when cpu_id is the entry stored in tile_lock for its tile; gates the OFFCORE_RESP clears below */
+    int cpu_id = groupSet->threads[thread_id].processorId;
+    uint64_t ovf_values_core = (1ULL<<63)|(1ULL<<62); /* value later written to MSR_PERF_GLOBAL_OVF_CTRL: bits 62 and 63 are always set, per-counter bits are OR-ed in below */
 
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
     {
         haveLock = 1;
     }
-    for ( int i=0; i<NUM_COUNTERS_IVYBRIDGE; i++ )
+    if (tile_lock[affinity_thread2tile_lookup[cpu_id]] == cpu_id)
     {
-        if (perfmon_threadData[thread_id].counters[i].init == TRUE)
+        haveTileLock = 1;
+    }
+
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
         {
-            if ((ivybridge_counter_map[i].type == PMC) || (ivybridge_counter_map[i].type == FIXED))
+            RegisterIndex index = eventSet->events[i].index;
+            PciDeviceIndex dev = counter_map[index].device;
+            uint64_t reg = counter_map[index].configRegister;
+            RegisterType type = eventSet->events[i].type;
+            if (eventSet->events[i].type == NOTYPE) /* NOTYPE events are skipped entirely; their init flag is left unchanged */
             {
-                perfmon_threadData[thread_id].counters[i].counterData =
-                    msr_read(cpu_id, ivybridge_counter_map[i].counterRegister);
+                continue;
             }
-            else
+
+            switch(type)
             {
-                if(haveLock)
-                {
-                    switch (ivybridge_counter_map[i].type)
+                case PMC:
+                    ovf_values_core |= (1ULL<<(index-cpuid_info.perf_num_fixed_ctr)); /* bit number is index minus the number of fixed counters */
+                    if ((haveTileLock) && (eventSet->events[i].event.eventId == 0xB7)) /* event id 0xB7 is paired with MSR_OFFCORE_RESP0 here */
                     {
-                        case POWER:
-                            perfmon_threadData[thread_id].counters[i].counterData =
-                                power_info.energyUnit *
-                                power_read(cpu_id, ivybridge_counter_map[i].counterRegister);
-                            break;
-
-                        case MBOX0:
-                            counter_result = pci_read(cpu_id, PCI_IMC_DEVICE_CH_0,
-                                    ivybridge_counter_map[i].counterRegister);
-
-                            counter_result = (counter_result<<32) +
-                                pci_read(cpu_id, PCI_IMC_DEVICE_CH_0,
-                                        ivybridge_counter_map[i].counterRegister2);
-
-                            perfmon_threadData[thread_id].counters[i].counterData = counter_result;
-                            break;
-
-                        case MBOX1:
-                            counter_result = pci_read(cpu_id, PCI_IMC_DEVICE_CH_1,
-                                    ivybridge_counter_map[i].counterRegister);
-
-                            counter_result = (counter_result<<32) +
-                                pci_read(cpu_id, PCI_IMC_DEVICE_CH_1,
-                                        ivybridge_counter_map[i].counterRegister2);
-
-                            perfmon_threadData[thread_id].counters[i].counterData = counter_result;
-                            break;
-
-                        case MBOX2:
-                            counter_result = pci_read(cpu_id, PCI_IMC_DEVICE_CH_2,
-                                    ivybridge_counter_map[i].counterRegister);
-
-                            counter_result = (counter_result<<32) +
-                                pci_read(cpu_id, PCI_IMC_DEVICE_CH_2,
-                                        ivybridge_counter_map[i].counterRegister2);
-
-                            perfmon_threadData[thread_id].counters[i].counterData = counter_result;
-                            break;
-
-                        case MBOX3:
-                            counter_result = pci_read(cpu_id, PCI_IMC_DEVICE_CH_3,
-                                    ivybridge_counter_map[i].counterRegister);
-
-                            counter_result = (counter_result<<32) +
-                                pci_read(cpu_id, PCI_IMC_DEVICE_CH_3,
-                                        ivybridge_counter_map[i].counterRegister2);
-
-                            perfmon_threadData[thread_id].counters[i].counterData = counter_result;
-                            break;
-
-                        default:
-                            /* should never be reached */
-                            break;
+                        VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP0, 0x0ULL, CLEAR_OFFCORE_RESP0);
+                        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP0, 0x0ULL));
                     }
-                }
+                    else if ((haveTileLock) && (eventSet->events[i].event.eventId == 0xBB)) /* event id 0xBB is paired with MSR_OFFCORE_RESP1 here */
+                    {
+                        VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP1, 0x0ULL, CLEAR_OFFCORE_RESP1);
+                        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP1, 0x0ULL));
+                    }
+                    break;
+                case FIXED:
+                    ovf_values_core |= (1ULL<<(index+32)); /* fixed counters use bit numbers index+32 */
+                    break;
+                default:
+                    break;
+            }
+            if ((reg) && (((type == PMC)||(type == FIXED))||((type >= UNCORE) && (haveLock)))) /* needs a non-zero config register; PMC/FIXED are always cleared, types >= UNCORE only with the socket lock */
+            {
+                VERBOSEPRINTPCIREG(cpu_id, dev, reg, 0x0ULL, CLEAR_CTL);
+                CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, reg, 0x0ULL)); /* zero the config register */
+                VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].counterRegister, 0x0ULL, CLEAR_CTR);
+                CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].counterRegister, 0x0ULL)); /* zero the counter register */
             }
+            eventSet->events[i].threadCounter[thread_id].init = FALSE; /* event is no longer set up on this thread */
         }
     }
-}
+    if (haveLock && eventSet->regTypeMask & ~(0xFULL)) /* parsed as haveLock && (mask & ~0xF): any type bit above the lowest four -- presumably the uncore types, confirm */
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_UNC_U_PMON_GLOBAL_STATUS, LLU_CAST 0x0ULL, CLEAR_UNCORE_OVF)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNC_U_PMON_GLOBAL_STATUS, 0x0ULL));
+        VERBOSEPRINTREG(cpu_id, MSR_UNC_U_PMON_GLOBAL_CTL, LLU_CAST 0x0ULL, CLEAR_UNCORE_CTRL)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNC_U_PMON_GLOBAL_CTL, 0x0ULL));
+    }
 
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, LLU_CAST ovf_values_core, CLEAR_GLOBAL_OVF)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, ovf_values_core)); /* write the collected overflow bits to the overflow control register */
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST 0x0ULL, CLEAR_GLOBAL_CTRL)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+    }
+    return 0;
+}
diff --git a/src/includes/perfmon_ivybridgeEP_events.txt b/src/includes/perfmon_ivybridgeEP_events.txt
new file mode 100644
index 0000000..f128270
--- /dev/null
+++ b/src/includes/perfmon_ivybridgeEP_events.txt
@@ -0,0 +1,1955 @@
+# =======================================================================================
+#  
+#      Filename:  perfmon_ivybridgeEP_events.txt
+# 
+#      Description:  Event list for Intel Ivy Bridge EP/EN/EX
+# 
+#      Version:   4.0
+#      Released:  16.6.2015
+# 
+#      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+#                Thomas Roehl (tr), thomas.roehl at googlemail.com
+#      Project:  likwid
+#
+#      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
+#
+#      This program is free software: you can redistribute it and/or modify it under
+#      the terms of the GNU General Public License as published by the Free Software
+#      Foundation, either version 3 of the License, or (at your option) any later
+#      version.
+#
+#      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+#      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+#      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+#
+#      You should have received a copy of the GNU General Public License along with
+#      this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+# =======================================================================================
+
+EVENT_TEMP_CORE          0x00   TMP0
+UMASK_TEMP_CORE          0x00
+
+EVENT_PWR_PKG_ENERGY          0x00   PWR0
+UMASK_PWR_PKG_ENERGY          0x00
+
+EVENT_PWR_PP0_ENERGY          0x00   PWR1
+UMASK_PWR_PP0_ENERGY          0x00
+
+EVENT_PWR_DRAM_ENERGY          0x00   PWR3
+UMASK_PWR_DRAM_ENERGY          0x00
+
+EVENT_INSTR_RETIRED              0x00   FIXC0
+UMASK_INSTR_RETIRED_ANY          0x00
+
+EVENT_CPU_CLK_UNHALTED           0x00   FIXC1
+UMASK_CPU_CLK_UNHALTED_CORE      0x00
+
+EVENT_CPU_CLK_UNHALTED           0x00   FIXC2
+UMASK_CPU_CLK_UNHALTED_REF       0x00
+
+EVENT_LD_BLOCKS                 0x03  PMC
+UMASK_LD_BLOCKS_STORE_FORWARD   0x02
+
+EVENT_MISALIGN_MEM_REF           0x05  PMC
+UMASK_MISALIGN_MEM_REF_LOADS      0x01
+UMASK_MISALIGN_MEM_REF_STORES     0x02
+UMASK_MISALIGN_MEM_REF_ANY        0x03
+
+EVENT_LD_BLOCKS_PARTIAL      0x07  PMC
+UMASK_LD_BLOCKS_PARTIAL_ADDRESS_ALIAS   0x01
+
+EVENT_DTLB_LOAD_MISSES                 0x08  PMC
+UMASK_DTLB_LOAD_MISSES_CAUSES_A_WALK   0x81
+UMASK_DTLB_LOAD_MISSES_WALK_COMPLETED  0x82
+UMASK_DTLB_LOAD_MISSES_WALK_DURATION   0x84
+
+EVENT_UOPS_ISSUED                0x0E  PMC
+UMASK_UOPS_ISSUED_ANY            0x01
+UMASK_UOPS_ISSUED_FLAGS_MERGE    0x10
+UMASK_UOPS_ISSUED_SLOW_LEA       0x20
+UMASK_UOPS_ISSUED_SINGLE_MUL     0x40
+
+EVENT_FP_COMP_OPS_EXE            0x10   PMC
+UMASK_FP_COMP_OPS_EXE_X87                      0x01
+UMASK_FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE     0x10
+UMASK_FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE     0x20
+UMASK_FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE     0x40
+UMASK_FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE     0x80
+
+EVENT_SIMD_FP_256_PACKED       0x11   PMC
+UMASK_SIMD_FP_256_PACKED_SINGLE     0x01
+UMASK_SIMD_FP_256_PACKED_DOUBLE     0x02
+
+EVENT_ARITH                      0x14   PMC
+UMASK_ARITH_FPU_DIV_ACTIVE       0x01
+UMASK_ARITH_NUM_DIV              0x01 0xC5 0x01
+
+EVENT_L2_RQSTS                   0x24   PMC
+UMASK_L2_RQSTS_ALL_DEM_AND_DATA_RD_HIT 0x01
+UMASK_L2_RQSTS_ALL_DEM_AND_DATA_RD     0x03
+UMASK_L2_RQSTS_RFO_HITS           0x04
+UMASK_L2_RQSTS_RFO_MISS          0x08
+UMASK_L2_RQSTS_RFO_ANY           0x0C
+UMASK_L2_RQSTS_CODE_RD_HITS        0x10
+UMASK_L2_RQSTS_CODE_RD_MISS       0x20
+UMASK_L2_RQSTS_ALL_CODE_CODE_RD   0x30
+UMASK_L2_RQSTS_PF_HIT      0x40
+UMASK_L2_RQSTS_PF_MISS     0x80
+UMASK_L2_RQSTS_ALL_PF        0xC0
+UMASK_L2_RQSTS_MISS              0xAA
+
+EVENT_L2_STORE_LOCK_RQSTS            0x27   PMC
+UMASK_L2_STORE_LOCK_RQSTS_MISS       0x01
+UMASK_L2_STORE_LOCK_RQSTS_HIT_M       0x08
+UMASK_L2_STORE_LOCK_RQSTS_ALL        0x0F
+
+EVENT_L1D_WB_RQST                  0x28   PMC
+UMASK_L1D_WB_RQST_HIT_E          0x04
+UMASK_L1D_WB_RQST_HIT_M          0x08
+UMASK_L1D_WB_RQST_ALL            0x0F
+
+EVENT_L3_LAT_CACHE               0x2E   PMC
+UMASK_L3_LAT_CACHE_REFERENCE     0x4F
+UMASK_L3_LAT_CACHE_MISS          0x41
+
+EVENT_CPU_CLOCK_UNHALTED         0x3C   PMC
+UMASK_CPU_CLOCK_UNHALTED_THREAD_P  0x00
+UMASK_CPU_CLOCK_UNHALTED_REF_P     0x01
+
+EVENT_L1D_PEND_MISS              0x48   PMC1
+UMASK_L1D_PEND_MISS_PENDING      0x01
+
+EVENT_DTLB_STORE_MISSES                0x49   PMC
+UMASK_DTLB_STORE_MISSES_CAUSES_A_WALK   0x01
+UMASK_DTLB_STORE_MISSES_WALK_COMPLETED       0x02
+UMASK_DTLB_STORE_MISSES_WALK_DURATION       0x04
+UMASK_DTLB_STORE_MISSES_STLB_HIT             0x10
+
+EVENT_LOAD_HIT_PRE               0x4C    PMC
+UMASK_LOAD_HIT_PRE_SW_PF               0x01
+UMASK_LOAD_HIT_PRE_HW_PF               0x02
+
+EVENT_L1D                        0x51   PMC
+UMASK_L1D_REPLACEMENT             0x01
+UMASK_L1D_ALLOCATED_IN_M          0x02
+UMASK_L1D_M_EVICT                 0x04
+UMASK_L1D_ALL_M_REPLACEMENT       0x08
+
+EVENT_MOVE_ELIMINATION                        0x58   PMC
+UMASK_MOVE_ELIMINATION_INT_NOT_ELIMINATED     0x04
+UMASK_MOVE_ELIMINATION_SIMD_NOT_ELIMINATED    0x08
+UMASK_MOVE_ELIMINATION_INT_ELIMINATED         0x01
+UMASK_MOVE_ELIMINATION_SIMD_ELIMINATED        0x02
+
+EVENT_CPL_CYCLES               0x5C    PMC
+UMASK_CPL_CYCLES_RING0             0x01
+UMASK_CPL_CYCLES_RING123             0x02
+
+EVENT_RS_EVENTS               0x5E    PMC
+UMASK_RS_EVENTS_EMPTY_CYCLES    0x01
+
+EVENT_OFFCORE_REQUESTS_OUTSTANDING          0x60   PMC
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_DATA_RD   0x01
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_CODE_RD   0x02
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_RFO   0x04
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_ALL_DATA_RD   0x08
+
+EVENT_CACHE_LOCK_CYCLES          0x63   PMC
+UMASK_CACHE_LOCK_CYCLES_SPLIT_LOCK_UC_LOCK_DURATION      0x01
+UMASK_CACHE_LOCK_CYCLES_CACHE_LOCK_DURATION       0x02
+
+EVENT_IDQ               0x79   PMC
+UMASK_IDQ_EMPTY         0x02
+UMASK_IDQ_MITE_UOPS     0x04
+UMASK_IDQ_DSB_UOPS      0x08
+UMASK_IDQ_MS_DSB_UOPS   0x10
+UMASK_IDQ_MS_MITE_UOPS  0x20
+UMASK_IDQ_MS_UOPS       0x30
+UMASK_IDQ_ALL_DSB_CYCLES_ANY_UOPS      0x18 0x00 0x01
+UMASK_IDQ_ALL_DSB_CYCLES_4_UOPS        0x18 0x00 0x04
+UMASK_IDQ_ALL_MITE_CYCLES_ANY_UOPS     0x24 0x00 0x01
+UMASK_IDQ_ALL_MITE_CYCLES_4_UOPS       0x24 0x00 0x04
+UMASK_IDQ_ALL_MITE_ALL_UOPS       0x3C
+
+EVENT_ICACHE                  0x80   PMC
+UMASK_ICACHE_HITS             0x01
+UMASK_ICACHE_MISSES             0x02
+UMASK_ICACHE_ACCESSES           0x03
+UMASK_ICACHE_IFETCH_STALL       0x04
+
+EVENT_ITLB_MISSES                 0x85      PMC
+UMASK_ITLB_MISSES_CAUSES_A_WALK   0x01
+UMASK_ITLB_MISSES_WALK_COMPLETED  0x02
+UMASK_ITLB_MISSES_WALK_DURATION   0x04
+UMASK_ITLB_MISSES_STLB_HIT   0x10
+
+EVENT_ILD_STALL                 0x87      PMC
+UMASK_ILD_STALL_LCP             0x01
+UMASK_ILD_STALL_IQ_FULL         0x04
+
+EVENT_BR_INST_EXEC                                      0x88   PMC
+UMASK_BR_INST_EXEC_COND_TAKEN                           0x81
+UMASK_BR_INST_EXEC_COND_NON_TAKEN                       0x41
+UMASK_BR_INST_EXEC_DIRECT_JMP_TAKEN                     0x82
+UMASK_BR_INST_EXEC_DIRECT_JMP_NON_TAKEN                 0x42
+UMASK_BR_INST_EXEC_INDIRECT_JMP_NON_CALL_RET_TAKEN      0x84
+UMASK_BR_INST_EXEC_INDIRECT_JMP_NON_CALL_RET_NON_TAKEN  0x44
+UMASK_BR_INST_EXEC_RETURN_NEAR_TAKEN                    0x88
+UMASK_BR_INST_EXEC_RETURN_NEAR_NON_TAKEN                0x48
+UMASK_BR_INST_EXEC_DIRECT_NEAR_CALL_TAKEN               0x90
+UMASK_BR_INST_EXEC_DIRECT_NEAR_CALL_NON_TAKEN           0x50
+UMASK_BR_INST_EXEC_INDIRECT_NEAR_CALL_TAKEN             0xA0 
+UMASK_BR_INST_EXEC_INDIRECT_NEAR_CALL_NON_TAKEN         0x60 
+UMASK_BR_INST_EXEC_ALL_BRANCHES                         0xFF 
+
+EVENT_BR_MISP_EXEC                                      0x89   PMC
+UMASK_BR_MISP_EXEC_COND_TAKEN                           0x81
+UMASK_BR_MISP_EXEC_COND_NON_TAKEN                       0x41
+UMASK_BR_MISP_EXEC_INDIRECT_JMP_NON_CALL_RET_TAKEN      0x84
+UMASK_BR_MISP_EXEC_INDIRECT_JMP_NON_CALL_RET_NON_TAKEN  0x44
+UMASK_BR_MISP_EXEC_RETURN_NEAR_TAKEN                    0x88
+UMASK_BR_MISP_EXEC_RETURN_NEAR_NON_TAKEN                0x48
+UMASK_BR_MISP_EXEC_DIRECT_NEAR_CALL_TAKEN               0x90
+UMASK_BR_MISP_EXEC_DIRECT_NEAR_CALL_NON_TAKEN           0x50
+UMASK_BR_MISP_EXEC_INDIRECT_NEAR_CALL_TAKEN             0xA0
+UMASK_BR_MISP_EXEC_INDIRECT_NEAR_CALL_NON_TAKEN         0x60
+UMASK_BR_MISP_EXEC_ALL_BRANCHES                         0xFF
+
+EVENT_IDQ_UOPS_NOT_DELIVERED                    0x9C   PMC
+UMASK_IDQ_UOPS_NOT_DELIVERED_CORE               0x01
+
+EVENT_UOPS_DISPATCHED_PORT                 0xA1   PMC
+UMASK_UOPS_DISPATCHED_PORT_PORT_0           0x01
+UMASK_UOPS_DISPATCHED_PORT_PORT_1           0x02
+UMASK_UOPS_DISPATCHED_PORT_PORT_2_LD        0x04
+UMASK_UOPS_DISPATCHED_PORT_PORT_2_STA       0x08
+UMASK_UOPS_DISPATCHED_PORT_PORT_2           0x0C
+UMASK_UOPS_DISPATCHED_PORT_PORT_3_LD           0x10
+UMASK_UOPS_DISPATCHED_PORT_PORT_3_STA           0x20
+UMASK_UOPS_DISPATCHED_PORT_PORT_3           0x30
+UMASK_UOPS_DISPATCHED_PORT_PORT_4           0x40
+UMASK_UOPS_DISPATCHED_PORT_PORT_5           0x80
+
+EVENT_RESOURCE_STALLS                 0xA2   PMC
+UMASK_RESOURCE_STALLS_ANY             0x01
+UMASK_RESOURCE_STALLS_RS              0x04
+UMASK_RESOURCE_STALLS_B               0x08
+UMASK_RESOURCE_STALLS_ROB             0x10
+
+EVENT_CYCLE_ACTIVITY                 0xA3   PMC
+UMASK_CYCLE_ACTIVITY_CYCLES_L2_PENDING             0x01
+UMASK_CYCLE_ACTIVITY_CYCLES_LDM_PENDING              0x02
+UMASK_CYCLE_ACTIVITY_L1D_PENDING               0x08
+UMASK_CYCLE_ACTIVITY_CYCLES_NO_EXECUTE             0x04
+
+EVENT_DSB2MITE_SWITCHES                  0xAB   PMC
+UMASK_DSB2MITE_SWITCHES_COUNT            0x01
+UMASK_DSB2MITE_SWITCHES_PENALTY_CYCLES   0x02
+
+EVENT_DSB_FILL                         0xAC   PMC
+UMASK_DSB_FILL_EXCEED_DSB_LINES        0x08
+
+EVENT_ITLB                         0xAE   PMC
+UMASK_ITLB_ITLB_FLUSH            0x01
+
+EVENT_OFFCORE_REQUESTS     0xB0   PMC
+UMASK_OFFCORE_REQUESTS_DEMAND_DATA_RD   0x01
+UMASK_OFFCORE_REQUESTS_DEMAND_CODE_RD   0x02
+UMASK_OFFCORE_REQUESTS_DEMAND_RFO       0x04
+UMASK_OFFCORE_REQUESTS_ALL_DATA_RD      0x08
+
+EVENT_UOPS_EXECUTED               0xB1   PMC
+UMASK_UOPS_EXECUTED_THREAD            0x01
+UMASK_UOPS_EXECUTED_CORE              0x02
+
+EVENT_TLB_FLUSH          0xBD  PMC
+UMASK_TLB_FLUSH_DTLB_THREAD     0x01
+UMASK_TLB_FLUSH_STLB_ANY        0x20
+
+EVENT_INST_RETIRED                  0xC0  PMC1
+UMASK_INST_RETIRED_ANY_P            0x00
+UMASK_INST_RETIRED_ALL              0x01
+
+EVENT_OTHER_ASSISTS                  0xC1  PMC
+UMASK_OTHER_ASSISTS_AVX_STORE     0x08
+UMASK_OTHER_ASSISTS_AVX_TO_SSE            0x10
+UMASK_OTHER_ASSISTS_SSE_TO_AVX            0x20
+
+EVENT_UOPS_RETIRED                  0xC2  PMC
+UMASK_UOPS_RETIRED_ALL              0x01
+UMASK_UOPS_RETIRED_RETIRE_SLOTS     0x02
+
+EVENT_MACHINE_CLEARS              0xC3  PMC
+UMASK_MACHINE_CLEARS_MEMORY_ORDERING    0x02
+UMASK_MACHINE_CLEARS_SMC                0x04
+UMASK_MACHINE_CLEARS_MASKMOV            0x20
+
+EVENT_BR_INST_RETIRED               0xC4  PMC
+UMASK_BR_INST_RETIRED_ALL_BRANCHES  0x00
+UMASK_BR_INST_RETIRED_CONDITIONAL   0x01
+UMASK_BR_INST_RETIRED_NEAR_CALL     0x02
+UMASK_BR_INST_RETIRED_ALL_BRANCHES  0x04
+UMASK_BR_INST_RETIRED_NEAR_RETURN   0x08
+UMASK_BR_INST_RETIRED_NOT_TAKEN     0x10
+UMASK_BR_INST_RETIRED_NEAR_TAKEN    0x20
+UMASK_BR_INST_RETIRED_FAR_BRANCH    0x40
+
+EVENT_BR_MISP_RETIRED               0xC5  PMC
+UMASK_BR_MISP_RETIRED_ALL_BRANCHES  0x00
+UMASK_BR_MISP_RETIRED_CONDITIONAL  0x01
+UMASK_BR_MISP_RETIRED_NEAR_CALL     0x02
+UMASK_BR_MISP_RETIRED_ALL_BRANCHES     0x04
+UMASK_BR_MISP_RETIRED_NOT_TAKEN      0x10
+UMASK_BR_MISP_RETIRED_TAKEN      0x20
+
+EVENT_FP_ASSIST               0xCA  PMC
+UMASK_FP_ASSIST_X87_OUTPUT               0x02
+UMASK_FP_ASSIST_X87_INPUT                0x04
+UMASK_FP_ASSIST_SIMD_OUTPUT               0x08
+UMASK_FP_ASSIST_SIMD_INPUT               0x10
+UMASK_FP_ASSIST_ANY               0x1E
+
+EVENT_ROB_MISC_EVENT_LBR_INSERTS               0xCC  PMC
+UMASK_ROB_MISC_EVENT_LBR_INSERTS               0x20
+
+EVENT_MEM_UOP_RETIRED            0xD0    PMC
+UMASK_MEM_UOP_RETIRED_LOADS            0x81
+UMASK_MEM_UOP_RETIRED_STORES           0x82
+UMASK_MEM_UOP_RETIRED_LOADS_STLB_MISS         0x11
+UMASK_MEM_UOP_RETIRED_STORES_STLB_MISS        0x12
+UMASK_MEM_UOP_RETIRED_LOADS_LOCK              0x21
+UMASK_MEM_UOP_RETIRED_STORES_LOCK             0x22
+UMASK_MEM_UOP_RETIRED_LOADS_SPLIT             0x41
+UMASK_MEM_UOP_RETIRED_STORES_SPLIT            0x42
+
+EVENT_MEM_LOAD_UOPS_RETIRED               0xD1   PMC
+UMASK_MEM_LOAD_UOPS_RETIRED_L1_HIT       0x01
+UMASK_MEM_LOAD_UOPS_RETIRED_L1_MISS      0x08
+UMASK_MEM_LOAD_UOPS_RETIRED_L1_ALL       0x09
+UMASK_MEM_LOAD_UOPS_RETIRED_L2_HIT       0x02
+UMASK_MEM_LOAD_UOPS_RETIRED_L2_MISS      0x10
+UMASK_MEM_LOAD_UOPS_RETIRED_L2_ALL       0x12
+UMASK_MEM_LOAD_UOPS_RETIRED_L3_HIT       0x04
+UMASK_MEM_LOAD_UOPS_RETIRED_L3_MISS      0x20
+UMASK_MEM_LOAD_UOPS_RETIRED_L3_ALL       0x24
+UMASK_MEM_LOAD_UOPS_RETIRED_HIT_LFB      0x40
+
+EVENT_MEM_LOAD_UOPS_LLC_HIT_RETIRED               0xD2   PMC
+UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_MISS         0x01
+UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HIT          0x02
+UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM         0x04
+UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_NONE         0x08
+
+EVENT_MEM_LOAD_UOPS_LLC_MISS_RETIRED               0xD3   PMC
+UMASK_MEM_LOAD_UOPS_LLC_MISS_RETIRED_LOCAL_DRAM      0x01
+
+EVENT_BACLEARS               0xE6   PMC
+UMASK_BACLEARS_ANY           0x1F
+
+EVENT_L2_TRANS               0xF0  PMC
+UMASK_L2_TRANS_DEMAND_DATA_RD          0x01
+UMASK_L2_TRANS_RFO           0x02
+UMASK_L2_TRANS_CODE_RD       0x04
+UMASK_L2_TRANS_ALL_PREF      0x08
+UMASK_L2_TRANS_L1D_WB        0x10
+UMASK_L2_TRANS_L2_FILL       0x20
+UMASK_L2_TRANS_L2_WB         0x40
+UMASK_L2_TRANS_ALL_REQUESTS  0x80
+
+EVENT_L2_LINES_IN                   0xF1   PMC
+UMASK_L2_LINES_IN_I           0x01
+UMASK_L2_LINES_IN_S            0x02
+UMASK_L2_LINES_IN_E           0x04
+UMASK_L2_LINES_IN_ALL               0x07
+
+EVENT_L2_LINES_OUT                  0xF2   PMC
+UMASK_L2_LINES_OUT_DEMAND_CLEAN     0x01
+UMASK_L2_LINES_OUT_DEMAND_DIRTY     0x02
+UMASK_L2_LINES_OUT_PF_CLEAN   0x04
+UMASK_L2_LINES_OUT_PF_DIRTY   0x08
+UMASK_L2_LINES_OUT_DIRTY_ALL              0x0A
+
+EVENT_MEM_LOAD_UOPS_LLC_MISS_RETIRED               0xD3   PMC
+UMASK_MEM_LOAD_UOPS_LLC_MISS_RETIRED_LOCAL_DRAM     0x03
+UMASK_MEM_LOAD_UOPS_LLC_MISS_RETIRED_REMOTE_DRAM     0x0C
+UMASK_MEM_LOAD_UOPS_LLC_MISS_RETIRED_REMOTE_HITM     0x10
+UMASK_MEM_LOAD_UOPS_LLC_MISS_RETIRED_REMOTE_FWD     0x20
+
+EVENT_OFFCORE_RESPONSE_0                            0xB7 PMC
+OPTIONS_OFFCORE_RESPONSE_0_OPTIONS                  EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_0_OPTIONS                    0x01 0xFF 0xFF
+OPTIONS_OFFCORE_RESPONSE_0_LOCAL_DRAM               EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+DEFAULT_OPTIONS_OFFCORE_RESPONSE_0_LOCAL_DRAM       EVENT_OPTION_MATCH0=0x08FFF,EVENT_OPTION_MATCH1=0x78040
+UMASK_OFFCORE_RESPONSE_0_LOCAL_DRAM                 0x01 0xFF 0xFF
+OPTIONS_OFFCORE_RESPONSE_0_REMOTE_DRAM              EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+DEFAULT_OPTIONS_OFFCORE_RESPONSE_0_REMOTE_DRAM      EVENT_OPTION_MATCH0=0x08FFF,EVENT_OPTION_MATCH1=0x7FF80
+UMASK_OFFCORE_RESPONSE_0_REMOTE_DRAM                0x01 0xFF 0xFF
+UMASK_OFFCORE_RESPONSE_0_DMND_DATA_RD_ANY           0x01 0x00 0x10
+UMASK_OFFCORE_RESPONSE_0_DMND_RFO_ANY               0x01 0x01 0x10
+UMASK_OFFCORE_RESPONSE_0_DMND_CODE_RD_ANY           0x01 0x02 0x10
+UMASK_OFFCORE_RESPONSE_0_WB_ANY                     0x01 0x03 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_DATA_RD_ANY          0x01 0x04 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_RFO_ANY              0x01 0x05 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_CODE_RD_ANY          0x01 0x06 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_DATA_RD_ANY          0x01 0x07 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_RFO_ANY              0x01 0x08 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_CODE_RD_ANY          0x01 0x09 0x10
+UMASK_OFFCORE_RESPONSE_0_SPLIT_LOCK_UC_LOCK_ANY     0x01 0x0A 0x10
+UMASK_OFFCORE_RESPONSE_0_STREAMING_STORES_ANY       0x01 0x0B 0x10
+UMASK_OFFCORE_RESPONSE_0_OTHER_ANY                  0x01 0x0F 0x10
+
+EVENT_OFFCORE_RESPONSE_1                            0xBB PMC
+OPTIONS_OFFCORE_RESPONSE_1_OPTIONS                  EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_1_OPTIONS                    0x01 0xFF 0xFF
+OPTIONS_OFFCORE_RESPONSE_1_LOCAL_DRAM               EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+DEFAULT_OPTIONS_OFFCORE_RESPONSE_1_LOCAL_DRAM       EVENT_OPTION_MATCH0=0x08FFF,EVENT_OPTION_MATCH1=0x78040
+UMASK_OFFCORE_RESPONSE_1_LOCAL_DRAM                 0x01 0xFF 0xFF
+OPTIONS_OFFCORE_RESPONSE_1_REMOTE_DRAM              EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+DEFAULT_OPTIONS_OFFCORE_RESPONSE_1_REMOTE_DRAM      EVENT_OPTION_MATCH0=0x08FFF,EVENT_OPTION_MATCH1=0x7FF80
+UMASK_OFFCORE_RESPONSE_1_REMOTE_DRAM                0x01 0xFF 0xFF
+UMASK_OFFCORE_RESPONSE_1_DMND_DATA_RD_ANY           0x01 0x00 0x10
+UMASK_OFFCORE_RESPONSE_1_DMND_RFO_ANY               0x01 0x01 0x10
+UMASK_OFFCORE_RESPONSE_1_DMND_IFETCH_ANY            0x01 0x02 0x10
+UMASK_OFFCORE_RESPONSE_1_WB_ANY                     0x01 0x03 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_DATA_RD_ANY          0x01 0x04 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_RFO_ANY              0x01 0x05 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_CODE_RD_ANY          0x01 0x06 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_DATA_RD_ANY          0x01 0x07 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_RFO_ANY              0x01 0x08 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_CODE_RD_ANY          0x01 0x09 0x10
+UMASK_OFFCORE_RESPONSE_1_SPLIT_LOCK_UC_LOCK_ANY     0x01 0x0A 0x10
+UMASK_OFFCORE_RESPONSE_1_STREAMING_STORES_ANY       0x01 0x0B 0x10
+UMASK_OFFCORE_RESPONSE_1_OTHER_ANY                  0x01 0x0F 0x10
+
+EVENT_CBOX_CLOCKTICKS                         0x00  CBOX
+UMASK_CBOX_CLOCKTICKS                         0x00
+
+EVENT_COUNTER0_OCCUPANCY              0x1F  CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1|CBOX8C1|CBOX9C1|CBOX10C1|CBOX11C1|CBOX12C1|CBOX13C1|CBOX14C1|CBOX0C2|CBOX1C2|CBOX2C2|CBOX3C2|CBOX4C2|CBOX5C2|CBOX6C2|CBOX7C2|CBOX8C2|CBOX9C2|CBOX10C2|CBOX11C2|CBOX12C2|CBOX13C2|CBOX14C2|CBOX0C3|CBOX1C3|CBOX2C3|CBOX3C3|CBOX4C3|CBOX5C3|CBOX6C3|CBOX7C3|CBOX8C3|CBOX9C3|CBOX10C3|CBOX11C3|CBOX12C3|CBOX13C3|CBOX14C3
+UMASK_COUNTER0_OCCUPANCY              0x00
+
+EVENT_LLC_LOOKUP              0x34  CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX8C0|CBOX9C0|CBOX10C0|CBOX11C0|CBOX12C0|CBOX13C0|CBOX14C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1|CBOX8C1|CBOX9C1|CBOX10C1|CBOX11C1|CBOX12C1|CBOX13C1|CBOX14C1
+DEFAULT_OPTIONS_LLC_LOOKUP          EVENT_OPTION_STATE=0x1F
+OPTIONS_LLC_LOOKUP_DATA_READ        EVENT_OPTION_STATE_MASK
+UMASK_LLC_LOOKUP_DATA_READ          0x03
+OPTIONS_LLC_LOOKUP_WRITE            EVENT_OPTION_STATE_MASK
+UMASK_LLC_LOOKUP_WRITE              0x05
+OPTIONS_LLC_LOOKUP_REMOTE_SNOOP     EVENT_OPTION_STATE_MASK
+UMASK_LLC_LOOKUP_REMOTE_SNOOP       0x09
+OPTIONS_LLC_LOOKUP_ANY              EVENT_OPTION_STATE_MASK
+UMASK_LLC_LOOKUP_ANY                0x11
+OPTIONS_LLC_LOOKUP_NID              EVENT_OPTION_NID_MASK|EVENT_OPTION_STATE_MASK
+UMASK_LLC_LOOKUP_NID                0x41
+
+EVENT_LLC_VICTIMS              0x37  CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX8C0|CBOX9C0|CBOX10C0|CBOX11C0|CBOX12C0|CBOX13C0|CBOX14C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1|CBOX8C1|CBOX9C1|CBOX10C1|CBOX11C1|CBOX12C1|CBOX13C1|CBOX14C1
+UMASK_LLC_VICTIMS_M_STATE      0x01
+UMASK_LLC_VICTIMS_E_STATE      0x02
+UMASK_LLC_VICTIMS_S_STATE      0x04
+UMASK_LLC_VICTIMS_ANY          0x07
+UMASK_LLC_VICTIMS_MISS         0x08
+OPTIONS_LLC_VICTIMS_NID        EVENT_OPTION_NID_MASK
+UMASK_LLC_VICTIMS_NID          0x40
+
+EVENT_CBO_MISC              0x39  CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX8C0|CBOX9C0|CBOX10C0|CBOX11C0|CBOX12C0|CBOX13C0|CBOX14C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1|CBOX8C1|CBOX9C1|CBOX10C1|CBOX11C1|CBOX12C1|CBOX13C1|CBOX14C1
+UMASK_CBO_MISC_RSPI_WAS_FSE      0x01
+UMASK_CBO_MISC_WC_ALIASING       0x02
+UMASK_CBO_MISC_STARTED           0x04
+UMASK_CBO_MISC_RFO_HIT_S         0x08
+
+EVENT_RING_AD_USED               0x1B  CBOX0C2|CBOX1C2|CBOX2C2|CBOX3C2|CBOX4C2|CBOX5C2|CBOX6C2|CBOX7C2|CBOX8C2|CBOX9C2|CBOX10C2|CBOX11C2|CBOX12C2|CBOX13C2|CBOX14C2|CBOX0C3|CBOX1C3|CBOX2C3|CBOX3C3|CBOX4C3|CBOX5C3|CBOX6C3|CBOX7C3|CBOX8C3|CBOX9C3|CBOX10C3|CBOX11C3|CBOX12C3|CBOX13C3|CBOX14C3
+UMASK_RING_AD_USED_0_UP_EVEN      0x01
+UMASK_RING_AD_USED_0_UP_ODD       0x02
+UMASK_RING_AD_USED_0_DOWN_EVEN    0x04
+UMASK_RING_AD_USED_0_DOWN_ODD     0x08
+UMASK_RING_AD_USED_1_UP_EVEN      0x10
+UMASK_RING_AD_USED_1_UP_ODD       0x20
+UMASK_RING_AD_USED_1_DOWN_EVEN    0x40
+UMASK_RING_AD_USED_1_DOWN_ODD     0x80
+UMASK_RING_AD_USED_DOWN           0xCC
+UMASK_RING_AD_USED_UP             0x33
+
+EVENT_RING_AK_USED              0x1C  CBOX0C2|CBOX1C2|CBOX2C2|CBOX3C2|CBOX4C2|CBOX5C2|CBOX6C2|CBOX7C2|CBOX8C2|CBOX9C2|CBOX10C2|CBOX11C2|CBOX12C2|CBOX13C2|CBOX14C2|CBOX0C3|CBOX1C3|CBOX2C3|CBOX3C3|CBOX4C3|CBOX5C3|CBOX6C3|CBOX7C3|CBOX8C3|CBOX9C3|CBOX10C3|CBOX11C3|CBOX12C3|CBOX13C3|CBOX14C3
+UMASK_RING_AK_USED_0_UP_EVEN      0x01
+UMASK_RING_AK_USED_0_UP_ODD       0x02
+UMASK_RING_AK_USED_0_DOWN_EVEN    0x04
+UMASK_RING_AK_USED_0_DOWN_ODD     0x08
+UMASK_RING_AK_USED_1_UP_EVEN      0x10
+UMASK_RING_AK_USED_1_UP_ODD       0x20
+UMASK_RING_AK_USED_1_DOWN_EVEN    0x40
+UMASK_RING_AK_USED_1_DOWN_ODD     0x80
+UMASK_RING_AK_USED_DOWN           0xCC
+UMASK_RING_AK_USED_UP             0x33
+
+EVENT_RING_BL_USED              0x1D  CBOX0C2|CBOX1C2|CBOX2C2|CBOX3C2|CBOX4C2|CBOX5C2|CBOX6C2|CBOX7C2|CBOX8C2|CBOX9C2|CBOX10C2|CBOX11C2|CBOX12C2|CBOX13C2|CBOX14C2|CBOX0C3|CBOX1C3|CBOX2C3|CBOX3C3|CBOX4C3|CBOX5C3|CBOX6C3|CBOX7C3|CBOX8C3|CBOX9C3|CBOX10C3|CBOX11C3|CBOX12C3|CBOX13C3|CBOX14C3
+UMASK_RING_BL_USED_0_UP_EVEN      0x01
+UMASK_RING_BL_USED_0_UP_ODD       0x02
+UMASK_RING_BL_USED_0_DOWN_EVEN    0x04
+UMASK_RING_BL_USED_0_DOWN_ODD     0x08
+UMASK_RING_BL_USED_1_UP_EVEN      0x10
+UMASK_RING_BL_USED_1_UP_ODD       0x20
+UMASK_RING_BL_USED_1_DOWN_EVEN    0x40
+UMASK_RING_BL_USED_1_DOWN_ODD     0x80
+UMASK_RING_BL_USED_DOWN           0xCC
+UMASK_RING_BL_USED_UP             0x33
+
+EVENT_RING_BOUNCES              0x05  CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX8C0|CBOX9C0|CBOX10C0|CBOX11C0|CBOX12C0|CBOX13C0|CBOX14C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1|CBOX8C1|CBOX9C1|CBOX10C1|CBOX11C1|CBOX12C1|CBOX13C1|CBOX14C1
+UMASK_RING_BOUNCES_AK_IRQ       0x02
+UMASK_RING_BOUNCES_AK_CORE      0x04
+UMASK_RING_BOUNCES_BL_CORE      0x08
+UMASK_RING_BOUNCES_IV_CORE      0x01
+
+EVENT_RING_IV_USED              0x1E  CBOX0C2|CBOX1C2|CBOX2C2|CBOX3C2|CBOX4C2|CBOX5C2|CBOX6C2|CBOX7C2|CBOX8C2|CBOX9C2|CBOX10C2|CBOX11C2|CBOX12C2|CBOX13C2|CBOX14C2|CBOX0C3|CBOX1C3|CBOX2C3|CBOX3C3|CBOX4C3|CBOX5C3|CBOX6C3|CBOX7C3|CBOX8C3|CBOX9C3|CBOX10C3|CBOX11C3|CBOX12C3|CBOX13C3|CBOX14C3
+UMASK_RING_IV_USED_ANY           0x0F
+UMASK_RING_IV_USED_UP            0x33
+UMASK_RING_IV_USED_DOWN          0xCC
+
+EVENT_RING_SRC_THRTL            0x07  CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX8C0|CBOX9C0|CBOX10C0|CBOX11C0|CBOX12C0|CBOX13C0|CBOX14C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1|CBOX8C1|CBOX9C1|CBOX10C1|CBOX11C1|CBOX12C1|CBOX13C1|CBOX14C1
+UMASK_RING_SRC_THRTL            0x00
+
+EVENT_RXR_EXT_STARVED               0x12  CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX8C0|CBOX9C0|CBOX10C0|CBOX11C0|CBOX12C0|CBOX13C0|CBOX14C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1|CBOX8C1|CBOX9C1|CBOX10C1|CBOX11C1|CBOX12C1|CBOX13C1|CBOX14C1
+UMASK_RXR_EXT_STARVED_IRQ           0x01
+UMASK_RXR_EXT_STARVED_IPQ           0x02
+UMASK_RXR_EXT_STARVED_PRQ           0x04
+UMASK_RXR_EXT_STARVED_ISMQ_BIDS     0x08
+
+EVENT_RXR_INSERTS                0x13  CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX8C0|CBOX9C0|CBOX10C0|CBOX11C0|CBOX12C0|CBOX13C0|CBOX14C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1|CBOX8C1|CBOX9C1|CBOX10C1|CBOX11C1|CBOX12C1|CBOX13C1|CBOX14C1
+UMASK_RXR_INSERTS_IRQ            0x01
+UMASK_RXR_INSERTS_IRQ_REJECTED   0x02
+UMASK_RXR_INSERTS_IPQ            0x04
+UMASK_RXR_INSERTS_VFIFO          0x10
+
+EVENT_RXR_IPQ_RETRY                0x31  CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX8C0|CBOX9C0|CBOX10C0|CBOX11C0|CBOX12C0|CBOX13C0|CBOX14C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1|CBOX8C1|CBOX9C1|CBOX10C1|CBOX11C1|CBOX12C1|CBOX13C1|CBOX14C1
+UMASK_RXR_IPQ_RETRY_ANY            0x01
+UMASK_RXR_IPQ_RETRY_FULL           0x02
+UMASK_RXR_IPQ_RETRY_ADDR_CONFLICT  0x04
+UMASK_RXR_IPQ_RETRY_QPI_CREDITS    0x10
+
+EVENT_RXR_IRQ_RETRY                0x32  CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX8C0|CBOX9C0|CBOX10C0|CBOX11C0|CBOX12C0|CBOX13C0|CBOX14C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1|CBOX8C1|CBOX9C1|CBOX10C1|CBOX11C1|CBOX12C1|CBOX13C1|CBOX14C1
+UMASK_RXR_IRQ_RETRY_ANY            0x01
+UMASK_RXR_IRQ_RETRY_FULL           0x02
+UMASK_RXR_IRQ_RETRY_ADDR_CONFLICT  0x04
+UMASK_RXR_IRQ_RETRY_RTID           0x08
+UMASK_RXR_IRQ_RETRY_QPI_CREDITS    0x10
+UMASK_RXR_IRQ_RETRY_HO_CREDITS     0x20
+
+EVENT_RXR_ISMQ_RETRY                0x33  CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX8C0|CBOX9C0|CBOX10C0|CBOX11C0|CBOX12C0|CBOX13C0|CBOX14C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1|CBOX8C1|CBOX9C1|CBOX10C1|CBOX11C1|CBOX12C1|CBOX13C1|CBOX14C1
+UMASK_RXR_ISMQ_RETRY_ANY            0x01
+UMASK_RXR_ISMQ_RETRY_FULL           0x02
+UMASK_RXR_ISMQ_RETRY_RTID           0x08
+UMASK_RXR_ISMQ_RETRY_QPI_CREDITS    0x10
+UMASK_RXR_ISMQ_RETRY_HO_CREDITS     0x20
+UMASK_RXR_ISMQ_RETRY_WB_CREDITS     0x80
+
+EVENT_RXR_OCCUPANCY                0x11  CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX8C0|CBOX9C0|CBOX10C0|CBOX11C0|CBOX12C0|CBOX13C0|CBOX14C0
+UMASK_RXR_OCCUPANCY_IRQ            0x01
+UMASK_RXR_OCCUPANCY_IRQ_REJECTED   0x02
+UMASK_RXR_OCCUPANCY_IPQ            0x04
+UMASK_RXR_OCCUPANCY_VFIFO          0x10
+
+EVENT_TOR_INSERTS                    0x35  CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX8C0|CBOX9C0|CBOX10C0|CBOX11C0|CBOX12C0|CBOX13C0|CBOX14C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1|CBOX8C1|CBOX9C1|CBOX10C1|CBOX11C1|CBOX12C1|CBOX13C1|CBOX14C1
+UMASK_TOR_INSERTS_OPCODE             0x01
+UMASK_TOR_INSERTS_MISS_OPCODE        0x03
+UMASK_TOR_INSERTS_EVICTION           0x04
+UMASK_TOR_INSERTS_ALL                0x08
+UMASK_TOR_INSERTS_WB                 0x10
+UMASK_TOR_INSERTS_MISS_ALL           0x0A
+UMASK_TOR_INSERTS_MISS_LOCAL         0x2A
+UMASK_TOR_INSERTS_MISS_LOCAL_OPCODE  0x23
+UMASK_TOR_INSERTS_NID_OPCODE         0x41
+UMASK_TOR_INSERTS_NID_EVICTION       0x44
+UMASK_TOR_INSERTS_NID_ALL            0x48
+UMASK_TOR_INSERTS_NID_WB             0x50
+UMASK_TOR_INSERTS_NID_MISS_OPCODE    0x43
+UMASK_TOR_INSERTS_NID_MISS_ALL       0x4A
+UMASK_TOR_INSERTS_REMOTE_OPCODE      0x81
+UMASK_TOR_INSERTS_MISS_REMOTE_OPCODE   0x83
+UMASK_TOR_INSERTS_REMOTE             0x88
+UMASK_TOR_INSERTS_MISS_REMOTE        0x8A
+
+EVENT_TOR_OCCUPANCY                    0x36  CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX8C0|CBOX9C0|CBOX10C0|CBOX11C0|CBOX12C0|CBOX13C0|CBOX14C0
+UMASK_TOR_OCCUPANCY_OPCODE             0x01
+UMASK_TOR_OCCUPANCY_MISS_OPCODE        0x03
+UMASK_TOR_OCCUPANCY_EVICTION           0x04
+UMASK_TOR_OCCUPANCY_ALL                0x08
+UMASK_TOR_OCCUPANCY_MISS_ALL           0x0A
+UMASK_TOR_OCCUPANCY_WB                 0x10
+UMASK_TOR_OCCUPANCY_LOCAL_OPCODE       0x21
+UMASK_TOR_OCCUPANCY_MISS_LOCAL_OPCODE  0x23
+UMASK_TOR_OCCUPANCY_LOCAL              0x28
+UMASK_TOR_OCCUPANCY_MISS_LOCAL         0x2A
+UMASK_TOR_OCCUPANCY_NID_OPCODE         0x41
+UMASK_TOR_OCCUPANCY_NID_EVICTION       0x44
+UMASK_TOR_OCCUPANCY_NID_ALL            0x48
+UMASK_TOR_OCCUPANCY_NID_MISS_OPCODE    0x43
+UMASK_TOR_OCCUPANCY_NID_MISS_ALL       0x4A
+UMASK_TOR_OCCUPANCY_NID_WB             0x50
+UMASK_TOR_OCCUPANCY_REMOTE_OPCODE       0x81
+UMASK_TOR_OCCUPANCY_MISS_REMOTE_OPCODE  0x83
+UMASK_TOR_OCCUPANCY_REMOTE              0x88
+UMASK_TOR_OCCUPANCY_MISS_REMOTE         0x8A
+
+EVENT_TXR_ADS_USED                0x04  CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX8C0|CBOX9C0|CBOX10C0|CBOX11C0|CBOX12C0|CBOX13C0|CBOX14C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1|CBOX8C1|CBOX9C1|CBOX10C1|CBOX11C1|CBOX12C1|CBOX13C1|CBOX14C1
+UMASK_TXR_ADS_USED_AD            0x01
+UMASK_TXR_ADS_USED_AK            0x02
+UMASK_TXR_ADS_USED_BL            0x04
+
+EVENT_TXR_INSERTS                0x02  CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX8C0|CBOX9C0|CBOX10C0|CBOX11C0|CBOX12C0|CBOX13C0|CBOX14C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1|CBOX8C1|CBOX9C1|CBOX10C1|CBOX11C1|CBOX12C1|CBOX13C1|CBOX14C1
+UMASK_TXR_INSERTS_AD_CACHE            0x01
+UMASK_TXR_INSERTS_AK_CACHE            0x02
+UMASK_TXR_INSERTS_BL_CACHE            0x04
+UMASK_TXR_INSERTS_IV_CACHE            0x08
+UMASK_TXR_INSERTS_AD_CORE             0x10
+UMASK_TXR_INSERTS_AK_CORE             0x20
+UMASK_TXR_INSERTS_BL_CORE             0x40
+
+EVENT_DRAM_CLOCKTICKS             0x00  MBOX0FIX|MBOX1FIX|MBOX2FIX|MBOX3FIX
+UMASK_DRAM_CLOCKTICKS             0x00
+
+EVENT_ACT_COUNT                  0x01  MBOX
+UMASK_ACT_COUNT_RD                 0x01
+UMASK_ACT_COUNT_WR                 0x02
+UMASK_ACT_COUNT_BYP                0x08
+
+EVENT_BYP_CMDS                  0xA1  MBOX
+UMASK_BYP_CMDS_ACT                 0x01
+UMASK_BYP_CMDS_CAS                 0x02
+UMASK_BYP_CMDS_PRE                 0x04
+
+EVENT_CAS_COUNT                  0x04  MBOX
+UMASK_CAS_COUNT_RD_REG           0x01
+UMASK_CAS_COUNT_RD_UNDERFILL     0x02
+UMASK_CAS_COUNT_RD               0x03
+UMASK_CAS_COUNT_WR_WMM           0x04
+UMASK_CAS_COUNT_WR_RMM           0x08
+UMASK_CAS_COUNT_WR               0x0C
+UMASK_CAS_COUNT_ALL              0x0F
+UMASK_CAS_COUNT_RD_WMM           0x10
+UMASK_CAS_COUNT_RD_RMM           0x20
+
+EVENT_DRAM_PRE_ALL                  0x06  MBOX
+UMASK_DRAM_PRE_ALL                  0x00
+
+EVENT_DRAM_REFRESH                  0x05  MBOX
+UMASK_DRAM_REFRESH_PANIC            0x02
+UMASK_DRAM_REFRESH_HIGH             0x04
+
+EVENT_ECC_CORRECTABLE_ERRORS           0x09  MBOX
+UMASK_ECC_CORRECTABLE_ERRORS           0x00
+
+EVENT_MAJOR_MODES                  0x07  MBOX
+UMASK_MAJOR_MODES_READ             0x01
+UMASK_MAJOR_MODES_WRITE            0x02
+UMASK_MAJOR_MODES_PARTIAL          0x04
+UMASK_MAJOR_MODES_ISOCH            0x08
+
+EVENT_POWER_CHANNEL_DLLOFF           0x84  MBOX
+UMASK_POWER_CHANNEL_DLLOFF           0x00
+
+EVENT_POWER_CHANNEL_PPD           0x85  MBOX
+UMASK_POWER_CHANNEL_PPD           0x00
+
+EVENT_POWER_CKE_CYCLES                  0x83  MBOX
+UMASK_POWER_CKE_CYCLES_RANK0            0x01
+UMASK_POWER_CKE_CYCLES_RANK1            0x02
+UMASK_POWER_CKE_CYCLES_RANK2            0x04
+UMASK_POWER_CKE_CYCLES_RANK3            0x08
+UMASK_POWER_CKE_CYCLES_RANK4            0x10
+UMASK_POWER_CKE_CYCLES_RANK5            0x20
+UMASK_POWER_CKE_CYCLES_RANK6            0x40
+UMASK_POWER_CKE_CYCLES_RANK7            0x80
+
+EVENT_POWER_CRITICAL_THROTTLE_CYCLES           0x86  MBOX
+UMASK_POWER_CRITICAL_THROTTLE_CYCLES           0x00
+
+EVENT_POWER_PCU_THROTTLING           0x42  MBOX
+UMASK_POWER_PCU_THROTTLING           0x00
+
+EVENT_POWER_SELF_REFRESH           0x43  MBOX
+UMASK_POWER_SELF_REFRESH           0x00
+
+EVENT_POWER_THROTTLE_CYCLES                  0x41  MBOX
+UMASK_POWER_THROTTLE_CYCLES_RANK0            0x01
+UMASK_POWER_THROTTLE_CYCLES_RANK1            0x02
+UMASK_POWER_THROTTLE_CYCLES_RANK2            0x04
+UMASK_POWER_THROTTLE_CYCLES_RANK3            0x08
+UMASK_POWER_THROTTLE_CYCLES_RANK4            0x10
+UMASK_POWER_THROTTLE_CYCLES_RANK5            0x20
+UMASK_POWER_THROTTLE_CYCLES_RANK6            0x40
+UMASK_POWER_THROTTLE_CYCLES_RANK7            0x80
+
+EVENT_PREEMPTION           0x08  MBOX
+UMASK_PREEMPTION_RD_PREEMPT_RD           0x01
+UMASK_PREEMPTION_RD_PREEMPT_WR           0x02
+
+EVENT_PRE_COUNT           0x02  MBOX
+UMASK_PRE_COUNT_PAGE_MISS           0x01
+UMASK_PRE_COUNT_PAGE_CLOSE           0x02
+
+EVENT_RD_CAS_PRIO           0xA0  MBOX
+UMASK_RD_CAS_PRIO_LOW           0x01
+UMASK_RD_CAS_PRIO_MED           0x02
+UMASK_RD_CAS_PRIO_HIGH          0x04
+UMASK_RD_CAS_PRIO_PANIC         0x08
+
+EVENT_RD_CAS_RANK0           0xB0  MBOX
+UMASK_RD_CAS_RANK0_BANK0           0x01
+UMASK_RD_CAS_RANK0_BANK1           0x02
+UMASK_RD_CAS_RANK0_BANK2           0x04
+UMASK_RD_CAS_RANK0_BANK3           0x08
+UMASK_RD_CAS_RANK0_BANK4           0x10
+UMASK_RD_CAS_RANK0_BANK5           0x20
+UMASK_RD_CAS_RANK0_BANK6           0x40
+UMASK_RD_CAS_RANK0_BANK7           0x80
+
+EVENT_RD_CAS_RANK1           0xB1  MBOX
+UMASK_RD_CAS_RANK1_BANK0           0x01
+UMASK_RD_CAS_RANK1_BANK1           0x02
+UMASK_RD_CAS_RANK1_BANK2           0x04
+UMASK_RD_CAS_RANK1_BANK3           0x08
+UMASK_RD_CAS_RANK1_BANK4           0x10
+UMASK_RD_CAS_RANK1_BANK5           0x20
+UMASK_RD_CAS_RANK1_BANK6           0x40
+UMASK_RD_CAS_RANK1_BANK7           0x80
+
+EVENT_RD_CAS_RANK2           0xB2  MBOX
+UMASK_RD_CAS_RANK2_BANK0           0x01
+UMASK_RD_CAS_RANK2_BANK1           0x02
+UMASK_RD_CAS_RANK2_BANK2           0x04
+UMASK_RD_CAS_RANK2_BANK3           0x08
+UMASK_RD_CAS_RANK2_BANK4           0x10
+UMASK_RD_CAS_RANK2_BANK5           0x20
+UMASK_RD_CAS_RANK2_BANK6           0x40
+UMASK_RD_CAS_RANK2_BANK7           0x80
+
+EVENT_RD_CAS_RANK3           0xB3  MBOX
+UMASK_RD_CAS_RANK3_BANK0           0x01
+UMASK_RD_CAS_RANK3_BANK1           0x02
+UMASK_RD_CAS_RANK3_BANK2           0x04
+UMASK_RD_CAS_RANK3_BANK3           0x08
+UMASK_RD_CAS_RANK3_BANK4           0x10
+UMASK_RD_CAS_RANK3_BANK5           0x20
+UMASK_RD_CAS_RANK3_BANK6           0x40
+UMASK_RD_CAS_RANK3_BANK7           0x80
+
+EVENT_RD_CAS_RANK4           0xB4  MBOX
+UMASK_RD_CAS_RANK4_BANK0           0x01
+UMASK_RD_CAS_RANK4_BANK1           0x02
+UMASK_RD_CAS_RANK4_BANK2           0x04
+UMASK_RD_CAS_RANK4_BANK3           0x08
+UMASK_RD_CAS_RANK4_BANK4           0x10
+UMASK_RD_CAS_RANK4_BANK5           0x20
+UMASK_RD_CAS_RANK4_BANK6           0x40
+UMASK_RD_CAS_RANK4_BANK7           0x80
+
+EVENT_RD_CAS_RANK5           0xB5  MBOX
+UMASK_RD_CAS_RANK5_BANK0           0x01
+UMASK_RD_CAS_RANK5_BANK1           0x02
+UMASK_RD_CAS_RANK5_BANK2           0x04
+UMASK_RD_CAS_RANK5_BANK3           0x08
+UMASK_RD_CAS_RANK5_BANK4           0x10
+UMASK_RD_CAS_RANK5_BANK5           0x20
+UMASK_RD_CAS_RANK5_BANK6           0x40
+UMASK_RD_CAS_RANK5_BANK7           0x80
+
+EVENT_RD_CAS_RANK6           0xB6  MBOX
+UMASK_RD_CAS_RANK6_BANK0           0x01
+UMASK_RD_CAS_RANK6_BANK1           0x02
+UMASK_RD_CAS_RANK6_BANK2           0x04
+UMASK_RD_CAS_RANK6_BANK3           0x08
+UMASK_RD_CAS_RANK6_BANK4           0x10
+UMASK_RD_CAS_RANK6_BANK5           0x20
+UMASK_RD_CAS_RANK6_BANK6           0x40
+UMASK_RD_CAS_RANK6_BANK7           0x80
+
+EVENT_RD_CAS_RANK7           0xB7  MBOX
+UMASK_RD_CAS_RANK7_BANK0           0x01
+UMASK_RD_CAS_RANK7_BANK1           0x02
+UMASK_RD_CAS_RANK7_BANK2           0x04
+UMASK_RD_CAS_RANK7_BANK3           0x08
+UMASK_RD_CAS_RANK7_BANK4           0x10
+UMASK_RD_CAS_RANK7_BANK5           0x20
+UMASK_RD_CAS_RANK7_BANK6           0x40
+UMASK_RD_CAS_RANK7_BANK7           0x80
+
+EVENT_RPQ_CYCLES_NE           0x11  MBOX
+UMASK_RPQ_CYCLES_NE           0x00
+
+EVENT_RPQ_INSERTS           0x10  MBOX
+UMASK_RPQ_INSERTS           0x00
+
+EVENT_VMSE_MXB_WR_OCCUPANCY           0x91  MBOX
+UMASK_VMSE_MXB_WR_OCCUPANCY           0x00
+
+EVENT_VMSE_WR_PUSH           0x90  MBOX
+UMASK_VMSE_WR_PUSH           0x00
+
+EVENT_WMM_TO_RMM           0xC0  MBOX
+UMASK_WMM_TO_RMM           0x00
+
+EVENT_WPQ_CYCLES_FULL           0x22  MBOX
+UMASK_WPQ_CYCLES_FULL           0x00
+
+EVENT_WPQ_CYCLES_NE           0x21  MBOX
+UMASK_WPQ_CYCLES_NE           0x00
+
+EVENT_WPQ_INSERTS           0x20  MBOX
+UMASK_WPQ_INSERTS           0x00
+
+EVENT_WPQ_READ_HIT           0x23  MBOX
+UMASK_WPQ_READ_HIT           0x00
+
+EVENT_WPQ_WRITE_HIT           0x24  MBOX
+UMASK_WPQ_WRITE_HIT           0x00
+
+EVENT_WRONG_MM           0xC1  MBOX
+UMASK_WRONG_MM           0x00
+
+EVENT_WR_CAS_RANK0           0xB8  MBOX
+UMASK_WR_CAS_RANK0_BANK0           0x01
+UMASK_WR_CAS_RANK0_BANK1           0x02
+UMASK_WR_CAS_RANK0_BANK2           0x04
+UMASK_WR_CAS_RANK0_BANK3           0x08
+UMASK_WR_CAS_RANK0_BANK4           0x10
+UMASK_WR_CAS_RANK0_BANK5           0x20
+UMASK_WR_CAS_RANK0_BANK6           0x40
+UMASK_WR_CAS_RANK0_BANK7           0x80
+
+EVENT_WR_CAS_RANK1           0xB9  MBOX
+UMASK_WR_CAS_RANK1_BANK0           0x01
+UMASK_WR_CAS_RANK1_BANK1           0x02
+UMASK_WR_CAS_RANK1_BANK2           0x04
+UMASK_WR_CAS_RANK1_BANK3           0x08
+UMASK_WR_CAS_RANK1_BANK4           0x10
+UMASK_WR_CAS_RANK1_BANK5           0x20
+UMASK_WR_CAS_RANK1_BANK6           0x40
+UMASK_WR_CAS_RANK1_BANK7           0x80
+
+EVENT_WR_CAS_RANK2           0xBA  MBOX
+UMASK_WR_CAS_RANK2_BANK0           0x01
+UMASK_WR_CAS_RANK2_BANK1           0x02
+UMASK_WR_CAS_RANK2_BANK2           0x04
+UMASK_WR_CAS_RANK2_BANK3           0x08
+UMASK_WR_CAS_RANK2_BANK4           0x10
+UMASK_WR_CAS_RANK2_BANK5           0x20
+UMASK_WR_CAS_RANK2_BANK6           0x40
+UMASK_WR_CAS_RANK2_BANK7           0x80
+
+EVENT_WR_CAS_RANK3           0xBB  MBOX
+UMASK_WR_CAS_RANK3_BANK0           0x01
+UMASK_WR_CAS_RANK3_BANK1           0x02
+UMASK_WR_CAS_RANK3_BANK2           0x04
+UMASK_WR_CAS_RANK3_BANK3           0x08
+UMASK_WR_CAS_RANK3_BANK4           0x10
+UMASK_WR_CAS_RANK3_BANK5           0x20
+UMASK_WR_CAS_RANK3_BANK6           0x40
+UMASK_WR_CAS_RANK3_BANK7           0x80
+
+EVENT_WR_CAS_RANK4           0xBC  MBOX
+UMASK_WR_CAS_RANK4_BANK0           0x01
+UMASK_WR_CAS_RANK4_BANK1           0x02
+UMASK_WR_CAS_RANK4_BANK2           0x04
+UMASK_WR_CAS_RANK4_BANK3           0x08
+UMASK_WR_CAS_RANK4_BANK4           0x10
+UMASK_WR_CAS_RANK4_BANK5           0x20
+UMASK_WR_CAS_RANK4_BANK6           0x40
+UMASK_WR_CAS_RANK4_BANK7           0x80
+
+EVENT_WR_CAS_RANK5           0xBD  MBOX
+UMASK_WR_CAS_RANK5_BANK0           0x01
+UMASK_WR_CAS_RANK5_BANK1           0x02
+UMASK_WR_CAS_RANK5_BANK2           0x04
+UMASK_WR_CAS_RANK5_BANK3           0x08
+UMASK_WR_CAS_RANK5_BANK4           0x10
+UMASK_WR_CAS_RANK5_BANK5           0x20
+UMASK_WR_CAS_RANK5_BANK6           0x40
+UMASK_WR_CAS_RANK5_BANK7           0x80
+
+EVENT_WR_CAS_RANK6           0xBE  MBOX
+UMASK_WR_CAS_RANK6_BANK0           0x01
+UMASK_WR_CAS_RANK6_BANK1           0x02
+UMASK_WR_CAS_RANK6_BANK2           0x04
+UMASK_WR_CAS_RANK6_BANK3           0x08
+UMASK_WR_CAS_RANK6_BANK4           0x10
+UMASK_WR_CAS_RANK6_BANK5           0x20
+UMASK_WR_CAS_RANK6_BANK6           0x40
+UMASK_WR_CAS_RANK6_BANK7           0x80
+
+EVENT_WR_CAS_RANK7           0xBF  MBOX
+UMASK_WR_CAS_RANK7_BANK0           0x01
+UMASK_WR_CAS_RANK7_BANK1           0x02
+UMASK_WR_CAS_RANK7_BANK2           0x04
+UMASK_WR_CAS_RANK7_BANK3           0x08
+UMASK_WR_CAS_RANK7_BANK4           0x10
+UMASK_WR_CAS_RANK7_BANK5           0x20
+UMASK_WR_CAS_RANK7_BANK6           0x40
+UMASK_WR_CAS_RANK7_BANK7           0x80
+
+
+EVENT_QPI_RATE                     0x00    SBOX0FIX|SBOX1FIX|SBOX2FIX
+UMASK_QPI_RATE                     0x00
+
+EVENT_SBOX_CLOCKTICKS               0x14 SBOX0|SBOX1|SBOX2
+UMASK_SBOX_CLOCKTICKS               0x00
+
+EVENT_CTO_COUNT                     0x38 SBOX0|SBOX1|SBOX2
+OPTIONS_CTO_COUNT                   EVENT_OPTION_MASK0_MASK|EVENT_OPTION_MASK1_MASK|EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_CTO_COUNT                     0x00 0x01
+
+EVENT_DIRECT2CORE                               0x13 SBOX0|SBOX1|SBOX2
+OPTIONS_DIRECT2CORE_SUCCESS_RBT_HIT             EVENT_OPTION_MASK0_MASK|EVENT_OPTION_MASK1_MASK|EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_DIRECT2CORE_SUCCESS_RBT_HIT               0x01
+OPTIONS_DIRECT2CORE_FAILURE_CREDITS             EVENT_OPTION_MASK0_MASK|EVENT_OPTION_MASK1_MASK|EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_DIRECT2CORE_FAILURE_CREDITS               0x02
+OPTIONS_DIRECT2CORE_FAILURE_RBT_HIT             EVENT_OPTION_MASK0_MASK|EVENT_OPTION_MASK1_MASK|EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_DIRECT2CORE_FAILURE_RBT_HIT               0x04
+OPTIONS_DIRECT2CORE_FAILURE_CREDITS_RBT         EVENT_OPTION_MASK0_MASK|EVENT_OPTION_MASK1_MASK|EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_DIRECT2CORE_FAILURE_CREDITS_RBT           0x08
+OPTIONS_DIRECT2CORE_FAILURE_MISS                EVENT_OPTION_MASK0_MASK|EVENT_OPTION_MASK1_MASK|EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_DIRECT2CORE_FAILURE_MISS                  0x10
+OPTIONS_DIRECT2CORE_FAILURE_CREDITS_MISS        EVENT_OPTION_MASK0_MASK|EVENT_OPTION_MASK1_MASK|EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_DIRECT2CORE_FAILURE_CREDITS_MISS          0x20
+OPTIONS_DIRECT2CORE_FAILURE_RBT_MISS            EVENT_OPTION_MASK0_MASK|EVENT_OPTION_MASK1_MASK|EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_DIRECT2CORE_FAILURE_RBT_MISS              0x40
+OPTIONS_DIRECT2CORE_FAILURE_CREDITS_RBT_MISS    EVENT_OPTION_MASK0_MASK|EVENT_OPTION_MASK1_MASK|EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_DIRECT2CORE_FAILURE_CREDITS_RBT_MISS      0x80
+
+EVENT_L1_POWER_CYCLES           0x12 SBOX0|SBOX1|SBOX2
+UMASK_L1_POWER_CYCLES           0x00
+
+EVENT_RXL0P_POWER_CYCLES        0x10 SBOX0|SBOX1|SBOX2
+UMASK_RXL0P_POWER_CYCLES        0x00
+
+EVENT_RXL0_POWER_CYCLES         0x0F SBOX0|SBOX1|SBOX2
+UMASK_RXL0_POWER_CYCLES         0x00
+
+EVENT_RXL_BYPASSED              0x09 SBOX0|SBOX1|SBOX2
+UMASK_RXL_BYPASSED              0x00
+
+EVENT_RXL_CREDITS_CONSUMED_VN0      0x1E SBOX0|SBOX1|SBOX2
+UMASK_RXL_CREDITS_CONSUMED_VN0_DRS  0x01 0x01
+UMASK_RXL_CREDITS_CONSUMED_VN0_NCB  0x02 0x01
+UMASK_RXL_CREDITS_CONSUMED_VN0_NCS  0x04 0x01
+UMASK_RXL_CREDITS_CONSUMED_VN0_HOM  0x08 0x01
+UMASK_RXL_CREDITS_CONSUMED_VN0_SNP  0x10 0x01
+UMASK_RXL_CREDITS_CONSUMED_VN0_NDR  0x20 0x01
+
+EVENT_RXL_CREDITS_CONSUMED_VN1      0x39 SBOX0|SBOX1|SBOX2
+UMASK_RXL_CREDITS_CONSUMED_VN1_DRS  0x01 0x01
+UMASK_RXL_CREDITS_CONSUMED_VN1_NCB  0x02 0x01
+UMASK_RXL_CREDITS_CONSUMED_VN1_NCS  0x04 0x01
+UMASK_RXL_CREDITS_CONSUMED_VN1_HOM  0x08 0x01
+UMASK_RXL_CREDITS_CONSUMED_VN1_SNP  0x10 0x01
+UMASK_RXL_CREDITS_CONSUMED_VN1_NDR  0x20 0x01
+
+EVENT_RXL_CREDITS_CONSUMED_VNA  0x1D SBOX0|SBOX1|SBOX2
+UMASK_RXL_CREDITS_CONSUMED_VNA  0x00 0x01
+
+EVENT_RXL_CYCLES_NE             0x0A SBOX0|SBOX1|SBOX2
+UMASK_RXL_CYCLES_NE             0x00
+
+EVENT_RXL_FLITS_G0              0x01 SBOX0|SBOX1|SBOX2
+UMASK_RXL_FLITS_G0_IDLE         0x01
+UMASK_RXL_FLITS_G0_DATA         0x02
+UMASK_RXL_FLITS_G0_NON_DATA     0x04
+
+EVENT_RXL_FLITS_G1              0x02 SBOX0|SBOX1|SBOX2
+UMASK_RXL_FLITS_G1_SNP          0x01 0x01
+UMASK_RXL_FLITS_G1_HOM_REQ      0x02 0x01
+UMASK_RXL_FLITS_G1_HOM_NONREQ   0x04 0x01
+UMASK_RXL_FLITS_G1_HOM          0x06 0x01
+UMASK_RXL_FLITS_G1_DRS_DATA     0x08 0x01
+UMASK_RXL_FLITS_G1_DRS_NONDATA  0x10 0x01
+UMASK_RXL_FLITS_G1_DRS          0x18 0x01
+
+EVENT_RXL_FLITS_G2              0x03 SBOX0|SBOX1|SBOX2
+UMASK_RXL_FLITS_G2_NDR_AD       0x01 0x01
+UMASK_RXL_FLITS_G2_NDR_AK       0x02 0x01
+UMASK_RXL_FLITS_G2_NCB_DATA     0x04 0x01
+UMASK_RXL_FLITS_G2_NCB_NONDATA  0x08 0x01
+UMASK_RXL_FLITS_G2_NCB          0x0C 0x01
+UMASK_RXL_FLITS_G2_NCS          0x10 0x01
+
+EVENT_RXL_INSERTS               0x08 SBOX0|SBOX1|SBOX2
+UMASK_RXL_INSERTS               0x00
+
+EVENT_RXL_INSERTS_DRS           0x09 SBOX0|SBOX1|SBOX2
+UMASK_RXL_INSERTS_DRS_VN0       0x01 0x01
+UMASK_RXL_INSERTS_DRS_VN1       0x02 0x01
+
+EVENT_RXL_INSERTS_HOM           0x0C SBOX0|SBOX1|SBOX2
+UMASK_RXL_INSERTS_HOM_VN0       0x01 0x01
+UMASK_RXL_INSERTS_HOM_VN1       0x02 0x01
+
+EVENT_RXL_INSERTS_NCB           0x0A SBOX0|SBOX1|SBOX2
+UMASK_RXL_INSERTS_NCB_VN0       0x01 0x01
+UMASK_RXL_INSERTS_NCB_VN1       0x02 0x01
+
+EVENT_RXL_INSERTS_NCS           0x0B SBOX0|SBOX1|SBOX2
+UMASK_RXL_INSERTS_NCS_VN0       0x01 0x01
+UMASK_RXL_INSERTS_NCS_VN1       0x02 0x01
+
+EVENT_RXL_INSERTS_NDR           0x0E SBOX0|SBOX1|SBOX2
+UMASK_RXL_INSERTS_NDR_VN0       0x01 0x01
+UMASK_RXL_INSERTS_NDR_VN1       0x02 0x01
+
+EVENT_RXL_INSERTS_SNP           0x0D SBOX0|SBOX1|SBOX2
+UMASK_RXL_INSERTS_SNP_VN0       0x01 0x01
+UMASK_RXL_INSERTS_SNP_VN1       0x02 0x01
+
+EVENT_RXL_OCCUPANCY             0x0B SBOX0|SBOX1|SBOX2
+UMASK_RXL_OCCUPANCY             0x00
+
+EVENT_RXL_OCCUPANCY_DRS         0x15 SBOX0|SBOX1|SBOX2
+UMASK_RXL_OCCUPANCY_DRS_VN0     0x01 0x01
+UMASK_RXL_OCCUPANCY_DRS_VN1     0x02 0x01
+
+EVENT_RXL_OCCUPANCY_HOM         0x18 SBOX0|SBOX1|SBOX2
+UMASK_RXL_OCCUPANCY_HOM_VN0     0x01 0x01
+UMASK_RXL_OCCUPANCY_HOM_VN1     0x02 0x01
+
+EVENT_RXL_OCCUPANCY_NCB         0x16 SBOX0|SBOX1|SBOX2
+UMASK_RXL_OCCUPANCY_NCB_VN0     0x01 0x01
+UMASK_RXL_OCCUPANCY_NCB_VN1     0x02 0x01
+
+EVENT_RXL_OCCUPANCY_NCS         0x17 SBOX0|SBOX1|SBOX2
+UMASK_RXL_OCCUPANCY_NCS_VN0     0x01 0x01
+UMASK_RXL_OCCUPANCY_NCS_VN1     0x02 0x01
+
+EVENT_RXL_OCCUPANCY_NDR         0x1A SBOX0|SBOX1|SBOX2
+UMASK_RXL_OCCUPANCY_NDR_VN0     0x01 0x01
+UMASK_RXL_OCCUPANCY_NDR_VN1     0x02 0x01
+
+EVENT_RXL_OCCUPANCY_SNP         0x19 SBOX0|SBOX1|SBOX2
+UMASK_RXL_OCCUPANCY_SNP_VN0     0x01 0x01
+UMASK_RXL_OCCUPANCY_SNP_VN1     0x02 0x01
+
+EVENT_TXL0P_POWER_CYCLES        0x0D SBOX0|SBOX1|SBOX2
+UMASK_TXL0P_POWER_CYCLES        0x00
+
+EVENT_TXL0_POWER_CYCLES         0x0C SBOX0|SBOX1|SBOX2
+UMASK_TXL0_POWER_CYCLES         0x00
+
+EVENT_TXL_BYPASSED              0x05 SBOX0|SBOX1|SBOX2
+UMASK_TXL_BYPASSED              0x00
+
+EVENT_TXL_CYCLES_NE             0x06 SBOX0|SBOX1|SBOX2
+UMASK_TXL_CYCLES_NE             0x00
+
+EVENT_TXL_FLITS_G0              0x00 SBOX0|SBOX1|SBOX2
+UMASK_TXL_FLITS_G0_DATA         0x02
+UMASK_TXL_FLITS_G0_NON_DATA     0x04
+
+EVENT_TXL_FLITS_G1              0x00 SBOX0|SBOX1|SBOX2
+UMASK_TXL_FLITS_G1_SNP          0x01 0x01
+UMASK_TXL_FLITS_G1_HOM_REQ      0x02 0x01
+UMASK_TXL_FLITS_G1_HOM_NONREQ   0x04 0x01
+UMASK_TXL_FLITS_G1_HOM          0x06 0x01
+UMASK_TXL_FLITS_G1_DRS_DATA     0x08 0x01
+UMASK_TXL_FLITS_G1_DRS_NONDATA  0x10 0x01
+UMASK_TXL_FLITS_G1_DRS          0x18 0x01
+
+EVENT_TXL_FLITS_G2              0x01 SBOX0|SBOX1|SBOX2
+UMASK_TXL_FLITS_G2_NDR_AD       0x01 0x01
+UMASK_TXL_FLITS_G2_NDR_AK       0x02 0x01
+UMASK_TXL_FLITS_G2_NCB_DATA     0x04 0x01
+UMASK_TXL_FLITS_G2_NCB_NONDATA  0x08 0x01
+UMASK_TXL_FLITS_G2_NCB          0x0C 0x01
+UMASK_TXL_FLITS_G2_NCS          0x10 0x01
+
+EVENT_TXL_INSERTS               0x04 SBOX0|SBOX1|SBOX2
+UMASK_TXL_INSERTS               0x00
+
+EVENT_TXL_OCCUPANCY             0x07 SBOX0|SBOX1|SBOX2
+UMASK_TXL_OCCUPANCY             0x00
+
+EVENT_TXL_AD_HOM_CREDIT_ACQUIRED         0x26 SBOX0|SBOX1|SBOX2
+UMASK_TXL_AD_HOM_CREDIT_ACQUIRED_VN0     0x01 0x01
+UMASK_TXL_AD_HOM_CREDIT_ACQUIRED_VN1     0x02 0x01
+
+EVENT_TXL_AD_HOM_CREDIT_OCCUPANCY        0x22 SBOX0|SBOX1|SBOX2
+UMASK_TXL_AD_HOM_CREDIT_OCCUPANCY_VN0    0x01 0x01
+UMASK_TXL_AD_HOM_CREDIT_OCCUPANCY_VN1    0x02 0x01
+
+EVENT_TXL_AD_NDR_CREDIT_ACQUIRED         0x28 SBOX0|SBOX1|SBOX2
+UMASK_TXL_AD_NDR_CREDIT_ACQUIRED_VN0     0x01 0x01
+UMASK_TXL_AD_NDR_CREDIT_ACQUIRED_VN1     0x02 0x01
+
+EVENT_TXL_AD_NDR_CREDIT_OCCUPANCY        0x24 SBOX0|SBOX1|SBOX2
+UMASK_TXL_AD_NDR_CREDIT_OCCUPANCY_VN0    0x01 0x01
+UMASK_TXL_AD_NDR_CREDIT_OCCUPANCY_VN1    0x02 0x01
+
+EVENT_TXL_AD_SNP_CREDIT_ACQUIRED         0x27 SBOX0|SBOX1|SBOX2
+UMASK_TXL_AD_SNP_CREDIT_ACQUIRED_VN0     0x01 0x01
+UMASK_TXL_AD_SNP_CREDIT_ACQUIRED_VN1     0x02 0x01
+
+EVENT_TXL_AD_SNP_CREDIT_OCCUPANCY        0x23 SBOX0|SBOX1|SBOX2
+UMASK_TXL_AD_SNP_CREDIT_OCCUPANCY_VN0    0x01 0x01
+UMASK_TXL_AD_SNP_CREDIT_OCCUPANCY_VN1    0x02 0x01
+
+EVENT_TXL_AK_NDR_CREDIT_ACQUIRED         0x29 SBOX0|SBOX1|SBOX2
+UMASK_TXL_AK_NDR_CREDIT_ACQUIRED         0x00 0x01
+
+EVENT_TXL_AK_NDR_CREDIT_OCCUPANCY        0x25 SBOX0|SBOX1|SBOX2
+UMASK_TXL_AK_NDR_CREDIT_OCCUPANCY        0x00 0x01
+
+EVENT_TXL_BL_DRS_CREDIT_ACQUIRED         0x2A SBOX0|SBOX1|SBOX2
+UMASK_TXL_BL_DRS_CREDIT_ACQUIRED_VN0     0x01 0x01
+UMASK_TXL_BL_DRS_CREDIT_ACQUIRED_VN1     0x02 0x01
+UMASK_TXL_BL_DRS_CREDIT_ACQUIRED_VN_SHR  0x04 0x01
+
+EVENT_TXL_BL_DRS_CREDIT_OCCUPANCY        0x1F SBOX0|SBOX1|SBOX2
+UMASK_TXL_BL_DRS_CREDIT_OCCUPANCY_VN0    0x01 0x01
+UMASK_TXL_BL_DRS_CREDIT_OCCUPANCY_VN1    0x02 0x01
+UMASK_TXL_BL_DRS_CREDIT_OCCUPANCY_VN_SHR 0x04 0x01
+
+EVENT_TXL_BL_NCB_CREDIT_ACQUIRED         0x2B SBOX0|SBOX1|SBOX2
+UMASK_TXL_BL_NCB_CREDIT_ACQUIRED_VN0     0x01 0x01
+UMASK_TXL_BL_NCB_CREDIT_ACQUIRED_VN1     0x02 0x01
+
+EVENT_TXL_BL_NCB_CREDIT_OCCUPANCY        0x20 SBOX0|SBOX1|SBOX2
+UMASK_TXL_BL_NCB_CREDIT_OCCUPANCY_VN0    0x01 0x01
+UMASK_TXL_BL_NCB_CREDIT_OCCUPANCY_VN1    0x02 0x01
+
+EVENT_TXL_BL_NCS_CREDIT_ACQUIRED         0x2C SBOX0|SBOX1|SBOX2
+UMASK_TXL_BL_NCS_CREDIT_ACQUIRED_VN0     0x01 0x01
+UMASK_TXL_BL_NCS_CREDIT_ACQUIRED_VN1     0x02 0x01
+
+EVENT_TXL_BL_NCS_CREDIT_OCCUPANCY        0x21 SBOX0|SBOX1|SBOX2
+UMASK_TXL_BL_NCS_CREDIT_OCCUPANCY_VN0    0x01 0x01
+UMASK_TXL_BL_NCS_CREDIT_OCCUPANCY_VN1    0x02 0x01
+
+EVENT_VNA_CREDIT_RETURNS            0x1C SBOX0|SBOX1|SBOX2
+UMASK_VNA_CREDIT_RETURNS            0x00 0x01
+
+EVENT_VNA_CREDIT_RETURN_OCCUPANCY   0x1B SBOX0|SBOX1|SBOX2
+UMASK_VNA_CREDIT_RETURN_OCCUPANCY   0x00 0x01
+
+EVENT_UBOX_CLOCKTICKS           0x00 UBOXFIX
+UMASK_UBOX_CLOCKTICKS           0x00
+
+EVENT_EVENT_MSG                 0x42 UBOX
+UMASK_EVENT_MSG_VLW_RCVD        0x01
+UMASK_EVENT_MSG_MSI_RCVD        0x02
+UMASK_EVENT_MSG_IPI_RCVD        0x04
+UMASK_EVENT_MSG_DOORBELL_RCVD   0x08
+UMASK_EVENT_MSG_INT_PRIO        0x10
+
+EVENT_LOCK_CYCLES               0x44 UBOX
+UMASK_LOCK_CYCLES               0x00
+
+EVENT_PHOLD_CYCLES               0x45 UBOX
+UMASK_PHOLD_CYCLES_ASSERT_TO_ACK 0x01
+
+EVENT_RACU_REQUESTS              0x46 UBOX
+UMASK_RACU_REQUESTS              0x00
+
+EVENT_BBOX_CLOCKTICKS           0x00 BBOX0|BBOX1
+UMASK_BBOX_CLOCKTICKS           0x00
+
+EVENT_ADDR_OPC_MATCH            0x20 BBOX0|BBOX1
+OPTIONS_ADDR_OPC_MATCH_ADDR     EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_ADDR_OPC_MATCH_ADDR       0x01
+OPTIONS_ADDR_OPC_MATCH_OPC      EVENT_OPTION_OPCODE_MASK
+UMASK_ADDR_OPC_MATCH_OPC        0x02
+OPTIONS_ADDR_OPC_MATCH_FILT     EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK|EVENT_OPTION_OPCODE_MASK
+UMASK_ADDR_OPC_MATCH_FILT       0x03
+OPTIONS_ADDR_OPC_MATCH_AD       EVENT_OPTION_OPCODE_MASK
+UMASK_ADDR_OPC_MATCH_AD         0x04
+OPTIONS_ADDR_OPC_MATCH_BL       EVENT_OPTION_OPCODE_MASK
+UMASK_ADDR_OPC_MATCH_BL         0x08
+OPTIONS_ADDR_OPC_MATCH_AK       EVENT_OPTION_OPCODE_MASK
+UMASK_ADDR_OPC_MATCH_AK         0x10
+
+EVENT_BT_BYPASS                 0x52 BBOX0|BBOX1
+UMASK_BT_BYPASS                 0x00
+
+EVENT_BT_CYCLES_NE              0x42 BBOX0|BBOX1
+UMASK_BT_CYCLES_NE              0x00
+
+EVENT_BT_OCCUPANCY              0x43 BBOX0|BBOX1
+UMASK_BT_OCCUPANCY_LOCAL        0x01
+UMASK_BT_OCCUPANCY_REMOTE       0x02
+UMASK_BT_OCCUPANCY_READS_LOCAL  0x04
+UMASK_BT_OCCUPANCY_READS_REMOTE 0x08
+UMASK_BT_OCCUPANCY_WRITES_LOCAL  0x10
+UMASK_BT_OCCUPANCY_WRITES_REMOTE 0x20
+
+EVENT_BYPASS_IMC                0x14 BBOX0|BBOX1
+UMASK_BYPASS_IMC_TAKEN          0x01
+UMASK_BYPASS_IMC_NOT_TAKEN      0x02
+
+EVENT_CONFLICT_CYCLES           0x0B BBOX0|BBOX1
+UMASK_CONFLICT_CYCLES_CONFLICT  0x02
+UMASK_CONFLICT_CYCLES_LAST      0x04
+UMASK_CONFLICT_CYCLES_ACKCNFLTS 0x08
+UMASK_CONFLICT_CYCLES_CMP_FWDS  0x10
+
+EVENT_DIRECT2CORE_COUNT         0x11 BBOX0|BBOX1
+UMASK_DIRECT2CORE_COUNT         0x00
+
+EVENT_DIRECT2CORE_CYCLES_DISABLED 0x12 BBOX0|BBOX1
+UMASK_DIRECT2CORE_CYCLES_DISABLED 0x00
+
+EVENT_DIRECT2CORE_TXN_OVERRIDE  0x13 BBOX0|BBOX1
+UMASK_DIRECT2CORE_TXN_OVERRIDE  0x00
+
+EVENT_DIRECTORY_LAT_OPT         0x41 BBOX0|BBOX1
+UMASK_DIRECTORY_LAT_OPT         0x00
+
+EVENT_DIRECTORY_LOOKUP          0x0C BBOX0|BBOX1
+UMASK_DIRECTORY_LOOKUP_SNP      0x01
+UMASK_DIRECTORY_LOOKUP_NO_SNP   0x02
+
+EVENT_DIRECTORY_UPDATE          0x0D BBOX0|BBOX1
+UMASK_DIRECTORY_UPDATE_SET      0x01
+UMASK_DIRECTORY_UPDATE_CLEAR    0x02
+UMASK_DIRECTORY_UPDATE_ANY      0x03
+
+EVENT_IGR_CREDITS_AD_QPI2       0x59 BBOX0|BBOX1
+UMASK_IGR_CREDITS_AD_QPI2       0x00
+
+EVENT_IGR_CREDITS_BL_QPI2       0x5A BBOX0|BBOX1
+UMASK_IGR_CREDITS_BL_QPI2       0x00
+
+EVENT_IGR_NO_CREDIT_CYCLES         0x22 BBOX0|BBOX1
+UMASK_IGR_NO_CREDIT_CYCLES_AD_QPI0 0x01
+UMASK_IGR_NO_CREDIT_CYCLES_AD_QPI1 0x02
+UMASK_IGR_NO_CREDIT_CYCLES_BL_QPI0 0x04
+UMASK_IGR_NO_CREDIT_CYCLES_BL_QPI1 0x08
+
+EVENT_IMC_READS                 0x17 BBOX0|BBOX1
+UMASK_IMC_READS_NORMAL          0x01
+
+EVENT_IMC_RETRY                 0x1E BBOX0|BBOX1
+UMASK_IMC_RETRY                 0x00
+
+EVENT_IMC_WRITES                 0x1A BBOX0|BBOX1
+UMASK_IMC_WRITES_FULL            0x01
+UMASK_IMC_WRITES_PARTIAL         0x02
+UMASK_IMC_WRITES_FULL_ISOCH      0x04
+UMASK_IMC_WRITES_PARTIAL_ISOCH   0x08
+UMASK_IMC_WRITES_ALL             0x0F
+
+EVENT_IODC_CONFLICTS            0x57 BBOX0|BBOX1
+UMASK_IODC_CONFLICTS_ANY        0x01
+UMASK_IODC_CONFLICTS_LAST       0x04
+
+EVENT_IODC_INSERTS              0x56 BBOX0|BBOX1
+UMASK_IODC_INSERTS              0x00
+
+EVENT_IODC_OLEN_WBMTOI          0x58 BBOX0|BBOX1
+UMASK_IODC_OLEN_WBMTOI          0x00
+
+EVENT_OSB                       0x53 BBOX0|BBOX1
+UMASK_OSB_READS_LOCAL           0x02
+UMASK_OSB_INVITOE_LOCAL         0x04
+UMASK_OSB_REMOTE                0x08
+
+EVENT_OSB_EDR                   0x54 BBOX0|BBOX1
+UMASK_OSB_EDR_ALL               0x01
+UMASK_OSB_EDR_READS_LOCAL_I     0x02
+UMASK_OSB_EDR_READS_REMOTE_I    0x04
+UMASK_OSB_EDR_READS_LOCAL_S     0x08
+UMASK_OSB_EDR_READS_REMOTE_S    0x10
+
+EVENT_REQUESTS                  0x01 BBOX0|BBOX1
+UMASK_REQUESTS_READS_LOCAL      0x01
+UMASK_REQUESTS_READS_REMOTE     0x02
+UMASK_REQUESTS_READS            0x03
+UMASK_REQUESTS_WRITES_LOCAL     0x04
+UMASK_REQUESTS_WRITES_REMOTE    0x08
+UMASK_REQUESTS_WRITES           0x0C
+UMASK_REQUESTS_INVITOE_LOCAL    0x10
+UMASK_REQUESTS_INVITOE_REMOTE   0x20
+UMASK_REQUESTS_INVITOE          0x30
+
+EVENT_RING_AD_USED              0x3E BBOX0|BBOX1
+UMASK_RING_AD_USED_CW_VR0_EVEN  0x01
+UMASK_RING_AD_USED_CW_VR0_ODD   0x02
+UMASK_RING_AD_USED_CCW_VR0_EVEN 0x04
+UMASK_RING_AD_USED_CCW_VR0_ODD  0x08
+UMASK_RING_AD_USED_CW_VR1_EVEN  0x10
+UMASK_RING_AD_USED_CW_VR1_ODD   0x20
+UMASK_RING_AD_USED_CCW_VR1_EVEN 0x40
+UMASK_RING_AD_USED_CCW_VR1_ODD  0x80
+UMASK_RING_AD_USED_CW           0x33
+UMASK_RING_AD_USED_CCW          0xCC
+
+EVENT_RING_AK_USED              0x3F BBOX0|BBOX1
+UMASK_RING_AK_USED_CW_VR0_EVEN  0x01
+UMASK_RING_AK_USED_CW_VR0_ODD   0x02
+UMASK_RING_AK_USED_CCW_VR0_EVEN 0x04
+UMASK_RING_AK_USED_CCW_VR0_ODD  0x08
+UMASK_RING_AK_USED_CW_VR1_EVEN  0x10
+UMASK_RING_AK_USED_CW_VR1_ODD   0x20
+UMASK_RING_AK_USED_CCW_VR1_EVEN 0x40
+UMASK_RING_AK_USED_CCW_VR1_ODD  0x80
+UMASK_RING_AK_USED_CW           0x33
+UMASK_RING_AK_USED_CCW          0xCC
+
+EVENT_RING_BL_USED              0x40 BBOX0|BBOX1
+UMASK_RING_BL_USED_CW_VR0_EVEN  0x01
+UMASK_RING_BL_USED_CW_VR0_ODD   0x02
+UMASK_RING_BL_USED_CCW_VR0_EVEN 0x04
+UMASK_RING_BL_USED_CCW_VR0_ODD  0x08
+UMASK_RING_BL_USED_CW_VR1_EVEN  0x10
+UMASK_RING_BL_USED_CW_VR1_ODD   0x20
+UMASK_RING_BL_USED_CCW_VR1_EVEN 0x40
+UMASK_RING_BL_USED_CCW_VR1_ODD  0x80
+UMASK_RING_BL_USED_CW           0x33
+UMASK_RING_BL_USED_CCW          0xCC
+
+EVENT_RPQ_CYCLES_NO_REG_CREDITS         0x15 BBOX0|BBOX1
+UMASK_RPQ_CYCLES_NO_REG_CREDITS_CHN0    0x01
+UMASK_RPQ_CYCLES_NO_REG_CREDITS_CHN1    0x02
+UMASK_RPQ_CYCLES_NO_REG_CREDITS_CHN2    0x04
+UMASK_RPQ_CYCLES_NO_REG_CREDITS_CHN3    0x08
+
+EVENT_SNOOP_RESP                0x21 BBOX0|BBOX1
+UMASK_SNOOP_RESP_RSPI           0x01
+UMASK_SNOOP_RESP_RSPS           0x02
+UMASK_SNOOP_RESP_RSPIFWD        0x04
+UMASK_SNOOP_RESP_RSPSFWD        0x08
+UMASK_SNOOP_RESP_RSP_WB         0x10
+UMASK_SNOOP_RESP_RSP_FWD_WB     0x20
+UMASK_SNOOP_RESP_RSPCNFLCT      0x40
+
+EVENT_SNP_RESP_RECV_LOCAL           0x60 BBOX0|BBOX1
+UMASK_SNP_RESP_RECV_LOCAL_RSPI      0x01
+UMASK_SNP_RESP_RECV_LOCAL_RSPS      0x02
+UMASK_SNP_RESP_RECV_LOCAL_RSPIFWD   0x04
+UMASK_SNP_RESP_RECV_LOCAL_RSPSFWD   0x08
+UMASK_SNP_RESP_RECV_LOCAL_RSPXWB    0x10
+UMASK_SNP_RESP_RECV_LOCAL_RSPXFWDXWB 0x20
+UMASK_SNP_RESP_RECV_LOCAL_RSPCNFLCT 0x40
+UMASK_SNP_RESP_RECV_LOCAL_OTHER     0x80
+
+EVENT_TAD_REQUESTS_G0               0x1B BBOX0|BBOX1
+UMASK_TAD_REQUESTS_G0_REGION0       0x01
+UMASK_TAD_REQUESTS_G0_REGION1       0x02
+UMASK_TAD_REQUESTS_G0_REGION2       0x04
+UMASK_TAD_REQUESTS_G0_REGION3       0x08
+UMASK_TAD_REQUESTS_G0_REGION4       0x10
+UMASK_TAD_REQUESTS_G0_REGION5       0x20
+UMASK_TAD_REQUESTS_G0_REGION6       0x40
+UMASK_TAD_REQUESTS_G0_REGION7       0x80
+
+EVENT_TAD_REQUESTS_G1               0x1C BBOX0|BBOX1
+UMASK_TAD_REQUESTS_G1_REGION8       0x01
+UMASK_TAD_REQUESTS_G1_REGION9       0x02
+UMASK_TAD_REQUESTS_G1_REGION10      0x04
+UMASK_TAD_REQUESTS_G1_REGION11      0x08
+
+EVENT_TXR_AD_CYCLES_FULL            0x2A BBOX0|BBOX1
+UMASK_TXR_AD_CYCLES_FULL_SCHED0     0x01
+UMASK_TXR_AD_CYCLES_FULL_SCHED1     0x02
+UMASK_TXR_AD_CYCLES_FULL_ALL        0x03
+
+EVENT_TXR_AK                        0x0E BBOX0|BBOX1
+UMASK_TXR_AK                        0x00
+
+EVENT_TXR_AK_CYCLES_FULL            0x32 BBOX0|BBOX1
+UMASK_TXR_AK_CYCLES_FULL_SCHED0     0x01
+UMASK_TXR_AK_CYCLES_FULL_SCHED1     0x02
+UMASK_TXR_AK_CYCLES_FULL_ALL        0x03
+
+EVENT_TXR_BL                        0x10 BBOX0|BBOX1
+UMASK_TXR_BL_DRS_CACHE              0x01
+UMASK_TXR_BL_DRS_CORE               0x02
+UMASK_TXR_BL_DRS_QPI                0x04
+
+EVENT_TXR_BL_CYCLES_FULL            0x36 BBOX0|BBOX1
+UMASK_TXR_BL_CYCLES_FULL_SCHED0     0x01
+UMASK_TXR_BL_CYCLES_FULL_SCHED1     0x02
+UMASK_TXR_BL_CYCLES_FULL_ALL        0x03
+
+EVENT_TXR_BL_OCCUPANCY              0x34 BBOX0|BBOX1
+UMASK_TXR_BL_OCCUPANCY_SCHED0       0x01
+UMASK_TXR_BL_OCCUPANCY_SCHED1       0x02
+
+EVENT_WPQ_CYCLES_NO_REG_CREDITS      0x18 BBOX0|BBOX1
+UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN0 0x01
+UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN1 0x02
+UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN2 0x04
+UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN3 0x08
+
+EVENT_CORES_IN_C3               0x00 WBOX0FIX
+UMASK_CORES_IN_C3               0x00
+
+EVENT_CORES_IN_C6               0x00 WBOX1FIX
+UMASK_CORES_IN_C6               0x00
+
+EVENT_WBOX_CLOCKTICKS           0x00 WBOX
+UMASK_WBOX_CLOCKTICKS           0x00
+
+EVENT_CORE0_TRANSITION_CYCLES   0x70 WBOX
+UMASK_CORE0_TRANSITION_CYCLES   0x00
+
+EVENT_CORE1_TRANSITION_CYCLES   0x71 WBOX
+UMASK_CORE1_TRANSITION_CYCLES   0x00
+
+EVENT_CORE2_TRANSITION_CYCLES   0x72 WBOX
+UMASK_CORE2_TRANSITION_CYCLES   0x00
+
+EVENT_CORE3_TRANSITION_CYCLES   0x73 WBOX
+UMASK_CORE3_TRANSITION_CYCLES   0x00
+
+EVENT_CORE4_TRANSITION_CYCLES   0x74 WBOX
+UMASK_CORE4_TRANSITION_CYCLES   0x00
+
+EVENT_CORE5_TRANSITION_CYCLES   0x75 WBOX
+UMASK_CORE5_TRANSITION_CYCLES   0x00
+
+EVENT_CORE6_TRANSITION_CYCLES   0x76 WBOX
+UMASK_CORE6_TRANSITION_CYCLES   0x00
+
+EVENT_CORE7_TRANSITION_CYCLES   0x77 WBOX
+UMASK_CORE7_TRANSITION_CYCLES   0x00
+
+EVENT_CORE8_TRANSITION_CYCLES   0x78 WBOX
+UMASK_CORE8_TRANSITION_CYCLES   0x00
+
+EVENT_CORE9_TRANSITION_CYCLES   0x79 WBOX
+UMASK_CORE9_TRANSITION_CYCLES   0x00
+
+EVENT_CORE10_TRANSITION_CYCLES   0x7A WBOX
+UMASK_CORE10_TRANSITION_CYCLES   0x00
+
+EVENT_CORE11_TRANSITION_CYCLES   0x7B WBOX
+UMASK_CORE11_TRANSITION_CYCLES   0x00
+
+EVENT_CORE12_TRANSITION_CYCLES   0x7C WBOX
+UMASK_CORE12_TRANSITION_CYCLES   0x00
+
+EVENT_CORE13_TRANSITION_CYCLES   0x7D WBOX
+UMASK_CORE13_TRANSITION_CYCLES   0x00
+
+EVENT_CORE14_TRANSITION_CYCLES   0x7E WBOX
+UMASK_CORE14_TRANSITION_CYCLES   0x00
+
+EVENT_DELAYED_C_STATE_ABORT_CORE0 0x17 WBOX
+UMASK_DELAYED_C_STATE_ABORT_CORE0 0x00 0x01
+
+EVENT_DELAYED_C_STATE_ABORT_CORE1 0x18 WBOX
+UMASK_DELAYED_C_STATE_ABORT_CORE1 0x00 0x01
+
+EVENT_DELAYED_C_STATE_ABORT_CORE2 0x19 WBOX
+UMASK_DELAYED_C_STATE_ABORT_CORE2 0x00 0x01
+
+EVENT_DELAYED_C_STATE_ABORT_CORE3 0x1A WBOX
+UMASK_DELAYED_C_STATE_ABORT_CORE3 0x00 0x01
+
+EVENT_DELAYED_C_STATE_ABORT_CORE4 0x1B WBOX
+UMASK_DELAYED_C_STATE_ABORT_CORE4 0x00 0x01
+
+EVENT_DELAYED_C_STATE_ABORT_CORE5 0x1C WBOX
+UMASK_DELAYED_C_STATE_ABORT_CORE5 0x00 0x01
+
+EVENT_DELAYED_C_STATE_ABORT_CORE6 0x1D WBOX
+UMASK_DELAYED_C_STATE_ABORT_CORE6 0x00 0x01
+
+EVENT_DELAYED_C_STATE_ABORT_CORE7 0x1E WBOX
+UMASK_DELAYED_C_STATE_ABORT_CORE7 0x00 0x01
+
+EVENT_DELAYED_C_STATE_ABORT_CORE8 0x1F WBOX
+UMASK_DELAYED_C_STATE_ABORT_CORE8 0x00 0x01
+
+EVENT_DELAYED_C_STATE_ABORT_CORE9 0x20 WBOX
+UMASK_DELAYED_C_STATE_ABORT_CORE9 0x00 0x01
+
+EVENT_DELAYED_C_STATE_ABORT_CORE10 0x21 WBOX
+UMASK_DELAYED_C_STATE_ABORT_CORE10 0x00 0x01
+
+EVENT_DELAYED_C_STATE_ABORT_CORE11 0x22 WBOX
+UMASK_DELAYED_C_STATE_ABORT_CORE11 0x00 0x01
+
+EVENT_DELAYED_C_STATE_ABORT_CORE12 0x23 WBOX
+UMASK_DELAYED_C_STATE_ABORT_CORE12 0x00 0x01
+
+EVENT_DELAYED_C_STATE_ABORT_CORE13 0x24 WBOX
+UMASK_DELAYED_C_STATE_ABORT_CORE13 0x00 0x01
+
+EVENT_DELAYED_C_STATE_ABORT_CORE14 0x25 WBOX
+UMASK_DELAYED_C_STATE_ABORT_CORE14 0x00 0x01
+
+EVENT_DEMOTIONS_CORE0           0x1E WBOX
+OPTIONS_DEMOTIONS_CORE0         EVENT_OPTION_MATCH0_MASK
+UMASK_DEMOTIONS_CORE0           0x00
+
+EVENT_DEMOTIONS_CORE1           0x1F WBOX
+OPTIONS_DEMOTIONS_CORE1         EVENT_OPTION_MATCH0_MASK
+UMASK_DEMOTIONS_CORE1           0x00
+
+EVENT_DEMOTIONS_CORE2           0x20 WBOX
+OPTIONS_DEMOTIONS_CORE2         EVENT_OPTION_MATCH0_MASK
+UMASK_DEMOTIONS_CORE2           0x00
+
+EVENT_DEMOTIONS_CORE3           0x21 WBOX
+OPTIONS_DEMOTIONS_CORE3         EVENT_OPTION_MATCH0_MASK
+UMASK_DEMOTIONS_CORE3           0x00
+
+EVENT_DEMOTIONS_CORE4           0x22 WBOX
+OPTIONS_DEMOTIONS_CORE4         EVENT_OPTION_MATCH0_MASK
+UMASK_DEMOTIONS_CORE4           0x00
+
+EVENT_DEMOTIONS_CORE5           0x23 WBOX
+OPTIONS_DEMOTIONS_CORE5         EVENT_OPTION_MATCH0_MASK
+UMASK_DEMOTIONS_CORE5           0x00
+
+EVENT_DEMOTIONS_CORE6           0x24 WBOX
+OPTIONS_DEMOTIONS_CORE6         EVENT_OPTION_MATCH0_MASK
+UMASK_DEMOTIONS_CORE6           0x00
+
+EVENT_DEMOTIONS_CORE7           0x25 WBOX
+OPTIONS_DEMOTIONS_CORE7         EVENT_OPTION_MATCH0_MASK
+UMASK_DEMOTIONS_CORE7           0x00
+
+EVENT_DEMOTIONS_CORE8           0x40 WBOX
+OPTIONS_DEMOTIONS_CORE8         EVENT_OPTION_MATCH0_MASK
+UMASK_DEMOTIONS_CORE8           0x00
+
+EVENT_DEMOTIONS_CORE9           0x41 WBOX
+OPTIONS_DEMOTIONS_CORE9         EVENT_OPTION_MATCH0_MASK
+UMASK_DEMOTIONS_CORE9           0x00
+
+EVENT_DEMOTIONS_CORE10           0x42 WBOX
+OPTIONS_DEMOTIONS_CORE10         EVENT_OPTION_MATCH0_MASK
+UMASK_DEMOTIONS_CORE10           0x00
+
+EVENT_DEMOTIONS_CORE11           0x43 WBOX
+OPTIONS_DEMOTIONS_CORE11         EVENT_OPTION_MATCH0_MASK
+UMASK_DEMOTIONS_CORE11           0x00
+
+EVENT_DEMOTIONS_CORE12           0x44 WBOX
+OPTIONS_DEMOTIONS_CORE12         EVENT_OPTION_MATCH0_MASK
+UMASK_DEMOTIONS_CORE12           0x00
+
+EVENT_DEMOTIONS_CORE13           0x45 WBOX
+OPTIONS_DEMOTIONS_CORE13         EVENT_OPTION_MATCH0_MASK
+UMASK_DEMOTIONS_CORE13           0x00
+
+EVENT_DEMOTIONS_CORE14           0x46 WBOX
+OPTIONS_DEMOTIONS_CORE14         EVENT_OPTION_MATCH0_MASK
+UMASK_DEMOTIONS_CORE14           0x00
+
+EVENT_FREQ_BAND0_CYCLES          0x0B WBOX
+OPTIONS_FREQ_BAND0_CYCLES        EVENT_OPTION_MATCH0_MASK
+UMASK_FREQ_BAND0_CYCLES          0x00
+
+EVENT_FREQ_BAND1_CYCLES          0x0C WBOX
+OPTIONS_FREQ_BAND1_CYCLES        EVENT_OPTION_MATCH0_MASK
+UMASK_FREQ_BAND1_CYCLES          0x00
+
+EVENT_FREQ_BAND2_CYCLES          0x0D WBOX
+OPTIONS_FREQ_BAND2_CYCLES        EVENT_OPTION_MATCH0_MASK
+UMASK_FREQ_BAND2_CYCLES          0x00
+
+EVENT_FREQ_BAND3_CYCLES          0x0E WBOX
+OPTIONS_FREQ_BAND3_CYCLES        EVENT_OPTION_MATCH0_MASK
+UMASK_FREQ_BAND3_CYCLES          0x00
+
+EVENT_FREQ_MAX_CURRENT_CYCLES    0x07 WBOX
+UMASK_FREQ_MAX_CURRENT_CYCLES    0x00
+
+EVENT_FREQ_MAX_LIMIT_THERMAL_CYCLES 0x04 WBOX
+UMASK_FREQ_MAX_LIMIT_THERMAL_CYCLES 0x00
+
+EVENT_FREQ_MAX_OS_CYCLES         0x06 WBOX
+UMASK_FREQ_MAX_OS_CYCLES         0x00
+
+EVENT_FREQ_MAX_POWER_CYCLES      0x05 WBOX
+UMASK_FREQ_MAX_POWER_CYCLES      0x00
+
+EVENT_FREQ_MIN_IO_P_CYCLES       0x61 WBOX
+UMASK_FREQ_MIN_IO_P_CYCLES       0x00
+
+EVENT_FREQ_MIN_PERF_P_CYCLES     0x02 WBOX
+UMASK_FREQ_MIN_PERF_P_CYCLES     0x00
+
+EVENT_FREQ_TRANS_CYCLES          0x60 WBOX
+UMASK_FREQ_TRANS_CYCLES          0x00
+
+EVENT_MEMORY_PHASE_SHEDDING_CYCLES 0x2F WBOX
+UMASK_MEMORY_PHASE_SHEDDING_CYCLES 0x00
+
+EVENT_PKG_C_EXIT_LATENCY         0x26 WBOX
+UMASK_PKG_C_EXIT_LATENCY         0x00 0x01
+
+EVENT_POWER_STATE_OCCUPANCY          0x80 WBOX
+UMASK_POWER_STATE_OCCUPANCY_CORES_C0 0x40
+UMASK_POWER_STATE_OCCUPANCY_CORES_C3 0x80
+UMASK_POWER_STATE_OCCUPANCY_CORES_C6 0xC0
+
+EVENT_PROCHOT_EXTERNAL_CYCLES    0x0A WBOX
+UMASK_PROCHOT_EXTERNAL_CYCLES    0x00
+
+EVENT_PROCHOT_INTERNAL_CYCLES    0x09 WBOX
+UMASK_PROCHOT_INTERNAL_CYCLES    0x00
+
+EVENT_TOTAL_TRANSITION_CYCLES    0x63 WBOX
+UMASK_TOTAL_TRANSITION_CYCLES    0x00
+
+EVENT_VOLT_TRANS_CYCLES_CHANGE   0x03 WBOX
+UMASK_VOLT_TRANS_CYCLES_CHANGE   0x00
+
+EVENT_VOLT_TRANS_CYCLES_DECREASE 0x02 WBOX
+UMASK_VOLT_TRANS_CYCLES_DECREASE 0x00
+
+EVENT_VOLT_TRANS_CYCLES_INCREASE 0x01 WBOX
+UMASK_VOLT_TRANS_CYCLES_INCREASE 0x00
+
+EVENT_VR_HOT_CYCLES              0x32 WBOX
+UMASK_VR_HOT_CYCLES              0x00
+
+EVENT_PBOX_CLOCKTICKS           0x01 PBOX
+UMASK_PBOX_CLOCKTICKS           0x00
+
+EVENT_RING_AD_USED              0x07 PBOX
+UMASK_RING_AD_USED_CW_VR0_EVEN  0x01
+UMASK_RING_AD_USED_CW_VR0_ODD   0x02
+UMASK_RING_AD_USED_CCW_VR0_EVEN 0x04
+UMASK_RING_AD_USED_CCW_VR0_ODD  0x08
+UMASK_RING_AD_USED_CW_VR1_EVEN  0x10
+UMASK_RING_AD_USED_CW_VR1_ODD   0x20
+UMASK_RING_AD_USED_CCW_VR1_EVEN 0x40
+UMASK_RING_AD_USED_CCW_VR1_ODD  0x80
+UMASK_RING_AD_USED_CW           0x33
+UMASK_RING_AD_USED_CCW          0xCC
+
+EVENT_RING_AK_USED              0x08 PBOX
+UMASK_RING_AK_USED_CW_VR0_EVEN  0x01
+UMASK_RING_AK_USED_CW_VR0_ODD   0x02
+UMASK_RING_AK_USED_CCW_VR0_EVEN 0x04
+UMASK_RING_AK_USED_CCW_VR0_ODD  0x08
+UMASK_RING_AK_USED_CW_VR1_EVEN  0x10
+UMASK_RING_AK_USED_CW_VR1_ODD   0x20
+UMASK_RING_AK_USED_CCW_VR1_EVEN 0x40
+UMASK_RING_AK_USED_CCW_VR1_ODD  0x80
+UMASK_RING_AK_USED_CW           0x33
+UMASK_RING_AK_USED_CCW          0xCC
+
+EVENT_RING_BL_USED              0x09 PBOX
+UMASK_RING_BL_USED_CW_VR0_EVEN  0x01
+UMASK_RING_BL_USED_CW_VR0_ODD   0x02
+UMASK_RING_BL_USED_CCW_VR0_EVEN 0x04
+UMASK_RING_BL_USED_CCW_VR0_ODD  0x08
+UMASK_RING_BL_USED_CW_VR1_EVEN  0x10
+UMASK_RING_BL_USED_CW_VR1_ODD   0x20
+UMASK_RING_BL_USED_CCW_VR1_EVEN 0x40
+UMASK_RING_BL_USED_CCW_VR1_ODD  0x80
+UMASK_RING_BL_USED_CW           0x33
+UMASK_RING_BL_USED_CCW          0xCC
+
+EVENT_RING_IV_USED              0x0A PBOX
+UMASK_RING_IV_USED_CW           0x33
+UMASK_RING_IV_USED_CCW          0xCC
+UMASK_RING_IV_USED_ANY          0xFF
+
+EVENT_RXR_AK_BOUNCES            0x12 PBOX
+UMASK_RXR_AK_BOUNCES_CW         0x01
+UMASK_RXR_AK_BOUNCES_CCW        0x02
+
+EVENT_RXR_CYCLES_NE             0x10 PBOX
+UMASK_RXR_CYCLES_NE_NCB         0x10
+UMASK_RXR_CYCLES_NE_NCS         0x20
+
+EVENT_RXR_INSERTS               0x11 PBOX
+UMASK_RXR_INSERTS_NCB           0x10
+UMASK_RXR_INSERTS_NCS           0x20
+
+EVENT_RXR_OCCUPANCY             0x13 PBOX
+UMASK_RXR_OCCUPANCY_DRS         0x08
+
+EVENT_TXR_CYCLES_FULL           0x25 PBOX
+UMASK_TXR_CYCLES_FULL_AD        0x01
+UMASK_TXR_CYCLES_FULL_AK        0x02
+UMASK_TXR_CYCLES_FULL_BL        0x04
+
+EVENT_TXR_CYCLES_NE             0x23 PBOX
+UMASK_TXR_CYCLES_NE_AD          0x01
+UMASK_TXR_CYCLES_NE_AK          0x02
+UMASK_TXR_CYCLES_NE_BL          0x04
+
+EVENT_TXR_NACK_CW               0x26 PBOX
+UMASK_TXR_NACK_CW_AD            0x01
+UMASK_TXR_NACK_CW_AK            0x02
+UMASK_TXR_NACK_CW_BL            0x04
+
+EVENT_TXR_NACK_CCW              0x28 PBOX
+UMASK_TXR_NACK_CCW_AD           0x01
+UMASK_TXR_NACK_CCW_AK           0x02
+UMASK_TXR_NACK_CCW_BL           0x04
+
+EVENT_RBOX_CLOCKTICKS           0x01 RBOX
+UMASK_RBOX_CLOCKTICKS           0x00
+
+EVENT_C_LO_AD_CREDITS_EMPTY       0x2B RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_C_LO_AD_CREDITS_EMPTY_CBO0  0x01
+UMASK_C_LO_AD_CREDITS_EMPTY_CBO1  0x02
+UMASK_C_LO_AD_CREDITS_EMPTY_CBO2  0x04
+UMASK_C_LO_AD_CREDITS_EMPTY_CBO3  0x08
+UMASK_C_LO_AD_CREDITS_EMPTY_CBO4  0x10
+UMASK_C_LO_AD_CREDITS_EMPTY_CBO5  0x20
+UMASK_C_LO_AD_CREDITS_EMPTY_CBO6  0x40
+UMASK_C_LO_AD_CREDITS_EMPTY_CBO7  0x80
+
+EVENT_C_HI_AD_CREDITS_EMPTY       0x2C RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_C_HI_AD_CREDITS_EMPTY_CBO8  0x01
+UMASK_C_HI_AD_CREDITS_EMPTY_CBO9  0x02
+UMASK_C_HI_AD_CREDITS_EMPTY_CBO10 0x04
+UMASK_C_HI_AD_CREDITS_EMPTY_CBO11 0x08
+UMASK_C_HI_AD_CREDITS_EMPTY_CBO12 0x10
+UMASK_C_HI_AD_CREDITS_EMPTY_CBO13 0x20
+UMASK_C_HI_AD_CREDITS_EMPTY_CBO14 0x40
+
+EVENT_HA_R2_BL_CREDITS_EMPTY        0x2F RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_HA_R2_BL_CREDITS_EMPTY_HA0    0x01
+UMASK_HA_R2_BL_CREDITS_EMPTY_HA1    0x02
+UMASK_HA_R2_BL_CREDITS_EMPTY_R2_NCB 0x04
+UMASK_HA_R2_BL_CREDITS_EMPTY_R2_NCS 0x08
+
+EVENT_QPI0_AD_CREDITS_EMPTY         0x29 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_QPI0_AD_CREDITS_EMPTY_VNA     0x01
+UMASK_QPI0_AD_CREDITS_EMPTY_VN0_HOM 0x02
+UMASK_QPI0_AD_CREDITS_EMPTY_VN0_SNP 0x04
+UMASK_QPI0_AD_CREDITS_EMPTY_VN0_NDR 0x08
+UMASK_QPI0_AD_CREDITS_EMPTY_VN1_HOM 0x10
+UMASK_QPI0_AD_CREDITS_EMPTY_VN1_SNP 0x20
+UMASK_QPI0_AD_CREDITS_EMPTY_VN1_NDR 0x40
+
+EVENT_QPI0_BL_CREDITS_EMPTY         0x2D RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_QPI0_BL_CREDITS_EMPTY_VNA     0x01
+UMASK_QPI0_BL_CREDITS_EMPTY_VN0_HOM 0x02
+UMASK_QPI0_BL_CREDITS_EMPTY_VN0_SNP 0x04
+UMASK_QPI0_BL_CREDITS_EMPTY_VN0_NDR 0x08
+UMASK_QPI0_BL_CREDITS_EMPTY_VN1_HOM 0x10
+UMASK_QPI0_BL_CREDITS_EMPTY_VN1_SNP 0x20
+UMASK_QPI0_BL_CREDITS_EMPTY_VN1_NDR 0x40
+
+EVENT_QPI1_AD_CREDITS_EMPTY         0x2A RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_QPI1_AD_CREDITS_EMPTY_VNA     0x01
+UMASK_QPI1_AD_CREDITS_EMPTY_VN0_HOM 0x02
+UMASK_QPI1_AD_CREDITS_EMPTY_VN0_SNP 0x04
+UMASK_QPI1_AD_CREDITS_EMPTY_VN0_NDR 0x08
+UMASK_QPI1_AD_CREDITS_EMPTY_VN1_HOM 0x10
+UMASK_QPI1_AD_CREDITS_EMPTY_VN1_SNP 0x20
+UMASK_QPI1_AD_CREDITS_EMPTY_VN1_NDR 0x40
+
+EVENT_QPI1_BL_CREDITS_EMPTY         0x2E RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_QPI1_BL_CREDITS_EMPTY_VNA     0x01
+UMASK_QPI1_BL_CREDITS_EMPTY_VN0_HOM 0x02
+UMASK_QPI1_BL_CREDITS_EMPTY_VN0_SNP 0x04
+UMASK_QPI1_BL_CREDITS_EMPTY_VN0_NDR 0x08
+UMASK_QPI1_BL_CREDITS_EMPTY_VN1_HOM 0x10
+UMASK_QPI1_BL_CREDITS_EMPTY_VN1_SNP 0x20
+UMASK_QPI1_BL_CREDITS_EMPTY_VN1_NDR 0x40
+
+EVENT_RING_AD_USED              0x07 RBOX
+UMASK_RING_AD_USED_CW_VR0_EVEN  0x01
+UMASK_RING_AD_USED_CW_VR0_ODD   0x02
+UMASK_RING_AD_USED_CCW_VR0_EVEN 0x04
+UMASK_RING_AD_USED_CCW_VR0_ODD  0x08
+UMASK_RING_AD_USED_CW           0x33
+UMASK_RING_AD_USED_CCW          0xCC
+
+EVENT_RING_AK_USED              0x08 RBOX
+UMASK_RING_AK_USED_CW_VR0_EVEN  0x01
+UMASK_RING_AK_USED_CW_VR0_ODD   0x02
+UMASK_RING_AK_USED_CCW_VR0_EVEN 0x04
+UMASK_RING_AK_USED_CCW_VR0_ODD  0x08
+UMASK_RING_AK_USED_CW           0x33
+UMASK_RING_AK_USED_CCW          0xCC
+
+EVENT_RING_BL_USED              0x09 RBOX
+UMASK_RING_BL_USED_CW_VR0_EVEN  0x01
+UMASK_RING_BL_USED_CW_VR0_ODD   0x02
+UMASK_RING_BL_USED_CCW_VR0_EVEN 0x04
+UMASK_RING_BL_USED_CCW_VR0_ODD  0x08
+UMASK_RING_BL_USED_CW           0x33
+UMASK_RING_BL_USED_CCW          0xCC
+
+EVENT_RING_IV_USED              0x0A RBOX
+UMASK_RING_IV_USED_CW           0x33
+UMASK_RING_IV_USED_CCW          0xCC
+UMASK_RING_IV_USED_ANY          0xFF
+
+EVENT_RXR_AD_BYPASSED           0x12 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_RXR_AD_BYPASSED           0x00
+
+EVENT_RXR_CYCLES_NE             0x10 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_RXR_CYCLES_NE_HOM         0x01
+UMASK_RXR_CYCLES_NE_SNP         0x02
+UMASK_RXR_CYCLES_NE_NDR         0x04
+
+EVENT_RXR_INSERTS               0x11 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_RXR_INSERTS_HOM           0x01
+UMASK_RXR_INSERTS_SNP           0x02
+UMASK_RXR_INSERTS_NDR           0x04
+UMASK_RXR_INSERTS_DRS           0x08
+UMASK_RXR_INSERTS_NCB           0x10
+UMASK_RXR_INSERTS_NCS           0x20
+
+EVENT_RXR_OCCUPANCY             0x13 RBOX0C0|RBOX1C0
+UMASK_RXR_OCCUPANCY_HOM         0x01
+UMASK_RXR_OCCUPANCY_SNP         0x02
+UMASK_RXR_OCCUPANCY_NDR         0x04
+UMASK_RXR_OCCUPANCY_DRS         0x08
+UMASK_RXR_OCCUPANCY_NCB         0x10
+UMASK_RXR_OCCUPANCY_NCS         0x20
+
+EVENT_TXR_CYCLES_FULL           0x25 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_TXR_CYCLES_FULL           0x00
+
+EVENT_TXR_CYCLES_NE             0x23 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_TXR_CYCLES_NE             0x00
+
+EVENT_TXR_NACK_CW               0x26 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_TXR_NACK_CW_AD            0x01
+UMASK_TXR_NACK_CW_AK            0x02
+UMASK_TXR_NACK_CW_BL            0x04
+
+EVENT_TXR_NACK_CCW              0x28 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_TXR_NACK_CCW_AD           0x01
+UMASK_TXR_NACK_CCW_AK           0x02
+UMASK_TXR_NACK_CCW_BL           0x04
+
+EVENT_VN0_CREDITS_REJECT        0x37 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VN0_CREDITS_REJECT_HOM    0x01
+UMASK_VN0_CREDITS_REJECT_SNP    0x02
+UMASK_VN0_CREDITS_REJECT_NDR    0x04
+UMASK_VN0_CREDITS_REJECT_DRS    0x08
+UMASK_VN0_CREDITS_REJECT_NCB    0x10
+UMASK_VN0_CREDITS_REJECT_NCS    0x20
+
+EVENT_VN0_CREDITS_USED          0x36 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VN0_CREDITS_USED_HOM      0x01
+UMASK_VN0_CREDITS_USED_SNP      0x02
+UMASK_VN0_CREDITS_USED_NDR      0x04
+UMASK_VN0_CREDITS_USED_DRS      0x08
+UMASK_VN0_CREDITS_USED_NCB      0x10
+UMASK_VN0_CREDITS_USED_NCS      0x20
+
+EVENT_VN1_CREDITS_REJECT        0x39 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VN1_CREDITS_REJECT_HOM    0x01
+UMASK_VN1_CREDITS_REJECT_SNP    0x02
+UMASK_VN1_CREDITS_REJECT_NDR    0x04
+UMASK_VN1_CREDITS_REJECT_DRS    0x08
+UMASK_VN1_CREDITS_REJECT_NCB    0x10
+UMASK_VN1_CREDITS_REJECT_NCS    0x20
+
+EVENT_VN1_CREDITS_USED          0x38 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VN1_CREDITS_USED_HOM      0x01
+UMASK_VN1_CREDITS_USED_SNP      0x02
+UMASK_VN1_CREDITS_USED_NDR      0x04
+UMASK_VN1_CREDITS_USED_DRS      0x08
+UMASK_VN1_CREDITS_USED_NCB      0x10
+UMASK_VN1_CREDITS_USED_NCS      0x20
+
+EVENT_VNA_CREDITS_ACQUIRED      0x33 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VNA_CREDITS_ACQUIRED_AD   0x01
+UMASK_VNA_CREDITS_ACQUIRED_BL   0x04
+
+EVENT_VNA_CREDITS_REJECT        0x34 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VNA_CREDITS_REJECT_HOM    0x01
+UMASK_VNA_CREDITS_REJECT_SNP    0x02
+UMASK_VNA_CREDITS_REJECT_NDR    0x04
+UMASK_VNA_CREDITS_REJECT_DRS    0x08
+UMASK_VNA_CREDITS_REJECT_NCB    0x10
+UMASK_VNA_CREDITS_REJECT_NCS    0x20
+
+EVENT_VNA_CREDIT_CYCLES_OUT     0x31 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VNA_CREDIT_CYCLES_OUT     0x00
+
+EVENT_VNA_CREDIT_CYCLES_USED    0x32 RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VNA_CREDIT_CYCLES_USED    0x00
+
+EVENT_IBOX_CLOCKTICKS           0x00 IBOX
+UMASK_IBOX_CLOCKTICKS           0x00
+
+EVENT_ADDRESS_MATCH             0x17 IBOX
+UMASK_ADDRESS_MATCH_STALL_COUNT 0x01
+UMASK_ADDRESS_MATCH_MERGE_COUNT 0x02
+
+EVENT_CACHE_ACK_PENDING_OCCUPANCY        0x14 IBOX
+UMASK_CACHE_ACK_PENDING_OCCUPANCY_ANY    0x01
+UMASK_CACHE_ACK_PENDING_OCCUPANCY_SOURCE 0x02
+
+EVENT_CACHE_OWN_OCCUPANCY        0x13 IBOX
+UMASK_CACHE_OWN_OCCUPANCY_ANY    0x01
+UMASK_CACHE_OWN_OCCUPANCY_SOURCE 0x02
+
+EVENT_CACHE_READ_OCCUPANCY        0x10 IBOX
+UMASK_CACHE_READ_OCCUPANCY_ANY    0x01
+UMASK_CACHE_READ_OCCUPANCY_SOURCE 0x02
+
+EVENT_CACHE_TOTAL_OCCUPANCY        0x12 IBOX
+UMASK_CACHE_TOTAL_OCCUPANCY_ANY    0x01
+UMASK_CACHE_TOTAL_OCCUPANCY_SOURCE 0x02
+
+EVENT_CACHE_WRITE_OCCUPANCY        0x11 IBOX
+UMASK_CACHE_WRITE_OCCUPANCY_ANY    0x01
+UMASK_CACHE_WRITE_OCCUPANCY_SOURCE 0x02
+
+EVENT_RXR_AK_CYCLES_FULL        0x0B IBOX
+UMASK_RXR_AK_CYCLES_FULL        0x00
+
+EVENT_RXR_AK_INSERTS            0x0A IBOX
+UMASK_RXR_AK_INSERTS            0x00
+
+EVENT_RXR_AK_OCCUPANCY          0x0C IBOX
+UMASK_RXR_AK_OCCUPANCY          0x00
+
+EVENT_RXR_BL_DRS_CYCLES_FULL    0x04 IBOX
+UMASK_RXR_BL_DRS_CYCLES_FULL    0x00
+
+EVENT_RXR_BL_DRS_INSERTS        0x01 IBOX
+UMASK_RXR_BL_DRS_INSERTS        0x00
+
+EVENT_RXR_BL_DRS_OCCUPANCY      0x07 IBOX
+UMASK_RXR_BL_DRS_OCCUPANCY      0x00
+
+EVENT_RXR_BL_NCB_CYCLES_FULL    0x05 IBOX
+UMASK_RXR_BL_NCB_CYCLES_FULL    0x00
+
+EVENT_RXR_BL_NCB_INSERTS        0x02 IBOX
+UMASK_RXR_BL_NCB_INSERTS        0x00
+
+EVENT_RXR_BL_NCB_OCCUPANCY      0x08 IBOX
+UMASK_RXR_BL_NCB_OCCUPANCY      0x00
+
+EVENT_RXR_BL_NCS_CYCLES_FULL    0x06 IBOX
+UMASK_RXR_BL_NCS_CYCLES_FULL    0x00
+
+EVENT_RXR_BL_NCS_INSERTS        0x03 IBOX
+UMASK_RXR_BL_NCS_INSERTS        0x00
+
+EVENT_RXR_BL_NCS_OCCUPANCY      0x09 IBOX
+UMASK_RXR_BL_NCS_OCCUPANCY      0x00
+
+EVENT_TICKLES                   0x16 IBOX
+UMASK_TICKLES_LOST_OWNERSHIP    0x01
+UMASK_TICKLES_TOP_OF_QUEUE      0x02
+
+EVENT_TRANSACTIONS              0x15 IBOX
+UMASK_TRANSACTIONS_READS        0x01
+UMASK_TRANSACTIONS_WRITES       0x02
+UMASK_TRANSACTIONS_RD_PREFETCHES 0x04
+UMASK_TRANSACTIONS_ORDERINGQ    0x08
+
+EVENT_TXR_AD_STALL_CREDIT_CYCLES 0x18 IBOX
+UMASK_TXR_AD_STALL_CREDIT_CYCLES 0x00
+
+EVENT_TXR_BL_STALL_CREDIT_CYCLES 0x19 IBOX
+UMASK_TXR_BL_STALL_CREDIT_CYCLES 0x00
+
+EVENT_TXR_DATA_INSERTS_NCB      0x0E IBOX
+UMASK_TXR_DATA_INSERTS_NCB      0x00
+
+EVENT_TXR_DATA_INSERTS_NCS      0x0F IBOX
+UMASK_TXR_DATA_INSERTS_NCS      0x00
+
+EVENT_TXR_REQUEST_OCCUPANCY     0x0D IBOX
+UMASK_TXR_REQUEST_OCCUPANCY     0x00
+
+EVENT_WRITE_ORDERING_STALL_CYCLES 0x1A IBOX
+UMASK_WRITE_ORDERING_STALL_CYCLES 0x00
diff --git a/src/includes/perfmon_ivybridge_counters.h b/src/includes/perfmon_ivybridge_counters.h
index e63dfb0..8c58161 100644
--- a/src/includes/perfmon_ivybridge_counters.h
+++ b/src/includes/perfmon_ivybridge_counters.h
@@ -3,15 +3,16 @@
  *
  *      Filename:  perfmon_ivybridge_counters.h
  *
- *      Description: Counter header file of perfmon module for Ivy Bridge.
+ *      Description: Counter header file of perfmon module for Intel Ivy Bridge.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -30,46 +31,313 @@
 
 #define NUM_COUNTERS_CORE_IVYBRIDGE 8
 #define NUM_COUNTERS_UNCORE_IVYBRIDGE 12
-#define NUM_COUNTERS_IVYBRIDGE 32
+#define NUM_COUNTERS_IVYBRIDGE 12
+#define NUM_COUNTERS_CORE_IVYBRIDGEEP 8
+#define NUM_COUNTERS_UNCORE_IVYBRIDGEEP 81
+#define NUM_COUNTERS_IVYBRIDGEEP 161
 
-static PerfmonCounterMap ivybridge_counter_map[NUM_COUNTERS_IVYBRIDGE] = {
+#define IVB_VALID_OPTIONS_PMC (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_ANYTHREAD_MASK)
+#define IVB_VALID_OPTIONS_FIXED (EVENT_OPTION_ANYTHREAD_MASK|EVENT_OPTION_COUNT_KERNEL_MASK)
+#define IVB_VALID_OPTIONS_UBOX (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_THRESHOLD_MASK)
+#define IVB_VALID_OPTIONS_CBOX (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_THRESHOLD_MASK|\
+            EVENT_OPTION_TID_MASK|EVENT_OPTION_STATE_MASK|EVENT_OPTION_NID_MASK|\
+            EVENT_OPTION_OPCODE_MASK|EVENT_OPTION_MATCH0_MASK)
+#define IVB_VALID_OPTIONS_WBOX (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_THRESHOLD_MASK|\
+            EVENT_OPTION_OCCUPANCY_MASK|EVENT_OPTION_OCCUPANCY_FILTER_MASK|EVENT_OPTION_OCCUPANCY_EDGE_MASK|\
+            EVENT_OPTION_OCCUPANCY_INVERT_MASK)
+#define IVB_VALID_OPTIONS_MBOX (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_THRESHOLD_MASK)
+#define IVB_VALID_OPTIONS_SBOX (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_THRESHOLD_MASK|\
+            EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK|EVENT_OPTION_MASK0_MASK|\
+            EVENT_OPTION_MASK1_MASK)
+#define IVB_VALID_OPTIONS_BBOX (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_THRESHOLD_MASK|\
+            EVENT_OPTION_OPCODE_MASK|EVENT_OPTION_MATCH0_MASK)
+#define IVB_VALID_OPTIONS_PBOX (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_THRESHOLD_MASK)
+#define IVB_VALID_OPTIONS_RBOX (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_THRESHOLD_MASK)
+#define IVB_VALID_OPTIONS_IBOX (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_THRESHOLD_MASK)
+
+static RegisterMap ivybridge_counter_map[NUM_COUNTERS_IVYBRIDGE] = {
     /* Fixed Counters: instructions retired, cycles unhalted core */
-    {"FIXC0", PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0},
-    {"FIXC1", PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0},
-    {"FIXC2", PMC2, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR2, 0, 0},
+    {"FIXC0", PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0, IVB_VALID_OPTIONS_FIXED},
+    {"FIXC1", PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0, IVB_VALID_OPTIONS_FIXED},
+    {"FIXC2", PMC2, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR2, 0, 0, IVB_VALID_OPTIONS_FIXED},
     /* PMC Counters: 4 48bit wide */
-    {"PMC0", PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0},
-    {"PMC1", PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0},
-    {"PMC2", PMC5, PMC, MSR_PERFEVTSEL2, MSR_PMC2, 0, 0},
-    {"PMC3", PMC6, PMC, MSR_PERFEVTSEL3, MSR_PMC3, 0, 0},
+    {"PMC0", PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0, IVB_VALID_OPTIONS_PMC},
+    {"PMC1", PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0, IVB_VALID_OPTIONS_PMC},
+    {"PMC2", PMC5, PMC, MSR_PERFEVTSEL2, MSR_PMC2, 0, 0, IVB_VALID_OPTIONS_PMC},
+    {"PMC3", PMC6, PMC, MSR_PERFEVTSEL3, MSR_PMC3, 0, 0, IVB_VALID_OPTIONS_PMC},
     /* Temperature Sensor*/
-    {"TMP0", PMC7, THERMAL, 0, 0, 0, 0},
+    {"TMP0", PMC7, THERMAL, 0, IA32_THERM_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
     /* RAPL counters */
-    {"PWR0", PMC8, POWER, 0, MSR_PKG_ENERGY_STATUS, 0, 0},
-    {"PWR1", PMC9, POWER, 0, MSR_PP0_ENERGY_STATUS, 0, 0},
-    {"PWR2", PMC10, POWER, 0, MSR_PP1_ENERGY_STATUS, 0, 0},
-    {"PWR3", PMC11, POWER, 0, MSR_DRAM_ENERGY_STATUS, 0, 0},
+    {"PWR0", PMC8, POWER, 0, MSR_PKG_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"PWR1", PMC9, POWER, 0, MSR_PP0_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"PWR2", PMC10, POWER, 0, MSR_PP1_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"PWR3", PMC11, POWER, 0, MSR_DRAM_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK}
+};
+
+static RegisterMap ivybridgeEP_counter_map[NUM_COUNTERS_IVYBRIDGEEP] = {
+    /* Fixed Counters: instructions retired, cycles unhalted core */
+    {"FIXC0", PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0, IVB_VALID_OPTIONS_FIXED},
+    {"FIXC1", PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0, IVB_VALID_OPTIONS_FIXED},
+    {"FIXC2", PMC2, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR2, 0, 0, IVB_VALID_OPTIONS_FIXED},
+    /* PMC Counters: 4 48bit wide */
+    {"PMC0", PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0, IVB_VALID_OPTIONS_PMC},
+    {"PMC1", PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0, IVB_VALID_OPTIONS_PMC},
+    {"PMC2", PMC5, PMC, MSR_PERFEVTSEL2, MSR_PMC2, 0, 0, IVB_VALID_OPTIONS_PMC},
+    {"PMC3", PMC6, PMC, MSR_PERFEVTSEL3, MSR_PMC3, 0, 0, IVB_VALID_OPTIONS_PMC},
+    /* Temperature Sensor*/
+    {"TMP0", PMC7, THERMAL, 0, IA32_THERM_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+    /* RAPL counters */
+    {"PWR0", PMC8, POWER, 0, MSR_PKG_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"PWR1", PMC9, POWER, 0, MSR_PP0_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"PWR2", PMC10, POWER, 0, MSR_PP1_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"PWR3", PMC11, POWER, 0, MSR_DRAM_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+    /* CBOX counters, 44bits wide*/
+    {"CBOX0C0", PMC12, CBOX0, MSR_UNC_C0_PMON_CTL0, MSR_UNC_C0_PMON_CTR0, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    {"CBOX0C1", PMC13, CBOX0, MSR_UNC_C0_PMON_CTL1, MSR_UNC_C0_PMON_CTR1, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    {"CBOX0C2", PMC14, CBOX0, MSR_UNC_C0_PMON_CTL2, MSR_UNC_C0_PMON_CTR2, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    {"CBOX0C3", PMC15, CBOX0, MSR_UNC_C0_PMON_CTL3, MSR_UNC_C0_PMON_CTR3, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    {"CBOX1C0", PMC16, CBOX1, MSR_UNC_C1_PMON_CTL0, MSR_UNC_C1_PMON_CTR0, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    {"CBOX1C1", PMC17, CBOX1, MSR_UNC_C1_PMON_CTL1, MSR_UNC_C1_PMON_CTR1, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    {"CBOX1C2", PMC18, CBOX1, MSR_UNC_C1_PMON_CTL2, MSR_UNC_C1_PMON_CTR2, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    {"CBOX1C3", PMC19, CBOX1, MSR_UNC_C1_PMON_CTL3, MSR_UNC_C1_PMON_CTR3, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    {"CBOX2C0", PMC20, CBOX2, MSR_UNC_C2_PMON_CTL0, MSR_UNC_C2_PMON_CTR0, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    {"CBOX2C1", PMC21, CBOX2, MSR_UNC_C2_PMON_CTL1, MSR_UNC_C2_PMON_CTR1, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    {"CBOX2C2", PMC22, CBOX2, MSR_UNC_C2_PMON_CTL2, MSR_UNC_C2_PMON_CTR2, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    {"CBOX2C3", PMC23, CBOX2, MSR_UNC_C2_PMON_CTL3, MSR_UNC_C2_PMON_CTR3, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    {"CBOX3C0", PMC24, CBOX3, MSR_UNC_C3_PMON_CTL0, MSR_UNC_C3_PMON_CTR0, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    {"CBOX3C1", PMC25, CBOX3, MSR_UNC_C3_PMON_CTL1, MSR_UNC_C3_PMON_CTR1, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    {"CBOX3C2", PMC26, CBOX3, MSR_UNC_C3_PMON_CTL2, MSR_UNC_C3_PMON_CTR2, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    {"CBOX3C3", PMC27, CBOX3, MSR_UNC_C3_PMON_CTL3, MSR_UNC_C3_PMON_CTR3, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    {"CBOX4C0", PMC28, CBOX4, MSR_UNC_C4_PMON_CTL0, MSR_UNC_C4_PMON_CTR0, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    {"CBOX4C1", PMC29, CBOX4, MSR_UNC_C4_PMON_CTL1, MSR_UNC_C4_PMON_CTR1, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    {"CBOX4C2", PMC30, CBOX4, MSR_UNC_C4_PMON_CTL2, MSR_UNC_C4_PMON_CTR2, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    {"CBOX4C3", PMC31, CBOX4, MSR_UNC_C4_PMON_CTL3, MSR_UNC_C4_PMON_CTR3, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    {"CBOX5C0", PMC32, CBOX5, MSR_UNC_C5_PMON_CTL0, MSR_UNC_C5_PMON_CTR0, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    {"CBOX5C1", PMC33, CBOX5, MSR_UNC_C5_PMON_CTL1, MSR_UNC_C5_PMON_CTR1, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    {"CBOX5C2", PMC34, CBOX5, MSR_UNC_C5_PMON_CTL2, MSR_UNC_C5_PMON_CTR2, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    {"CBOX5C3", PMC35, CBOX5, MSR_UNC_C5_PMON_CTL3, MSR_UNC_C5_PMON_CTR3, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    {"CBOX6C0", PMC36, CBOX6, MSR_UNC_C6_PMON_CTL0, MSR_UNC_C6_PMON_CTR0, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    {"CBOX6C1", PMC37, CBOX6, MSR_UNC_C6_PMON_CTL1, MSR_UNC_C6_PMON_CTR1, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    {"CBOX6C2", PMC38, CBOX6, MSR_UNC_C6_PMON_CTL2, MSR_UNC_C6_PMON_CTR2, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    {"CBOX6C3", PMC39, CBOX6, MSR_UNC_C6_PMON_CTL3, MSR_UNC_C6_PMON_CTR3, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    {"CBOX7C0", PMC40, CBOX7, MSR_UNC_C7_PMON_CTL0, MSR_UNC_C7_PMON_CTR0, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    {"CBOX7C1", PMC41, CBOX7, MSR_UNC_C7_PMON_CTL1, MSR_UNC_C7_PMON_CTR1, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    {"CBOX7C2", PMC42, CBOX7, MSR_UNC_C7_PMON_CTL2, MSR_UNC_C7_PMON_CTR2, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    {"CBOX7C3", PMC43, CBOX7, MSR_UNC_C7_PMON_CTL3, MSR_UNC_C7_PMON_CTR3, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    {"CBOX8C0", PMC44, CBOX8, MSR_UNC_C8_PMON_CTL0, MSR_UNC_C8_PMON_CTR0, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    {"CBOX8C1", PMC45, CBOX8, MSR_UNC_C8_PMON_CTL1, MSR_UNC_C8_PMON_CTR1, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    {"CBOX8C2", PMC46, CBOX8, MSR_UNC_C8_PMON_CTL2, MSR_UNC_C8_PMON_CTR2, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    {"CBOX8C3", PMC47, CBOX8, MSR_UNC_C8_PMON_CTL3, MSR_UNC_C8_PMON_CTR3, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    {"CBOX9C0", PMC48, CBOX9, MSR_UNC_C9_PMON_CTL0, MSR_UNC_C9_PMON_CTR0, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    {"CBOX9C1", PMC49, CBOX9, MSR_UNC_C9_PMON_CTL1, MSR_UNC_C9_PMON_CTR1, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    {"CBOX9C2", PMC50, CBOX9, MSR_UNC_C9_PMON_CTL2, MSR_UNC_C9_PMON_CTR2, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    {"CBOX9C3", PMC51, CBOX9, MSR_UNC_C9_PMON_CTL3, MSR_UNC_C9_PMON_CTR3, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    {"CBOX10C0", PMC52, CBOX10, MSR_UNC_C10_PMON_CTL0, MSR_UNC_C10_PMON_CTR0, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    {"CBOX10C1", PMC53, CBOX10, MSR_UNC_C10_PMON_CTL1, MSR_UNC_C10_PMON_CTR1, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    {"CBOX10C2", PMC54, CBOX10, MSR_UNC_C10_PMON_CTL2, MSR_UNC_C10_PMON_CTR2, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    {"CBOX10C3", PMC55, CBOX10, MSR_UNC_C10_PMON_CTL3, MSR_UNC_C10_PMON_CTR3, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    {"CBOX11C0", PMC56, CBOX11, MSR_UNC_C11_PMON_CTL0, MSR_UNC_C11_PMON_CTR0, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    {"CBOX11C1", PMC57, CBOX11, MSR_UNC_C11_PMON_CTL1, MSR_UNC_C11_PMON_CTR1, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    {"CBOX11C2", PMC58, CBOX11, MSR_UNC_C11_PMON_CTL2, MSR_UNC_C11_PMON_CTR2, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    {"CBOX11C3", PMC59, CBOX11, MSR_UNC_C11_PMON_CTL3, MSR_UNC_C11_PMON_CTR3, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    {"CBOX12C0", PMC60, CBOX12, MSR_UNC_C12_PMON_CTL0, MSR_UNC_C12_PMON_CTR0, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    {"CBOX12C1", PMC61, CBOX12, MSR_UNC_C12_PMON_CTL1, MSR_UNC_C12_PMON_CTR1, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    {"CBOX12C2", PMC62, CBOX12, MSR_UNC_C12_PMON_CTL2, MSR_UNC_C12_PMON_CTR2, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    {"CBOX12C3", PMC63, CBOX12, MSR_UNC_C12_PMON_CTL3, MSR_UNC_C12_PMON_CTR3, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    {"CBOX13C0", PMC64, CBOX13, MSR_UNC_C13_PMON_CTL0, MSR_UNC_C13_PMON_CTR0, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    {"CBOX13C1", PMC65, CBOX13, MSR_UNC_C13_PMON_CTL1, MSR_UNC_C13_PMON_CTR1, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    {"CBOX13C2", PMC66, CBOX13, MSR_UNC_C13_PMON_CTL2, MSR_UNC_C13_PMON_CTR2, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    {"CBOX13C3", PMC67, CBOX13, MSR_UNC_C13_PMON_CTL3, MSR_UNC_C13_PMON_CTR3, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    {"CBOX14C0", PMC68, CBOX14, MSR_UNC_C14_PMON_CTL0, MSR_UNC_C14_PMON_CTR0, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    {"CBOX14C1", PMC69, CBOX14, MSR_UNC_C14_PMON_CTL1, MSR_UNC_C14_PMON_CTR1, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    {"CBOX14C2", PMC70, CBOX14, MSR_UNC_C14_PMON_CTL2, MSR_UNC_C14_PMON_CTR2, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    {"CBOX14C3", PMC71, CBOX14, MSR_UNC_C14_PMON_CTL3, MSR_UNC_C14_PMON_CTR3, 0, 0, IVB_VALID_OPTIONS_CBOX},
+    /* Uncore management Counters: 2 48bit wide counters */
+    {"UBOX0", PMC72, UBOX, MSR_UNC_U_PMON_CTL0, MSR_UNC_U_PMON_CTR0, 0, 0, IVB_VALID_OPTIONS_UBOX},
+    {"UBOX1", PMC73, UBOX, MSR_UNC_U_PMON_CTL1, MSR_UNC_U_PMON_CTR1, 0, 0, IVB_VALID_OPTIONS_UBOX},
+    {"UBOXFIX", PMC74, UBOXFIX, MSR_UNC_U_UCLK_FIXED_CTL, MSR_UNC_U_UCLK_FIXED_CTR, 0, 0, 0},
+    /* PCU Counters: 4 48bit wide counters */
+    {"WBOX0", PMC75, WBOX, MSR_UNC_PCU_PMON_CTL0, MSR_UNC_PCU_PMON_CTR0, 0, 0, IVB_VALID_OPTIONS_WBOX},
+    {"WBOX1", PMC76, WBOX, MSR_UNC_PCU_PMON_CTL1, MSR_UNC_PCU_PMON_CTR1, 0, 0, IVB_VALID_OPTIONS_WBOX},
+    {"WBOX2", PMC77, WBOX, MSR_UNC_PCU_PMON_CTL2, MSR_UNC_PCU_PMON_CTR2, 0, 0, IVB_VALID_OPTIONS_WBOX},
+    {"WBOX3", PMC78, WBOX, MSR_UNC_PCU_PMON_CTL3, MSR_UNC_PCU_PMON_CTR3, 0, 0, IVB_VALID_OPTIONS_WBOX},
+    {"WBOX0FIX", PMC79, WBOX0FIX, 0, MSR_UNC_PCU_PMON_FIXED_CTR0, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"WBOX1FIX", PMC80, WBOX1FIX, 0, MSR_UNC_PCU_PMON_FIXED_CTR1, 0, 0, EVENT_OPTION_NONE_MASK},
     /* IMC Counters: 4 48bit wide per memory channel, split in two reads */
-    {"MBOX0C0",PMC12, MBOX0, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_CH_2},
-    {"MBOX0C1",PMC13, MBOX0, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_CH_2},
-    {"MBOX0C2",PMC14, MBOX0, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_CH_2},
-    {"MBOX0C3",PMC15, MBOX0, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_CH_2},
-    {"MBOX0FIX",PMC16, MBOXFIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_CH_2},
-    {"MBOX1C0",PMC17, MBOX1, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_CH_3},
-    {"MBOX1C1",PMC18, MBOX1, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_CH_3},
-    {"MBOX1C2",PMC19, MBOX1, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_CH_3},
-    {"MBOX1C3",PMC20, MBOX1, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_CH_3},
-    {"MBOX1FIX",PMC21, MBOXFIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_CH_3},
-    {"MBOX2C0",PMC22, MBOX2, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_CH_0},
-    {"MBOX2C1",PMC23, MBOX2, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_CH_0},
-    {"MBOX2C2",PMC24, MBOX2, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_CH_0},
-    {"MBOX2C3",PMC25, MBOX2, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_CH_0},
-    {"MBOX2FIX",PMC26, MBOXFIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_CH_0},
-    {"MBOX3C0",PMC27, MBOX3, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_CH_1},
-    {"MBOX3C1",PMC28, MBOX3, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_CH_1},
-    {"MBOX3C2",PMC29, MBOX3, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_CH_1},
-    {"MBOX3C3",PMC30, MBOX3, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_CH_1},
-    {"MBOX3FIX",PMC31, MBOXFIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_CH_1},
+    {"MBOX0C0",PMC81, MBOX0, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_0_CH_0, IVB_VALID_OPTIONS_MBOX},
+    {"MBOX0C1",PMC82, MBOX0, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_0_CH_0, IVB_VALID_OPTIONS_MBOX},
+    {"MBOX0C2",PMC83, MBOX0, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_0_CH_0, IVB_VALID_OPTIONS_MBOX},
+    {"MBOX0C3",PMC84, MBOX0, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_0_CH_0, IVB_VALID_OPTIONS_MBOX},
+    {"MBOX0FIX",PMC85, MBOX0FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_0_CH_0, EVENT_OPTION_NONE_MASK},
+    {"MBOX1C0",PMC86, MBOX1, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_0_CH_1, IVB_VALID_OPTIONS_MBOX},
+    {"MBOX1C1",PMC87, MBOX1, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_0_CH_1, IVB_VALID_OPTIONS_MBOX},
+    {"MBOX1C2",PMC88, MBOX1, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_0_CH_1, IVB_VALID_OPTIONS_MBOX},
+    {"MBOX1C3",PMC89, MBOX1, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_0_CH_1, IVB_VALID_OPTIONS_MBOX},
+    {"MBOX1FIX",PMC90, MBOX1FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_0_CH_1, EVENT_OPTION_NONE_MASK},
+    {"MBOX2C0",PMC91, MBOX2, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_0_CH_2, IVB_VALID_OPTIONS_MBOX},
+    {"MBOX2C1",PMC92, MBOX2, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_0_CH_2, IVB_VALID_OPTIONS_MBOX},
+    {"MBOX2C2",PMC93, MBOX2, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_0_CH_2, IVB_VALID_OPTIONS_MBOX},
+    {"MBOX2C3",PMC94, MBOX2, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_0_CH_2, IVB_VALID_OPTIONS_MBOX},
+    {"MBOX2FIX",PMC95, MBOX2FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_0_CH_2, EVENT_OPTION_NONE_MASK},
+    {"MBOX3C0",PMC96, MBOX3, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_0_CH_3, IVB_VALID_OPTIONS_MBOX},
+    {"MBOX3C1",PMC97, MBOX3, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_0_CH_3, IVB_VALID_OPTIONS_MBOX},
+    {"MBOX3C2",PMC98, MBOX3, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_0_CH_3, IVB_VALID_OPTIONS_MBOX},
+    {"MBOX3C3",PMC99, MBOX3, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_0_CH_3, IVB_VALID_OPTIONS_MBOX},
+    {"MBOX3FIX",PMC100, MBOX3FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_0_CH_3, EVENT_OPTION_NONE_MASK},
+    {"MBOX4C0",PMC101, MBOX4, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_1_CH_0, IVB_VALID_OPTIONS_MBOX},
+    {"MBOX4C1",PMC102, MBOX4, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_1_CH_0, IVB_VALID_OPTIONS_MBOX},
+    {"MBOX4C2",PMC103, MBOX4, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_1_CH_0, IVB_VALID_OPTIONS_MBOX},
+    {"MBOX4C3",PMC104, MBOX4, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_1_CH_0, IVB_VALID_OPTIONS_MBOX},
+    {"MBOX4FIX",PMC105, MBOX4FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_1_CH_0, EVENT_OPTION_NONE_MASK},
+    {"MBOX5C0",PMC106, MBOX5, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_1_CH_1, IVB_VALID_OPTIONS_MBOX},
+    {"MBOX5C1",PMC107, MBOX5, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_1_CH_1, IVB_VALID_OPTIONS_MBOX},
+    {"MBOX5C2",PMC108, MBOX5, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_1_CH_1, IVB_VALID_OPTIONS_MBOX},
+    {"MBOX5C3",PMC109, MBOX5, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_1_CH_1, IVB_VALID_OPTIONS_MBOX},
+    {"MBOX5FIX",PMC110, MBOX5FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_1_CH_1, EVENT_OPTION_NONE_MASK},
+    {"MBOX6C0",PMC111, MBOX6, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_1_CH_2, IVB_VALID_OPTIONS_MBOX},
+    {"MBOX6C1",PMC112, MBOX6, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_1_CH_2, IVB_VALID_OPTIONS_MBOX},
+    {"MBOX6C2",PMC113, MBOX6, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_1_CH_2, IVB_VALID_OPTIONS_MBOX},
+    {"MBOX6C3",PMC114, MBOX6, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_1_CH_2, IVB_VALID_OPTIONS_MBOX},
+    {"MBOX6FIX",PMC115, MBOX6FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_1_CH_2, EVENT_OPTION_NONE_MASK},
+    {"MBOX7C0",PMC116, MBOX7, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_1_CH_3, IVB_VALID_OPTIONS_MBOX},
+    {"MBOX7C1",PMC117, MBOX7, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_1_CH_3, IVB_VALID_OPTIONS_MBOX},
+    {"MBOX7C2",PMC118, MBOX7, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_1_CH_3, IVB_VALID_OPTIONS_MBOX},
+    {"MBOX7C3",PMC119, MBOX7, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_1_CH_3, IVB_VALID_OPTIONS_MBOX},
+    {"MBOX7FIX",PMC120, MBOX7FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_1_CH_3, EVENT_OPTION_NONE_MASK},
+    /* QPI counters four 48bit wide per port, split in two reads */
+    {"SBOX0C0",PMC121, SBOX0, PCI_UNC_QPI_PMON_CTL_0, PCI_UNC_QPI_PMON_CTR_0_A, PCI_UNC_QPI_PMON_CTR_0_B, PCI_QPI_DEVICE_PORT_0, IVB_VALID_OPTIONS_SBOX},
+    {"SBOX0C1",PMC122, SBOX0, PCI_UNC_QPI_PMON_CTL_1, PCI_UNC_QPI_PMON_CTR_1_A, PCI_UNC_QPI_PMON_CTR_1_B, PCI_QPI_DEVICE_PORT_0, IVB_VALID_OPTIONS_SBOX},
+    {"SBOX0C2",PMC123, SBOX0, PCI_UNC_QPI_PMON_CTL_2, PCI_UNC_QPI_PMON_CTR_2_A, PCI_UNC_QPI_PMON_CTR_2_B, PCI_QPI_DEVICE_PORT_0, IVB_VALID_OPTIONS_SBOX},
+    {"SBOX0C3",PMC124, SBOX0, PCI_UNC_QPI_PMON_CTL_3, PCI_UNC_QPI_PMON_CTR_3_A, PCI_UNC_QPI_PMON_CTR_3_B, PCI_QPI_DEVICE_PORT_0, IVB_VALID_OPTIONS_SBOX},
+    {"SBOX1C0",PMC125, SBOX1, PCI_UNC_QPI_PMON_CTL_0, PCI_UNC_QPI_PMON_CTR_0_A, PCI_UNC_QPI_PMON_CTR_0_B, PCI_QPI_DEVICE_PORT_1, IVB_VALID_OPTIONS_SBOX},
+    {"SBOX1C1",PMC126, SBOX1, PCI_UNC_QPI_PMON_CTL_1, PCI_UNC_QPI_PMON_CTR_1_A, PCI_UNC_QPI_PMON_CTR_1_B, PCI_QPI_DEVICE_PORT_1, IVB_VALID_OPTIONS_SBOX},
+    {"SBOX1C2",PMC127, SBOX1, PCI_UNC_QPI_PMON_CTL_2, PCI_UNC_QPI_PMON_CTR_2_A, PCI_UNC_QPI_PMON_CTR_2_B, PCI_QPI_DEVICE_PORT_1, IVB_VALID_OPTIONS_SBOX},
+    {"SBOX1C3",PMC128, SBOX1, PCI_UNC_QPI_PMON_CTL_3, PCI_UNC_QPI_PMON_CTR_3_A, PCI_UNC_QPI_PMON_CTR_3_B, PCI_QPI_DEVICE_PORT_1, IVB_VALID_OPTIONS_SBOX},
+    {"SBOX2C0",PMC129, SBOX2, PCI_UNC_QPI_PMON_CTL_0, PCI_UNC_QPI_PMON_CTR_0_A, PCI_UNC_QPI_PMON_CTR_0_B, PCI_QPI_DEVICE_PORT_2, IVB_VALID_OPTIONS_SBOX},
+    {"SBOX2C1",PMC130, SBOX2, PCI_UNC_QPI_PMON_CTL_1, PCI_UNC_QPI_PMON_CTR_1_A, PCI_UNC_QPI_PMON_CTR_1_B, PCI_QPI_DEVICE_PORT_2, IVB_VALID_OPTIONS_SBOX},
+    {"SBOX2C2",PMC131, SBOX2, PCI_UNC_QPI_PMON_CTL_2, PCI_UNC_QPI_PMON_CTR_2_A, PCI_UNC_QPI_PMON_CTR_2_B, PCI_QPI_DEVICE_PORT_2, IVB_VALID_OPTIONS_SBOX},
+    {"SBOX2C3",PMC132, SBOX2, PCI_UNC_QPI_PMON_CTL_3, PCI_UNC_QPI_PMON_CTR_3_A, PCI_UNC_QPI_PMON_CTR_3_B, PCI_QPI_DEVICE_PORT_2, IVB_VALID_OPTIONS_SBOX},
+    {"SBOX0FIX",PMC133, SBOX0FIX, 0, PCI_UNC_QPI_RATE_STATUS, 0, PCI_QPI_MISC_DEVICE_PORT_0, EVENT_OPTION_NONE_MASK},
+    {"SBOX1FIX",PMC134, SBOX1FIX, 0, PCI_UNC_QPI_RATE_STATUS, 0, PCI_QPI_MISC_DEVICE_PORT_0, EVENT_OPTION_NONE_MASK},
+    {"SBOX2FIX",PMC135, SBOX2FIX, 0, PCI_UNC_QPI_RATE_STATUS, 0, PCI_QPI_MISC_DEVICE_PORT_2, EVENT_OPTION_NONE_MASK},
+    /* HA counters four 48bit wide per counter, split in two reads */
+    {"BBOX0C0", PMC136, BBOX0, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_HA_PMON_CTR_0_A, PCI_UNC_HA_PMON_CTR_0_B, PCI_HA_DEVICE_0, IVB_VALID_OPTIONS_BBOX},
+    {"BBOX0C1", PMC137, BBOX0, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_HA_PMON_CTR_1_A, PCI_UNC_HA_PMON_CTR_1_B, PCI_HA_DEVICE_0, IVB_VALID_OPTIONS_BBOX},
+    {"BBOX0C2", PMC138, BBOX0, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_HA_PMON_CTR_2_A, PCI_UNC_HA_PMON_CTR_2_B, PCI_HA_DEVICE_0, IVB_VALID_OPTIONS_BBOX},
+    {"BBOX0C3", PMC139, BBOX0, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_HA_PMON_CTR_3_A, PCI_UNC_HA_PMON_CTR_3_B, PCI_HA_DEVICE_0, IVB_VALID_OPTIONS_BBOX},
+    {"BBOX1C0", PMC140, BBOX1, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_HA_PMON_CTR_0_A, PCI_UNC_HA_PMON_CTR_0_B, PCI_HA_DEVICE_1, IVB_VALID_OPTIONS_BBOX},
+    {"BBOX1C1", PMC141, BBOX1, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_HA_PMON_CTR_1_A, PCI_UNC_HA_PMON_CTR_1_B, PCI_HA_DEVICE_1, IVB_VALID_OPTIONS_BBOX},
+    {"BBOX1C2", PMC142, BBOX1, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_HA_PMON_CTR_2_A, PCI_UNC_HA_PMON_CTR_2_B, PCI_HA_DEVICE_1, IVB_VALID_OPTIONS_BBOX},
+    {"BBOX1C3", PMC143, BBOX1, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_HA_PMON_CTR_3_A, PCI_UNC_HA_PMON_CTR_3_B, PCI_HA_DEVICE_1, IVB_VALID_OPTIONS_BBOX},
+    /* R2PCIe counters four 44bit wide per counter, split in two reads */
+    {"PBOX0", PMC144, PBOX, PCI_UNC_R2PCIE_PMON_CTL_0, PCI_UNC_R2PCIE_PMON_CTR_0_A, PCI_UNC_R2PCIE_PMON_CTR_0_B, PCI_R2PCIE_DEVICE, IVB_VALID_OPTIONS_PBOX},
+    {"PBOX1", PMC145, PBOX, PCI_UNC_R2PCIE_PMON_CTL_1, PCI_UNC_R2PCIE_PMON_CTR_1_A, PCI_UNC_R2PCIE_PMON_CTR_1_B, PCI_R2PCIE_DEVICE, IVB_VALID_OPTIONS_PBOX},
+    {"PBOX2", PMC146, PBOX, PCI_UNC_R2PCIE_PMON_CTL_2, PCI_UNC_R2PCIE_PMON_CTR_2_A, PCI_UNC_R2PCIE_PMON_CTR_2_B, PCI_R2PCIE_DEVICE, IVB_VALID_OPTIONS_PBOX},
+    {"PBOX3", PMC147, PBOX, PCI_UNC_R2PCIE_PMON_CTL_3, PCI_UNC_R2PCIE_PMON_CTR_3_A, PCI_UNC_R2PCIE_PMON_CTR_3_B, PCI_R2PCIE_DEVICE, IVB_VALID_OPTIONS_PBOX},
+    /* R3QPI counters: three 44-bit wide counters per link, each split in two reads */
+    {"RBOX0C0", PMC148, RBOX0, PCI_UNC_R3QPI_PMON_CTL_0, PCI_UNC_R3QPI_PMON_CTR_0_A, PCI_UNC_R3QPI_PMON_CTR_0_B, PCI_R3QPI_DEVICE_LINK_0, IVB_VALID_OPTIONS_RBOX},
+    {"RBOX0C1", PMC149, RBOX0, PCI_UNC_R3QPI_PMON_CTL_1, PCI_UNC_R3QPI_PMON_CTR_1_A, PCI_UNC_R3QPI_PMON_CTR_1_B, PCI_R3QPI_DEVICE_LINK_0, IVB_VALID_OPTIONS_RBOX},
+    {"RBOX0C2", PMC150, RBOX0, PCI_UNC_R3QPI_PMON_CTL_2, PCI_UNC_R3QPI_PMON_CTR_2_A, PCI_UNC_R3QPI_PMON_CTR_2_B, PCI_R3QPI_DEVICE_LINK_0, IVB_VALID_OPTIONS_RBOX},
+    {"RBOX1C0", PMC151, RBOX1, PCI_UNC_R3QPI_PMON_CTL_0, PCI_UNC_R3QPI_PMON_CTR_0_A, PCI_UNC_R3QPI_PMON_CTR_0_B, PCI_R3QPI_DEVICE_LINK_1, IVB_VALID_OPTIONS_RBOX},
+    {"RBOX1C1", PMC152, RBOX1, PCI_UNC_R3QPI_PMON_CTL_1, PCI_UNC_R3QPI_PMON_CTR_1_A, PCI_UNC_R3QPI_PMON_CTR_1_B, PCI_R3QPI_DEVICE_LINK_1, IVB_VALID_OPTIONS_RBOX},
+    {"RBOX1C2", PMC153, RBOX1, PCI_UNC_R3QPI_PMON_CTL_2, PCI_UNC_R3QPI_PMON_CTR_2_A, PCI_UNC_R3QPI_PMON_CTR_2_B, PCI_R3QPI_DEVICE_LINK_1, IVB_VALID_OPTIONS_RBOX},
+    {"RBOX2C0", PMC154, RBOX2, PCI_UNC_R3QPI_PMON_CTL_0, PCI_UNC_R3QPI_PMON_CTR_0_A, PCI_UNC_R3QPI_PMON_CTR_0_B, PCI_R3QPI_DEVICE_LINK_2, IVB_VALID_OPTIONS_RBOX},
+    {"RBOX2C1", PMC155, RBOX2, PCI_UNC_R3QPI_PMON_CTL_1, PCI_UNC_R3QPI_PMON_CTR_1_A, PCI_UNC_R3QPI_PMON_CTR_1_B, PCI_R3QPI_DEVICE_LINK_2, IVB_VALID_OPTIONS_RBOX},
+    {"RBOX2C2", PMC156, RBOX2, PCI_UNC_R3QPI_PMON_CTL_2, PCI_UNC_R3QPI_PMON_CTR_2_A, PCI_UNC_R3QPI_PMON_CTR_2_B, PCI_R3QPI_DEVICE_LINK_2, IVB_VALID_OPTIONS_RBOX},
+    /* IRP counters: two 44-bit wide counters per box (four in total), one counter register each */
+    {"IBOX0C0", PMC157, IBOX0, PCI_UNC_IRP0_PMON_CTL_0, PCI_UNC_IRP0_PMON_CTR_0, 0, PCI_IRP_DEVICE, IVB_VALID_OPTIONS_IBOX},
+    {"IBOX0C1", PMC158, IBOX0, PCI_UNC_IRP0_PMON_CTL_1, PCI_UNC_IRP0_PMON_CTR_1, 0, PCI_IRP_DEVICE, IVB_VALID_OPTIONS_IBOX},
+    {"IBOX1C0", PMC159, IBOX1, PCI_UNC_IRP1_PMON_CTL_0, PCI_UNC_IRP1_PMON_CTR_0, 0, PCI_IRP_DEVICE, IVB_VALID_OPTIONS_IBOX},
+    {"IBOX1C1", PMC160, IBOX1, PCI_UNC_IRP1_PMON_CTL_1, PCI_UNC_IRP1_PMON_CTR_1, 0, PCI_IRP_DEVICE, IVB_VALID_OPTIONS_IBOX},
 };
 
+static BoxMap ivybridge_box_map[NUM_UNITS] = { /* Per-unit box description for the ivybridge (non-EP) counter set; only PMC, FIXED, THERMAL and POWER get explicit entries, every other unit index is zero-initialized. NOTE(review): the initializer order is presumably {control reg, status reg, overflow reg, overflow bit offset, is-PCI flag, device, counter width in bits} -- confirm against the BoxMap definition. */
+    [PMC] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, -1, 0, 0, 48}, /* PMC and FIXED reference the same three global MSRs */
+    [FIXED] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, -1, 0, 0, 48},
+    [THERMAL] = {0, 0, 0, 0, 0, MSR_DEV, 8}, /* no box-level registers named; last value presumably the counter width -- TODO confirm */
+    [POWER] = {0, 0, 0, 0, 0, MSR_DEV, 32} /* no box-level registers named; last value presumably the counter width -- TODO confirm */
+};
 
+static BoxMap ivybridgeEP_box_map[NUM_UNITS] = { /* Per-unit box description for the ivybridgeEP counter set: the four core-side units plus the uncore boxes (MBOX, CBOX, BBOX, SBOX, WBOX, UBOX, PBOX, RBOX, IBOX). NOTE(review): the initializer order is presumably {control reg, status reg, overflow reg, overflow bit offset, is-PCI flag, device, counter width in bits, optional filter reg(s)} -- confirm against the BoxMap definition. */
+    [PMC] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, -1, 0, 0, 48},
+    [FIXED] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, -1, 0, 0, 48},
+    [THERMAL] = {0, 0, 0, 0, 0, MSR_DEV, 8},
+    [POWER] = {0, 0, 0, 0, 0, MSR_DEV, 32},
+    [MBOX0] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 20, 1, PCI_IMC_DEVICE_0_CH_0, 48}, /* MBOX0..3 (PCI_IMC_DEVICE_0_*) all carry 20 in the fourth field; each MBOXnFIX entry repeats its MBOXn values but names no status/overflow register */
+    [MBOX0FIX] = {PCI_UNC_MC_PMON_BOX_CTL, 0, 0, 20, 1, PCI_IMC_DEVICE_0_CH_0, 48},
+    [MBOX1] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 20, 1, PCI_IMC_DEVICE_0_CH_1, 48},
+    [MBOX1FIX] = {PCI_UNC_MC_PMON_BOX_CTL, 0, 0, 20, 1, PCI_IMC_DEVICE_0_CH_1, 48},
+    [MBOX2] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 20, 1, PCI_IMC_DEVICE_0_CH_2, 48},
+    [MBOX2FIX] = {PCI_UNC_MC_PMON_BOX_CTL, 0, 0, 20, 1, PCI_IMC_DEVICE_0_CH_2, 48},
+    [MBOX3] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 20, 1, PCI_IMC_DEVICE_0_CH_3, 48},
+    [MBOX3FIX] = {PCI_UNC_MC_PMON_BOX_CTL, 0, 0, 20, 1, PCI_IMC_DEVICE_0_CH_3, 48},
+    [MBOX4] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 21, 1, PCI_IMC_DEVICE_1_CH_0, 48}, /* MBOX4..7 (PCI_IMC_DEVICE_1_*) all carry 21 in the fourth field */
+    [MBOX4FIX] = {PCI_UNC_MC_PMON_BOX_CTL, 0, 0, 21, 1, PCI_IMC_DEVICE_1_CH_0, 48},
+    [MBOX5] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 21, 1, PCI_IMC_DEVICE_1_CH_1, 48},
+    [MBOX5FIX] = {PCI_UNC_MC_PMON_BOX_CTL, 0, 0, 21, 1, PCI_IMC_DEVICE_1_CH_1, 48},
+    [MBOX6] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 21, 1, PCI_IMC_DEVICE_1_CH_2, 48},
+    [MBOX6FIX] = {PCI_UNC_MC_PMON_BOX_CTL, 0, 0, 21, 1, PCI_IMC_DEVICE_1_CH_2, 48},
+    [MBOX7] = {PCI_UNC_MC_PMON_BOX_CTL, PCI_UNC_MC_PMON_BOX_STATUS, PCI_UNC_MC_PMON_BOX_STATUS, 21, 1, PCI_IMC_DEVICE_1_CH_3, 48},
+    [MBOX7FIX] = {PCI_UNC_MC_PMON_BOX_CTL, 0, 0, 21, 1, PCI_IMC_DEVICE_1_CH_3, 48},
+    [CBOX0] = {MSR_UNC_C0_PMON_BOX_CTL, 0, 0, 3, 0, 0, 44, MSR_UNC_C0_PMON_BOX_FILTER, MSR_UNC_C0_PMON_BOX_FILTER1}, /* CBOX0..14: fourth field runs 3..17; the two extra trailing initializers name the box's FILTER and FILTER1 MSRs */
+    [CBOX1] = {MSR_UNC_C1_PMON_BOX_CTL, 0, 0, 4, 0, 0, 44, MSR_UNC_C1_PMON_BOX_FILTER, MSR_UNC_C1_PMON_BOX_FILTER1},
+    [CBOX2] = {MSR_UNC_C2_PMON_BOX_CTL, 0, 0, 5, 0, 0, 44, MSR_UNC_C2_PMON_BOX_FILTER, MSR_UNC_C2_PMON_BOX_FILTER1},
+    [CBOX3] = {MSR_UNC_C3_PMON_BOX_CTL, 0, 0, 6, 0, 0, 44, MSR_UNC_C3_PMON_BOX_FILTER, MSR_UNC_C3_PMON_BOX_FILTER1},
+    [CBOX4] = {MSR_UNC_C4_PMON_BOX_CTL, 0, 0, 7, 0, 0, 44, MSR_UNC_C4_PMON_BOX_FILTER, MSR_UNC_C4_PMON_BOX_FILTER1},
+    [CBOX5] = {MSR_UNC_C5_PMON_BOX_CTL, 0, 0, 8, 0, 0, 44, MSR_UNC_C5_PMON_BOX_FILTER, MSR_UNC_C5_PMON_BOX_FILTER1},
+    [CBOX6] = {MSR_UNC_C6_PMON_BOX_CTL, 0, 0, 9, 0, 0, 44, MSR_UNC_C6_PMON_BOX_FILTER, MSR_UNC_C6_PMON_BOX_FILTER1},
+    [CBOX7] = {MSR_UNC_C7_PMON_BOX_CTL, 0, 0, 10, 0, 0, 44, MSR_UNC_C7_PMON_BOX_FILTER, MSR_UNC_C7_PMON_BOX_FILTER1},
+    [CBOX8] = {MSR_UNC_C8_PMON_BOX_CTL, 0, 0, 11, 0, 0, 44, MSR_UNC_C8_PMON_BOX_FILTER, MSR_UNC_C8_PMON_BOX_FILTER1},
+    [CBOX9] = {MSR_UNC_C9_PMON_BOX_CTL, 0, 0, 12, 0, 0, 44, MSR_UNC_C9_PMON_BOX_FILTER, MSR_UNC_C9_PMON_BOX_FILTER1},
+    [CBOX10] = {MSR_UNC_C10_PMON_BOX_CTL, 0, 0, 13, 0, 0, 44, MSR_UNC_C10_PMON_BOX_FILTER, MSR_UNC_C10_PMON_BOX_FILTER1},
+    [CBOX11] = {MSR_UNC_C11_PMON_BOX_CTL, 0, 0, 14, 0, 0, 44, MSR_UNC_C11_PMON_BOX_FILTER, MSR_UNC_C11_PMON_BOX_FILTER1},
+    [CBOX12] = {MSR_UNC_C12_PMON_BOX_CTL, 0, 0, 15, 0, 0, 44, MSR_UNC_C12_PMON_BOX_FILTER, MSR_UNC_C12_PMON_BOX_FILTER1},
+    [CBOX13] = {MSR_UNC_C13_PMON_BOX_CTL, 0, 0, 16, 0, 0, 44, MSR_UNC_C13_PMON_BOX_FILTER, MSR_UNC_C13_PMON_BOX_FILTER1},
+    [CBOX14] = {MSR_UNC_C14_PMON_BOX_CTL, 0, 0, 17, 0, 0, 44, MSR_UNC_C14_PMON_BOX_FILTER, MSR_UNC_C14_PMON_BOX_FILTER1},
+    [BBOX0] = {PCI_UNC_HA_PMON_BOX_CTL, PCI_UNC_HA_PMON_BOX_STATUS, PCI_UNC_HA_PMON_BOX_STATUS, 18, 1, PCI_HA_DEVICE_0, 48},
+    [BBOX1] = {PCI_UNC_HA_PMON_BOX_CTL, PCI_UNC_HA_PMON_BOX_STATUS, PCI_UNC_HA_PMON_BOX_STATUS, 19, 1, PCI_HA_DEVICE_1, 48},
+    [SBOX0] = {PCI_UNC_QPI_PMON_BOX_CTL, PCI_UNC_QPI_PMON_BOX_STATUS, PCI_UNC_QPI_PMON_BOX_STATUS, 22, 1, PCI_QPI_DEVICE_PORT_0, 48},
+    [SBOX1] = {PCI_UNC_QPI_PMON_BOX_CTL, PCI_UNC_QPI_PMON_BOX_STATUS, PCI_UNC_QPI_PMON_BOX_STATUS, 23, 1, PCI_QPI_DEVICE_PORT_1, 48},
+    [SBOX2] = {PCI_UNC_QPI_PMON_BOX_CTL, PCI_UNC_QPI_PMON_BOX_STATUS, PCI_UNC_QPI_PMON_BOX_STATUS, -1, 1, PCI_QPI_DEVICE_PORT_2, 48}, /* fourth field is -1 here, as for PMC/FIXED; presumably "no overflow bit" -- TODO confirm */
+    [SBOX0FIX] = {0, 0, 0, 0, 1, PCI_QPI_MISC_DEVICE_PORT_0, 64},
+    [SBOX1FIX] = {0, 0, 0, 0, 1, PCI_QPI_MISC_DEVICE_PORT_0, 64}, /* SBOX1FIX references the same PORT_0 misc device as SBOX0FIX */
+    [SBOX2FIX] = {0, 0, 0, 0, 1, PCI_QPI_MISC_DEVICE_PORT_2, 64},
+    [WBOX] = {MSR_UNC_PCU_PMON_BOX_CTL, MSR_UNC_PCU_PMON_BOX_STATUS, MSR_UNC_PCU_PMON_BOX_STATUS, 2, 0, 0, 48,  MSR_UNC_PCU_PMON_BOX_FILTER}, /* one extra trailing initializer: the PCU box filter MSR */
+    [WBOX0FIX] = {0, 0, 0, 0, 0, 0, 64},
+    [WBOX1FIX] = {0, 0, 0, 0, 0, 0, 64},
+    [UBOX] = {0, MSR_UNC_U_PMON_BOX_STATUS, MSR_UNC_U_PMON_BOX_STATUS, 1, 0, 0, 44}, /* no box control register named for UBOX/UBOXFIX */
+    [UBOXFIX] = {0, MSR_UNC_U_PMON_BOX_STATUS, MSR_UNC_U_PMON_BOX_STATUS, 0, 0, 0, 44},
+    [PBOX] = {PCI_UNC_R2PCIE_PMON_BOX_CTL, PCI_UNC_R2PCIE_PMON_BOX_STATUS, PCI_UNC_R2PCIE_PMON_BOX_STATUS, 26, 1,PCI_R2PCIE_DEVICE, 44},
+    [RBOX0] = {PCI_UNC_R3QPI_PMON_BOX_CTL, PCI_UNC_R3QPI_PMON_BOX_STATUS, PCI_UNC_R3QPI_PMON_BOX_STATUS, 24, 1,PCI_R3QPI_DEVICE_LINK_0, 44},
+    [RBOX1] = {PCI_UNC_R3QPI_PMON_BOX_CTL, PCI_UNC_R3QPI_PMON_BOX_STATUS, PCI_UNC_R3QPI_PMON_BOX_STATUS, 25, 1,PCI_R3QPI_DEVICE_LINK_1, 44},
+    [RBOX2] = {PCI_UNC_R3QPI_PMON_BOX_CTL, PCI_UNC_R3QPI_PMON_BOX_STATUS, PCI_UNC_R3QPI_PMON_BOX_STATUS, -1, 1,PCI_R3QPI_DEVICE_LINK_2, 44}, /* fourth field is -1 for RBOX2 and both IBOX entries */
+    [IBOX0] = {PCI_UNC_IRP_PMON_BOX_CTL, PCI_UNC_IRP_PMON_BOX_STATUS, PCI_UNC_IRP_PMON_BOX_STATUS, -1, 1, PCI_IRP_DEVICE, 44}, /* IBOX0 and IBOX1 are described by identical values, including the same PCI_IRP_DEVICE */
+    [IBOX1] = {PCI_UNC_IRP_PMON_BOX_CTL, PCI_UNC_IRP_PMON_BOX_STATUS, PCI_UNC_IRP_PMON_BOX_STATUS, -1, 1, PCI_IRP_DEVICE, 44},
+};
+
+static PciDevice ivybridgeEP_pci_devices[MAX_NUM_PCI_DEVICES] = { /* Table of the uncore PCI devices referenced by the ivybridgeEP counter and box maps, indexed by device enum. NOTE(review): the initializer order is presumably {device type, "device.function" path suffix, device name, LIKWID unit name, PCI device id} -- confirm against the PciDevice definition. */
+ [MSR_DEV] = {NONE, "", "", ""}, /* placeholder entry: empty strings and no device id given (remaining members are zero-initialized) */
+ [PCI_R3QPI_DEVICE_LINK_0] = {R3QPI, "13.5", "PCI_R3QPI_DEVICE_LINK_0", "RBOX0", 0x0e36},
+ [PCI_R3QPI_DEVICE_LINK_1] = {R3QPI, "13.6", "PCI_R3QPI_DEVICE_LINK_1", "RBOX1", 0x0e37},
+ [PCI_R3QPI_DEVICE_LINK_2] = {R3QPI, "12.5", "PCI_R3QPI_DEVICE_LINK_2", "RBOX2", 0x0e3e},
+ [PCI_R2PCIE_DEVICE] = {R2PCIE, "13.1", "PCI_R2PCIE_DEVICE", "PBOX0", 0x0e34},
+ [PCI_IMC_DEVICE_0_CH_0] = {IMC, "10.4", "PCI_IMC_DEVICE_0_CH_0", "MBOX0", 0x0eb4},
+ [PCI_IMC_DEVICE_0_CH_1] = {IMC, "10.5", "PCI_IMC_DEVICE_0_CH_1", "MBOX1", 0x0eb5},
+ [PCI_IMC_DEVICE_0_CH_2] = {IMC, "10.0", "PCI_IMC_DEVICE_0_CH_2", "MBOX2", 0x0eb0},
+ [PCI_IMC_DEVICE_0_CH_3] = {IMC, "10.1", "PCI_IMC_DEVICE_0_CH_3", "MBOX3", 0x0eb1},
+ [PCI_HA_DEVICE_0] = {HA, "0e.1", "PCI_HA_DEVICE_0", "BBOX0", 0x0e30},
+ [PCI_HA_DEVICE_1] = {HA, "1c.1", "PCI_HA_DEVICE_1", "BBOX1", 0x0e38},
+ [PCI_IMC_DEVICE_1_CH_0] = {IMC, "1e.4", "PCI_IMC_DEVICE_1_CH_0", "MBOX4", 0x0ef4},
+ [PCI_IMC_DEVICE_1_CH_1] = {IMC, "1e.5", "PCI_IMC_DEVICE_1_CH_1", "MBOX5", 0x0ef5},
+ [PCI_IMC_DEVICE_1_CH_2] = {IMC, "1e.0", "PCI_IMC_DEVICE_1_CH_2", "MBOX6", 0x0ef0},
+ [PCI_IMC_DEVICE_1_CH_3] = {IMC, "1e.1", "PCI_IMC_DEVICE_1_CH_3", "MBOX7", 0x0ef1},
+ [PCI_IRP_DEVICE] = {IRP, "05.6", "PCI_IRP_DEVICE", NULL, 0x0e39}, /* fourth member is NULL here and for the QPI mask devices below */
+ [PCI_QPI_DEVICE_PORT_0] = {QPI, "08.2", "PCI_QPI_DEVICE_PORT_0", "SBOX0", 0x0e32},
+ [PCI_QPI_DEVICE_PORT_1] = {QPI, "09.2", "PCI_QPI_DEVICE_PORT_1", "SBOX1", 0x0e33},
+ [PCI_QPI_DEVICE_PORT_2] = {QPI, "0a.2", "PCI_QPI_DEVICE_PORT_2", "SBOX2", 0x0e3a},
+ [PCI_QPI_MASK_DEVICE_PORT_0] = {QPI, "08.6", "PCI_QPI_MASK_DEVICE_PORT_0", NULL, 0x0e86},
+ [PCI_QPI_MASK_DEVICE_PORT_1] = {QPI, "09.6", "PCI_QPI_MASK_DEVICE_PORT_1", NULL, 0x0e96},
+ [PCI_QPI_MASK_DEVICE_PORT_2] = {QPI, "0a.6", "PCI_QPI_MASK_DEVICE_PORT_2", NULL, 0x0ec6},
+ [PCI_QPI_MISC_DEVICE_PORT_0] = {QPI, "08.0", "PCI_QPI_MISC_DEVICE_PORT_0/1", "SBOX01FIX",0x0e80}, /* one misc device entry labelled for ports 0 and 1 ("0/1", "SBOX01FIX"); there is no separate PORT_1 misc entry in this table */
+ [PCI_QPI_MISC_DEVICE_PORT_2] = {QPI, "0a.0", "PCI_QPI_MISC_DEVICE_PORT_2", "SBOX2FIX", 0x0ec0},
+};
diff --git a/src/includes/perfmon_ivybridge_events.txt b/src/includes/perfmon_ivybridge_events.txt
index 5318ce6..deb11d4 100644
--- a/src/includes/perfmon_ivybridge_events.txt
+++ b/src/includes/perfmon_ivybridge_events.txt
@@ -1,16 +1,17 @@
 # =======================================================================================
-#  
+#
 #      Filename:  perfmon_ivybridge_events.txt
-# 
+#
 #      Description:  Event list for Intel Ivy Bridge
-# 
-#      Version:   3.1.3
-#      Released:  4.11.2014
-# 
-#      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+#
+#      Version:   4.0
+#      Released:  16.6.2015
+#
+#      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+#                Thomas Roehl (tr), thomas.roehl at googlemail.com
 #      Project:  likwid
 #
-#      Copyright (C) 2014 Jan Treibig
+#      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
 #
 #      This program is free software: you can redistribute it and/or modify it under
 #      the terms of the GNU General Public License as published by the Free Software
@@ -35,6 +36,9 @@ UMASK_PWR_PKG_ENERGY          0x00
 EVENT_PWR_PP0_ENERGY          0x00   PWR1
 UMASK_PWR_PP0_ENERGY          0x00
 
+EVENT_PWR_PP1_ENERGY          0x00   PWR2
+UMASK_PWR_PP1_ENERGY          0x00
+
 EVENT_PWR_DRAM_ENERGY          0x00   PWR3
 UMASK_PWR_DRAM_ENERGY          0x00
 
@@ -120,7 +124,7 @@ EVENT_L1D_PEND_MISS              0x48   PMC1
 UMASK_L1D_PEND_MISS_PENDING      0x01
 
 EVENT_DTLB_STORE_MISSES                0x49   PMC
-UMASK_DTLB_STORE_MISSES_MISS_CAUSES_A_WALK   0x01
+UMASK_DTLB_STORE_MISSES_CAUSES_A_WALK   0x01
 UMASK_DTLB_STORE_MISSES_WALK_COMPLETED       0x02
 UMASK_DTLB_STORE_MISSES_WALK_DURATION       0x04
 UMASK_DTLB_STORE_MISSES_STLB_HIT             0x10
@@ -148,9 +152,6 @@ UMASK_CPL_CYCLES_RING123             0x02
 EVENT_RS_EVENTS               0x5E    PMC
 UMASK_RS_EVENTS_EMPTY_CYCLES    0x01
 
-EVENT_DTLB_LOAD_MISSES_STLB	0x5F PMC
-UMASK_DTLB_LOAD_MISSES_STLB_HIT 0x04
-
 EVENT_OFFCORE_REQUESTS_OUTSTANDING          0x60   PMC
 UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_DATA_RD   0x01
 UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_CODE_RD   0x02
@@ -378,284 +379,36 @@ UMASK_MEM_LOAD_UOPS_LLC_MISS_RETIRED_REMOTE_DRAM     0x0C
 UMASK_MEM_LOAD_UOPS_LLC_MISS_RETIRED_REMOTE_HITM     0x10
 UMASK_MEM_LOAD_UOPS_LLC_MISS_RETIRED_REMOTE_FWD     0x20
 
-EVENT_DRAM_CLOCKTICKS             0x00  MBOX
-UMASK_DRAM_CLOCKTICKS             0x00
-
-EVENT_ACT_COUNT                  0x01  MBOX
-UMASK_ACT_COUNT_RD                 0x01
-UMASK_ACT_COUNT_WR                 0x02
-UMASK_ACT_COUNT_BYP                0x08
-
-EVENT_BYP_CMDS                  0xA1  MBOX
-UMASK_BYP_CMDS_ACT                 0x01
-UMASK_BYP_CMDS_CAS                 0x02
-UMASK_BYP_CMDS_PRE                 0x04
-
-EVENT_CAS_COUNT                  0x04  MBOX
-UMASK_CAS_COUNT_RD_REG           0x01
-UMASK_CAS_COUNT_RD_UNDERFILL     0x02
-UMASK_CAS_COUNT_RD               0x03
-UMASK_CAS_COUNT_WR_WMM           0x04
-UMASK_CAS_COUNT_WR_RMM           0x08
-UMASK_CAS_COUNT_WR               0x0C
-UMASK_CAS_COUNT_ALL              0x0F
-UMASK_CAS_COUNT_RD_WMM           0x01
-UMASK_CAS_COUNT_RD_RMM           0x02
-
-EVENT_DRAM_PRE_ALL                  0x06  MBOX
-UMASK_DRAM_PRE_ALL                  0x00
-
-EVENT_DRAM_REFRESH                  0x05  MBOX
-UMASK_DRAM_REFRESH_PANIC            0x02
-UMASK_DRAM_REFRESH_HIGH             0x04
-
-EVENT_ECC_CORRECTABLE_ERRORS           0x09  MBOX
-UMASK_ECC_CORRECTABLE_ERRORS           0x00
-
-EVENT_MAJOR_MODES                  0x07  MBOX
-UMASK_MAJOR_MODES_READ             0x01
-UMASK_MAJOR_MODES_WRITE            0x02
-UMASK_MAJOR_MODES_PARTIAL          0x04
-UMASK_MAJOR_MODES_ISOCH            0x08
-
-EVENT_POWER_CHANNEL_DLLOFF           0x84  MBOX
-UMASK_POWER_CHANNEL_DLLOFF           0x00
-
-EVENT_POWER_CHANNEL_PPD           0x85  MBOX
-UMASK_POWER_CHANNEL_PPD           0x00
-
-EVENT_POWER_CKE_CYCLES                  0x83  MBOX
-UMASK_POWER_CKE_CYCLES_RANK0            0x01
-UMASK_POWER_CKE_CYCLES_RANK1            0x02
-UMASK_POWER_CKE_CYCLES_RANK2            0x04
-UMASK_POWER_CKE_CYCLES_RANK3            0x08
-UMASK_POWER_CKE_CYCLES_RANK4            0x10
-UMASK_POWER_CKE_CYCLES_RANK5            0x20
-UMASK_POWER_CKE_CYCLES_RANK6            0x40
-UMASK_POWER_CKE_CYCLES_RANK7            0x80
-
-EVENT_POWER_CRITICAL_THROTTLE_CYCLES           0x86  MBOX
-UMASK_POWER_CRITICAL_THROTTLE_CYCLES           0x00
-
-EVENT_POWER_PCU_THROTTLING           0x42  MBOX
-UMASK_POWER_PCU_THROTTLING           0x00
-
-EVENT_POWER_SELF_REFRESH           0x43  MBOX
-UMASK_POWER_SELF_REFRESH           0x00
-
-EVENT_POWER_THROTTLE_CYCLES                  0x41  MBOX
-UMASK_POWER_THROTTLE_CYCLES_RANK0            0x01
-UMASK_POWER_THROTTLE_CYCLES_RANK1            0x02
-UMASK_POWER_THROTTLE_CYCLES_RANK2            0x04
-UMASK_POWER_THROTTLE_CYCLES_RANK3            0x08
-UMASK_POWER_THROTTLE_CYCLES_RANK4            0x10
-UMASK_POWER_THROTTLE_CYCLES_RANK5            0x20
-UMASK_POWER_THROTTLE_CYCLES_RANK6            0x40
-UMASK_POWER_THROTTLE_CYCLES_RANK7            0x80
-
-EVENT_PREEMPTION           0x08  MBOX
-UMASK_PREEMPTION_RD_PREEMPT_RD           0x01
-UMASK_PREEMPTION_RD_PREEMPT_WR           0x02
-
-EVENT_PRE_COUNT           0x02  MBOX
-UMASK_PRE_COUNT_PAGE_MISS           0x01
-UMASK_PRE_COUNT_PAGE_CLOSE           0x02
-
-EVENT_RD_CAS_PRIO           0xA0  MBOX
-UMASK_RD_CAS_PRIO_LOW           0x01
-UMASK_RD_CAS_PRIO_MED           0x02
-UMASK_RD_CAS_PRIO_HIGH          0x04
-UMASK_RD_CAS_PRIO_PANIC         0x08
-
-EVENT_RD_CAS_RANK0           0xB0  MBOX
-UMASK_RD_CAS_RANK0_BANK0           0x01
-UMASK_RD_CAS_RANK0_BANK1           0x02
-UMASK_RD_CAS_RANK0_BANK2           0x04
-UMASK_RD_CAS_RANK0_BANK3           0x08
-UMASK_RD_CAS_RANK0_BANK4           0x10
-UMASK_RD_CAS_RANK0_BANK5           0x20
-UMASK_RD_CAS_RANK0_BANK6           0x40
-UMASK_RD_CAS_RANK0_BANK7           0x80
-
-EVENT_RD_CAS_RANK1           0xB1  MBOX
-UMASK_RD_CAS_RANK1_BANK0           0x01
-UMASK_RD_CAS_RANK1_BANK1           0x02
-UMASK_RD_CAS_RANK1_BANK2           0x04
-UMASK_RD_CAS_RANK1_BANK3           0x08
-UMASK_RD_CAS_RANK1_BANK4           0x10
-UMASK_RD_CAS_RANK1_BANK5           0x20
-UMASK_RD_CAS_RANK1_BANK6           0x40
-UMASK_RD_CAS_RANK1_BANK7           0x80
-
-EVENT_RD_CAS_RANK2           0xB2  MBOX
-UMASK_RD_CAS_RANK2_BANK0           0x01
-UMASK_RD_CAS_RANK2_BANK1           0x02
-UMASK_RD_CAS_RANK2_BANK2           0x04
-UMASK_RD_CAS_RANK2_BANK3           0x08
-UMASK_RD_CAS_RANK2_BANK4           0x10
-UMASK_RD_CAS_RANK2_BANK5           0x20
-UMASK_RD_CAS_RANK2_BANK6           0x40
-UMASK_RD_CAS_RANK2_BANK7           0x80
-
-EVENT_RD_CAS_RANK3           0xB3  MBOX
-UMASK_RD_CAS_RANK3_BANK0           0x01
-UMASK_RD_CAS_RANK3_BANK1           0x02
-UMASK_RD_CAS_RANK3_BANK2           0x04
-UMASK_RD_CAS_RANK3_BANK3           0x08
-UMASK_RD_CAS_RANK3_BANK4           0x10
-UMASK_RD_CAS_RANK3_BANK5           0x20
-UMASK_RD_CAS_RANK3_BANK6           0x40
-UMASK_RD_CAS_RANK3_BANK7           0x80
-
-EVENT_RD_CAS_RANK4           0xB4  MBOX
-UMASK_RD_CAS_RANK4_BANK0           0x01
-UMASK_RD_CAS_RANK4_BANK1           0x02
-UMASK_RD_CAS_RANK4_BANK2           0x04
-UMASK_RD_CAS_RANK4_BANK3           0x08
-UMASK_RD_CAS_RANK4_BANK4           0x10
-UMASK_RD_CAS_RANK4_BANK5           0x20
-UMASK_RD_CAS_RANK4_BANK6           0x40
-UMASK_RD_CAS_RANK4_BANK7           0x80
-
-EVENT_RD_CAS_RANK5           0xB5  MBOX
-UMASK_RD_CAS_RANK5_BANK0           0x01
-UMASK_RD_CAS_RANK5_BANK1           0x02
-UMASK_RD_CAS_RANK5_BANK2           0x04
-UMASK_RD_CAS_RANK5_BANK3           0x08
-UMASK_RD_CAS_RANK5_BANK4           0x10
-UMASK_RD_CAS_RANK5_BANK5           0x20
-UMASK_RD_CAS_RANK5_BANK6           0x40
-UMASK_RD_CAS_RANK5_BANK7           0x80
-
-EVENT_RD_CAS_RANK6           0xB6  MBOX
-UMASK_RD_CAS_RANK6_BANK0           0x01
-UMASK_RD_CAS_RANK6_BANK1           0x02
-UMASK_RD_CAS_RANK6_BANK2           0x04
-UMASK_RD_CAS_RANK6_BANK3           0x08
-UMASK_RD_CAS_RANK6_BANK4           0x10
-UMASK_RD_CAS_RANK6_BANK5           0x20
-UMASK_RD_CAS_RANK6_BANK6           0x40
-UMASK_RD_CAS_RANK6_BANK7           0x80
-
-EVENT_RD_CAS_RANK7           0xB7  MBOX
-UMASK_RD_CAS_RANK7_BANK0           0x01
-UMASK_RD_CAS_RANK7_BANK1           0x02
-UMASK_RD_CAS_RANK7_BANK2           0x04
-UMASK_RD_CAS_RANK7_BANK3           0x08
-UMASK_RD_CAS_RANK7_BANK4           0x10
-UMASK_RD_CAS_RANK7_BANK5           0x20
-UMASK_RD_CAS_RANK7_BANK6           0x40
-UMASK_RD_CAS_RANK7_BANK7           0x80
-
-EVENT_RPQ_CYCLES_NE           0x11  MBOX
-UMASK_RPQ_CYCLES_NE           0x00
-
-EVENT_RPQ_INSERTS           0x10  MBOX
-UMASK_RPQ_INSERTS           0x00
-
-EVENT_VMSE_MXB_WR_OCCUPANCY           0x91  MBOX
-UMASK_VMSE_MXB_WR_OCCUPANCY           0x00
-
-EVENT_VMSE_WR_PUSH           0x90  MBOX
-UMASK_VMSE_WR_PUSH           0x00
-
-EVENT_WMM_TO_RMM           0xC0  MBOX
-UMASK_WMM_TO_RMM           0x00
-
-EVENT_WPQ_CYCLES_FULL           0x22  MBOX
-UMASK_WPQ_CYCLES_FULL           0x00
-
-EVENT_WPQ_CYCLES_NE           0x21  MBOX
-UMASK_WPQ_CYCLES_NE           0x00
-
-EVENT_WPQ_INSERTS           0x20  MBOX
-UMASK_WPQ_INSERTS           0x00
-
-EVENT_WPQ_READ_HIT           0x23  MBOX
-UMASK_WPQ_READ_HIT           0x00
-
-EVENT_WPQ_WRITE_HIT           0x24  MBOX
-UMASK_WPQ_WRITE_HIT           0x00
-
-EVENT_WRONG_MM           0xC1  MBOX
-UMASK_WRONG_MM           0x00
-
-EVENT_WR_CAS_RANK0           0xB8  MBOX
-UMASK_WR_CAS_RANK0_BANK0           0x01
-UMASK_WR_CAS_RANK0_BANK1           0x02
-UMASK_WR_CAS_RANK0_BANK2           0x04
-UMASK_WR_CAS_RANK0_BANK3           0x08
-UMASK_WR_CAS_RANK0_BANK4           0x10
-UMASK_WR_CAS_RANK0_BANK5           0x20
-UMASK_WR_CAS_RANK0_BANK6           0x40
-UMASK_WR_CAS_RANK0_BANK7           0x80
-
-EVENT_WR_CAS_RANK1           0xB9  MBOX
-UMASK_WR_CAS_RANK1_BANK0           0x01
-UMASK_WR_CAS_RANK1_BANK1           0x02
-UMASK_WR_CAS_RANK1_BANK2           0x04
-UMASK_WR_CAS_RANK1_BANK3           0x08
-UMASK_WR_CAS_RANK1_BANK4           0x10
-UMASK_WR_CAS_RANK1_BANK5           0x20
-UMASK_WR_CAS_RANK1_BANK6           0x40
-UMASK_WR_CAS_RANK1_BANK7           0x80
-
-EVENT_WR_CAS_RANK2           0xBA  MBOX
-UMASK_WR_CAS_RANK2_BANK0           0x01
-UMASK_WR_CAS_RANK2_BANK1           0x02
-UMASK_WR_CAS_RANK2_BANK2           0x04
-UMASK_WR_CAS_RANK2_BANK3           0x08
-UMASK_WR_CAS_RANK2_BANK4           0x10
-UMASK_WR_CAS_RANK2_BANK5           0x20
-UMASK_WR_CAS_RANK2_BANK6           0x40
-UMASK_WR_CAS_RANK2_BANK7           0x80
-
-EVENT_WR_CAS_RANK3           0xBB  MBOX
-UMASK_WR_CAS_RANK3_BANK0           0x01
-UMASK_WR_CAS_RANK3_BANK1           0x02
-UMASK_WR_CAS_RANK3_BANK2           0x04
-UMASK_WR_CAS_RANK3_BANK3           0x08
-UMASK_WR_CAS_RANK3_BANK4           0x10
-UMASK_WR_CAS_RANK3_BANK5           0x20
-UMASK_WR_CAS_RANK3_BANK6           0x40
-UMASK_WR_CAS_RANK3_BANK7           0x80
-
-EVENT_WR_CAS_RANK4           0xBC  MBOX
-UMASK_WR_CAS_RANK4_BANK0           0x01
-UMASK_WR_CAS_RANK4_BANK1           0x02
-UMASK_WR_CAS_RANK4_BANK2           0x04
-UMASK_WR_CAS_RANK4_BANK3           0x08
-UMASK_WR_CAS_RANK4_BANK4           0x10
-UMASK_WR_CAS_RANK4_BANK5           0x20
-UMASK_WR_CAS_RANK4_BANK6           0x40
-UMASK_WR_CAS_RANK4_BANK7           0x80
-
-EVENT_WR_CAS_RANK5           0xBD  MBOX
-UMASK_WR_CAS_RANK5_BANK0           0x01
-UMASK_WR_CAS_RANK5_BANK1           0x02
-UMASK_WR_CAS_RANK5_BANK2           0x04
-UMASK_WR_CAS_RANK5_BANK3           0x08
-UMASK_WR_CAS_RANK5_BANK4           0x10
-UMASK_WR_CAS_RANK5_BANK5           0x20
-UMASK_WR_CAS_RANK5_BANK6           0x40
-UMASK_WR_CAS_RANK5_BANK7           0x80
-
-EVENT_WR_CAS_RANK6           0xBE  MBOX
-UMASK_WR_CAS_RANK6_BANK0           0x01
-UMASK_WR_CAS_RANK6_BANK1           0x02
-UMASK_WR_CAS_RANK6_BANK2           0x04
-UMASK_WR_CAS_RANK6_BANK3           0x08
-UMASK_WR_CAS_RANK6_BANK4           0x10
-UMASK_WR_CAS_RANK6_BANK5           0x20
-UMASK_WR_CAS_RANK6_BANK6           0x40
-UMASK_WR_CAS_RANK6_BANK7           0x80
-
-EVENT_WR_CAS_RANK7           0xBF  MBOX
-UMASK_WR_CAS_RANK7_BANK0           0x01
-UMASK_WR_CAS_RANK7_BANK1           0x02
-UMASK_WR_CAS_RANK7_BANK2           0x04
-UMASK_WR_CAS_RANK7_BANK3           0x08
-UMASK_WR_CAS_RANK7_BANK4           0x10
-UMASK_WR_CAS_RANK7_BANK5           0x20
-UMASK_WR_CAS_RANK7_BANK6           0x40
-UMASK_WR_CAS_RANK7_BANK7           0x80
+EVENT_OFFCORE_RESPONSE_0                            0xB7 PMC
+OPTIONS_OFFCORE_RESPONSE_0_OPTIONS                  EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_0_OPTIONS                    0x01 0xFF 0xFF
+UMASK_OFFCORE_RESPONSE_0_DMND_DATA_RD_ANY           0x01 0x00 0x10
+UMASK_OFFCORE_RESPONSE_0_DMND_RFO_ANY               0x01 0x01 0x10
+UMASK_OFFCORE_RESPONSE_0_DMND_CODE_RD_ANY           0x01 0x02 0x10
+UMASK_OFFCORE_RESPONSE_0_WB_ANY                     0x01 0x03 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_DATA_RD_ANY          0x01 0x04 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_RFO_ANY              0x01 0x05 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_CODE_RD_ANY          0x01 0x06 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_DATA_RD_ANY          0x01 0x07 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_RFO_ANY              0x01 0x08 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_CODE_RD_ANY          0x01 0x09 0x10
+UMASK_OFFCORE_RESPONSE_0_SPLIT_LOCK_UC_LOCK_ANY     0x01 0x0A 0x10
+UMASK_OFFCORE_RESPONSE_0_STREAMING_STORES_ANY       0x01 0x0B 0x10
+UMASK_OFFCORE_RESPONSE_0_OTHER_ANY                  0x01 0x0F 0x10
+
+EVENT_OFFCORE_RESPONSE_1                            0xBB PMC
+OPTIONS_OFFCORE_RESPONSE_1_OPTIONS                  EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_1_OPTIONS                    0x01 0xFF 0xFF
+UMASK_OFFCORE_RESPONSE_1_DMND_DATA_RD_ANY           0x01 0x00 0x10
+UMASK_OFFCORE_RESPONSE_1_DMND_RFO_ANY               0x01 0x01 0x10
+UMASK_OFFCORE_RESPONSE_1_DMND_CODE_RD_ANY           0x01 0x02 0x10
+UMASK_OFFCORE_RESPONSE_1_WB_ANY                     0x01 0x03 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_DATA_RD_ANY          0x01 0x04 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_RFO_ANY              0x01 0x05 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_CODE_RD_ANY          0x01 0x06 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_DATA_RD_ANY          0x01 0x07 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_RFO_ANY              0x01 0x08 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_CODE_RD_ANY          0x01 0x09 0x10
+UMASK_OFFCORE_RESPONSE_1_SPLIT_LOCK_UC_LOCK_ANY     0x01 0x0A 0x10
+UMASK_OFFCORE_RESPONSE_1_STREAMING_STORES_ANY       0x01 0x0B 0x10
+UMASK_OFFCORE_RESPONSE_1_OTHER_ANY                  0x01 0x0F 0x10
diff --git a/src/includes/perfmon_k10.h b/src/includes/perfmon_k10.h
index cc614af..04f16c2 100644
--- a/src/includes/perfmon_k10.h
+++ b/src/includes/perfmon_k10.h
@@ -3,15 +3,16 @@
  *
  *      Filename:  perfmon_k10.h
  *
- *      Description:  Header file of perfmon module for K10
+ *      Description:  Header file of perfmon module for AMD K10
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -29,119 +30,191 @@
  */
 
 #include <perfmon_k10_events.h>
-#include <perfmon_k10_groups.h>
 #include <perfmon_k10_counters.h>
+#include <error.h>
 
 static int perfmon_numCountersK10 = NUM_COUNTERS_K10;
-static int perfmon_numGroupsK10 = NUM_GROUPS_K10;
 static int perfmon_numArchEventsK10 = NUM_ARCH_EVENTS_K10;
 
-void perfmon_init_k10(PerfmonThread *thread)
+int perfmon_init_k10(int cpu_id)
 {
-    uint64_t flags = 0x0ULL;
-    int cpu_id = thread->processorId;
+    return 0;
+}
 
-    msr_write(cpu_id, MSR_AMD_PERFEVTSEL0, 0x0ULL);
-    msr_write(cpu_id, MSR_AMD_PERFEVTSEL1, 0x0ULL);
-    msr_write(cpu_id, MSR_AMD_PERFEVTSEL2, 0x0ULL);
-    msr_write(cpu_id, MSR_AMD_PERFEVTSEL3, 0x0ULL);
+int k10_pmc_setup(int cpu_id, RegisterIndex index, PerfmonEvent* event)
+{
+    uint64_t flags = 0x0ULL;
 
-    //flags |= (1<<16);  /* user mode flag */
+    flags |= (1ULL<<16);  /* user mode flag */
+    flags |= ((uint64_t)(event->eventId>>8)<<32) + (event->umask<<8) + (event->eventId & ~(0xF00U));  /* AMD uses a 12 bit Event mask: [35:32][7:0] */
 
-    /*msr_write(cpu_id, MSR_AMD_PERFEVTSEL0, flags);
-    msr_write(cpu_id, MSR_AMD_PERFEVTSEL1, flags);
-    msr_write(cpu_id, MSR_AMD_PERFEVTSEL2, flags);
-    msr_write(cpu_id, MSR_AMD_PERFEVTSEL3, flags);*/
+    if (event->numberOfOptions > 0)
+    {
+        for(int j=0;j<event->numberOfOptions;j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_COUNT_KERNEL:
+                    flags |= (1ULL<<17);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    if ((event->options[j].value & 0xFFULL) < 0x04ULL)
+                    {
+                        flags |= (event->options[j].value & 0xFFULL) << 24;
+                    }
+                    break;
+                default:
+                    break;
+            }
+        }
+    }
+    VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_PMC);
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+    return 0;
 }
 
-
-void perfmon_setupCounterThread_k10(
-        int thread_id,
-        PerfmonEvent* event,
-        PerfmonCounterIndex index)
+int perfmon_setupCounterThread_k10(int thread_id, PerfmonEventSet* eventSet)
 {
-    uint64_t flags;
-    uint64_t reg = k10_counter_map[index].configRegister;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
-    perfmon_threadData[thread_id].counters[index].init = TRUE;
-
-    flags |= (1<<16);
+    int cpu_id = groupSet->threads[thread_id].processorId;
+    
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        RegisterType type = eventSet->events[i].type;
+        if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+        {
+            continue;
+        }
+        RegisterIndex index = eventSet->events[i].index;
+        PerfmonEvent *event = &(eventSet->events[i].event);
+        if (type == PMC)
+        {
+            k10_pmc_setup(cpu_id, index, event);
+            eventSet->events[i].threadCounter[thread_id].init = TRUE;
+        }
+    }
+    return 0;
+}
 
-    /* AMD uses a 12 bit Event mask: [35:32][7:0] */
-    flags |= ((uint64_t)(event->eventId>>8)<<32) + (event->umask<<8) + (event->eventId & ~(0xF00U));
+int perfmon_startCountersThread_k10(int thread_id, PerfmonEventSet* eventSet)
+{
+    uint64_t flags = 0x0ULL;
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
-    if (perfmon_verbose)
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        printf("[%d] perfmon_setup_counter: Write Register 0x%llX , Flags: 0x%llX \n",
-                cpu_id,
-                LLU_CAST reg,
-                LLU_CAST flags);
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
+        {
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+            {
+                continue;
+            }
+            RegisterIndex index = eventSet->events[i].index;
+            uint32_t reg = counter_map[index].configRegister;
+            uint32_t counter = counter_map[index].counterRegister;
+            VERBOSEPRINTREG(cpu_id, counter, 0x0ULL, CLEAR_PMC);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter, 0x0ULL));
+            CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, reg, &flags));
+            VERBOSEPRINTREG(cpu_id, reg, flags, READ_PMC_CTRL);
+            flags |= (1ULL<<22);  /* enable flag */
+            VERBOSEPRINTREG(cpu_id, reg, flags, START_PMC);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg, flags));
+        }
     }
-    msr_write(cpu_id, reg , flags);
+    return 0;
 }
 
-void perfmon_startCountersThread_k10(int thread_id)
+int perfmon_stopCountersThread_k10(int thread_id, PerfmonEventSet* eventSet)
 {
-    uint64_t flags;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
+    uint64_t flags = 0x0ULL;
+    uint64_t tmp;
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
-    for ( int i=0; i<NUM_COUNTERS_K10; i++)
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        if ( perfmon_threadData[thread_id].counters[i].init == TRUE )
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
         {
-            msr_write(cpu_id, k10_counter_map[i].counterRegister , 0x0ULL);
-            flags = msr_read(cpu_id, k10_counter_map[i].configRegister);
-            flags |= (1<<22);  /* enable flag */
-
-            if (perfmon_verbose)
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
             {
-                printf("perfmon_start_counters: Write Register 0x%llX , Flags: 0x%llX \n",
-                        LLU_CAST k10_counter_map[i].configRegister,
-                        LLU_CAST flags);
+                continue;
             }
-
-            msr_write(cpu_id, k10_counter_map[i].configRegister , flags);
+            tmp = 0x0ULL;
+            RegisterIndex index = eventSet->events[i].index;
+            uint32_t reg = counter_map[index].configRegister;
+            uint32_t counter = counter_map[index].counterRegister;
+            CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, reg, &flags));
+            VERBOSEPRINTREG(cpu_id, reg, flags, READ_PMC_CTRL);
+            flags &= ~(1ULL<<22);  /* clear enable flag */
+            VERBOSEPRINTREG(cpu_id, reg, flags, STOP_PMC);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg, flags));
+            CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &tmp));
+            VERBOSEPRINTREG(cpu_id, counter, tmp, READ_PMC);
+            if (tmp < eventSet->events[i].threadCounter[thread_id].counterData)
+            {
+                eventSet->events[i].threadCounter[thread_id].overflows++;
+            }
+            eventSet->events[i].threadCounter[thread_id].counterData = field64(tmp, 0, box_map[type].regWidth);
         }
     }
+    return 0;
 }
 
-void perfmon_stopCountersThread_k10(int thread_id)
+int perfmon_readCountersThread_k10(int thread_id, PerfmonEventSet* eventSet)
 {
-    uint64_t flags;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
+    uint64_t tmp;
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
-    for ( int i=0; i<NUM_COUNTERS_K10; i++ )
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        if ( perfmon_threadData[thread_id].counters[i].init == TRUE )
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
         {
-            flags = msr_read(cpu_id, k10_counter_map[i].configRegister);
-            flags &= ~(1<<22);  /* clear enable flag */
-            msr_write(cpu_id, k10_counter_map[i].configRegister , flags);
-
-            if (perfmon_verbose)
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
             {
-                printf("perfmon_stop_counters: Write Register 0x%llX , Flags: 0x%llX \n",
-                        LLU_CAST k10_counter_map[i].configRegister,
-                        LLU_CAST flags);
+                continue;
             }
-
-            perfmon_threadData[thread_id].counters[i].counterData =
-                msr_read(cpu_id, k10_counter_map[i].counterRegister);
+            tmp = 0x0ULL;
+            RegisterIndex index = eventSet->events[i].index;
+            uint32_t counter = counter_map[index].counterRegister;
+            CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &tmp));
+            VERBOSEPRINTREG(cpu_id, counter, tmp, READ_PMC);
+            if (tmp < eventSet->events[i].threadCounter[thread_id].counterData)
+            {
+                eventSet->events[i].threadCounter[thread_id].overflows++;
+            }
+            eventSet->events[i].threadCounter[thread_id].counterData = field64(tmp, 0, box_map[type].regWidth);
         }
     }
+    return 0;
 }
 
-void perfmon_readCountersThread_k10(int thread_id)
+
+int perfmon_finalizeCountersThread_k10(int thread_id, PerfmonEventSet* eventSet)
 {
-    int cpu_id = perfmon_threadData[thread_id].processorId;
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
-    for ( int i=0; i<NUM_COUNTERS_K10; i++ )
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        if ( perfmon_threadData[thread_id].counters[i].init == TRUE )
+        RegisterType type = eventSet->events[i].type;
+        if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
         {
-            perfmon_threadData[thread_id].counters[i].counterData =
-                msr_read(cpu_id, k10_counter_map[i].counterRegister);
+            continue;
         }
+        RegisterIndex index = eventSet->events[i].index;
+        uint32_t reg = counter_map[index].configRegister;
+        if (reg)
+        {
+            VERBOSEPRINTREG(cpu_id, reg, 0x0ULL, CLEAR_CTRL);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg, 0x0ULL));
+        }
+        eventSet->events[i].threadCounter[thread_id].init = FALSE;
     }
+    return 0;
 }
-
diff --git a/src/includes/perfmon_k10_counters.h b/src/includes/perfmon_k10_counters.h
index d01be3d..4560e48 100644
--- a/src/includes/perfmon_k10_counters.h
+++ b/src/includes/perfmon_k10_counters.h
@@ -3,15 +3,16 @@
  *
  *      Filename:  perfmon_k10_counters.h
  *
- *      Description:  AMD K10 specific subroutines
+ *      Description:  AMD K10 performance counter definition. Also used for AMD K8.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -31,10 +32,15 @@
 #define NUM_COUNTERS_K10 4
 #define NUM_COUNTERS_CORE_K10 4
 
-static PerfmonCounterMap k10_counter_map[NUM_COUNTERS_K10] = {
-    {"PMC0",PMC0, PMC, MSR_AMD_PERFEVTSEL0, MSR_AMD_PMC0, 0, 0},
-    {"PMC1",PMC1, PMC, MSR_AMD_PERFEVTSEL1, MSR_AMD_PMC1, 0, 0},
-    {"PMC2",PMC2, PMC, MSR_AMD_PERFEVTSEL2, MSR_AMD_PMC2, 0, 0},
-    {"PMC3",PMC3, PMC, MSR_AMD_PERFEVTSEL3, MSR_AMD_PMC3, 0, 0}
+#define K10_VALID_OPTIONS_PMC (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK) /* option bit masks OR-ed together; used as last field of each k10_counter_map entry */
+
+static RegisterMap k10_counter_map[NUM_COUNTERS_K10] = {
+    {"PMC0",PMC0, PMC, MSR_AMD_PERFEVTSEL0, MSR_AMD_PMC0, 0, 0, K10_VALID_OPTIONS_PMC},
+    {"PMC1",PMC1, PMC, MSR_AMD_PERFEVTSEL1, MSR_AMD_PMC1, 0, 0, K10_VALID_OPTIONS_PMC},
+    {"PMC2",PMC2, PMC, MSR_AMD_PERFEVTSEL2, MSR_AMD_PMC2, 0, 0, K10_VALID_OPTIONS_PMC},
+    {"PMC3",PMC3, PMC, MSR_AMD_PERFEVTSEL3, MSR_AMD_PMC3, 0, 0, K10_VALID_OPTIONS_PMC}
 };
 
+static BoxMap k10_box_map[NUM_UNITS] = {
+    [PMC] = {0, 0, 0, 0, 0, 0, 48}
+};
diff --git a/src/includes/perfmon_k10_events.txt b/src/includes/perfmon_k10_events.txt
index 64c20e9..557a506 100644
--- a/src/includes/perfmon_k10_events.txt
+++ b/src/includes/perfmon_k10_events.txt
@@ -1,16 +1,17 @@
 # =======================================================================================
-#  
+#
 #      Filename:  perfmon_k10_events.txt
-# 
+#
 #      Description:  Event list for AMD K10
-# 
-#      Version:   3.1.3
-#      Released:  4.11.2014
-# 
-#      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+#
+#      Version:   4.0
+#      Released:  16.6.2015
+#
+#      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+#                Thomas Roehl (tr), thomas.roehl at googlemail.com
 #      Project:  likwid
 #
-#      Copyright (C) 2014 Jan Treibig
+#      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
 #
 #      This program is free software: you can redistribute it and/or modify it under
 #      the terms of the GNU General Public License as published by the Free Software
@@ -132,17 +133,17 @@ UMASK_DATA_CACHE_EVICTED_PREFETCH_NTA         0x20
 UMASK_DATA_CACHE_EVICTED_NOT_PREFETCH_NTA     0x40
 UMASK_DATA_CACHE_EVICTED_ALL     0x1F
 
-EVENT_DTLB_L2_HIT                0x45   PMC
-UMASK_DTLB_L2_HIT_4K             0x01
-UMASK_DTLB_L2_HIT_2M             0x02
-UMASK_DTLB_L2_HIT_1G             0x04
-UMASK_DTLB_L2_HIT_ALL             0x07
+EVENT_DTLB_L2_HIT                  0x45   PMC
+UMASK_DTLB_L2_HIT_4KB              0x01
+UMASK_DTLB_L2_HIT_2MB              0x02
+UMASK_DTLB_L2_HIT_1GB              0x04
+UMASK_DTLB_L2_HIT_ALL              0x07
 
-EVENT_DTLB_L2_MISS                0x46   PMC
-UMASK_DTLB_L2_MISS_4K             0x01
-UMASK_DTLB_L2_MISS_2M             0x02
-UMASK_DTLB_L2_MISS_1G             0x04
-UMASK_DTLB_L2_MISS_ALL            0x07
+EVENT_DTLB_L2_MISS                 0x46   PMC
+UMASK_DTLB_L2_MISS_4KB             0x01
+UMASK_DTLB_L2_MISS_2MB             0x02
+UMASK_DTLB_L2_MISS_1GB             0x04
+UMASK_DTLB_L2_MISS_ALL             0x07
 
 EVENT_MISALIGNED_ACCESS           0x47   PMC
 UMASK_MISALIGNED_ACCESS           0x00
@@ -167,10 +168,11 @@ UMASK_PREFETCH_INSTRUCTION_DISPATCHED_NTA    0x04
 EVENT_DCACHE_LOCK_MISS           0x4C   PMC
 UMASK_DCACHE_LOCK_MISS           0x02
 
-EVENT_DTLB_L1_HIT                0x4D   PMC
-UMASK_DTLB_L1_HIT_4K             0x01
-UMASK_DTLB_L1_HIT_2M             0x02
-UMASK_DTLB_L1_HIT_1G             0x04
+EVENT_DTLB_L1_HIT                 0x4D   PMC
+UMASK_DTLB_L1_HIT_4KB             0x01
+UMASK_DTLB_L1_HIT_2MB             0x02
+UMASK_DTLB_L1_HIT_1GB             0x04
+UMASK_DTLB_L1_HIT_ANY             0x07
 
 EVENT_SW_PREFETCH_HIT                0x52   PMC
 UMASK_SW_PREFETCH_HIT_L1             0x01
@@ -238,9 +240,10 @@ UMASK_ICACHE_REFILLS_MEM          0x00
 EVENT_ITLB_L2_HIT          0x84   PMC
 UMASK_ITLB_L2_HIT          0x00
 
-EVENT_ITLB_L2_MISS          0x85   PMC
-UMASK_ITLB_L2_MISS_4K       0x01
-UMASK_ITLB_L2_MISS_2M       0x02
+EVENT_ITLB_L2_MISS           0x85   PMC
+UMASK_ITLB_L2_MISS_4KB       0x01
+UMASK_ITLB_L2_MISS_2MB       0x02
+UMASK_ITLB_L2_MISS_ANY       0x03
 
 EVENT_PIPELINE_RESTART_STREAM_PROBE    0x86   PMC
 UMASK_PIPELINE_RESTART_STREAM_PROBE    0x00
diff --git a/src/includes/perfmon_k8.h b/src/includes/perfmon_k8.h
index 9313168..d54570d 100644
--- a/src/includes/perfmon_k8.h
+++ b/src/includes/perfmon_k8.h
@@ -3,17 +3,17 @@
  *
  *      Filename:  perfmon_k8.h
  *
- *      Description:  Header File of perfmon module for K8 support.
- *                    Configures and reads out performance counters
- *                    on x86 based architectures. Supports multi threading.
+ *      Description:  Header File of perfmon module for AMD K8 support.
+ *                    The setup routines and registers are similar to AMD K10
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -31,10 +31,9 @@
  */
 
 #include <perfmon_k8_events.h>
-#include <perfmon_k8_groups.h>
+#include <error.h>
 
 
-static int perfmon_numGroupsK8 = NUM_GROUPS_K8;
 static int perfmon_numArchEventsK8 = NUM_ARCH_EVENTS_K8;
 
 
diff --git a/src/includes/perfmon_k8_events.txt b/src/includes/perfmon_k8_events.txt
index 127b56f..c5d6b08 100644
--- a/src/includes/perfmon_k8_events.txt
+++ b/src/includes/perfmon_k8_events.txt
@@ -1,16 +1,17 @@
 # =======================================================================================
-#  
+#
 #      Filename:  perfmon_k8_events.txt
-# 
+#
 #      Description:  Event list for AMD K8
-# 
-#      Version:   3.1.3
-#      Released:  4.11.2014
-# 
-#      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+#
+#      Version:   4.0
+#      Released:  16.6.2015
+#
+#      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+#                Thomas Roehl (tr), thomas.roehl at googlemail.com
 #      Project:  likwid
 #
-#      Copyright (C) 2014 Jan Treibig
+#      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
 #
 #      This program is free software: you can redistribute it and/or modify it under
 #      the terms of the GNU General Public License as published by the Free Software
@@ -97,15 +98,17 @@ UMASK_DATA_CACHE_EVICTED_PREFETCH_NTA         0x20
 UMASK_DATA_CACHE_EVICTED_NOT_PREFETCH_NTA     0x40
 UMASK_DATA_CACHE_EVICTED_ALL     0x1F
 
-EVENT_DTLB_L2_HIT                0x45     PMC
-UMASK_DTLB_L2_HIT_4K             0x01
-UMASK_DTLB_L2_HIT_2M             0x02
-UMASK_DTLB_L2_HIT_1G             0x04
+EVENT_DTLB_L2_HIT                 0x45     PMC
+UMASK_DTLB_L2_HIT_4KB             0x01
+UMASK_DTLB_L2_HIT_2MB             0x02
+UMASK_DTLB_L2_HIT_1GB             0x04
+UMASK_DTLB_L2_HIT_ANY             0x07
 
-EVENT_DTLB_L2_MISS                0x46     PMC
-UMASK_DTLB_L2_MISS_4K             0x01
-UMASK_DTLB_L2_MISS_2M             0x02
-UMASK_DTLB_L2_MISS_1G             0x04
+EVENT_DTLB_L2_MISS                 0x46     PMC
+UMASK_DTLB_L2_MISS_4KB             0x01
+UMASK_DTLB_L2_MISS_2MB             0x02
+UMASK_DTLB_L2_MISS_1GB             0x04
+UMASK_DTLB_L2_MISS_ANY             0x07
 
 EVENT_MISALIGNED_ACCESS           0x47     PMC
 UMASK_MISALIGNED_ACCESS           0x00
@@ -178,9 +181,10 @@ UMASK_ICACHE_REFILLS_MEM          0x00
 EVENT_ITLB_L2_HIT          0x84     PMC
 UMASK_ITLB_L2_HIT          0x00
 
-EVENT_ITLB_L2_MISS          0x85     PMC
-UMASK_ITLB_L2_MISS_4K       0x01
-UMASK_ITLB_L2_MISS_2M       0x02
+EVENT_ITLB_L2_MISS           0x85     PMC
+UMASK_ITLB_L2_MISS_4KB       0x01
+UMASK_ITLB_L2_MISS_2MB       0x02
+UMASK_ITLB_L2_MISS_ANY       0x03
 
 EVENT_PIPELINE_RESTART_STREAM_PROBE    0x86     PMC
 UMASK_PIPELINE_RESTART_STREAM_PROBE    0x00
diff --git a/src/includes/perfmon_kabini.h b/src/includes/perfmon_kabini.h
index 018eb04..9457d9e 100644
--- a/src/includes/perfmon_kabini.h
+++ b/src/includes/perfmon_kabini.h
@@ -3,15 +3,16 @@
  *
  *      Filename:  perfmon_kabini.h
  *
- *      Description:  Header file of perfmon module for AMD Family16
+ *      Description:  Header file of perfmon module for AMD Family 16
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -29,218 +30,319 @@
  */
 
 #include <perfmon_kabini_events.h>
-#include <perfmon_kabini_groups.h>
 #include <perfmon_kabini_counters.h>
+#include <error.h>
+#include <affinity.h>
 
 static int perfmon_numCountersKabini = NUM_COUNTERS_KABINI;
-static int perfmon_numGroupsKabini = NUM_GROUPS_KABINI;
 static int perfmon_numArchEventsKabini = NUM_ARCH_EVENTS_KABINI;
 
-void perfmon_init_kabini(PerfmonThread *thread)
+int perfmon_init_kabini(int cpu_id)
+{
+    lock_acquire((int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id);  /* result ignored on purpose: k16_uncore_setup and start/stop test socket_lock[...] == cpu_id */
+    lock_acquire((int*) &tile_lock[affinity_thread2tile_lookup[cpu_id]], cpu_id);  /* result ignored on purpose: k16_cache_setup and start/stop test tile_lock[...] == cpu_id */
+    return 0;
+}
+
+
+int k16_pmc_setup(int cpu_id, RegisterIndex index, PerfmonEvent* event)
 {
     uint64_t flags = 0x0ULL;
-    int cpu_id = thread->processorId;
 
-    msr_write(cpu_id, MSR_AMD16_PERFEVTSEL0, 0x0ULL);
-    msr_write(cpu_id, MSR_AMD16_PERFEVTSEL1, 0x0ULL);
-    msr_write(cpu_id, MSR_AMD16_PERFEVTSEL2, 0x0ULL);
-    msr_write(cpu_id, MSR_AMD16_PERFEVTSEL3, 0x0ULL);
+    flags |= (1ULL<<16);  /* user mode flag */
+    flags |= ((uint64_t)(event->eventId>>8)<<32) + (event->umask<<8) + (event->eventId & ~(0xF00U));  /* AMD uses a 12 bit Event mask: [35:32][7:0] */
 
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id) ||
-            lock_acquire(
-                (int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id)
-       )
+    if (event->numberOfOptions > 0)
     {
-        msr_write(cpu_id, MSR_AMD16_NB_PERFEVTSEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_AMD16_NB_PERFEVTSEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_AMD16_NB_PERFEVTSEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_AMD16_NB_PERFEVTSEL3, 0x0ULL);
+        for(int j=0;j<event->numberOfOptions;j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_COUNT_KERNEL:
+                    flags |= (1ULL<<17);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    if ((event->options[j].value & 0xFFULL) < 0x04)
+                    {
+                        flags |= (event->options[j].value & 0xFFULL) << 24;
+                    }
+                    break;
+                default:
+                    break;
+            }
+        }
     }
-
-    //flags |= (1<<16);  /* user mode flag */
-    /*msr_write(cpu_id, MSR_AMD16_PERFEVTSEL0, flags);
-    msr_write(cpu_id, MSR_AMD16_PERFEVTSEL1, flags);
-    msr_write(cpu_id, MSR_AMD16_PERFEVTSEL2, flags);
-    msr_write(cpu_id, MSR_AMD16_PERFEVTSEL3, flags);*/
+    VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_PMC);
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+    return 0;
 }
 
-
-void perfmon_setupCounterThread_kabini(
-        int thread_id,
-        PerfmonEvent* event,
-        PerfmonCounterIndex index)
+int k16_uncore_setup(int cpu_id, RegisterIndex index, PerfmonEvent* event)
 {
     uint64_t flags = 0x0ULL;
-    uint64_t reg = kabini_counter_map[index].configRegister;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
-    perfmon_threadData[thread_id].counters[index].init = TRUE;
 
-    /* only one thread accesses Uncore */
-    if ( (kabini_counter_map[index].type == UNCORE) &&
-            !(socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id) )
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)  /* only one thread accesses Uncore */
     {
-        return;
+        return 0;
     }
 
-    if (kabini_counter_map[index].type == PMC)
+    flags |= ((uint64_t)(event->eventId>>8)<<32) + (event->umask<<8) + (event->eventId & ~(0xF00U));
+
+    VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_UNCORE);
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+    return 0;
+}
+
+int k16_cache_setup(int cpu_id, RegisterIndex index, PerfmonEvent* event)
+{
+    uint64_t flags = 0x0ULL;
+
+    if (tile_lock[affinity_thread2tile_lookup[cpu_id]] != cpu_id)
     {
-        flags |= (1<<16);
+        return 0;
     }
 
-    /* AMD uses a 12 bit Event mask: [35:32][7:0] */
     flags |= ((uint64_t)(event->eventId>>8)<<32) + (event->umask<<8) + (event->eventId & ~(0xF00U));
-
-    if (perfmon_verbose)
+    if (event->numberOfOptions > 0)
     {
-        printf("[%d] perfmon_setup_counter: Write Register 0x%llX , Flags: 0x%llX \n",
-                cpu_id,
-                LLU_CAST reg,
-                LLU_CAST flags);
+        for(int j=0;j<event->numberOfOptions;j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    if ((event->options[j].value & 0xFFULL) < 0x04)
+                    {
+                        flags |= (event->options[j].value & 0xFFULL) << 24;
+                    }
+                    break;
+                case EVENT_OPTION_TID:
+                    flags |= ((~(uint64_t)event->options[j].value) & 0xFULL) << 56;  /* inverted 4-bit value, confined to bits [59:56] so bits [63:60] stay clear */
+                    break;
+                case EVENT_OPTION_NID:
+                    flags |= ((~(uint64_t)event->options[j].value) & 0xFULL) << 48;  /* inverted 4-bit value, confined to bits [51:48] so the TID field is not overwritten */
+                    break;
+                default:
+                    break;
+            }
+        }
     }
+    VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_CBOX);
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+    return 0;
+}
 
-    msr_write(cpu_id, reg , flags);
+int perfmon_setupCounterThread_kabini(int thread_id, PerfmonEventSet* eventSet)
+{
+    int cpu_id = groupSet->threads[thread_id].processorId;
+
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        RegisterType type = eventSet->events[i].type;
+        if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+        {
+            continue;
+        }
+        RegisterIndex index = eventSet->events[i].index;
+        PerfmonEvent *event = &(eventSet->events[i].event);
+        switch (type)
+        {
+            case PMC:
+                k16_pmc_setup(cpu_id, index, event);
+                break;
+            case UNCORE:
+                k16_uncore_setup(cpu_id, index, event);
+                break;
+            case CBOX0:
+                k16_cache_setup(cpu_id, index, event);
+                break;
+            default:
+                break;
+        }
+        eventSet->events[i].threadCounter[thread_id].init = TRUE;
+    }
+    return 0;
 }
 
 
-void perfmon_startCountersThread_kabini(int thread_id)
+int perfmon_startCountersThread_kabini(int thread_id, PerfmonEventSet* eventSet)
 {
-    int haveLock = 0;
-    uint64_t flags;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
+    int haveSLock = 0;
+    int haveTLock = 0;
+    uint64_t flags = 0x0ULL;
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
     {
-        haveLock = 1;
+        haveSLock = 1;
+    }
+    if (tile_lock[affinity_thread2tile_lookup[cpu_id]] == cpu_id)
+    {
+        haveTLock = 1;
     }
 
-    for ( int i=0; i<NUM_COUNTERS_KABINI; i++ )
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        if (perfmon_threadData[thread_id].counters[i].init == TRUE)
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
         {
-            if (kabini_counter_map[i].type == PMC)
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
             {
-                msr_write(cpu_id, kabini_counter_map[i].counterRegister , 0x0ULL);
-                flags = msr_read(cpu_id, kabini_counter_map[i].configRegister);
-                flags |= (1<<22);  /* enable flag */
-
-                if (perfmon_verbose) 
-                {
-                    printf("perfmon_start_counters: Write Register 0x%llX , Flags: 0x%llX \n",
-                            LLU_CAST kabini_counter_map[i].configRegister,
-                            LLU_CAST flags);
-                }
-
-                msr_write(cpu_id, kabini_counter_map[i].configRegister , flags);
-
+                continue;
             }
-            else if ( kabini_counter_map[i].type == UNCORE )
-            {
-                if(haveLock)
-                {
-                    msr_write(cpu_id, kabini_counter_map[i].counterRegister , 0x0ULL);
-                    flags = msr_read(cpu_id, kabini_counter_map[i].configRegister);
-                    flags |= (1<<22);  /* enable flag */
+            RegisterIndex index = eventSet->events[i].index;
+            uint32_t reg = counter_map[index].configRegister;
+            uint32_t counter = counter_map[index].counterRegister;
 
-                    if (perfmon_verbose)
-                    {
-                        printf("perfmon_start_counters: Write Register 0x%llX , Flags: 0x%llX \n",
-                                LLU_CAST kabini_counter_map[i].configRegister,
-                                LLU_CAST flags);
-                    }
-
-                    msr_write(cpu_id, kabini_counter_map[i].configRegister , flags);
-                }
+            if ((type == PMC) ||
+                ((type == UNCORE) && (haveSLock)) ||
+                ((type == CBOX0) && (haveTLock)))
+            {
+                CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter, 0x0ULL));
+                CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, reg, &flags));
+                flags |= (1ULL<<22);  /* enable flag */
+                CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg, flags));
             }
         }
     }
+    return 0;
 }
 
-void perfmon_stopCountersThread_kabini(int thread_id)
+int perfmon_stopCountersThread_kabini(int thread_id, PerfmonEventSet* eventSet)
 {
-    uint64_t flags;
-    int haveLock = 0;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
+    uint64_t flags = 0x0ULL;
+    int haveSLock = 0;
+    int haveTLock = 0;
+    uint64_t counter_result = 0x0ULL;
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
     {
-        haveLock = 1;
+        haveSLock = 1;
+    }
+    if (tile_lock[affinity_thread2tile_lookup[cpu_id]] == cpu_id)
+    {
+        haveTLock = 1;
     }
 
-    for ( int i=0; i<NUM_COUNTERS_KABINI; i++ )
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        if ( perfmon_threadData[thread_id].counters[i].init == TRUE )
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
         {
-            if ( kabini_counter_map[i].type == PMC )
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
             {
-                flags = msr_read(cpu_id,kabini_counter_map[i].configRegister);
-                flags &= ~(1<<22);  /* clear enable flag */
-                msr_write(cpu_id, kabini_counter_map[i].configRegister , flags);
-                perfmon_threadData[thread_id].counters[i].counterData =
-                    msr_read(cpu_id, kabini_counter_map[i].counterRegister);
-
-                if (perfmon_verbose)
-                {
-                    printf("perfmon_stop_counters: Write Register 0x%llX , Flags: 0x%llX \n",
-                            LLU_CAST kabini_counter_map[i].configRegister,
-                            LLU_CAST flags);
-                    printf("perfmon_stop_counters: Read Register 0x%llX , Flags: 0x%llX \n",
-                            LLU_CAST kabini_counter_map[i].counterRegister,
-                            LLU_CAST perfmon_threadData[thread_id].counters[i].counterData);
-                }
-
+                continue;
             }
-            else if (kabini_counter_map[i].type == UNCORE)
+            counter_result = 0x0ULL;
+            RegisterIndex index = eventSet->events[i].index;
+            uint32_t reg = counter_map[index].configRegister;
+            uint32_t counter = counter_map[index].counterRegister;
+            if ((type == PMC) ||
+                ((type == UNCORE) && (haveSLock)) ||
+                ((type == CBOX0) && (haveTLock)))
             {
-                if(haveLock)
+                CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, reg, &flags));
+                flags &= ~(1ULL<<22);  /* clear enable flag */
+                CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg, flags));
+                CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &counter_result));
+                if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData)
                 {
-                    flags = msr_read(cpu_id, kabini_counter_map[i].configRegister);
-                    flags &= ~(1<<22);  /* clear enable flag */
-                    msr_write(cpu_id, kabini_counter_map[i].configRegister , flags);
-
-                    if (perfmon_verbose)
-                    {
-                        printf("perfmon_stop_counters: Write Register 0x%llX , Flags: 0x%llX \n",
-                                LLU_CAST kabini_counter_map[i].configRegister,
-                                LLU_CAST flags);
-                    }
-                    perfmon_threadData[thread_id].counters[i].counterData =
-                        msr_read(cpu_id, kabini_counter_map[i].counterRegister);
+                    eventSet->events[i].threadCounter[thread_id].overflows++;
                 }
+                eventSet->events[i].threadCounter[thread_id].counterData = field64(counter_result, 0, box_map[type].regWidth);
             }
         }
     }
+    return 0;
 }
 
 
-void perfmon_readCountersThread_kabini(int thread_id)
+int perfmon_readCountersThread_kabini(int thread_id, PerfmonEventSet* eventSet)
 {
-    int haveLock = 0;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
+    int haveSLock = 0;
+    int haveTLock = 0;
+    uint64_t counter_result = 0x0ULL;
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
     if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
     {
-        haveLock = 1;
+        haveSLock = 1;
+    }
+    if (tile_lock[affinity_thread2tile_lookup[cpu_id]] == cpu_id)
+    {
+        haveTLock = 1;
     }
 
-
-    for (int i=0;i<NUM_COUNTERS_KABINI;i++)
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        if ( perfmon_threadData[thread_id].counters[i].init == TRUE )
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
         {
-            if ( kabini_counter_map[i].type == UNCORE )
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
             {
-                if ( haveLock )
-                {
-                    perfmon_threadData[thread_id].counters[i].counterData =
-                        msr_read(cpu_id, kabini_counter_map[i].counterRegister);
-                }
+                continue;
             }
-            else
+            counter_result = 0x0ULL;
+            RegisterIndex index = eventSet->events[i].index;
+            uint32_t counter = counter_map[index].counterRegister;
+
+            if ((type == PMC) ||
+                ((type == UNCORE) && (haveSLock)) ||
+                ((type == CBOX0) && (haveTLock)))
             {
-                perfmon_threadData[thread_id].counters[i].counterData =
-                    msr_read(cpu_id, kabini_counter_map[i].counterRegister);
+                CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &counter_result));
+                VERBOSEPRINTREG(cpu_id, counter, counter_result, CLEAR_CTRL);
+                if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData)
+                {
+                    eventSet->events[i].threadCounter[thread_id].overflows++;
+                }
+                eventSet->events[i].threadCounter[thread_id].counterData = field64(counter_result, 0, box_map[type].regWidth);
             }
         }
     }
+    return 0;
 }
 
+
+int perfmon_finalizeCountersThread_kabini(int thread_id, PerfmonEventSet* eventSet)
+{   /* Zeroes the config register of each event in eventSet that this thread handles and clears its init flag; returns 0 once the loop completes. */
+    int haveSLock = 0;
+    int haveTLock = 0;
+    int cpu_id = groupSet->threads[thread_id].processorId;
+    /* A lock is treated as held when its table entry equals this thread's cpu_id. */
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+    {
+        haveSLock = 1;
+    }
+    if (tile_lock[affinity_thread2tile_lookup[cpu_id]] == cpu_id)
+    {
+        haveTLock = 1;
+    }
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        RegisterType type = eventSet->events[i].type;
+        if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type)))) /* register type not part of this event set */
+        {
+            continue;
+        }
+        RegisterIndex index = eventSet->events[i].index;
+        if ((type == PMC) ||                      /* PMC events: always handled */
+            ((type == UNCORE) && (haveSLock)) ||  /* UNCORE events: only with the socket lock */
+            ((type == CBOX0) && (haveTLock)))     /* CBOX0 events: only with the tile lock */
+        {
+            VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, 0x0ULL, CLEAR_CTRL);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, 0x0ULL));
+            eventSet->events[i].threadCounter[thread_id].init = FALSE; /* start/stop/read skip events whose init is not TRUE */
+        }
+    }
+    return 0;
+}
diff --git a/src/includes/perfmon_kabini_counters.h b/src/includes/perfmon_kabini_counters.h
index 8662522..a1b9bdb 100644
--- a/src/includes/perfmon_kabini_counters.h
+++ b/src/includes/perfmon_kabini_counters.h
@@ -3,15 +3,16 @@
  *
  *      Filename:  perfmon_kabini_counters.h
  *
- *      Description:  Counter Header File of perfmon module for AMD Family16
+ *      Description:  Counter Header File of perfmon module for AMD Family 16
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -28,19 +29,33 @@
  * =======================================================================================
  */
 
-#define NUM_COUNTERS_KABINI 8 
-#define NUM_COUNTERS_CORE_KABINI 4
+#define NUM_COUNTERS_KABINI 12
+#define NUM_COUNTERS_CORE_KABINI 8
 
-static PerfmonCounterMap kabini_counter_map[NUM_COUNTERS_KABINI] = {
+#define KAB_VALID_OPTIONS_PMC (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD) /* parenthesized so the mask stays one operand; NOTE(review): THRESHOLD lacks the _MASK suffix of the other terms - confirm it is a bit mask */
+#define KAB_VALID_OPTIONS_CBOX (EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD|EVENT_OPTION_TID_MASK|EVENT_OPTION_NID_MASK) /* parenthesized so the mask stays one operand; NOTE(review): THRESHOLD lacks the _MASK suffix of the other terms - confirm it is a bit mask */
+
+static RegisterMap kabini_counter_map[NUM_COUNTERS_KABINI] = {
     /* Core counters */
     {"PMC0",PMC0, PMC, MSR_AMD16_PERFEVTSEL0, MSR_AMD16_PMC0, 0, 0},
     {"PMC1",PMC1, PMC, MSR_AMD16_PERFEVTSEL1, MSR_AMD16_PMC1, 0, 0},
     {"PMC2",PMC2, PMC, MSR_AMD16_PERFEVTSEL2, MSR_AMD16_PMC2, 0, 0},
     {"PMC3",PMC3, PMC, MSR_AMD16_PERFEVTSEL3, MSR_AMD16_PMC3, 0, 0},
+    /* L2 cache counters */
+    {"CPMC0",PMC4, CBOX0, MSR_AMD16_L2_PERFEVTSEL0, MSR_AMD16_L2_PMC0, 0, 0},
+    {"CPMC1",PMC5, CBOX0, MSR_AMD16_L2_PERFEVTSEL1, MSR_AMD16_L2_PMC1, 0, 0},
+    {"CPMC2",PMC6, CBOX0, MSR_AMD16_L2_PERFEVTSEL2, MSR_AMD16_L2_PMC2, 0, 0},
+    {"CPMC3",PMC7, CBOX0, MSR_AMD16_L2_PERFEVTSEL3, MSR_AMD16_L2_PMC3, 0, 0},
     /* Northbridge counters */
-    {"UPMC0",PMC4, UNCORE, MSR_AMD16_NB_PERFEVTSEL0, MSR_AMD16_NB_PMC0, 0, 0},
-    {"UPMC1",PMC5, UNCORE, MSR_AMD16_NB_PERFEVTSEL1, MSR_AMD16_NB_PMC1, 0, 0},
-    {"UPMC2",PMC6, UNCORE, MSR_AMD16_NB_PERFEVTSEL2, MSR_AMD16_NB_PMC2, 0, 0},
-    {"UPMC3",PMC7, UNCORE, MSR_AMD16_NB_PERFEVTSEL3, MSR_AMD16_NB_PMC3, 0, 0}
+    {"UPMC0",PMC8, UNCORE, MSR_AMD16_NB_PERFEVTSEL0, MSR_AMD16_NB_PMC0, 0, 0},
+    {"UPMC1",PMC9, UNCORE, MSR_AMD16_NB_PERFEVTSEL1, MSR_AMD16_NB_PMC1, 0, 0},
+    {"UPMC2",PMC10, UNCORE, MSR_AMD16_NB_PERFEVTSEL2, MSR_AMD16_NB_PMC2, 0, 0},
+    {"UPMC3",PMC11, UNCORE, MSR_AMD16_NB_PERFEVTSEL3, MSR_AMD16_NB_PMC3, 0, 0}
+};
+
+static BoxMap kabini_box_map[NUM_UNITS] = {
+    [PMC] = {0, 0, 0, 0, 0, 0, 48},
+    [UNCORE] = {0, 0, 0, 0, 0, 0, 48},
+    [CBOX0] = {0, 0, 0, 0, 0, 0, 48},
 };
 
diff --git a/src/includes/perfmon_kabini_events.txt b/src/includes/perfmon_kabini_events.txt
index 9ccc726..93e61bf 100644
--- a/src/includes/perfmon_kabini_events.txt
+++ b/src/includes/perfmon_kabini_events.txt
@@ -1,16 +1,16 @@
 # =======================================================================================
-#  
+#
 #      Filename:  perfmon_kabini_events.txt
-# 
+#
 #      Description:  Event list for AMD Kabini
-# 
-#      Version:   3.1.3
-#      Released:  4.11.2014
-# 
-#      Author: saravanan.ekanathan at amd.com
+#
+#      Version:   4.0
+#      Released:  16.6.2015
+#
+#      Author:   saravanan.ekanathan at amd.com
 #      Project:  likwid
 #
-#      Copyright (C) 2014 Jan Treibig
+#      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
 #
 #      This program is free software: you can redistribute it and/or modify it under
 #      the terms of the GNU General Public License as published by the Free Software
@@ -148,9 +148,10 @@ UMASK_PREFETCH_INSTR_DISPATCHED_NTA       0x04
 EVENT_DCACHE_LOCK_MISS           0x4C   PMC
 UMASK_DCACHE_LOCK_MISS           0x02
 
-EVENT_DTLB_L1_HIT                0x4D   PMC
-UMASK_DTLB_L1_HIT_4K             0x01
-UMASK_DTLB_L1_HIT_2M             0x02
+EVENT_DTLB_L1_HIT                 0x4D   PMC
+UMASK_DTLB_L1_HIT_4KB             0x01
+UMASK_DTLB_L1_HIT_2MB             0x02
+UMASK_DTLB_L1_HIT_ANY             0x03
 
 EVENT_INEFFECTIVE_PREFETCHES        0x52    PMC
 UMASK_INEFFECTIVE_PREFETCHES_DATA_CACHE     0x01
@@ -234,12 +235,13 @@ UMASK_INSTRUCTION_CACHE_L2_REFILLS         0x00
 EVENT_INSTRUCTION_CACHE_SYSTEM_REFILLS        0x083     PMC
 UMASK_INSTRUCTION_CACHE_SYSTEM_REFILLS         0x00
 
-EVENT_ITLB_L1_MISS_L2_HIT        0x084     PMC
-UMASK_ITLB_L1_MISS_L2_HIT         0x00
+EVENT_ITLB_L1_MISS_L2_HIT              0x084     PMC
+UMASK_ITLB_L1_MISS_L2_HIT              0x00
 
-EVENT_ITLB_L1_MISS_L2_MISS        0x085     PMC
+EVENT_ITLB_L1_MISS_L2_MISS             0x085     PMC
 UMASK_ITLB_L1_MISS_L2_MISS_4KB         0x01
 UMASK_ITLB_L1_MISS_L2_MISS_2MB         0x02
+UMASK_ITLB_L1_MISS_L2_MISS_ANY         0x03
 
 EVENT_INSTRUCTION_FETCH_STALL        0x087     PMC
 UMASK_INSTRUCTION_FETCH_STALL         0x00
diff --git a/src/includes/perfmon_nehalem.h b/src/includes/perfmon_nehalem.h
index b3e7907..5e8fa47 100644
--- a/src/includes/perfmon_nehalem.h
+++ b/src/includes/perfmon_nehalem.h
@@ -3,15 +3,16 @@
  *
  *      Filename:  perfmon_nehalem.h
  *
- *      Description:  Header File of perfmon module for Nehalem.
+ *      Description:  Header File of perfmon module for Intel Nehalem.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -29,300 +30,583 @@
  */
 
 #include <perfmon_nehalem_events.h>
-#include <perfmon_nehalem_groups.h>
 #include <perfmon_nehalem_counters.h>
+#include <error.h>
+#include <affinity.h>
+
 
 static int perfmon_numCountersNehalem = NUM_COUNTERS_NEHALEM;
-static int perfmon_numGroupsNehalem = NUM_GROUPS_NEHALEM;
 static int perfmon_numArchEventsNehalem = NUM_ARCH_EVENTS_NEHALEM;
 
-#define OFFSET_PMC 3
-#define OFFSET_UPMC 7
 
-void perfmon_init_nehalem(PerfmonThread *thread)
+int perfmon_init_nehalem(int cpu_id)
 {
-    uint64_t flags = 0x0ULL;
-    int cpu_id = thread->processorId;
-
-    msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, 0x0ULL);
-    msr_write(cpu_id, MSR_PERFEVTSEL0, 0x0ULL);
-    msr_write(cpu_id, MSR_PERFEVTSEL1, 0x0ULL);
-    msr_write(cpu_id, MSR_PERFEVTSEL2, 0x0ULL);
-    msr_write(cpu_id, MSR_PERFEVTSEL3, 0x0ULL);
-    msr_write(cpu_id, MSR_PMC0, 0x0ULL);
-    msr_write(cpu_id, MSR_PMC1, 0x0ULL);
-    msr_write(cpu_id, MSR_PMC2, 0x0ULL);
-    msr_write(cpu_id, MSR_PMC3, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_FIXED_CTR0, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_FIXED_CTR1, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_FIXED_CTR2, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, 0x0ULL);
-    msr_write(cpu_id, MSR_PEBS_ENABLE, 0x0ULL);
-
-    /* initialize fixed counters
-     * FIXED 0: Instructions retired
-     * FIXED 1: Clocks unhalted core
-     * FIXED 2: Clocks unhalted ref */
-    //msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, 0x222ULL);
-
-    //    flags |= (1<<22);  /* enable flag */
-    //    flags |= (1<<16);  /* user mode flag */
-    //setBit(flags,16); /* set user mode flag */
-    //setBit(flags,22); /* set enable flag */
-
-    /*msr_write(cpu_id, MSR_PERFEVTSEL0, flags);
-    msr_write(cpu_id, MSR_PERFEVTSEL1, flags);
-    msr_write(cpu_id, MSR_PERFEVTSEL2, flags);
-    msr_write(cpu_id, MSR_PERFEVTSEL3, flags);*/
-
-
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id) ||
-            lock_acquire(
-                (int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id)
-       )
-    {
-        /* UNCORE FIXED 0: Uncore cycles */
-        msr_write(cpu_id, MSR_UNCORE_FIXED_CTR_CTRL, 0x01ULL);
-        msr_write(cpu_id, MSR_UNCORE_FIXED_CTR_CTRL, 0x0ULL);
-        msr_write(cpu_id, MSR_UNCORE_FIXED_CTR0, 0x0ULL);
-        msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL3, 0x0ULL);
-        msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL4, 0x0ULL);
-        msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL5, 0x0ULL);
-        msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL6, 0x0ULL);
-        msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL7, 0x0ULL);
-        msr_write(cpu_id, MSR_UNCORE_PMC0, 0x0ULL);
-        msr_write(cpu_id, MSR_UNCORE_PMC1, 0x0ULL);
-        msr_write(cpu_id, MSR_UNCORE_PMC2, 0x0ULL);
-        msr_write(cpu_id, MSR_UNCORE_PMC3, 0x0ULL);
-        msr_write(cpu_id, MSR_UNCORE_PMC4, 0x0ULL);
-        msr_write(cpu_id, MSR_UNCORE_PMC5, 0x0ULL);
-        msr_write(cpu_id, MSR_UNCORE_PMC6, 0x0ULL);
-        msr_write(cpu_id, MSR_UNCORE_PMC7, 0x0ULL);
-        msr_write(cpu_id, MSR_UNCORE_PERF_GLOBAL_CTRL, 0x0ULL);
-        msr_write(cpu_id, MSR_UNCORE_PERF_GLOBAL_OVF_CTRL, 0x0ULL);
-        msr_write(cpu_id, MSR_UNCORE_ADDR_OPCODE_MATCH, 0x0ULL);
-        msr_write(cpu_id, MSR_OFFCORE_RESP0, 0x0ULL);
-
-        /* Preinit of PERFEVSEL registers */
-        //clearBit(flags,16); /* set enable flag */
-
-        /*msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL0, flags);
-        msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL1, flags);
-        msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL2, flags);
-        msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL3, flags);
-        msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL4, flags);
-        msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL5, flags);
-        msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL6, flags);
-        msr_write(cpu_id, MSR_UNCORE_PERFEVTSEL7, flags);*/
-    }
+    lock_acquire((int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id);
+    lock_acquire((int*) &tile_lock[affinity_thread2tile_lookup[cpu_id]], cpu_id);
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PEBS_ENABLE, 0x0ULL));
+    return 0;
 }
 
+uint32_t neh_fixed_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{   /* Builds and returns the control bits for fixed counter 'index'; it writes no register itself and does not use cpu_id. */
+    int j;
+    uint32_t flags = (1ULL<<(1+(index*4))); /* each counter owns a 4-bit field at index*4; bit 1 of that field is always set */
+    for(j = 0; j < event->numberOfOptions; j++)
+    {
+        switch (event->options[j].type)
+        {
+            case EVENT_OPTION_COUNT_KERNEL:
+                flags |= (1ULL<<(index*4)); /* bit 0 of the field */
+                break;
+            case EVENT_OPTION_ANYTHREAD:
+                flags |= (1ULL<<(2+(index*4))); /* bit 2 of the field */
+                /* fallthrough */
+            default:
+                break;
+        }
+    }
+    return flags; /* perfmon_setupCounterThread_nehalem ORs this into the value it writes to MSR_PERF_FIXED_CTR_CTRL */
+}
 
-void perfmon_setupCounterThread_nehalem(
-        int thread_id,
-        PerfmonEvent* event,
-        PerfmonCounterIndex index)
+int neh_pmc_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event) /* Programs the config register of core counter 'index' (plus MSR_OFFCORE_RESP0/1 for events 0xB7/0xBB); returns 0 after the final write. */
 {
-    int haveLock = 0;
+    int j;
     uint64_t flags = 0x0ULL;
-    uint64_t reg = nehalem_counter_map[index].configRegister;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
-    uint64_t fixed_flags = msr_read(cpu_id, MSR_PERF_FIXED_CTR_CTRL);
-
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+    uint64_t offcore_flags = 0x0ULL;
+    /* Start with the enable flag (bit 22) and the user mode flag (bit 16), then add unit mask and event id. */
+    flags = (1ULL<<22)|(1ULL<<16);
+    flags |= (event->umask<<8) + event->eventId;
+    /* set custom cfg and cmask; the offcore events 0xB7/0xBB reuse both fields as bit offsets (see below) */
+    if ((event->cfgBits != 0) &&
+        (event->eventId != 0xB7) &&
+        (event->eventId != 0xBB))
     {
-        haveLock = 1;
+        flags |= ((event->cmask<<8) + event->cfgBits)<<16;
+    }
+    if (event->numberOfOptions > 0)
+    {
+        for(j = 0; j < event->numberOfOptions; j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_COUNT_KERNEL:
+                    flags |= (1ULL<<17);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_ANYTHREAD:
+                    flags |= (1ULL<<21);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= (event->options[j].value & 0xFFULL)<<24;
+                    break;
+                case EVENT_OPTION_MATCH0:
+                    offcore_flags |= (event->options[j].value & 0xFF); /* occupies bits 7:0 */
+                    break;
+                case EVENT_OPTION_MATCH1:
+                    offcore_flags |= (event->options[j].value & 0xF7)<<8; /* placed above the 8 MATCH0 bits; the former shift of 7 collided with bit 7 */
+                    break;
+                default:
+                    break;
+            }
+        }
+    }
+    // Offcore events use an additional configuration register.
+    // cfgBits contains the offset of the "request type" bit and cmask the offset of the "response type" bit;
+    // if either is 0xFF, the value collected from the MATCH0/MATCH1 options is written instead.
+    if (event->eventId == 0xB7)
+    {
+        if ((event->cfgBits != 0xFF) && (event->cmask != 0xFF))
+        {
+            offcore_flags = (1ULL<<event->cfgBits)|(1ULL<<event->cmask);
+        }
+        VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP0, LLU_CAST offcore_flags, SETUP_PMC_OFFCORE);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP0, offcore_flags));
     }
+    if ((event->eventId == 0xBB) && /* second offcore register is only programmed on the Westmere models */
+        ((cpuid_info.model == NEHALEM_WESTMERE) || (cpuid_info.model == NEHALEM_WESTMERE_M)))
+    {
+        if ((event->cfgBits != 0xFF) && (event->cmask != 0xFF))
+        {
+            offcore_flags = (1ULL<<event->cfgBits)|(1ULL<<event->cmask);
+        }
+        VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP1, LLU_CAST offcore_flags, SETUP_PMC_OFFCORE);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP1, offcore_flags));
+    }
+    VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_PMC);
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+    return 0;
+}
 
-    perfmon_threadData[thread_id].counters[index].init = TRUE;
+int neh_uncore_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+    int j;
+    uint64_t flags = 0x0ULL;
+    uint64_t mask_flags = 0x0ULL;
 
-    if ( nehalem_counter_map[index].type == PMC )
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
     {
-        flags = (1<<16)|(1<<22);
-
-        /* Intel with standard 8 bit event mask: [7:0] */
-        flags |= (event->umask<<8) + event->eventId;
+        return 0;
+    }
 
-        if (event->cfgBits != 0) /* set custom cfg and cmask */
+    flags = (1ULL<<22);
+    flags |= (event->umask<<8) + event->eventId;
+    if (event->cfgBits != 0x0 && event->eventId != 0x35) /* set custom cfg and cmask */
+    {
+        flags |= ((event->cmask<<8) + event->cfgBits)<<16;
+    }
+    else if (event->cfgBits != 0x0 && event->eventId == 0x35) /* set custom cfg and cmask */
+    {
+        mask_flags |= ((uint64_t)event->cfgBits)<<61;
+        if (event->cmask != 0x0)
         {
-            flags &= ~(0xFFFFU<<16);  /* clear upper 16bits */
-            flags |= ((event->cmask<<8) + event->cfgBits)<<16;
+            mask_flags |= ((uint64_t)event->cmask)<<40;
         }
-
-        msr_write(cpu_id, reg , flags);
-
-        if (perfmon_verbose)
+    }
+    if (event->numberOfOptions > 0)
+    {
+        for(j = 0; j < event->numberOfOptions; j++)
         {
-            printf("[%d] perfmon_setup_counter: Write Register 0x%llX , Flags: 0x%llX \n",
-                    cpu_id,
-                    LLU_CAST reg,
-                    LLU_CAST flags);
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_ANYTHREAD:
+                    flags |= (1ULL<<21);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= (event->options[j].value & 0xFFULL)<<24;
+                    break;
+                case EVENT_OPTION_MATCH0:
+                    mask_flags |= field64(event->options[j].value,3,37)<<2;
+                    break;
+                case EVENT_OPTION_OPCODE:
+                    mask_flags |= field64(event->options[j].value,0,8)<<40;
+                    break;
+                default:
+                    break;
+            }
         }
     }
-    else if ( nehalem_counter_map[index].type == UNCORE )
+    if ((mask_flags != 0x0ULL) && (event->eventId == 0x35))
     {
-        if(haveLock)
+        if ((cpuid_info.model == NEHALEM_BLOOMFIELD) ||
+             (cpuid_info.model == NEHALEM_LYNNFIELD) ||
+             (cpuid_info.model == NEHALEM_LYNNFIELD_M))
         {
-            flags = (1<<22);
+            DEBUG_PLAIN_PRINT(DEBUGLEV_ONLY_ERROR, Register documented in SDM but ADDR_OPCODE_MATCH event not documented for Nehalem architectures);
+        }
+        VERBOSEPRINTREG(cpu_id, MSR_UNCORE_ADDR_OPCODE_MATCH, LLU_CAST mask_flags, SETUP_UNCORE_MATCH);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNCORE_ADDR_OPCODE_MATCH, mask_flags));
+    }
+    VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_UNCORE);
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+    return 0;
+}
 
-            /* Intel with standard 8 bit event mask: [7:0] */
-            flags |= (event->umask<<8) + event->eventId;
+int perfmon_setupCounterThread_nehalem(int thread_id, PerfmonEventSet* eventSet)
+{
+    int haveLock = 0;
+    uint64_t fixed_flags = 0x0ULL;
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
-            if (event->cfgBits != 0) /* set custom cfg and cmask */
-            {
-                flags &= ~(0xFFFFU<<16);  /* clear upper 16bits */
-                flags |= ((event->cmask<<8) + event->cfgBits)<<16;
-            }
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+    {
+        haveLock = 1;
+    }
 
-            msr_write(cpu_id, reg , flags);
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)))
+    {
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, 0x0ULL));
+    }
+    if (haveLock && (eventSet->regTypeMask & ~(0xF)))
+    {
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNCORE_PERF_GLOBAL_CTRL, 0x0ULL));
+    }
 
-            if (perfmon_verbose)
-            {
-                printf("[%d] perfmon_setup_counter: Write Register 0x%llX , Flags: 0x%llX \n",
-                        cpu_id,
-                        LLU_CAST reg,
-                        LLU_CAST flags);
-            }
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        RegisterType type = eventSet->events[i].type;
+        if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+        {
+            continue;
+        }
+        RegisterIndex index = eventSet->events[i].index;
+        PerfmonEvent *event = &(eventSet->events[i].event);
+        eventSet->events[i].threadCounter[thread_id].init = TRUE;
 
+        switch (type)
+        {
+            case PMC:
+                neh_pmc_setup(cpu_id, index, event);
+                break;
+            case FIXED:
+                fixed_flags |= neh_fixed_setup(cpu_id, index, event);
+                break;
+            case UNCORE:
+                if (haveLock)
+                {
+                    if (index < NUM_COUNTERS_UNCORE_NEHALEM-1)
+                    {
+                        neh_uncore_setup(cpu_id, index, event);
+                    }
+                    else
+                    {
+                        VERBOSEPRINTREG(cpu_id, MSR_UNCORE_FIXED_CTR_CTRL, LLU_CAST 0x1ULL, SETUP_UPMCFIX);
+                        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNCORE_FIXED_CTR_CTRL, 0x1ULL));
+                    }
+                }
+                break;
+            default:
+                break;
         }
     }
-    else if (nehalem_counter_map[index].type == FIXED)
+    if (fixed_flags != 0x0ULL)
     {
-        fixed_flags |= (0x2 <<(index*4));
-        msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, fixed_flags);
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_FIXED_CTR_CTRL, LLU_CAST fixed_flags, SETUP_FIXED);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_FIXED_CTR_CTRL, fixed_flags));
     }
+    return 0;
 }
 
-void perfmon_startCountersThread_nehalem(int thread_id)
+int perfmon_startCountersThread_nehalem(int thread_id, PerfmonEventSet* eventSet)
 {
     int haveLock = 0;
     uint64_t flags = 0x0ULL;
     uint64_t uflags = 0x0ULL;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
-    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
 
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
     {
         haveLock = 1;
-        msr_write(cpu_id, MSR_UNCORE_PERF_GLOBAL_CTRL, 0x0ULL);
-        /* Fixed Uncore counter */
-        uflags = 0x100000000ULL;
     }
 
-    for ( int i=0; i<NUM_PMC; i++ )
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        if (perfmon_threadData[thread_id].counters[i].init == TRUE)
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
         {
-            if (nehalem_counter_map[i].type == PMC)
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
             {
-                msr_write(cpu_id, nehalem_counter_map[i].counterRegister , 0x0ULL);
-                flags |= (1<<(i-OFFSET_PMC));  /* enable counter */
+                continue;
             }
-            else if (nehalem_counter_map[i].type == FIXED)
+            RegisterIndex index = eventSet->events[i].index;
+            uint64_t counter = counter_map[index].counterRegister;
+            switch(type)
             {
-                msr_write(cpu_id, nehalem_counter_map[i].counterRegister , 0x0ULL);
-                flags |= (1ULL<<(i+32));  /* enable fixed counter */
-            }
-            else if (nehalem_counter_map[i].type == UNCORE)
-            {
-                if(haveLock)
-                {
-                    msr_write(cpu_id, nehalem_counter_map[i].counterRegister , 0x0ULL);
-                    uflags |= (1<<(i-OFFSET_UPMC));  /* enable uncore counter */
-                }
+                case PMC:
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter, 0x0ULL));
+                    flags |= (1ULL<<(index - cpuid_info.perf_num_fixed_ctr));  /* enable counter */
+                    break;
+                case FIXED:
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter, 0x0ULL));
+                    flags |= (1ULL<<(index+32));  /* enable fixed counter */
+                    break;
+                case UNCORE:
+                    if(haveLock)
+                    {
+                        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter, 0x0ULL));
+                        if (index < NUM_COUNTERS_UNCORE_NEHALEM-1)
+                        {
+                            uflags |= (1ULL<<(index-NUM_COUNTERS_CORE_NEHALEM));  /* enable uncore counter */
+                        }
+                        else
+                        {
+                            uflags |= (1ULL<<32);
+                        }
+                    }
+                    break;
+                default:
+                    break;
             }
         }
     }
 
-    if (perfmon_verbose)
+    if (haveLock && (uflags != 0x0ULL) && (eventSet->regTypeMask & ~(0xF)))
     {
-        printf("perfmon_start_counters: Write Register 0x%X , \
-                Flags: 0x%llX \n",MSR_PERF_GLOBAL_CTRL, LLU_CAST flags);
-        printf("perfmon_start_counters: Write Register 0x%X , \
-                Flags: 0x%llX \n",MSR_UNCORE_PERF_GLOBAL_CTRL, LLU_CAST uflags);
+        VERBOSEPRINTREG(cpu_id, MSR_UNCORE_PERF_GLOBAL_CTRL, LLU_CAST uflags, UNFREEZE_UNCORE);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNCORE_PERF_GLOBAL_CTRL, uflags));
     }
 
-    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, flags);
-    if (haveLock) msr_write(cpu_id, MSR_UNCORE_PERF_GLOBAL_CTRL, uflags);
-    msr_write(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, 0x30000000FULL);
+    if ((flags != 0x0ULL) && (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED))))
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST flags, UNFREEZE_PMC_AND_FIXED);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, flags));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, (1ULL<<63)|(1ULL<<62)|flags));
+    }
+    return 0;
 }
 
-void perfmon_stopCountersThread_nehalem(int thread_id)
+#define NEH_CHECK_OVERFLOW(offset) /* if the fresh reading is below the stored one, test status bit 'offset' and count an overflow */ \
+    if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData) \
+    { \
+        uint64_t tmp = 0x0ULL; \
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_STATUS, &tmp)); \
+        if (tmp & (1ULL<<(offset))) /* argument parenthesized so any expression is shifted as a whole */ \
+        { \
+            eventSet->events[i].threadCounter[thread_id].overflows++; \
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, (tmp & (1ULL<<(offset))))); /* write the set bit back to the overflow control register */ \
+        } \
+    }
+
+#define NEH_CHECK_UNCORE_OVERFLOW(offset) /* uncore variant: same check against the uncore global status/overflow registers */ \
+    if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData) \
+    { \
+        uint64_t tmp = 0x0ULL; \
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_UNCORE_PERF_GLOBAL_STATUS, &tmp)); \
+        if (tmp & (1ULL<<(offset))) /* argument parenthesized so any expression is shifted as a whole */ \
+        { \
+            eventSet->events[i].threadCounter[thread_id].overflows++; \
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNCORE_PERF_GLOBAL_OVF_CTRL, (tmp & (1ULL<<(offset))))); /* write the set bit back to the uncore overflow control register */ \
+        } \
+    }
+
+int perfmon_stopCountersThread_nehalem(int thread_id, PerfmonEventSet* eventSet) /* zero the global enable registers, then read every initialized event of eventSet for this thread; returns 0 at the end (CHECK_MSR_* macros are defined elsewhere and may presumably bail out earlier) */
 {
-    uint64_t flags;
     int haveLock = 0;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
-
-    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
+    uint64_t counter_result = 0x0ULL;
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
     if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
     {
         haveLock = 1;
-        msr_write(cpu_id, MSR_UNCORE_PERF_GLOBAL_CTRL, 0x0ULL);
     }
 
-    for ( int i=0; i<NUM_COUNTERS_NEHALEM; i++ )
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED))) /* only when the event set uses PMC or FIXED counters */
     {
-        if (perfmon_threadData[thread_id].counters[i].init == TRUE)
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL, FREEZE_PMC_AND_FIXED);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+    }
+
+    if (haveLock && (eventSet->regTypeMask & ~(0xF))) /* socket-lock holder only, and only if a type bit above the low four is set (presumably the uncore types - confirm against REG_TYPE_MASK) */
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_UNCORE_PERF_GLOBAL_CTRL, 0x0ULL, FREEZE_UNCORE);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNCORE_PERF_GLOBAL_CTRL, 0x0ULL));
+    }
+
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
         {
-            if (nehalem_counter_map[i].type == UNCORE)
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type)))) /* skip events whose register type is not enabled in the mask */
             {
-                if(haveLock)
-                {
-                    perfmon_threadData[thread_id].counters[i].counterData =
-                        msr_read(cpu_id, nehalem_counter_map[i].counterRegister);
-                }
+                continue;
             }
-            else
+            counter_result = 0x0ULL; /* stays 0 if nothing is read below (e.g. UNCORE without the socket lock) */
+            RegisterIndex index = eventSet->events[i].index;
+            uint64_t counter = counter_map[index].counterRegister;
+            switch (type)
             {
-                perfmon_threadData[thread_id].counters[i].counterData =
-                    msr_read(cpu_id, nehalem_counter_map[i].counterRegister);
+                case PMC:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &counter_result));
+                    VERBOSEPRINTREG(cpu_id, counter, counter_result, READ_PMC);
+                    NEH_CHECK_OVERFLOW(index - cpuid_info.perf_num_fixed_ctr); /* status bit = index minus the number of fixed counters */
+                    break;
+                case FIXED:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &counter_result));
+                    VERBOSEPRINTREG(cpu_id, counter, counter_result, READ_FIXED);
+                    NEH_CHECK_OVERFLOW(index + 32); /* fixed counters use status bits from 32 upwards */
+                    break;
+                case UNCORE:
+                    if(haveLock)
+                    {
+                        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &counter_result));
+                        VERBOSEPRINTREG(cpu_id, counter, counter_result, READ_UNCORE);
+                        if (index < NUM_COUNTERS_UNCORE_NEHALEM-1)
+                        {
+                            NEH_CHECK_UNCORE_OVERFLOW(index - NUM_COUNTERS_CORE_NEHALEM);
+                        }
+                        else /* the last uncore counter is checked at status bit 32 */
+                        {
+                            NEH_CHECK_UNCORE_OVERFLOW(32);
+                        }
+                    }
+                    break;
+                default:
+                    break;
             }
+            eventSet->events[i].threadCounter[thread_id].counterData = field64(counter_result, 0, box_map[type].regWidth); /* store the reading; field64 presumably keeps the low regWidth bits - confirm */
         }
     }
+    return 0;
+}
+
+int perfmon_readCountersThread_nehalem(int thread_id, PerfmonEventSet* eventSet) /* read all initialized events in the middle of a measurement: save the global enable registers, zero them, read, then write the saved values back */
+{
+    int haveLock = 0;
+    uint64_t counter_result = 0x0ULL;
+    int cpu_id = groupSet->threads[thread_id].processorId;
+    uint64_t pmc_flags = 0x0ULL;
+    uint64_t uncore_flags = 0x0ULL;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+    {
+        haveLock = 1;
+    }
+
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
+    {
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, &pmc_flags)); /* remember the current value so it can be written back after reading */
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL, FREEZE_PMC_AND_FIXED);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+    }
+
+    if (haveLock && (eventSet->regTypeMask & ~(0xF)))
+    {
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_UNCORE_PERF_GLOBAL_CTRL, &uncore_flags)); /* same for the uncore global control register */
+        VERBOSEPRINTREG(cpu_id, MSR_UNCORE_PERF_GLOBAL_CTRL, 0x0ULL, FREEZE_UNCORE);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNCORE_PERF_GLOBAL_CTRL, 0x0ULL));
+    }
 
-    flags = msr_read(cpu_id,MSR_PERF_GLOBAL_STATUS);
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
+        {
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type)))) /* skip events whose register type is not enabled in the mask */
+            {
+                continue;
+            }
+            counter_result = 0x0ULL; /* stays 0 if nothing is read below (e.g. UNCORE without the socket lock) */
+            RegisterIndex index = eventSet->events[i].index;
+            uint64_t counter = counter_map[index].counterRegister;
+            switch (type)
+            {
+                case PMC:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &counter_result));
+                    VERBOSEPRINTREG(cpu_id, counter, counter_result, READ_PMC);
+                    NEH_CHECK_OVERFLOW(index - cpuid_info.perf_num_fixed_ctr);
+                    break;
+                case FIXED:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &counter_result));
+                    VERBOSEPRINTREG(cpu_id, counter, counter_result, READ_FIXED);
+                    NEH_CHECK_OVERFLOW(index + 32);
+                    break;
+                case UNCORE:
+                    if(haveLock)
+                    {
+                        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &counter_result));
+                        VERBOSEPRINTREG(cpu_id, counter, counter_result, READ_UNCORE);
+                        if (index < NUM_COUNTERS_UNCORE_NEHALEM-1)
+                        {
+                            NEH_CHECK_UNCORE_OVERFLOW(index - NUM_COUNTERS_CORE_NEHALEM);
+                        }
+                        else /* the last uncore counter is checked at status bit 32 */
+                        {
+                            NEH_CHECK_UNCORE_OVERFLOW(32);
+                        }
+                    }
+                    break;
+                default:
+                    break;
+            }
+            eventSet->events[i].threadCounter[thread_id].counterData = field64(counter_result, 0, box_map[type].regWidth); /* store the reading; field64 presumably keeps the low regWidth bits - confirm */
+        }
+    }
 
-    if((flags & 0x3) || (flags & (0x3ULL<<32)) )
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, pmc_flags, UNFREEZE_PMC_AND_FIXED);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, pmc_flags)); /* write back the value saved before reading */
+    }
+    if (haveLock && (eventSet->regTypeMask & ~(0xF)))
     {
-        printf ("Overflow occured \n");
-        printf ("Status: 0x%llX \n", LLU_CAST flags);
+        VERBOSEPRINTREG(cpu_id, MSR_UNCORE_PERF_GLOBAL_CTRL, uncore_flags, UNFREEZE_UNCORE);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNCORE_PERF_GLOBAL_CTRL, uncore_flags)); /* write back the saved uncore value */
     }
+    return 0;
 }
 
-void perfmon_readCountersThread_nehalem(int thread_id)
+int perfmon_finalizeCountersThread_nehalem(int thread_id, PerfmonEventSet* eventSet) /* zero the config registers of all initialized events, mark them uninitialized, then write the global overflow/control registers */
 {
     int haveLock = 0;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
+    int haveTileLock = 0;
+    int cpu_id = groupSet->threads[thread_id].processorId;
+    uint64_t ovf_values_core = (1ULL<<63)|(1ULL<<62); /* bits 63 and 62 are always part of the value later written to MSR_PERF_GLOBAL_OVF_CTRL */
 
     if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
     {
         haveLock = 1;
     }
+    if (tile_lock[affinity_thread2tile_lookup[cpu_id]] == cpu_id)
+    {
+        haveTileLock = 1;
+    }
 
-    for ( int i=0; i<NUM_COUNTERS_NEHALEM; i++ )
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        if (perfmon_threadData[thread_id].counters[i].init == TRUE)
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
         {
-            if (nehalem_counter_map[i].type == UNCORE)
+            RegisterType type = eventSet->events[i].type;
+            if (type == NOTYPE) /* events without a register type are left untouched (init stays TRUE) */
             {
-                if(haveLock)
-                {
-                    perfmon_threadData[thread_id].counters[i].counterData =
-                        msr_read(cpu_id, nehalem_counter_map[i].counterRegister);
-                }
+                continue;
             }
-            else
+            RegisterIndex index = eventSet->events[i].index;
+            uint64_t reg = counter_map[index].configRegister;
+            PciDeviceIndex dev = counter_map[index].device;
+            switch (type)
             {
-                perfmon_threadData[thread_id].counters[i].counterData =
-                    msr_read(cpu_id, nehalem_counter_map[i].counterRegister);
+                case PMC:
+                    ovf_values_core |= (1ULL<<(index-cpuid_info.perf_num_fixed_ctr)); /* collect this counter's bit for the final OVF_CTRL write */
+                    if ((haveTileLock) && (eventSet->events[i].event.eventId == 0xB7))
+                    {
+                        VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP0, 0x0ULL, CLEAR_OFFCORE_RESP0);
+                        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP0, 0x0ULL));
+                    }
+                    else if ((haveTileLock) && (eventSet->events[i].event.eventId == 0xBB) &&
+                             ((cpuid_info.model == NEHALEM_WESTMERE) || (cpuid_info.model == NEHALEM_WESTMERE_M)))
+                    {
+                        VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP1, 0x0ULL, CLEAR_OFFCORE_RESP1);
+                        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP1, 0x0ULL));
+                    }
+                    else if ((haveTileLock) && (eventSet->events[i].event.eventId == 0x35) && /* NOTE(review): an UNCORE-named register is cleared in the PMC branch under the tile lock - confirm this is intended */
+                             ((cpuid_info.model == NEHALEM_WESTMERE) || (cpuid_info.model == NEHALEM_WESTMERE_M)))
+                    {
+                        VERBOSEPRINTREG(cpu_id, MSR_UNCORE_ADDR_OPCODE_MATCH, 0x0ULL, CLEAR_UNCORE_MATCH);
+                        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNCORE_ADDR_OPCODE_MATCH, 0x0ULL));
+                    }
+                    break;
+                case FIXED:
+                    ovf_values_core |= (1ULL<<(index+32)); /* fixed counters use bits from 32 upwards */
+                    break;
+                default:
+                    break;
             }
+            if ((reg) && (((type == PMC)||(type == FIXED))||((type >= UNCORE) && (haveLock)))) /* needs a non-zero config register; types >= UNCORE additionally need the socket lock */
+            {
+                VERBOSEPRINTPCIREG(cpu_id, dev, reg, 0x0ULL, CLEAR_CTL);
+                CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, reg, 0x0ULL));
+            }
+            eventSet->events[i].threadCounter[thread_id].init = FALSE; /* mark the event as no longer set up for this thread */
         }
     }
+
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)))
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, ovf_values_core, CLEAR_OVF_CTRL);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, ovf_values_core));
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL, CLEAR_PMC_AND_FIXED_CTRL);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+    }
+
+    if (haveLock && eventSet->regTypeMask & ~(0xFULL)) /* '&' binds tighter than '&&', so this is haveLock && (mask & ~0xF) */
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_UNCORE_PERF_GLOBAL_OVF_CTRL, 0x0ULL, CLEAR_UNCORE_OVF);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNCORE_PERF_GLOBAL_OVF_CTRL, 0x0ULL));
+        VERBOSEPRINTREG(cpu_id, MSR_UNCORE_PERF_GLOBAL_CTRL, 0x0ULL, CLEAR_UNCORE_CTRL);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNCORE_PERF_GLOBAL_CTRL, 0x0ULL));
+    }
+    return 0;
 }
 
diff --git a/src/includes/perfmon_nehalemEX.h b/src/includes/perfmon_nehalemEX.h
index ea632cf..23c5534 100644
--- a/src/includes/perfmon_nehalemEX.h
+++ b/src/includes/perfmon_nehalemEX.h
@@ -3,15 +3,16 @@
  *
  *      Filename:  perfmon_nehalemEX.h
  *
- *      Description:  Header File of perfmon module for Nehalem EX.
+ *      Description:  Header File of perfmon module for Intel Nehalem EX.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -29,839 +30,1218 @@
  */
 
 #include <perfmon_nehalemEX_events.h>
-#include <perfmon_nehalemEX_groups.h>
+#include <perfmon_nehalemEX_counters.h>
+#include <error.h>
+#include <affinity.h>
 
-#define NUM_COUNTERS_NEHALEMEX 7
 
-//static int perfmon_numCountersNehalemEX = NUM_COUNTERS_NEHALEMEX;
-static int perfmon_numGroupsNehalemEX = NUM_GROUPS_NEHALEMEX;
 static int perfmon_numArchEventsNehalemEX = NUM_ARCH_EVENTS_NEHALEMEX;
+static int perfmon_numCountersNehalemEX = NUM_COUNTERS_NEHALEMEX;
 
 /* This SUCKS: There are only subtle difference between NehalemEX
- * and Westmere EX Uncore. Still one of them is that one field is
- * 1 bit shifted. Thank you Intel for this mess!!! Do you want
+ * and Westmere EX Uncore. Still one of them is that one field is 
+ * 1 bit shifted. Thank you Intel for this mess!!! Do you want 
  * to change the register definitions for every architecture?*/
 
+int perfmon_init_nehalemEX(int cpu_id) /* per-CPU init: call lock_acquire for this CPU's tile and socket lock entries, then zero MSR_PEBS_ENABLE */
+{
+    lock_acquire((int*) &tile_lock[affinity_thread2tile_lookup[cpu_id]], cpu_id); /* result ignored; later code compares the lock entry with cpu_id to see who holds it */
+    lock_acquire((int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id);
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PEBS_ENABLE, 0x0ULL));
+    return 0;
+}
+
+uint32_t nex_fixed_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event) /* build the control bits for fixed counter 'index' (one 4-bit field per counter) and return them; cpu_id is not used and nothing is written to hardware here */
+{
+    int j;
+    uint32_t flags = (1ULL<<(1+(index*4))); /* bit 1 of the counter's field is always set */
+    for(j = 0; j < event->numberOfOptions; j++)
+    {
+        switch (event->options[j].type)
+        {
+            case EVENT_OPTION_COUNT_KERNEL:
+                flags |= (1ULL<<(index*4)); /* bit 0 of the field */
+                break;
+            case EVENT_OPTION_ANYTHREAD:
+                flags |= (1ULL<<(2+(index*4))); /* bit 2 of the field; falls through to default, which only breaks */
+            default:
+                break;
+        }
+    }
+    return flags;
+}
 
-void perfmon_init_nehalemEX(PerfmonThread *thread)
+int nex_pmc_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event) /* assemble and write the config register value for a core PMC event; for event 0xB7 also writes MSR_OFFCORE_RESP0; returns 0 */
 {
+    int j;
     uint64_t flags = 0x0ULL;
-    int cpu_id = thread->processorId;
-    perfmon_verbose = 1;
-    msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, 0x0ULL);
-    msr_write(cpu_id, MSR_PERFEVTSEL0, 0x0ULL);
-    msr_write(cpu_id, MSR_PERFEVTSEL1, 0x0ULL);
-    msr_write(cpu_id, MSR_PERFEVTSEL2, 0x0ULL);
-    msr_write(cpu_id, MSR_PERFEVTSEL3, 0x0ULL);
-    msr_write(cpu_id, MSR_PMC0, 0x0ULL);
-    msr_write(cpu_id, MSR_PMC1, 0x0ULL);
-    msr_write(cpu_id, MSR_PMC2, 0x0ULL);
-    msr_write(cpu_id, MSR_PMC3, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_FIXED_CTR0, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_FIXED_CTR1, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_FIXED_CTR2, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, 0x0ULL);
-    msr_write(cpu_id, MSR_PEBS_ENABLE, 0x0ULL);
-
-    /* initialize fixed counters
-     * FIXED 0: Instructions retired
-     * FIXED 1: Clocks unhalted core
-     * FIXED 2: Clocks unhalted ref */
-    //msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, 0x222ULL);
-
-    /* Preinit of PERFEVSEL registers */
-    //flags |= (1<<22);  /* enable flag */
-    //flags |= (1<<16);  /* user mode flag */
-
-    /*msr_write(cpu_id, MSR_PERFEVTSEL0, flags);
-    msr_write(cpu_id, MSR_PERFEVTSEL1, flags);
-    msr_write(cpu_id, MSR_PERFEVTSEL2, flags);
-    msr_write(cpu_id, MSR_PERFEVTSEL3, flags);*/
-
-    /* Initialize uncore */
-    /* MBOX */
-    thread->counters[PMC7].id  = 0;
-    thread->counters[PMC8].id  = 1;
-    thread->counters[PMC9].id  = 2;
-    thread->counters[PMC10].id = 3;
-    thread->counters[PMC11].id = 4;
-    thread->counters[PMC12].id = 5;
-    westmereEX_PMunits[MBOX0].ctrlRegister = MSR_M0_PMON_BOX_CTRL;
-    westmereEX_PMunits[MBOX0].statusRegister = MSR_M0_PMON_BOX_STATUS;
-    westmereEX_PMunits[MBOX0].ovflRegister = MSR_M0_PMON_BOX_OVF_CTRL;
-
-    thread->counters[PMC13].id = 0;
-    thread->counters[PMC14].id = 1;
-    thread->counters[PMC15].id = 2;
-    thread->counters[PMC16].id = 3;
-    thread->counters[PMC17].id = 4;
-    thread->counters[PMC18].id = 5;
-    westmereEX_PMunits[MBOX1].ctrlRegister = MSR_M1_PMON_BOX_CTRL;
-    westmereEX_PMunits[MBOX1].statusRegister = MSR_M1_PMON_BOX_STATUS;
-    westmereEX_PMunits[MBOX1].ovflRegister = MSR_M1_PMON_BOX_OVF_CTRL;
-
-    /* BBOX */
-    thread->counters[PMC19].id = 0;
-    thread->counters[PMC20].id = 1;
-    thread->counters[PMC21].id = 2;
-    thread->counters[PMC22].id = 3;
-    westmereEX_PMunits[BBOX0].ctrlRegister = MSR_B0_PMON_BOX_CTRL;
-    westmereEX_PMunits[BBOX0].statusRegister =  MSR_B0_PMON_BOX_STATUS;
-    westmereEX_PMunits[BBOX0].ovflRegister = MSR_B0_PMON_BOX_OVF_CTRL;
-
-    thread->counters[PMC23].id = 0;
-    thread->counters[PMC24].id = 1;
-    thread->counters[PMC25].id = 2;
-    thread->counters[PMC26].id = 3;
-    westmereEX_PMunits[BBOX1].ctrlRegister = MSR_B1_PMON_BOX_CTRL;
-    westmereEX_PMunits[BBOX1].statusRegister =  MSR_B1_PMON_BOX_STATUS;
-    westmereEX_PMunits[BBOX1].ovflRegister = MSR_B1_PMON_BOX_OVF_CTRL;
-
-    /* RBOX */
-    thread->counters[PMC27].id = 0;
-    thread->counters[PMC28].id = 1;
-    thread->counters[PMC29].id = 2;
-    thread->counters[PMC30].id = 3;
-    thread->counters[PMC31].id = 4;
-    thread->counters[PMC32].id = 5;
-    thread->counters[PMC33].id = 6;
-    thread->counters[PMC34].id = 7;
-    westmereEX_PMunits[RBOX0].ctrlRegister = MSR_R0_PMON_BOX_CTRL;
-    westmereEX_PMunits[RBOX0].statusRegister =  MSR_R0_PMON_BOX_STATUS;
-    westmereEX_PMunits[RBOX0].ovflRegister = MSR_R0_PMON_BOX_OVF_CTRL;
-
-    thread->counters[PMC35].id = 0;
-    thread->counters[PMC36].id = 1;
-    thread->counters[PMC37].id = 2;
-    thread->counters[PMC38].id = 3;
-    thread->counters[PMC39].id = 4;
-    thread->counters[PMC40].id = 5;
-    thread->counters[PMC41].id = 6;
-    thread->counters[PMC42].id = 7;
-    westmereEX_PMunits[RBOX1].ctrlRegister = MSR_R1_PMON_BOX_CTRL;
-    westmereEX_PMunits[RBOX1].statusRegister =  MSR_R1_PMON_BOX_STATUS;
-    westmereEX_PMunits[RBOX1].ovflRegister = MSR_R1_PMON_BOX_OVF_CTRL;
-
-    /* WBOX */
-    thread->counters[PMC43].id = 0;
-    thread->counters[PMC44].id = 1;
-    thread->counters[PMC45].id = 2;
-    thread->counters[PMC46].id = 3;
-    thread->counters[PMC47].id = 31;
-    westmereEX_PMunits[WBOX].ctrlRegister   = MSR_W_PMON_BOX_CTRL;
-    westmereEX_PMunits[WBOX].statusRegister = MSR_W_PMON_BOX_STATUS;
-    westmereEX_PMunits[WBOX].ovflRegister   = MSR_W_PMON_BOX_OVF_CTRL;
-
-    thread->counters[PMC48].id = 0;
-    westmereEX_PMunits[UBOX].ctrlRegister   = MSR_U_PMON_GLOBAL_CTRL;
-    westmereEX_PMunits[UBOX].statusRegister = MSR_U_PMON_GLOBAL_STATUS;
-    westmereEX_PMunits[UBOX].ovflRegister   = MSR_U_PMON_GLOBAL_OVF_CTRL;
-
-    /* Set IDs for all CBOXes */
-    for (int i=PMC49; i<=PMC88; i+= 5)
-    {
-        for(int j=0; j<5; j++)
+    uint64_t offcore_flags = 0x0ULL;
+    uint64_t reg = counter_map[index].configRegister;
+
+    flags |= (1ULL<<22)|(1ULL<<16); /* bit 22: enable flag, bit 16: user mode flag */
+    /* Intel with standard 8 bit event mask: [7:0] */
+    flags |= (event->umask<<8) + event->eventId;
+
+    if (event->cfgBits != 0 &&
+       ((event->eventId != 0xB7) && (event->eventId != 0xBB))) /* skip offcore events: '||' here was always true; for 0xB7 cfgBits/cmask are bit positions, see below */
+    {
+        /* set custom cfg and cmask */
+        flags |= ((event->cmask<<8) + event->cfgBits)<<16;
+    }
+
+    if (event->numberOfOptions > 0)
+    {
+        for (j = 0; j < event->numberOfOptions; j++)
         {
-            thread->counters[i].id = j;
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_COUNT_KERNEL:
+                    flags |= (1ULL<<17);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= (event->options[j].value & 0xFFULL)<<24; /* low 8 bits of the option value go to bits 24-31 */
+                    break;
+                case EVENT_OPTION_MATCH0:
+                    offcore_flags |= (event->options[j].value & 0xFFULL); /* low byte of the offcore value */
+                    break;
+                case EVENT_OPTION_MATCH1:
+                    offcore_flags |= (event->options[j].value & 0xF7ULL)<<8; /* second byte of the offcore value; mask 0xF7 leaves out bit 3 */
+                    break;
+                default:
+                    break;
+            }
         }
     }
-    westmereEX_PMunits[CBOX0].ctrlRegister   = MSR_C0_PMON_BOX_CTRL;
-    westmereEX_PMunits[CBOX0].statusRegister = MSR_C0_PMON_BOX_STATUS;
-    westmereEX_PMunits[CBOX0].ovflRegister   = MSR_C0_PMON_BOX_OVF_CTRL;
-    westmereEX_PMunits[CBOX1].ctrlRegister   = MSR_C1_PMON_BOX_CTRL;
-    westmereEX_PMunits[CBOX1].statusRegister = MSR_C1_PMON_BOX_STATUS;
-    westmereEX_PMunits[CBOX1].ovflRegister   = MSR_C1_PMON_BOX_OVF_CTRL;
-    westmereEX_PMunits[CBOX2].ctrlRegister   = MSR_C2_PMON_BOX_CTRL;
-    westmereEX_PMunits[CBOX2].statusRegister = MSR_C2_PMON_BOX_STATUS;
-    westmereEX_PMunits[CBOX2].ovflRegister   = MSR_C2_PMON_BOX_OVF_CTRL;
-    westmereEX_PMunits[CBOX3].ctrlRegister   = MSR_C3_PMON_BOX_CTRL;
-    westmereEX_PMunits[CBOX3].statusRegister = MSR_C3_PMON_BOX_STATUS;
-    westmereEX_PMunits[CBOX3].ovflRegister   = MSR_C3_PMON_BOX_OVF_CTRL;
-    westmereEX_PMunits[CBOX4].ctrlRegister   = MSR_C4_PMON_BOX_CTRL;
-    westmereEX_PMunits[CBOX4].statusRegister = MSR_C4_PMON_BOX_STATUS;
-    westmereEX_PMunits[CBOX4].ovflRegister   = MSR_C4_PMON_BOX_OVF_CTRL;
-    westmereEX_PMunits[CBOX5].ctrlRegister   = MSR_C5_PMON_BOX_CTRL;
-    westmereEX_PMunits[CBOX5].statusRegister = MSR_C5_PMON_BOX_STATUS;
-    westmereEX_PMunits[CBOX5].ovflRegister   = MSR_C5_PMON_BOX_OVF_CTRL;
-    westmereEX_PMunits[CBOX6].ctrlRegister   = MSR_C6_PMON_BOX_CTRL;
-    westmereEX_PMunits[CBOX6].statusRegister = MSR_C6_PMON_BOX_STATUS;
-    westmereEX_PMunits[CBOX6].ovflRegister   = MSR_C6_PMON_BOX_OVF_CTRL;
-    westmereEX_PMunits[CBOX7].ctrlRegister   = MSR_C7_PMON_BOX_CTRL;
-    westmereEX_PMunits[CBOX7].statusRegister = MSR_C7_PMON_BOX_STATUS;
-    westmereEX_PMunits[CBOX7].ovflRegister   = MSR_C7_PMON_BOX_OVF_CTRL;
-
-    thread->counters[PMC99].id = 0;
-    thread->counters[PMC100].id = 1;
-    thread->counters[PMC101].id = 2;
-    thread->counters[PMC102].id = 3;
-    westmereEX_PMunits[SBOX0].ctrlRegister   = MSR_S0_PMON_BOX_CTRL;
-    westmereEX_PMunits[SBOX0].statusRegister = MSR_S0_PMON_BOX_STATUS;
-    westmereEX_PMunits[SBOX0].ovflRegister   = MSR_S0_PMON_BOX_OVF_CTRL;
-    thread->counters[PMC103].id = 0;
-    thread->counters[PMC104].id = 1;
-    thread->counters[PMC105].id = 2;
-    thread->counters[PMC106].id = 3;
-    westmereEX_PMunits[SBOX1].ctrlRegister   = MSR_S1_PMON_BOX_CTRL;
-    westmereEX_PMunits[SBOX1].statusRegister = MSR_S1_PMON_BOX_STATUS;
-    westmereEX_PMunits[SBOX1].ovflRegister   = MSR_S1_PMON_BOX_OVF_CTRL;
-
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id) ||
-            lock_acquire((int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id))
-    {
-        msr_write(cpu_id, MSR_W_PMON_BOX_CTRL,  0x0ULL);
-        msr_write(cpu_id, MSR_W_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_W_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_W_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_W_PMON_EVNT_SEL3, 0x0ULL);
-        msr_write(cpu_id, MSR_W_PMON_FIXED_CTR, 0x0ULL);
-
-        msr_write(cpu_id, MSR_M0_PMON_BOX_CTRL,  0x0ULL);
-        msr_write(cpu_id, MSR_M0_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_M0_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_M0_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_M0_PMON_EVNT_SEL3, 0x0ULL);
-        msr_write(cpu_id, MSR_M0_PMON_EVNT_SEL4, 0x0ULL);
-        msr_write(cpu_id, MSR_M0_PMON_EVNT_SEL5, 0x0ULL);
-
-        msr_write(cpu_id, MSR_M1_PMON_BOX_CTRL,  0x0ULL);
-        msr_write(cpu_id, MSR_M1_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_M1_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_M1_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_M1_PMON_EVNT_SEL3, 0x0ULL);
-        msr_write(cpu_id, MSR_M1_PMON_EVNT_SEL4, 0x0ULL);
-        msr_write(cpu_id, MSR_M1_PMON_EVNT_SEL5, 0x0ULL);
-
-        msr_write(cpu_id, MSR_B0_PMON_BOX_CTRL,  0x0ULL);
-        msr_write(cpu_id, MSR_B0_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_B0_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_B0_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_B0_PMON_EVNT_SEL3, 0x0ULL);
-
-        msr_write(cpu_id, MSR_B1_PMON_BOX_CTRL,  0x0ULL);
-        msr_write(cpu_id, MSR_B1_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_B1_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_B1_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_B1_PMON_EVNT_SEL3, 0x0ULL);
-
-        msr_write(cpu_id, MSR_R0_PMON_BOX_CTRL,  0x0ULL);
-        msr_write(cpu_id, MSR_R0_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_R0_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_R0_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_R0_PMON_EVNT_SEL3, 0x0ULL);
-        msr_write(cpu_id, MSR_R0_PMON_EVNT_SEL4, 0x0ULL);
-        msr_write(cpu_id, MSR_R0_PMON_EVNT_SEL5, 0x0ULL);
-        msr_write(cpu_id, MSR_R0_PMON_EVNT_SEL6, 0x0ULL);
-        msr_write(cpu_id, MSR_R0_PMON_EVNT_SEL7, 0x0ULL);
-
-        msr_write(cpu_id, MSR_R1_PMON_BOX_CTRL,   0x0ULL);
-        msr_write(cpu_id, MSR_R1_PMON_EVNT_SEL8,  0x0ULL);
-        msr_write(cpu_id, MSR_R1_PMON_EVNT_SEL9,  0x0ULL);
-        msr_write(cpu_id, MSR_R1_PMON_EVNT_SEL10, 0x0ULL);
-        msr_write(cpu_id, MSR_R1_PMON_EVNT_SEL11, 0x0ULL);
-        msr_write(cpu_id, MSR_R1_PMON_EVNT_SEL12, 0x0ULL);
-        msr_write(cpu_id, MSR_R1_PMON_EVNT_SEL13, 0x0ULL);
-        msr_write(cpu_id, MSR_R1_PMON_EVNT_SEL14, 0x0ULL);
-        msr_write(cpu_id, MSR_R1_PMON_EVNT_SEL15, 0x0ULL);
-
-        msr_write(cpu_id, MSR_U_PMON_GLOBAL_EVNT_SEL, 0x0ULL);
-
-        msr_write(cpu_id, MSR_C0_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_C0_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_C0_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_C0_PMON_EVNT_SEL3, 0x0ULL);
-        msr_write(cpu_id, MSR_C0_PMON_EVNT_SEL4, 0x0ULL);
-
-        msr_write(cpu_id, MSR_C1_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_C1_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_C1_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_C1_PMON_EVNT_SEL3, 0x0ULL);
-        msr_write(cpu_id, MSR_C1_PMON_EVNT_SEL4, 0x0ULL);
-
-        msr_write(cpu_id, MSR_C2_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_C2_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_C2_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_C2_PMON_EVNT_SEL3, 0x0ULL);
-        msr_write(cpu_id, MSR_C2_PMON_EVNT_SEL4, 0x0ULL);
-
-        msr_write(cpu_id, MSR_C3_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_C3_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_C3_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_C3_PMON_EVNT_SEL3, 0x0ULL);
-        msr_write(cpu_id, MSR_C3_PMON_EVNT_SEL4, 0x0ULL);
-
-        msr_write(cpu_id, MSR_C4_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_C4_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_C4_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_C4_PMON_EVNT_SEL3, 0x0ULL);
-        msr_write(cpu_id, MSR_C4_PMON_EVNT_SEL4, 0x0ULL);
-
-        msr_write(cpu_id, MSR_C5_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_C5_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_C5_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_C5_PMON_EVNT_SEL3, 0x0ULL);
-        msr_write(cpu_id, MSR_C5_PMON_EVNT_SEL4, 0x0ULL);
-
-        msr_write(cpu_id, MSR_C6_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_C6_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_C6_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_C6_PMON_EVNT_SEL3, 0x0ULL);
-        msr_write(cpu_id, MSR_C6_PMON_EVNT_SEL4, 0x0ULL);
-
-        msr_write(cpu_id, MSR_C7_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_C7_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_C7_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_C7_PMON_EVNT_SEL3, 0x0ULL);
-        msr_write(cpu_id, MSR_C7_PMON_EVNT_SEL4, 0x0ULL);
-
-        msr_write(cpu_id, MSR_S0_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_S0_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_S0_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_S0_PMON_EVNT_SEL3, 0x0ULL);
-
-        msr_write(cpu_id, MSR_S1_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_S1_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_S1_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_S1_PMON_EVNT_SEL3, 0x0ULL);
-
-        flags = 0x0UL;
-        flags |= (1<<29); /* reset all */
-        msr_write(cpu_id, MSR_U_PMON_GLOBAL_CTRL, flags );
+    if (event->eventId == 0xB7)
+    {
+        if ((event->cfgBits != 0xFF) && (event->cmask != 0xFF))
+        {
+            offcore_flags = (1ULL<<event->cfgBits)|(1ULL<<event->cmask); /* cfgBits and cmask are bit positions here; this replaces anything set via MATCH0/MATCH1 */
+        }
+        VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP0, LLU_CAST offcore_flags, SETUP_PMC_OFFCORE);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP0, offcore_flags));
     }
+
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg, flags));
+    VERBOSEPRINTREG(cpu_id, reg, flags, SETUP_PMC)
+
+    return 0;
 }
 
+
 /* MBOX macros */
-#define MBOX_GATE_NEHEX(NUM)  \
-flags = 0x41ULL; \
-switch (event->cfgBits)  \
-{  \
-    case 0x00:   /* primary Event */  \
-        flags |= (event->eventId<<9);  \
-        break;  \
-    case 0x01: /* secondary Events */  \
-        /* TODO fvid index is missing defaults to 0 */   \
-        flags |= (1<<7); /* toggle flag mode */   \
-        flags |= (event->eventId<<19);   \
-        switch (event->eventId)   \
-        {   \
-            case 0x00: /* CYCLES_DSP_FILL: DSP */   \
+
+#define NEX_SETUP_MBOX(number)  /* configure one MBOX<number> counter: auxiliary MSR_M<number>_PMON_* registers first, then the counter config register */ \
+    if (haveLock && eventSet->regTypeMask & (REG_TYPE_MASK(MBOX##number))) /* expands inside a function providing cpu_id, event, eventSet, haveLock, flags and reg */ \
+    { \
+        flags = 0x41ULL; \
+        if ((event->numberOfOptions > 0) && ((event->cfgBits == 0x02) || (event->cfgBits == 0x04))) /* address match/mask options are honoured only for cfgBits 0x02 and 0x04 */ \
+        { \
+            for (int j=0; j < event->numberOfOptions; j++) \
+            {\
+                switch (event->options[j].type) \
+                { \
+                    case EVENT_OPTION_MATCH0: \
+                        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M##number##_PMON_ADDR_MATCH, (event->options[j].value & 0x3FFFFFFFFULL))); \
+                        VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_ADDR_MATCH, (event->options[j].value & 0x3FFFFFFFFULL), MBOX##number##_ADDR_MATCH) \
+                        break; \
+                    case EVENT_OPTION_MASK0: \
+                        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M##number##_PMON_ADDR_MASK, (event->options[j].value & 0x1FFFFFFC0ULL)>>6)); \
+                        VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_ADDR_MASK, (event->options[j].value & 0x1FFFFFFC0ULL)>>6, MBOX##number##_ADDR_MASK) \
+                        break; \
+                    default: \
+                        break; \
+                } \
+            } \
+        } \
+        switch (event->cfgBits) /* NOTE(review): no case 0x04 and no default below, such events keep flags = 0x41 - confirm this is intended */ \
+        {  \
+            case 0x00:   /* primary Event */  \
+                flags |= (event->eventId & 0x1FULL)<<9;  \
+                break;  \
+            case 0x01: /* secondary Events */  \
+                /* TODO fvid index is missing defaults to 0 */   \
+                flags |= (1ULL<<7); /* toggle flag mode */   \
+                flags |= (event->eventId & 0x7ULL)<<19;   \
+                switch (event->eventId)   \
+                {   \
+                    case 0x00: /* CYCLES_DSP_FILL: DSP */   \
+                        {   \
+                            uint64_t dsp_flags = 0x0ULL;   \
+                            CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_M##number##_PMON_DSP, &dsp_flags));   \
+                            VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_DSP, dsp_flags, MBOX##number##_DSP_READ); \
+                            dsp_flags |= (event->umask & 0xFULL)<<7;  \
+                            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M##number##_PMON_DSP, dsp_flags));   \
+                            VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_DSP, dsp_flags, MBOX##number##_DSP); \
+                        }   \
+                        break;   \
+                    case 0x01: /* CYCLES_SCHED_MODE: ISS */   \
+                        {   \
+                            uint64_t iss_flags = 0x0ULL;   \
+                            CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_M##number##_PMON_ISS, &iss_flags));   \
+                            VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_ISS, iss_flags, MBOX##number##_ISS_READ); \
+                            iss_flags |= (event->umask & 0x3ULL)<<4;   \
+                            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M##number##_PMON_ISS, iss_flags));   \
+                            VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_ISS, iss_flags, MBOX##number##_ISS); \
+                        }    \
+                        break;   \
+                    case 0x05: /* CYCLES_PGT_STATE: PGT */   \
+                        {   \
+                            uint64_t pgt_flags = 0x0ULL;   \
+                            CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_M##number##_PMON_PGT, &pgt_flags));   \
+                            VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_PGT, pgt_flags, MBOX##number##_PGT_READ); \
+                            pgt_flags |= (event->umask & 0x1ULL)<<6;   \
+                            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M##number##_PMON_PGT, pgt_flags));   \
+                            VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_PGT, pgt_flags, MBOX##number##_PGT); \
+                        }    \
+                        break;   \
+                    case 0x06: /* BCMD_SCHEDQ_OCCUPANCY: MAP */   \
+                        {   \
+                            uint64_t map_flags = 0x0ULL;   \
+                            CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_M##number##_PMON_MAP, &map_flags));   \
+                            VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_MAP, map_flags, MBOX##number##_MAP_READ); \
+                            map_flags |= (event->umask & 0xFULL)<<5;   \
+                            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M##number##_PMON_MAP, map_flags));   \
+                            VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_MAP, map_flags, MBOX##number##_MAP); \
+                        }   \
+                        break;   \
+                    case 0x04: /* CYCLES_RETRYQ_STARVED */ \
+                    case 0x03: /* CYCLES_RETRYQ_MFULL */ \
+                        break; \
+                }    \
+                break;   \
+            case 0x02: /* DRAM_CMD: PLD/ISS */   \
+                flags |= (event->eventId & 0x1FULL)<<9;  \
+                {   \
+                    uint64_t pld_flags = 0x0ULL;   \
+                    uint64_t iss_flags = 0x0ULL;   \
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_M##number##_PMON_PLD, &pld_flags));   \
+                    VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_PLD, pld_flags, MBOX##number##_PLD_READ); \
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_M##number##_PMON_ISS, &iss_flags));   \
+                    VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_ISS, iss_flags, MBOX##number##_ISS_READ); \
+                    pld_flags |= (event->umask & 0x1FULL)<<8;   \
+                    if ((event->cmask & 0xFULL) != 0) /* parenthesised: != binds tighter than &; low nibble of cmask feeds ISS */   \
+                    {   \
+                        iss_flags |= (event->cmask & 0x7ULL)<<7;   \
+                    }   \
+                    if ((event->cmask & 0xF0ULL) != 0) \
+                    { \
+                        pld_flags |= (1ULL<<0); /* toggle cmd flag */   \
+                    } \
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M##number##_PMON_PLD, pld_flags));   \
+                    VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_PLD, pld_flags, MBOX##number##_PLD); \
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M##number##_PMON_ISS, iss_flags));   \
+                    VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_ISS, iss_flags, MBOX##number##_ISS); \
+                }   \
+                break;   \
+            case 0x03: /* DSP_FILL: DSP */   \
+                flags |= (event->eventId & 0x1FULL)<<9;   \
                 {   \
                     uint64_t dsp_flags = 0x0ULL;   \
-                    dsp_flags |= (event->umask<<7);  \
-                    msr_write(cpu_id, MSR_M##NUM##_PMON_DSP, dsp_flags);   \
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_M##number##_PMON_DSP, &dsp_flags));   \
+                    VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_DSP, dsp_flags, MBOX##number##_DSP_READ); \
+                    dsp_flags |= (event->umask & 0xFULL)<<7;   \
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M##number##_PMON_DSP, dsp_flags));   \
+                    VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_DSP, dsp_flags, MBOX##number##_DSP); \
                 }   \
                 break;   \
-            case 0x01: /* CYCLES_SCHED_MODE: ISS */   \
+            case 0x05: /* FRM_TYPE: ISS */   \
+                flags |= (event->eventId & 0x1FULL)<<9;   \
+                if (((event->umask >= 0x0) && (event->umask <= 0x3)) || (event->umask == 0x8) || (event->umask == 0xC)) \
                 {   \
-                    uint32_t iss_flags = 0x0UL;   \
-                    iss_flags |= (event->umask<<4);   \
-                    msr_write(cpu_id, MSR_M##NUM##_PMON_ISS, iss_flags);   \
-                }    \
+                    uint64_t iss_flags = 0x0ULL;   \
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_M##number##_PMON_ISS, &iss_flags));   \
+                    VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_ISS, iss_flags, MBOX##number##_ISS_READ); \
+                    iss_flags |= event->umask & 0xFULL;   \
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M##number##_PMON_ISS, iss_flags));   \
+                    VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_ISS, iss_flags, MBOX##number##_ISS); \
+                }   \
                 break;   \
-            case 0x05: /* CYCLES_PGT_STATE: PGT */   \
+            case 0x06: /* FVC_EV0: FVC */   \
+                flags |= (event->eventId & 0x1FULL)<<9;   \
                 {   \
-                    uint32_t pgt_flags = 0x0UL;   \
-                    pgt_flags |= (event->umask<<6);   \
-                    msr_write(cpu_id, MSR_M##NUM##_PMON_PGT, pgt_flags);   \
-                }    \
+                    uint64_t fvc_flags = 0x0ULL;   \
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_M##number##_PMON_ZDP, &fvc_flags));   \
+                    VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_ZDP, fvc_flags, MBOX##number##_FVC_READ); \
+                    fvc_flags |= (event->umask & 0x7ULL)<<11;   \
+                    if (event->umask == 0x5)   \
+                    {   \
+                        fvc_flags |= (event->cmask & 0x7ULL)<<5;   \
+                    }   \
+                    else   \
+                    {   \
+                        fvc_flags |= (event->cmask & 0x7ULL)<<8;   \
+                    }   \
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M##number##_PMON_ZDP, fvc_flags));   \
+                    VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_ZDP, fvc_flags, MBOX##number##_FVC_EV0); \
+                }   \
                 break;   \
-            case 0x06: /* BCMD_SCHEDQ_OCCUPANCY: MAP */   \
+            case 0x07: /* FVC_EV1: FVC */   \
+                flags |= (event->eventId & 0x1FULL)<<9;   \
                 {   \
-                    uint32_t map_flags = 0x0UL;   \
-                    map_flags |= (event->umask<<6);   \
-                    msr_write(cpu_id, MSR_M##NUM##_PMON_MAP, map_flags);   \
+                    uint64_t fvc_flags = 0x0ULL;   \
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_M##number##_PMON_ZDP, &fvc_flags));   \
+                    VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_ZDP, fvc_flags, MBOX##number##_FVC_READ); \
+                    fvc_flags |= (event->umask & 0x7ULL)<<14;   \
+                    if (event->umask == 0x5)   \
+                    {   \
+                        fvc_flags |= (event->cmask & 0x7ULL)<<5;   \
+                    }   \
+                    else   \
+                    {   \
+                        fvc_flags |= (event->cmask & 0x7ULL)<<8;   \
+                    }   \
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M##number##_PMON_ZDP, fvc_flags));   \
+                    VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_ZDP, fvc_flags, MBOX##number##_FVC_EV1); \
                 }   \
                 break;   \
-        }    \
-    break;   \
-    case 0x02: /* DRAM_CMD: PLD/ISS */   \
-        flags |= (event->eventId<<9);   \
-        {   \
-            uint32_t pld_flags = 0x0UL;   \
-            uint32_t iss_flags = 0x0UL;   \
-            pld_flags |= (event->umask<<8);   \
-            if (event->cmask != 0)   \
-            {   \
-                iss_flags |= (event->cmask<<7);   \
-                pld_flags |= 1; /* toggle cmd flag */   \
-            }   \
-            msr_write(cpu_id, MSR_M##NUM##_PMON_PLD, pld_flags);   \
-            msr_write(cpu_id, MSR_M##NUM##_PMON_ISS, iss_flags);   \
-        }   \
-        break;   \
-    case 0x03: /* DSP_FILL: DSP */   \
-        flags |= (event->eventId<<9);   \
-        {   \
-            uint64_t dsp_flags = 0x0ULL;   \
-            dsp_flags |= (event->umask<<7);   \
-            msr_write(cpu_id, MSR_M##NUM##_PMON_DSP, dsp_flags);   \
-        }   \
-        break;   \
-    case 0x04: /* DRAM_MISC: PLD */   \
-        flags |= (event->eventId<<9);   \
-        {   \
-            uint64_t pld_flags = 0x0ULL;   \
-            switch (event->cmask)   \
-            {   \
-                case 0x0:   \
-                    pld_flags |= (1<<16);   \
-                    pld_flags |= (event->umask<<19);   \
-                    break;   \
-                case 0x1:   \
-                    pld_flags |= (event->umask<<18);   \
-                    break;   \
-                case 0x2:   \
-                    pld_flags |= (event->umask<<17);   \
-                    break;   \
-                case 0x3:   \
-                    pld_flags |= (event->umask<<7);   \
-                    break;   \
-            }   \
-            msr_write(cpu_id, MSR_M##NUM##_PMON_PLD, pld_flags);   \
-        }   \
-        break;   \
-    case 0x05: /* FRM_TYPE: ISS */   \
-        flags |= (event->eventId<<9);   \
-        {   \
-            uint32_t iss_flags = 0x0UL;   \
-            iss_flags |= event->umask;   \
-            msr_write(cpu_id, MSR_M##NUM##_PMON_ISS, iss_flags);   \
-        }   \
-    break;   \
-    case 0x06: /* FVC_EV0: FVC */   \
-        flags |= (event->eventId<<9);   \
-        {   \
-            uint32_t fvc_flags = 0x0UL;   \
-            fvc_flags |= (event->umask<<11);   \
-            if (event->umask == 0x5)   \
-            {   \
-                fvc_flags |= (event->cmask<<5);   \
-            }   \
-            else   \
-            {   \
-                fvc_flags |= (event->cmask<<8);   \
-            }   \
-            msr_write(cpu_id, MSR_M##NUM##_PMON_ZDP, fvc_flags);   \
-            VERBOSEPRINTREG(cpu_id, MSR_M##NUM##_PMON_ZDP, fvc_flags, FVC_EV0) \
-        }   \
-        break;   \
-    case 0x07: /* FVC_EV1: FVC */   \
-        flags |= (event->eventId<<9);   \
-        {   \
-            uint32_t fvc_flags = 0x0UL;   \
-            fvc_flags |= (event->umask<<14);   \
-            if (event->umask == 0x5)   \
-            {   \
-                fvc_flags |= (event->cmask<<5);   \
-            }   \
-            else   \
-            {   \
-                fvc_flags |= (event->cmask<<8);   \
-            }   \
-            msr_write(cpu_id, MSR_M##NUM##_PMON_ZDP, fvc_flags);   \
-            VERBOSEPRINTREG(cpu_id, MSR_M##NUM##_PMON_ZDP, fvc_flags, FVC_EV1) \
-        }   \
-        break;   \
-    case 0x08: /* FVC_EV2: FVC */   \
-        flags |= (event->eventId<<9);   \
-        {   \
-            uint32_t fvc_flags = 0x0UL;   \
-            fvc_flags |= (event->umask<<17);   \
-            if (event->umask == 0x5)   \
-            {   \
-                fvc_flags |= (event->cmask<<5);   \
-            }   \
-            else   \
-            {   \
-                fvc_flags |= (event->cmask<<8);   \
-            }   \
-            msr_write(cpu_id, MSR_M##NUM##_PMON_ZDP, fvc_flags);   \
-            VERBOSEPRINTREG(cpu_id, MSR_M##NUM##_PMON_ZDP, fvc_flags, FVC_EV2) \
-        }   \
-        break;   \
-    case 0x09: /* FVC_EV3: FVC(ZDP) */   \
-    flags |= (event->eventId<<9);   \
-    {   \
-        uint32_t fvc_flags = 0x0UL;   \
-        fvc_flags |= (event->umask<<20);   \
-        if (event->umask == 0x5)   \
-        {   \
-            fvc_flags |= (event->cmask<<5);   \
-        }   \
-        else   \
-        {   \
-            fvc_flags |= (event->cmask<<8);   \
-        }   \
-        msr_write(cpu_id, MSR_M##NUM##_PMON_ZDP, fvc_flags);   \
-    }   \
-    break;   \
-    case 0x0A: /* ISS_SCHED: ISS */   \
-    flags |= (event->eventId<<9);   \
-    {   \
-        uint32_t iss_flags = 0x0UL;   \
-        iss_flags |= (event->umask<<10);   \
-        msr_write(cpu_id, MSR_M##NUM##_PMON_ISS, iss_flags);   \
-    }   \
-    break;   \
-    case 0x0B: /* PGT_PAGE_EV: PGT */   \
-    flags |= (event->eventId<<9);   \
-    {   \
-        uint32_t pgt_flags = 0x0UL;   \
-        pgt_flags |= event->umask;   \
-        msr_write(cpu_id, MSR_M##NUM##_PMON_PGT, pgt_flags);   \
-    }   \
-    break;   \
-    case 0x0C: /* PGT_PAGE_EV2: PGT */   \
-    flags |= (event->eventId<<9);   \
-    {   \
-        uint32_t pgt_flags = 0x0UL;   \
-        pgt_flags |= (event->umask<<11);   \
-        msr_write(cpu_id, MSR_M##NUM##_PMON_PGT, pgt_flags);   \
-    }   \
-    break;   \
-    case 0x0D: /* THERM_TRP_DN: THR */   \
-    flags |= (event->eventId<<9);   \
-    {   \
-        uint32_t thr_flags = 0x0UL;   \
-        thr_flags |= (1<<3);   \
-        thr_flags |= (event->umask<<9);   \
-        msr_write(cpu_id, MSR_M##NUM##_PMON_PGT, thr_flags);   \
-    }   \
-    break;   \
-}
+            case 0x08: /* FVC_EV2: FVC */   \
+                flags |= (event->eventId & 0x1FULL)<<9;   \
+                {   \
+                    uint64_t fvc_flags = 0x0ULL;   \
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_M##number##_PMON_ZDP, &fvc_flags));   \
+                    VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_ZDP, fvc_flags, MBOX##number##_FVC_READ); \
+                    fvc_flags |= (event->umask & 0x7ULL)<<17;   \
+                    if (event->umask == 0x5)   \
+                    {   \
+                        fvc_flags |= (event->cmask & 0x7ULL)<<5;   \
+                    }   \
+                    else   \
+                    {   \
+                        fvc_flags |= (event->cmask & 0x7ULL)<<8;   \
+                    }   \
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M##number##_PMON_ZDP, fvc_flags));   \
+                    VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_ZDP, fvc_flags, MBOX##number##_FVC_EV2); \
+                }   \
+                break;   \
+            case 0x09: /* FVC_EV3: FVC(ZDP) */   \
+                flags |= (event->eventId & 0x1FULL)<<9;   \
+                {   \
+                    uint64_t fvc_flags = 0x0ULL;   \
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_M##number##_PMON_ZDP, &fvc_flags));   \
+                    VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_ZDP, fvc_flags, MBOX##number##_FVC_READ); \
+                    fvc_flags |= (event->umask & 0x7ULL)<<20; /* 3-bit field, masked like FVC_EV0..2 */   \
+                    if (event->umask == 0x5)   \
+                    {   \
+                        fvc_flags |= (event->cmask & 0x7ULL)<<5;   \
+                    }   \
+                    else   \
+                    {   \
+                        fvc_flags |= (event->cmask & 0x7ULL)<<8;   \
+                    }   \
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M##number##_PMON_ZDP, fvc_flags));   \
+                    VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_ZDP, fvc_flags, MBOX##number##_FVC_EV3); \
+                }   \
+                break;   \
+            case 0x16: /* PGT_PAGE_EV: PGT */   \
+                flags |= (event->eventId & 0x1FULL)<<9;   \
+                {   \
+                    uint64_t pgt_flags = 0x0ULL;   \
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_M##number##_PMON_PGT, &pgt_flags));   \
+                    VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_PGT, pgt_flags, MBOX##number##_PGT_READ); \
+                    pgt_flags |= (event->umask & 0x1ULL);   \
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M##number##_PMON_PGT, pgt_flags));   \
+                    VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_PGT, pgt_flags, MBOX##number##_PGT); \
+                }   \
+                break;   \
+            case 0x0D: /* THERM_TRP_DN: THR */   \
+                flags |= (event->eventId & 0x1FULL)<<9;   \
+                {   \
+                    uint64_t thr_flags = 0x0ULL;   \
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_M##number##_PMON_MSC_THR, &thr_flags));   \
+                    VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_MSC_THR, thr_flags, MBOX##number##_THR_READ); \
+                    if (event->cmask == 0x0) \
+                    { \
+                        thr_flags |= (1ULL<<3);   \
+                    } \
+                    else \
+                    { \
+                        thr_flags &= ~(1ULL<<3);   \
+                        thr_flags |= (event->cmask & 0x7ULL) << 4; \
+                    } \
+                    thr_flags |= (event->umask & 0x3ULL)<<9;   \
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M##number##_PMON_MSC_THR, thr_flags));   \
+                    VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_MSC_THR, thr_flags, MBOX##number##_THR); \
+                }   \
+                break;   \
+            case 0x0E: /* THERM_TRP_UP: THR */   \
+                flags |= (event->eventId & 0x1FULL)<<9;   \
+                {   \
+                    uint64_t thr_flags = 0x0ULL;   \
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_M##number##_PMON_MSC_THR, &thr_flags));   \
+                    VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_MSC_THR, thr_flags, MBOX##number##_THR_READ); \
+                    if (event->cmask == 0x0) \
+                    { \
+                        thr_flags |= (1ULL<<3);   \
+                    } \
+                    else \
+                    { \
+                        thr_flags &= ~(1ULL<<3);   \
+                        thr_flags |= (event->cmask & 0x7ULL) << 4; \
+                    } \
+                    thr_flags |= (event->umask & 0x3ULL)<<7;   \
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M##number##_PMON_MSC_THR, thr_flags));   \
+                    VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_MSC_THR, thr_flags, MBOX##number##_THR); \
+                }   \
+                break;   \
+        } \
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg, flags)); /* last step: program the counter's own config register */  \
+        VERBOSEPRINTREG(cpu_id, reg, flags, SETUP_MBOX##number) \
+    }
 
+#define NEX_SETUP_RBOX(number)  /* configure one RBOX<number> counter: IPERF or QLX sub-register first, then the counter config register */ \
+    if (haveLock && eventSet->regTypeMask & (REG_TYPE_MASK(RBOX##number))) /* expands inside a function providing cpu_id, event, eventSet, haveLock, flags and reg */ \
+    { \
+        flags = 0x01ULL; /* set local enable flag */ \
+        switch (event->eventId) { /* 0x00 programs an IPERF register, 0x01 a QLX register; other ids only get the enable flag */  \
+            case 0x00:  \
+                flags |= (event->umask & 0x1FULL)<<1; /* configure sub register */   \
+                {  \
+                    uint32_t iperf_flags = 0x0UL;   \
+                    iperf_flags |= (event->cfgBits<<event->cmask); /* configure event */  \
+                    switch (event->umask) { /* pick correct iperf register */  \
+                        case 0x00: \
+                            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R##number##_PMON_IPERF0_P0, iperf_flags));   \
+                            break; \
+                        case 0x01: \
+                            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R##number##_PMON_IPERF1_P0, iperf_flags));   \
+                            break; \
+                        case 0x06: \
+                            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R##number##_PMON_IPERF0_P1, iperf_flags));   \
+                            break; \
+                        case 0x07: \
+                            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R##number##_PMON_IPERF1_P1, iperf_flags));   \
+                            break; \
+                        case 0x0C: \
+                            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R##number##_PMON_IPERF0_P2, iperf_flags));   \
+                            break; \
+                        case 0x0D: \
+                            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R##number##_PMON_IPERF1_P2, iperf_flags));   \
+                            break; \
+                        case 0x12: \
+                            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R##number##_PMON_IPERF0_P3, iperf_flags));   \
+                            break; \
+                        case 0x13: \
+                            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R##number##_PMON_IPERF1_P3, iperf_flags));   \
+                            break; \
+                    } /* any other umask: no IPERF register is written */ \
+                } \
+                break; \
+            case 0x01: \
+                flags |= (event->umask & 0x1FULL)<<1; /* configure sub register */   \
+                { \
+                    uint32_t qlx_flags = 0x0UL;   \
+                    qlx_flags |= (event->cfgBits); /* configure event */  \
+                    if (event->cmask) qlx_flags |= (event->cmask & 0x7ULL)<<4;  \
+                    switch (event->umask) { /* pick correct qlx register */  \
+                        case 0x02: \
+                            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R##number##_PMON_QLX_P0, qlx_flags));   \
+                            break; \
+                        case 0x03: /* same register as umask 0x02, value shifted up by 8 bits */ \
+                            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R##number##_PMON_QLX_P0, (qlx_flags<<8)));   \
+                            break; \
+                        case 0x08: \
+                            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R##number##_PMON_QLX_P1, qlx_flags));   \
+                            break; \
+                        case 0x09: /* same register as umask 0x08, value shifted up by 8 bits */ \
+                            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R##number##_PMON_QLX_P1, (qlx_flags<<8)));   \
+                            break; \
+                        case 0x0E: \
+                            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R##number##_PMON_QLX_P2, qlx_flags));   \
+                            break; \
+                        case 0x0F: /* same register as umask 0x0E, value shifted up by 8 bits */ \
+                            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R##number##_PMON_QLX_P2, (qlx_flags<<8)));   \
+                            break; \
+                        case 0x14: \
+                            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R##number##_PMON_QLX_P3, qlx_flags));   \
+                            break; \
+                        case 0x15: /* same register as umask 0x14, value shifted up by 8 bits */ \
+                            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R##number##_PMON_QLX_P3, (qlx_flags<<8)));   \
+                            break; \
+                    } /* any other umask: no QLX register is written */ \
+                } \
+                break; \
+        } \
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg , flags)); /* last step: program the counter's own config register */ \
+        VERBOSEPRINTREG(cpu_id, reg, flags, SETUP_RBOX##number) \
+    }
 
-void perfmon_setupCounterThread_nehalemEX(
-        int thread_id,
-        PerfmonEvent* event,
-        PerfmonCounterIndex index)
+int nex_bbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event) /* program the BBOX counter `index` for `event`; returns 0 */
 {
-    uint64_t flags = 0x0ULL;
-    int haveLock = 0;
+    int j; /* index into event->options */
+    uint64_t flags = 0x1ULL; /* set enable bit */
     uint64_t reg = counter_map[index].configRegister;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
-    uint64_t fixed_flags = msr_read(cpu_id, MSR_PERF_FIXED_CTR_CTRL);
-    perfmon_threadData[thread_id].counters[index].init = TRUE;
+    RegisterType type = counter_map[index].type; /* used to look up the filter registers in box_map */
 
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id) /* only the CPU recorded in socket_lock for its node writes registers */
     {
-        haveLock = 1;
+        return 0; /* other CPUs succeed without writing anything */
     }
 
-    switch (counter_map[index].type)
+    flags |= (event->eventId<<1); /* event id sits directly above the enable bit */
+    if (event->numberOfOptions > 0)
     {
-        case PMC:
-            flags = (1<<22)|(1<<16);
-
-            /* Intel with standard 8 bit event mask: [7:0] */
-            flags |= (event->umask<<8) + event->eventId;
-
-            if (event->cfgBits != 0) /* set custom cfg and cmask */
+        for (j = 0; j < event->numberOfOptions; j++)
+        {
+            switch (event->options[j].type)
             {
-                flags &= ~(0xFFFFU<<16);  /* clear upper 16bits */
-                flags |= ((event->cmask<<8) + event->cfgBits)<<16;
+                case EVENT_OPTION_MATCH0: /* match value goes to the box's first filter register, masked */
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[type].filterRegister1, event->options[j].value & 0xFFFFFFFFFFFFFFFULL));
+                    VERBOSEPRINTREG(cpu_id, box_map[type].filterRegister1, LLU_CAST event->options[j].value & 0xFFFFFFFFFFFFFFFULL, SETUP_BBOX_MATCH);
+                    break;
+                case EVENT_OPTION_MASK0: /* mask value goes to the box's second filter register, masked */
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[type].filterRegister2, event->options[j].value & 0xFFFFFFFFFFFFFFFULL));
+                    VERBOSEPRINTREG(cpu_id, box_map[type].filterRegister2, LLU_CAST event->options[j].value & 0xFFFFFFFFFFFFFFFULL, SETUP_BBOX_MASK);
+                    break;
+                default: /* all other option types are ignored for this box */
+                    break;
             }
+        }
+    }
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg , flags)); /* NOTE(review): macro presumably returns early on a failed write - confirm against its definition */
+    VERBOSEPRINTREG(cpu_id, reg, flags, SETUP_BBOX);
+    return 0;
+}
 
-            msr_write(cpu_id, reg , flags);
-            VERBOSEPRINTREG(cpu_id, reg, flags, PMC_EV_SEL)
-            break;
+int nex_cbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event) /* program the CBOX counter `index` for `event`; returns 0 */
+{
+    int j; /* index into event->options */
+    uint64_t flags = 0x0ULL;
+    uint64_t reg = counter_map[index].configRegister;
 
-        case FIXED:
-            fixed_flags |= (0x2<<(index*4));
-            msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, fixed_flags);
-            break;
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id) /* only the CPU recorded in socket_lock for its node writes registers */
+    {
+        return 0; /* other CPUs succeed without writing anything */
+    }
 
-        case MBOX0:
-            if (haveLock)
+    flags = (1ULL<<22); /* bit 22 is always set */
+    flags |=(event->umask<<8) + event->eventId; /* umask starts at bit 8, event id occupies the low bits */
+    if (event->numberOfOptions > 0)
+    {
+        for (j = 0; j < event->numberOfOptions; j++)
+        {
+            switch (event->options[j].type)
             {
-                MBOX_GATE_NEHEX(0);
-                msr_write(cpu_id, reg , flags);
-                VERBOSEPRINTREG(cpu_id, reg, flags, MBOX0_CTRL)
+                case EVENT_OPTION_EDGE: /* sets bit 18 */
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_INVERT: /* sets bit 23 */
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_THRESHOLD: /* low 5 bits of the option value, placed at bit 24 */
+                    flags |= ((event->options[j].value & 0x1FULL) << 24);
+                    break;
+                default: /* all other option types are ignored for this box */
+                    break;
             }
-            break;
+        }
+    }
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg , flags)); /* NOTE(review): macro presumably returns early on a failed write - confirm against its definition */
+    VERBOSEPRINTREG(cpu_id, reg, flags, SETUP_CBOX);
+    return 0;
+}
 
-        case MBOX1:
-            if (haveLock)
-            {
-                MBOX_GATE_NEHEX(1);
-                msr_write(cpu_id, reg , flags);
-                VERBOSEPRINTREG(cpu_id, reg, flags, MBOX1_CTRL)
-            }
-            break;
+int nex_wbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event) /* program the WBOX counter `index` for `event`; returns 0 */
+{
+    uint64_t flags = 0x0ULL;
+    uint64_t reg = counter_map[index].configRegister;
+    int j; /* index into event->options */
 
-        case BBOX0:
-        case BBOX1:
-            if (haveLock)
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id) /* only the CPU recorded in socket_lock for its node writes registers */
+    {
+        return 0; /* other CPUs succeed without writing anything */
+    }
+    flags |= (1ULL<<22); /* set enable bit */
+    flags |= (event->umask<<8) + event->eventId; /* umask starts at bit 8, event id occupies the low bits */
+    if (event->numberOfOptions > 0)
+    {
+        for (j = 0; j < event->numberOfOptions; j++)
+        {
+            switch (event->options[j].type)
             {
-                flags = 0x1ULL; /* set enable bit */
-                flags |=  (event->eventId<<1);
-                msr_write(cpu_id, reg , flags);
-                VERBOSEPRINTREG(cpu_id, reg, flags, BBOX_CTRL)
+                case EVENT_OPTION_EDGE: /* sets bit 18 */
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_INVERT: /* sets bit 23 */
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_THRESHOLD: /* low 8 bits of the option value, placed at bit 24 */
+                    flags |= ((event->options[j].value & 0xFFULL) << 24);
+                    break;
+                default: /* all other option types are ignored for this box */
+                    break;
             }
-            break;
+        }
+    }
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg , flags)); /* NOTE(review): macro presumably returns early on a failed write - confirm against its definition */
+    VERBOSEPRINTREG(cpu_id, reg, flags, SETUP_WBOX);
+    return 0;
+}
 
-        case RBOX0:
-            if (haveLock)
-            {
-                RBOX_GATE(0);
-                msr_write(cpu_id, reg , flags);
-                VERBOSEPRINTREG(cpu_id, reg, flags, RBOX0_CTRL)
-            }
-            break;
+int nex_sbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+    int j;
+    int match_mask = 0;
+    uint64_t flags = 0x0ULL;
+    uint64_t reg = counter_map[index].configRegister;
+    RegisterType type = counter_map[index].type;
 
-        case RBOX1:
-            if (haveLock)
-            {
-                RBOX_GATE(1);
-                msr_write(cpu_id, reg , flags);
-                VERBOSEPRINTREG(cpu_id, reg, flags, RBOX1_CTRL)
-            }
-            break;
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+    {
+        return 0;
+    }
 
-        case WBOX:
-            if (haveLock)
+    flags = (1ULL<<22);
+    flags |=(event->umask<<8) + event->eventId;
+    if (event->numberOfOptions > 0)
+    {
+        if (event->eventId == 0x0)
+        {
+            for (j = 0; j < event->numberOfOptions; j++)
             {
-                if (event->eventId == 0xFF)  /* Fixed Counter */
+                if ((event->options[j].type == EVENT_OPTION_MATCH0) ||
+                    (event->options[j].type == EVENT_OPTION_MASK0))
                 {
-                    flags = 0x1ULL; /* set enable bit */
+                    match_mask = 1;
+                    break;
+                }
+            }
+            if (match_mask) {
+                
+                if (type == SBOX0)
+                {
+                    VERBOSEPRINTREG(cpu_id, MSR_S0_PMON_MM_CFG, 0x0ULL, CLEAR_MM_CFG);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_S0_PMON_MM_CFG, 0x0ULL));
                 }
                 else
                 {
-                    flags |= (1<<22); /* set enable bit */
-                    flags |= (event->umask<<8) + event->eventId;
+                    VERBOSEPRINTREG(cpu_id, MSR_S1_PMON_MM_CFG, 0x0ULL, CLEAR_MM_CFG);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_S1_PMON_MM_CFG, 0x0ULL));
                 }
-                msr_write(cpu_id, reg , flags);
-                VERBOSEPRINTREG(cpu_id, reg, flags, WBOX_CTRL)
             }
-            break;
-
-        case UBOX:
-            if (haveLock)
+        }
+        for (j = 0; j < event->numberOfOptions; j++)
+        {
+            switch (event->options[j].type)
             {
-                flags = 0x0ULL;
-                flags |= (1<<22);
-                flags |= event->eventId;
-                fprintf(stderr, "Setup UBOX with value 0x%llx in register 0x%llx, event 0x%x \n", LLU_CAST flags, LLU_CAST reg,event->eventId);
-                msr_write(cpu_id, reg , flags);
-                VERBOSEPRINTREG(cpu_id, reg, flags, UBOX_CTRL)
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= ((event->options[j].value & 0xFFULL) << 24);
+                    break;
+                case EVENT_OPTION_MATCH0:
+                    if (event->eventId == 0x0)
+                    {
+                        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[type].filterRegister1, event->options[j].value));
+                        VERBOSEPRINTREG(cpu_id, box_map[type].filterRegister1, LLU_CAST event->options[j].value, SETUP_SBOX_MATCH);
+                    }
+                    break;
+                case EVENT_OPTION_MASK0:
+                    if (event->eventId == 0x0)
+                    {
+                        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[type].filterRegister2, event->options[j].value));
+                        VERBOSEPRINTREG(cpu_id, box_map[type].filterRegister2, LLU_CAST event->options[j].value, SETUP_SBOX_MASK);
+                    }
+                    break;
+                default:
+                    break;
             }
-            break;
-
-        case CBOX0:
-        case CBOX1:
-        case CBOX2:
-        case CBOX3:
-        case CBOX4:
-        case CBOX5:
-        case CBOX6:
-        case CBOX7:
-            if (haveLock)
+        }
+        if (match_mask)
+        {
+            if (type == SBOX0)
             {
-                flags = 0x0ULL;
-                flags |= (1<<22);
-                flags |= (event->umask<<8) + event->eventId;
-                fprintf(stderr, "Setup CBOX with value 0x%llx in register 0x%llx, event 0x%x umask 0x%x \n", LLU_CAST flags, LLU_CAST reg,event->eventId, event->umask);
-                msr_write(cpu_id, reg , flags);
-                VERBOSEPRINTREG(cpu_id, reg, flags, CBOX_CTRL)
+                VERBOSEPRINTREG(cpu_id, MSR_S0_PMON_MM_CFG, (1ULL<<63), SET_MM_CFG);
+                CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_S0_PMON_MM_CFG, (1ULL<<63)));
             }
-            break;
-        case SBOX0:
-        case SBOX1:
-            if (haveLock)
+            else
             {
-                flags = 0x0ULL;
-                flags |= (1<<22);
-                flags |= (event->umask<<8);
-                flags |= (event->eventId);
-                msr_write(cpu_id, reg , flags);
-                VERBOSEPRINTREG(cpu_id, reg, flags, SBOX_CTRL)
+                VERBOSEPRINTREG(cpu_id, MSR_S1_PMON_MM_CFG, (1ULL<<63), SET_MM_CFG);
+                CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_S1_PMON_MM_CFG, (1ULL<<63)));
             }
-            break;
-
-        default:
-            /* should never be reached */
-            break;
+        }
     }
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg , flags));
+    VERBOSEPRINTREG(cpu_id, reg, flags, SETUP_SBOX);
+    return 0;
 }
 
+#define NEX_FREEZE_UNCORE \
+    if (haveLock && (eventSet->regTypeMask & ~(0xF))) \
+    { \
+        uint64_t tmp = 0x0ULL; \
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_CTRL, &tmp)); \
+        tmp &= ~(1ULL<<28); \
+        VERBOSEPRINTREG(cpu_id, MSR_U_PMON_GLOBAL_CTRL, LLU_CAST tmp, FREEZE_UNCORE) \
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_CTRL, tmp)); \
+    }
 
-/* Actions for Performance Monitoring Session:
- *
- * Core Counters (counter is always enabled in PERVSEL register):
- * 1) Disable counters in global ctrl Register MSR_PERF_GLOBAL_CTRL
- * 2) Zero according counter registers
- * 3) Set enable bit in global register flag
- * 4) Write global register flag
- *
- * Uncore Counters (only one core per socket):
- * 1) Set reset flag in global U Box control register
- * 2) Zero according counter registers
- * 3) Set enable bit in Box control register
- * 4) Write according uncore Box ctrl register
- * 3) Set enable bit in global U Box control register
- * */
-
-void perfmon_startCountersThread_nehalemEX(int thread_id)
+
+int perfmon_setupCounterThread_nehalemEX(int thread_id, PerfmonEventSet* eventSet)
 {
     int haveLock = 0;
+    int haveTileLock = 0;
     uint64_t flags = 0x0ULL;
-    uint32_t uflags[NUM_UNITS];
-    int enable_ubox = 0;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
-
-    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
+    uint64_t fixed_flags = 0x0ULL;
+    uint64_t ubox_flags = 0x0ULL;
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
     if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
     {
-        uint32_t ubflags = 0x0UL;
-        ubflags |= (1<<29); /* reset all */
         haveLock = 1;
-        //        msr_write(cpu_id, MSR_U_PMON_GLOBAL_CTRL, ubflags );
-        //       VERBOSEPRINTREG(cpu_id, MSR_U_PMON_GLOBAL_CTRL, ubflags, UBOX_GLOBAL_CTRL)
+    }
+    if (tile_lock[affinity_thread2tile_lookup[cpu_id]] == cpu_id)
+    {
+        haveTileLock = 1;
+    }
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)))
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL, FREEZE_PMC_AND_FIXED)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+    }
+
+    if (haveLock && (eventSet->regTypeMask & ~(0xFULL)))
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_U_PMON_GLOBAL_CTRL, 0x0ULL, FREEZE_UNCORE)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_CTRL, 0x0ULL));
+    }
+    if (haveLock && (eventSet->regTypeMask & (REG_TYPE_MASK(MBOX0))))
+    {
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M0_PMON_TIMESTAMP, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M0_PMON_DSP, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M0_PMON_ISS, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M0_PMON_MAP, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M0_PMON_MSC_THR, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M0_PMON_PGT, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M0_PMON_PLD, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M0_PMON_ZDP, 0x0ULL));
+    }
+    if (haveLock && (eventSet->regTypeMask & (REG_TYPE_MASK(MBOX1))))
+    {
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M1_PMON_TIMESTAMP, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M1_PMON_DSP, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M1_PMON_ISS, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M1_PMON_MAP, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M1_PMON_MSC_THR, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M1_PMON_PGT, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M1_PMON_PLD, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M1_PMON_ZDP, 0x0ULL));
+    }
+    if (haveLock && (eventSet->regTypeMask & (REG_TYPE_MASK(RBOX0))))
+    {
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_IPERF0_P0, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_IPERF0_P1, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_IPERF0_P2, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_IPERF0_P3, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_IPERF1_P0, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_IPERF1_P1, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_IPERF1_P2, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_IPERF1_P3, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_QLX_P0, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_QLX_P1, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_QLX_P2, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_QLX_P3, 0x0ULL));
+    }
+    if (haveLock && (eventSet->regTypeMask & (REG_TYPE_MASK(RBOX1))))
+    {
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_IPERF0_P0, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_IPERF0_P1, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_IPERF0_P2, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_IPERF0_P3, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_IPERF1_P0, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_IPERF1_P1, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_IPERF1_P2, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_IPERF1_P3, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_QLX_P0, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_QLX_P1, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_QLX_P2, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_QLX_P3, 0x0ULL));
     }
 
-    for ( int i=0; i<NUM_UNITS; i++ )
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        uflags[i] = 0x0UL;
+        RegisterType type = eventSet->events[i].type;
+        if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+        {
+            continue;
+        }
+        flags = 0x0ULL;
+        RegisterIndex index = eventSet->events[i].index;
+        PerfmonEvent *event = &(eventSet->events[i].event);
+        uint64_t reg = counter_map[index].configRegister;
+        eventSet->events[i].threadCounter[thread_id].init = TRUE;
+        switch (type)
+        {
+            case PMC:
+                nex_pmc_setup(cpu_id, index, event);
+                break;
+
+            case FIXED:
+                fixed_flags |= nex_fixed_setup(cpu_id, index, event);
+                break;
+
+            case MBOX0:
+                NEX_SETUP_MBOX(0);
+                break;
+
+            case MBOX1:
+                NEX_SETUP_MBOX(1);
+                break;
+
+            case BBOX0:
+            case BBOX1:
+                nex_bbox_setup(cpu_id, index, event);
+                break;
+
+            case RBOX0:
+                NEX_SETUP_RBOX(0);
+                break;
+
+            case RBOX1:
+                NEX_SETUP_RBOX(1);
+                break;
+
+            case CBOX0:
+            case CBOX1:
+            case CBOX2:
+            case CBOX3:
+            case CBOX4:
+            case CBOX5:
+            case CBOX6:
+            case CBOX7:
+            case CBOX8:
+            case CBOX9:
+                nex_cbox_setup(cpu_id, index, event);
+                break;
+
+            case SBOX0:
+            case SBOX1:
+                nex_sbox_setup(cpu_id, index, event);
+                break;
+
+            case WBOX:
+                nex_wbox_setup(cpu_id, index, event);
+                break;
+
+            case WBOX0FIX:
+                if (haveLock && eventSet->regTypeMask & (REG_TYPE_MASK(WBOX0FIX)))
+                {
+                    flags = 0x1ULL;
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg , flags));
+                    VERBOSEPRINTREG(cpu_id, reg, flags, SETUP_WBOXFIX)
+                    eventSet->regTypeMask |= REG_TYPE_MASK(WBOX);
+                }
+                break;
+
+            case UBOX:
+                if (haveLock && eventSet->regTypeMask & (REG_TYPE_MASK(UBOX)))
+                {
+                    flags |= (1ULL<<22); /* set enable bit */
+                    flags |= event->eventId;
+                    for (int j=0;j<event->numberOfOptions;j++)
+                    {
+                        if (event->options[j].type == EVENT_OPTION_EDGE)
+                        {
+                            flags |= (1ULL<<18);
+                            break;
+                        }
+                    }
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg , flags));
+                    VERBOSEPRINTREG(cpu_id, reg, flags, SETUP_UBOX);
+                    ubox_flags = 0x1ULL;
+                }
+                break;
+
+            default:
+                break;
+        }
     }
 
-    for ( int i=0; i<NUM_PMC; i++ )
+    if (fixed_flags != 0x0ULL)
     {
-        if (perfmon_threadData[thread_id].counters[i].init == TRUE) {
-            if (westmereEX_counter_map[i].type == PMC)
-            {
-                msr_write(cpu_id, westmereEX_counter_map[i].counterRegister , 0x0ULL);
-                flags |= (1<<(i-OFFSET_PMC));  /* enable counter */
-            }
-            else if (westmereEX_counter_map[i].type == FIXED)
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_FIXED_CTR_CTRL, LLU_CAST fixed_flags, SETUP_FIXED);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_FIXED_CTR_CTRL, fixed_flags));
+    }
+    if (ubox_flags != 0x0ULL)
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_U_PMON_GLOBAL_CTRL, LLU_CAST ubox_flags, ACTIVATE_UBOX);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_CTRL, ubox_flags));
+    }
+    return 0;
+}
+
+#define NEX_RESET_ALL_UNCORE_COUNTERS \
+    if (haveLock && (eventSet->regTypeMask & ~(0xF))) \
+    { \
+        uint64_t tmp = 0x0ULL; \
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_CTRL, &tmp)); \
+        tmp |= (1ULL<<29); \
+        VERBOSEPRINTREG(cpu_id, MSR_U_PMON_GLOBAL_CTRL, LLU_CAST tmp, RESET_ALL_UNCORE_COUNTERS); \
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_CTRL, tmp)); \
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_CTRL, 0x0U)); \
+    }
+
+#define NEX_UNFREEZE_UNCORE \
+    if (haveLock && (eventSet->regTypeMask & ~(0xF))) \
+    { \
+        uint64_t tmp = 0x0ULL; \
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_CTRL, &tmp)); \
+        tmp |= (1ULL<<28); \
+        VERBOSEPRINTREG(cpu_id, MSR_U_PMON_GLOBAL_CTRL, LLU_CAST tmp, UNFREEZE_UNCORE); \
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_CTRL, tmp)); \
+    }
+
+#define NEX_UNFREEZE_BOX(id, flags) \
+    if (haveLock && (eventSet->regTypeMask & (REG_TYPE_MASK(id)))) \
+    { \
+        VERBOSEPRINTREG(cpu_id, box_map[id].ctrlRegister, LLU_CAST flags, UNFREEZE_BOX); \
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[id].ctrlRegister, flags)); \
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[id].ovflRegister, flags)); \
+    }
+
+int perfmon_startCountersThread_nehalemEX(int thread_id, PerfmonEventSet* eventSet)
+{
+    int haveLock = 0;
+    uint64_t core_ctrl_flags = 0x0ULL;
+    uint32_t uflags[NUM_UNITS] = { [0 ... NUM_UNITS-1] = 0x0U };
+    int cpu_id = groupSet->threads[thread_id].processorId;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+    {
+        haveLock = 1;
+    }
+
+    NEX_RESET_ALL_UNCORE_COUNTERS;
+
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE) 
+        {
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
             {
-                msr_write(cpu_id, westmereEX_counter_map[i].counterRegister , 0x0ULL);
-                flags |= (1ULL<<(i+32));  /* enable fixed counter */
+                continue;
             }
-            else if (westmereEX_counter_map[i].type > UNCORE)
+            RegisterIndex index = eventSet->events[i].index;
+            uint64_t counter1 = counter_map[index].counterRegister;
+            switch (type)
             {
-                if(haveLock)
-                {
-                    msr_write(cpu_id, westmereEX_counter_map[i].counterRegister , 0x0ULL);
-                    uflags[westmereEX_counter_map[i].type] |=
-                        (1<<(perfmon_threadData[thread_id].counters[i].id));  /* enable uncore counter */
-                    if (westmereEX_counter_map[i].type == UBOX)
+                case PMC:
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter1, 0x0ULL));
+                    core_ctrl_flags |= (1ULL<<(index-cpuid_info.perf_num_fixed_ctr));
+                    break;
+                case FIXED:
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter1, 0x0ULL));
+                    core_ctrl_flags |= (1ULL<<(index+32));
+                    break;
+                case WBOX0FIX:
+                    if (haveLock && (eventSet->regTypeMask & REG_TYPE_MASK(WBOX0FIX)))
                     {
-                        enable_ubox = 1;
+                        uflags[WBOX] |= (1ULL<<31);
                     }
-                }
+                    break;
+                default:
+                    if (haveLock && (eventSet->regTypeMask & REG_TYPE_MASK(counter_map[index].type)))
+                    {
+                        uflags[counter_map[index].type] |= (1<<getCounterTypeOffset(index));
+                    }
+                    break;
             }
         }
     }
 
-    VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST flags, GLOBAL_CTRL);
-
     if (haveLock)
     {
         for ( int i=0; i<NUM_UNITS; i++ )
         {
-            /* if counters are enabled write the according box ctrl register */
-            if (uflags[i])
+            if (uflags[i] != 0x0U)
             {
-                msr_write(cpu_id, westmereEX_PMunits[i].ctrlRegister, uflags[i]);
-                VERBOSEPRINTREG(cpu_id, westmereEX_PMunits[i].ctrlRegister, LLU_CAST uflags[i], BOXCTRL);
+                NEX_UNFREEZE_BOX(i, uflags[i]);
             }
         }
-
-        /* set global enable flag in U BOX ctrl register */
-        uint32_t ubflags = 0x0UL;
-        ubflags |= (1<<28); /* enable all */
-        if (enable_ubox)
-        {
-            ubflags |= (1<<0);
-        }
-        VERBOSEPRINTREG(cpu_id, MSR_U_PMON_GLOBAL_CTRL, LLU_CAST ubflags, UBOX_GLOBAL_CTRL);
-        msr_write(cpu_id, MSR_U_PMON_GLOBAL_CTRL, ubflags );
     }
+
+    NEX_UNFREEZE_UNCORE;
+
     /* Finally enable counters */
-    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, flags);
-    msr_write(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, 0x30000000FULL);
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST core_ctrl_flags, GLOBAL_CTRL);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, core_ctrl_flags));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, (1ULL<<63)|(1ULL<<62)|core_ctrl_flags));
+    }
+    return 0;
 }
 
-void perfmon_stopCountersThread_nehalemEX(int thread_id)
+#define NEX_CHECK_OVERFLOW(id, offset) \
+    if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData) \
+    { \
+        uint64_t tmp = 0x0ULL; \
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, box_map[id].statusRegister, &tmp)); \
+        if (tmp & (1ULL<<offset)) \
+        { \
+            eventSet->events[i].threadCounter[thread_id].overflows++; \
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[id].statusRegister, (tmp & (1ULL<<offset)))); \
+        } \
+    }
+
+#define NEX_CHECK_UNCORE_OVERFLOW(id, offset) \
+    if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData) \
+    { \
+        uint64_t tmp = 0x0ULL; \
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, box_map[id].statusRegister, &tmp)); \
+        if (tmp & (1ULL<<offset)) \
+        { \
+            eventSet->events[i].threadCounter[thread_id].overflows++; \
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[id].ovflRegister, (tmp & (1ULL<<offset)))); \
+        } \
+    }
+
+int perfmon_stopCountersThread_nehalemEX(int thread_id, PerfmonEventSet* eventSet)
 {
     int haveLock = 0;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
-
-    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
+    uint64_t counter_result = 0x0ULL;
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
     if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
     {
-        uint32_t ubflags = 0x0UL;
         haveLock = 1;
-        //        ubflags |= (1<<29); /* reset all */
-        msr_write(cpu_id, MSR_U_PMON_GLOBAL_CTRL, ubflags );
     }
 
-    for ( int i=0; i<NUM_COUNTERS_WESTMEREEX; i++ )
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST 0x0ULL, FREEZE_PMC_AND_FIXED);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+    }
+    NEX_FREEZE_UNCORE;
+
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        if (perfmon_threadData[thread_id].counters[i].init == TRUE)
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
         {
-            if (westmereEX_counter_map[i].type > UNCORE)
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
             {
-                if(haveLock)
-                {
-                    perfmon_threadData[thread_id].counters[i].counterData =
-                        msr_read(cpu_id, westmereEX_counter_map[i].counterRegister);
-
-                    VERBOSEPRINTREG(cpu_id, westmereEX_counter_map[i].counterRegister,
-                            LLU_CAST perfmon_threadData[thread_id].counters[i].counterData, READ_UNCORE);
-                }
+                continue;
             }
-            else
+            counter_result = 0x0ULL;
+            RegisterIndex index = eventSet->events[i].index;
+            uint64_t reg = counter_map[index].configRegister;
+            switch (type)
             {
-                perfmon_threadData[thread_id].counters[i].counterData =
-                    msr_read(cpu_id, westmereEX_counter_map[i].counterRegister);
-
-                VERBOSEPRINTREG(cpu_id, westmereEX_counter_map[i].counterRegister,
-                        LLU_CAST perfmon_threadData[thread_id].counters[i].counterData, READ_CORE);
+                case PMC:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter_map[index].counterRegister, &counter_result));
+                    NEX_CHECK_OVERFLOW(PMC, index-cpuid_info.perf_num_fixed_ctr);
+                    VERBOSEPRINTREG(cpu_id, counter_map[index].counterRegister, LLU_CAST counter_result, READ_PMC);
+                    break;
+                case FIXED:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter_map[index].counterRegister, &counter_result));
+                    NEX_CHECK_OVERFLOW(PMC, index+32);
+                    VERBOSEPRINTREG(cpu_id, counter_map[index].counterRegister, LLU_CAST counter_result, READ_FIXED);
+                    break;
+                default:
+                    if(haveLock && (eventSet->regTypeMask & REG_TYPE_MASK(counter_map[index].type)))
+                    {
+                        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter_map[index].counterRegister, &counter_result));
+                        NEX_CHECK_UNCORE_OVERFLOW(counter_map[index].type, getCounterTypeOffset(index));
+                        VERBOSEPRINTREG(cpu_id, counter_map[index].counterRegister, LLU_CAST counter_result, READ_UNCORE);
+                    }
+                    break;
             }
+            eventSet->events[i].threadCounter[thread_id].counterData = field64(counter_result, 0, box_map[type].regWidth);
         }
     }
 
-#if 0
-    flags = msr_read(cpu_id,MSR_PERF_GLOBAL_STATUS);
-    printf ("Status: 0x%llX \n", LLU_CAST flags);
-    if((flags & 0x3) || (flags & (0x3ULL<<32)) )
-    {
-        printf ("Overflow occured \n");
-    }
-#endif
+    return 0;
 }
 
-void perfmon_readCountersThread_nehalemEX(int thread_id)
+int perfmon_readCountersThread_nehalemEX(int thread_id, PerfmonEventSet* eventSet)
 {
     int haveLock = 0;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
+    int cpu_id = groupSet->threads[thread_id].processorId;
+    uint64_t counter_result = 0x0ULL;
+    uint64_t core_ctrl_flags = 0x0ULL;
 
     if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
     {
         haveLock = 1;
     }
 
-    for ( int i=0; i<NUM_COUNTERS_WESTMEREEX; i++ )
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
+    {
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, &core_ctrl_flags));
+    }
+    NEX_FREEZE_UNCORE;
+
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        if (perfmon_threadData[thread_id].counters[i].init == TRUE)
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
         {
-            if (westmereEX_counter_map[i].type > UNCORE)
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
             {
-                if(haveLock)
-                {
-                    perfmon_threadData[thread_id].counters[i].counterData =
-                        msr_read(cpu_id, westmereEX_counter_map[i].counterRegister);
-                }
+                continue;
             }
-            else
+            counter_result = 0x0ULL;
+            RegisterIndex index = eventSet->events[i].index;
+            uint64_t counter = counter_map[index].counterRegister;
+            switch (type)
             {
-                perfmon_threadData[thread_id].counters[i].counterData =
-                    msr_read(cpu_id, westmereEX_counter_map[i].counterRegister);
+                case PMC:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &counter_result));
+                    NEX_CHECK_OVERFLOW(PMC, index-cpuid_info.perf_num_fixed_ctr);
+                    VERBOSEPRINTREG(cpu_id, counter, LLU_CAST counter_result, READ_PMC);
+                    break;
+                case FIXED:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &counter_result));
+                    NEX_CHECK_OVERFLOW(PMC, index+32);
+                    VERBOSEPRINTREG(cpu_id, counter, LLU_CAST counter_result, READ_FIXED);
+                    break;
+                default:
+                    if(haveLock && (eventSet->regTypeMask & REG_TYPE_MASK(counter_map[index].type)))
+                    {
+                        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &counter_result));
+                        NEX_CHECK_UNCORE_OVERFLOW(counter_map[index].type, getCounterTypeOffset(index));
+                        VERBOSEPRINTREG(cpu_id, counter, LLU_CAST counter_result, READ_UNCORE);
+                    }
+                    break;
             }
+            eventSet->events[i].threadCounter[thread_id].counterData = field64(counter_result, 0, box_map[type].regWidth);
         }
     }
+
+    NEX_UNFREEZE_UNCORE;
+    if ((eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED))) && (core_ctrl_flags != 0x0ULL))
+    {
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, core_ctrl_flags));
+    }
+    return 0;
 }
 
+int perfmon_finalizeCountersThread_nehalemEX(int thread_id, PerfmonEventSet* eventSet)
+{ /* Finalize the Nehalem EX counters of one thread: zero the filter and config registers of every event in eventSet, then the global control registers. */
+    int haveLock = 0; /* set to 1 when socket_lock[affinity_core2node_lookup[cpu_id]] names this CPU */
+    int haveTileLock = 0; /* set to 1 when tile_lock[affinity_thread2tile_lookup[cpu_id]] names this CPU */
+    int cpu_id = groupSet->threads[thread_id].processorId; /* thread_id indexes groupSet->threads and each event's threadCounter array */
+    uint64_t ovf_values_core = (1ULL<<63)|(1ULL<<62); /* bits 62 and 63 are always part of the value written to MSR_PERF_GLOBAL_OVF_CTRL below */
+ /* NOTE(review): CHECK_MSR_WRITE_ERROR is defined elsewhere; presumably it leaves this function early when a write fails -- confirm. */
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+    {
+        haveLock = 1;
+    }
+    if (tile_lock[affinity_thread2tile_lookup[cpu_id]] == cpu_id)
+    {
+        haveTileLock = 1;
+    }
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        RegisterType type = eventSet->events[i].type;
+        if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type)))) /* skip events whose unit type is not enabled in regTypeMask */
+        {
+            continue;
+        }
+        RegisterIndex index = eventSet->events[i].index;
+        PerfmonEvent *event = &(eventSet->events[i].event);
+        uint64_t reg = counter_map[index].configRegister;
+        PciDeviceIndex dev = counter_map[index].device;
+        switch (type) /* unit-specific cleanup; types without a case only get the generic config-register clear after the switch */
+        {
+            case PMC:
+                ovf_values_core |= (1ULL<<(index-cpuid_info.perf_num_fixed_ctr)); /* bit position = counter index minus the number of fixed counters */
+                if ((haveTileLock) && (event->eventId == 0xB7)) /* event id 0xB7: also zero MSR_OFFCORE_RESP0 */
+                {
+                    VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP0, 0x0ULL, CLEAR_OFFCORE_RESP0);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP0, 0x0ULL));
+                }
+                else if ((haveTileLock) && (event->eventId == 0xBB)) /* event id 0xBB: also zero MSR_OFFCORE_RESP1 */
+                {
+                    VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP1, 0x0ULL, CLEAR_OFFCORE_RESP1);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP1, 0x0ULL));
+                }
+                break;
+            case FIXED:
+                ovf_values_core |= (1ULL<<(index+32)); /* fixed counters use bit positions starting at 32 */
+                break;
+            case MBOX0:
+            case MBOX1:
+                if (haveLock && ((event->cfgBits == 0x02) || (event->cfgBits == 0x04))) /* only events with cfgBits 0x02 or 0x04 get their two filter registers zeroed */
+                {
+                    VERBOSEPRINTREG(cpu_id, box_map[type].filterRegister1, 0x0ULL, CLEAR_MATCH0);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[type].filterRegister1, 0x0ULL));
+                    VERBOSEPRINTREG(cpu_id, box_map[type].filterRegister2, 0x0ULL, CLEAR_MASK0);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[type].filterRegister2, 0x0ULL));
+                }
+                break;
+            case SBOX0:
+                if (haveLock && (event->eventId == 0x00)) /* event id 0x00: zero MSR_S0_PMON_MM_CFG and both filter registers */
+                {
+                    VERBOSEPRINTREG(cpu_id, MSR_S0_PMON_MM_CFG, 0x0ULL, CLEAR_MM_CFG);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_S0_PMON_MM_CFG, 0x0ULL));
+                    VERBOSEPRINTREG(cpu_id, box_map[type].filterRegister1, 0x0ULL, CLEAR_MATCH0);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[type].filterRegister1, 0x0ULL));
+                    VERBOSEPRINTREG(cpu_id, box_map[type].filterRegister2, 0x0ULL, CLEAR_MASK0);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[type].filterRegister2, 0x0ULL));
+                }
+                break;
+            case SBOX1:
+                if (haveLock && (event->eventId == 0x00)) /* same as SBOX0, but with MSR_S1_PMON_MM_CFG */
+                {
+                    VERBOSEPRINTREG(cpu_id, MSR_S1_PMON_MM_CFG, 0x0ULL, CLEAR_MM_CFG);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_S1_PMON_MM_CFG, 0x0ULL));
+                    VERBOSEPRINTREG(cpu_id, box_map[type].filterRegister1, 0x0ULL, CLEAR_MATCH0);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[type].filterRegister1, 0x0ULL));
+                    VERBOSEPRINTREG(cpu_id, box_map[type].filterRegister2, 0x0ULL, CLEAR_MASK0);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[type].filterRegister2, 0x0ULL));
+                }
+                break;
+            case BBOX0:
+            case BBOX1:
+                if (haveLock && ((event->eventId == 0x01) || /* event ids 0x01 through 0x06 get their two filter registers zeroed */
+                                 (event->eventId == 0x02) ||
+                                 (event->eventId == 0x03) ||
+                                 (event->eventId == 0x04) ||
+                                 (event->eventId == 0x05) ||
+                                 (event->eventId == 0x06)))
+                {
+                    VERBOSEPRINTREG(cpu_id, box_map[type].filterRegister1, 0x0ULL, CLEAR_MATCH0);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[type].filterRegister1, 0x0ULL));
+                    VERBOSEPRINTREG(cpu_id, box_map[type].filterRegister2, 0x0ULL, CLEAR_MASK0);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[type].filterRegister2, 0x0ULL));
+                }
+                break;
+        }
+        if ((reg) && (((dev == MSR_DEV) && (type < UNCORE)) || (((haveLock) && (type > UNCORE))))) /* zero the config register: types below UNCORE on MSR_DEV always, types above UNCORE only with haveLock */
+        {
+            VERBOSEPRINTPCIREG(cpu_id, dev, reg, 0x0ULL, CLEAR_CTL);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, reg, 0x0ULL));
+        }
+        eventSet->events[i].threadCounter[thread_id].init = FALSE; /* mark this event as no longer initialized for this thread */
+    }
+ /* Core counters: write the collected bits to the overflow control register, then zero the global control register. */
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, ovf_values_core, CLEAR_OVF_CTRL);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, ovf_values_core));
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL, CLEAR_PMC_AND_FIXED_CTRL);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+    }
+ /* Any regTypeMask bit above the lowest four (presumably the uncore unit types -- confirm) also zeroes the U-box global registers. */
+    if (haveLock && (eventSet->regTypeMask & ~(0xFULL)))
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_U_PMON_GLOBAL_OVF_CTRL, 0x0ULL, CLEAR_UNCORE_OVF);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_OVF_CTRL, 0x0ULL));
+        VERBOSEPRINTREG(cpu_id, MSR_U_PMON_GLOBAL_CTRL, 0x0ULL, CLEAR_UNCORE_CTRL);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_CTRL, 0x0ULL));
+    }
+    return 0; /* reached only when no checked write above bailed out */
+}
diff --git a/src/includes/perfmon_nehalemEX_counters.h b/src/includes/perfmon_nehalemEX_counters.h
new file mode 100644
index 0000000..6248c58
--- /dev/null
+++ b/src/includes/perfmon_nehalemEX_counters.h
@@ -0,0 +1,185 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  perfmon_nehalemEX_counters.h
+ *
+ *      Description: Counter Header File of perfmon module for Intel Nehalem EX.
+ *
+ *      Version:   4.0
+ *      Released:  16.6.2015
+ *
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#define NUM_COUNTERS_CORE_NEHALEMEX 7 /* 3 FIXC + 4 PMC entries at the start of nehalemEX_counter_map */
+#define NUM_COUNTERS_UNCORE_NEHALEMEX 105
+#define NUM_COUNTERS_NEHALEMEX 105 /* array length of nehalemEX_counter_map */
+ /* Option masks accepted per counter type; parenthesized so they bind as one value in any expression. */
+#define NEX_VALID_OPTIONS_FIXED (EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_ANYTHREAD_MASK)
+#define NEX_VALID_OPTIONS_PMC (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_THRESHOLD_MASK)
+
+#define NEX_VALID_OPTIONS_MBOX (EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK)
+#define NEX_VALID_OPTIONS_BBOX (EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK)
+#define NEX_VALID_OPTIONS_CBOX (EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_EDGE_MASK)
+#define NEX_VALID_OPTIONS_SBOX (EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_EDGE_MASK)
+#define NEX_VALID_OPTIONS_WBOX (EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_EDGE_MASK)
+
+static RegisterMap nehalemEX_counter_map[NUM_COUNTERS_NEHALEMEX] = { /* one entry per counter; field order presumably {name, index, unit type, config register, counter register, 0, 0, valid option mask} -- confirm against RegisterMap */
+    /* Fixed Counters: instructions retired, cycles unhalted core */
+    {"FIXC0", PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0, NEX_VALID_OPTIONS_FIXED},
+    {"FIXC1", PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0, NEX_VALID_OPTIONS_FIXED},
+    {"FIXC2", PMC2, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR2, 0, 0, NEX_VALID_OPTIONS_FIXED},
+    /* PMC Counters: 4 48bit wide */
+    {"PMC0", PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0, NEX_VALID_OPTIONS_PMC},
+    {"PMC1", PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0, NEX_VALID_OPTIONS_PMC},
+    {"PMC2", PMC5, PMC, MSR_PERFEVTSEL2, MSR_PMC2, 0, 0, NEX_VALID_OPTIONS_PMC},
+    {"PMC3", PMC6, PMC, MSR_PERFEVTSEL3, MSR_PMC3, 0, 0, NEX_VALID_OPTIONS_PMC},
+    /* MBOX */
+    {"MBOX0C0",PMC7, MBOX0, MSR_M0_PMON_EVNT_SEL0, MSR_M0_PMON_CTR0, 0, 0, NEX_VALID_OPTIONS_MBOX},
+    {"MBOX0C1",PMC8, MBOX0, MSR_M0_PMON_EVNT_SEL1, MSR_M0_PMON_CTR1, 0, 0, NEX_VALID_OPTIONS_MBOX},
+    {"MBOX0C2",PMC9, MBOX0, MSR_M0_PMON_EVNT_SEL2, MSR_M0_PMON_CTR2, 0, 0, NEX_VALID_OPTIONS_MBOX},
+    {"MBOX0C3",PMC10, MBOX0, MSR_M0_PMON_EVNT_SEL3, MSR_M0_PMON_CTR3, 0, 0, NEX_VALID_OPTIONS_MBOX},
+    {"MBOX0C4",PMC11, MBOX0, MSR_M0_PMON_EVNT_SEL4, MSR_M0_PMON_CTR4, 0, 0, NEX_VALID_OPTIONS_MBOX},
+    {"MBOX0C5",PMC12, MBOX0, MSR_M0_PMON_EVNT_SEL5, MSR_M0_PMON_CTR5, 0, 0, NEX_VALID_OPTIONS_MBOX},
+    {"MBOX1C0",PMC13, MBOX1, MSR_M1_PMON_EVNT_SEL0, MSR_M1_PMON_CTR0, 0, 0, NEX_VALID_OPTIONS_MBOX},
+    {"MBOX1C1",PMC14, MBOX1, MSR_M1_PMON_EVNT_SEL1, MSR_M1_PMON_CTR1, 0, 0, NEX_VALID_OPTIONS_MBOX},
+    {"MBOX1C2",PMC15, MBOX1, MSR_M1_PMON_EVNT_SEL2, MSR_M1_PMON_CTR2, 0, 0, NEX_VALID_OPTIONS_MBOX},
+    {"MBOX1C3",PMC16, MBOX1, MSR_M1_PMON_EVNT_SEL3, MSR_M1_PMON_CTR3, 0, 0, NEX_VALID_OPTIONS_MBOX},
+    {"MBOX1C4",PMC17, MBOX1, MSR_M1_PMON_EVNT_SEL4, MSR_M1_PMON_CTR4, 0, 0, NEX_VALID_OPTIONS_MBOX},
+    {"MBOX1C5",PMC18, MBOX1, MSR_M1_PMON_EVNT_SEL5, MSR_M1_PMON_CTR5, 0, 0, NEX_VALID_OPTIONS_MBOX},
+    /* BBOX */
+    {"BBOX0C0",PMC19, BBOX0, MSR_B0_PMON_EVNT_SEL0, MSR_B0_PMON_CTR0, 0, 0, NEX_VALID_OPTIONS_BBOX},
+    {"BBOX0C1",PMC20, BBOX0, MSR_B0_PMON_EVNT_SEL1, MSR_B0_PMON_CTR1, 0, 0, NEX_VALID_OPTIONS_BBOX},
+    {"BBOX0C2",PMC21, BBOX0, MSR_B0_PMON_EVNT_SEL2, MSR_B0_PMON_CTR2, 0, 0, NEX_VALID_OPTIONS_BBOX},
+    {"BBOX0C3",PMC22, BBOX0, MSR_B0_PMON_EVNT_SEL3, MSR_B0_PMON_CTR3, 0, 0, NEX_VALID_OPTIONS_BBOX},
+    {"BBOX1C0",PMC23, BBOX1, MSR_B1_PMON_EVNT_SEL0, MSR_B1_PMON_CTR0, 0, 0, NEX_VALID_OPTIONS_BBOX},
+    {"BBOX1C1",PMC24, BBOX1, MSR_B1_PMON_EVNT_SEL1, MSR_B1_PMON_CTR1, 0, 0, NEX_VALID_OPTIONS_BBOX},
+    {"BBOX1C2",PMC25, BBOX1, MSR_B1_PMON_EVNT_SEL2, MSR_B1_PMON_CTR2, 0, 0, NEX_VALID_OPTIONS_BBOX},
+    {"BBOX1C3",PMC26, BBOX1, MSR_B1_PMON_EVNT_SEL3, MSR_B1_PMON_CTR3, 0, 0, NEX_VALID_OPTIONS_BBOX},
+    /* RBOX */
+    {"RBOX0C0",PMC27, RBOX0, MSR_R0_PMON_EVNT_SEL0, MSR_R0_PMON_CTR0, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"RBOX0C1",PMC28, RBOX0, MSR_R0_PMON_EVNT_SEL1, MSR_R0_PMON_CTR1, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"RBOX0C2",PMC29, RBOX0, MSR_R0_PMON_EVNT_SEL2, MSR_R0_PMON_CTR2, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"RBOX0C3",PMC30, RBOX0, MSR_R0_PMON_EVNT_SEL3, MSR_R0_PMON_CTR3, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"RBOX0C4",PMC31, RBOX0, MSR_R0_PMON_EVNT_SEL4, MSR_R0_PMON_CTR4, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"RBOX0C5",PMC32, RBOX0, MSR_R0_PMON_EVNT_SEL5, MSR_R0_PMON_CTR5, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"RBOX0C6",PMC33, RBOX0, MSR_R0_PMON_EVNT_SEL6, MSR_R0_PMON_CTR6, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"RBOX0C7",PMC34, RBOX0, MSR_R0_PMON_EVNT_SEL7, MSR_R0_PMON_CTR7, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"RBOX1C0",PMC35, RBOX1, MSR_R1_PMON_EVNT_SEL8, MSR_R1_PMON_CTR8, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"RBOX1C1",PMC36, RBOX1, MSR_R1_PMON_EVNT_SEL9, MSR_R1_PMON_CTR9, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"RBOX1C2",PMC37, RBOX1, MSR_R1_PMON_EVNT_SEL10, MSR_R1_PMON_CTR10, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"RBOX1C3",PMC38, RBOX1, MSR_R1_PMON_EVNT_SEL11, MSR_R1_PMON_CTR11, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"RBOX1C4",PMC39, RBOX1, MSR_R1_PMON_EVNT_SEL12, MSR_R1_PMON_CTR12, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"RBOX1C5",PMC40, RBOX1, MSR_R1_PMON_EVNT_SEL13, MSR_R1_PMON_CTR13, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"RBOX1C6",PMC41, RBOX1, MSR_R1_PMON_EVNT_SEL14, MSR_R1_PMON_CTR14, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"RBOX1C7",PMC42, RBOX1, MSR_R1_PMON_EVNT_SEL15, MSR_R1_PMON_CTR15, 0, 0, EVENT_OPTION_NONE_MASK},
+    /* WBOX */
+    {"WBOX0",PMC43, WBOX, MSR_W_PMON_EVNT_SEL0, MSR_W_PMON_CTR0, 0, 0, NEX_VALID_OPTIONS_WBOX},
+    {"WBOX1",PMC44, WBOX, MSR_W_PMON_EVNT_SEL1, MSR_W_PMON_CTR1, 0, 0, NEX_VALID_OPTIONS_WBOX},
+    {"WBOX2",PMC45, WBOX, MSR_W_PMON_EVNT_SEL2, MSR_W_PMON_CTR2, 0, 0, NEX_VALID_OPTIONS_WBOX},
+    {"WBOX3",PMC46, WBOX, MSR_W_PMON_EVNT_SEL3, MSR_W_PMON_CTR3, 0, 0, NEX_VALID_OPTIONS_WBOX},
+    {"WBOXFIX",PMC47, WBOX0FIX, MSR_W_PMON_FIXED_CTR_CTL, MSR_W_PMON_FIXED_CTR, 0, 0, EVENT_OPTION_NONE_MASK},
+    /* UBOX */
+    {"UBOX0",PMC48, UBOX, MSR_U_PMON_GLOBAL_EVNT_SEL, MSR_U_PMON_GLOBAL_CTR, 0, 0, EVENT_OPTION_EDGE_MASK},
+    /* CBOXes */
+    {"CBOX0C0",PMC49, CBOX0, MSR_C0_PMON_EVNT_SEL0, MSR_C0_PMON_CTR0, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX0C1",PMC50, CBOX0, MSR_C0_PMON_EVNT_SEL1, MSR_C0_PMON_CTR1, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX0C2",PMC51, CBOX0, MSR_C0_PMON_EVNT_SEL2, MSR_C0_PMON_CTR2, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX0C3",PMC52, CBOX0, MSR_C0_PMON_EVNT_SEL3, MSR_C0_PMON_CTR3, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX0C4",PMC53, CBOX0, MSR_C0_PMON_EVNT_SEL4, MSR_C0_PMON_CTR4, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX0C5",PMC54, CBOX0, MSR_C0_PMON_EVNT_SEL5, MSR_C0_PMON_CTR5, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX1C0",PMC55, CBOX1, MSR_C1_PMON_EVNT_SEL0, MSR_C1_PMON_CTR0, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX1C1",PMC56, CBOX1, MSR_C1_PMON_EVNT_SEL1, MSR_C1_PMON_CTR1, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX1C2",PMC57, CBOX1, MSR_C1_PMON_EVNT_SEL2, MSR_C1_PMON_CTR2, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX1C3",PMC58, CBOX1, MSR_C1_PMON_EVNT_SEL3, MSR_C1_PMON_CTR3, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX1C4",PMC59, CBOX1, MSR_C1_PMON_EVNT_SEL4, MSR_C1_PMON_CTR4, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX1C5",PMC60, CBOX1, MSR_C1_PMON_EVNT_SEL5, MSR_C1_PMON_CTR5, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX2C0",PMC61, CBOX2, MSR_C2_PMON_EVNT_SEL0, MSR_C2_PMON_CTR0, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX2C1",PMC62, CBOX2, MSR_C2_PMON_EVNT_SEL1, MSR_C2_PMON_CTR1, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX2C2",PMC63, CBOX2, MSR_C2_PMON_EVNT_SEL2, MSR_C2_PMON_CTR2, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX2C3",PMC64, CBOX2, MSR_C2_PMON_EVNT_SEL3, MSR_C2_PMON_CTR3, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX2C4",PMC65, CBOX2, MSR_C2_PMON_EVNT_SEL4, MSR_C2_PMON_CTR4, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX2C5",PMC66, CBOX2, MSR_C2_PMON_EVNT_SEL5, MSR_C2_PMON_CTR5, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX3C0",PMC67, CBOX3, MSR_C3_PMON_EVNT_SEL0, MSR_C3_PMON_CTR0, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX3C1",PMC68, CBOX3, MSR_C3_PMON_EVNT_SEL1, MSR_C3_PMON_CTR1, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX3C2",PMC69, CBOX3, MSR_C3_PMON_EVNT_SEL2, MSR_C3_PMON_CTR2, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX3C3",PMC70, CBOX3, MSR_C3_PMON_EVNT_SEL3, MSR_C3_PMON_CTR3, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX3C4",PMC71, CBOX3, MSR_C3_PMON_EVNT_SEL4, MSR_C3_PMON_CTR4, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX3C5",PMC72, CBOX3, MSR_C3_PMON_EVNT_SEL5, MSR_C3_PMON_CTR5, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX4C0",PMC73, CBOX4, MSR_C4_PMON_EVNT_SEL0, MSR_C4_PMON_CTR0, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX4C1",PMC74, CBOX4, MSR_C4_PMON_EVNT_SEL1, MSR_C4_PMON_CTR1, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX4C2",PMC75, CBOX4, MSR_C4_PMON_EVNT_SEL2, MSR_C4_PMON_CTR2, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX4C3",PMC76, CBOX4, MSR_C4_PMON_EVNT_SEL3, MSR_C4_PMON_CTR3, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX4C4",PMC77, CBOX4, MSR_C4_PMON_EVNT_SEL4, MSR_C4_PMON_CTR4, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX4C5",PMC78, CBOX4, MSR_C4_PMON_EVNT_SEL5, MSR_C4_PMON_CTR5, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX5C0",PMC79, CBOX5, MSR_C5_PMON_EVNT_SEL0, MSR_C5_PMON_CTR0, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX5C1",PMC80, CBOX5, MSR_C5_PMON_EVNT_SEL1, MSR_C5_PMON_CTR1, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX5C2",PMC81, CBOX5, MSR_C5_PMON_EVNT_SEL2, MSR_C5_PMON_CTR2, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX5C3",PMC82, CBOX5, MSR_C5_PMON_EVNT_SEL3, MSR_C5_PMON_CTR3, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX5C4",PMC83, CBOX5, MSR_C5_PMON_EVNT_SEL4, MSR_C5_PMON_CTR4, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX5C5",PMC84, CBOX5, MSR_C5_PMON_EVNT_SEL5, MSR_C5_PMON_CTR5, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX6C0",PMC85, CBOX6, MSR_C6_PMON_EVNT_SEL0, MSR_C6_PMON_CTR0, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX6C1",PMC86, CBOX6, MSR_C6_PMON_EVNT_SEL1, MSR_C6_PMON_CTR1, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX6C2",PMC87, CBOX6, MSR_C6_PMON_EVNT_SEL2, MSR_C6_PMON_CTR2, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX6C3",PMC88, CBOX6, MSR_C6_PMON_EVNT_SEL3, MSR_C6_PMON_CTR3, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX6C4",PMC89, CBOX6, MSR_C6_PMON_EVNT_SEL4, MSR_C6_PMON_CTR4, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX6C5",PMC90, CBOX6, MSR_C6_PMON_EVNT_SEL5, MSR_C6_PMON_CTR5, 0, 0, NEX_VALID_OPTIONS_CBOX}, /* sixth CBOX6 counter; its name must differ from the PMC89 entry */
+    {"CBOX7C0",PMC91, CBOX7, MSR_C7_PMON_EVNT_SEL0, MSR_C7_PMON_CTR0, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX7C1",PMC92, CBOX7, MSR_C7_PMON_EVNT_SEL1, MSR_C7_PMON_CTR1, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX7C2",PMC93, CBOX7, MSR_C7_PMON_EVNT_SEL2, MSR_C7_PMON_CTR2, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX7C3",PMC94, CBOX7, MSR_C7_PMON_EVNT_SEL3, MSR_C7_PMON_CTR3, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX7C4",PMC95, CBOX7, MSR_C7_PMON_EVNT_SEL4, MSR_C7_PMON_CTR4, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    {"CBOX7C5",PMC96, CBOX7, MSR_C7_PMON_EVNT_SEL5, MSR_C7_PMON_CTR5, 0, 0, NEX_VALID_OPTIONS_CBOX},
+    /* SBOXes */
+    {"SBOX0C0",PMC97, SBOX0, MSR_S0_PMON_EVNT_SEL0, MSR_S0_PMON_CTR0, 0, 0, NEX_VALID_OPTIONS_SBOX},
+    {"SBOX0C1",PMC98, SBOX0, MSR_S0_PMON_EVNT_SEL1, MSR_S0_PMON_CTR1, 0, 0, NEX_VALID_OPTIONS_SBOX},
+    {"SBOX0C2",PMC99, SBOX0, MSR_S0_PMON_EVNT_SEL2, MSR_S0_PMON_CTR2, 0, 0, NEX_VALID_OPTIONS_SBOX},
+    {"SBOX0C3",PMC100, SBOX0, MSR_S0_PMON_EVNT_SEL3, MSR_S0_PMON_CTR3, 0, 0, NEX_VALID_OPTIONS_SBOX},
+    {"SBOX1C0",PMC101, SBOX1, MSR_S1_PMON_EVNT_SEL0, MSR_S1_PMON_CTR0, 0, 0, NEX_VALID_OPTIONS_SBOX},
+    {"SBOX1C1",PMC102, SBOX1, MSR_S1_PMON_EVNT_SEL1, MSR_S1_PMON_CTR1, 0, 0, NEX_VALID_OPTIONS_SBOX},
+    {"SBOX1C2",PMC103, SBOX1, MSR_S1_PMON_EVNT_SEL2, MSR_S1_PMON_CTR2, 0, 0, NEX_VALID_OPTIONS_SBOX},
+    {"SBOX1C3",PMC104, SBOX1, MSR_S1_PMON_EVNT_SEL3, MSR_S1_PMON_CTR3, 0, 0, NEX_VALID_OPTIONS_SBOX}
+};
+
+
+static BoxMap nehalemEX_box_map[NUM_UNITS] = { /* per-unit registers, indexed by unit type; leading fields look like {control, status, overflow-control} registers -- confirm field order against BoxMap */
+    [PMC] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, 0, 0, 0, 48}, /* the 48 is presumably regWidth, the counter width in bits -- confirm */
+    [FIXED] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, 0, 0, 0, 48}, /* same global registers as PMC */
+    [MBOX0] = {MSR_M0_PMON_BOX_CTRL, MSR_M0_PMON_BOX_STATUS, MSR_M0_PMON_BOX_OVF_CTRL, 0, 0, 0, 48, MSR_M0_PMON_ADDR_MATCH, MSR_M0_PMON_ADDR_MASK}, /* trailing pair: presumably filterRegister1/filterRegister2 -- confirm */
+    [MBOX1] = {MSR_M1_PMON_BOX_CTRL, MSR_M1_PMON_BOX_STATUS, MSR_M1_PMON_BOX_OVF_CTRL, 0, 0, 0, 48, MSR_M1_PMON_ADDR_MATCH, MSR_M1_PMON_ADDR_MASK},
+    [BBOX0] = {MSR_B0_PMON_BOX_CTRL, MSR_B0_PMON_BOX_STATUS, MSR_B0_PMON_BOX_OVF_CTRL, 0, 0, 0, 48, MSR_B0_PMON_MATCH,MSR_B0_PMON_MASK},
+    [BBOX1] = {MSR_B1_PMON_BOX_CTRL, MSR_B1_PMON_BOX_STATUS, MSR_B1_PMON_BOX_OVF_CTRL, 0, 0, 0, 48, MSR_B1_PMON_MATCH,MSR_B1_PMON_MASK},
+    [RBOX0] = {MSR_R0_PMON_BOX_CTRL, MSR_R0_PMON_BOX_STATUS, MSR_R0_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+    [RBOX1] = {MSR_R1_PMON_BOX_CTRL, MSR_R1_PMON_BOX_STATUS, MSR_R1_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+    [SBOX0] = {MSR_S0_PMON_BOX_CTRL, MSR_S0_PMON_BOX_STATUS, MSR_S0_PMON_BOX_OVF_CTRL, 0, 0, 0, 48, MSR_S0_PMON_MATCH, MSR_S0_PMON_MASK},
+    [SBOX1] = {MSR_S1_PMON_BOX_CTRL, MSR_S1_PMON_BOX_STATUS, MSR_S1_PMON_BOX_OVF_CTRL, 0, 0, 0, 48, MSR_S1_PMON_MATCH, MSR_S1_PMON_MASK},
+    [CBOX0] = {MSR_C0_PMON_BOX_CTRL, MSR_C0_PMON_BOX_STATUS, MSR_C0_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+    [CBOX1] = {MSR_C1_PMON_BOX_CTRL, MSR_C1_PMON_BOX_STATUS, MSR_C1_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+    [CBOX2] = {MSR_C2_PMON_BOX_CTRL, MSR_C2_PMON_BOX_STATUS, MSR_C2_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+    [CBOX3] = {MSR_C3_PMON_BOX_CTRL, MSR_C3_PMON_BOX_STATUS, MSR_C3_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+    [CBOX4] = {MSR_C4_PMON_BOX_CTRL, MSR_C4_PMON_BOX_STATUS, MSR_C4_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+    [CBOX5] = {MSR_C5_PMON_BOX_CTRL, MSR_C5_PMON_BOX_STATUS, MSR_C5_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+    [CBOX6] = {MSR_C6_PMON_BOX_CTRL, MSR_C6_PMON_BOX_STATUS, MSR_C6_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+    [CBOX7] = {MSR_C7_PMON_BOX_CTRL, MSR_C7_PMON_BOX_STATUS, MSR_C7_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+    [WBOX] = {MSR_W_PMON_BOX_CTRL, MSR_W_PMON_BOX_STATUS, MSR_W_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+    [WBOX0FIX] = {MSR_W_PMON_BOX_CTRL, MSR_W_PMON_BOX_STATUS, MSR_W_PMON_BOX_OVF_CTRL, 0, 0, 0, 48}, /* the fixed W-box counter uses the same box registers as WBOX */
+    [UBOX] = {MSR_U_PMON_GLOBAL_CTRL, MSR_U_PMON_GLOBAL_STATUS, MSR_U_PMON_GLOBAL_OVF_CTRL, 0, 0, 0, 48},
+};
diff --git a/src/includes/perfmon_nehalemEX_events.txt b/src/includes/perfmon_nehalemEX_events.txt
index 565f5ca..0b07bcc 100644
--- a/src/includes/perfmon_nehalemEX_events.txt
+++ b/src/includes/perfmon_nehalemEX_events.txt
@@ -1,16 +1,17 @@
 # =======================================================================================
-#  
+#
 #      Filename:  perfmon_nehalemEX_events.txt
-# 
-#      Description:  Event list for Intel NehalemEX
-# 
-#      Version:   3.1.3
-#      Released:  4.11.2014
-# 
-#      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+#
+#      Description:  Event list for Intel Nehalem EX
+#
+#      Version:   4.0
+#      Released:  16.6.2015
+#
+#      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+#                Thomas Roehl (tr), thomas.roehl at googlemail.com
 #      Project:  likwid
 #
-#      Copyright (C) 2014 Jan Treibig
+#      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
 #
 #      This program is free software: you can redistribute it and/or modify it under
 #      the terms of the GNU General Public License as published by the Free Software
@@ -69,12 +70,6 @@ UMASK_DTLB_LOAD_MISSES_PDE_MISS        0x20
 UMASK_DTLB_LOAD_MISSES_PDP_MISS        0x40
 UMASK_DTLB_LOAD_MISSES_LARGE_WALK_COMPLETED  0x80
 
-EVENT_MEMORY_DISAMBIGURATION      0x09   PMC
-UMASK_MEMORY_DISAMBIGURATION_RESET         0x01 
-UMASK_MEMORY_DISAMBIGURATION_SUCCESS       0x01 
-UMASK_MEMORY_DISAMBIGURATION_WATCHDOG      0x01 
-UMASK_MEMORY_DISAMBIGURATION_WATCH_CYCLES  0x01 
-
 EVENT_MEM_INST_RETIRED           0x0B  PMC
 UMASK_MEM_INST_RETIRED_LOADS     0x01
 UMASK_MEM_INST_RETIRED_STORES    0x02
@@ -84,8 +79,8 @@ EVENT_MEM_STORE_RETIRED_DTLB        0x0C  PMC
 UMASK_MEM_STORE_RETIRED_DTLB_MISS   0x01
 
 EVENT_UOPS_ISSUED                0x0E   PMC
-UMASK_UOPS_ISSUED_ANY            0x01 
-UMASK_UOPS_ISSUED_FUSED          0x02 
+UMASK_UOPS_ISSUED_ANY            0x01
+UMASK_UOPS_ISSUED_FUSED          0x02
 
 EVENT_MEM_UNCORE_RETIRED         0x0F    PMC
 UMASK_MEM_UNCORE_RETIRED_OTHER_CORE_L2_HITM            0x02 
@@ -519,8 +514,12 @@ UMASK_SIMD_INT_64_PACKED_LOGICAL        0x10
 UMASK_SIMD_INT_64_PACKED_ARITH          0x20
 UMASK_SIMD_INT_64_SHUFFLE_MOVE          0x40
 
-EVENT_UNCORE_CYCLES                  0xFF  WBOX4
-UMASK_UNCORE_CYCLES                  0x00
+EVENT_OFFCORE_RESPONSE_0                              0xB7 PMC
+OPTIONS_OFFCORE_RESPONSE_0_OPTIONS                      EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_0_OPTIONS                         0x01 0xFF 0xFF
+
+EVENT_UNCORE_CLOCKTICKS                  0xFF  WBOXFIX
+UMASK_UNCORE_CLOCKTICKS                  0x00
 
 EVENT_C_CYCLES_TURBO                  0x04  WBOX
 UMASK_C_CYCLES_TURBO_C0               0x01
@@ -592,40 +591,32 @@ UMASK_TM1_ON_C7               0x80
 UMASK_TM1_ON_C_ALL            0xFF
 
 EVENT_BBOX_CMDS_ALL                  0x1A  MBOX
-UMASK_BBOX_CMDS_ALL                  0xFF
+UMASK_BBOX_CMDS_ALL                  0x00 0x00 0x00
 
-EVENT_BCMD_SCHEDQ_OCCUPANCY           0x06  MBOX
-UMASK_BCMD_SCHEDQ_OCCUPANCY_READS     0x00 0x01 0x00
-UMASK_BCMD_SCHEDQ_OCCUPANCY_WRITES    0x01 0x01 0x00
-UMASK_BCMD_SCHEDQ_OCCUPANCY_MERGE     0x02 0x01 0x00
-UMASK_BCMD_SCHEDQ_OCCUPANCY_V2F       0x03 0x01 0x00
-UMASK_BCMD_SCHEDQ_OCCUPANCY_V2V       0x04 0x01 0x00
-UMASK_BCMD_SCHEDQ_OCCUPANCY_F2V       0x05 0x01 0x00
-UMASK_BCMD_SCHEDQ_OCCUPANCY_F2B       0x06 0x01 0x00
-UMASK_BCMD_SCHEDQ_OCCUPANCY_SPRWR     0x07 0x01 0x00
-UMASK_BCMD_SCHEDQ_OCCUPANCY_ALL       0x08 0x01 0x00
+EVENT_REFRESH                        0x06  MBOX
+UMASK_REFRESH                        0x00 0x00 0x00
 
-EVENT_BBOX_CYCLES                  0x1B  MBOX
-UMASK_BBOX_CYCLES                  0xFF
+EVENT_MBOX_CLOCKTICKS                0x1B  MBOX
+UMASK_MBOX_CLOCKTICKS                0x00 0x00 0x00
 
-EVENT_CYCLES_DSP_FILL                  0x00  MBOX
-UMASK_CYCLES_DSP_FILL_RDQ_FULL         0x01 0x01 0x00
-UMASK_CYCLES_DSP_FILL_WRQ_FULL         0x02 0x01 0x00
-UMASK_CYCLES_DSP_FILL_RDQ_EMPTY        0x04 0x01 0x00
-UMASK_CYCLES_DSP_FILL_WRQ_EMPTY        0x08 0x01 0x00
+EVENT_CYCLES_DSP_FILL                0x00  MBOX
+UMASK_CYCLES_DSP_FILL_RDQ_FULL       0x01 0x01 0x00
+UMASK_CYCLES_DSP_FILL_WRQ_FULL       0x02 0x01 0x00
+UMASK_CYCLES_DSP_FILL_RDQ_EMPTY      0x04 0x01 0x00
+UMASK_CYCLES_DSP_FILL_WRQ_EMPTY      0x08 0x01 0x00
 
-EVENT_CYCLES_MFULL                  0x01  MBOX
-UMASK_CYCLES_MFULL                  0x00 0x01 0x00
+EVENT_CYCLES_MFULL                   0x01  MBOX
+UMASK_CYCLES_MFULL                   0x00 0x00 0x00
 
-EVENT_CYCLES_PGT_STATE                  0x05  MBOX
-UMASK_CYCLES_PGT_STATE_CLOSED           0x00 0x01 0x00
-UMASK_CYCLES_PGT_STATE_OPEN             0x01 0x01 0x00
+EVENT_CYCLES_PGT_STATE               0x05  MBOX
+UMASK_CYCLES_PGT_STATE_CLOSED        0x00 0x01 0x00
+UMASK_CYCLES_PGT_STATE_OPEN          0x01 0x01 0x00
 
-EVENT_CYCLES_RETRYQ_STARVED                  0x04  MBOX
-UMASK_CYCLES_RETRYQ_STARVED                  0x00 0x01 0x00
+EVENT_CYCLES_RETRYQ_STARVED          0x04  MBOX
+UMASK_CYCLES_RETRYQ_STARVED          0x00 0x01 0x00
 
-EVENT_CYCLES_RETRYQ_MFULL                  0x03  MBOX
-UMASK_CYCLES_RETRYQ_MFULL                  0x00 0x01 0x00
+EVENT_CYCLES_RETRYQ_MFULL            0x03  MBOX
+UMASK_CYCLES_RETRYQ_MFULL            0x00 0x01 0x00
 
 EVENT_CYCLES_SCHED_MODE                  0x01  MBOX
 UMASK_CYCLES_SCHED_MODE_TRADEOFF         0x00 0x01 0x00
@@ -634,34 +625,35 @@ UMASK_CYCLES_SCHED_MODE_WRPRIO           0x02 0x01 0x00
 UMASK_CYCLES_SCHED_MODE_ADAPTIVE         0x03 0x01 0x00
 
 EVENT_DRAM_CMD                              0x0A  MBOX
+OPTIONS_DRAM_CMD                            EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
 UMASK_DRAM_CMD_ALL                          0x00 0x02 0x00
-UMASK_DRAM_CMD_ILLEGAL                      0x01 0x02 0x00
+UMASK_DRAM_CMD_ILLEGAL                      0x00 0x02 0x00
 UMASK_DRAM_CMD_PREALL                       0x01 0x02 0x00
-UMASK_DRAM_CMD_PREALL_TRDOFF                0x01 0x02 0x00
-UMASK_DRAM_CMD_PREALL_RDPRIO                0x01 0x02 0x01
-UMASK_DRAM_CMD_PREALL_WRPRIO                0x01 0x02 0x02
-UMASK_DRAM_CMD_PREALL_ADAPTIVE              0x01 0x02 0x02
+UMASK_DRAM_CMD_PREALL_TRDOFF                0x01 0x02 0x10
+UMASK_DRAM_CMD_PREALL_RDPRIO                0x01 0x02 0x11
+UMASK_DRAM_CMD_PREALL_WRPRIO                0x01 0x02 0x12
+UMASK_DRAM_CMD_PREALL_ADAPTIVE              0x01 0x02 0x13
 UMASK_DRAM_CMD_RAS                          0x02 0x02 0x00
-UMASK_DRAM_CMD_RAS_TRDOFF                   0x02 0x02 0x00
-UMASK_DRAM_CMD_RAS_RDPRIO                   0x02 0x02 0x01
-UMASK_DRAM_CMD_RAS_WRPRIO                   0x02 0x02 0x02
-UMASK_DRAM_CMD_RAS_ADAPTIVE                 0x02 0x02 0x02
-UMASK_DRAM_CMD_CAS_RD_OPN                   0x02 0x02 0x00
-UMASK_DRAM_CMD_CAS_WR_OPN                   0x02 0x02 0x00
-UMASK_DRAM_CMD_CAS_WR_OPN_TRDOFF            0x02 0x02 0x00
-UMASK_DRAM_CMD_CAS_WR_OPN_RDPRIO            0x04 0x02 0x01
-UMASK_DRAM_CMD_CAS_WR_OPN_WRPRIO            0x04 0x02 0x02
-UMASK_DRAM_CMD_CAS_WR_OPN_ADAPTIVE          0x04 0x02 0x03
+UMASK_DRAM_CMD_RAS_TRDOFF                   0x02 0x02 0x10
+UMASK_DRAM_CMD_RAS_RDPRIO                   0x02 0x02 0x11
+UMASK_DRAM_CMD_RAS_WRPRIO                   0x02 0x02 0x12
+UMASK_DRAM_CMD_RAS_ADAPTIVE                 0x02 0x02 0x13
+UMASK_DRAM_CMD_CAS_RD_OPN                   0x03 0x02 0x00
+UMASK_DRAM_CMD_CAS_WR_OPN                   0x04 0x02 0x00
+UMASK_DRAM_CMD_CAS_WR_OPN_TRDOFF            0x04 0x02 0x10
+UMASK_DRAM_CMD_CAS_WR_OPN_RDPRIO            0x04 0x02 0x11
+UMASK_DRAM_CMD_CAS_WR_OPN_WRPRIO            0x04 0x02 0x12
+UMASK_DRAM_CMD_CAS_WR_OPN_ADAPTIVE          0x04 0x02 0x13
 UMASK_DRAM_CMD_CAS_RD_CLS                   0x05 0x02 0x00
-UMASK_DRAM_CMD_CAS_RD_CLS_TRDOFF            0x05 0x02 0x00
-UMASK_DRAM_CMD_CAS_RD_CLS_RDPRIO            0x05 0x02 0x01
-UMASK_DRAM_CMD_CAS_RD_CLS_WRPRIO            0x05 0x02 0x02
-UMASK_DRAM_CMD_CAS_RD_CLS_ADAPTIVE          0x05 0x02 0x03
+UMASK_DRAM_CMD_CAS_RD_CLS_TRDOFF            0x05 0x02 0x10
+UMASK_DRAM_CMD_CAS_RD_CLS_RDPRIO            0x05 0x02 0x11
+UMASK_DRAM_CMD_CAS_RD_CLS_WRPRIO            0x05 0x02 0x12
+UMASK_DRAM_CMD_CAS_RD_CLS_ADAPTIVE          0x05 0x02 0x13
 UMASK_DRAM_CMD_CAS_WR_CLS                   0x06 0x02 0x00
-UMASK_DRAM_CMD_CAS_WR_CLS_TRDOFF            0x06 0x02 0x00
-UMASK_DRAM_CMD_CAS_WR_CLS_RDPRIO            0x06 0x02 0x01
-UMASK_DRAM_CMD_CAS_WR_CLS_WRPRIO            0x06 0x02 0x02
-UMASK_DRAM_CMD_CAS_WR_CLS_ADAPTIVE          0x06 0x02 0x03
+UMASK_DRAM_CMD_CAS_WR_CLS_TRDOFF            0x06 0x02 0x10
+UMASK_DRAM_CMD_CAS_WR_CLS_RDPRIO            0x06 0x02 0x11
+UMASK_DRAM_CMD_CAS_WR_CLS_WRPRIO            0x06 0x02 0x12
+UMASK_DRAM_CMD_CAS_WR_CLS_ADAPTIVE          0x06 0x02 0x13
 UMASK_DRAM_CMD_MRS                          0x07 0x02 0x00
 UMASK_DRAM_CMD_RFR                          0x09 0x02 0x00
 UMASK_DRAM_CMD_ENSR                         0x0A 0x02 0x00
@@ -688,33 +680,16 @@ UMASK_DSP_FILL_WRQ_FULL                     0x02 0x03 0x00
 UMASK_DSP_FILL_RDQ_EMPTY                    0x04 0x03 0x00
 UMASK_DSP_FILL_WRQ_EMPTY                    0x08 0x03 0x00
 
-EVENT_DRAM_MISC                          0x0B  MBOX
-UMASK_DRAM_MISC_RETRIES_ALL              0x00 0x04 0x03
-UMASK_DRAM_MISC_RETRIES_FVID             0x01 0x04 0x03
-UMASK_DRAM_MISC_VALID                    0x01 0x04 0x02
-UMASK_DRAM_MISC_NON_NOP_TRKL             0x01 0x04 0x01
-
-UMASK_DRAM_MISC_ILLEGAL                  0x00 0x04 0x00
-UMASK_DRAM_MISC_PREALL                   0x01 0x04 0x00
-UMASK_DRAM_MISC_RAS                      0x02 0x04 0x00
-UMASK_DRAM_MISC_CAS_RD_OPN               0x03 0x04 0x00
-UMASK_DRAM_MISC_CAS_WR_OPN               0x04 0x04 0x00
-UMASK_DRAM_MISC_CAS_RD_CLS               0x05 0x04 0x00
-UMASK_DRAM_MISC_CAS_WR_CLS               0x06 0x04 0x00
-UMASK_DRAM_MISC_MRS                      0x07 0x04 0x00
-UMASK_DRAM_MISC_RFR                      0x09 0x04 0x00
-UMASK_DRAM_MISC_ENSR                     0x0A 0x04 0x00
-UMASK_DRAM_MISC_EXSR                     0x0B 0x04 0x00
-UMASK_DRAM_MISC_NOP                      0x0C 0x04 0x00
-UMASK_DRAM_MISC_TRKL                     0x10 0x04 0x00
-UMASK_DRAM_MISC_PRE                      0x11 0x04 0x00
-UMASK_DRAM_MISC_SYNC                     0x12 0x04 0x00
-UMASK_DRAM_MISC_CKE_HI                   0x14 0x04 0x00
-UMASK_DRAM_MISC_CKE_LO                   0x15 0x04 0x00
-UMASK_DRAM_MISC_SOFT_RST                 0x17 0x04 0x00
-UMASK_DRAM_MISC_WR_CFG                   0x1C 0x04 0x00
-UMASK_DRAM_MISC_RD_CFG                   0x1D 0x04 0x00
-UMASK_DRAM_MISC_ZQCAL                    0x1E 0x04 0x00
+EVENT_BCMD_SCHEDQ_OCCUPANCY           0x06  MBOX
+UMASK_BCMD_SCHEDQ_OCCUPANCY_READS     0x00 0x01 0x00
+UMASK_BCMD_SCHEDQ_OCCUPANCY_WRITES    0x01 0x01 0x00
+UMASK_BCMD_SCHEDQ_OCCUPANCY_MERGE     0x02 0x01 0x00
+UMASK_BCMD_SCHEDQ_OCCUPANCY_V2F       0x03 0x01 0x00
+UMASK_BCMD_SCHEDQ_OCCUPANCY_V2V       0x04 0x01 0x00
+UMASK_BCMD_SCHEDQ_OCCUPANCY_F2V       0x05 0x01 0x00
+UMASK_BCMD_SCHEDQ_OCCUPANCY_F2B       0x06 0x01 0x00
+UMASK_BCMD_SCHEDQ_OCCUPANCY_SPRWR     0x07 0x01 0x00
+UMASK_BCMD_SCHEDQ_OCCUPANCY_ALL       0x08 0x01 0x00
 
 EVENT_FRM_TYPE                        0x09  MBOX
 UMASK_FRM_TYPE_3CMD                   0x00 0x05 0x00
@@ -750,12 +725,12 @@ UMASK_FVC_EV1_FAST_RESET              0x04 0x07 0x00
 UMASK_FVC_EV1_BBOX_CMDS_READS         0x05 0x07 0x00
 UMASK_FVC_EV1_BBOX_CMDS_WRITES        0x05 0x07 0x01
 UMASK_FVC_EV1_BBOX_RSP_ACK            0x06 0x07 0x00
-UMASK_FVC_EV1_BBOX_RSP_RETRY          0x06 0x07 0x10
-UMASK_FVC_EV1_BBOX_RSP_COR            0x06 0x07 0x20
-UMASK_FVC_EV1_BBOX_RSP_UNCOR          0x06 0x07 0x30
-UMASK_FVC_EV1_BBOX_RSP_SPEC_ACK       0x06 0x07 0x40
-UMASK_FVC_EV1_BBOX_RSP_SPR_ACK        0x06 0x07 0x50
-UMASK_FVC_EV1_BBOX_RSP_SPR_UNCORE     0x06 0x07 0x70
+UMASK_FVC_EV1_BBOX_RSP_RETRY          0x06 0x07 0x01
+UMASK_FVC_EV1_BBOX_RSP_COR            0x06 0x07 0x02
+UMASK_FVC_EV1_BBOX_RSP_UNCOR          0x06 0x07 0x03
+UMASK_FVC_EV1_BBOX_RSP_SPEC_ACK       0x06 0x07 0x04
+UMASK_FVC_EV1_BBOX_RSP_SPR_ACK        0x06 0x07 0x05
+UMASK_FVC_EV1_BBOX_RSP_SPR_UNCORE     0x06 0x07 0x07
 UMASK_FVC_EV1_SMI_NB_TRIG             0x07 0x07 0x00
 
 EVENT_FVC_EV2                         0x0F  MBOX
@@ -767,30 +742,30 @@ UMASK_FVC_EV2_FAST_RESET              0x04 0x08 0x00
 UMASK_FVC_EV2_BBOX_CMDS_READS         0x05 0x08 0x00
 UMASK_FVC_EV2_BBOX_CMDS_WRITES        0x05 0x08 0x01
 UMASK_FVC_EV2_BBOX_RSP_ACK            0x06 0x08 0x00
-UMASK_FVC_EV2_BBOX_RSP_RETRY          0x06 0x08 0x10
-UMASK_FVC_EV2_BBOX_RSP_COR            0x06 0x08 0x20
-UMASK_FVC_EV2_BBOX_RSP_UNCOR          0x06 0x08 0x30
-UMASK_FVC_EV2_BBOX_RSP_SPEC_ACK       0x06 0x08 0x40
-UMASK_FVC_EV2_BBOX_RSP_SPR_ACK        0x06 0x08 0x50
-UMASK_FVC_EV2_BBOX_RSP_SPR_UNCORE     0x06 0x08 0x70
+UMASK_FVC_EV2_BBOX_RSP_RETRY          0x06 0x08 0x01
+UMASK_FVC_EV2_BBOX_RSP_COR            0x06 0x08 0x02
+UMASK_FVC_EV2_BBOX_RSP_UNCOR          0x06 0x08 0x03
+UMASK_FVC_EV2_BBOX_RSP_SPEC_ACK       0x06 0x08 0x04
+UMASK_FVC_EV2_BBOX_RSP_SPR_ACK        0x06 0x08 0x05
+UMASK_FVC_EV2_BBOX_RSP_SPR_UNCORE     0x06 0x08 0x07
 UMASK_FVC_EV2_SMI_NB_TRIG             0x07 0x08 0x00
 
 EVENT_FVC_EV3                         0x10  MBOX
 UMASK_FVC_EV3_SMI_CRC_ERR             0x00 0x09 0x00
-UMASK_FVC_EV3_MEM_ECC_ERR             0x00 0x09 0x00
-UMASK_FVC_EV3_POISON_TXN              0x00 0x09 0x00
-UMASK_FVC_EV3_ALERT_FRAMES            0x00 0x09 0x00
-UMASK_FVC_EV3_FAST_RESET              0x00 0x09 0x00
-UMASK_FVC_EV3_BBOX_CMDS_READS         0x00 0x09 0x00
-UMASK_FVC_EV3_BBOX_CMDS_WRITES        0x00 0x09 0x01
-UMASK_FVC_EV3_BBOX_RSP_ACK            0x00 0x09 0x00
-UMASK_FVC_EV3_BBOX_RSP_RETRY          0x00 0x09 0x10
-UMASK_FVC_EV3_BBOX_RSP_COR            0x00 0x09 0x20
-UMASK_FVC_EV3_BBOX_RSP_UNCOR          0x00 0x09 0x30
-UMASK_FVC_EV3_BBOX_RSP_SPEC_ACK       0x00 0x09 0x40
-UMASK_FVC_EV3_BBOX_RSP_SPR_ACK        0x00 0x09 0x50
-UMASK_FVC_EV3_BBOX_RSP_SPR_UNCORE     0x00 0x09 0x70
-UMASK_FVC_EV3_SMI_NB_TRIG             0x00 0x09 0x00
+UMASK_FVC_EV3_MEM_ECC_ERR             0x01 0x09 0x00
+UMASK_FVC_EV3_POISON_TXN              0x02 0x09 0x00
+UMASK_FVC_EV3_ALERT_FRAMES            0x03 0x09 0x00
+UMASK_FVC_EV3_FAST_RESET              0x04 0x09 0x00
+UMASK_FVC_EV3_BBOX_CMDS_READS         0x05 0x09 0x00
+UMASK_FVC_EV3_BBOX_CMDS_WRITES        0x05 0x09 0x01
+UMASK_FVC_EV3_BBOX_RSP_ACK            0x06 0x09 0x00
+UMASK_FVC_EV3_BBOX_RSP_RETRY          0x06 0x09 0x01
+UMASK_FVC_EV3_BBOX_RSP_COR            0x06 0x09 0x02
+UMASK_FVC_EV3_BBOX_RSP_UNCOR          0x06 0x09 0x03
+UMASK_FVC_EV3_BBOX_RSP_SPEC_ACK       0x06 0x09 0x04
+UMASK_FVC_EV3_BBOX_RSP_SPR_ACK        0x06 0x09 0x05
+UMASK_FVC_EV3_BBOX_RSP_SPR_UNCORE     0x06 0x09 0x07
+UMASK_FVC_EV3_SMI_NB_TRIG             0x07 0x09 0x00
 
 EVENT_FVID_RACE                       0x18  MBOX
 UMASK_FVID_RACE                       0x00 0x00 0x00
@@ -798,9 +773,8 @@ UMASK_FVID_RACE                       0x00 0x00 0x00
 EVENT_INFLIGHT_CMDS                   0x1D  MBOX
 UMASK_INFLIGHT_CMDS                   0x00 0x00 0x00
 
-EVENT_ISS_SCHED                       0x08  MBOX
-UMASK_ISS_SCHED_CHANGES               0x00 0x0A 0x00
-UMASK_ISS_SCHED_FRAME_BEAT            0x01 0x0A 0x00
+EVENT_SCHED_MODE_CHANGES              0x08  MBOX
+UMASK_SCHED_MODE_CHANGES              0x00 0x00 0x00
 
 EVENT_MA_PAR_ERR                      0x0C  MBOX
 UMASK_MA_PAR_ERR                      0x00 0x00 0x00
@@ -808,6 +782,9 @@ UMASK_MA_PAR_ERR                      0x00 0x00 0x00
 EVENT_MULTICAS                        0x17  MBOX
 UMASK_MULTICAS                        0x00 0x00 0x00
 
+EVENT_PAGE_EMPTY                      0x15  MBOX
+UMASK_PAGE_EMPTY                      0x00 0x00 0x00
+
 EVENT_PAGE_HIT                        0x14  MBOX
 UMASK_PAGE_HIT                        0x00 0x00 0x00
 
@@ -821,9 +798,8 @@ EVENT_PGT_PAGE_EV                     0x16  MBOX
 UMASK_PGT_PAGE_EV_OPN2CLS             0x00 0x0B 0x00
 UMASK_PGT_PAGE_EV_CLS2OPN             0x01 0x0B 0x00
 
-EVENT_PGT_PAGE_EV2                    0x15  MBOX
-UMASK_PGT_PAGE_EV2_AUTO_CLS           0x00 0x0C 0x00
-UMASK_PGT_PAGE_EV2_PAGE_EMPTY         0x01 0x0C 0x00
+EVENT_RETRIES                         0x0B  MBOX
+UMASK_RETRIES_ALL                     0x00 0x00 0x00
 
 EVENT_REFRESH                         0x06  MBOX
 UMASK_REFRESH                         0x00 0x00 0x00
@@ -845,12 +821,44 @@ UMASK_THERM_TRP_DN_ALL_GT_MID_RISE    0x03 0x0D 0x00
 UMASK_THERM_TRP_DN_ALL_GT_MID_FALL    0x02 0x0D 0x00
 UMASK_THERM_TRP_DN_ALL_GT_LO          0x01 0x0D 0x00
 UMASK_THERM_TRP_DN_ALL_LT_LO          0x00 0x0D 0x00
+UMASK_THERM_TRP_DN_DIMM0_GT_MID_RISE  0x03 0x0D 0x01
+UMASK_THERM_TRP_DN_DIMM1_GT_MID_RISE  0x03 0x0D 0x02
+UMASK_THERM_TRP_DN_DIMM2_GT_MID_RISE  0x03 0x0D 0x03
+UMASK_THERM_TRP_DN_DIMM3_GT_MID_RISE  0x03 0x0D 0x04
+UMASK_THERM_TRP_DN_DIMM0_GT_MID_FALL  0x02 0x0D 0x01
+UMASK_THERM_TRP_DN_DIMM1_GT_MID_FALL  0x02 0x0D 0x02
+UMASK_THERM_TRP_DN_DIMM2_GT_MID_FALL  0x02 0x0D 0x03
+UMASK_THERM_TRP_DN_DIMM3_GT_MID_FALL  0x02 0x0D 0x04
+UMASK_THERM_TRP_DN_DIMM0_GT_LO        0x01 0x0D 0x01
+UMASK_THERM_TRP_DN_DIMM1_GT_LO        0x01 0x0D 0x02
+UMASK_THERM_TRP_DN_DIMM2_GT_LO        0x01 0x0D 0x03
+UMASK_THERM_TRP_DN_DIMM3_GT_LO        0x01 0x0D 0x04
+UMASK_THERM_TRP_DN_DIMM0_LT_LO        0x00 0x0D 0x01
+UMASK_THERM_TRP_DN_DIMM1_LT_LO        0x00 0x0D 0x02
+UMASK_THERM_TRP_DN_DIMM2_LT_LO        0x00 0x0D 0x03
+UMASK_THERM_TRP_DN_DIMM3_LT_LO        0x00 0x0D 0x04
 
 EVENT_THERM_TRP_UP                    0x04  MBOX
 UMASK_THERM_TRP_UP_ALL_GT_MID_RISE    0x03 0x0E 0x00
 UMASK_THERM_TRP_UP_ALL_GT_MID_FALL    0x02 0x0E 0x00
 UMASK_THERM_TRP_UP_ALL_GT_LO          0x01 0x0E 0x00
 UMASK_THERM_TRP_UP_ALL_LT_LO          0x00 0x0E 0x00
+UMASK_THERM_TRP_UP_DIMM0_GT_MID_RISE  0x03 0x0E 0x01
+UMASK_THERM_TRP_UP_DIMM1_GT_MID_RISE  0x03 0x0E 0x02
+UMASK_THERM_TRP_UP_DIMM2_GT_MID_RISE  0x03 0x0E 0x03
+UMASK_THERM_TRP_UP_DIMM3_GT_MID_RISE  0x03 0x0E 0x04
+UMASK_THERM_TRP_UP_DIMM0_GT_MID_FALL  0x02 0x0E 0x01
+UMASK_THERM_TRP_UP_DIMM1_GT_MID_FALL  0x02 0x0E 0x02
+UMASK_THERM_TRP_UP_DIMM2_GT_MID_FALL  0x02 0x0E 0x03
+UMASK_THERM_TRP_UP_DIMM3_GT_MID_FALL  0x02 0x0E 0x04
+UMASK_THERM_TRP_UP_DIMM0_GT_LO        0x01 0x0E 0x01
+UMASK_THERM_TRP_UP_DIMM1_GT_LO        0x01 0x0E 0x02
+UMASK_THERM_TRP_UP_DIMM2_GT_LO        0x01 0x0E 0x03
+UMASK_THERM_TRP_UP_DIMM3_GT_LO        0x01 0x0E 0x04
+UMASK_THERM_TRP_UP_DIMM0_LT_LO        0x00 0x0E 0x01
+UMASK_THERM_TRP_UP_DIMM1_LT_LO        0x00 0x0E 0x02
+UMASK_THERM_TRP_UP_DIMM2_LT_LO        0x00 0x0E 0x03
+UMASK_THERM_TRP_UP_DIMM3_LT_LO        0x00 0x0E 0x04
 
 EVENT_TRANS_CMDS                      0x12  MBOX
 UMASK_TRANS_CMDS                      0x00 0x00 0x00
@@ -859,112 +867,164 @@ EVENT_TT_CMD_CONFLICT                 0x19  MBOX
 UMASK_TT_CMD_CONFLICT                 0x00 0x00 0x00
 
 EVENT_ACK_BEFORE_LAST_SNP             0x19  BBOX0C3|BBOX1C3
-UMASK_ACK_BEFORE_LAST_SNP             0x03
+UMASK_ACK_BEFORE_LAST_SNP             0x00
 
 EVENT_ADDR_IN_MATCH             0x04  BBOX0C2|BBOX1C2
-UMASK_ADDR_IN_MATCH             0x02
+OPTIONS_ADDR_IN_MATCH           EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
+UMASK_ADDR_IN_MATCH             0x00
 
 EVENT_CONFLICTS             0x17  BBOX0C3|BBOX1C3
-UMASK_CONFLICTS             0x03
+UMASK_CONFLICTS             0x00
 
 EVENT_COHQ_BYPASS             0x0E  BBOX0C3|BBOX1C3
-UMASK_COHQ_BYPASS             0x03
+UMASK_COHQ_BYPASS             0x00
 
-EVENT_COHQ_IMT_ALLOC_WAIT             0x0E  BBOX0C3|BBOX1C3
-UMASK_COHQ_IMT_ALLOC_WAIT             0x03
+EVENT_COHQ_IMT_ALLOC_WAIT             0x13  BBOX0C3|BBOX1C3
+UMASK_COHQ_IMT_ALLOC_WAIT             0x00
 
-EVENT_DIRQ_INSERTS             0x17  BBOX0C1|BBOX1C1
-UMASK_DIRQ_INSERTS             0x01
+EVENT_DIRQ_INSERTS                  0x17  BBOX0C1|BBOX1C1
+UMASK_DIRQ_INSERTS                  0x00
 
 EVENT_DIRQ_OCCUPANCY             0x17  BBOX0C0|BBOX1C0
 UMASK_DIRQ_OCCUPANCY             0x00
 
 EVENT_DEMAND_FETCH             0x0F  BBOX0C3|BBOX1C3
-UMASK_DEMAND_FETCH             0x03
+UMASK_DEMAND_FETCH             0x00
 
 EVENT_DRSQ_INSERTS             0x09  BBOX0C1|BBOX1C1
-UMASK_DRSQ_INSERTS             0x01
+UMASK_DRSQ_INSERTS             0x00
 
 EVENT_DRSQ_OCCUPANCY             0x09  BBOX0C0|BBOX1C0
 UMASK_DRSQ_OCCUPANCY             0x00
 
 EVENT_EARLY_ACK             0x02  BBOX0C3|BBOX1C3
-UMASK_EARLY_ACK             0x03
+UMASK_EARLY_ACK             0x00
 
 EVENT_IMPLICIT_WBS             0x12  BBOX0C3|BBOX1C3
-UMASK_IMPLICIT_WBS             0x03
+UMASK_IMPLICIT_WBS             0x00
 
-EVENT_IMT_FULL             0x12  BBOX0C3|BBOX1C3
-UMASK_IMT_FULL             0x03
+EVENT_IMT_FULL             0x16  BBOX0C3|BBOX1C3
+UMASK_IMT_FULL             0x00
 
 EVENT_IMT_INSERTS_ALL             0x07  BBOX0C1|BBOX1C1
-UMASK_IMT_INSERTS_ALL             0x01
+UMASK_IMT_INSERTS_ALL             0x00
 
 EVENT_IMT_INSERTS_INVITOE             0x0F  BBOX0C1|BBOX1C1
-UMASK_IMT_INSERTS_INVITOE             0x01
+UMASK_IMT_INSERTS_INVITOE             0x00
 
 EVENT_IMT_INSERTS_IOH             0x0A  BBOX0C1|BBOX1C1
-UMASK_IMT_INSERTS_IOH             0x01
+UMASK_IMT_INSERTS_IOH             0x00
 
 EVENT_IMT_INSERTS_IOH_INVITOE             0x10  BBOX0C1|BBOX1C1
-UMASK_IMT_INSERTS_IOH_INVITOE             0x01
+UMASK_IMT_INSERTS_IOH_INVITOE             0x00
 
 EVENT_IMT_INSERTS_IOH_WR             0x0D  BBOX0C1|BBOX1C1
-UMASK_IMT_INSERTS_IOH_WR             0x01
+UMASK_IMT_INSERTS_IOH_WR             0x00
 
 EVENT_IMT_INSERTS_NON_IOH             0x0B  BBOX0C1|BBOX1C1
-UMASK_IMT_INSERTS_NON_IOH             0x01
+UMASK_IMT_INSERTS_NON_IOH             0x00
 
 EVENT_IMT_INSERTS_NON_IOH_INVITOE             0x1C  BBOX0C1|BBOX1C1
-UMASK_IMT_INSERTS_NON_IOH_INVITOE             0x01
+UMASK_IMT_INSERTS_NON_IOH_INVITOE             0x00
 
-EVENT_INSERTS_NON_IOH_RD             0x1F  BBOX0C1|BBOX1C1
-UMASK_INSERTS_NON_IOH_RD             0x01
+EVENT_IMT_INSERTS_NON_IOH_RD             0x1F  BBOX0C1|BBOX1C1
+UMASK_IMT_INSERTS_NON_IOH_RD             0x00
 
 EVENT_IMT_INSERTS_NON_IOH_WR             0x0E  BBOX0C1|BBOX1C1
-UMASK_IMT_INSERTS_NON_IOH_WR             0x01
+UMASK_IMT_INSERTS_NON_IOH_WR             0x00
 
 EVENT_IMT_INSERTS_RD             0x1D  BBOX0C1|BBOX1C1
-UMASK_IMT_INSERTS_RD             0x01
+UMASK_IMT_INSERTS_RD             0x00
 
 EVENT_IMT_INSERTS_WR             0x0C  BBOX0C1|BBOX1C1
-UMASK_IMT_INSERTS_WR             0x01
+UMASK_IMT_INSERTS_WR             0x00
 
 EVENT_IMT_NE_CYCLES             0x07  BBOX0C2|BBOX1C2
-UMASK_IMT_NE_CYCLES             0x02
+UMASK_IMT_NE_CYCLES             0x00
 
 EVENT_IMT_PREALLOC             0x06  BBOX0C3|BBOX1C3
-UMASK_IMT_PREALLOC             0x03
+UMASK_IMT_PREALLOC             0x00
 
 EVENT_IMT_VALID_OCCUPANCY             0x07  BBOX0C0|BBOX1C0
 UMASK_IMT_VALID_OCCUPANCY             0x00
 
 EVENT_MSG_ADDR_IN_MATCH             0x01  BBOX0C0|BBOX1C0
+OPTIONS_MSG_ADDR_IN_MATCH           EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
 UMASK_MSG_ADDR_IN_MATCH             0x00
 
-EVENT_MSGS_B_TO_S             0x03  BBOX0C1|BBOX1C1
-UMASK_MSGS_B_TO_S             0x01
-
 EVENT_MSGS_B_TO_S             0x03  BBOX0C2|BBOX1C2
-UMASK_MSGS_B_TO_S             0x02
+UMASK_MSGS_B_TO_S             0x00
+
+EVENT_MSGS_S_TO_B             0x02  BBOX0C2|BBOX1C2
+UMASK_MSGS_S_TO_B             0x00
 
 EVENT_MSG_IN_MATCH             0x01  BBOX0C1|BBOX1C1
-UMASK_MSG_IN_MATCH             0x01
+OPTIONS_MSG_IN_MATCH           EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
+UMASK_MSG_IN_MATCH             0x00
 
 EVENT_MSGS_IN_NON_SNP             0x01  BBOX0C2|BBOX1C2
-UMASK_MSGS_IN_NON_SNP             0x02
+UMASK_MSGS_IN_NON_SNP             0x00
 
 EVENT_MSG_OPCODE_ADDR_IN_MATCH             0x03  BBOX0C0|BBOX1C0
+OPTIONS_MSG_OPCODE_ADDR_IN_MATCH           EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
 UMASK_MSG_OPCODE_ADDR_IN_MATCH             0x00
 
 EVENT_MSG_OPCODE_IN_MATCH             0x05  BBOX0C1|BBOX1C1
-UMASK_MSG_OPCODE_IN_MATCH             0x01
+OPTIONS_MSG_OPCODE_IN_MATCH           EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
+UMASK_MSG_OPCODE_IN_MATCH             0x00
 
 EVENT_MSG_OPCODE_OUT_MATCH             0x06  BBOX0C1|BBOX1C1
-UMASK_MSG_OPCODE_OUT_MATCH             0x01
+OPTIONS_MSG_OPCODE_OUT_MATCH           EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
+UMASK_MSG_OPCODE_OUT_MATCH             0x00
 
 EVENT_MSG_OUT_MATCH             0x02  BBOX0C1|BBOX1C1
-UMASK_MSG_OUT_MATCH             0x01
+OPTIONS_MSG_OUT_MATCH           EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
+UMASK_MSG_OUT_MATCH             0x00
+
+EVENT_OPCODE_ADDR_IN_MATCH             0x02  BBOX0C0|BBOX1C0
+OPTIONS_OPCODE_ADDR_IN_MATCH           EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
+UMASK_OPCODE_ADDR_IN_MATCH             0x00
+
+EVENT_OPCODE_IN_MATCH             0x03  BBOX0C1|BBOX1C1
+OPTIONS_OPCODE_IN_MATCH           EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
+UMASK_OPCODE_IN_MATCH             0x00
+
+EVENT_OPCODE_OUT_MATCH             0x04  BBOX0C1|BBOX1C1
+OPTIONS_OPCODE_OUT_MATCH           EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
+UMASK_OPCODE_OUT_MATCH             0x00
+
+EVENT_RBOX_VNA_UNAVAIL              0x15 BBOX0C3|BBOX1C3
+UMASK_RBOX_VNA_UNAVAIL              0x00
+
+EVENT_SBOX_VN0_UNAVAIL              0x14 BBOX0C3|BBOX1C3
+UMASK_SBOX_VN0_UNAVAIL              0x00
+
+EVENT_SNPOQ_INSERTS                 0x12 BBOX0C1|BBOX1C1
+UMASK_SNPOQ_INSERTS                 0x00
+
+EVENT_SNPOQ_OCCUPANCY               0x12 BBOX0C0|BBOX1C0
+UMASK_SNPOQ_OCCUPANCY               0x00
+
+EVENT_TF_ALL                        0x04 BBOX0C0|BBOX1C0
+UMASK_TF_ALL                        0x00
+
+EVENT_TF_INVITOE                    0x06 BBOX0C0|BBOX1C0
+UMASK_TF_INVITOE                    0x00
+
+EVENT_TF_IOH                        0x0B BBOX0C0|BBOX1C0
+UMASK_TF_IOH                        0x00
+
+EVENT_TF_IOH_INVITOE                0x0F BBOX0C0|BBOX1C0
+UMASK_TF_IOH_INVITOE                0x00
+
+EVENT_TF_IOH_NON_INVITOE_RD         0x1C BBOX0C0|BBOX1C0
+UMASK_TF_IOH_NON_INVITOE_RD         0x00
+
+EVENT_TF_IOH_WR                     0x0D BBOX0C0|BBOX1C0
+UMASK_TF_IOH_WR                     0x00
+
+EVENT_TF_WR                         0x05 BBOX0C0|BBOX1C0
+UMASK_TF_WR                         0x00
 
 EVENT_ALLOC_TO_ARB                              0x00  RBOX0
 UMASK_ALLOC_TO_ARB_PORT0_IPERF0_NCB             0x00 0x01  0x09
@@ -974,7 +1034,7 @@ UMASK_ALLOC_TO_ARB_PORT0_IPERF0_NDR             0x00 0x08  0x09
 UMASK_ALLOC_TO_ARB_PORT0_IPERF0_SNP             0x00 0x10  0x09
 UMASK_ALLOC_TO_ARB_PORT0_IPERF0_HOM_VN0         0x00 0x20  0x09
 UMASK_ALLOC_TO_ARB_PORT0_IPERF0_HOM_VN1         0x00 0x40  0x09
-UMASK_ALLOC_TO_ARB_PORT0_IPERF0_ALL             0x00 0xFF  0x09
+UMASK_ALLOC_TO_ARB_PORT0_IPERF0_ALL             0x00 0x7F  0x09
 UMASK_ALLOC_TO_ARB_PORT0_IPERF1_NCB             0x01 0x01   0x09
 UMASK_ALLOC_TO_ARB_PORT0_IPERF1_NCS             0x01 0x02   0x09
 UMASK_ALLOC_TO_ARB_PORT0_IPERF1_DRS_VN01        0x01 0x04   0x09
@@ -982,7 +1042,7 @@ UMASK_ALLOC_TO_ARB_PORT0_IPERF1_NDR             0x01 0x08   0x09
 UMASK_ALLOC_TO_ARB_PORT0_IPERF1_SNP             0x01 0x10   0x09
 UMASK_ALLOC_TO_ARB_PORT0_IPERF1_HOM_VN0         0x01 0x20   0x09
 UMASK_ALLOC_TO_ARB_PORT0_IPERF1_HOM_VN1         0x01 0x40   0x09
-UMASK_ALLOC_TO_ARB_PORT0_IPERF1_ALL             0x01 0xFF   0x09
+UMASK_ALLOC_TO_ARB_PORT0_IPERF1_ALL             0x01 0x7F   0x09
 UMASK_ALLOC_TO_ARB_PORT1_IPERF0_NCB             0x06 0x01   0x09
 UMASK_ALLOC_TO_ARB_PORT1_IPERF0_NCS             0x06 0x02   0x09
 UMASK_ALLOC_TO_ARB_PORT1_IPERF0_DRS_VN01        0x06 0x04   0x09
@@ -990,7 +1050,7 @@ UMASK_ALLOC_TO_ARB_PORT1_IPERF0_NDR             0x06 0x08   0x09
 UMASK_ALLOC_TO_ARB_PORT1_IPERF0_SNP             0x06 0x10   0x09
 UMASK_ALLOC_TO_ARB_PORT1_IPERF0_HOM_VN0         0x06 0x20   0x09
 UMASK_ALLOC_TO_ARB_PORT1_IPERF0_HOM_VN1         0x06 0x40   0x09
-UMASK_ALLOC_TO_ARB_PORT1_IPERF0_ALL             0x06 0xFF   0x09
+UMASK_ALLOC_TO_ARB_PORT1_IPERF0_ALL             0x06 0x7F   0x09
 UMASK_ALLOC_TO_ARB_PORT1_IPERF1_NCB             0x07 0x01   0x09
 UMASK_ALLOC_TO_ARB_PORT1_IPERF1_NCS             0x07 0x02   0x09
 UMASK_ALLOC_TO_ARB_PORT1_IPERF1_DRS_VN01        0x07 0x04   0x09
@@ -998,7 +1058,7 @@ UMASK_ALLOC_TO_ARB_PORT1_IPERF1_NDR             0x07 0x08   0x09
 UMASK_ALLOC_TO_ARB_PORT1_IPERF1_SNP             0x07 0x10   0x09
 UMASK_ALLOC_TO_ARB_PORT1_IPERF1_HOM_VN0         0x07 0x20   0x09
 UMASK_ALLOC_TO_ARB_PORT1_IPERF1_HOM_VN1         0x07 0x40   0x09
-UMASK_ALLOC_TO_ARB_PORT1_IPERF1_ALL             0x07 0xFF   0x09
+UMASK_ALLOC_TO_ARB_PORT1_IPERF1_ALL             0x07 0x7F   0x09
 UMASK_ALLOC_TO_ARB_PORT2_IPERF0_NCB             0x0C 0x01   0x09
 UMASK_ALLOC_TO_ARB_PORT2_IPERF0_NCS             0x0C 0x02   0x09
 UMASK_ALLOC_TO_ARB_PORT2_IPERF0_DRS_VN01        0x0C 0x04   0x09
@@ -1006,7 +1066,7 @@ UMASK_ALLOC_TO_ARB_PORT2_IPERF0_NDR             0x0C 0x08   0x09
 UMASK_ALLOC_TO_ARB_PORT2_IPERF0_SNP             0x0C 0x10   0x09
 UMASK_ALLOC_TO_ARB_PORT2_IPERF0_HOM_VN0         0x0C 0x20   0x09
 UMASK_ALLOC_TO_ARB_PORT2_IPERF0_HOM_VN1         0x0C 0x40   0x09
-UMASK_ALLOC_TO_ARB_PORT2_IPERF0_ALL             0x0C 0xFF   0x09
+UMASK_ALLOC_TO_ARB_PORT2_IPERF0_ALL             0x0C 0x7F   0x09
 UMASK_ALLOC_TO_ARB_PORT2_IPERF1_NCB             0x0D 0x01   0x09
 UMASK_ALLOC_TO_ARB_PORT2_IPERF1_NCS             0x0D 0x02   0x09
 UMASK_ALLOC_TO_ARB_PORT2_IPERF1_DRS_VN01        0x0D 0x04   0x09
@@ -1014,7 +1074,7 @@ UMASK_ALLOC_TO_ARB_PORT2_IPERF1_NDR             0x0D 0x08   0x09
 UMASK_ALLOC_TO_ARB_PORT2_IPERF1_SNP             0x0D 0x10   0x09
 UMASK_ALLOC_TO_ARB_PORT2_IPERF1_HOM_VN0         0x0D 0x20   0x09
 UMASK_ALLOC_TO_ARB_PORT2_IPERF1_HOM_VN1         0x0D 0x40   0x09
-UMASK_ALLOC_TO_ARB_PORT2_IPERF1_ALL             0x0D 0xFF   0x09
+UMASK_ALLOC_TO_ARB_PORT2_IPERF1_ALL             0x0D 0x7F   0x09
 UMASK_ALLOC_TO_ARB_PORT3_IPERF0_NCB             0x12 0x01   0x09
 UMASK_ALLOC_TO_ARB_PORT3_IPERF0_NCS             0x12 0x02   0x09
 UMASK_ALLOC_TO_ARB_PORT3_IPERF0_DRS_VN01        0x12 0x04   0x09
@@ -1022,7 +1082,7 @@ UMASK_ALLOC_TO_ARB_PORT3_IPERF0_NDR             0x12 0x08   0x09
 UMASK_ALLOC_TO_ARB_PORT3_IPERF0_SNP             0x12 0x10   0x09
 UMASK_ALLOC_TO_ARB_PORT3_IPERF0_HOM_VN0         0x12 0x20   0x09
 UMASK_ALLOC_TO_ARB_PORT3_IPERF0_HOM_VN1         0x12 0x40   0x09
-UMASK_ALLOC_TO_ARB_PORT3_IPERF0_ALL             0x12 0xFF   0x09
+UMASK_ALLOC_TO_ARB_PORT3_IPERF0_ALL             0x12 0x7F   0x09
 UMASK_ALLOC_TO_ARB_PORT3_IPERF1_NCB             0x13 0x01   0x09
 UMASK_ALLOC_TO_ARB_PORT3_IPERF1_NCS             0x13 0x02   0x09
 UMASK_ALLOC_TO_ARB_PORT3_IPERF1_DRS_VN01        0x13 0x04   0x09
@@ -1030,7 +1090,7 @@ UMASK_ALLOC_TO_ARB_PORT3_IPERF1_NDR             0x13 0x08   0x09
 UMASK_ALLOC_TO_ARB_PORT3_IPERF1_SNP             0x13 0x10   0x09
 UMASK_ALLOC_TO_ARB_PORT3_IPERF1_HOM_VN0         0x13 0x20   0x09
 UMASK_ALLOC_TO_ARB_PORT3_IPERF1_HOM_VN1         0x13 0x40   0x09
-UMASK_ALLOC_TO_ARB_PORT3_IPERF1_ALL             0x13 0xFF   0x09
+UMASK_ALLOC_TO_ARB_PORT3_IPERF1_ALL             0x13 0x7F   0x09
 
 EVENT_ALLOC_TO_ARB                              0x00  RBOX1
 UMASK_ALLOC_TO_ARB_PORT4_IPERF0_NCB             0x00 0x01  0x09
@@ -1040,7 +1100,7 @@ UMASK_ALLOC_TO_ARB_PORT4_IPERF0_NDR             0x00 0x08  0x09
 UMASK_ALLOC_TO_ARB_PORT4_IPERF0_SNP             0x00 0x10  0x09
 UMASK_ALLOC_TO_ARB_PORT4_IPERF0_HOM_VN0         0x00 0x20  0x09
 UMASK_ALLOC_TO_ARB_PORT4_IPERF0_HOM_VN1         0x00 0x40  0x09
-UMASK_ALLOC_TO_ARB_PORT4_IPERF0_ALL             0x00 0xFF  0x09
+UMASK_ALLOC_TO_ARB_PORT4_IPERF0_ALL             0x00 0x7F  0x09
 UMASK_ALLOC_TO_ARB_PORT4_IPERF1_NCB             0x01 0x01   0x09
 UMASK_ALLOC_TO_ARB_PORT4_IPERF1_NCS             0x01 0x02   0x09
 UMASK_ALLOC_TO_ARB_PORT4_IPERF1_DRS_VN01        0x01 0x04   0x09
@@ -1048,7 +1108,7 @@ UMASK_ALLOC_TO_ARB_PORT4_IPERF1_NDR             0x01 0x08   0x09
 UMASK_ALLOC_TO_ARB_PORT4_IPERF1_SNP             0x01 0x10   0x09
 UMASK_ALLOC_TO_ARB_PORT4_IPERF1_HOM_VN0         0x01 0x20   0x09
 UMASK_ALLOC_TO_ARB_PORT4_IPERF1_HOM_VN1         0x01 0x40   0x09
-UMASK_ALLOC_TO_ARB_PORT4_IPERF1_ALL             0x01 0xFF   0x09
+UMASK_ALLOC_TO_ARB_PORT4_IPERF1_ALL             0x01 0x7F   0x09
 UMASK_ALLOC_TO_ARB_PORT5_IPERF0_NCB             0x06 0x01   0x09
 UMASK_ALLOC_TO_ARB_PORT5_IPERF0_NCS             0x06 0x02   0x09
 UMASK_ALLOC_TO_ARB_PORT5_IPERF0_DRS_VN01        0x06 0x04   0x09
@@ -1056,7 +1116,7 @@ UMASK_ALLOC_TO_ARB_PORT5_IPERF0_NDR             0x06 0x08   0x09
 UMASK_ALLOC_TO_ARB_PORT5_IPERF0_SNP             0x06 0x10   0x09
 UMASK_ALLOC_TO_ARB_PORT5_IPERF0_HOM_VN0         0x06 0x20   0x09
 UMASK_ALLOC_TO_ARB_PORT5_IPERF0_HOM_VN1         0x06 0x40   0x09
-UMASK_ALLOC_TO_ARB_PORT5_IPERF0_ALL             0x06 0xFF   0x09
+UMASK_ALLOC_TO_ARB_PORT5_IPERF0_ALL             0x06 0x7F   0x09
 UMASK_ALLOC_TO_ARB_PORT5_IPERF1_NCB             0x07 0x01   0x09
 UMASK_ALLOC_TO_ARB_PORT5_IPERF1_NCS             0x07 0x02   0x09
 UMASK_ALLOC_TO_ARB_PORT5_IPERF1_DRS_VN01        0x07 0x04   0x09
@@ -1064,7 +1124,7 @@ UMASK_ALLOC_TO_ARB_PORT5_IPERF1_NDR             0x07 0x08   0x09
 UMASK_ALLOC_TO_ARB_PORT5_IPERF1_SNP             0x07 0x10   0x09
 UMASK_ALLOC_TO_ARB_PORT5_IPERF1_HOM_VN0         0x07 0x20   0x09
 UMASK_ALLOC_TO_ARB_PORT5_IPERF1_HOM_VN1         0x07 0x40   0x09
-UMASK_ALLOC_TO_ARB_PORT5_IPERF1_ALL             0x07 0xFF   0x09
+UMASK_ALLOC_TO_ARB_PORT5_IPERF1_ALL             0x07 0x7F   0x09
 UMASK_ALLOC_TO_ARB_PORT6_IPERF0_NCB             0x0C 0x01   0x09
 UMASK_ALLOC_TO_ARB_PORT6_IPERF0_NCS             0x0C 0x02   0x09
 UMASK_ALLOC_TO_ARB_PORT6_IPERF0_DRS_VN01        0x0C 0x04   0x09
@@ -1072,7 +1132,7 @@ UMASK_ALLOC_TO_ARB_PORT6_IPERF0_NDR             0x0C 0x08   0x09
 UMASK_ALLOC_TO_ARB_PORT6_IPERF0_SNP             0x0C 0x10   0x09
 UMASK_ALLOC_TO_ARB_PORT6_IPERF0_HOM_VN0         0x0C 0x20   0x09
 UMASK_ALLOC_TO_ARB_PORT6_IPERF0_HOM_VN1         0x0C 0x40   0x09
-UMASK_ALLOC_TO_ARB_PORT6_IPERF0_ALL             0x0C 0xFF   0x09
+UMASK_ALLOC_TO_ARB_PORT6_IPERF0_ALL             0x0C 0x7F   0x09
 UMASK_ALLOC_TO_ARB_PORT6_IPERF1_NCB             0x0D 0x01   0x09
 UMASK_ALLOC_TO_ARB_PORT6_IPERF1_NCS             0x0D 0x02   0x09
 UMASK_ALLOC_TO_ARB_PORT6_IPERF1_DRS_VN01        0x0D 0x04   0x09
@@ -1080,7 +1140,7 @@ UMASK_ALLOC_TO_ARB_PORT6_IPERF1_NDR             0x0D 0x08   0x09
 UMASK_ALLOC_TO_ARB_PORT6_IPERF1_SNP             0x0D 0x10   0x09
 UMASK_ALLOC_TO_ARB_PORT6_IPERF1_HOM_VN0         0x0D 0x20   0x09
 UMASK_ALLOC_TO_ARB_PORT6_IPERF1_HOM_VN1         0x0D 0x40   0x09
-UMASK_ALLOC_TO_ARB_PORT6_IPERF1_ALL             0x0D 0xFF   0x09
+UMASK_ALLOC_TO_ARB_PORT6_IPERF1_ALL             0x0D 0x7F   0x09
 UMASK_ALLOC_TO_ARB_PORT7_IPERF0_NCB             0x12 0x01   0x09
 UMASK_ALLOC_TO_ARB_PORT7_IPERF0_NCS             0x12 0x02   0x09
 UMASK_ALLOC_TO_ARB_PORT7_IPERF0_DRS_VN01        0x12 0x04   0x09
@@ -1088,7 +1148,7 @@ UMASK_ALLOC_TO_ARB_PORT7_IPERF0_NDR             0x12 0x08   0x09
 UMASK_ALLOC_TO_ARB_PORT7_IPERF0_SNP             0x12 0x10   0x09
 UMASK_ALLOC_TO_ARB_PORT7_IPERF0_HOM_VN0         0x12 0x20   0x09
 UMASK_ALLOC_TO_ARB_PORT7_IPERF0_HOM_VN1         0x12 0x40   0x09
-UMASK_ALLOC_TO_ARB_PORT7_IPERF0_ALL             0x12 0xFF   0x09
+UMASK_ALLOC_TO_ARB_PORT7_IPERF0_ALL             0x12 0x7F   0x09
 UMASK_ALLOC_TO_ARB_PORT7_IPERF1_NCB             0x13 0x01   0x09
 UMASK_ALLOC_TO_ARB_PORT7_IPERF1_NCS             0x13 0x02   0x09
 UMASK_ALLOC_TO_ARB_PORT7_IPERF1_DRS_VN01        0x13 0x04   0x09
@@ -1096,7 +1156,7 @@ UMASK_ALLOC_TO_ARB_PORT7_IPERF1_NDR             0x13 0x08   0x09
 UMASK_ALLOC_TO_ARB_PORT7_IPERF1_SNP             0x13 0x10   0x09
 UMASK_ALLOC_TO_ARB_PORT7_IPERF1_HOM_VN0         0x13 0x20   0x09
 UMASK_ALLOC_TO_ARB_PORT7_IPERF1_HOM_VN1         0x13 0x40   0x09
-UMASK_ALLOC_TO_ARB_PORT7_IPERF1_ALL             0x13 0xFF   0x09
+UMASK_ALLOC_TO_ARB_PORT7_IPERF1_ALL             0x13 0x7F   0x09
 
 
 EVENT_EOT_INSERTS                             0x00  RBOX0
@@ -2236,7 +2296,7 @@ UMASK_QUE_ARB_BID_PORT0_QLX0_HOM        0x02 0x00 0x00
 UMASK_QUE_ARB_BID_PORT0_QLX0_SNP        0x02 0x00 0x01
 UMASK_QUE_ARB_BID_PORT0_QLX0_NDR        0x02 0x00 0x02
 UMASK_QUE_ARB_BID_PORT0_QLX0_NCS        0x02 0x00 0x03
-UMASK_QUE_ARB_BID_PORT0_QLX0_DRS        0x02 0x00 0x02
+UMASK_QUE_ARB_BID_PORT0_QLX0_DRS        0x02 0x00 0x04
 UMASK_QUE_ARB_BID_PORT0_QLX0_NCB        0x02 0x00 0x05
 UMASK_QUE_ARB_BID_PORT0_QLX1_HOM        0x03 0x00 0x00
 UMASK_QUE_ARB_BID_PORT0_QLX1_SNP        0x03 0x00 0x01
@@ -3313,6 +3373,7 @@ EVENT_TO_R_NDR_MSGQ_OCCUPANCY                   0x0D SBOX
 UMASK_TO_R_NDR_MSGQ_OCCUPANCY                   0x00
 
 EVENT_TO_R_PROG_EV                              0x00 SBOX
+OPTIONS_TO_R_PROG_EV                            EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
 UMASK_TO_R_PROG_EV                              0x00
 
 EVENT_TO_R_B_REQUESTS                           0x6C SBOX
diff --git a/src/includes/perfmon_nehalem_counters.h b/src/includes/perfmon_nehalem_counters.h
index d3831c1..538d87f 100644
--- a/src/includes/perfmon_nehalem_counters.h
+++ b/src/includes/perfmon_nehalem_counters.h
@@ -3,15 +3,16 @@
  *
  *      Filename:  perfmon_nehalem_counters.h
  *
- *      Description:  Counter Header File of perfmon module for Nehalem.
+ *      Description:  Counter Header File of perfmon module for Intel Nehalem.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -29,27 +30,38 @@
  */
 
 #define NUM_COUNTERS_CORE_NEHALEM 7
-#define NUM_COUNTERS_UNCORE_NEHALEM 15
-#define NUM_COUNTERS_NEHALEM 15
+#define NUM_COUNTERS_UNCORE_NEHALEM 16
+#define NUM_COUNTERS_NEHALEM 16
 
-static PerfmonCounterMap nehalem_counter_map[NUM_COUNTERS_NEHALEM] = {
+#define NEH_VALID_OPTIONS_FIXED EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_ANYTHREAD_MASK
+#define NEH_VALID_OPTIONS_PMC EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_ANYTHREAD_MASK|EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK
+#define NEH_VALID_OPTIONS_UNCORE EVENT_OPTION_ANYTHREAD_MASK|EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_OPCODE_MASK
+
+static RegisterMap nehalem_counter_map[NUM_COUNTERS_NEHALEM] = {
     /* Fixed Counters: instructions retired, cycles unhalted core */
-    {"FIXC0",PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0},
-    {"FIXC1",PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0},
-    {"FIXC2",PMC2, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR2, 0, 0},
+    {"FIXC0",PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0, NEH_VALID_OPTIONS_FIXED},
+    {"FIXC1",PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0, NEH_VALID_OPTIONS_FIXED},
+    {"FIXC2",PMC2, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR2, 0, 0, NEH_VALID_OPTIONS_FIXED},
     /* PMC Counters: 4 48bit wide */
-    {"PMC0",PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0},
-    {"PMC1",PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0},
-    {"PMC2",PMC5, PMC, MSR_PERFEVTSEL2, MSR_PMC2, 0, 0},
-    {"PMC3",PMC6, PMC, MSR_PERFEVTSEL3, MSR_PMC3, 0, 0},
+    {"PMC0",PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0, NEH_VALID_OPTIONS_PMC},
+    {"PMC1",PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0, NEH_VALID_OPTIONS_PMC},
+    {"PMC2",PMC5, PMC, MSR_PERFEVTSEL2, MSR_PMC2, 0, 0, NEH_VALID_OPTIONS_PMC},
+    {"PMC3",PMC6, PMC, MSR_PERFEVTSEL3, MSR_PMC3, 0, 0, NEH_VALID_OPTIONS_PMC},
     /* Uncore PMC Counters: 8 48bit wide */
-    {"UPMC0",PMC7,  UNCORE, MSR_UNCORE_PERFEVTSEL0, MSR_UNCORE_PMC0, 0, 0},
-    {"UPMC1",PMC8,  UNCORE, MSR_UNCORE_PERFEVTSEL1, MSR_UNCORE_PMC1, 0, 0},
-    {"UPMC2",PMC9,  UNCORE, MSR_UNCORE_PERFEVTSEL2, MSR_UNCORE_PMC2, 0, 0},
-    {"UPMC3",PMC10, UNCORE, MSR_UNCORE_PERFEVTSEL3, MSR_UNCORE_PMC3, 0, 0},
-    {"UPMC4",PMC11, UNCORE, MSR_UNCORE_PERFEVTSEL4, MSR_UNCORE_PMC4, 0, 0},
-    {"UPMC5",PMC12, UNCORE, MSR_UNCORE_PERFEVTSEL5, MSR_UNCORE_PMC5, 0, 0},
-    {"UPMC6",PMC13, UNCORE, MSR_UNCORE_PERFEVTSEL6, MSR_UNCORE_PMC6, 0, 0},
-    {"UPMC7",PMC14, UNCORE, MSR_UNCORE_PERFEVTSEL7, MSR_UNCORE_PMC7, 0, 0}
+    {"UPMC0",PMC7,  UNCORE, MSR_UNCORE_PERFEVTSEL0, MSR_UNCORE_PMC0, 0, 0, NEH_VALID_OPTIONS_UNCORE},
+    {"UPMC1",PMC8,  UNCORE, MSR_UNCORE_PERFEVTSEL1, MSR_UNCORE_PMC1, 0, 0, NEH_VALID_OPTIONS_UNCORE},
+    {"UPMC2",PMC9,  UNCORE, MSR_UNCORE_PERFEVTSEL2, MSR_UNCORE_PMC2, 0, 0, NEH_VALID_OPTIONS_UNCORE},
+    {"UPMC3",PMC10, UNCORE, MSR_UNCORE_PERFEVTSEL3, MSR_UNCORE_PMC3, 0, 0, NEH_VALID_OPTIONS_UNCORE},
+    {"UPMC4",PMC11, UNCORE, MSR_UNCORE_PERFEVTSEL4, MSR_UNCORE_PMC4, 0, 0, NEH_VALID_OPTIONS_UNCORE},
+    {"UPMC5",PMC12, UNCORE, MSR_UNCORE_PERFEVTSEL5, MSR_UNCORE_PMC5, 0, 0, NEH_VALID_OPTIONS_UNCORE},
+    {"UPMC6",PMC13, UNCORE, MSR_UNCORE_PERFEVTSEL6, MSR_UNCORE_PMC6, 0, 0, NEH_VALID_OPTIONS_UNCORE},
+    {"UPMC7",PMC14, UNCORE, MSR_UNCORE_PERFEVTSEL7, MSR_UNCORE_PMC7, 0, 0, NEH_VALID_OPTIONS_UNCORE},
+    {"UPMCFIX",PMC15, UNCORE, MSR_UNCORE_FIXED_CTR_CTRL, MSR_UNCORE_FIXED_CTR0, 0, 0, EVENT_OPTION_NONE_MASK}
+};
+
+static BoxMap nehalem_box_map[NUM_UNITS] = {
+    [PMC] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, -1, 0, 0, 48},
+    [FIXED] =  {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, -1, 0, 0, 48},
+    [UNCORE] = {MSR_UNCORE_PERF_GLOBAL_CTRL, MSR_UNCORE_PERF_GLOBAL_STATUS, MSR_UNCORE_PERF_GLOBAL_OVF_CTRL, -1, 0, 0, 48}
 };
 
diff --git a/src/includes/perfmon_nehalem_events.txt b/src/includes/perfmon_nehalem_events.txt
index 0eeed50..f6f88b0 100644
--- a/src/includes/perfmon_nehalem_events.txt
+++ b/src/includes/perfmon_nehalem_events.txt
@@ -1,16 +1,17 @@
 # =======================================================================================
-#  
+#
 #      Filename:  perfmon_nehalem_events.txt
-# 
+#
 #      Description:  Event list for Intel Nehalem
-# 
-#      Version:   3.1.3
-#      Released:  4.11.2014
-# 
-#      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+#
+#      Version:   4.0
+#      Released:  16.6.2015
+#
+#      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+#                Thomas Roehl (tr), thomas.roehl at googlemail.com
 #      Project:  likwid
 #
-#      Copyright (C) 2014 Jan Treibig
+#      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
 #
 #      This program is free software: you can redistribute it and/or modify it under
 #      the terms of the GNU General Public License as published by the Free Software
@@ -61,7 +62,7 @@ UMASK_STORE_BLOCK_ANY            0x0F
 EVENT_PARTIAL_ADDRESS_ALIAS      0x07  PMC
 UMASK_PARTIAL_ADDRESS_ALIAS      0x01
 
-EVENT_DTLB_LOAD_MISSES                0x08  PMC
+EVENT_DTLB_LOAD_MISSES                 0x08  PMC
 UMASK_DTLB_LOAD_MISSES_ANY             0x01
 UMASK_DTLB_LOAD_MISSES_WALK_COMPLETED  0x02
 UMASK_DTLB_LOAD_MISSES_STLB_HIT        0x10
@@ -531,6 +532,13 @@ UMASK_SIMD_INT_64_PACKED_LOGICAL        0x10
 UMASK_SIMD_INT_64_PACKED_ARITH          0x20
 UMASK_SIMD_INT_64_SHUFFLE_MOVE          0x40
 
+EVENT_OFFCORE_RESPONSE_0                            0xB7 PMC
+OPTIONS_OFFCORE_RESPONSE_0_OPTIONS                  EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_0_OPTIONS                    0x01 0xFF 0xFF
+
+EVENT_UNCORE_CLOCKTICKS                 0x00 UPMCFIX
+UMASK_UNCORE_CLOCKTICKS                 0x00
+
 EVENT_UNC_GQ_CYCLES_FULL                0x00   UPMC
 UMASK_UNC_GQ_CYCLES_FULL_READ_TRACKER         0x01
 UMASK_UNC_GQ_CYCLES_FULL_WRITE_TRACKER        0x02
@@ -720,6 +728,12 @@ UMASK_UNC_QMC_PRIORITY_UPDATES_ANY            0x07
 EVENT_UNC_QHL_FRC_ACK_CNFLTS_LOCAL            0x33   UPMC
 UMASK_UNC_QHL_FRC_ACK_CNFLTS_LOCAL            0x04
 
+EVENT_UNC_ADDR_OPCODE_MATCH                   0x35  UPMC
+OPTIONS_UNC_ADDR_OPCODE_MATCH_OR              EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_OPCODE_MASK
+UMASK_UNC_ADDR_OPCODE_MATCH_OR                0x00 0x06
+OPTIONS_UNC_ADDR_OPCODE_MATCH_AND             EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_OPCODE_MASK
+UMASK_UNC_ADDR_OPCODE_MATCH_AND               0x01 0x01
+
 EVENT_UNC_QPI_TX_STALLED_SINGLE_FLIT            0x40   UPMC
 UMASK_UNC_QPI_TX_STALLED_SINGLE_FLIT_HOME_LINK_0      0x01
 UMASK_UNC_QPI_TX_STALLED_SINGLE_FLIT_SNOOP_LINK_0     0x02
@@ -788,4 +802,3 @@ EVENT_UNC_DRAM_PRE_ALL                  0x66   UPMC
 UMASK_UNC_DRAM_PRE_ALL_CH0              0x01
 UMASK_UNC_DRAM_PRE_ALL_CH1              0x02
 UMASK_UNC_DRAM_PRE_ALL_CH2              0x04
-
diff --git a/src/includes/perfmon_p6_events.txt b/src/includes/perfmon_p6_events.txt
index 0db8338..45be4ed 100644
--- a/src/includes/perfmon_p6_events.txt
+++ b/src/includes/perfmon_p6_events.txt
@@ -1,16 +1,17 @@
 # =======================================================================================
-#  
+#
 #      Filename:  perfmon_p6_events.txt
-# 
-#      Description:  Event list for Pentium 3
-# 
-#      Version:   3.1.3
-#      Released:  4.11.2014
-# 
-#      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+#
+#      Description:  Event list for Intel Pentium 3
+#
+#      Version:   4.0
+#      Released:  16.6.2015
+#
+#      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+#                Thomas Roehl (tr), thomas.roehl at googlemail.com
 #      Project:  likwid
 #
-#      Copyright (C) 2014 Jan Treibig
+#      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
 #
 #      This program is free software: you can redistribute it and/or modify it under
 #      the terms of the GNU General Public License as published by the Free Software
diff --git a/src/includes/perfmon_phi.h b/src/includes/perfmon_phi.h
index 0f5dd54..7901f2e 100644
--- a/src/includes/perfmon_phi.h
+++ b/src/includes/perfmon_phi.h
@@ -3,15 +3,16 @@
  *
  *      Filename:  perfmon_phi.h
  *
- *      Description:  Header File of perfmon module for Xeon Phi.
+ *      Description:  Header File of perfmon module for Intel Xeon Phi.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -29,127 +30,201 @@
  */
 
 #include <perfmon_phi_events.h>
-#include <perfmon_phi_groups.h>
 #include <perfmon_phi_counters.h>
+#include <error.h>
+#include <affinity.h>
 
 static int perfmon_numCountersPhi = NUM_COUNTERS_PHI;
-static int perfmon_numGroupsPhi = NUM_GROUPS_PHI;
 static int perfmon_numArchEventsPhi = NUM_ARCH_EVENTS_PHI;
 
-void perfmon_init_phi(PerfmonThread *thread)
+int perfmon_init_phi(int cpu_id)
 {
-    uint32_t flags = 0x0UL;
-    int cpu_id = thread->processorId;
-
-    msr_write(cpu_id, MSR_MIC_PERFEVTSEL0, 0x0UL);
-    msr_write(cpu_id, MSR_MIC_PERFEVTSEL1, 0x0UL);
-    msr_write(cpu_id, MSR_MIC_PMC0, 0x0ULL);
-    msr_write(cpu_id, MSR_MIC_PMC1, 0x0ULL);
-    msr_write(cpu_id, MSR_MIC_SPFLT_CONTROL, 0x0ULL);
-    msr_write(cpu_id, MSR_MIC_PERF_GLOBAL_CTRL, 0x0ULL);
-    msr_write(cpu_id, MSR_MIC_PERF_GLOBAL_OVF_CTRL, 0x0ULL);
-
-    flags |= (1<<16);  /* user mode flag */
-    flags |= (1<<22);  /* enable flag */
-
-    msr_write(cpu_id, MSR_MIC_PERFEVTSEL0, flags);
-    msr_write(cpu_id, MSR_MIC_PERFEVTSEL1, flags);
+    return 0;
 }
 
-void perfmon_setupCounterThread_phi(
-        int thread_id,
-        PerfmonEvent* event,
-        PerfmonCounterIndex index)
+int phi_pmc_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
 {
     uint64_t flags = 0x0ULL;
-    uint64_t reg = phi_counter_map[index].configRegister;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
-    perfmon_threadData[thread_id].counters[index].init = TRUE;
 
-    if (phi_counter_map[index].type == PMC)
+    flags |= (1ULL<<16)|(1ULL<<22);
+    flags |= (event->umask<<8) + event->eventId;
+    if (event->numberOfOptions > 0)
     {
-        flags = (1<<22)|(1<<16);
-
-        /* Intel with standard 8 bit event mask: [7:0] */
-        flags |= (event->umask<<8) + event->eventId;
-
-        msr_write(cpu_id, reg , flags);
-
-        if (perfmon_verbose)
+        for(int j=0;j<event->numberOfOptions;j++)
         {
-            printf("[%d] perfmon_setup_counter: Write Register 0x%llX , Flags: 0x%llX \n",
-                    cpu_id,
-                    LLU_CAST reg,
-                    LLU_CAST flags);
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_COUNT_KERNEL:
+                    flags |= (1ULL<<17);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_ANYTHREAD:
+                    flags |= (1ULL<<21);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= (event->options[j].value & 0xFFULL) <<24;
+                    break;
+                default:
+                    break;
+            }
         }
     }
+    VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_PMC);
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+    return 0;
 }
 
-void perfmon_startCountersThread_phi(int thread_id)
+int perfmon_setupCounterThread_phi(int thread_id, PerfmonEventSet* eventSet)
 {
-    uint64_t flags = 0ULL;
-    int processorId = perfmon_threadData[thread_id].processorId;
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
-    msr_write(processorId, MSR_MIC_PERF_GLOBAL_CTRL, 0x0ULL);
-
-    for ( int i=0; i<NUM_COUNTERS_PHI; i++ )
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        if (perfmon_threadData[thread_id].counters[i].init == TRUE) 
+        RegisterType type = eventSet->events[i].type;
+        if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+        {
+            continue;
+        }
+        RegisterIndex index = eventSet->events[i].index;
+        PerfmonEvent *event = &(eventSet->events[i].event);
+        if (type == PMC)
         {
-            msr_write(processorId, phi_counter_map[i].counterRegister , 0x0ULL);
-            flags |= (1<<(i));  /* enable counter */
+            phi_pmc_setup(cpu_id, index, event);
+            eventSet->events[i].threadCounter[thread_id].init = TRUE;
         }
     }
+    return 0;
+}
+
+int perfmon_startCountersThread_phi(int thread_id, PerfmonEventSet* eventSet)
+{
+    uint64_t flags = 0x0ULL;
+    int cpu_id = groupSet->threads[thread_id].processorId;
+
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_MIC_PERF_GLOBAL_CTRL, 0x0ULL));
 
-    if (perfmon_verbose)
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        printf("perfmon_start_counters: Write Register 0x%X , \
-                Flags: 0x%llX \n",MSR_MIC_PERF_GLOBAL_CTRL, LLU_CAST flags);
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE) 
+        {
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+            {
+                continue;
+            }
+            RegisterIndex index = eventSet->events[i].index;
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].counterRegister , 0x0ULL));
+            flags |= (1ULL<<(index));  /* enable counter */
+        }
     }
 
-    msr_write(processorId, MSR_MIC_PERF_GLOBAL_CTRL, flags);
-    flags |= (1ULL<<63);
-    msr_write(processorId, MSR_MIC_SPFLT_CONTROL, flags);
-    msr_write(processorId, MSR_MIC_PERF_GLOBAL_OVF_CTRL, 0x000000003ULL);
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_MIC_PERF_GLOBAL_CTRL, flags));
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_MIC_SPFLT_CONTROL, flags|(1ULL<<63)));
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_MIC_PERF_GLOBAL_OVF_CTRL, flags));
+    return 0;
 }
 
-void perfmon_stopCountersThread_phi(int thread_id)
+int perfmon_stopCountersThread_phi(int thread_id, PerfmonEventSet* eventSet)
 {
-    uint64_t flags;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
+    uint64_t counter_result = 0ULL;
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
-    msr_write(cpu_id, MSR_MIC_SPFLT_CONTROL, 0x0ULL);
-    msr_write(cpu_id, MSR_MIC_PERF_GLOBAL_CTRL, 0x0ULL);
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_MIC_SPFLT_CONTROL, 0x0ULL));
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_MIC_PERF_GLOBAL_CTRL, 0x0ULL));
 
-    for ( int i=0; i<NUM_COUNTERS_PHI; i++ )
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        if (perfmon_threadData[thread_id].counters[i].init == TRUE)
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
         {
-            perfmon_threadData[thread_id].counters[i].counterData =
-                msr_read(cpu_id, phi_counter_map[i].counterRegister);
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+            {
+                continue;
+            }
+            counter_result = 0x0ULL;
+            RegisterIndex index = eventSet->events[i].index;
+            CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, phi_counter_map[index].counterRegister, &counter_result));
+            if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData)
+            {
+                uint64_t ovf_values = 0x0ULL;
+                CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_MIC_PERF_GLOBAL_STATUS, &ovf_values));
+                if (ovf_values & (1ULL<<index))
+                {
+                    eventSet->events[i].threadCounter[thread_id].overflows++;
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_MIC_PERF_GLOBAL_OVF_CTRL, (1ULL<<index)));
+                }
+            }
+            eventSet->events[i].threadCounter[thread_id].counterData = field64(counter_result, 0, box_map[type].regWidth);
         }
     }
+    return 0;
+}
+
+int perfmon_readCountersThread_phi(int thread_id, PerfmonEventSet* eventSet) /* Read every initialized counter of eventSet for the given thread; returns 0 at the end of the normal path. */
+{
+    int cpu_id = groupSet->threads[thread_id].processorId;
+    uint64_t counter_result = 0x0ULL;
+    uint64_t core_flags = 0x0ULL;
 
-    flags = msr_read(cpu_id,MSR_MIC_PERF_GLOBAL_STATUS);
-//    printf ("Status: 0x%llX \n", LLU_CAST flags);
+    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_MIC_PERF_GLOBAL_CTRL, &core_flags)); /* save the current global control value for the restore below */
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_MIC_PERF_GLOBAL_CTRL, 0x0ULL)); /* zero global control while the counters are read */
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_MIC_SPFLT_CONTROL, 0x0ULL));
 
-    if((flags & 0x3))
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        printf ("Overflow occured \n");
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
+        {
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+            {
+                continue;
+            }
+            counter_result = 0x0ULL;
+            RegisterIndex index = eventSet->events[i].index;
+            CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter_map[index].counterRegister, &counter_result)); /* look up by register index, not by position i in the event list */
+            if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData) /* value went backwards: check the status register for an overflow */
+            {
+                uint64_t ovf_values = 0x0ULL;
+                CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_MIC_PERF_GLOBAL_STATUS, &ovf_values));
+                if (ovf_values & (1ULL<<index))
+                {
+                    eventSet->events[i].threadCounter[thread_id].overflows++;
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_MIC_PERF_GLOBAL_OVF_CTRL, (1ULL<<index)));
+                }
+            }
+            eventSet->events[i].threadCounter[thread_id].counterData = field64(counter_result, 0, box_map[type].regWidth);
+        }
     }
+
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_MIC_SPFLT_CONTROL, core_flags|(1ULL<<63))); /* write back the saved control value */
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_MIC_PERF_GLOBAL_CTRL, core_flags));
+    return 0;
 }
 
-void perfmon_readCountersThread_phi(int thread_id)
+
+int perfmon_finalizeCountersThread_phi(int thread_id, PerfmonEventSet* eventSet) /* Zero the config register of every event in eventSet, then zero the global control registers; returns 0 at the end of the normal path. */
 {
-    int cpu_id = perfmon_threadData[thread_id].processorId;
+    int cpu_id = groupSet->threads[thread_id].processorId;
+    uint64_t ovf_values_core = 0x0ULL;
 
-    for ( int i=0; i<NUM_COUNTERS_PHI; i++ )
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        if (perfmon_threadData[thread_id].counters[i].init == TRUE)
+        RegisterType type = eventSet->events[i].type;
+        if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
         {
-            perfmon_threadData[thread_id].counters[i].counterData =
-                msr_read(cpu_id, phi_counter_map[i].counterRegister);
+            continue;
         }
+        RegisterIndex index = eventSet->events[i].index;
+        ovf_values_core |= (1ULL<<(index)); /* collect one bit per used counter for the overflow-control write below */
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, 0x0ULL)); /* look up by register index, not by position i in the event list */
     }
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_MIC_PERF_GLOBAL_CTRL, 0x0ULL));
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_MIC_SPFLT_CONTROL, 0x0ULL));
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_MIC_PERF_GLOBAL_OVF_CTRL, ovf_values_core));
+    return 0;
 }
-
diff --git a/src/includes/perfmon_phi_counters.h b/src/includes/perfmon_phi_counters.h
index edf0658..13cc543 100644
--- a/src/includes/perfmon_phi_counters.h
+++ b/src/includes/perfmon_phi_counters.h
@@ -3,15 +3,16 @@
  *
  *      Filename:  perfmon_phi_counters.h
  *
- *      Description: Counter Header File of perfmon module.
+ *      Description: Counter Header File of perfmon module for Intel Xeon Phi.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -31,8 +32,14 @@
 #define NUM_COUNTERS_PHI 2
 #define NUM_COUNTERS_CORE_PHI 2
 
-static PerfmonCounterMap phi_counter_map[NUM_COUNTERS_PHI] = {
-    {"PMC0", PMC0, PMC, MSR_MIC_PERFEVTSEL0, MSR_MIC_PMC0, 0, 0},
-    {"PMC1", PMC1, PMC, MSR_MIC_PERFEVTSEL1, MSR_MIC_PMC1, 0, 0}
+#define PHI_VALID_OPTIONS_PMC EVENT_OPTION_EDGE_MASK|EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_INVERT_MASK| \
+                              EVENT_OPTION_ANYTHREAD_MASK|EVENT_OPTION_THRESHOLD_MASK /* every term is a *_MASK value, matching the options handled in phi_pmc_setup */
+
+static RegisterMap phi_counter_map[NUM_COUNTERS_PHI] = {
+    {"PMC0", PMC0, PMC, MSR_MIC_PERFEVTSEL0, MSR_MIC_PMC0, 0, 0, PHI_VALID_OPTIONS_PMC},
+    {"PMC1", PMC1, PMC, MSR_MIC_PERFEVTSEL1, MSR_MIC_PMC1, 0, 0, PHI_VALID_OPTIONS_PMC}
 };
 
+static BoxMap phi_box_map[NUM_UNITS] = {
+    [PMC] = {MSR_MIC_PERF_GLOBAL_CTRL, MSR_MIC_PERF_GLOBAL_STATUS, MSR_MIC_PERF_GLOBAL_OVF_CTRL, 0, 0, 0, 40}
+};
diff --git a/src/includes/perfmon_phi_events.txt b/src/includes/perfmon_phi_events.txt
index d6393ba..7c512e3 100644
--- a/src/includes/perfmon_phi_events.txt
+++ b/src/includes/perfmon_phi_events.txt
@@ -1,16 +1,17 @@
 # =======================================================================================
-#  
+#
 #      Filename:  perfmon_phi_events.txt
-# 
+#
 #      Description:  Event list for Intel Xeon Phi
-# 
-#      Version:   3.1.3
-#      Released:  4.11.2014
-# 
-#      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+#
+#      Version:   4.0
+#      Released:  16.6.2015
+#
+#      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+#                Thomas Roehl (tr), thomas.roehl at googlemail.com
 #      Project:  likwid
 #
-#      Copyright (C) 2014 Jan Treibig
+#      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
 #
 #      This program is free software: you can redistribute it and/or modify it under
 #      the terms of the GNU General Public License as published by the Free Software
diff --git a/src/includes/perfmon_pm.h b/src/includes/perfmon_pm.h
index 88346d1..0158a3c 100644
--- a/src/includes/perfmon_pm.h
+++ b/src/includes/perfmon_pm.h
@@ -5,13 +5,14 @@
  *
  *      Description:  Header File of perfmon module Pentium M.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -30,136 +31,196 @@
 
 #include <perfmon_pm_events.h>
 #include <perfmon_pm_counters.h>
+#include <error.h>
+#include <affinity.h>
 
-#define NUM_GROUPS_PM 5
 
 static int perfmon_numCounters_pm = NUM_COUNTERS_PM;
-static int perfmon_numGroups_pm = NUM_GROUPS_PM;
 static int perfmon_numArchEvents_pm = NUM_ARCH_EVENTS_PM;
 
-static PerfmonGroupMap pm_group_map[NUM_GROUPS_PM] = {
-	{"FLOPS_DP",FLOPS_DP,0,"Double Precision MFlops/s",
-        "EMON_SSE_SSE2_COMP_INST_RETIRED_PACKED_DP:PMC0,EMON_SSE_SSE2_COMP_INST_RETIRED_SCALAR_DP:PMC1"},
-	{"FLOPS_SP",FLOPS_SP,0,"Single Precision MFlops/s",
-        "EMON_SSE_SSE2_COMP_INST_RETIRED_ALL_SP:PMC0,EMON_SSE_SSE2_COMP_INST_RETIRED_SCALAR_SP:PMC1"},
-	{"L2",L2,0,"L2 cache bandwidth in MBytes/s",
-        "L2_LINES_IN_ALL_ALL:PMC0,L2_LINES_OUT_ALL_ALL:PMC1"},
-	{"BRANCH",BRANCH,0,"Branch prediction miss rate",
-        "BR_INST_EXEC:PMC0,BR_INST_MISSP_EXEC:PMC1"},
-	{"CPI",CPI,0,"Cycles per instruction","UOPS_RETIRED:PMC0"}
-};
-
-void perfmon_init_pm(PerfmonThread *thread)
-{
-    uint64_t flags = 0x0ULL;
-    int cpu_id = thread->processorId;
-
-    msr_write(cpu_id, MSR_PERFEVTSEL0, 0x0ULL);
-    msr_write(cpu_id, MSR_PERFEVTSEL1, 0x0ULL);
 
-    /* Preinit of two PMC counters */
-    //flags |= (1<<16);  /* user mode flag */
-    //flags |= (1<<19);  /* pin control flag */
-    //    flags |= (1<<22);  /* enable flag */
-
-    /*msr_write(cpu_id, MSR_PERFEVTSEL0, flags);
-    msr_write(cpu_id, MSR_PERFEVTSEL1, flags);*/
+int perfmon_init_pm(int cpu_id)
+{
+    return 0;
 }
 
-void perfmon_setupCounterThread_pm(
-        int thread_id,
-        PerfmonEvent* event,
-        PerfmonCounterIndex index)
+int pm_pmc_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
 {
-    uint64_t flags;
-    uint64_t reg = pm_counter_map[index].configRegister;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
-
-    perfmon_threadData[thread_id].counters[index].init = TRUE;
-    flags = (1<<16)|(1<<19);
+    uint64_t flags = 0x0ULL;
 
-    /* Intel with standard 8 bit event mask: [7:0] */
+    flags = (1ULL<<16)|(1ULL<<19);
     flags |= (event->umask<<8) + event->eventId;
 
-    msr_write(cpu_id, reg , flags);
+    if (event->numberOfOptions > 0)
+    {
+        for(int j=0;j<event->numberOfOptions;j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_COUNT_KERNEL:
+                    flags |= (1ULL<<17);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= (event->options[j].value & 0xFFULL)<<24;
+                    break;
+                default:
+                    break;
+            }
+        }
+    }
+    VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_PMC);
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+    return 0;
+}
+
+int perfmon_setupCounterThread_pm(int thread_id, PerfmonEventSet* eventSet)
+{
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
-    if (perfmon_verbose)
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        printf("[%d] perfmon_setup_counter: Write Register 0x%llX , Flags: 0x%llX \n",
-                cpu_id,
-                LLU_CAST reg,
-                LLU_CAST flags);
+        RegisterType type = eventSet->events[i].type;
+        if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+        {
+            continue;
+        }
+        RegisterIndex index = eventSet->events[i].index;
+        PerfmonEvent *event = &(eventSet->events[i].event);
+        eventSet->events[i].threadCounter[thread_id].init = TRUE;
+        pm_pmc_setup(cpu_id, index, event);
     }
+    return 0;
 }
 
 
-void perfmon_startCountersThread_pm(int thread_id)
+int perfmon_startCountersThread_pm(int thread_id, PerfmonEventSet* eventSet)
 {
     uint64_t flags = 0ULL;
-    int processorId = perfmon_threadData[thread_id].processorId;
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
-    if (perfmon_threadData[thread_id].counters[0].init == TRUE)
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
+        {
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+            {
+                continue;
+            }
+            RegisterIndex index = eventSet->events[i].index;
+            VERBOSEPRINTREG(cpu_id, counter_map[index].counterRegister, LLU_CAST 0x0ULL, SETUP_PMC_CTR);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].counterRegister , 0x0ULL));
+        }
+    }
+    if (eventSet->numberOfEvents > 0)
     {
-        msr_write(processorId, pm_counter_map[0].counterRegister , 0x0ULL);
-        msr_write(processorId, pm_counter_map[1].counterRegister , 0x0ULL);
-
         /* on p6 only MSR_PERFEVTSEL0 has the enable bit
          * it enables both counters as long MSR_PERFEVTSEL1 
          * has a valid configuration */
-        flags = msr_read(processorId, MSR_PERFEVTSEL0);
-        flags |= (1<<22);  /* enable flag */
-
-        if (perfmon_verbose)
-        {
-            printf("perfmon_start_counters: Write Register 0x%X , \
-                    Flags: 0x%llX \n",MSR_PERFEVTSEL0, LLU_CAST flags);
-        }
-
-        msr_write(processorId, MSR_PERFEVTSEL0, flags);
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERFEVTSEL0, &flags));
+        flags |= (1<<22);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERFEVTSEL0, flags));
+        VERBOSEPRINTREG(cpu_id, MSR_PERFEVTSEL0, LLU_CAST flags, UNFREEZE_PMC);
     }
-
+    return 0;
 }
 
-void perfmon_stopCountersThread_pm(int thread_id)
+int perfmon_stopCountersThread_pm(int thread_id, PerfmonEventSet* eventSet)
 {
-    int i;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
+    uint64_t counter_result = 0x0ULL;
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
-    msr_write(cpu_id, MSR_PERFEVTSEL0, 0x0ULL);
-    msr_write(cpu_id, MSR_PERFEVTSEL1, 0x0ULL);
+    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERFEVTSEL0, &counter_result));
+    counter_result &= ~(1<<22);
+    VERBOSEPRINTREG(cpu_id, MSR_PERFEVTSEL0, counter_result, FREEZE_PMC);
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERFEVTSEL0, counter_result));
 
-    for (i=0;i<NUM_COUNTERS_PM;i++) 
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        if (perfmon_threadData[thread_id].counters[i].init == TRUE) 
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE) 
         {
-            perfmon_threadData[thread_id].counters[i].counterData =
-                msr_read(cpu_id, pm_counter_map[i].counterRegister);
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+            {
+                continue;
+            }
+            counter_result = 0x0ULL;
+            RegisterIndex index = eventSet->events[i].index;
+            CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter_map[index].counterRegister, &counter_result));
+            VERBOSEPRINTREG(cpu_id, counter_map[index].counterRegister, counter_result, READ_PMC);
+            if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData)
+            {
+                eventSet->events[i].threadCounter[thread_id].overflows++;
+            }
+            eventSet->events[i].threadCounter[thread_id].counterData = field64(counter_result, 0, box_map[type].regWidth);
         }
     }
+    return 0;
 }
 
-void perfmon_printDerivedMetrics_pm(PerfmonGroup group)
+int perfmon_readCountersThread_pm(int thread_id, PerfmonEventSet* eventSet) /* Read every initialized counter of eventSet for the given thread; returns 0 at the end of the normal path. */
 {
+    uint64_t counter_result = 0x0ULL;
+    uint64_t pmc_flags = 0x0ULL;
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
-    switch ( group )
-    {
-        case FLOPS_DP:
-
-        case FLOPS_SP:
+    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERFEVTSEL0, &pmc_flags));
+    /* pmc_flags itself stays unmodified so the original bit 22 state can be written back after the read loop */
+    VERBOSEPRINTREG(cpu_id, MSR_PERFEVTSEL0, pmc_flags & ~(1ULL<<22), FREEZE_PMC);
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERFEVTSEL0, pmc_flags & ~(1ULL<<22))); /* write PERFEVTSEL0 with bit 22 cleared while reading */
 
-        case L2:
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE) 
+        {
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+            {
+                continue;
+            }
+            counter_result = 0x0ULL;
+            RegisterIndex index = eventSet->events[i].index;
+            uint32_t counter = counter_map[index].counterRegister;
+            CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter, &counter_result));
+            if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData) /* a value below the stored one is counted as an overflow */
+            {
+                eventSet->events[i].threadCounter[thread_id].overflows++;
+            }
+            eventSet->events[i].threadCounter[thread_id].counterData = field64(counter_result, 0, box_map[type].regWidth);
+        }
+    }
 
-        case BRANCH:
+    VERBOSEPRINTREG(cpu_id, MSR_PERFEVTSEL0, pmc_flags, UNFREEZE_PMC);
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERFEVTSEL0, pmc_flags)); /* restore the value read at entry, including its bit 22 */
+    return 0;
+}
 
-        case _NOGROUP:
-            fprintf (stderr, "The Pentium M supports only two counters. Therefore derived metrics are not computed due to missing runtime!\n" );
-            break;
+int perfmon_finalizeCountersThread_pm(int thread_id, PerfmonEventSet* eventSet)
+{
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
-        default:
-            fprintf (stderr, "perfmon_printDerivedMetricsCore2: Unknown group! Exiting!\n" );
-            exit (EXIT_FAILURE);
-            break;
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        RegisterType type = eventSet->events[i].type;
+        if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+        {
+            continue;
+        }
+        RegisterIndex index = eventSet->events[i].index;
+        uint32_t reg = counter_map[index].configRegister;
+        if ((reg) && ((type == PMC)||(type == FIXED)))
+        {
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg, 0x0ULL));
+            VERBOSEPRINTPCIREG(cpu_id, MSR_DEV, reg, 0x0ULL, CLEAR_CTL);
+        }
+        eventSet->events[i].threadCounter[thread_id].init = FALSE;
     }
+    return 0;
 }
 
-
diff --git a/src/includes/perfmon_pm_counters.h b/src/includes/perfmon_pm_counters.h
index 9119096..5a3ccf8 100644
--- a/src/includes/perfmon_pm_counters.h
+++ b/src/includes/perfmon_pm_counters.h
@@ -3,15 +3,16 @@
  *
  *      Filename:  perfmon_pm_counters.h
  *
- *      Description: Counter Header File of perfmon module.
+ *      Description: Counter Header File of perfmon module for Intel Pentium M.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -31,8 +32,13 @@
 #define NUM_COUNTERS_PM 2
 #define NUM_COUNTERS_CORE_PM 2
 
-static PerfmonCounterMap pm_counter_map[NUM_COUNTERS_PM] = {
-    {"PMC0",PMC0, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0},
-    {"PMC1",PMC1, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0}
+#define PM_VALID_OPTIONS_PMC EVENT_OPTION_EDGE_MASK|EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_INVERT_MASK
+
+static RegisterMap pm_counter_map[NUM_COUNTERS_PM] = {
+    {"PMC0", PMC0, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0, PM_VALID_OPTIONS_PMC},
+    {"PMC1", PMC1, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0, PM_VALID_OPTIONS_PMC}
 };
 
+static BoxMap pm_box_map[NUM_UNITS] = {
+    [PMC] = {0, 0, 0, 0, 0, 0, 48}
+};
diff --git a/src/includes/perfmon_pm_events.txt b/src/includes/perfmon_pm_events.txt
index 9ed83a8..a588c70 100644
--- a/src/includes/perfmon_pm_events.txt
+++ b/src/includes/perfmon_pm_events.txt
@@ -1,16 +1,16 @@
 # =======================================================================================
-#  
+#
 #      Filename:  perfmon_pm_events.txt
-# 
+#
 #      Description:  Event list for Intel Pentium M
-# 
-#      Version:   3.1.3
-#      Released:  4.11.2014
-# 
-#      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+#
+#      Version:   4.0
+#      Released:  16.6.2015
+#
+#      Author:   Jan Treibig (jt), jan.treibig at gmail.com
 #      Project:  likwid
 #
-#      Copyright (C) 2014 Jan Treibig
+#      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
 #
 #      This program is free software: you can redistribute it and/or modify it under
 #      the terms of the GNU General Public License as published by the Free Software
@@ -262,22 +262,22 @@ UMASK_BUS_HITM_DRV_SELF           0x00
 EVENT_BUS_SNOOP_STALL             0x7E      PMC
 UMASK_BUS_SNOOP_STALL_SELF        0x00
 
-EVENT_FLOPS                       0xC1      PMC
+EVENT_FLOPS                       0xC1      PMC0
 UMASK_FLOPS                       0x00
 
-EVENT_FP_COMP_OPS_EXE             0x10      PMC
+EVENT_FP_COMP_OPS_EXE             0x10      PMC0
 UMASK_FP_COMP_OPS_EXE             0x00
 
-EVENT_FP_ASSIST                   0x11      PMC
+EVENT_FP_ASSIST                   0x11      PMC1
 UMASK_FP_ASSIST                   0x00
 
-EVENT_MUL                         0x12      PMC
+EVENT_MUL                         0x12      PMC1
 UMASK_MUL                         0x00
 
-EVENT_DIV                         0x13      PMC
+EVENT_DIV                         0x13      PMC1
 UMASK_DIV                         0x00
 
-EVENT_CYCLES_DIV_BUSY             0x14      PMC
+EVENT_CYCLES_DIV_BUSY             0x14      PMC0
 UMASK_CYCLES_DIV_BUSY             0x00
 
 EVENT_LD_BLOCKS                   0x03      PMC
@@ -289,13 +289,13 @@ UMASK_SB_DRAINS                   0x00
 EVENT_MISALIGN_MEM_REF            0x05      PMC
 UMASK_MISALIGN_MEM_REF            0x00
 
-EVENT_EMON_KNI_PREF_DISPATCHED       0x07      PMC
+EVENT_EMON_KNI_PREF_DISPATCHED       0x07      PMC0|PMC1
 UMASK_EMON_KNI_PREF_DISPATCHED_NTA   0x00
 UMASK_EMON_KNI_PREF_DISPATCHED_T1    0x01
 UMASK_EMON_KNI_PREF_DISPATCHED_T2    0x02
 UMASK_EMON_KNI_PREF_DISPATCHED_WEAK  0x03
 
-EVENT_EMON_KNI_PREF_MISS        0x4B      PMC
+EVENT_EMON_KNI_PREF_MISS        0x4B      PMC0|PMC1
 UMASK_EMON_KNI_PREF_MISS_NTA    0x00
 UMASK_EMON_KNI_PREF_MISS_T1     0x01
 UMASK_EMON_KNI_PREF_MISS_T2     0x02
@@ -310,13 +310,13 @@ UMASK_UOPS_RETIRED             0x00
 EVENT_INST_DECODED             0xD0      PMC
 UMASK_INST_DECODED             0x00
 
-EVENT_EMON_SSE_SSE2_INST_RETIRED                0xD8      PMC
+EVENT_EMON_SSE_SSE2_INST_RETIRED                0xD8      PMC0|PMC1
 UMASK_EMON_SSE_SSE2_INST_RETIRED_ALL_SP         0x00
 UMASK_EMON_SSE_SSE2_INST_RETIRED_SCALAR_SP      0x01
 UMASK_EMON_SSE_SSE2_INST_RETIRED_PACKED_DP      0x02
 UMASK_EMON_SSE_SSE2_INST_RETIRED_SCALAR_DP      0x03
 
-EVENT_EMON_SSE_SSE2_COMP_INST_RETIRED                0xD9      PMC
+EVENT_EMON_SSE_SSE2_COMP_INST_RETIRED                0xD9      PMC0|PMC1
 UMASK_EMON_SSE_SSE2_COMP_INST_RETIRED_ALL_SP         0x00
 UMASK_EMON_SSE_SSE2_COMP_INST_RETIRED_SCALAR_SP      0x01
 UMASK_EMON_SSE_SSE2_COMP_INST_RETIRED_PACKED_DP      0x02
diff --git a/src/includes/perfmon_sandybridge.h b/src/includes/perfmon_sandybridge.h
index f11714a..2041a1a 100644
--- a/src/includes/perfmon_sandybridge.h
+++ b/src/includes/perfmon_sandybridge.h
@@ -3,15 +3,16 @@
  *
  *      Filename:  perfmon_sandybridge.h
  *
- *      Description:  Header File of perfmon module for Sandy Bridge.
+ *      Description:  Header File of perfmon module for Intel Sandy Bridge.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -29,666 +30,1675 @@
  */
 
 #include <perfmon_sandybridge_events.h>
-#include <perfmon_sandybridge_groups.h>
+#include <perfmon_sandybridgeEP_events.h>
 #include <perfmon_sandybridge_counters.h>
+#include <error.h>
+#include <affinity.h>
 
+static int perfmon_numCountersSandybridgeEP = NUM_COUNTERS_SANDYBRIDGEEP;
+static int perfmon_numCoreCountersSandybridgeEP = NUM_COUNTERS_CORE_SANDYBRIDGEEP;
+static int perfmon_numArchEventsSandybridgeEP = NUM_ARCH_EVENTS_SANDYBRIDGEEP;
 static int perfmon_numCountersSandybridge = NUM_COUNTERS_SANDYBRIDGE;
-static int perfmon_numGroupsSandybridge = NUM_GROUPS_SANDYBRIDGE;
+static int perfmon_numCoreCountersSandybridge = NUM_COUNTERS_CORE_SANDYBRIDGE;
 static int perfmon_numArchEventsSandybridge = NUM_ARCH_EVENTS_SANDYBRIDGE;
 
-#define OFFSET_PMC 3
 
-void perfmon_init_sandybridge(PerfmonThread *thread)
+int perfmon_init_sandybridge(int cpu_id)
 {
-    uint64_t flags = 0x0ULL;
-    int cpu_id = thread->processorId;
-
-    /* Initialize registers */
-    msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, 0x0ULL);
-    msr_write(cpu_id, MSR_PERFEVTSEL0, 0x0ULL);
-    msr_write(cpu_id, MSR_PERFEVTSEL1, 0x0ULL);
-    msr_write(cpu_id, MSR_PERFEVTSEL2, 0x0ULL);
-    msr_write(cpu_id, MSR_PERFEVTSEL3, 0x0ULL);
-    msr_write(cpu_id, MSR_PMC0, 0x0ULL);
-    msr_write(cpu_id, MSR_PMC1, 0x0ULL);
-    msr_write(cpu_id, MSR_PMC2, 0x0ULL);
-    msr_write(cpu_id, MSR_PMC3, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_FIXED_CTR0, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_FIXED_CTR1, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_FIXED_CTR2, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, 0x0ULL);
-    msr_write(cpu_id, MSR_PEBS_ENABLE, 0x0ULL);
-
-    /* initialize fixed counters
-     * FIXED 0: Instructions retired
-     * FIXED 1: Clocks unhalted core
-     * FIXED 2: Clocks unhalted ref */
-    msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, 0x222ULL);
-
-    /* Preinit of PERFEVSEL registers */
-    //flags |= (1<<22);  /* enable flag */
-    //flags |= (1<<16);  /* user mode flag */
-
-    /*msr_write(cpu_id, MSR_PERFEVTSEL0, flags);
-    msr_write(cpu_id, MSR_PERFEVTSEL1, flags);
-    msr_write(cpu_id, MSR_PERFEVTSEL2, flags);
-    msr_write(cpu_id, MSR_PERFEVTSEL3, flags);*/
-
-    /* TODO Robust implementation which also works if stuff is not there */
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id) ||
-            lock_acquire((int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id))
-    {
-        if ( cpuid_info.model == SANDYBRIDGE_EP )
+    lock_acquire((int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id);
+    lock_acquire((int*) &tile_lock[affinity_thread2tile_lookup[cpu_id]], cpu_id);
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PEBS_ENABLE, 0x0ULL));
+    return 0;
+}
+
+uint32_t snb_fixed_setup(RegisterIndex index, PerfmonEvent *event)
+{
+    int j;
+    uint32_t flags = (1ULL<<(1+(index*4)));
+    for(j=0;j<event->numberOfOptions;j++)
+    {
+        switch (event->options[j].type)
         {
-            /* Only root can access pci address space in direct mode */
-            if (accessClient_mode != DAEMON_AM_DIRECT)
+            case EVENT_OPTION_COUNT_KERNEL:
+                flags |= (1ULL<<(index*4));
+                break;
+            case EVENT_OPTION_ANYTHREAD:
+                flags |= (1ULL<<(2+(index*4)));
+                break;
+            default:
+                break;
+        }
+    }
+    return flags;
+}
+
+int snb_pmc_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+    int j;
+    uint32_t flags = 0x0U;
+    uint64_t offcore_flags = 0x0ULL;
+
+    flags |= (1ULL<<22);  /* enable flag */
+    flags |= (1ULL<<16);  /* user mode flag */
+
+    /* Intel with standard 8 bit event mask: [7:0] */
+    flags |= (event->umask<<8) + event->eventId;
+
+    /* set custom cfg and cmask */
+    if ((event->cfgBits != 0) &&
+        (event->eventId != 0xB7) &&
+        (event->eventId != 0xBB))
+    {
+        flags |= ((event->cmask<<8) + event->cfgBits)<<16;
+    }
+    if (event->numberOfOptions > 0)
+    {
+        for(j=0;j<event->numberOfOptions;j++)
+        {
+            switch (event->options[j].type)
             {
-                uint32_t  uflags = 0x10100U; /* enable freeze (bit 16), freeze (bit 8) */
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_0,  PCI_UNC_MC_PMON_BOX_CTL, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_1,  PCI_UNC_MC_PMON_BOX_CTL, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_2,  PCI_UNC_MC_PMON_BOX_CTL, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_3,  PCI_UNC_MC_PMON_BOX_CTL, uflags);
-
-                uflags = 0x0U;
-                uflags |= (1<<22);  /* enable flag */
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_0,  PCI_UNC_MC_PMON_CTL_0, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_0,  PCI_UNC_MC_PMON_CTL_1, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_0,  PCI_UNC_MC_PMON_CTL_2, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_0,  PCI_UNC_MC_PMON_CTL_3, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_1,  PCI_UNC_MC_PMON_CTL_0, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_1,  PCI_UNC_MC_PMON_CTL_1, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_1,  PCI_UNC_MC_PMON_CTL_2, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_1,  PCI_UNC_MC_PMON_CTL_3, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_2,  PCI_UNC_MC_PMON_CTL_0, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_2,  PCI_UNC_MC_PMON_CTL_1, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_2,  PCI_UNC_MC_PMON_CTL_2, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_2,  PCI_UNC_MC_PMON_CTL_3, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_3,  PCI_UNC_MC_PMON_CTL_0, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_3,  PCI_UNC_MC_PMON_CTL_1, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_3,  PCI_UNC_MC_PMON_CTL_2, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_3,  PCI_UNC_MC_PMON_CTL_3, uflags);
-
-                uflags |= (1<<19);  /* reset fixed counter */
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_0,  PCI_UNC_MC_PMON_FIXED_CTL, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_1,  PCI_UNC_MC_PMON_FIXED_CTL, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_2,  PCI_UNC_MC_PMON_FIXED_CTL, uflags);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_3,  PCI_UNC_MC_PMON_FIXED_CTL, uflags);
-
-                /* iMC counters need to be manually reset to zero */
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_0,  PCI_UNC_MC_PMON_CTR_0_A, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_0,  PCI_UNC_MC_PMON_CTR_0_B, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_0,  PCI_UNC_MC_PMON_CTR_1_A, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_0,  PCI_UNC_MC_PMON_CTR_1_B, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_0,  PCI_UNC_MC_PMON_CTR_2_A, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_0,  PCI_UNC_MC_PMON_CTR_2_B, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_0,  PCI_UNC_MC_PMON_CTR_3_A, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_0,  PCI_UNC_MC_PMON_CTR_3_B, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_1,  PCI_UNC_MC_PMON_CTR_0_A, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_1,  PCI_UNC_MC_PMON_CTR_0_B, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_1,  PCI_UNC_MC_PMON_CTR_1_A, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_1,  PCI_UNC_MC_PMON_CTR_1_B, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_1,  PCI_UNC_MC_PMON_CTR_2_A, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_1,  PCI_UNC_MC_PMON_CTR_2_B, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_1,  PCI_UNC_MC_PMON_CTR_3_A, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_1,  PCI_UNC_MC_PMON_CTR_3_B, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_2,  PCI_UNC_MC_PMON_CTR_0_A, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_2,  PCI_UNC_MC_PMON_CTR_0_B, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_2,  PCI_UNC_MC_PMON_CTR_1_A, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_2,  PCI_UNC_MC_PMON_CTR_1_B, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_2,  PCI_UNC_MC_PMON_CTR_2_A, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_2,  PCI_UNC_MC_PMON_CTR_2_B, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_2,  PCI_UNC_MC_PMON_CTR_3_A, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_2,  PCI_UNC_MC_PMON_CTR_3_B, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_3,  PCI_UNC_MC_PMON_CTR_0_A, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_3,  PCI_UNC_MC_PMON_CTR_0_B, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_3,  PCI_UNC_MC_PMON_CTR_1_A, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_3,  PCI_UNC_MC_PMON_CTR_1_B, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_3,  PCI_UNC_MC_PMON_CTR_2_A, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_3,  PCI_UNC_MC_PMON_CTR_2_B, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_3,  PCI_UNC_MC_PMON_CTR_3_A, 0x0U);
-                pci_write(cpu_id, PCI_IMC_DEVICE_CH_3,  PCI_UNC_MC_PMON_CTR_3_B, 0x0U);
-
-                /* FIXME: Not yet tested/ working due to BIOS issues on test
-                 * machines */
-#if 0
-                /* QPI registers can be zeroed with single write */
-                uflags = 0x0113UL; /*enable freeze (bit 16), freeze (bit 8), reset */
-                pci_write(cpu_id, PCI_QPI_DEVICE_PORT_0,  PCI_UNC_QPI_PMON_BOX_CTL, uflags);
-                pci_write(cpu_id, PCI_QPI_DEVICE_PORT_1,  PCI_UNC_QPI_PMON_BOX_CTL, uflags);
-                uflags = 0x0UL;
-                uflags |= (1UL<<22);  /* enable flag */
-                pci_write(cpu_id, PCI_QPI_DEVICE_PORT_0,  PCI_UNC_QPI_PMON_CTL_0, uflags);
-                pci_write(cpu_id, PCI_QPI_DEVICE_PORT_0,  PCI_UNC_QPI_PMON_CTL_1, uflags);
-                pci_write(cpu_id, PCI_QPI_DEVICE_PORT_0,  PCI_UNC_QPI_PMON_CTL_2, uflags);
-                pci_write(cpu_id, PCI_QPI_DEVICE_PORT_0,  PCI_UNC_QPI_PMON_CTL_3, uflags);
-                pci_write(cpu_id, PCI_QPI_DEVICE_PORT_1,  PCI_UNC_QPI_PMON_CTL_0, uflags);
-                pci_write(cpu_id, PCI_QPI_DEVICE_PORT_1,  PCI_UNC_QPI_PMON_CTL_1, uflags);
-                pci_write(cpu_id, PCI_QPI_DEVICE_PORT_1,  PCI_UNC_QPI_PMON_CTL_2, uflags);
-                pci_write(cpu_id, PCI_QPI_DEVICE_PORT_1,  PCI_UNC_QPI_PMON_CTL_3, uflags);
-#endif
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_COUNT_KERNEL:
+                    flags |= (1ULL<<17);
+                    break;
+                case EVENT_OPTION_ANYTHREAD:
+                    flags |= (1ULL<<21);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= ((event->options[j].value & 0xFFULL)<<24);
+                    break;
+                case EVENT_OPTION_MATCH0:
+                    offcore_flags |= (event->options[j].value & 0x8FFFULL);
+                    break;
+                case EVENT_OPTION_MATCH1:
+                    offcore_flags |= (event->options[j].value<<16);
+                    break;
+                default:
+                    break;
             }
         }
     }
-//    lock_acquire((int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id);
+    if (event->eventId == 0xB7)
+    {
+        if ((event->cfgBits != 0xFF) && (event->cmask != 0xFF))
+        {
+            offcore_flags = (1ULL<<event->cfgBits)|(1ULL<<event->cmask);
+        }
+        VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP0, LLU_CAST offcore_flags, SETUP_PMC_OFFCORE);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP0, offcore_flags));
+    }
+    else if (event->eventId == 0xBB)
+    {
+        if ((event->cfgBits != 0xFF) && (event->cmask != 0xFF))
+        {
+            offcore_flags = (1ULL<<event->cfgBits)|(1ULL<<event->cmask);
+        }
+        VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP1, LLU_CAST offcore_flags, SETUP_PMC_OFFCORE);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP1, offcore_flags));
+    }
+    VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_PMC)
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+    return 0;
 }
 
-#define BOX_GATE_SNB(channel,label) \
-    if (perfmon_verbose) { \
-        printf("[%d] perfmon_setup_counter (label): Write Register 0x%llX , Flags: 0x%llX \n", \
-                cpu_id, \
-                LLU_CAST reg, \
-                LLU_CAST flags); \
-    } \
-    if(haveLock) { \
-        uflags = (1<<22); \
-        uflags |= (event->umask<<8) + event->eventId;  \
-        pci_write(cpu_id, channel,  reg, uflags);  \
+int snb_mbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+    int j;
+    uint32_t flags = 0x0U;
+    PciDeviceIndex dev = counter_map[index].device;
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+    {
+        return 0;
+    }
+    if (!pci_checkDevice(dev, cpu_id))
+    {
+        return -ENODEV;
     }
 
+    flags = (1ULL<<22);
+    flags |= (event->umask<<8) + event->eventId;
+    for(j=0;j<event->numberOfOptions;j++)
+    {
+        switch (event->options[j].type)
+        {
+            case EVENT_OPTION_EDGE:
+                flags |= (1ULL<<18);
+                break;
+            case EVENT_OPTION_INVERT:
+                flags |= (1ULL<<23);
+                break;
+            case EVENT_OPTION_THRESHOLD:
+                flags |= ((event->options[j].value & 0xFFULL)<<24);
+                break;
+            default:
+                break;
+        }
+    }
+    VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].configRegister, LLU_CAST flags, SETUP_MBOX);
+    CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, flags));
+    return 0;
+}
 
-void perfmon_setupCounterThread_sandybridge(
-        int thread_id,
-        PerfmonEvent* event,
-        PerfmonCounterIndex index)
+
+uint32_t snb_cbox_filter(PerfmonEvent *event)
 {
-    int haveLock = 0;
-    uint64_t flags;
-    uint32_t uflags;
-    uint64_t reg = sandybridge_counter_map[index].configRegister;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
-    uint64_t fixed_flags = msr_read(cpu_id, MSR_PERF_FIXED_CTR_CTRL);
-    uint64_t orig_fixed_flags = fixed_flags;
-    perfmon_threadData[thread_id].counters[index].init = TRUE;
+    int j;
+    uint32_t ret = 0x0;
+    uint64_t mask = 0x0ULL;
+    int set_state = 0;
 
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+    for(j=0;j<event->numberOfOptions;j++)
     {
-        haveLock = 1;
+        switch (event->options[j].type)
+        {
+            case EVENT_OPTION_OPCODE:
+                if ((event->options[j].value == 0x180) ||
+                    (event->options[j].value == 0x181) ||
+                    (event->options[j].value == 0x182) ||
+                    (event->options[j].value == 0x187) ||
+                    (event->options[j].value == 0x18C) ||
+                    (event->options[j].value == 0x18D) ||
+                    (event->options[j].value == 0x190) ||
+                    (event->options[j].value == 0x191) ||
+                    (event->options[j].value == 0x192) ||
+                    (event->options[j].value == 0x194) ||
+                    (event->options[j].value == 0x195) ||
+                    (event->options[j].value == 0x19C) ||
+                    (event->options[j].value == 0x19E) ||
+                    (event->options[j].value == 0x1C4) ||
+                    (event->options[j].value == 0x1C5) ||
+                    (event->options[j].value == 0x1C8) ||
+                    (event->options[j].value == 0x1E4) ||
+                    (event->options[j].value == 0x1E5) ||
+                    (event->options[j].value == 0x1E6))
+                {
+                    ret |= ((event->options[j].value & 0x1FFULL) << 23);
+                }
+                else
+                {
+                    ERROR_PRINT(Invalid value 0x%llx for opcode option, LLU_CAST event->options[j].value);
+                }
+                break;
+            case EVENT_OPTION_STATE:
+                if (event->options[j].value & 0x1F)
+                {
+                    ret |= ((event->options[j].value & 0x1FULL) << 18);
+                    set_state = 1;
+                }
+                else
+                {
+                    ERROR_PRINT(Invalid value 0x%llx for state option, LLU_CAST event->options[j].value);
+                }
+                break;
+            case EVENT_OPTION_NID:
+                mask = 0x0ULL;
+                for (int i=0; i<affinityDomains.numberOfNumaDomains;i++)
+                    mask |= (1ULL<<i);
+                if (event->options[j].value & mask)
+                {
+                    ret |= ((event->options[j].value & 0xFFULL) << 10);
+                }
+                else
+                {
+                    ERROR_PRINT(Invalid value 0x%llx for node id option, LLU_CAST event->options[j].value);
+                }
+                break;
+            case EVENT_OPTION_TID:
+                if (event->options[j].value <= 0xF)
+                {
+                    ret |= (event->options[j].value & 0x1FULL);
+                }
+                else
+                {
+                    ERROR_PRINT(Invalid value 0x%llx for thread id option, LLU_CAST event->options[j].value);
+                }
+                break;
+            default:
+                break;
+        }
     }
+    if ((event->eventId == 0x34) && (set_state == 0))
+    {
+        ret |= (0x1FULL << 18);
+    }
+    return ret;
+}
 
-    switch (sandybridge_counter_map[index].type)
+int snb_cbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+    int j;
+    uint32_t flags = 0x0U;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
     {
-        case PMC:
+        return 0;
+    }
 
-            //flags = msr_read(cpu_id,reg);
-            //flags &= ~(0xFFFFU);   /* clear lower 16bits */
-            flags = (1<<22)|(1<<16);
+    flags |= (1ULL<<22);
+    flags |= (event->umask<<8) + event->eventId;
 
-            /* Intel with standard 8 bit event mask: [7:0] */
-            flags |= (event->umask<<8) + event->eventId;
+    if (event->numberOfOptions > 0)
+    {
+        uint32_t optflags = snb_cbox_filter(event);
+        uint32_t filter_reg = box_map[counter_map[index].type].filterRegister1;
+        if (optflags != 0x0U)
+        {
+            VERBOSEPRINTREG(cpu_id, filter_reg, LLU_CAST optflags, SETUP_CBOX_FILTER);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, filter_reg, optflags));
+        }
+    }
 
-            if (event->cfgBits != 0) /* set custom cfg and cmask */
-            {
-                flags &= ~(0xFFFFU<<16);  /* clear upper 16bits */
-                flags |= ((event->cmask<<8) + event->cfgBits)<<16;
-            }
+    for(j=0;j<event->numberOfOptions;j++)
+    {
+        switch (event->options[j].type)
+        {
+            case EVENT_OPTION_TID:
+                flags |= (1ULL<<19);
+                break;
+            case EVENT_OPTION_EDGE:
+                flags |= (1ULL<<18);
+                break;
+            case EVENT_OPTION_INVERT:
+                flags |= (1ULL<<23);
+                break;
+            case EVENT_OPTION_THRESHOLD:
+                flags |= (event->options[j].value & 0xFFULL)<<24;
+                break;
+            default:
+                break;
+        }
+    }
+    VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_CBOX);
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+    return 0;
+}
 
-            if (perfmon_verbose)
-            {
-                printf("[%d] perfmon_setup_counter PMC: Write Register 0x%llX , Flags: 0x%llX \n",
-                        cpu_id,
-                        LLU_CAST reg,
-                        LLU_CAST flags);
-            }
 
-            msr_write(cpu_id, reg , flags);
-            break;
+int snb_ubox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+    int j;
+    uint32_t flags = 0x0U;
 
-        case FIXED:
-            fixed_flags |= (0x2 << (index*4));
-            break;
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+    {
+        return 0;
+    }
 
-        case POWER:
-            break;
+    flags |= (1ULL<<17);
+    flags |= (event->umask<<8) + event->eventId;
+    for(j=0;j<event->numberOfOptions;j++)
+    {
+        switch (event->options[j].type)
+        {
+            case EVENT_OPTION_EDGE:
+                flags |= (1ULL<<18);
+                break;
+            case EVENT_OPTION_INVERT:
+                flags |= (1ULL<<23);
+                break;
+            case EVENT_OPTION_THRESHOLD:
+                flags |= ((event->options[j].value & 0x1FULL) << 24);
+                break;
+            default:
+                break;
+        }
+    }
+    VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_UBOX)
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+    return 0;
+}
 
-        case MBOX0:
-            BOX_GATE_SNB(PCI_IMC_DEVICE_CH_0,MBOX0);
-            break;
+int snb_bbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+    int j;
+    uint32_t flags = 0x0U;
+    uint64_t match = 0x0ULL;
+    PciDeviceIndex dev = counter_map[index].device;
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+    {
+        return 0;
+    }
+    if (!pci_checkDevice(dev, cpu_id))
+    {
+        return -ENODEV;
+    }
 
-        case MBOX1:
-            BOX_GATE_SNB(PCI_IMC_DEVICE_CH_1,MBOX1);
-            break;
+    flags = (1ULL<<22);
+    flags |= (event->umask<<8) + event->eventId;
+    for(j=0;j<event->numberOfOptions;j++)
+    {
+        switch (event->options[j].type)
+        {
+            case EVENT_OPTION_EDGE:
+                flags |= (1ULL<<18);
+                break;
+            case EVENT_OPTION_INVERT:
+                flags |= (1ULL<<23);
+                break;
+            case EVENT_OPTION_THRESHOLD:
+                flags |= ((event->options[j].value & 0xFFULL) << 24);
+                break;
+            case EVENT_OPTION_OPCODE:
+                VERBOSEPRINTPCIREG(cpu_id, dev, PCI_UNC_HA_PMON_OPCODEMATCH,
+                                    LLU_CAST (event->options[j].value & 0x3FULL), SETUP_BBOX_OPCODE);
+                CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, PCI_UNC_HA_PMON_OPCODEMATCH,
+                                    (event->options[j].value & 0x3FULL)));
+                break;
+            case EVENT_OPTION_MATCH0:
+                match = event->options[j].value & 0xFFFFFFC0ULL;
+                VERBOSEPRINTPCIREG(cpu_id, dev, PCI_UNC_HA_PMON_ADDRMATCH0, LLU_CAST match, SETUP_BBOX_MATCH0);
+                CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, PCI_UNC_HA_PMON_ADDRMATCH0, match));
+                match = (event->options[j].value >> 32) & 0x3FFFULL;
+                VERBOSEPRINTPCIREG(cpu_id, dev, PCI_UNC_HA_PMON_ADDRMATCH1, LLU_CAST match, SETUP_BBOX_MATCH1);
+                CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, PCI_UNC_HA_PMON_ADDRMATCH1, match));
+                break;
+            default:
+                break;
+        }
+    }
+    VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].configRegister, LLU_CAST flags, SETUP_BBOX);
+    CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev,  counter_map[index].configRegister, flags));
+    return 0;
+}
 
-        case MBOX2:
-            BOX_GATE_SNB(PCI_IMC_DEVICE_CH_2,MBOX2);
-            break;
 
-        case MBOX3:
-            BOX_GATE_SNB(PCI_IMC_DEVICE_CH_3,MBOX3);
-            break;
+int snb_wbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+    int j;
+    uint32_t flags = 0x0U;
 
-        case SBOX0:
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+    {
+        return 0;
+    }
 
-            /* CTO_COUNT event requires programming of MATCH/MASK registers */
-            if (event->eventId == 0x38)
-            {
-                if(haveLock)
+    flags = (1ULL<<22);
+    flags |= event->eventId & 0xFF;
+    for(j=0;j<event->numberOfOptions;j++)
+    {
+        switch (event->options[j].type)
+        {
+            case EVENT_OPTION_EDGE:
+                flags |= (1ULL<<18);
+                break;
+            case EVENT_OPTION_INVERT:
+                flags |= (1ULL<<23);
+                break;
+            case EVENT_OPTION_THRESHOLD:
+                flags |= ((event->options[j].value & 0x1FULL) << 24);
+                break;
+            case EVENT_OPTION_OCCUPANCY:
+                flags |= ((event->options[j].value & 0x3ULL) << 14);
+                break;
+            case EVENT_OPTION_OCCUPANCY_EDGE:
+                flags |= (1ULL<<31);
+                break;
+            case EVENT_OPTION_OCCUPANCY_INVERT:
+                flags |= (1ULL<<30);
+                break;
+            case EVENT_OPTION_OCCUPANCY_FILTER:
+                VERBOSEPRINTREG(cpu_id, MSR_UNC_PCU_PMON_BOX_FILTER, LLU_CAST event->options[j].value & 0xFFFFFFFFULL, SETUP_WBOX_FILTER);
+                CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNC_PCU_PMON_BOX_FILTER, event->options[j].value & 0xFFFFFFFFULL));
+            default:
+                break;
+        }
+    }
+    VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_WBOX);
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+    return 0;
+}
+
+int snb_sbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event, PciDeviceIndex filterdev)
+{
+    int j;
+    uint32_t flags = 0x0U;
+    PciDeviceIndex dev = counter_map[index].device;
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+    {
+        return 0;
+    }
+    if (!pci_checkDevice(dev, cpu_id))
+    {
+        return -ENODEV;
+    }
+
+    flags = (1ULL<<22);
+    flags |= event->cfgBits;
+    flags |= (event->umask<<8) + event->eventId;
+    for(j=0;j<event->numberOfOptions;j++)
+    {
+        switch (event->options[j].type)
+        {
+            case EVENT_OPTION_EDGE:
+                flags |= (1ULL<<18);
+                break;
+            case EVENT_OPTION_INVERT:
+                flags |= (1ULL<<23);
+                break;
+            case EVENT_OPTION_THRESHOLD:
+                flags |= ((event->options[j].value & 0xFFULL) << 24);
+                break;
+            case EVENT_OPTION_MATCH0:
+                if (pci_checkDevice(filterdev, cpu_id))
                 {
-                    //uflags = pci_read(cpu_id, PCI_QPI_DEVICE_PORT_0, reg);
-                    //uflags &= ~(0xFFFFU);
-                    uflags = (1<<22);
-                    uflags |= (1UL<<21) + event->eventId; /* Set extension bit */
-                    printf("UFLAGS 0x%x \n",uflags);
-                    pci_write(cpu_id, PCI_QPI_DEVICE_PORT_0,  reg, uflags);
-
-                    /* program MATCH0 */
-                    uflags = 0x0UL;
-                    uflags = (event->cmask<<13) + (event->umask<<8);
-                    printf("MATCH UFLAGS 0x%x \n",uflags);
-                    pci_write(cpu_id, PCI_QPI_MASK_DEVICE_PORT_0, PCI_UNC_QPI_PMON_MATCH_0, uflags);
-
-                    /* program MASK0 */
-                    uflags = 0x0UL;
-                    uflags = (0x3F<<12) + (event->cfgBits<<4);
-                    printf("MASK UFLAGS 0x%x \n",uflags);
-                    pci_write(cpu_id, PCI_QPI_MASK_DEVICE_PORT_0, PCI_UNC_QPI_PMON_MASK_0, uflags);
+                    VERBOSEPRINTPCIREG(cpu_id, filterdev, PCI_UNC_QPI_PMON_MATCH_0,
+                                    event->options[j].value & 0x8003FFF8ULL, SETUP_SBOX_MATCH0);
+                    CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, filterdev, PCI_UNC_QPI_PMON_MATCH_0,
+                                    event->options[j].value & 0x8003FFF8ULL));
                 }
-            }
-            else
-            {
-                BOX_GATE_SNB(PCI_QPI_DEVICE_PORT_0,SBOX0);
-            }
+                else
+                {
+                    DEBUG_PRINT(DEBUGLEV_ONLY_ERROR, Filtering for counter %s cannot be applied. PCI device not available, counter_map[index].key);
+                }
+                break;
+            case EVENT_OPTION_MATCH1:
+                if (pci_checkDevice(filterdev, cpu_id))
+                {
+                    VERBOSEPRINTPCIREG(cpu_id, filterdev, PCI_UNC_QPI_PMON_MATCH_1,
+                                    event->options[j].value & 0x000F000FULL, SETUP_SBOX_MATCH1);
+                    CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, filterdev, PCI_UNC_QPI_PMON_MATCH_1,
+                                    event->options[j].value & 0x000F000FULL));
+                }
+                else
+                {
+                    DEBUG_PRINT(DEBUGLEV_ONLY_ERROR, Filtering for counter %s cannot be applied. PCI device not available, counter_map[index].key);
+                }
+                break;
+            case EVENT_OPTION_MASK0:
+                if (pci_checkDevice(filterdev, cpu_id))
+                {
+                    VERBOSEPRINTPCIREG(cpu_id, filterdev, PCI_UNC_QPI_PMON_MASK_0,
+                                    event->options[j].value & 0x8003FFF8ULL, SETUP_SBOX_MASK0);
+                    CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, filterdev, PCI_UNC_QPI_PMON_MASK_0,
+                                    event->options[j].value & 0x8003FFF8ULL));
+                }
+                else
+                {
+                    DEBUG_PRINT(DEBUGLEV_ONLY_ERROR, Filtering for counter %s cannot be applied. PCI device not available, counter_map[index].key);
+                }
+                break;
+            case EVENT_OPTION_MASK1:
+                if (pci_checkDevice(filterdev, cpu_id))
+                {
+                    VERBOSEPRINTPCIREG(cpu_id, filterdev, PCI_UNC_QPI_PMON_MASK_1,
+                                    event->options[j].value & 0x000F000FULL, SETUP_SBOX_MASK1);
+                    CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, filterdev, PCI_UNC_QPI_PMON_MASK_1,
+                                    event->options[j].value & 0x000F000FULL));
+                }
+                else
+                {
+                    DEBUG_PRINT(DEBUGLEV_ONLY_ERROR, Filtering for counter %s cannot be applied. PCI device not available, counter_map[index].key);
+                }
+                break;
+            default:
+                break;
+        }
+    }
+    VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].configRegister, LLU_CAST flags, SETUP_SBOX);
+    CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev,  counter_map[index].configRegister, flags));
+    return 0;
+}
 
-            break;
 
-        case SBOX1:
 
-            /* CTO_COUNT event requires programming of MATCH/MASK registers */
-            if (event->eventId == 0x38)
-            {
-                if(haveLock)
-                {
-                    //uflags = pci_read(cpu_id, PCI_QPI_DEVICE_PORT_1, reg);
-                    //uflags &= ~(0xFFFFU);
-                    uflags = (1<<22);
-                    uflags |= (1UL<<21) + event->eventId; /* Set extension bit */
-                    pci_write(cpu_id, PCI_QPI_DEVICE_PORT_1,  reg, uflags);
-
-                    /* program MATCH0 */
-                    uflags = 0x0UL;
-                    uflags = (event->cmask<<13) + (event->umask<<8);
-                    pci_write(cpu_id, PCI_QPI_MASK_DEVICE_PORT_1, PCI_UNC_QPI_PMON_MATCH_0, uflags);
-
-                    /* program MASK0 */
-                    uflags = 0x0UL;
-                    uflags = (0x3F<<12) + (event->cfgBits<<4);
-                    pci_write(cpu_id, PCI_QPI_MASK_DEVICE_PORT_1, PCI_UNC_QPI_PMON_MASK_0, uflags);
-                }
-            }
-            else
-            {
-                BOX_GATE_SNB(PCI_QPI_DEVICE_PORT_0,SBOX0);
-            }
-            break;
+int snb_rbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+    int j;
+    uint32_t flags = 0x0U;
+    PciDeviceIndex dev = counter_map[index].device;
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+    {
+        return 0;
+    }
+    if (!pci_checkDevice(dev, cpu_id))
+    {
+        return -ENODEV;
+    }
 
-        default:
-            /* should never be reached */
-            break;
+    flags = (1ULL<<22);
+    flags |= (event->umask<<8) + event->eventId;
+    for(j=0;j<event->numberOfOptions;j++)
+    {
+        switch (event->options[j].type)
+        {
+            case EVENT_OPTION_EDGE:
+                flags |= (1ULL<<18);
+                break;
+            case EVENT_OPTION_INVERT:
+                flags |= (1ULL<<23);
+                break;
+            case EVENT_OPTION_THRESHOLD:
+                flags |= ((event->options[j].value & 0xFFULL) << 24);
+                break;
+            default:
+                break;
+        }
     }
-    if (fixed_flags != orig_fixed_flags)
+    VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].configRegister, LLU_CAST flags, SETUP_RBOX)
+    CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, flags));
+    return 0;
+}
+
+int snb_pbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+    int j;
+    uint32_t flags = 0x0U;
+    PciDeviceIndex dev = counter_map[index].device;
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+    {
+        return 0;
+    }
+    if (!pci_checkDevice(dev, cpu_id))
     {
-        msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, fixed_flags);
+        return -ENODEV;
     }
+
+    flags = (1ULL<<22);
+    flags |= (event->umask<<8) + event->eventId;
+    for(j=0;j<event->numberOfOptions;j++)
+    {
+        switch (event->options[j].type)
+        {
+            case EVENT_OPTION_EDGE:
+                flags |= (1ULL<<18);
+                break;
+            case EVENT_OPTION_INVERT:
+                flags |= (1ULL<<23);
+                break;
+            case EVENT_OPTION_THRESHOLD:
+                flags |= ((event->options[j].value & 0xFFULL) << 24);
+                break;
+            default:
+                break;
+        }
+    }
+    VERBOSEPRINTPCIREG(cpu_id, dev, counter_map[index].configRegister, LLU_CAST flags, SETUP_PBOX)
+    CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter_map[index].configRegister, flags));
+    return 0;
 }
 
-void perfmon_startCountersThread_sandybridge(int thread_id)
+// Macros to stop counting and reset control registers; they expand inside functions that define haveLock, eventSet and cpu_id
+// FREEZE_AND_RESET_CTL writes 0x10101 to the central box register: bits 8 + 16 freeze the box, bit 0 resets the control registers
+#define SNB_FREEZE_AND_RESET_CTL_BOX(id) \
+    if (haveLock && eventSet->regTypeMask & (REG_TYPE_MASK(id))) \
+    { \
+        VERBOSEPRINTREG(cpu_id, box_map[id].ctrlRegister, 0x10101U, FREEZE_AND_RESET_CTL_BOX_##id) \
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[id].ctrlRegister, 0x10101ULL)); \
+    }
+// SNB_FREEZE_BOX writes 0x10100 (freeze bits 8 + 16 only, no control reset) to the box control MSR when haveLock is set and the box type is in regTypeMask
+#define SNB_FREEZE_BOX(id) \
+    if (haveLock && (eventSet->regTypeMask & (REG_TYPE_MASK(id)))) \
+    { \
+        VERBOSEPRINTREG(cpu_id, box_map[id].ctrlRegister, 0x10100U, FREEZE_BOX_##id) \
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[id].ctrlRegister, 0x10100ULL)); \
+    }
+
+// FREEZE_AND_RESET_CTL_PCI writes 0x10101 to the PCI box control register: bits 8 + 16 freeze the box, bit 0 resets the control registers
+// pci_checkDevice() returns non-zero for an available device, so the write is performed only when the device exists
+#define SNB_FREEZE_AND_RESET_CTL_PCI_BOX(id) \
+    if (haveLock && \
+        (eventSet->regTypeMask & (REG_TYPE_MASK(id))) && \
+        (pci_checkDevice(box_map[id].device, cpu_id))) \
+    { \
+        VERBOSEPRINTPCIREG(cpu_id, box_map[id].device, box_map[id].ctrlRegister, 0x10101ULL, FREEZE_AND_RESET_CTL_PCI_BOX_##id); \
+        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, box_map[id].device, box_map[id].ctrlRegister, 0x10101ULL)); \
+    }
+// SNB_FREEZE_PCI_BOX writes 0x10100 (freeze bits 8 + 16, no reset) to the PCI box control register, only if pci_checkDevice() reports the device (non-zero)
+#define SNB_FREEZE_PCI_BOX(id) \
+    if (haveLock && \
+        (eventSet->regTypeMask & (REG_TYPE_MASK(id))) && \
+        (pci_checkDevice(box_map[id].device, cpu_id))) \
+    { \
+        VERBOSEPRINTPCIREG(cpu_id, box_map[id].device, box_map[id].ctrlRegister, 0x10100ULL, FREEZE_PCI_BOX_##id) \
+        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, box_map[id].device, box_map[id].ctrlRegister, 0x10100ULL)); \
+    }
+
+// MBOX*FIX use a slightly different scheme: writing 0 to the whole PCI_UNC_MC_PMON_FIXED_CTL register of the channel device freezes the counter
+#define SNB_FREEZE_MBOXFIX(number) \
+    if (haveLock && (eventSet->regTypeMask & (REG_TYPE_MASK(MBOX##number##FIX))) && \
+                    (pci_checkDevice(PCI_IMC_DEVICE_0_CH_##number, cpu_id))) \
+    { \
+        VERBOSEPRINTPCIREG(cpu_id, PCI_IMC_DEVICE_0_CH_##number, PCI_UNC_MC_PMON_FIXED_CTL, 0x0ULL, FREEZE_MBOXFIX##number) \
+        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, PCI_IMC_DEVICE_0_CH_##number,  PCI_UNC_MC_PMON_FIXED_CTL, 0x0ULL)); \
+    }
+
+
+
+int perfmon_setupCounterThread_sandybridge(
+        int thread_id,
+        PerfmonEventSet* eventSet)
 {
+    int i;
     int haveLock = 0;
     uint64_t flags = 0x0ULL;
-    uint32_t uflags = 0x10000UL; /* Clear freeze bit */
-    int cpu_id = perfmon_threadData[thread_id].processorId;
+    uint64_t fixed_flags = 0x0ULL;
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
     {
         haveLock = 1;
     }
 
-    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)))
+    {
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, 0x0ULL));
+    }
+    SNB_FREEZE_BOX(CBOX0);
+    SNB_FREEZE_BOX(CBOX1);
+    SNB_FREEZE_BOX(CBOX2);
+    SNB_FREEZE_BOX(CBOX3);
+    SNB_FREEZE_BOX(CBOX4);
+    SNB_FREEZE_BOX(CBOX5);
+    SNB_FREEZE_BOX(CBOX6);
+    SNB_FREEZE_BOX(CBOX7);
+
+    SNB_FREEZE_PCI_BOX(MBOX0);
+    SNB_FREEZE_PCI_BOX(MBOX1);
+    SNB_FREEZE_PCI_BOX(MBOX2);
+    SNB_FREEZE_PCI_BOX(MBOX3);
+
+    SNB_FREEZE_MBOXFIX(0);
+    SNB_FREEZE_MBOXFIX(1);
+    SNB_FREEZE_MBOXFIX(2);
+    SNB_FREEZE_MBOXFIX(3);
+
+    SNB_FREEZE_PCI_BOX(SBOX0);
+    SNB_FREEZE_PCI_BOX(SBOX1);
+
+    SNB_FREEZE_PCI_BOX(RBOX0);
+    SNB_FREEZE_PCI_BOX(RBOX1);
+
+    SNB_FREEZE_PCI_BOX(PBOX);
+
+    SNB_FREEZE_PCI_BOX(BBOX0);
+    SNB_FREEZE_BOX(WBOX);
+
+    for (i=0;i < eventSet->numberOfEvents;i++)
+    {
+        flags = 0x0ULL;
+        RegisterType type = eventSet->events[i].type;
+        if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+        {
+            continue;
+        }
+        PerfmonEvent *event = &(eventSet->events[i].event);
+        RegisterIndex index = eventSet->events[i].index;
+        PciDeviceIndex dev = counter_map[index].device;
+        eventSet->events[i].threadCounter[thread_id].init = TRUE;
+        switch (type)
+        {
+            case PMC:
+                snb_pmc_setup(cpu_id, index, event);
+                break;
+
+            case FIXED:
+                /* initialize fixed counters
+                 * FIXED 0: Instructions retired
+                 * FIXED 1: Clocks unhalted core
+                 * FIXED 2: Clocks unhalted ref */
+                fixed_flags |= snb_fixed_setup(index,event);
+                /* Written in the end of function for all fixed purpose registers */
+                break;
+
+            case POWER:
+                break;
+
+            case MBOX0:
+            case MBOX1:
+            case MBOX2:
+            case MBOX3:
+                snb_mbox_setup(cpu_id, index, event);
+                break;
+
+            case MBOX0FIX:
+                break;
+            case MBOX1FIX:
+                break;
+            case MBOX2FIX:
+                break;
+            case MBOX3FIX:
+                break;
+
+            case CBOX0:
+            case CBOX1:
+            case CBOX2:
+            case CBOX3:
+            case CBOX4:
+            case CBOX5:
+            case CBOX6:
+            case CBOX7:
+                snb_cbox_setup(cpu_id, index, event);
+                break;
+
+            case UBOX:
+                snb_ubox_setup(cpu_id, index, event);
+                break;
+                
+            case UBOXFIX:
+                VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST 0x0ULL, SETUP_UBOXFIX)
+                CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, 0x0ULL));
+                break;
+
+            case SBOX0:
+                snb_sbox_setup(cpu_id, index, event, PCI_QPI_MASK_DEVICE_PORT_0);
+                break;
+            case SBOX1:
+                snb_sbox_setup(cpu_id, index, event, PCI_QPI_MASK_DEVICE_PORT_1);
+                break;
+
+            case SBOX0FIX:
+            case SBOX1FIX:
+                break;
+
+            case BBOX0:
+                snb_bbox_setup(cpu_id, index, event);
+                break;
+
+            case WBOX:
+                snb_wbox_setup(cpu_id, index, event);
+                break;
+
+            case RBOX0:
+            case RBOX1:
+                snb_rbox_setup(cpu_id, index, event);
+                break;
+
+            case PBOX:
+                snb_pbox_setup(cpu_id, index, event);
+                break;
+
+
+            default:
+                break;
+        }
+    }
+    
+    if (fixed_flags > 0x0)
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_FIXED_CTR_CTRL, LLU_CAST fixed_flags, SETUP_FIXED)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_FIXED_CTR_CTRL, fixed_flags));
+    }
+    return 0;
+}
+
+
+// Macros for MSR HPM counters; they expect haveLock, eventSet and cpu_id in the expanding scope
+// SNB_UNFREEZE_BOX writes 0x0 to the central box control register to unfreeze a box whose type is set in regTypeMask
+#define SNB_UNFREEZE_BOX(id) \
+    if (haveLock && (eventSet->regTypeMask & (REG_TYPE_MASK(id)))) { \
+        VERBOSEPRINTREG(cpu_id, box_map[id].ctrlRegister, LLU_CAST 0x0ULL, UNFREEZE_BOX_##id) \
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[id].ctrlRegister, 0x0ULL)); \
+    }
+// Like SNB_UNFREEZE_BOX, but writes 0x2 (bit 1) to the box control register so the counter registers are reset while unfreezing
+#define SNB_UNFREEZE_AND_RESET_CTR_BOX(id) \
+    if (haveLock && (eventSet->regTypeMask & (REG_TYPE_MASK(id)))) { \
+        VERBOSEPRINTREG(cpu_id, box_map[id].ctrlRegister, LLU_CAST 0x2ULL, UNFREEZE_AND_RESET_CTR_BOX_##id) \
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[id].ctrlRegister, 0x2ULL)); \
+    }
+
+// ENABLE(_AND_RESET_CTR) read-modify-write a register to set the enable bit (22); the _AND_RESET_CTR variant also sets bit 17 to reset the counter
+#define SNB_ENABLE_BOX(id, reg) \
+    if (haveLock && (eventSet->regTypeMask & (REG_TYPE_MASK(id)))) { \
+        uint64_t tmp = 0x0ULL; \
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, reg, &tmp)); \
+        tmp |= (1ULL<<22); \
+        VERBOSEPRINTREG(cpu_id, reg, LLU_CAST tmp, ENABLE_BOX_##id) \
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg, tmp)); \
+    }
+// Sets bits 22 and 17 in box_map[id].ctrlRegister. NOTE(review): SNB_ENABLE_BOX takes the register as an argument instead - confirm the box control register is the intended target
+#define SNB_ENABLE_AND_RESET_CTR_BOX(id) \
+    if (haveLock && (eventSet->regTypeMask & (REG_TYPE_MASK(id)))) { \
+        uint64_t tmp = 0x0ULL; \
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, box_map[id].ctrlRegister, &tmp)); \
+        tmp |= (1ULL<<22)|(1ULL<<17); \
+        VERBOSEPRINTREG(cpu_id, box_map[id].ctrlRegister, LLU_CAST tmp, ENABLE_AND_RESET_CTR_BOX_##id) \
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[id].ctrlRegister, tmp)); \
+    }
+
+// UNFREEZE(_AND_RESET_CTR)_PCI mirror the MSR UNFREEZE macros for PCI devices: they write 0x0 resp. 0x2 to the box control register, but only if pci_checkDevice() reports the device
+#define SNB_UNFREEZE_PCI_BOX(id) \
+    if (haveLock && (eventSet->regTypeMask & (REG_TYPE_MASK(id))) \
+                && (pci_checkDevice(box_map[id].device, cpu_id))) \
+    { \
+        VERBOSEPRINTPCIREG(cpu_id, box_map[id].device, box_map[id].ctrlRegister, LLU_CAST 0x0ULL, UNFREEZE_PCI_BOX_##id) \
+        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, box_map[id].device, box_map[id].ctrlRegister, 0x0ULL)); \
+    }
+#define SNB_UNFREEZE_AND_RESET_CTR_PCI_BOX(id) \
+    if (haveLock && (eventSet->regTypeMask & (REG_TYPE_MASK(id))) \
+                && (pci_checkDevice(box_map[id].device, cpu_id))) \
+    { \
+        VERBOSEPRINTPCIREG(cpu_id, box_map[id].device, box_map[id].ctrlRegister, LLU_CAST 0x2ULL, UNFREEZE_AND_RESET_CTR_PCI_BOX_##id) \
+        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, box_map[id].device, box_map[id].ctrlRegister, 0x2ULL)); \
+    }
+
+// UNFREEZE(_AND_RESET_CTR)_MBOXFIX write PCI_UNC_MC_PMON_FIXED_CTL of the channel device: bit 22 enables the fixed counter, bit 19 additionally resets it
+#define SNB_UNFREEZE_AND_RESET_CTR_MBOXFIX(number) \
+    if (haveLock && (eventSet->regTypeMask & (REG_TYPE_MASK(MBOX##number##FIX))) && \
+                    (pci_checkDevice(PCI_IMC_DEVICE_0_CH_##number, cpu_id))) \
+    { \
+        VERBOSEPRINTPCIREG(cpu_id, PCI_IMC_DEVICE_0_CH_##number, \
+                PCI_UNC_MC_PMON_FIXED_CTL, LLU_CAST (1ULL<<22)|(1ULL<<19), UNFREEZE_AND_RESET_CTR_MBOX##number##FIX) \
+        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, PCI_IMC_DEVICE_0_CH_##number, PCI_UNC_MC_PMON_FIXED_CTL, (1ULL<<22)|(1ULL<<19))); \
+    }
+#define SNB_UNFREEZE_MBOXFIX(number) \
+    if (haveLock && (eventSet->regTypeMask & (REG_TYPE_MASK(MBOX##number##FIX))) && \
+                    (pci_checkDevice(PCI_IMC_DEVICE_0_CH_##number, cpu_id))) \
+    { \
+        VERBOSEPRINTPCIREG(cpu_id, PCI_IMC_DEVICE_0_CH_##number, \
+                PCI_UNC_MC_PMON_FIXED_CTL, LLU_CAST (1ULL<<22), UNFREEZE_MBOX##number##FIX) \
+        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, PCI_IMC_DEVICE_0_CH_##number,  PCI_UNC_MC_PMON_FIXED_CTL, (1ULL<<22))); \
+    }
+
+int perfmon_startCountersThread_sandybridge(int thread_id, PerfmonEventSet* eventSet)
+{
+    int haveLock = 0;
+    uint64_t tmp = 0x0ULL;
+    uint64_t flags = 0x0ULL;
+    int cpu_id = groupSet->threads[thread_id].processorId;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+    {
+        haveLock = 1;
+    }
 
-    for ( int i=0; i<perfmon_numCountersSandybridge; i++ )
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        if (perfmon_threadData[thread_id].counters[i].init == TRUE)
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
         {
-            switch (sandybridge_counter_map[i].type)
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+            {
+                continue;
+            }
+            tmp = 0x0ULL;
+            RegisterIndex index = eventSet->events[i].index;
+            uint64_t reg = counter_map[index].configRegister;
+            uint64_t counter1 = counter_map[index].counterRegister;
+            uint64_t counter2 = counter_map[index].counterRegister2;
+            PciDeviceIndex dev = counter_map[index].device;
+            switch (type)
             {
                 case PMC:
-                    msr_write(cpu_id, sandybridge_counter_map[i].counterRegister, 0x0ULL);
-                    flags |= (1<<(i-OFFSET_PMC));  /* enable counter */
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter1, 0x0ULL));
+                    flags |= (1<<(index-cpuid_info.perf_num_fixed_ctr));  /* enable counter */
                     break;
 
                 case FIXED:
-                    msr_write(cpu_id, sandybridge_counter_map[i].counterRegister, 0x0ULL);
-                    flags |= (1ULL<<(i+32));  /* enable fixed counter */
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter1, 0x0ULL));
+                    flags |= (1ULL<<(index+32));  /* enable fixed counter */
                     break;
 
                 case POWER:
                     if(haveLock)
                     {
-                        perfmon_threadData[thread_id].counters[i].counterData =
-                            power_read(cpu_id, sandybridge_counter_map[i].counterRegister);
+                        CHECK_POWER_READ_ERROR(power_read(cpu_id, counter1, (uint32_t*)&tmp));
+                        eventSet->events[i].threadCounter[thread_id].startData = tmp;
                     }
-
                     break;
 
                 case MBOX0:
-                    if(haveLock)
+                case MBOX1:
+                case MBOX2:
+                case MBOX3:
+                    if (haveLock && pci_checkDevice(dev, cpu_id))
                     {
-                        pci_write(cpu_id, PCI_IMC_DEVICE_CH_0,  PCI_UNC_MC_PMON_BOX_CTL, uflags);
+                        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter1, 0x0ULL));
+                        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter2, 0x0ULL));
                     }
                     break;
 
-                case MBOX1:
-                    if(haveLock)
+                case MBOX0FIX:
+                case MBOX1FIX:
+                case MBOX2FIX:
+                case MBOX3FIX:
+                    /*if (haveLock && pci_checkDevice(dev, cpu_id))
                     {
-                        pci_write(cpu_id, PCI_IMC_DEVICE_CH_1,  PCI_UNC_MC_PMON_BOX_CTL, uflags);
-                    }
+                        tmp = 0x0ULL;
+                        CHECK_PCI_READ_ERROR(HPMread(cpu_id, dev, counter1, &tmp));
+                        eventSet->events[i].threadCounter[thread_id].startData = tmp;
+                    }*/
                     break;
 
-                case MBOX2:
-                    if(haveLock)
-                    {
-                        pci_write(cpu_id, PCI_IMC_DEVICE_CH_2,  PCI_UNC_MC_PMON_BOX_CTL, uflags);
-                    }
+
+                case SBOX0:
+                case SBOX1:
+                case SBOX0FIX:
+                case SBOX1FIX:
+                case CBOX0:
+                case CBOX1:
+                case CBOX2:
+                case CBOX3:
+                case CBOX4:
+                case CBOX5:
+                case CBOX6:
+                case CBOX7:
                     break;
 
-                case MBOX3:
-                    if(haveLock)
+                case UBOX:
+                    //SNB_ENABLE_AND_RESET_CTR_BOX(UBOX);
+                    if (haveLock)
                     {
-                        pci_write(cpu_id, PCI_IMC_DEVICE_CH_3,  PCI_UNC_MC_PMON_BOX_CTL, uflags);
+                        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, reg, &tmp));
+                        tmp |= (1ULL<<22)|(1ULL<<17);
+                        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg, tmp));
                     }
                     break;
+                case UBOXFIX:
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter1, 0x0ULL));
+                    SNB_ENABLE_BOX(UBOXFIX, reg);
+                    break;
 
-                case MBOXFIX:
-                    if(haveLock)
+                case BBOX0:
+                    if (haveLock && pci_checkDevice(dev, cpu_id))
                     {
-                        pci_write(cpu_id, counter_map[i].device,  PCI_UNC_MC_PMON_FIXED_CTL, 0x48000UL);
+                        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter1, 0x0ULL));
+                        CHECK_PCI_WRITE_ERROR(HPMwrite(cpu_id, dev, counter2, 0x0ULL));
                     }
                     break;
 
-                case SBOX0:
-                    if(haveLock)
+                case WBOX:
+                    if (haveLock)
                     {
-                        pci_write(cpu_id, PCI_QPI_DEVICE_PORT_0,  PCI_UNC_QPI_PMON_BOX_CTL, uflags);
+                        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_UNC_PCU_PMON_BOX_FILTER, 0x0U));
                     }
                     break;
-
-                case SBOX1:
+                case WBOX0FIX:
+                case WBOX1FIX:
                     if(haveLock)
                     {
-                        pci_write(cpu_id, PCI_QPI_DEVICE_PORT_1,  PCI_UNC_QPI_PMON_BOX_CTL, uflags);
+                        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &tmp));
+                        eventSet->events[i].threadCounter[thread_id].startData = field64(tmp, 0, box_map[WBOX0FIX].regWidth);
                     }
                     break;
-
                 default:
-                    /* should never be reached */
                     break;
             }
         }
     }
 
-    if (perfmon_verbose)
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
     {
-        printf("perfmon_start_counters: Write Register 0x%X , \
-                Flags: 0x%llX \n",MSR_PERF_GLOBAL_CTRL, LLU_CAST flags);
-        printf("perfmon_start_counters: Write Register 0x%X , \
-                Flags: 0x%llX \n",MSR_UNCORE_PERF_GLOBAL_CTRL, LLU_CAST uflags);
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST flags, UNFREEZE_PMC_OR_FIXED)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, flags));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, 0x0ULL));
     }
-
-    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, flags);
-    msr_write(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, 0x30000000FULL);
+    SNB_UNFREEZE_AND_RESET_CTR_BOX(CBOX0);
+    SNB_UNFREEZE_AND_RESET_CTR_BOX(CBOX1);
+    SNB_UNFREEZE_AND_RESET_CTR_BOX(CBOX2);
+    SNB_UNFREEZE_AND_RESET_CTR_BOX(CBOX3);
+    SNB_UNFREEZE_AND_RESET_CTR_BOX(CBOX4);
+    SNB_UNFREEZE_AND_RESET_CTR_BOX(CBOX5);
+    SNB_UNFREEZE_AND_RESET_CTR_BOX(CBOX6);
+    SNB_UNFREEZE_AND_RESET_CTR_BOX(CBOX7);
+    SNB_UNFREEZE_AND_RESET_CTR_PCI_BOX(SBOX0);
+    SNB_UNFREEZE_AND_RESET_CTR_PCI_BOX(SBOX1);
+    SNB_UNFREEZE_PCI_BOX(MBOX0);
+    SNB_UNFREEZE_PCI_BOX(MBOX1);
+    SNB_UNFREEZE_PCI_BOX(MBOX2);
+    SNB_UNFREEZE_PCI_BOX(MBOX3);
+    SNB_UNFREEZE_AND_RESET_CTR_MBOXFIX(0);
+    SNB_UNFREEZE_AND_RESET_CTR_MBOXFIX(1);
+    SNB_UNFREEZE_AND_RESET_CTR_MBOXFIX(2);
+    SNB_UNFREEZE_AND_RESET_CTR_MBOXFIX(3);
+    SNB_UNFREEZE_PCI_BOX(BBOX0);
+    SNB_UNFREEZE_AND_RESET_CTR_BOX(WBOX);
+    SNB_UNFREEZE_AND_RESET_CTR_PCI_BOX(RBOX0);
+    SNB_UNFREEZE_AND_RESET_CTR_PCI_BOX(RBOX1);
+    SNB_UNFREEZE_AND_RESET_CTR_PCI_BOX(PBOX);
+    return 0;
 }
 
-void perfmon_stopCountersThread_sandybridge(int thread_id)
+// Read MSR counter register reg1 into counter_result (declared in the expanding scope) if haveLock is set and the box type is in regTypeMask
+#define SNB_READ_BOX(id, reg1) \
+    if (haveLock && (eventSet->regTypeMask & (REG_TYPE_MASK(id)))) \
+    { \
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, reg1, &counter_result)); \
+        VERBOSEPRINTREG(cpu_id, reg1, LLU_CAST counter_result, READ_BOX_##id) \
+    }
+
+// Read PCI counter registers and combine them to a single value: reg1 is shifted into the upper 32 bits of counter_result, reg2 is added on top
+#define SNB_READ_PCI_BOX(id, dev, reg1, reg2) \
+    if (haveLock && (eventSet->regTypeMask & (REG_TYPE_MASK(id))) && pci_checkDevice(dev, cpu_id)) \
+    { \
+        uint64_t tmp = 0x0ULL; \
+        CHECK_PCI_READ_ERROR(HPMread(cpu_id, dev, reg1, &tmp)); \
+        counter_result = (tmp<<32); \
+        CHECK_PCI_READ_ERROR(HPMread(cpu_id, dev, reg2, &tmp)); \
+        counter_result += tmp; \
+        VERBOSEPRINTPCIREG(cpu_id, dev, reg1, LLU_CAST counter_result, READ_PCI_BOX_##id) \
+    }
+
+// Check counter result for overflows. We do not handle overflows directly, that is done in the getResults function in perfmon.c
+// SandyBridge has no bits indicating that overflows occurred, therefore we use this simple check: a result below the stored counterData counts as one overflow
+#define SNB_CHECK_OVERFLOW \
+    if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData) \
+    { \
+        eventSet->events[i].threadCounter[thread_id].overflows++; \
+    }
+
+
+int perfmon_stopCountersThread_sandybridge(int thread_id, PerfmonEventSet* eventSet)
 {
-    uint64_t flags;
-    uint32_t uflags = 0x10100UL; /* Set freeze bit */
     uint64_t counter_result = 0x0ULL;
     int haveLock = 0;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
     {
         haveLock = 1;
     }
 
-    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
+    {
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+    }
+    SNB_FREEZE_BOX(CBOX0);
+    SNB_FREEZE_BOX(CBOX1);
+    SNB_FREEZE_BOX(CBOX2);
+    SNB_FREEZE_BOX(CBOX3);
+    SNB_FREEZE_BOX(CBOX4);
+    SNB_FREEZE_BOX(CBOX5);
+    SNB_FREEZE_BOX(CBOX6);
+    SNB_FREEZE_BOX(CBOX7);
+
+    SNB_FREEZE_PCI_BOX(MBOX0);
+    SNB_FREEZE_PCI_BOX(MBOX1);
+    SNB_FREEZE_PCI_BOX(MBOX2);
+    SNB_FREEZE_PCI_BOX(MBOX3);
 
-    for ( int i=0; i < perfmon_numCountersSandybridge; i++ ) 
+    SNB_FREEZE_AND_RESET_CTL_PCI_BOX(SBOX0);
+    SNB_FREEZE_AND_RESET_CTL_PCI_BOX(SBOX1);
+
+    SNB_FREEZE_AND_RESET_CTL_PCI_BOX(RBOX0);
+    SNB_FREEZE_AND_RESET_CTL_PCI_BOX(RBOX1);
+
+    SNB_FREEZE_AND_RESET_CTL_PCI_BOX(PBOX);
+
+    SNB_FREEZE_PCI_BOX(BBOX0);
+    SNB_FREEZE_AND_RESET_CTL_BOX(WBOX);
+
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        if (perfmon_threadData[thread_id].counters[i].init == TRUE) 
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE) 
         {
-            switch (sandybridge_counter_map[i].type)
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+            {
+                continue;
+            }
+            counter_result = 0x0ULL;
+            RegisterIndex index = eventSet->events[i].index;
+            PciDeviceIndex dev = counter_map[index].device;
+            uint64_t reg = counter_map[index].configRegister;
+            uint64_t counter1 = counter_map[index].counterRegister;
+            uint64_t counter2 = counter_map[index].counterRegister2;
+            switch (type)
             {
                 case PMC:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                    if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData)
+                    {
+                        uint64_t ovf_values = 0x0ULL;
+                        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_STATUS, &ovf_values));
+                        if (ovf_values & (1ULL<<(index - cpuid_info.perf_num_fixed_ctr)))
+                        {
+                            eventSet->events[i].threadCounter[thread_id].overflows++;
+                            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL,
+                                                        (1ULL<<(index - cpuid_info.perf_num_fixed_ctr))));
+                        }
+                    }
+                    break;
 
                 case FIXED:
-                    perfmon_threadData[thread_id].counters[i].counterData =
-                        msr_read(cpu_id, sandybridge_counter_map[i].counterRegister);
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                    if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData)
+                    {
+                        uint64_t ovf_values = 0x0ULL;
+                        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_STATUS, &ovf_values));
+                        if (ovf_values & (1ULL<<(index+32)))
+                        {
+                            eventSet->events[i].threadCounter[thread_id].overflows++;
+                            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, (1ULL<<(index+32))));
+                        }
+                    }
                     break;
 
                 case POWER:
-                    if(haveLock)
+                    if (haveLock)
                     {
-                        perfmon_threadData[thread_id].counters[i].counterData =
-                            power_info.energyUnit *
-                            ( power_read(cpu_id, sandybridge_counter_map[i].counterRegister) -
-                              perfmon_threadData[thread_id].counters[i].counterData);
+                        CHECK_POWER_READ_ERROR(power_read(cpu_id, counter1, (uint32_t*)&counter_result));
+                        SNB_CHECK_OVERFLOW;
                     }
                     break;
 
                 case THERMAL:
-                        perfmon_threadData[thread_id].counters[i].counterData =
-                             thermal_read(cpu_id);
+                    CHECK_MSR_READ_ERROR(thermal_read(cpu_id, (uint32_t*)&counter_result));
                     break;
 
                 case MBOX0:
-                    if(haveLock)
-                    {
-                        pci_write(cpu_id, PCI_IMC_DEVICE_CH_0,  PCI_UNC_MC_PMON_BOX_CTL, uflags);
-                        counter_result = pci_read(cpu_id, PCI_IMC_DEVICE_CH_0,
-                                sandybridge_counter_map[i].counterRegister);
-
-                        counter_result = (counter_result<<32) +
-                            pci_read(cpu_id, PCI_IMC_DEVICE_CH_0,
-                                    sandybridge_counter_map[i].counterRegister2);
-
-                        perfmon_threadData[thread_id].counters[i].counterData = counter_result;
-                    }
+                    
+                    SNB_READ_PCI_BOX(MBOX0, dev, counter1, counter2);
+                    VERBOSEPRINTPCIREG(cpu_id, dev, reg,  LLU_CAST counter_result, READ_MBOX0);
+                    SNB_CHECK_OVERFLOW;
                     break;
 
                 case MBOX1:
-                    if(haveLock)
-                    {
-                        pci_write(cpu_id, PCI_IMC_DEVICE_CH_1,  PCI_UNC_MC_PMON_BOX_CTL, uflags);
-                        counter_result = pci_read(cpu_id, PCI_IMC_DEVICE_CH_1,
-                                sandybridge_counter_map[i].counterRegister);
-
-                        counter_result = (counter_result<<32) +
-                            pci_read(cpu_id, PCI_IMC_DEVICE_CH_1,
-                                    sandybridge_counter_map[i].counterRegister2);
-
-                        perfmon_threadData[thread_id].counters[i].counterData = counter_result;
-                    }
+                    SNB_READ_PCI_BOX(MBOX1, dev, counter1, counter2);
+                    VERBOSEPRINTPCIREG(cpu_id, dev, reg,  LLU_CAST counter_result, READ_MBOX1);
+                    SNB_CHECK_OVERFLOW;
                     break;
 
                 case MBOX2:
-                    if(haveLock)
-                    {
-                        pci_write(cpu_id, PCI_IMC_DEVICE_CH_2,  PCI_UNC_MC_PMON_BOX_CTL, uflags);
-
-                        counter_result = pci_read(cpu_id, PCI_IMC_DEVICE_CH_2,
-                                sandybridge_counter_map[i].counterRegister);
-
-                        counter_result = (counter_result<<32) +
-                            pci_read(cpu_id, PCI_IMC_DEVICE_CH_2,
-                                    sandybridge_counter_map[i].counterRegister2);
-
-                        perfmon_threadData[thread_id].counters[i].counterData = counter_result;
-                    }
+                    SNB_READ_PCI_BOX(MBOX2, dev, counter1, counter2);
+                    VERBOSEPRINTPCIREG(cpu_id, dev, reg,  LLU_CAST counter_result, READ_MBOX2);
+                    SNB_CHECK_OVERFLOW;
                     break;
 
                 case MBOX3:
-                    if(haveLock)
-                    {
-                        pci_write(cpu_id, PCI_IMC_DEVICE_CH_3,  PCI_UNC_MC_PMON_BOX_CTL, uflags);
+                    SNB_READ_PCI_BOX(MBOX3, dev, counter1, counter2);
+                    VERBOSEPRINTPCIREG(cpu_id, dev, counter1,  LLU_CAST counter_result, READ_MBOX3);
+                    SNB_CHECK_OVERFLOW;
+                    break;
 
-                        counter_result = pci_read(cpu_id, PCI_IMC_DEVICE_CH_3,
-                                sandybridge_counter_map[i].counterRegister);
+                case MBOX0FIX:
+                    SNB_READ_PCI_BOX(MBOX0FIX, dev, counter1, counter2);
+                    VERBOSEPRINTPCIREG(cpu_id, dev, counter1,  LLU_CAST counter_result, READ_MBOX0FIX);
+                    SNB_CHECK_OVERFLOW;
+                    break;
+                case MBOX1FIX:
+                    SNB_READ_PCI_BOX(MBOX1FIX, dev, counter1, counter2);
+                    VERBOSEPRINTPCIREG(cpu_id, dev, counter1,  LLU_CAST counter_result, READ_MBOX1FIX);
+                    SNB_CHECK_OVERFLOW;
+                    break;
+                case MBOX2FIX:
+                    SNB_READ_PCI_BOX(MBOX2FIX, dev, counter1, counter2);
+                    VERBOSEPRINTPCIREG(cpu_id, dev, counter1,  LLU_CAST counter_result, READ_MBOX2FIX);
+                    SNB_CHECK_OVERFLOW;
+                    break;
+                case MBOX3FIX:
+                    SNB_READ_PCI_BOX(MBOX3FIX, dev, counter1, counter2);
+                    VERBOSEPRINTPCIREG(cpu_id, dev, counter1,  LLU_CAST counter_result, READ_MBOX3FIX);
+                    SNB_CHECK_OVERFLOW;
+                    break;
 
-                        counter_result = (counter_result<<32) +
-                            pci_read(cpu_id, PCI_IMC_DEVICE_CH_3,
-                                    sandybridge_counter_map[i].counterRegister2);
+                case SBOX0:
+                    SNB_READ_PCI_BOX(SBOX0, dev, counter1, counter2);
+                    VERBOSEPRINTPCIREG(cpu_id, dev, counter1,  LLU_CAST counter_result, READ_SBOX0);
+                    SNB_CHECK_OVERFLOW;
+                    break;
 
-                        perfmon_threadData[thread_id].counters[i].counterData = counter_result;
-                    }
+                case SBOX1:
+                    SNB_READ_PCI_BOX(SBOX1, dev, counter1, counter2);
+                    VERBOSEPRINTPCIREG(cpu_id, dev, counter1,  LLU_CAST counter_result, READ_SBOX1);
+                    SNB_CHECK_OVERFLOW;
                     break;
 
-                case MBOXFIX:
-                    if(haveLock)
+                case SBOX0FIX:
+                case SBOX1FIX:
+                    if (haveLock && pci_checkDevice(dev, cpu_id))
                     {
-                        pci_write(cpu_id, PCI_IMC_DEVICE_CH_0,  PCI_UNC_MC_PMON_FIXED_CTL, uflags);
-
-                        counter_result = pci_read(cpu_id, PCI_IMC_DEVICE_CH_0,
-                                sandybridge_counter_map[i].counterRegister);
-
-                        counter_result = (counter_result<<32) +
-                            pci_read(cpu_id, PCI_IMC_DEVICE_CH_0,
-                                    sandybridge_counter_map[i].counterRegister2);
-
-                        perfmon_threadData[thread_id].counters[i].counterData = counter_result;
+                        HPMread(cpu_id, dev, counter1, &counter_result);
+                        if (eventSet->events[i].event.eventId == 0x00)
+                        {
+                            switch(extractBitField(counter_result, 3, 0))
+                            {
+                                case 0x2:
+                                    counter_result = 5.6E9;
+                                    break;
+                                case 0x3:
+                                    counter_result = 6.4E9;
+                                    break;
+                                case 0x4:
+                                    counter_result = 7.2E9;
+                                    break;
+                                case 0x5:
+                                    counter_result = 8.0E9;
+                                    break;
+                                case 0x6:
+                                    counter_result = 8.8E9;
+                                    break;
+                                case 0x7:
+                                    counter_result = 9.6E9;
+                                    break;
+                                default:
+                                    counter_result = 0;
+                                    break;
+                            }
+                        }
+                        else if (eventSet->events[i].event.eventId == 0x01)
+                        {
+                            counter_result = extractBitField(counter_result, 1, 4);
+                        }
+                        VERBOSEPRINTPCIREG(cpu_id, dev, counter1,  LLU_CAST counter_result, READ_SBOXFIX);
                     }
                     break;
 
-                case SBOX0:
-                    if(haveLock)
-                    {
-                        pci_write(cpu_id, PCI_QPI_DEVICE_PORT_0,  PCI_UNC_QPI_PMON_BOX_CTL, uflags);
-
-                        counter_result = pci_read(cpu_id, PCI_QPI_DEVICE_PORT_0,
-                                sandybridge_counter_map[i].counterRegister);
-
-                        counter_result = (counter_result<<32) +
-                            pci_read(cpu_id, PCI_QPI_DEVICE_PORT_0,
-                                    sandybridge_counter_map[i].counterRegister2);
+                case CBOX0:
+                    SNB_READ_BOX(CBOX0, counter1);
+                    VERBOSEPRINTPCIREG(cpu_id, dev, counter1,  LLU_CAST counter_result, READ_CBOX0);
+                    SNB_CHECK_OVERFLOW;
+                    break;
+                case CBOX1:
+                    SNB_READ_BOX(CBOX1, counter1);
+                    VERBOSEPRINTPCIREG(cpu_id, dev, counter1,  LLU_CAST counter_result, READ_CBOX1);
+                    SNB_CHECK_OVERFLOW;
+                    break;
+                case CBOX2:
+                    SNB_READ_BOX(CBOX2, counter1);
+                    SNB_CHECK_OVERFLOW;
+                    break;
+                case CBOX3:
+                    SNB_READ_BOX(CBOX3, counter1);
+                    SNB_CHECK_OVERFLOW;
+                    break;
+                case CBOX4:
+                    SNB_READ_BOX(CBOX4, counter1);
+                    SNB_CHECK_OVERFLOW;
+                    break;
+                case CBOX5:
+                    SNB_READ_BOX(CBOX5, counter1);
+                    SNB_CHECK_OVERFLOW;
+                    break;
+                case CBOX6:
+                    SNB_READ_BOX(CBOX6, counter1);
+                    SNB_CHECK_OVERFLOW;
+                    break;
+                case CBOX7:
+                    SNB_READ_BOX(CBOX7, counter1);
+                    SNB_CHECK_OVERFLOW;
+                    break;
 
-                        perfmon_threadData[thread_id].counters[i].counterData = counter_result;
-                    }
+                case UBOX:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                case UBOXFIX:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                    SNB_CHECK_OVERFLOW;
                     break;
 
-                case SBOX1:
-                    if(haveLock)
-                    {
-                        pci_write(cpu_id, PCI_QPI_DEVICE_PORT_1,  PCI_UNC_QPI_PMON_BOX_CTL, uflags);
-                        counter_result = pci_read(cpu_id, PCI_QPI_DEVICE_PORT_1,
-                                sandybridge_counter_map[i].counterRegister);
+                case BBOX0:
+                    SNB_READ_PCI_BOX(BBOX0, dev, counter1, counter2);
+                    SNB_CHECK_OVERFLOW;
+                    break;
 
-                        counter_result = (counter_result<<32) +
-                            pci_read(cpu_id, PCI_QPI_DEVICE_PORT_1,
-                                    sandybridge_counter_map[i].counterRegister2);
+                case WBOX:
+                    SNB_READ_BOX(WBOX, counter1);
+                    SNB_CHECK_OVERFLOW;
+                    break;
+                case WBOX0FIX:
+                    SNB_READ_BOX(WBOX0FIX, counter1);
+                    break;
+                case WBOX1FIX:
+                    SNB_READ_BOX(WBOX1FIX, counter1);
+                    break;
 
-                        perfmon_threadData[thread_id].counters[i].counterData = counter_result;
-                    }
+                case RBOX0:
+                    SNB_READ_PCI_BOX(RBOX0, dev, counter1, counter2);
+                    break;
+                case RBOX1:
+                    SNB_READ_PCI_BOX(RBOX1, dev, counter1, counter2);
                     break;
 
+                case PBOX:
+                    SNB_READ_PCI_BOX(PBOX, dev, counter1, counter2);
+                    break;
                 default:
-                    /* should never be reached */
                     break;
             }
+            eventSet->events[i].threadCounter[thread_id].counterData =
+                    field64(counter_result, 0, box_map[type].regWidth);
         }
     }
 
-    flags = msr_read(cpu_id,MSR_PERF_GLOBAL_STATUS);
-    //    printf ("Status: 0x%llX \n", LLU_CAST flags);
-    if ( (flags & 0x3) || (flags & (0x3ULL<<32)) ) 
-    {
-        printf ("Overflow occured \n");
-    }
+    return 0;
 }
 
-void perfmon_readCountersThread_sandybridge(int thread_id)
+/*
+ * Read the current values of all initialized counters in eventSet for the
+ * hardware thread referenced by thread_id.
+ *
+ * The core counters are stopped by saving MSR_PERF_GLOBAL_CTRL and writing
+ * zero to it, the uncore boxes are handled by the SNB_FREEZE_* macros, then
+ * every initialized event is read into counter_result and stored, truncated
+ * to the register width of its box, in threadCounter[thread_id].counterData.
+ * Afterwards the SNB_UNFREEZE_* macros are run and the saved global control
+ * value is written back.
+ *
+ * Returns 0 at the end of the function.
+ * NOTE(review): the CHECK_*_ERROR macros presumably return early on a failed
+ * register access, which would skip the unfreeze/restore code - confirm.
+ */
+int perfmon_readCountersThread_sandybridge(int thread_id, PerfmonEventSet* eventSet)
 {
     uint64_t counter_result = 0x0ULL;
     int haveLock = 0;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
+    int cpu_id = groupSet->threads[thread_id].processorId;
+    uint64_t pmc_flags = 0x0ULL; /* saved MSR_PERF_GLOBAL_CTRL value, restored at the end */
 
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
     {
         haveLock = 1;
     }
 
-    for ( int i=0; i<perfmon_numCountersSandybridge; i++ )
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
+    {
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, &pmc_flags));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+    }
+
+    SNB_FREEZE_BOX(CBOX0);
+    SNB_FREEZE_BOX(CBOX1);
+    SNB_FREEZE_BOX(CBOX2);
+    SNB_FREEZE_BOX(CBOX3);
+    SNB_FREEZE_BOX(CBOX4);
+    SNB_FREEZE_BOX(CBOX5);
+    SNB_FREEZE_BOX(CBOX6);
+    SNB_FREEZE_BOX(CBOX7);
+
+    SNB_FREEZE_PCI_BOX(MBOX0);
+    SNB_FREEZE_PCI_BOX(MBOX1);
+    SNB_FREEZE_PCI_BOX(MBOX2);
+    SNB_FREEZE_PCI_BOX(MBOX3);
+
+    SNB_FREEZE_MBOXFIX(0);
+    SNB_FREEZE_MBOXFIX(1);
+    SNB_FREEZE_MBOXFIX(2);
+    SNB_FREEZE_MBOXFIX(3);
+
+    SNB_FREEZE_PCI_BOX(SBOX0);
+    SNB_FREEZE_PCI_BOX(SBOX1);
+
+    SNB_FREEZE_PCI_BOX(RBOX0);
+    SNB_FREEZE_PCI_BOX(RBOX1);
+
+    SNB_FREEZE_PCI_BOX(PBOX);
+
+    SNB_FREEZE_PCI_BOX(BBOX0);
+    SNB_FREEZE_BOX(WBOX);
+
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        if (perfmon_threadData[thread_id].counters[i].init == TRUE)
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
         {
-            if ((sandybridge_counter_map[i].type == PMC) ||
-                    (sandybridge_counter_map[i].type == FIXED))
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
             {
-                perfmon_threadData[thread_id].counters[i].counterData =
-                    msr_read(cpu_id, sandybridge_counter_map[i].counterRegister);
+                continue;
             }
-            else
+            counter_result = 0x0ULL;
+            RegisterIndex index = eventSet->events[i].index;
+            PciDeviceIndex dev = counter_map[index].device;
+            uint64_t counter1 = counter_map[index].counterRegister;
+            uint64_t counter2 = counter_map[index].counterRegister2;
+            switch (type)
             {
-                if(haveLock)
-                {
-                    switch (sandybridge_counter_map[i].type)
+                case PMC:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                    if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData)
                     {
-                        case POWER:
-                            perfmon_threadData[thread_id].counters[i].counterData =
-                                power_info.energyUnit *
-                                power_read(cpu_id, sandybridge_counter_map[i].counterRegister);
-                            break;
+                        uint64_t ovf_values = 0x0ULL;
+                        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_STATUS, &ovf_values));
+                        if (ovf_values & (1ULL<<(index - cpuid_info.perf_num_fixed_ctr)))
+                        {
+                            eventSet->events[i].threadCounter[thread_id].overflows++;
+                            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL,
+                                                        (1ULL<<(index - cpuid_info.perf_num_fixed_ctr))));
+                        }
+                    }
+                    break;
 
-                        case MBOX0:
-                            counter_result = pci_read(cpu_id, PCI_IMC_DEVICE_CH_0,
-                                    sandybridge_counter_map[i].counterRegister);
+                case FIXED:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                    if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData)
+                    {
+                        uint64_t ovf_values = 0x0ULL;
+                        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_STATUS, &ovf_values));
+                        if (ovf_values & (1ULL<<(index+32)))
+                        {
+                            eventSet->events[i].threadCounter[thread_id].overflows++;
+                            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, (1ULL<<(index+32))));
+                        }
+                    }
+                    break;
 
-                            counter_result = (counter_result<<32) +
-                                pci_read(cpu_id, PCI_IMC_DEVICE_CH_0,
-                                        sandybridge_counter_map[i].counterRegister2);
+                case THERMAL:
+                    CHECK_MSR_READ_ERROR(thermal_read(cpu_id, (uint32_t*)&counter_result));
+                    break;
 
-                            perfmon_threadData[thread_id].counters[i].counterData = counter_result;
-                            break;
+                case POWER:
+                    if (haveLock)
+                    {
+                        CHECK_POWER_READ_ERROR(power_read(cpu_id, counter1, (uint32_t*)&counter_result));
+                        SNB_CHECK_OVERFLOW;
+                    }
+                    break;
 
-                        case MBOX1:
-                            counter_result = pci_read(cpu_id, PCI_IMC_DEVICE_CH_1,
-                                    sandybridge_counter_map[i].counterRegister);
+                case MBOX0:
+                    SNB_READ_PCI_BOX(MBOX0, dev, counter1, counter2);
+                    SNB_CHECK_OVERFLOW;
+                    break;
 
-                            counter_result = (counter_result<<32) +
-                                pci_read(cpu_id, PCI_IMC_DEVICE_CH_1,
-                                        sandybridge_counter_map[i].counterRegister2);
+                case MBOX1:
+                    SNB_READ_PCI_BOX(MBOX1, dev, counter1, counter2);
+                    SNB_CHECK_OVERFLOW;
+                    break;
 
-                            perfmon_threadData[thread_id].counters[i].counterData = counter_result;
-                            break;
+                case MBOX2:
+                    SNB_READ_PCI_BOX(MBOX2, dev, counter1, counter2);
+                    SNB_CHECK_OVERFLOW;
+                    break;
 
-                        case MBOX2:
-                            counter_result = pci_read(cpu_id, PCI_IMC_DEVICE_CH_2,
-                                    sandybridge_counter_map[i].counterRegister);
+                case MBOX3:
+                    SNB_READ_PCI_BOX(MBOX3, dev, counter1, counter2);
+                    SNB_CHECK_OVERFLOW;
+                    break;
 
-                            counter_result = (counter_result<<32) +
-                                pci_read(cpu_id, PCI_IMC_DEVICE_CH_2,
-                                        sandybridge_counter_map[i].counterRegister2);
+                case UBOX:
+                case UBOXFIX:
+                    if (haveLock)
+                    {
+                        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                        SNB_CHECK_OVERFLOW;
+                    }
+                    /* Stop here: without this break the UBOX arm fell through into
+                     * the CBOX0 read below and used a UBOX register as a CBOX0 one. */
+                    break;
 
-                            perfmon_threadData[thread_id].counters[i].counterData = counter_result;
-                            break;
+                case CBOX0:
+                    SNB_READ_BOX(CBOX0, counter1);
+                    SNB_CHECK_OVERFLOW;
+                    break;
+                case CBOX1:
+                    SNB_READ_BOX(CBOX1, counter1);
+                    SNB_CHECK_OVERFLOW;
+                    break;
+                case CBOX2:
+                    SNB_READ_BOX(CBOX2, counter1);
+                    SNB_CHECK_OVERFLOW;
+                    break;
+                case CBOX3:
+                    SNB_READ_BOX(CBOX3, counter1);
+                    SNB_CHECK_OVERFLOW;
+                    break;
+                case CBOX4:
+                    SNB_READ_BOX(CBOX4, counter1);
+                    SNB_CHECK_OVERFLOW;
+                    break;
+                case CBOX5:
+                    SNB_READ_BOX(CBOX5, counter1);
+                    SNB_CHECK_OVERFLOW;
+                    break;
+                case CBOX6:
+                    SNB_READ_BOX(CBOX6, counter1);
+                    SNB_CHECK_OVERFLOW;
+                    break;
+                case CBOX7:
+                    SNB_READ_BOX(CBOX7, counter1);
+                    SNB_CHECK_OVERFLOW;
+                    break;
 
-                        case MBOX3:
-                            counter_result = pci_read(cpu_id, PCI_IMC_DEVICE_CH_3,
-                                    sandybridge_counter_map[i].counterRegister);
+                case BBOX0:
+                    SNB_READ_PCI_BOX(BBOX0, dev, counter1, counter2);
+                    SNB_CHECK_OVERFLOW;
+                    break;
 
-                            counter_result = (counter_result<<32) +
-                                pci_read(cpu_id, PCI_IMC_DEVICE_CH_3,
-                                        sandybridge_counter_map[i].counterRegister2);
+                case SBOX0:
+                    SNB_READ_PCI_BOX(SBOX0, dev, counter1, counter2);
+                    SNB_CHECK_OVERFLOW;
+                    break;
 
-                            perfmon_threadData[thread_id].counters[i].counterData = counter_result;
-                            break;
+                case SBOX1:
+                    SNB_READ_PCI_BOX(SBOX1, dev, counter1, counter2);
+                    SNB_CHECK_OVERFLOW;
+                    break;
 
-                        default:
-                            /* should never be reached */
-                            break;
+                case SBOX0FIX:
+                case SBOX1FIX:
+                    /* The raw register value is not a count: for event 0x00 the low
+                     * three bits select one of the fixed values below, for event 0x01
+                     * a single bit at position 4 is extracted. */
+                    HPMread(cpu_id, dev, counter1, &counter_result);
+                    if (eventSet->events[i].event.eventId == 0x00)
+                    {
+                        switch(extractBitField(counter_result, 3, 0))
+                        {
+                            case 0x2:
+                                counter_result = 5.6E9;
+                                break;
+                            case 0x3:
+                                counter_result = 6.4E9;
+                                break;
+                            case 0x4:
+                                counter_result = 7.2E9;
+                                break;
+                            case 0x5:
+                                counter_result = 8.0E9;
+                                break;
+                            case 0x6:
+                                counter_result = 8.8E9;
+                                break;
+                            case 0x7:
+                                counter_result = 9.6E9;
+                                break;
+                            default:
+                                counter_result = 0;
+                                break;
+                        }
                     }
-                }
+                    else if (eventSet->events[i].event.eventId == 0x01)
+                    {
+                        counter_result = extractBitField(counter_result, 1, 4);
+                    }
+                    eventSet->events[i].threadCounter[thread_id].startData = 0x0ULL;
+                    break;
+
+                case WBOX:
+                    SNB_READ_BOX(WBOX, counter1);
+                    SNB_CHECK_OVERFLOW;
+                    break;
+                case WBOX0FIX:
+                    SNB_READ_BOX(WBOX0FIX, counter1);
+                    break;
+                case WBOX1FIX:
+                    SNB_READ_BOX(WBOX1FIX, counter1);
+                    break;
+
+                default:
+                    break;
             }
+            /* Keep only the bits that belong to a counter of this box type. */
+            eventSet->events[i].threadCounter[thread_id].counterData =
+                    field64(counter_result, 0, box_map[type].regWidth);
         }
     }
+
+    SNB_UNFREEZE_BOX(CBOX0);
+    SNB_UNFREEZE_BOX(CBOX1);
+    SNB_UNFREEZE_BOX(CBOX2);
+    SNB_UNFREEZE_BOX(CBOX3);
+    SNB_UNFREEZE_BOX(CBOX4);
+    SNB_UNFREEZE_BOX(CBOX5);
+    SNB_UNFREEZE_BOX(CBOX6);
+    SNB_UNFREEZE_BOX(CBOX7);
+
+    SNB_UNFREEZE_PCI_BOX(MBOX0);
+    SNB_UNFREEZE_PCI_BOX(MBOX1);
+    SNB_UNFREEZE_PCI_BOX(MBOX2);
+    SNB_UNFREEZE_PCI_BOX(MBOX3);
+
+    SNB_UNFREEZE_MBOXFIX(0);
+    SNB_UNFREEZE_MBOXFIX(1);
+    SNB_UNFREEZE_MBOXFIX(2);
+    SNB_UNFREEZE_MBOXFIX(3);
+
+    SNB_UNFREEZE_PCI_BOX(SBOX0);
+    SNB_UNFREEZE_PCI_BOX(SBOX1);
+
+    SNB_UNFREEZE_PCI_BOX(RBOX0);
+    SNB_UNFREEZE_PCI_BOX(RBOX1);
+
+    SNB_UNFREEZE_PCI_BOX(PBOX);
+
+    SNB_UNFREEZE_PCI_BOX(BBOX0);
+    SNB_UNFREEZE_BOX(WBOX);
+
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
+    {
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, pmc_flags));
+    }
+
+    return 0;
 }
 
+int perfmon_finalizeCountersThread_sandybridge(int thread_id, PerfmonEventSet* eventSet)
+{ /* Zeroes the config registers of all initialized events of this thread, marks them uninitialized, then clears the core overflow and global control MSRs; returns 0 at the end. */
+    int haveLock = 0;
+    int haveTileLock = 0;
+    int cpu_id = groupSet->threads[thread_id].processorId;
+    uint64_t ovf_values_core = (1ULL<<63)|(1ULL<<62); /* bits 62 and 63 are always written to MSR_PERF_GLOBAL_OVF_CTRL; per-counter bits are OR-ed in below */
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id) /* this CPU is the lock holder recorded for its socket_lock entry */
+    {
+        haveLock = 1;
+    }
+    if (tile_lock[affinity_thread2tile_lookup[cpu_id]] == cpu_id) /* this CPU is the lock holder recorded for its tile_lock entry */
+    {
+        haveTileLock = 1;
+    }
+
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
+        {
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type)))) /* skip events whose register type is not enabled in the set's type mask */
+            {
+                continue;
+            }
+            RegisterIndex index = eventSet->events[i].index;
+            PciDeviceIndex dev = counter_map[index].device;
+            uint64_t reg = counter_map[index].configRegister;
+            switch(type)
+            {
+                case PMC:
+                    ovf_values_core |= (1ULL<<(index-cpuid_info.perf_num_fixed_ctr)); /* PMC overflow bit = counter index minus the number of fixed counters */
+                    if ((haveTileLock) && (eventSet->events[i].event.eventId == 0xB7)) /* event 0xB7: also zero MSR_OFFCORE_RESP0 */
+                    {
+                        VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP0, 0x0ULL, CLEAR_OFFCORE_RESP0);
+                        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP0, 0x0ULL));
+                    }
+                    else if ((haveTileLock) && (eventSet->events[i].event.eventId == 0xBB)) /* event 0xBB: also zero MSR_OFFCORE_RESP1 */
+                    {
+                        VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP1, 0x0ULL, CLEAR_OFFCORE_RESP1);
+                        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP1, 0x0ULL));
+                    }
+                    break;
+                case FIXED:
+                    ovf_values_core |= (1ULL<<(index+32)); /* fixed-counter overflow bits start at bit 32 */
+                    break;
+                default:
+                    break;
+            }
+            if ((reg) && /* only events that have a config register; core types always, other types (>= UNCORE) only with the socket lock and an available device */
+                (((type == PMC)||(type == FIXED)) || ((type >= UNCORE) && (haveLock) && (pci_checkDevice(dev, cpu_id)))))
+            {
+                VERBOSEPRINTPCIREG(cpu_id, dev, reg, 0x0ULL, CLEAR_CTL);
+                CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, reg, 0x0ULL)); /* NOTE(review): CHECK_MSR_WRITE_ERROR presumably returns early on failure - confirm */
+            }
+            eventSet->events[i].threadCounter[thread_id].init = FALSE;
+        }
+    }
+
+
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC))) /* core counters were part of the set: write the collected overflow bits and zero the global control MSR */
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, LLU_CAST ovf_values_core, CLEAR_GLOBAL_OVF)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, ovf_values_core));
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST 0x0ULL, CLEAR_GLOBAL_CTRL)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+    }
+    return 0;
+}
diff --git a/src/includes/perfmon_sandybridgeEP_events.txt b/src/includes/perfmon_sandybridgeEP_events.txt
new file mode 100644
index 0000000..bb0b5fb
--- /dev/null
+++ b/src/includes/perfmon_sandybridgeEP_events.txt
@@ -0,0 +1,1282 @@
+# =======================================================================================
+#
+#      Filename:  perfmon_sandybridgeEP_events.txt
+#
+#      Description:  Event list for Intel SandyBridge EP
+#
+#      Version:   4.0
+#      Released:  16.6.2015
+#
+#      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+#                Thomas Roehl (tr), thomas.roehl at googlemail.com
+#      Project:  likwid
+#
+#      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
+#
+#      This program is free software: you can redistribute it and/or modify it under
+#      the terms of the GNU General Public License as published by the Free Software
+#      Foundation, either version 3 of the License, or (at your option) any later
+#      version.
+#
+#      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+#      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+#      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+#
+#      You should have received a copy of the GNU General Public License along with
+#      this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+# =======================================================================================
+
+EVENT_TEMP_CORE          0x00   TMP0
+UMASK_TEMP_CORE          0x00
+
+EVENT_PWR_PKG_ENERGY          0x00   PWR0
+UMASK_PWR_PKG_ENERGY          0x00
+
+EVENT_PWR_PP0_ENERGY          0x00   PWR1
+UMASK_PWR_PP0_ENERGY          0x00
+
+EVENT_PWR_PP1_ENERGY          0x00   PWR2
+UMASK_PWR_PP1_ENERGY          0x00
+
+EVENT_PWR_DRAM_ENERGY          0x00   PWR3
+UMASK_PWR_DRAM_ENERGY          0x00
+
+EVENT_INSTR_RETIRED              0x00   FIXC0
+UMASK_INSTR_RETIRED_ANY          0x00
+
+EVENT_CPU_CLK_UNHALTED           0x00   FIXC1
+UMASK_CPU_CLK_UNHALTED_CORE      0x00
+
+EVENT_CPU_CLK_UNHALTED           0x00   FIXC2
+UMASK_CPU_CLK_UNHALTED_REF       0x00
+
+EVENT_LOAD_BLOCKS                 0x03  PMC
+UMASK_LOAD_BLOCKS_DATA_UNKNOWN    0x01
+UMASK_LOAD_BLOCKS_STORE_FORWARD   0x02
+UMASK_LOAD_BLOCKS_NO_SR           0x08
+UMASK_LOAD_BLOCKS_ALL_BLOCK       0x10
+
+EVENT_MISALIGN_MEM_REF           0x05  PMC
+UMASK_MISALIGN_MEM_REF_LOAD      0x01
+UMASK_MISALIGN_MEM_REF_STORE     0x02
+UMASK_MISALIGN_MEM_REF_ANY       0x03
+
+EVENT_LD_BLOCKS_PARTIAL      0x07  PMC
+UMASK_LD_BLOCKS_PARTIAL_ADDRESS_ALIAS   0x01
+UMASK_LD_BLOCKS_PARTIAL_ALL_STA_BLOCK   0x08
+
+EVENT_DTLB_LOAD_MISSES                 0x08  PMC
+UMASK_DTLB_LOAD_MISSES_CAUSES_A_WALK   0x01
+UMASK_DTLB_LOAD_MISSES_WALK_COMPLETED  0x02
+UMASK_DTLB_LOAD_MISSES_WALK_DURATION   0x04
+
+EVENT_INT_MISC                  0x0D  PMC
+UMASK_INT_MISC_RECOVERY_CYCLES   0x03 0x41 0x01
+UMASK_INT_MISC_STALL_CYCLES     0x40
+
+EVENT_UOPS_ISSUED                  0x0E  PMC
+UMASK_UOPS_ISSUED_ANY           0x01
+
+EVENT_FP_COMP_OPS_EXE            0x10   PMC
+UMASK_FP_COMP_OPS_EXE_X87       0x01
+UMASK_FP_COMP_OPS_EXE_SSE_FP_PACKED_DOUBLE     0x10
+UMASK_FP_COMP_OPS_EXE_SSE_FP_SCALAR_SINGLE     0x20
+UMASK_FP_COMP_OPS_EXE_SSE_FP_PACKED_SINGLE     0x40
+UMASK_FP_COMP_OPS_EXE_SSE_FP_SCALAR_DOUBLE     0x80
+
+EVENT_SIMD_FP_256_PACKED       0x11   PMC
+UMASK_SIMD_FP_256_PACKED_SINGLE     0x01
+UMASK_SIMD_FP_256_PACKED_DOUBLE     0x02
+
+EVENT_ARITH                      0x14   PMC
+UMASK_ARITH_FPU_DIV_ACTIVE       0x01
+UMASK_ARITH_NUM_DIV              0x01 0xC5 0x01
+
+EVENT_INSTS_WRITTEN_TO_IQ            0x17   PMC
+UMASK_INSTS_WRITTEN_TO_IQ_INSTS        0x01
+
+EVENT_L2_RQSTS                   0x24   PMC
+UMASK_L2_RQSTS_ALL_DEM_AND_DATA_RD_HIT 0x01
+UMASK_L2_RQSTS_ALL_DEM_AND_DATA_RD_MISS 0x02
+UMASK_L2_RQSTS_ALL_DEM_AND_DATA_RD 0x03
+UMASK_L2_RQSTS_RFO_HITS           0x04
+UMASK_L2_RQSTS_RFO_MISS          0x08
+UMASK_L2_RQSTS_RFO_ANY           0x0C
+UMASK_L2_RQSTS_CODE_RD_HITS        0x10
+UMASK_L2_RQSTS_CODE_RD_MISS       0x20
+UMASK_L2_RQSTS_ALL_CODE_CODE_RD   0x30
+UMASK_L2_RQSTS_PF_HIT      0x40
+UMASK_L2_RQSTS_PF_MISS     0x80
+UMASK_L2_RQSTS_ALL_PF        0xC0
+UMASK_L2_RQSTS_MISS              0xAA
+
+EVENT_L2_STORE_LOCK_RQSTS            0x27   PMC
+UMASK_L2_STORE_LOCK_RQSTS_MISS       0x01
+UMASK_L2_STORE_LOCK_RQSTS_HIT_E       0x04
+UMASK_L2_STORE_LOCK_RQSTS_HIT_M       0x08
+UMASK_L2_STORE_LOCK_RQSTS_ALL        0x0F
+
+EVENT_L1D_WB_RQST                  0x28   PMC
+UMASK_L1D_WB_RQST_HIT_E          0x04
+UMASK_L1D_WB_RQST_HIT_M          0x08
+
+EVENT_L3_LAT_CACHE               0x2E   PMC
+UMASK_L3_LAT_CACHE_REFERENCE     0x4F
+UMASK_L3_LAT_CACHE_MISS          0x41
+
+EVENT_CPU_CLOCK_UNHALTED         0x3C   PMC
+UMASK_CPU_CLOCK_UNHALTED_THREAD_P  0x00
+UMASK_CPU_CLOCK_UNHALTED_REF_P     0x01
+
+EVENT_L1D_PEND_MISS              0x48   PMC1
+UMASK_L1D_PEND_MISS_PENDING      0x01
+
+EVENT_DTLB_STORE_MISSES                 0x49   PMC
+UMASK_DTLB_STORE_MISSES_CAUSES_A_WALK   0x01
+UMASK_DTLB_STORE_MISSES_WALK_COMPLETED  0x02
+UMASK_DTLB_STORE_MISSES_WALK_DURATION   0x04
+UMASK_DTLB_STORE_MISSES_STLB_HIT        0x10
+
+EVENT_LOAD_HIT_PRE               0x4C    PMC
+UMASK_LOAD_HIT_PRE_SW_PF               0x01
+UMASK_LOAD_HIT_PRE_HW_PF               0x02
+
+EVENT_HW_PRE_REQ               0x4E    PMC
+UMASK_HW_PRE_REQ_DL1_MISS      0x02
+
+EVENT_L1D                        0x51   PMC
+UMASK_L1D_REPLACEMENT             0x01
+UMASK_L1D_ALLOCATED_IN_M          0x02
+UMASK_L1D_M_EVICT                 0x04
+UMASK_L1D_ALL_M_REPLACEMENT       0x08
+
+EVENT_PARTIAL_RAT_STALLS               0x59    PMC
+UMASK_PARTIAL_RAT_STALLS_FLAGS_MERGE_UOP   0x20
+UMASK_PARTIAL_RAT_STALLS_SLOW_LEA_WINDOW   0x40
+UMASK_PARTIAL_RAT_STALLS_MUL_SINGLE_UOP   0x80
+
+EVENT_RESOURCE_STALLS2               0x5B    PMC
+UMASK_RESOURCE_STALLS2_ALL_FL_EMPTY     0x0C
+UMASK_RESOURCE_STALLS2_ALL_PRF_CONTROL     0x0F
+UMASK_RESOURCE_STALLS2_BOB_FULL     0x40
+UMASK_RESOURCE_STALLS2_OOO_RSRC     0x4F
+
+EVENT_CPL_CYCLES               0x5C    PMC
+UMASK_CPL_CYCLES_RING0             0x01
+UMASK_CPL_CYCLES_RING123             0x02
+
+EVENT_RS_EVENTS               0x5E    PMC
+UMASK_RS_EVENTS_EMPTY_CYCLES    0x01
+
+EVENT_OFFCORE_REQUESTS_OUTSTANDING          0x60   PMC
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_DATA_RD   0x01
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_CODE_RD   0x02
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_DEMAND_RFO   0x04
+UMASK_OFFCORE_REQUESTS_OUTSTANDING_ALL_DATA_RD   0x08
+
+EVENT_CACHE_LOCK_CYCLES          0x63   PMC
+UMASK_CACHE_LOCK_CYCLES_SPLIT_LOCK_UC_LOCK_DURATION      0x01
+UMASK_CACHE_LOCK_CYCLES_CACHE_LOCK_DURATION       0x02
+
+EVENT_IDQ               0x79   PMC
+UMASK_IDQ_EMPTY         0x02
+UMASK_IDQ_MITE_UOPS     0x04
+UMASK_IDQ_DSB_UOPS      0x08
+UMASK_IDQ_MS_DSB_UOPS   0x10
+UMASK_IDQ_MS_MITE_UOPS  0x20
+UMASK_IDQ_MS_UOPS       0x30
+
+EVENT_ICACHE                  0x80   PMC
+UMASK_ICACHE_HITS             0x01
+UMASK_ICACHE_MISSES             0x02
+UMASK_ICACHE_ACCESSES           0x03
+UMASK_ICACHE_IFETCH_STALL       0x04
+
+EVENT_ITLB_MISSES                 0x85      PMC
+UMASK_ITLB_MISSES_CAUSES_A_WALK   0x01
+UMASK_ITLB_MISSES_WALK_COMPLETED  0x02
+UMASK_ITLB_MISSES_WALK_DURATION   0x04
+UMASK_ITLB_MISSES_STLB_HIT   0x10
+
+EVENT_ILD_STALL                 0x87      PMC
+UMASK_ILD_STALL_LCP             0x01
+UMASK_ILD_STALL_IQ_FULL         0x04
+
+EVENT_BR_INST_EXEC               0x88   PMC
+UMASK_BR_INST_EXEC_COND_TAKEN          0x81
+UMASK_BR_INST_EXEC_COND_NON_TAKEN      0x41
+UMASK_BR_INST_EXEC_DIRECT_JMP_TAKEN        0x82
+UMASK_BR_INST_EXEC_DIRECT_JMP_NON_TAKEN        0x42
+UMASK_BR_INST_EXEC_INDIRECT_JMP_NON_CALL_RET_TAKEN     0x84
+UMASK_BR_INST_EXEC_INDIRECT_JMP_NON_CALL_RET_NON_TAKEN     0x44
+UMASK_BR_INST_EXEC_RETURN_NEAR_TAKEN           0x88
+UMASK_BR_INST_EXEC_RETURN_NEAR_NON_TAKEN           0x48
+UMASK_BR_INST_EXEC_DIRECT_NEAR_CALL_TAKEN      0x90
+UMASK_BR_INST_EXEC_DIRECT_NEAR_CALL_NON_TAKEN      0x50
+UMASK_BR_INST_EXEC_INDIRECT_NEAR_CALL_TAKEN    0xA0 
+UMASK_BR_INST_EXEC_INDIRECT_NEAR_CALL_NON_TAKEN    0x60 
+UMASK_BR_INST_EXEC_ALL_BRANCHES                   0xFF 
+
+EVENT_BR_MISP_EXEC                    0x89   PMC
+UMASK_BR_MISP_EXEC_COND_TAKEN               0x81
+UMASK_BR_MISP_EXEC_COND_NON_TAKEN               0x41
+UMASK_BR_MISP_EXEC_INDIRECT_JMP_NON_CALL_RET_TAKEN  0x84
+UMASK_BR_MISP_EXEC_INDIRECT_JMP_NON_CALL_RET_NON_TAKEN  0x44
+UMASK_BR_MISP_EXEC_RETURN_NEAR_TAKEN        0x88
+UMASK_BR_MISP_EXEC_RETURN_NEAR_NON_TAKEN        0x48
+UMASK_BR_MISP_EXEC_DIRECT_NEAR_CALL_TAKEN   0x90
+UMASK_BR_MISP_EXEC_DIRECT_NEAR_CALL_NON_TAKEN   0x50
+UMASK_BR_MISP_EXEC_INDIRECT_NEAR_CALL_TAKEN 0xA0
+UMASK_BR_MISP_EXEC_INDIRECT_NEAR_CALL_NON_TAKEN 0x60
+UMASK_BR_MISP_EXEC_ALL_BRANCHES       0xFF
+
+EVENT_IDQ_UOPS_NOT_DELIVERED                    0x9C   PMC
+UMASK_IDQ_UOPS_NOT_DELIVERED_CORE               0x01
+
+EVENT_UOPS_DISPATCHED_PORT                 0xA1   PMC
+UMASK_UOPS_DISPATCHED_PORT_PORT_0           0x01
+UMASK_UOPS_DISPATCHED_PORT_PORT_1           0x02
+UMASK_UOPS_DISPATCHED_PORT_PORT_2_LD        0x04
+UMASK_UOPS_DISPATCHED_PORT_PORT_2_STA       0x08
+UMASK_UOPS_DISPATCHED_PORT_PORT_2           0x0C
+UMASK_UOPS_DISPATCHED_PORT_PORT_3_LD           0x10
+UMASK_UOPS_DISPATCHED_PORT_PORT_3_STA           0x20
+UMASK_UOPS_DISPATCHED_PORT_PORT_3           0x30
+UMASK_UOPS_DISPATCHED_PORT_PORT_4           0x40
+UMASK_UOPS_DISPATCHED_PORT_PORT_5           0x80
+
+EVENT_RESOURCE_STALLS                 0xA2   PMC
+UMASK_RESOURCE_STALLS_ANY             0x01
+UMASK_RESOURCE_STALLS_LB              0x02
+UMASK_RESOURCE_STALLS_RS              0x04
+UMASK_RESOURCE_STALLS_B               0x08
+UMASK_RESOURCE_STALLS_ROB             0x10
+UMASK_RESOURCE_STALLS_FCSW            0x20
+UMASK_RESOURCE_STALLS_MXCSR           0x40
+UMASK_RESOURCE_STALLS_OTHER           0x80
+
+EVENT_DSB2MITE_SWITCHES                  0xAB   PMC
+UMASK_DSB2MITE_SWITCHES_COUNT            0x01
+UMASK_DSB2MITE_SWITCHES_PENALTY_CYCLES   0x02
+
+EVENT_DSB_FILL                         0xAC   PMC
+UMASK_DSB_FILL_OTHER_CANCEL            0x02
+UMASK_DSB_FILL_EXCEED_DSB_LINES        0x08
+UMASK_DSB_FILL_ALL_CANCEL        0x0A
+
+EVENT_ITLB                         0xAE   PMC
+UMASK_ITLB_ITLB_FLUSH            0x01
+
+EVENT_OFFCORE_REQUESTS     0xB0   PMC
+UMASK_OFFCORE_REQUESTS_DEMAND_DATA_RD   0x01
+UMASK_OFFCORE_REQUESTS_DEMAND_CODE_RD   0x02
+UMASK_OFFCORE_REQUESTS_DEMAND_RFO       0x04
+UMASK_OFFCORE_REQUESTS_ALL_DATA_RD      0x08
+
+EVENT_UOPS_DISPATCHED               0xB1   PMC
+UMASK_UOPS_DISPATCHED_THREAD            0x01
+UMASK_UOPS_DISPATCHED_CORE              0x02
+
+EVENT_OFFCORE_REQUESTS_BUFFER     0xB2  PMC
+UMASK_OFFCORE_REQUESTS_BUFFER_SQ_FULL     0x01
+
+EVENT_AGU_BYPASS_CANCEL          0xB6  PMC
+UMASK_AGU_BYPASS_CANCEL_COUNT     0x01
+
+EVENT_TLB_FLUSH          0xBD  PMC
+UMASK_TLB_FLUSH_DTLB_THREAD     0x01
+UMASK_TLB_FLUSH_STLB_ANY        0x20
+
+EVENT_L1D_BLOCKS          0xBF  PMC
+UMASK_L1D_BLOCKS_BANK_CONFLICT_CYCLES    0x05 0x41 0x01
+
+EVENT_INST_RETIRED                  0xC0  PMC0
+UMASK_INST_RETIRED_ANY_P            0x00
+UMASK_INST_RETIRED_PREC_DIST              0x01
+
+EVENT_OTHER_ASSISTS                  0xC1  PMC
+UMASK_OTHER_ASSISTS_ITLB_MISS_RETIRED     0x02
+UMASK_OTHER_ASSISTS_AVX_TO_SSE            0x10
+UMASK_OTHER_ASSISTS_SSE_TO_AVX            0x20
+
+EVENT_UOPS_RETIRED                  0xC2  PMC
+UMASK_UOPS_RETIRED_ALL              0x01
+UMASK_UOPS_RETIRED_RETIRE_SLOTS     0x02
+
+EVENT_MACHINE_CLEARS              0xC3  PMC
+UMASK_MACHINE_CLEARS_MEMORY_ORDERING    0x02
+UMASK_MACHINE_CLEARS_SMC                0x04
+UMASK_MACHINE_CLEARS_MASKMOV            0x20
+
+EVENT_BR_INST_RETIRED               0xC4  PMC
+UMASK_BR_INST_RETIRED_ALL_BRANCHES  0x00
+UMASK_BR_INST_RETIRED_CONDITIONAL   0x01
+UMASK_BR_INST_RETIRED_NEAR_CALL     0x02
+UMASK_BR_INST_RETIRED_ALL_BRANCHES  0x04
+UMASK_BR_INST_RETIRED_NEAR_RETURN   0x08
+UMASK_BR_INST_RETIRED_NOT_TAKEN     0x10
+UMASK_BR_INST_RETIRED_NEAR_TAKEN    0x20
+UMASK_BR_INST_RETIRED_FAR_BRANCH    0x40
+
+EVENT_BR_MISP_RETIRED               0xC5  PMC
+UMASK_BR_MISP_RETIRED_ALL_BRANCHES  0x00
+UMASK_BR_MISP_RETIRED_CONDITIONAL  0x01
+UMASK_BR_MISP_RETIRED_NEAR_CALL     0x02
+UMASK_BR_MISP_RETIRED_ALL_BRANCHES     0x04
+UMASK_BR_MISP_RETIRED_NOT_TAKEN      0x10
+UMASK_BR_MISP_RETIRED_TAKEN      0x20
+
+EVENT_FP_ASSIST               0xCA  PMC
+UMASK_FP_ASSIST_X87_OUTPUT               0x02
+UMASK_FP_ASSIST_X87_INPUT                0x04
+UMASK_FP_ASSIST_SIMD_OUTPUT               0x08
+UMASK_FP_ASSIST_SIMD_INPUT               0x10
+UMASK_FP_ASSIST_ANY               0x1E
+
+EVENT_HW_INTERRUPTS_RECEIVED               0xCB  PMC
+UMASK_HW_INTERRUPTS_RECEIVED               0x01
+
+EVENT_ROB_MISC_EVENT_LBR_INSERTS               0xCC  PMC
+UMASK_ROB_MISC_EVENT_LBR_INSERTS               0x20
+
+EVENT_MEM_UOP_RETIRED            0xD0    PMC
+UMASK_MEM_UOP_RETIRED_LOADS            0x81
+UMASK_MEM_UOP_RETIRED_STORES           0x82
+UMASK_MEM_UOP_RETIRED_LOADS_STLB_MISS         0x11
+UMASK_MEM_UOP_RETIRED_STORES_STLB_MISS        0x12
+UMASK_MEM_UOP_RETIRED_LOADS_LOCK              0x21
+UMASK_MEM_UOP_RETIRED_STORES_LOCK             0x22
+UMASK_MEM_UOP_RETIRED_LOADS_SPLIT             0x41
+UMASK_MEM_UOP_RETIRED_STORES_SPLIT            0x42
+
+EVENT_MEM_LOAD_UOPS_RETIRED               0xD1   PMC
+UMASK_MEM_LOAD_UOPS_RETIRED_L1_HIT       0x01
+UMASK_MEM_LOAD_UOPS_RETIRED_L1_MISS      0x08
+UMASK_MEM_LOAD_UOPS_RETIRED_L1_ALL       0x09
+UMASK_MEM_LOAD_UOPS_RETIRED_L2_HIT       0x02
+UMASK_MEM_LOAD_UOPS_RETIRED_L2_MISS      0x10
+UMASK_MEM_LOAD_UOPS_RETIRED_L2_ALL       0x12
+UMASK_MEM_LOAD_UOPS_RETIRED_L3_HIT       0x04
+UMASK_MEM_LOAD_UOPS_RETIRED_L3_MISS      0x20
+UMASK_MEM_LOAD_UOPS_RETIRED_L3_ALL       0x24
+UMASK_MEM_LOAD_UOPS_RETIRED_HIT_LFB      0x40
+
+EVENT_MEM_LOAD_UOPS_LLC_HIT_RETIRED               0xD2   PMC
+UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_MISS         0x01
+UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HIT          0x02
+UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM         0x04
+UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_NONE         0x08
+
+EVENT_MEM_LOAD_UOPS_MISC_RETIRED               0xD4   PMC
+UMASK_MEM_LOAD_UOPS_MISC_RETIRED_LLC_MISS      0x02
+
+EVENT_L2_TRANS               0xF0  PMC
+UMASK_L2_TRANS_DEMAND_DATA_RD          0x01
+UMASK_L2_TRANS_RFO           0x02
+UMASK_L2_TRANS_CODE_RD       0x04
+UMASK_L2_TRANS_ALL_PREF      0x08
+UMASK_L2_TRANS_L1D_WB        0x10
+UMASK_L2_TRANS_L2_FILL       0x20
+UMASK_L2_TRANS_L2_WB         0x40
+UMASK_L2_TRANS_ALL_REQUESTS  0x80
+
+EVENT_L2_LINES_IN                   0xF1   PMC
+UMASK_L2_LINES_IN_I           0x01
+UMASK_L2_LINES_IN_S            0x02
+UMASK_L2_LINES_IN_E           0x04
+UMASK_L2_LINES_IN_ALL               0x07
+
+EVENT_L2_LINES_OUT                  0xF2   PMC
+UMASK_L2_LINES_OUT_DEMAND_CLEAN     0x01
+UMASK_L2_LINES_OUT_DEMAND_DIRTY     0x02
+UMASK_L2_LINES_OUT_PF_CLEAN   0x04
+UMASK_L2_LINES_OUT_PF_DIRTY   0x08
+UMASK_L2_LINES_OUT_DIRTY_ALL              0x0A
+
+EVENT_SQ_MISC                         0xF4  PMC
+UMASK_SQ_MISC_SPLIT_LOCK              0x10
+
+EVENT_MEM_LOAD_UOPS_LLC_HIT_RETIRED               0xD2  PMC
+UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_MISS              0x01
+UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HIT              0x02
+UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM              0x04
+UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_NONE             0x08
+
+EVENT_MEM_LOAD_UOPS_RETIRED          0xD1  PMC
+UMASK_MEM_LOAD_UOPS_RETIRED_LLC_HIT           0x04
+UMASK_MEM_LOAD_UOPS_RETIRED_LLC_MISS           0x20
+
+EVENT_MEM_LOAD_UOPS_LLC_HIT_RETIRED          0xD2  PMC
+UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_MISS           0x01
+UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HIT            0x02
+UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM           0x04
+UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_NONE           0x08
+
+EVENT_MEM_LOAD_UOPS_LLC_MISS_RETIRED          0xD3  PMC
+UMASK_MEM_LOAD_UOPS_LLC_MISS_RETIRED_LOCAL_DRAM           0x01
+UMASK_MEM_LOAD_UOPS_LLC_MISS_RETIRED_REMOTE_DRAM            0x04
+
+EVENT_OFFCORE_RESPONSE_0                            0xB7 PMC
+OPTIONS_OFFCORE_RESPONSE_0_OPTIONS                  EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_0_OPTIONS                    0x01 0xFF 0xFF
+OPTIONS_OFFCORE_RESPONSE_0_LOCAL_DRAM               EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+DEFAULT_OPTIONS_OFFCORE_RESPONSE_0_LOCAL_DRAM       EVENT_OPTION_MATCH0=0x08FFF,EVENT_OPTION_MATCH1=0x78040
+UMASK_OFFCORE_RESPONSE_0_LOCAL_DRAM                 0x01 0xFF 0xFF
+OPTIONS_OFFCORE_RESPONSE_0_REMOTE_DRAM              EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+DEFAULT_OPTIONS_OFFCORE_RESPONSE_0_REMOTE_DRAM      EVENT_OPTION_MATCH0=0x08FFF,EVENT_OPTION_MATCH1=0x7FF80
+UMASK_OFFCORE_RESPONSE_0_REMOTE_DRAM                0x01 0xFF 0xFF
+UMASK_OFFCORE_RESPONSE_0_DMND_DATA_RD_ANY           0x01 0x00 0x10
+UMASK_OFFCORE_RESPONSE_0_DMND_RFO_ANY               0x01 0x01 0x10
+UMASK_OFFCORE_RESPONSE_0_DMND_CODE_RD_ANY           0x01 0x02 0x10
+UMASK_OFFCORE_RESPONSE_0_WB_ANY                     0x01 0x03 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_DATA_RD_ANY          0x01 0x04 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_RFO_ANY              0x01 0x05 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_CODE_RD_ANY          0x01 0x06 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_DATA_RD_ANY          0x01 0x07 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_RFO_ANY              0x01 0x08 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_CODE_RD_ANY          0x01 0x09 0x10
+UMASK_OFFCORE_RESPONSE_0_SPLIT_LOCK_UC_LOCK_ANY     0x01 0x0A 0x10
+UMASK_OFFCORE_RESPONSE_0_STREAMING_STORES_ANY       0x01 0x0B 0x10
+UMASK_OFFCORE_RESPONSE_0_OTHER_ANY                  0x01 0x0F 0x10
+
+EVENT_OFFCORE_RESPONSE_1                            0xBB PMC
+OPTIONS_OFFCORE_RESPONSE_1_OPTIONS                  EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_1_OPTIONS                    0x01 0xFF 0xFF
+OPTIONS_OFFCORE_RESPONSE_1_LOCAL_DRAM               EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+DEFAULT_OPTIONS_OFFCORE_RESPONSE_1_LOCAL_DRAM       EVENT_OPTION_MATCH0=0x08FFF,EVENT_OPTION_MATCH1=0x78040
+UMASK_OFFCORE_RESPONSE_1_LOCAL_DRAM                 0x01 0xFF 0xFF
+OPTIONS_OFFCORE_RESPONSE_1_REMOTE_DRAM              EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+DEFAULT_OPTIONS_OFFCORE_RESPONSE_1_REMOTE_DRAM      EVENT_OPTION_MATCH0=0x08FFF,EVENT_OPTION_MATCH1=0x7FF80
+UMASK_OFFCORE_RESPONSE_1_REMOTE_DRAM                0x01 0xFF 0xFF
+UMASK_OFFCORE_RESPONSE_1_DMND_DATA_RD_ANY           0x01 0x00 0x10
+UMASK_OFFCORE_RESPONSE_1_DMND_RFO_ANY               0x01 0x01 0x10
+UMASK_OFFCORE_RESPONSE_1_DMND_CODE_RD_ANY           0x01 0x02 0x10
+UMASK_OFFCORE_RESPONSE_1_WB_ANY                     0x01 0x03 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_DATA_RD_ANY          0x01 0x04 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_RFO_ANY              0x01 0x05 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_CODE_RD_ANY          0x01 0x06 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_DATA_RD_ANY          0x01 0x07 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_RFO_ANY              0x01 0x08 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_CODE_RD_ANY          0x01 0x09 0x10
+UMASK_OFFCORE_RESPONSE_1_SPLIT_LOCK_UC_LOCK_ANY     0x01 0x0A 0x10
+UMASK_OFFCORE_RESPONSE_1_STREAMING_STORES_ANY       0x01 0x0B 0x10
+UMASK_OFFCORE_RESPONSE_1_OTHER_ANY                  0x01 0x0F 0x10
+
+EVENT_CBOX_CLOCKTICKS                         0x00  CBOX
+UMASK_CBOX_CLOCKTICKS                         0x00
+
+EVENT_COUNTER0_OCCUPANCY              0x1F  CBOX0C1|CBOX0C2|CBOX0C3|CBOX1C1|CBOX1C2|CBOX1C3|CBOX2C1|CBOX2C2|CBOX2C3|CBOX3C1|CBOX3C2|CBOX3C3|CBOX4C1|CBOX4C2|CBOX4C3|CBOX5C1|CBOX5C2|CBOX5C3|CBOX6C1|CBOX6C2|CBOX6C3|CBOX7C1|CBOX7C2|CBOX7C3
+UMASK_COUNTER0_OCCUPANCY              0x00
+
+EVENT_ISMQ_DRD_MISS_OCC              0x21  CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1
+UMASK_ISMQ_DRD_MISS_OCC              0x00
+
+EVENT_LLC_LOOKUP              0x34  CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1
+DEFAULT_OPTIONS_LLC_LOOKUP_DATA_READ EVENT_OPTION_STATE=0x1
+UMASK_LLC_LOOKUP_DATA_READ          0x03
+DEFAULT_OPTIONS_LLC_LOOKUP_WRITE    EVENT_OPTION_STATE=0x1
+UMASK_LLC_LOOKUP_WRITE              0x05
+DEFAULT_OPTIONS_LLC_LOOKUP_DATA_READ_AND_ALL EVENT_OPTION_STATE=0x1
+UMASK_LLC_LOOKUP_DATA_READ_AND_ALL  0x07
+DEFAULT_OPTIONS_LLC_LOOKUP_REMOTE_SNOOP EVENT_OPTION_STATE=0x1
+UMASK_LLC_LOOKUP_REMOTE_SNOOP       0x09
+DEFAULT_OPTIONS_LLC_LOOKUP_NID      EVENT_OPTION_STATE=0x1
+UMASK_LLC_LOOKUP_NID                0x40
+
+EVENT_LLC_VICTIMS              0x37  CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1
+UMASK_LLC_VICTIMS_M_STATE      0x01
+UMASK_LLC_VICTIMS_E_STATE      0x02
+UMASK_LLC_VICTIMS_S_STATE      0x04
+UMASK_LLC_VICTIMS_MISS         0x08
+UMASK_LLC_VICTIMS_ALL_STATES   0x0F
+OPTIONS_LLC_VICTIMS_NID        EVENT_OPTION_NID_MASK
+UMASK_LLC_VICTIMS_NID          0x40
+OPTIONS_LLC_VICTIMS_NID_MISSES EVENT_OPTION_NID_MASK
+UMASK_LLC_VICTIMS_NID_MISSES   0x41
+
+EVENT_CBOX_MISC              0x39  CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1
+UMASK_CBOX_MISC_RSPI_WAS_FSE      0x01
+UMASK_CBOX_MISC_WC_ALIASING       0x02
+UMASK_CBOX_MISC_STARTED           0x04
+UMASK_CBOX_MISC_RFO_HIT_S         0x08
+
+EVENT_RING_AD_USED              0x1B  CBOX0C2|CBOX1C2|CBOX2C2|CBOX3C2|CBOX4C2|CBOX5C2|CBOX6C2|CBOX7C2|CBOX0C3|CBOX1C3|CBOX2C3|CBOX3C3|CBOX4C3|CBOX5C3|CBOX6C3|CBOX7C3
+UMASK_RING_AD_USED_UP_EVEN      0x01
+UMASK_RING_AD_USED_UP_ODD       0x02
+UMASK_RING_AD_USED_DOWN_EVEN    0x04
+UMASK_RING_AD_USED_DOWN_ODD     0x08
+
+EVENT_RING_AK_USED              0x1C  CBOX0C2|CBOX1C2|CBOX2C2|CBOX3C2|CBOX4C2|CBOX5C2|CBOX6C2|CBOX7C2|CBOX0C3|CBOX1C3|CBOX2C3|CBOX3C3|CBOX4C3|CBOX5C3|CBOX6C3|CBOX7C3
+UMASK_RING_AK_USED_UP_EVEN      0x01
+UMASK_RING_AK_USED_UP_ODD       0x02
+UMASK_RING_AK_USED_DOWN_EVEN    0x04
+UMASK_RING_AK_USED_DOWN_ODD     0x08
+
+EVENT_RING_BL_USED              0x1D  CBOX0C2|CBOX1C2|CBOX2C2|CBOX3C2|CBOX4C2|CBOX5C2|CBOX6C2|CBOX7C2|CBOX0C3|CBOX1C3|CBOX2C3|CBOX3C3|CBOX4C3|CBOX5C3|CBOX6C3|CBOX7C3
+UMASK_RING_BL_USED_UP_EVEN      0x01
+UMASK_RING_BL_USED_UP_ODD       0x02
+UMASK_RING_BL_USED_DOWN_EVEN    0x04
+UMASK_RING_BL_USED_DOWN_ODD     0x08
+
+EVENT_RING_BOUNCES              0x05  CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1
+UMASK_RING_BOUNCES_AK_CORE      0x02
+UMASK_RING_BOUNCES_BL_CORE      0x04
+UMASK_RING_BOUNCES_IV_CORE      0x08
+
+EVENT_RING_IV_USED              0x1E  CBOX0C2|CBOX1C2|CBOX2C2|CBOX3C2|CBOX4C2|CBOX5C2|CBOX6C2|CBOX7C2|CBOX0C3|CBOX1C3|CBOX2C3|CBOX3C3|CBOX4C3|CBOX5C3|CBOX6C3|CBOX7C3
+UMASK_RING_IV_USED_ANY          0x0F
+
+EVENT_RING_SRC_THRTL            0x05  CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1
+UMASK_RING_SRC_THRTL            0x07
+
+EVENT_RXR_EXT_STARVED               0x12  CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1
+UMASK_RXR_EXT_STARVED_IRQ           0x01
+UMASK_RXR_EXT_STARVED_IPQ           0x02
+UMASK_RXR_EXT_STARVED_ISMQ          0x04
+UMASK_RXR_EXT_STARVED_ISMQ_BIDS     0x08
+
+EVENT_RXR_INSERTS                0x13  CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1
+UMASK_RXR_INSERTS_IRQ            0x01
+UMASK_RXR_INSERTS_IRQ_REJECTED   0x02
+UMASK_RXR_INSERTS_IPQ            0x04
+UMASK_RXR_INSERTS_VFIFO          0x10
+
+EVENT_RXR_IPQ_RETRY                0x31  CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1
+UMASK_RXR_IPQ_RETRY_ANY            0x01
+UMASK_RXR_IPQ_RETRY_FULL           0x02
+UMASK_RXR_IPQ_RETRY_ADDR_CONFLICT  0x04
+UMASK_RXR_IPQ_RETRY_QPI_CREDITS    0x10
+
+EVENT_RXR_IRQ_RETRY                0x32  CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1
+UMASK_RXR_IRQ_RETRY_ANY            0x01
+UMASK_RXR_IRQ_RETRY_FULL           0x02
+UMASK_RXR_IRQ_RETRY_ADDR_CONFLICT  0x04
+UMASK_RXR_IRQ_RETRY_RTID           0x08
+UMASK_RXR_IRQ_RETRY_QPI_CREDITS    0x10
+
+EVENT_RXR_ISMQ_RETRY                0x33  CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1
+UMASK_RXR_ISMQ_RETRY_ANY            0x01
+UMASK_RXR_ISMQ_RETRY_FULL           0x02
+UMASK_RXR_ISMQ_RETRY_ADDR_CONFLICT  0x04
+UMASK_RXR_ISMQ_RETRY_RTID           0x08
+UMASK_RXR_ISMQ_RETRY_QPI_CREDITS    0x10
+
+EVENT_RXR_OCCUPANCY                0x11  CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0
+UMASK_RXR_OCCUPANCY_IRQ            0x01
+UMASK_RXR_OCCUPANCY_IRQ_REJECTED   0x02
+UMASK_RXR_OCCUPANCY_IPQ            0x04
+UMASK_RXR_OCCUPANCY_VFIFO          0x10
+
+EVENT_TOR_INSERTS                    0x35  CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1
+OPTIONS_TOR_INSERTS_OPCODE           EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_INSERTS_OPCODE             0x01
+UMASK_TOR_INSERTS_EVICTION           0x04
+UMASK_TOR_INSERTS_WB                 0x10
+OPTIONS_TOR_INSERTS_MISS_OPCODE      EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_INSERTS_MISS_OPCODE        0x03
+UMASK_TOR_INSERTS_MISS_ALL           0x0A
+OPTIONS_TOR_INSERTS_NID_OPCODE       EVENT_OPTION_NID_MASK|EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_INSERTS_NID_OPCODE         0x41
+OPTIONS_TOR_INSERTS_NID_EVICTION     EVENT_OPTION_NID_MASK
+UMASK_TOR_INSERTS_NID_EVICTION       0x44
+OPTIONS_TOR_INSERTS_NID_ALL          EVENT_OPTION_NID_MASK
+UMASK_TOR_INSERTS_NID_ALL            0x48
+OPTIONS_TOR_INSERTS_NID_WB           EVENT_OPTION_NID_MASK
+UMASK_TOR_INSERTS_NID_WB             0x50
+OPTIONS_TOR_INSERTS_NID_MISS_OPCODE  EVENT_OPTION_NID_MASK|EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_INSERTS_NID_MISS_OPCODE    0x43
+OPTIONS_TOR_INSERTS_NID_MISS_ALL     EVENT_OPTION_NID_MASK
+UMASK_TOR_INSERTS_NID_MISS_ALL       0x4A
+
+EVENT_TOR_OCCUPANCY                    0x36  CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0
+OPTIONS_TOR_OCCUPANCY_OPCODE           EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_OCCUPANCY_OPCODE             0x01
+UMASK_TOR_OCCUPANCY_EVICTION           0x04
+UMASK_TOR_OCCUPANCY_ALL                0x08
+OPTIONS_TOR_OCCUPANCY_MISS_OPCODE      EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_OCCUPANCY_MISS_OPCODE        0x03
+UMASK_TOR_OCCUPANCY_MISS_ALL           0x0A
+OPTIONS_TOR_OCCUPANCY_NID_OPCODE       EVENT_OPTION_NID_MASK|EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_OCCUPANCY_NID_OPCODE         0x41
+OPTIONS_TOR_OCCUPANCY_NID_EVICTION     EVENT_OPTION_NID_MASK
+UMASK_TOR_OCCUPANCY_NID_EVICTION       0x44
+OPTIONS_TOR_OCCUPANCY_NID_ALL          EVENT_OPTION_NID_MASK
+UMASK_TOR_OCCUPANCY_NID_ALL            0x48
+OPTIONS_TOR_OCCUPANCY_NID_MISS_OPCODE  EVENT_OPTION_NID_MASK|EVENT_OPTION_OPCODE_MASK
+UMASK_TOR_OCCUPANCY_NID_MISS_OPCODE    0x43
+OPTIONS_TOR_OCCUPANCY_NID_MISS_ALL     EVENT_OPTION_NID_MASK
+UMASK_TOR_OCCUPANCY_NID_MISS_ALL       0x4A
+
+EVENT_TXT_ADS_USED                0x04  CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1
+UMASK_TXT_ADS_USED            0x00
+
+EVENT_TXT_INSERTS                0x02  CBOX0C0|CBOX1C0|CBOX2C0|CBOX3C0|CBOX4C0|CBOX5C0|CBOX6C0|CBOX7C0|CBOX0C1|CBOX1C1|CBOX2C1|CBOX3C1|CBOX4C1|CBOX5C1|CBOX6C1|CBOX7C1
+UMASK_TXT_INSERTS_AD_CACHE            0x01
+UMASK_TXT_INSERTS_AK_CACHE            0x02
+UMASK_TXT_INSERTS_BL_CACHE            0x04
+UMASK_TXT_INSERTS_IV_CACHE            0x08
+UMASK_TXT_INSERTS_AD_CORE             0x10
+UMASK_TXT_INSERTS_AK_CORE             0x20
+UMASK_TXT_INSERTS_BL_CORE             0x40
+
+EVENT_BBOX_CLOCKTICKS                0x00  BBOX
+UMASK_BBOX_CLOCKTICKS                0x00
+
+EVENT_CONFLICT_CYCLES                0x0B  BBOX
+UMASK_CONFLICT_CYCLES_NO_CONFLICT    0x01
+UMASK_CONFLICT_CYCLES_CONFLICT       0x02
+
+EVENT_DIRECT2CORE_COUNT                0x11  BBOX
+UMASK_DIRECT2CORE_COUNT                0x00
+
+EVENT_DIRECT2CORE_CYCLES_DISABLED      0x12  BBOX
+UMASK_DIRECT2CORE_CYCLES_DISABLED      0x00
+
+EVENT_DIRECT2CORE_TXN_OVERRIDE         0x13  BBOX
+UMASK_DIRECT2CORE_TXN_OVERRIDE         0x00
+
+EVENT_DIRECTORY_LOOKUP             0x0C  BBOX
+UMASK_DIRECTORY_LOOKUP_SNP         0x01
+UMASK_DIRECTORY_LOOKUP_NO_SNP      0x02
+
+EVENT_DIRECTORY_UPDATE             0x0D  BBOX
+UMASK_DIRECTORY_UPDATE_SET         0x01
+UMASK_DIRECTORY_UPDATE_CLEAR       0x02
+UMASK_DIRECTORY_UPDATE_ANY         0x03
+
+EVENT_IGR_NO_CREDIT_CYCLES             0x22  BBOX
+UMASK_IGR_NO_CREDIT_CYCLES_AD_QPI0     0x01
+UMASK_IGR_NO_CREDIT_CYCLES_AD_QPI1     0x02
+UMASK_IGR_NO_CREDIT_CYCLES_BL_QPI0     0x04
+UMASK_IGR_NO_CREDIT_CYCLES_BL_QPI1     0x08
+
+EVENT_IMC_RETRY     0x1E  BBOX
+UMASK_IMC_RETRY     0x00
+
+EVENT_IMC_WRITES                   0x1A  BBOX
+UMASK_IMC_WRITES_FULL              0x01
+UMASK_IMC_WRITES_PARTIAL           0x02
+UMASK_IMC_WRITES_FULL_ISOCH        0x04
+UMASK_IMC_WRITES_PARTIAL_ISOCH     0x08
+UMASK_IMC_WRITES_ALL               0x0F
+
+EVENT_REQUESTS                   0x01  BBOX
+UMASK_REQUESTS_READS             0x03
+UMASK_REQUESTS_WRITES            0x0C
+
+EVENT_RPQ_CYCLES_NO_REG_CREDITS           0x15  BBOX
+UMASK_RPQ_CYCLES_NO_REG_CREDITS_CHN0      0x01
+UMASK_RPQ_CYCLES_NO_REG_CREDITS_CHN1      0x02
+UMASK_RPQ_CYCLES_NO_REG_CREDITS_CHN2      0x04
+UMASK_RPQ_CYCLES_NO_REG_CREDITS_CHN3      0x08
+UMASK_RPQ_CYCLES_NO_REG_CREDITS_ALL       0x0F
+
+EVENT_TAD_REQUESTS_G0               0x1B  BBOX
+UMASK_TAD_REQUESTS_G0_REGION_0      0x01
+UMASK_TAD_REQUESTS_G0_REGION_1      0x02
+UMASK_TAD_REQUESTS_G0_REGION_2      0x04
+UMASK_TAD_REQUESTS_G0_REGION_3      0x08
+UMASK_TAD_REQUESTS_G0_REGION_4      0x10
+UMASK_TAD_REQUESTS_G0_REGION_5      0x20
+UMASK_TAD_REQUESTS_G0_REGION_6      0x40
+UMASK_TAD_REQUESTS_G0_REGION_7      0x80
+
+EVENT_TAD_REQUESTS_G1               0x1C  BBOX
+UMASK_TAD_REQUESTS_G1_REGION_8      0x01
+UMASK_TAD_REQUESTS_G1_REGION_9      0x02
+UMASK_TAD_REQUESTS_G1_REGION_10      0x04
+UMASK_TAD_REQUESTS_G1_REGION_11      0x08
+
+EVENT_TRACKER_INSERTS                   0x06  BBOX
+UMASK_TRACKER_INSERTS_ALL             0x03
+
+EVENT_TXR_AD                   0x0F  BBOX
+UMASK_TXR_AD_NDR             0x01
+UMASK_TXR_AD_SNP             0x02
+
+EVENT_TXR_AD_CYCLES_FULL                  0x2A  BBOX
+UMASK_TXR_AD_CYCLES_FULL_SCHED0           0x01
+UMASK_TXR_AD_CYCLES_FULL_SCHED1           0x02
+UMASK_TXR_AD_CYCLES_FULL_ALL              0x03
+
+EVENT_TXR_AK_CYCLES_FULL                  0x32  BBOX
+UMASK_TXR_AK_CYCLES_FULL_SCHED0           0x01
+UMASK_TXR_AK_CYCLES_FULL_SCHED1           0x02
+UMASK_TXR_AK_CYCLES_FULL_ALL              0x03
+
+EVENT_TXR_AK_NDR              0x0E  BBOX
+UMASK_TXR_AK_NDR              0x00
+
+EVENT_TXR_BL              0x10  BBOX
+UMASK_TXR_BL_DRS_CACHE    0x01
+UMASK_TXR_BL_DRS_CORE     0x02
+UMASK_TXR_BL_DRS_QPI      0x04
+
+EVENT_TXR_BL_CYCLES_FULL                  0x36  BBOX
+UMASK_TXR_BL_CYCLES_FULL_SCHED0           0x01
+UMASK_TXR_BL_CYCLES_FULL_SCHED1           0x02
+UMASK_TXR_BL_CYCLES_FULL_ALL              0x03
+
+EVENT_WPQ_CYCLES_NO_REG_CREDITS                0x18  BBOX
+UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN0           0x01
+UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN1           0x02
+UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN2           0x04
+UMASK_WPQ_CYCLES_NO_REG_CREDITS_CHN3           0x08
+
+EVENT_DRAM_CLOCKTICKS             0x00  MBOX0FIX|MBOX1FIX|MBOX2FIX|MBOX3FIX
+UMASK_DRAM_CLOCKTICKS             0x00
+
+EVENT_ACT_COUNT                  0x01  MBOX
+UMASK_ACT_COUNT                  0x00
+
+EVENT_CAS_COUNT                  0x04  MBOX
+UMASK_CAS_COUNT_RD_REF           0x01
+UMASK_CAS_COUNT_RD_UNDERFILL     0x02
+UMASK_CAS_COUNT_RD               0x03
+UMASK_CAS_COUNT_WR_WMM           0x04
+UMASK_CAS_COUNT_WR_RMM           0x08
+UMASK_CAS_COUNT_WR               0x0C
+UMASK_CAS_COUNT_ALL              0x0F
+
+EVENT_DRAM_PRE_ALL                  0x06  MBOX
+UMASK_DRAM_PRE_ALL                  0x00
+
+EVENT_DRAM_REFRESH                  0x05  MBOX
+UMASK_DRAM_REFRESH_PANIC            0x02
+UMASK_DRAM_REFRESH_HIGH             0x04
+
+EVENT_ECC_CORRECTABLE_ERRORS           0x09  MBOX
+UMASK_ECC_CORRECTABLE_ERRORS           0x00
+
+EVENT_MAJOR_MODES                  0x07  MBOX
+UMASK_MAJOR_MODES_READ             0x01
+UMASK_MAJOR_MODES_WRITE            0x02
+UMASK_MAJOR_MODES_PARTIAL          0x04
+UMASK_MAJOR_MODES_ISOCH            0x08
+
+EVENT_POWER_CHANNEL_DLLOFF           0x84  MBOX
+UMASK_POWER_CHANNEL_DLLOFF           0x00
+
+EVENT_POWER_CHANNEL_PPD           0x85  MBOX
+UMASK_POWER_CHANNEL_PPD           0x00
+
+EVENT_POWER_CKE_CYCLES                  0x83  MBOX
+UMASK_POWER_CKE_CYCLES_RANK0            0x01
+UMASK_POWER_CKE_CYCLES_RANK1            0x02
+UMASK_POWER_CKE_CYCLES_RANK2            0x04
+UMASK_POWER_CKE_CYCLES_RANK3            0x08
+UMASK_POWER_CKE_CYCLES_RANK4            0x10
+UMASK_POWER_CKE_CYCLES_RANK5            0x20
+UMASK_POWER_CKE_CYCLES_RANK6            0x40
+UMASK_POWER_CKE_CYCLES_RANK7            0x80
+
+EVENT_POWER_CRITICAL_THROTTLE_CYCLES           0x86  MBOX
+UMASK_POWER_CRITICAL_THROTTLE_CYCLES           0x00
+
+EVENT_POWER_SELF_REFRESH           0x43  MBOX
+UMASK_POWER_SELF_REFRESH           0x00
+
+EVENT_POWER_THROTTLE_CYCLES                  0x41  MBOX
+UMASK_POWER_THROTTLE_CYCLES_RANK0            0x01
+UMASK_POWER_THROTTLE_CYCLES_RANK1            0x02
+UMASK_POWER_THROTTLE_CYCLES_RANK2            0x04
+UMASK_POWER_THROTTLE_CYCLES_RANK3            0x08
+UMASK_POWER_THROTTLE_CYCLES_RANK4            0x10
+UMASK_POWER_THROTTLE_CYCLES_RANK5            0x20
+UMASK_POWER_THROTTLE_CYCLES_RANK6            0x40
+UMASK_POWER_THROTTLE_CYCLES_RANK7            0x80
+
+EVENT_PREEMPTION           0x08  MBOX
+UMASK_PREEMPTION_RD_PREEMPT_RD           0x01
+UMASK_PREEMPTION_RD_PREEMPT_WR           0x02
+
+EVENT_PRE_COUNT           0x02  MBOX
+UMASK_PRE_COUNT_PAGE_MISS           0x01
+UMASK_PRE_COUNT_PAGE_CLOSE           0x02
+
+EVENT_RPQ_CYCLES_FULL           0x12  MBOX
+UMASK_RPQ_CYCLES_FULL           0x00
+
+EVENT_RPQ_CYCLES_NE           0x11  MBOX
+UMASK_RPQ_CYCLES_NE           0x00
+
+EVENT_RPQ_INSERTS           0x10  MBOX
+UMASK_RPQ_INSERTS           0x00
+
+EVENT_RPQ_OCCUPANCY           0x80  MBOX
+UMASK_RPQ_OCCUPANCY           0x00
+
+EVENT_WPQ_CYCLES_FULL           0x22  MBOX
+UMASK_WPQ_CYCLES_FULL           0x00
+
+EVENT_WPQ_CYCLES_NE           0x21  MBOX
+UMASK_WPQ_CYCLES_NE           0x00
+
+EVENT_WPQ_INSERTS           0x20  MBOX
+UMASK_WPQ_INSERTS           0x00
+
+EVENT_WPQ_OCCUPANCY           0x81  MBOX
+UMASK_WPQ_OCCUPANCY           0x00
+
+EVENT_WPQ_READ_HIT           0x23  MBOX
+UMASK_WPQ_READ_HIT           0x00
+
+EVENT_WPQ_WRITE_HIT           0x24  MBOX
+UMASK_WPQ_WRITE_HIT           0x00
+
+EVENT_WBOX_CLOCKTICKS           0x00  WBOX
+UMASK_WBOX_CLOCKTICKS           0x00
+
+EVENT_CORE0_TRANSITION_CYCLES           0x03  WBOX
+UMASK_CORE0_TRANSITION_CYCLES           0x00
+
+EVENT_CORE1_TRANSITION_CYCLES           0x04  WBOX
+UMASK_CORE1_TRANSITION_CYCLES           0x00
+
+EVENT_CORE2_TRANSITION_CYCLES           0x05  WBOX
+UMASK_CORE2_TRANSITION_CYCLES           0x00
+
+EVENT_CORE3_TRANSITION_CYCLES           0x06  WBOX
+UMASK_CORE3_TRANSITION_CYCLES           0x00
+
+EVENT_CORE4_TRANSITION_CYCLES           0x07  WBOX
+UMASK_CORE4_TRANSITION_CYCLES           0x00
+
+EVENT_CORE5_TRANSITION_CYCLES           0x08  WBOX
+UMASK_CORE5_TRANSITION_CYCLES           0x00
+
+EVENT_CORE6_TRANSITION_CYCLES           0x09  WBOX
+UMASK_CORE6_TRANSITION_CYCLES           0x00
+
+EVENT_CORE7_TRANSITION_CYCLES           0x0A  WBOX
+UMASK_CORE7_TRANSITION_CYCLES           0x00
+
+EVENT_DEMOTIONS_CORE0           0x1E  WBOX
+OPTIONS_DEMOTIONS_CORE0         EVENT_OPTION_OCCUPANCY_FILTER_MASK
+UMASK_DEMOTIONS_CORE0           0x00
+
+EVENT_DEMOTIONS_CORE1           0x1F  WBOX
+OPTIONS_DEMOTIONS_CORE1         EVENT_OPTION_OCCUPANCY_FILTER_MASK
+UMASK_DEMOTIONS_CORE1           0x00
+
+EVENT_DEMOTIONS_CORE2           0x20  WBOX
+OPTIONS_DEMOTIONS_CORE2         EVENT_OPTION_OCCUPANCY_FILTER_MASK
+UMASK_DEMOTIONS_CORE2           0x00
+
+EVENT_DEMOTIONS_CORE3           0x21  WBOX
+OPTIONS_DEMOTIONS_CORE3         EVENT_OPTION_OCCUPANCY_FILTER_MASK
+UMASK_DEMOTIONS_CORE3           0x00
+
+EVENT_DEMOTIONS_CORE4           0x22  WBOX
+OPTIONS_DEMOTIONS_CORE4         EVENT_OPTION_OCCUPANCY_FILTER_MASK
+UMASK_DEMOTIONS_CORE4           0x00
+
+EVENT_DEMOTIONS_CORE5           0x23  WBOX
+OPTIONS_DEMOTIONS_CORE5         EVENT_OPTION_OCCUPANCY_FILTER_MASK
+UMASK_DEMOTIONS_CORE5           0x00
+
+EVENT_DEMOTIONS_CORE6           0x24  WBOX
+OPTIONS_DEMOTIONS_CORE6         EVENT_OPTION_OCCUPANCY_FILTER_MASK
+UMASK_DEMOTIONS_CORE6           0x00
+
+EVENT_DEMOTIONS_CORE7           0x25  WBOX
+OPTIONS_DEMOTIONS_CORE7         EVENT_OPTION_OCCUPANCY_FILTER_MASK
+UMASK_DEMOTIONS_CORE7           0x00
+
+EVENT_FREQ_BAND0_CYCLES           0x0B  WBOX
+OPTIONS_FREQ_BAND0_CYCLES         EVENT_OPTION_OCCUPANCY_FILTER_MASK
+UMASK_FREQ_BAND0_CYCLES           0x00
+
+EVENT_FREQ_BAND1_CYCLES           0x0C  WBOX
+OPTIONS_FREQ_BAND1_CYCLES         EVENT_OPTION_OCCUPANCY_FILTER_MASK
+UMASK_FREQ_BAND1_CYCLES           0x00
+
+EVENT_FREQ_BAND2_CYCLES           0x0D  WBOX
+OPTIONS_FREQ_BAND2_CYCLES         EVENT_OPTION_OCCUPANCY_FILTER_MASK
+UMASK_FREQ_BAND2_CYCLES           0x00
+
+EVENT_FREQ_BAND3_CYCLES           0x0E  WBOX
+OPTIONS_FREQ_BAND3_CYCLES         EVENT_OPTION_OCCUPANCY_FILTER_MASK
+UMASK_FREQ_BAND3_CYCLES           0x00
+
+EVENT_FREQ_MAX_CURRENT_CYCLES           0x07  WBOX
+UMASK_FREQ_MAX_CURRENT_CYCLES           0x00
+
+EVENT_FREQ_MAX_LIMIT_THERMAL_CYCLES           0x04  WBOX
+UMASK_FREQ_MAX_LIMIT_THERMAL_CYCLES           0x00
+
+EVENT_FREQ_MAX_POWER_CYCLES           0x05  WBOX
+UMASK_FREQ_MAX_POWER_CYCLES           0x00
+
+EVENT_FREQ_MAX_OS_CYCLES           0x06  WBOX
+UMASK_FREQ_MAX_OS_CYCLES           0x00
+
+EVENT_FREQ_MIN_IO_P_CYCLES           0x01  WBOX
+UMASK_FREQ_MIN_IO_P_CYCLES           0x00
+
+EVENT_FREQ_MIN_PERF_P_CYCLES           0x02  WBOX
+UMASK_FREQ_MIN_PERF_P_CYCLES           0x00
+
+EVENT_FREQ_TRANS_CYCLES           0x00  WBOX
+UMASK_FREQ_TRANS_CYCLES           0x00
+
+EVENT_MEMORY_PHASE_SHEDDING_CYCLES           0x2F  WBOX
+UMASK_MEMORY_PHASE_SHEDDING_CYCLES           0x00
+
+EVENT_POWER_STATE_OCCUPANCY           0x80  WBOX
+UMASK_POWER_STATE_OCCUPANCY_CORES_C0           0x40
+UMASK_POWER_STATE_OCCUPANCY_CORES_C3           0x80
+UMASK_POWER_STATE_OCCUPANCY_CORES_C6           0xC0
+
+EVENT_PROCHOT_EXTERNAL_CYCLES           0x0A  WBOX
+UMASK_PROCHOT_EXTERNAL_CYCLES           0x00
+
+EVENT_PROCHOT_INTERNAL_CYCLES           0x09  WBOX
+UMASK_PROCHOT_INTERNAL_CYCLES           0x00
+
+EVENT_TOTAL_TRANSITION_CYCLES           0x0B  WBOX
+UMASK_TOTAL_TRANSITION_CYCLES           0x00
+
+EVENT_VOLT_TRANS_CYCLES_CHANGE           0x03  WBOX
+UMASK_VOLT_TRANS_CYCLES_CHANGE           0x00
+
+EVENT_VOLT_TRANS_CYCLES_DECREASE           0x02  WBOX
+UMASK_VOLT_TRANS_CYCLES_DECREASE           0x00
+
+EVENT_VOLT_TRANS_CYCLES_INCREASE           0x01  WBOX
+UMASK_VOLT_TRANS_CYCLES_INCREASE           0x00
+
+EVENT_VR_HOT_CYCLES           0x32  WBOX
+UMASK_VR_HOT_CYCLES           0x00
+
+EVENT_CORES_IN_C3               0x00 WBOXFIX0
+UMASK_CORES_IN_C3               0x00
+
+EVENT_CORES_IN_C6               0x00 WBOXFIX1
+UMASK_CORES_IN_C6               0x00
+
+EVENT_SBOX_CLOCKTICKS           0x14  SBOX
+UMASK_SBOX_CLOCKTICKS           0x00
+
+EVENT_CTO_COUNT           0x38  SBOX
+UMASK_CTO_COUNT           0x00 0x200000
+
+EVENT_DIRECT2CORE           0x13  SBOX
+UMASK_DIRECT2CORE_SUCCESS             0x01
+UMASK_DIRECT2CORE_FAILURE_CREDITS     0x02
+UMASK_DIRECT2CORE_FAILURE_RBT         0x04
+UMASK_DIRECT2CORE_FAILURE_CREDITS_RBT 0x08
+
+EVENT_L1_POWER_CYCLES           0x12  SBOX
+UMASK_L1_POWER_CYCLES           0x00
+
+EVENT_RXL0P_POWER_CYCLES           0x10  SBOX
+UMASK_RXL0P_POWER_CYCLES           0x00
+
+EVENT_RXL0_POWER_CYCLES           0x0F  SBOX
+UMASK_RXL0_POWER_CYCLES           0x00
+
+EVENT_RXL_BYPASSED           0x09  SBOX
+UMASK_RXL_BYPASSED           0x00
+
+EVENT_RXL_CREDITS_CONSUMED_VN0           0x1E  SBOX
+UMASK_RXL_CREDITS_CONSUMED_VN0_DRS       0x01 0x200000
+UMASK_RXL_CREDITS_CONSUMED_VN0_NCB       0x02 0x200000
+UMASK_RXL_CREDITS_CONSUMED_VN0_NCS       0x04 0x200000
+UMASK_RXL_CREDITS_CONSUMED_VN0_HOM       0x08 0x200000
+UMASK_RXL_CREDITS_CONSUMED_VN0_SNP       0x10 0x200000
+UMASK_RXL_CREDITS_CONSUMED_VN0_NDR       0x20 0x200000
+
+EVENT_RXL_CREDITS_CONSUMED_VNA           0x1D  SBOX
+UMASK_RXL_CREDITS_CONSUMED_VNA           0x00 0x200000
+
+EVENT_RXL_FLITS_G0              0x01  SBOX
+UMASK_RXL_FLITS_G0_IDLE         0x01
+UMASK_RXL_FLITS_G0_DATA         0x02
+UMASK_RXL_FLITS_G0_NON_DATA     0x04
+
+EVENT_RXL_FLITS_G1              0x02  SBOX
+UMASK_RXL_FLITS_G1_SNP          0x01 0x200000
+UMASK_RXL_FLITS_G1_HOM_REQ      0x02 0x200000
+UMASK_RXL_FLITS_G1_HOM_NONREQ   0x04 0x200000
+UMASK_RXL_FLITS_G1_HOM          0x06 0x200000
+UMASK_RXL_FLITS_G1_DRS_DATA     0x08 0x200000
+UMASK_RXL_FLITS_G1_DRS_NONDATA  0x10 0x200000
+UMASK_RXL_FLITS_G1_DRS          0x18 0x200000
+
+EVENT_RXL_FLITS_G2              0x03  SBOX
+UMASK_RXL_FLITS_G2_NDR_AD       0x01 0x200000
+UMASK_RXL_FLITS_G2_NDR_AK       0x02 0x200000
+UMASK_RXL_FLITS_G2_NCB_DATA     0x04 0x200000
+UMASK_RXL_FLITS_G2_NCB_NODATA   0x08 0x200000
+UMASK_RXL_FLITS_G2_NCB          0x0C 0x200000
+UMASK_RXL_FLITS_G2_NCS          0x10 0x200000
+
+EVENT_RXL_INSERTS           0x08  SBOX
+UMASK_RXL_INSERTS           0x00
+
+EVENT_RXL_INSERTS_DRS           0x09  SBOX
+UMASK_RXL_INSERTS_DRS           0x00 0x200000
+
+EVENT_RXL_INSERTS_HOM           0x0C  SBOX
+UMASK_RXL_INSERTS_HOM           0x00 0x200000
+
+EVENT_RXL_INSERTS_NCB           0x0A  SBOX
+UMASK_RXL_INSERTS_NCB           0x00 0x200000
+
+EVENT_RXL_INSERTS_NCS           0x0B  SBOX
+UMASK_RXL_INSERTS_NCS           0x00 0x200000
+
+EVENT_RXL_INSERTS_NDR           0x0E  SBOX
+UMASK_RXL_INSERTS_NDR           0x00 0x200000
+
+EVENT_RXL_INSERTS_SNP           0x0D  SBOX
+UMASK_RXL_INSERTS_SNP           0x00 0x200000
+
+EVENT_RXL_OCCUPANCY           0x0B  SBOX
+UMASK_RXL_OCCUPANCY           0x00
+
+EVENT_RXL_OCCUPANCY_DRS           0x15  SBOX
+UMASK_RXL_OCCUPANCY_DRS           0x00 0x200000
+
+EVENT_RXL_OCCUPANCY_HOM           0x18  SBOX
+UMASK_RXL_OCCUPANCY_HOM           0x00 0x200000
+
+EVENT_RXL_OCCUPANCY_NCB           0x16  SBOX
+UMASK_RXL_OCCUPANCY_NCB           0x00 0x200000
+
+EVENT_RXL_OCCUPANCY_NCS           0x17  SBOX
+UMASK_RXL_OCCUPANCY_NCS           0x00 0x200000
+
+EVENT_RXL_OCCUPANCY_NDR           0x1A  SBOX
+UMASK_RXL_OCCUPANCY_NDR           0x00 0x200000
+
+EVENT_RXL_OCCUPANCY_SNP           0x19  SBOX
+UMASK_RXL_OCCUPANCY_SNP           0x00 0x200000
+
+EVENT_TXL0P_POWER_CYCLES           0x0D  SBOX
+UMASK_TXL0P_POWER_CYCLES           0x00
+
+EVENT_TXL0_POWER_CYCLES           0x0C  SBOX
+UMASK_TXL0_POWER_CYCLES           0x00
+
+EVENT_TXL_BYPASSED           0x05  SBOX
+UMASK_TXL_BYPASSED           0x00
+
+EVENT_TXL_CYCLES_NE           0x06  SBOX
+UMASK_TXL_CYCLES_NE           0x00
+
+EVENT_TXL_FLITS_G0              0x00  SBOX
+UMASK_TXL_FLITS_G0_IDLE         0x01
+UMASK_TXL_FLITS_G0_DATA         0x02
+UMASK_TXL_FLITS_G0_NON_DATA     0x04
+
+EVENT_TXL_FLITS_G1              0x00  SBOX
+UMASK_TXL_FLITS_G1_SNP          0x01 0x200000
+UMASK_TXL_FLITS_G1_HOM_REQ      0x02 0x200000
+UMASK_TXL_FLITS_G1_HOM_NONREQ   0x04 0x200000
+UMASK_TXL_FLITS_G1_HOM          0x06 0x200000
+UMASK_TXL_FLITS_G1_DRS_DATA     0x08 0x200000
+UMASK_TXL_FLITS_G1_DRS_NONDATA  0x10 0x200000
+UMASK_TXL_FLITS_G1_DRS          0x18 0x200000
+
+EVENT_TXL_FLITS_G2              0x01  SBOX
+UMASK_TXL_FLITS_G2_NDR_AD       0x01 0x200000
+UMASK_TXL_FLITS_G2_NDR_AK       0x02 0x200000
+UMASK_TXL_FLITS_G2_NCB_DATA     0x04 0x200000
+UMASK_TXL_FLITS_G2_NCB_NODATA   0x08 0x200000
+UMASK_TXL_FLITS_G2_NCB          0x0C 0x200000
+UMASK_TXL_FLITS_G2_NCS          0x10 0x200000
+
+EVENT_TXL_INSERTS           0x04  SBOX
+UMASK_TXL_INSERTS           0x00
+
+EVENT_TXL_OCCUPANCY           0x07  SBOX
+UMASK_TXL_OCCUPANCY           0x00
+
+EVENT_VNA_CREDIT_RETURNS           0x1C  SBOX
+UMASK_VNA_CREDIT_RETURNS           0x00 0x200000
+
+EVENT_VNA_CREDIT_RETURN_OCCUPANCY           0x1B  SBOX
+UMASK_VNA_CREDIT_RETURN_OCCUPANCY           0x00 0x200000
+
+EVENT_QPI_RATE                  0x00 SBOX0FIX|SBOX1FIX
+UMASK_QPI_RATE                  0x00
+
+EVENT_QPI_SLOW_MODE             0x01 SBOX0FIX|SBOX1FIX
+UMASK_QPI_SLOW_MODE             0x00
+
+EVENT_PBOX_CLOCKTICKS           0x01  PBOX
+UMASK_PBOX_CLOCKTICKS           0x00
+
+EVENT_RING_AD_USED                  0x07  PBOX
+UMASK_RING_AD_USED_CW_EVEN          0x01
+UMASK_RING_AD_USED_CW_ODD           0x02
+UMASK_RING_AD_USED_CCW_EVEN         0x04
+UMASK_RING_AD_USED_CCW_ODD          0x08
+
+EVENT_RING_AK_USED                  0x08  PBOX
+UMASK_RING_AK_USED_CW_EVEN          0x01
+UMASK_RING_AK_USED_CW_ODD           0x02
+UMASK_RING_AK_USED_CCW_EVEN         0x04
+UMASK_RING_AK_USED_CCW_ODD          0x08
+
+EVENT_RING_BL_USED                  0x09  PBOX
+UMASK_RING_BL_USED_CW_EVEN          0x01
+UMASK_RING_BL_USED_CW_ODD           0x02
+UMASK_RING_BL_USED_CCW_EVEN         0x04
+UMASK_RING_BL_USED_CCW_ODD          0x08
+
+EVENT_RING_IV_USED                  0x0A  PBOX
+UMASK_RING_IV_USED_ANY              0x0F
+
+EVENT_RXR_AK_BOUNCES              0x12  PBOX0
+UMASK_RXR_AK_BOUNCES              0x00
+
+EVENT_RXR_CYCLES_NE              0x10  PBOX0|PBOX1
+UMASK_RXR_CYCLES_NE_DRS              0x08
+UMASK_RXR_CYCLES_NE_NCB              0x10
+UMASK_RXR_CYCLES_NE_NCS              0x20
+
+EVENT_TXR_CYCLES_FULL              0x25  PBOX0
+UMASK_TXR_CYCLES_FULL_AD              0x01
+UMASK_TXR_CYCLES_FULL_AK              0x02
+UMASK_TXR_CYCLES_FULL_BL              0x04
+
+EVENT_TXR_CYCLES_NE              0x23  PBOX0
+UMASK_TXR_CYCLES_NE_AD              0x01
+UMASK_TXR_CYCLES_NE_AK              0x02
+UMASK_TXR_CYCLES_NE_BL              0x04
+
+EVENT_TXR_INSERTS         0x24  PBOX0
+UMASK_TXR_INSERTS         0x00
+
+EVENT_RBOX_CLOCKTICKS              0x01  RBOX
+UMASK_RBOX_CLOCKTICKS              0x00
+
+EVENT_IIO_CREDITS_ACQUIRED              0x20  RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_IIO_CREDITS_ACQUIRED_DRS          0x08
+UMASK_IIO_CREDITS_ACQUIRED_NCB          0x10
+UMASK_IIO_CREDITS_ACQUIRED_NCS          0x20
+
+EVENT_IIO_CREDITS_REJECT                0x21  RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_IIO_CREDITS_REJECT_DRS            0x08
+UMASK_IIO_CREDITS_REJECT_NCB            0x10
+UMASK_IIO_CREDITS_REJECT_NCS            0x20
+
+EVENT_IIO_CREDITS_USED                  0x22  RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_IIO_CREDITS_USED_DRS              0x08
+UMASK_IIO_CREDITS_USED_NCB              0x10
+UMASK_IIO_CREDITS_USED_NCS              0x20
+
+EVENT_RING_AD_USED              0x07  RBOX
+UMASK_RING_AD_USED_CW_EVEN      0x01
+UMASK_RING_AD_USED_CW_ODD       0x02
+UMASK_RING_AD_USED_CCW_EVEN     0x04
+UMASK_RING_AD_USED_CCW_ODD      0x08
+
+EVENT_RING_AK_USED              0x08  RBOX
+UMASK_RING_AK_USED_CW_EVEN      0x01
+UMASK_RING_AK_USED_CW_ODD       0x02
+UMASK_RING_AK_USED_CCW_EVEN     0x04
+UMASK_RING_AK_USED_CCW_ODD      0x08
+
+EVENT_RING_BL_USED              0x09  RBOX
+UMASK_RING_BL_USED_CW_EVEN      0x01
+UMASK_RING_BL_USED_CW_ODD       0x02
+UMASK_RING_BL_USED_CCW_EVEN     0x04
+UMASK_RING_BL_USED_CCW_ODD      0x08
+
+EVENT_RING_IV_USED          0x0A  RBOX
+UMASK_RING_IV_USED_ANY      0x0F
+
+EVENT_RXR_BYPASSED          0x12  RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_RXR_BYPASSED          0x00
+
+EVENT_RXR_CYCLES_NE         0x10  RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_RXR_CYCLES_NE_HOM     0x01
+UMASK_RXR_CYCLES_NE_SNP     0x02
+UMASK_RXR_CYCLES_NE_NDR     0x04
+UMASK_RXR_CYCLES_NE_DRS     0x08
+UMASK_RXR_CYCLES_NE_NCB     0x10
+UMASK_RXR_CYCLES_NE_NCS     0x20
+
+EVENT_RXR_INSERTS         0x11  RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_RXR_INSERTS_HOM     0x01
+UMASK_RXR_INSERTS_SNP     0x02
+UMASK_RXR_INSERTS_NDR     0x04
+UMASK_RXR_INSERTS_DRS     0x08
+UMASK_RXR_INSERTS_NCB     0x10
+UMASK_RXR_INSERTS_NCS     0x20
+
+EVENT_RXR_OCCUPANCY         0x13  RBOX0C0|RBOX1C0
+UMASK_RXR_OCCUPANCY_HOM     0x01
+UMASK_RXR_OCCUPANCY_SNP     0x02
+UMASK_RXR_OCCUPANCY_NDR     0x04
+UMASK_RXR_OCCUPANCY_DRS     0x08
+UMASK_RXR_OCCUPANCY_NCB     0x10
+UMASK_RXR_OCCUPANCY_NCS     0x20
+
+EVENT_TXR_CYCLES_FULL       0x25  RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_TXR_CYCLES_FULL       0x00
+
+EVENT_TXR_CYCLES_NE       0x23  RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_TXR_CYCLES_NE       0x00
+
+EVENT_TXR_INSERTS         0x24  RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1|PBOX0
+UMASK_TXR_INSERTS_HOM     0x01
+UMASK_TXR_INSERTS_SNP     0x02
+UMASK_TXR_INSERTS_NDR     0x04
+UMASK_TXR_INSERTS_DRS     0x08
+UMASK_TXR_INSERTS_NCB     0x10
+UMASK_TXR_INSERTS_NCS     0x20
+
+EVENT_TXR_NACK       0x26  RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_TXR_NACK       0x00
+
+EVENT_VN0_CREDITS_REJECT      0x37  RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VN0_CREDITS_REJECT_HOM     0x01
+UMASK_VN0_CREDITS_REJECT_SNP     0x02
+UMASK_VN0_CREDITS_REJECT_NDR     0x04
+UMASK_VN0_CREDITS_REJECT_DRS     0x08
+UMASK_VN0_CREDITS_REJECT_NCB     0x10
+UMASK_VN0_CREDITS_REJECT_NCS     0x20
+
+EVENT_VN0_CREDITS_USED      0x36  RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VN0_CREDITS_USED_HOM     0x01
+UMASK_VN0_CREDITS_USED_SNP     0x02
+UMASK_VN0_CREDITS_USED_NDR     0x04
+UMASK_VN0_CREDITS_USED_DRS     0x08
+UMASK_VN0_CREDITS_USED_NCB     0x10
+UMASK_VN0_CREDITS_USED_NCS     0x20
+
+EVENT_VNA_CREDITS_ACQUIRED      0x33  RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VNA_CREDITS_ACQUIRED     0x00
+
+EVENT_VNA_CREDITS_REJECT      0x34  RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VNA_CREDITS_REJECT_HOM     0x01
+UMASK_VNA_CREDITS_REJECT_SNP     0x02
+UMASK_VNA_CREDITS_REJECT_NDR     0x04
+UMASK_VNA_CREDITS_REJECT_DRS     0x08
+UMASK_VNA_CREDITS_REJECT_NCB     0x10
+UMASK_VNA_CREDITS_REJECT_NCS     0x20
+
+EVENT_VNA_CREDITS_CYCLES_OUT      0x31  RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VNA_CREDITS_CYCLES_OUT     0x00
+
+EVENT_VNA_CREDITS_CYCLES_USED      0x32  RBOX0C0|RBOX0C1|RBOX1C0|RBOX1C1
+UMASK_VNA_CREDITS_CYCLES_USED     0x00
+
+EVENT_EVENT_MSG                  0x42 UBOX
+UMASK_EVENT_MSG_VLW_RCVD         0x01
+UMASK_EVENT_MSG_MSI_RCVD         0x02
+UMASK_EVENT_MSG_IPI_RCVD         0x04
+UMASK_EVENT_MSG_DOORBELL_RCVD    0x08
+UMASK_EVENT_MSG_INT_PRIO         0x10
+
+EVENT_LOCK_CYCLES                 0x44 UBOX
+UMASK_LOCK_CYCLES                 0x00
+
+EVENT_UBOX_CLOCKTICKS                 0x0 UBOXFIX
+UMASK_UBOX_CLOCKTICKS                 0x0
diff --git a/src/includes/perfmon_sandybridge_counters.h b/src/includes/perfmon_sandybridge_counters.h
index afe9c04..316e121 100644
--- a/src/includes/perfmon_sandybridge_counters.h
+++ b/src/includes/perfmon_sandybridge_counters.h
@@ -3,15 +3,16 @@
  *
  *      Filename:  perfmon_sandybridge_counters.h
  *
- *      Description: Counter header file of perfmon module for Sandy Bridge.
+ *      Description: Counter header file of perfmon module for Intel Sandy Bridge.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -28,48 +29,217 @@
  * =======================================================================================
  */
 
-#define NUM_COUNTERS_SANDYBRIDGE 32
-#define NUM_COUNTERS_UNCORE_SANDYBRIDGE 12
+#define NUM_COUNTERS_SANDYBRIDGEEP 97 /* array length of sandybridgeEP_counter_map */
+#define NUM_COUNTERS_UNCORE_SANDYBRIDGEEP 53
+#define NUM_COUNTERS_CORE_SANDYBRIDGEEP 8
 #define NUM_COUNTERS_CORE_SANDYBRIDGE 8
+#define NUM_COUNTERS_UNCORE_SANDYBRIDGE 12
+#define NUM_COUNTERS_SANDYBRIDGE 12 /* NOTE(review): was 32 before this change; confirm against the non-EP counter map */
+/* Valid-option masks per counter type (last field of each counter map entry below); parenthesized so each expands as one operand. */
+#define SNB_VALID_OPTIONS_FIXED (EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_ANYTHREAD_MASK)
+#define SNB_VALID_OPTIONS_PMC (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_COUNT_KERNEL_MASK| \
+                            EVENT_OPTION_ANYTHREAD_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK)
+
+#define SNB_VALID_OPTIONS_CBOX (EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_EDGE_MASK|EVENT_OPTION_TID_MASK| \
+                            EVENT_OPTION_INVERT_MASK|EVENT_OPTION_OPCODE_MASK|EVENT_OPTION_NID_MASK|EVENT_OPTION_STATE_MASK)
+#define SNB_VALID_OPTIONS_WBOX  (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_THRESHOLD_MASK| \
+                            EVENT_OPTION_OCCUPANCY_MASK|EVENT_OPTION_OCCUPANCY_EDGE_MASK| \
+                            EVENT_OPTION_OCCUPANCY_INVERT_MASK|EVENT_OPTION_MATCH0_MASK)
+#define SNB_VALID_OPTIONS_UBOX (EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK)
+#define SNB_VALID_OPTIONS_BBOX (EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK| \
+                            EVENT_OPTION_OPCODE_MASK|EVENT_OPTION_MATCH0_MASK)
+#define SNB_VALID_OPTIONS_MBOX (EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK)
+#define SNB_VALID_OPTIONS_SBOX (EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK| \
+                            EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK|EVENT_OPTION_MASK0_MASK| \
+                            EVENT_OPTION_MASK1_MASK)
+#define SNB_VALID_OPTIONS_RBOX (EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK)
+#define SNB_VALID_OPTIONS_PBOX (EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK)
 
-static PerfmonCounterMap sandybridge_counter_map[NUM_COUNTERS_SANDYBRIDGE] = {
+static RegisterMap sandybridgeEP_counter_map[NUM_COUNTERS_SANDYBRIDGEEP] = {
     /* Fixed Counters: instructions retired, cycles unhalted core */
-    {"FIXC0", PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0},
-    {"FIXC1", PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0},
-    {"FIXC2", PMC2, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR2, 0, 0},
+    {"FIXC0", PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0, SNB_VALID_OPTIONS_FIXED},
+    {"FIXC1", PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0, SNB_VALID_OPTIONS_FIXED},
+    {"FIXC2", PMC2, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR2, 0, 0, SNB_VALID_OPTIONS_FIXED},
     /* PMC Counters: 4 48bit wide */
-    {"PMC0", PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0},
-    {"PMC1", PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0},
-    {"PMC2", PMC5, PMC, MSR_PERFEVTSEL2, MSR_PMC2, 0, 0},
-    {"PMC3", PMC6, PMC, MSR_PERFEVTSEL3, MSR_PMC3, 0, 0},
+    {"PMC0", PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0, SNB_VALID_OPTIONS_PMC},
+    {"PMC1", PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0, SNB_VALID_OPTIONS_PMC},
+    {"PMC2", PMC5, PMC, MSR_PERFEVTSEL2, MSR_PMC2, 0, 0, SNB_VALID_OPTIONS_PMC},
+    {"PMC3", PMC6, PMC, MSR_PERFEVTSEL3, MSR_PMC3, 0, 0, SNB_VALID_OPTIONS_PMC},
     /* Temperature Sensor*/
-    {"TMP0", PMC7, THERMAL, 0, 0, 0, 0},
+    {"TMP0", PMC7, THERMAL, 0, IA32_THERM_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
     /* RAPL counters */
-    {"PWR0", PMC8, POWER, 0, MSR_PKG_ENERGY_STATUS, 0, 0},
-    {"PWR1", PMC9, POWER, 0, MSR_PP0_ENERGY_STATUS, 0, 0},
-    {"PWR2", PMC10, POWER, 0, MSR_PP1_ENERGY_STATUS,  0, 0},
-    {"PWR3", PMC11, POWER, 0, MSR_DRAM_ENERGY_STATUS,  0, 0},
+    {"PWR0", PMC8, POWER, 0, MSR_PKG_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"PWR1", PMC9, POWER, 0, MSR_PP0_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"PWR2", PMC10, POWER, 0, MSR_PP1_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"PWR3", PMC11, POWER, 0, MSR_DRAM_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+    /* CBOX counters */
+    {"CBOX0C0", PMC12, CBOX0, MSR_UNC_C0_PMON_CTL0, MSR_UNC_C0_PMON_CTR0, 0, 0, SNB_VALID_OPTIONS_CBOX},
+    {"CBOX0C1", PMC13, CBOX0, MSR_UNC_C0_PMON_CTL1, MSR_UNC_C0_PMON_CTR1, 0, 0, SNB_VALID_OPTIONS_CBOX},
+    {"CBOX0C2", PMC14, CBOX0, MSR_UNC_C0_PMON_CTL2, MSR_UNC_C0_PMON_CTR2, 0, 0, SNB_VALID_OPTIONS_CBOX},
+    {"CBOX0C3", PMC15, CBOX0, MSR_UNC_C0_PMON_CTL3, MSR_UNC_C0_PMON_CTR3, 0, 0, SNB_VALID_OPTIONS_CBOX},
+    {"CBOX1C0", PMC16, CBOX1, MSR_UNC_C1_PMON_CTL0, MSR_UNC_C1_PMON_CTR0, 0, 0, SNB_VALID_OPTIONS_CBOX},
+    {"CBOX1C1", PMC17, CBOX1, MSR_UNC_C1_PMON_CTL1, MSR_UNC_C1_PMON_CTR1, 0, 0, SNB_VALID_OPTIONS_CBOX},
+    {"CBOX1C2", PMC18, CBOX1, MSR_UNC_C1_PMON_CTL2, MSR_UNC_C1_PMON_CTR2, 0, 0, SNB_VALID_OPTIONS_CBOX},
+    {"CBOX1C3", PMC19, CBOX1, MSR_UNC_C1_PMON_CTL3, MSR_UNC_C1_PMON_CTR3, 0, 0, SNB_VALID_OPTIONS_CBOX},
+    {"CBOX2C0", PMC20, CBOX2, MSR_UNC_C2_PMON_CTL0, MSR_UNC_C2_PMON_CTR0, 0, 0, SNB_VALID_OPTIONS_CBOX},
+    {"CBOX2C1", PMC21, CBOX2, MSR_UNC_C2_PMON_CTL1, MSR_UNC_C2_PMON_CTR1, 0, 0, SNB_VALID_OPTIONS_CBOX},
+    {"CBOX2C2", PMC22, CBOX2, MSR_UNC_C2_PMON_CTL2, MSR_UNC_C2_PMON_CTR2, 0, 0, SNB_VALID_OPTIONS_CBOX},
+    {"CBOX2C3", PMC23, CBOX2, MSR_UNC_C2_PMON_CTL3, MSR_UNC_C2_PMON_CTR3, 0, 0, SNB_VALID_OPTIONS_CBOX},
+    {"CBOX3C0", PMC24, CBOX3, MSR_UNC_C3_PMON_CTL0, MSR_UNC_C3_PMON_CTR0, 0, 0, SNB_VALID_OPTIONS_CBOX},
+    {"CBOX3C1", PMC25, CBOX3, MSR_UNC_C3_PMON_CTL1, MSR_UNC_C3_PMON_CTR1, 0, 0, SNB_VALID_OPTIONS_CBOX},
+    {"CBOX3C2", PMC26, CBOX3, MSR_UNC_C3_PMON_CTL2, MSR_UNC_C3_PMON_CTR2, 0, 0, SNB_VALID_OPTIONS_CBOX},
+    {"CBOX3C3", PMC27, CBOX3, MSR_UNC_C3_PMON_CTL3, MSR_UNC_C3_PMON_CTR3, 0, 0, SNB_VALID_OPTIONS_CBOX},
+    {"CBOX4C0", PMC28, CBOX4, MSR_UNC_C4_PMON_CTL0, MSR_UNC_C4_PMON_CTR0, 0, 0, SNB_VALID_OPTIONS_CBOX},
+    {"CBOX4C1", PMC29, CBOX4, MSR_UNC_C4_PMON_CTL1, MSR_UNC_C4_PMON_CTR1, 0, 0, SNB_VALID_OPTIONS_CBOX},
+    {"CBOX4C2", PMC30, CBOX4, MSR_UNC_C4_PMON_CTL2, MSR_UNC_C4_PMON_CTR2, 0, 0, SNB_VALID_OPTIONS_CBOX},
+    {"CBOX4C3", PMC31, CBOX4, MSR_UNC_C4_PMON_CTL3, MSR_UNC_C4_PMON_CTR3, 0, 0, SNB_VALID_OPTIONS_CBOX},
+    {"CBOX5C0", PMC32, CBOX5, MSR_UNC_C5_PMON_CTL0, MSR_UNC_C5_PMON_CTR0, 0, 0, SNB_VALID_OPTIONS_CBOX},
+    {"CBOX5C1", PMC33, CBOX5, MSR_UNC_C5_PMON_CTL1, MSR_UNC_C5_PMON_CTR1, 0, 0, SNB_VALID_OPTIONS_CBOX},
+    {"CBOX5C2", PMC34, CBOX5, MSR_UNC_C5_PMON_CTL2, MSR_UNC_C5_PMON_CTR2, 0, 0, SNB_VALID_OPTIONS_CBOX},
+    {"CBOX5C3", PMC35, CBOX5, MSR_UNC_C5_PMON_CTL3, MSR_UNC_C5_PMON_CTR3, 0, 0, SNB_VALID_OPTIONS_CBOX},
+    {"CBOX6C0", PMC36, CBOX6, MSR_UNC_C6_PMON_CTL0, MSR_UNC_C6_PMON_CTR0, 0, 0, SNB_VALID_OPTIONS_CBOX},
+    {"CBOX6C1", PMC37, CBOX6, MSR_UNC_C6_PMON_CTL1, MSR_UNC_C6_PMON_CTR1, 0, 0, SNB_VALID_OPTIONS_CBOX},
+    {"CBOX6C2", PMC38, CBOX6, MSR_UNC_C6_PMON_CTL2, MSR_UNC_C6_PMON_CTR2, 0, 0, SNB_VALID_OPTIONS_CBOX},
+    {"CBOX6C3", PMC39, CBOX6, MSR_UNC_C6_PMON_CTL3, MSR_UNC_C6_PMON_CTR3, 0, 0, SNB_VALID_OPTIONS_CBOX},
+    {"CBOX7C0", PMC40, CBOX7, MSR_UNC_C7_PMON_CTL0, MSR_UNC_C7_PMON_CTR0, 0, 0, SNB_VALID_OPTIONS_CBOX},
+    {"CBOX7C1", PMC41, CBOX7, MSR_UNC_C7_PMON_CTL1, MSR_UNC_C7_PMON_CTR1, 0, 0, SNB_VALID_OPTIONS_CBOX},
+    {"CBOX7C2", PMC42, CBOX7, MSR_UNC_C7_PMON_CTL2, MSR_UNC_C7_PMON_CTR2, 0, 0, SNB_VALID_OPTIONS_CBOX},
+    {"CBOX7C3", PMC43, CBOX7, MSR_UNC_C7_PMON_CTL3, MSR_UNC_C7_PMON_CTR3, 0, 0, SNB_VALID_OPTIONS_CBOX},
+    /* UBOX counters */
+    {"UBOX0", PMC44, UBOX, MSR_UNC_U_PMON_CTL0, MSR_UNC_U_PMON_CTR0, 0, 0, SNB_VALID_OPTIONS_UBOX},
+    {"UBOX1", PMC45, UBOX, MSR_UNC_U_PMON_CTL1, MSR_UNC_U_PMON_CTR0, 0, 0, SNB_VALID_OPTIONS_UBOX},
+    {"UBOXFIX", PMC46, UBOXFIX, MSR_UNC_U_UCLK_FIXED_CTL, MSR_UNC_U_UCLK_FIXED_CTR, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"WBOX0",PMC47, WBOX, MSR_UNC_PCU_PMON_CTL0, MSR_UNC_PCU_PMON_CTR0, 0, 0, SNB_VALID_OPTIONS_WBOX},
+    {"WBOX1",PMC48, WBOX, MSR_UNC_PCU_PMON_CTL1, MSR_UNC_PCU_PMON_CTR1, 0, 0, SNB_VALID_OPTIONS_WBOX},
+    {"WBOX2",PMC49, WBOX, MSR_UNC_PCU_PMON_CTL2, MSR_UNC_PCU_PMON_CTR2, 0, 0, SNB_VALID_OPTIONS_WBOX},
+    {"WBOX3",PMC50, WBOX, MSR_UNC_PCU_PMON_CTL3, MSR_UNC_PCU_PMON_CTR3, 0, 0, SNB_VALID_OPTIONS_WBOX},
+    {"WBOXFIX0", PMC51, WBOX0FIX, 0, MSR_UNC_PCU_PMON_FIXED_CTR0, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"WBOXFIX1", PMC52, WBOX0FIX, 0, MSR_UNC_PCU_PMON_FIXED_CTR1, 0, 0, EVENT_OPTION_NONE_MASK},
     /* IMC Counters: 4 48bit wide per memory channel, split in two reads */
-    {"MBOX0C0",PMC12, MBOX0, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_CH_0},
-    {"MBOX0C1",PMC13, MBOX0, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_CH_0},
-    {"MBOX0C2",PMC14, MBOX0, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_CH_0},
-    {"MBOX0C3",PMC15, MBOX0, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_CH_0},
-    {"MBOX1C0",PMC16, MBOX1, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_CH_1},
-    {"MBOX1C1",PMC17, MBOX1, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_CH_1},
-    {"MBOX1C2",PMC18, MBOX1, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_CH_1},
-    {"MBOX1C3",PMC19, MBOX1, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_CH_1},
-    {"MBOX2C0",PMC20, MBOX2, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_CH_2},
-    {"MBOX2C1",PMC21, MBOX2, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_CH_2},
-    {"MBOX2C2",PMC22, MBOX2, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_CH_2},
-    {"MBOX2C3",PMC23, MBOX2, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_CH_2},
-    {"MBOX3C0",PMC24, MBOX3, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_CH_3},
-    {"MBOX3C1",PMC25, MBOX3, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_CH_3},
-    {"MBOX3C2",PMC26, MBOX3, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_CH_3},
-    {"MBOX3C3",PMC27, MBOX3, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_CH_3},
-    {"MBOX0FIX",PMC28, MBOXFIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_CH_0},
-    {"MBOX1FIX",PMC29, MBOXFIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_CH_1},
-    {"MBOX2FIX",PMC30, MBOXFIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_CH_2},
-    {"MBOX3FIX",PMC31, MBOXFIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_CH_3},
+    {"MBOX0C0",PMC53, MBOX0, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_0_CH_0, SNB_VALID_OPTIONS_MBOX},
+    {"MBOX0C1",PMC54, MBOX0, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_0_CH_0, SNB_VALID_OPTIONS_MBOX},
+    {"MBOX0C2",PMC55, MBOX0, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_0_CH_0, SNB_VALID_OPTIONS_MBOX},
+    {"MBOX0C3",PMC56, MBOX0, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_0_CH_0, SNB_VALID_OPTIONS_MBOX},
+    {"MBOX0FIX", PMC57, MBOX0FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_0_CH_0, EVENT_OPTION_NONE_MASK},
+    {"MBOX1C0",PMC58, MBOX1, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_0_CH_1, SNB_VALID_OPTIONS_MBOX},
+    {"MBOX1C1",PMC59, MBOX1, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_0_CH_1, SNB_VALID_OPTIONS_MBOX},
+    {"MBOX1C2",PMC60, MBOX1, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_0_CH_1, SNB_VALID_OPTIONS_MBOX},
+    {"MBOX1C3",PMC61, MBOX1, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_0_CH_1, SNB_VALID_OPTIONS_MBOX},
+    {"MBOX1FIX", PMC62, MBOX1FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_0_CH_1, EVENT_OPTION_NONE_MASK},
+    {"MBOX2C0",PMC63, MBOX2, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_0_CH_2, SNB_VALID_OPTIONS_MBOX},
+    {"MBOX2C1",PMC64, MBOX2, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_0_CH_2, SNB_VALID_OPTIONS_MBOX},
+    {"MBOX2C2",PMC65, MBOX2, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_0_CH_2, SNB_VALID_OPTIONS_MBOX},
+    {"MBOX2C3",PMC66, MBOX2, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_0_CH_2, SNB_VALID_OPTIONS_MBOX},
+    {"MBOX2FIX", PMC67, MBOX2FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_0_CH_2, EVENT_OPTION_NONE_MASK},
+    {"MBOX3C0",PMC68, MBOX3, PCI_UNC_MC_PMON_CTL_0, PCI_UNC_MC_PMON_CTR_0_A, PCI_UNC_MC_PMON_CTR_0_B, PCI_IMC_DEVICE_0_CH_3, SNB_VALID_OPTIONS_MBOX},
+    {"MBOX3C1",PMC69, MBOX3, PCI_UNC_MC_PMON_CTL_1, PCI_UNC_MC_PMON_CTR_1_A, PCI_UNC_MC_PMON_CTR_1_B, PCI_IMC_DEVICE_0_CH_3, SNB_VALID_OPTIONS_MBOX},
+    {"MBOX3C2",PMC70, MBOX3, PCI_UNC_MC_PMON_CTL_2, PCI_UNC_MC_PMON_CTR_2_A, PCI_UNC_MC_PMON_CTR_2_B, PCI_IMC_DEVICE_0_CH_3, SNB_VALID_OPTIONS_MBOX},
+    {"MBOX3C3",PMC71, MBOX3, PCI_UNC_MC_PMON_CTL_3, PCI_UNC_MC_PMON_CTR_3_A, PCI_UNC_MC_PMON_CTR_3_B, PCI_IMC_DEVICE_0_CH_3, SNB_VALID_OPTIONS_MBOX},
+    {"MBOX3FIX", PMC72, MBOX3FIX, PCI_UNC_MC_PMON_FIXED_CTL, PCI_UNC_MC_PMON_FIXED_CTR_A, PCI_UNC_MC_PMON_FIXED_CTR_B, PCI_IMC_DEVICE_0_CH_3, EVENT_OPTION_NONE_MASK},
+    /* QPI counters: four per port, each 48 bits wide and read as two registers (A and B) */
+    {"SBOX0C0",PMC73, SBOX0, PCI_UNC_QPI_PMON_CTL_0, PCI_UNC_QPI_PMON_CTR_0_A, PCI_UNC_QPI_PMON_CTR_0_B, PCI_QPI_DEVICE_PORT_0, SNB_VALID_OPTIONS_SBOX},
+    {"SBOX0C1",PMC74, SBOX0, PCI_UNC_QPI_PMON_CTL_1, PCI_UNC_QPI_PMON_CTR_1_A, PCI_UNC_QPI_PMON_CTR_1_B, PCI_QPI_DEVICE_PORT_0, SNB_VALID_OPTIONS_SBOX},
+    {"SBOX0C2",PMC75, SBOX0, PCI_UNC_QPI_PMON_CTL_2, PCI_UNC_QPI_PMON_CTR_2_A, PCI_UNC_QPI_PMON_CTR_2_B, PCI_QPI_DEVICE_PORT_0, SNB_VALID_OPTIONS_SBOX},
+    {"SBOX0C3",PMC76, SBOX0, PCI_UNC_QPI_PMON_CTL_3, PCI_UNC_QPI_PMON_CTR_3_A, PCI_UNC_QPI_PMON_CTR_3_B, PCI_QPI_DEVICE_PORT_0, SNB_VALID_OPTIONS_SBOX},
+    {"SBOX0FIX", PMC77, SBOX0FIX, 0, PCI_UNC_QPI_RATE_STATUS, 0, PCI_QPI_MISC_DEVICE_PORT_0, EVENT_OPTION_NONE_MASK},
+    {"SBOX1C0",PMC78, SBOX1, PCI_UNC_QPI_PMON_CTL_0, PCI_UNC_QPI_PMON_CTR_0_A, PCI_UNC_QPI_PMON_CTR_0_B, PCI_QPI_DEVICE_PORT_1, SNB_VALID_OPTIONS_SBOX},
+    {"SBOX1C1",PMC79, SBOX1, PCI_UNC_QPI_PMON_CTL_1, PCI_UNC_QPI_PMON_CTR_1_A, PCI_UNC_QPI_PMON_CTR_1_B, PCI_QPI_DEVICE_PORT_1, SNB_VALID_OPTIONS_SBOX},
+    {"SBOX1C2",PMC80, SBOX1, PCI_UNC_QPI_PMON_CTL_2, PCI_UNC_QPI_PMON_CTR_2_A, PCI_UNC_QPI_PMON_CTR_2_B, PCI_QPI_DEVICE_PORT_1, SNB_VALID_OPTIONS_SBOX},
+    {"SBOX1C3",PMC81, SBOX1, PCI_UNC_QPI_PMON_CTL_3, PCI_UNC_QPI_PMON_CTR_3_A, PCI_UNC_QPI_PMON_CTR_3_B, PCI_QPI_DEVICE_PORT_1, SNB_VALID_OPTIONS_SBOX},
+    {"SBOX1FIX", PMC82, SBOX1FIX, 0, PCI_UNC_QPI_RATE_STATUS, 0, PCI_QPI_MISC_DEVICE_PORT_1, EVENT_OPTION_NONE_MASK},
+    /* BBOX or better known as Home Agent (HA) */
+    {"BBOX0",PMC83, BBOX0, PCI_UNC_HA_PMON_CTL_0, PCI_UNC_HA_PMON_CTR_0_A, PCI_UNC_HA_PMON_CTR_0_B, PCI_HA_DEVICE_0, SNB_VALID_OPTIONS_BBOX},
+    {"BBOX1",PMC84, BBOX0, PCI_UNC_HA_PMON_CTL_1, PCI_UNC_HA_PMON_CTR_1_A, PCI_UNC_HA_PMON_CTR_1_B, PCI_HA_DEVICE_0, SNB_VALID_OPTIONS_BBOX},
+    {"BBOX2",PMC85, BBOX0, PCI_UNC_HA_PMON_CTL_2, PCI_UNC_HA_PMON_CTR_2_A, PCI_UNC_HA_PMON_CTR_2_B, PCI_HA_DEVICE_0, SNB_VALID_OPTIONS_BBOX},
+    {"BBOX3",PMC86, BBOX0, PCI_UNC_HA_PMON_CTL_3, PCI_UNC_HA_PMON_CTR_3_A, PCI_UNC_HA_PMON_CTR_3_B, PCI_HA_DEVICE_0, SNB_VALID_OPTIONS_BBOX},
+    {"RBOX0C0", PMC87, RBOX0, PCI_UNC_R3QPI_PMON_CTL_0, PCI_UNC_R3QPI_PMON_CTR_0_A, PCI_UNC_R3QPI_PMON_CTR_0_B, PCI_R3QPI_DEVICE_LINK_0, SNB_VALID_OPTIONS_RBOX},
+    {"RBOX0C1", PMC88, RBOX0, PCI_UNC_R3QPI_PMON_CTL_1, PCI_UNC_R3QPI_PMON_CTR_1_A, PCI_UNC_R3QPI_PMON_CTR_1_B, PCI_R3QPI_DEVICE_LINK_0, SNB_VALID_OPTIONS_RBOX},
+    {"RBOX0C2", PMC89, RBOX0, PCI_UNC_R3QPI_PMON_CTL_2, PCI_UNC_R3QPI_PMON_CTR_2_A, PCI_UNC_R3QPI_PMON_CTR_2_B, PCI_R3QPI_DEVICE_LINK_0, SNB_VALID_OPTIONS_RBOX},
+    {"RBOX1C0", PMC90, RBOX1, PCI_UNC_R3QPI_PMON_CTL_0, PCI_UNC_R3QPI_PMON_CTR_0_A, PCI_UNC_R3QPI_PMON_CTR_0_B, PCI_R3QPI_DEVICE_LINK_1, SNB_VALID_OPTIONS_RBOX},
+    {"RBOX1C1", PMC91, RBOX1, PCI_UNC_R3QPI_PMON_CTL_1, PCI_UNC_R3QPI_PMON_CTR_1_A, PCI_UNC_R3QPI_PMON_CTR_1_B, PCI_R3QPI_DEVICE_LINK_1, SNB_VALID_OPTIONS_RBOX},
+    {"RBOX1C2", PMC92, RBOX1, PCI_UNC_R3QPI_PMON_CTL_2, PCI_UNC_R3QPI_PMON_CTR_2_A, PCI_UNC_R3QPI_PMON_CTR_2_B, PCI_R3QPI_DEVICE_LINK_1, SNB_VALID_OPTIONS_RBOX},
+    {"PBOX0", PMC93, PBOX, PCI_UNC_R2PCIE_PMON_CTL_0, PCI_UNC_R2PCIE_PMON_CTR_0_A, PCI_UNC_R2PCIE_PMON_CTR_0_B, PCI_R2PCIE_DEVICE, SNB_VALID_OPTIONS_PBOX},
+    {"PBOX1", PMC94, PBOX, PCI_UNC_R2PCIE_PMON_CTL_1, PCI_UNC_R2PCIE_PMON_CTR_1_A, PCI_UNC_R2PCIE_PMON_CTR_1_B, PCI_R2PCIE_DEVICE, SNB_VALID_OPTIONS_PBOX},
+    {"PBOX2", PMC95, PBOX, PCI_UNC_R2PCIE_PMON_CTL_2, PCI_UNC_R2PCIE_PMON_CTR_2_A, PCI_UNC_R2PCIE_PMON_CTR_2_B, PCI_R2PCIE_DEVICE, SNB_VALID_OPTIONS_PBOX},
+    {"PBOX3", PMC96, PBOX, PCI_UNC_R2PCIE_PMON_CTL_3, PCI_UNC_R2PCIE_PMON_CTR_3_A, PCI_UNC_R2PCIE_PMON_CTR_3_B, PCI_R2PCIE_DEVICE, SNB_VALID_OPTIONS_PBOX},
 };
 
+static RegisterMap sandybridge_counter_map[NUM_COUNTERS_SANDYBRIDGE] = {
+    /* Fixed Counters: instructions retired, cycles unhalted core */
+    {"FIXC0", PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0, EVENT_OPTION_COUNT_KERNEL_MASK},
+    {"FIXC1", PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0, EVENT_OPTION_COUNT_KERNEL_MASK},
+    {"FIXC2", PMC2, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR2, 0, 0, EVENT_OPTION_COUNT_KERNEL_MASK},
+    /* PMC Counters: 4 48bit wide */
+    {"PMC0", PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0, EVENT_OPTION_EDGE_MASK|EVENT_OPTION_COUNT_KERNEL_MASK},
+    {"PMC1", PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0, EVENT_OPTION_EDGE_MASK|EVENT_OPTION_COUNT_KERNEL_MASK},
+    {"PMC2", PMC5, PMC, MSR_PERFEVTSEL2, MSR_PMC2, 0, 0, EVENT_OPTION_EDGE_MASK|EVENT_OPTION_COUNT_KERNEL_MASK},
+    {"PMC3", PMC6, PMC, MSR_PERFEVTSEL3, MSR_PMC3, 0, 0, EVENT_OPTION_EDGE_MASK|EVENT_OPTION_COUNT_KERNEL_MASK},
+    /* Temperature sensor */
+    {"TMP0", PMC7, THERMAL, 0, IA32_THERM_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+    /* RAPL counters */
+    {"PWR0", PMC8, POWER, 0, MSR_PKG_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"PWR1", PMC9, POWER, 0, MSR_PP0_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"PWR2", PMC10, POWER, 0, MSR_PP1_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"PWR3", PMC11, POWER, 0, MSR_DRAM_ENERGY_STATUS, 0, 0, EVENT_OPTION_NONE_MASK}
+};
+
+static BoxMap sandybridgeEP_box_map[NUM_UNITS] = {
+    [PMC] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, 0, 0, MSR_DEV, 48},
+    [FIXED] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, 0, 0, MSR_DEV, 48},
+    [THERMAL] = {0, 0, 0, 0, 0, MSR_DEV, 8},
+    [POWER] = {0, 0, 0, 0, 0, MSR_DEV, 32},
+    [WBOX] = {MSR_UNC_PCU_PMON_BOX_CTL, 0, 0, 0, 0, MSR_DEV, 48},
+    [WBOX0FIX] = {0, 0, 0, 0, 0, MSR_DEV, 48},
+    [UBOX] = {0, MSR_UNC_U_PMON_BOX_STATUS, 0, 0, 0, MSR_DEV, 44},
+    [UBOXFIX] = {0, 0, 0, 0, 0, MSR_DEV, 44},
+    [CBOX0] = {MSR_UNC_C0_PMON_BOX_CTL, 0, 0, 0, 0, MSR_DEV, 44, MSR_UNC_C0_PMON_BOX_FILTER},
+    [CBOX1] = {MSR_UNC_C1_PMON_BOX_CTL, 0, 0, 0, 0, MSR_DEV, 44, MSR_UNC_C1_PMON_BOX_FILTER},
+    [CBOX2] = {MSR_UNC_C2_PMON_BOX_CTL, 0, 0, 0, 0, MSR_DEV, 44, MSR_UNC_C2_PMON_BOX_FILTER},
+    [CBOX3] = {MSR_UNC_C3_PMON_BOX_CTL, 0, 0, 0, 0, MSR_DEV, 44, MSR_UNC_C3_PMON_BOX_FILTER},
+    [CBOX4] = {MSR_UNC_C4_PMON_BOX_CTL, 0, 0, 0, 0, MSR_DEV, 44, MSR_UNC_C4_PMON_BOX_FILTER},
+    [CBOX5] = {MSR_UNC_C5_PMON_BOX_CTL, 0, 0, 0, 0, MSR_DEV, 44, MSR_UNC_C5_PMON_BOX_FILTER},
+    [CBOX6] = {MSR_UNC_C6_PMON_BOX_CTL, 0, 0, 0, 0, MSR_DEV, 44, MSR_UNC_C6_PMON_BOX_FILTER},
+    [CBOX7] = {MSR_UNC_C7_PMON_BOX_CTL, 0, 0, 0, 0, MSR_DEV, 44, MSR_UNC_C7_PMON_BOX_FILTER},
+    [MBOX0] = {PCI_UNC_MC_PMON_BOX_CTL, 0, 0, 0, 1, PCI_IMC_DEVICE_0_CH_0, 48},
+    [MBOX1] = {PCI_UNC_MC_PMON_BOX_CTL, 0, 0, 0, 1, PCI_IMC_DEVICE_0_CH_1, 48},
+    [MBOX2] = {PCI_UNC_MC_PMON_BOX_CTL, 0, 0, 0, 1, PCI_IMC_DEVICE_0_CH_2, 48},
+    [MBOX3] = {PCI_UNC_MC_PMON_BOX_CTL, 0, 0, 0, 1, PCI_IMC_DEVICE_0_CH_3, 48},
+    [MBOX0FIX] = {0, 0, 0, 0, 1, PCI_IMC_DEVICE_0_CH_0, 48},
+    [MBOX1FIX] = {0, 0, 0, 0, 1, PCI_IMC_DEVICE_0_CH_1, 48},
+    [MBOX2FIX] = {0, 0, 0, 0, 1, PCI_IMC_DEVICE_0_CH_2, 48},
+    [MBOX3FIX] = {0, 0, 0, 0, 1, PCI_IMC_DEVICE_0_CH_3, 48},
+    [BBOX0] = {PCI_UNC_HA_PMON_BOX_CTL, 0, 0, 0, 1, PCI_HA_DEVICE_0, 48},
+    [SBOX0] = {PCI_UNC_QPI_PMON_BOX_CTL, 0, 0, 0, 1, PCI_QPI_DEVICE_PORT_0, 48},
+    [SBOX1] = {PCI_UNC_QPI_PMON_BOX_CTL, 0, 0, 0, 1, PCI_QPI_DEVICE_PORT_1, 48},
+    [SBOX0FIX] = {0, 0, 0, 0, 1, PCI_QPI_MISC_DEVICE_PORT_0, 32},
+    [SBOX1FIX] = {0, 0, 0, 0, 1, PCI_QPI_MISC_DEVICE_PORT_1, 32},
+    [RBOX0] = {PCI_UNC_R3QPI_PMON_BOX_CTL, 0, 0, 0, 1, PCI_R3QPI_DEVICE_LINK_0, 44},
+    [RBOX1] = {PCI_UNC_R3QPI_PMON_BOX_CTL, 0, 0, 0, 1, PCI_R3QPI_DEVICE_LINK_1, 44},
+    [PBOX] = {PCI_UNC_R2PCIE_PMON_BOX_CTL, 0, 0, 0, 1, PCI_R2PCIE_DEVICE, 44},
+};
+
+static BoxMap sandybridge_box_map[NUM_UNITS] = {
+    [PMC] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, 0, 0, MSR_DEV, 48},
+    [FIXED] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, 0, 0, MSR_DEV, 48},
+    [THERMAL] = {0, 0, 0, 0, 0, MSR_DEV, 8},
+    [POWER] = {0, 0, 0, 0, 0, MSR_DEV, 32},
+};
+
+static PciDevice sandybridgeEP_pci_devices[MAX_NUM_PCI_DEVICES] = {
+ [MSR_DEV] = {NONE, "", "", ""},
+ [PCI_R3QPI_DEVICE_LINK_0] = {R3QPI, "13.5", "PCI_R3QPI_DEVICE_LINK_0", "RBOX0", 0x3c44},
+ [PCI_R3QPI_DEVICE_LINK_1] = {R3QPI, "13.6", "PCI_R3QPI_DEVICE_LINK_1", "RBOX1", 0x3c45},
+ [PCI_R2PCIE_DEVICE] = {R2PCIE, "13.1", "PCI_R2PCIE_DEVICE", "PBOX0", 0x3c43},
+ [PCI_IMC_DEVICE_0_CH_0] = {IMC, "10.0", "PCI_IMC_DEVICE_CH_0", "MBOX0", 0x3cb0},
+ [PCI_IMC_DEVICE_0_CH_1] = {IMC, "10.1", "PCI_IMC_DEVICE_CH_1", "MBOX1", 0x3cb1},
+ [PCI_IMC_DEVICE_0_CH_2] = {IMC, "10.4", "PCI_IMC_DEVICE_CH_2", "MBOX2", 0x3cb4},
+ [PCI_IMC_DEVICE_0_CH_3] = {IMC, "10.5", "PCI_IMC_DEVICE_CH_3", "MBOX3", 0x3cb5},
+ [PCI_HA_DEVICE_0] = {HA, "0e.1", "PCI_HA_DEVICE", "BBOX", 0x3c46},
+ [PCI_QPI_DEVICE_PORT_0] = {QPI, "08.2", "PCI_QPI_DEVICE_PORT_0", "SBOX0", 0x3c41},
+ [PCI_QPI_DEVICE_PORT_1] = {QPI, "09.2", "PCI_QPI_DEVICE_PORT_1", "SBOX1", 0x3c42},
+ [PCI_QPI_MASK_DEVICE_PORT_0] = {QPI, "08.6", "PCI_QPI_MASK_DEVICE_PORT_0", NULL, 0x3c86},
+ [PCI_QPI_MASK_DEVICE_PORT_1] = {QPI, "09.6", "PCI_QPI_MASK_DEVICE_PORT_1", NULL, 0x3c96},
+ [PCI_QPI_MISC_DEVICE_PORT_0] = {QPI, "08.0", "PCI_QPI_MISC_DEVICE_PORT_0", "SBOX0FIX",0x3c80},
+ [PCI_QPI_MISC_DEVICE_PORT_1] = {QPI, "09.0", "PCI_QPI_MISC_DEVICE_PORT_1", "SBOX1FIX", 0x3c91},
+};
 
diff --git a/src/includes/perfmon_sandybridge_events.txt b/src/includes/perfmon_sandybridge_events.txt
index ec4d397..8c13d65 100644
--- a/src/includes/perfmon_sandybridge_events.txt
+++ b/src/includes/perfmon_sandybridge_events.txt
@@ -1,16 +1,17 @@
 # =======================================================================================
-#  
+#
 #      Filename:  perfmon_sandybridge_events.txt
-# 
+#
 #      Description:  Event list for Intel SandyBridge
-# 
-#      Version:   3.1.3
-#      Released:  4.11.2014
-# 
-#      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+#
+#      Version:   4.0
+#      Released:  16.6.2015
+#
+#      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+#                Thomas Roehl (tr), thomas.roehl at googlemail.com
 #      Project:  likwid
 #
-#      Copyright (C) 2014 Jan Treibig
+#      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
 #
 #      This program is free software: you can redistribute it and/or modify it under
 #      the terms of the GNU General Public License as published by the Free Software
@@ -35,6 +36,9 @@ UMASK_PWR_PKG_ENERGY          0x00
 EVENT_PWR_PP0_ENERGY          0x00   PWR1
 UMASK_PWR_PP0_ENERGY          0x00
 
+EVENT_PWR_PP1_ENERGY          0x00   PWR2
+UMASK_PWR_PP1_ENERGY          0x00
+
 EVENT_PWR_DRAM_ENERGY          0x00   PWR3
 UMASK_PWR_DRAM_ENERGY          0x00
 
@@ -94,6 +98,7 @@ UMASK_INSTS_WRITTEN_TO_IQ_INSTS        0x01
 
 EVENT_L2_RQSTS                   0x24   PMC
 UMASK_L2_RQSTS_ALL_DEM_AND_DATA_RD_HIT 0x01
+UMASK_L2_RQSTS_ALL_DEM_AND_DATA_RD_MISS 0x02
 UMASK_L2_RQSTS_ALL_DEM_AND_DATA_RD 0x03
 UMASK_L2_RQSTS_RFO_HITS           0x04
 UMASK_L2_RQSTS_RFO_MISS          0x08
@@ -127,11 +132,11 @@ UMASK_CPU_CLOCK_UNHALTED_REF_P     0x01
 EVENT_L1D_PEND_MISS              0x48   PMC1
 UMASK_L1D_PEND_MISS_PENDING      0x01
 
-EVENT_DTLB_STORE_MISSES                0x49   PMC
-UMASK_DTLB_STORE_MISSES_MISS_CAUSES_A_WALK   0x01
-UMASK_DTLB_STORE_MISSES_WALK_COMPLETED       0x02
-UMASK_DTLB_STORE_MISSES_WALK_DURATION       0x04
-UMASK_DTLB_STORE_MISSES_STLB_HIT             0x10
+EVENT_DTLB_STORE_MISSES                    0x49   PMC
+UMASK_DTLB_STORE_MISSES_CAUSES_A_WALK      0x01
+UMASK_DTLB_STORE_MISSES_WALK_COMPLETED     0x02
+UMASK_DTLB_STORE_MISSES_WALK_DURATION      0x04
+UMASK_DTLB_STORE_MISSES_STLB_HIT           0x10
 
 EVENT_LOAD_HIT_PRE               0x4C    PMC
 UMASK_LOAD_HIT_PRE_SW_PF               0x01
@@ -398,9 +403,6 @@ UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HIT              0x02
 UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_HITM              0x04
 UMASK_MEM_LOAD_UOPS_LLC_HIT_RETIRED_XSNP_NON              0x08
 
-EVENT_MEM_TRANS_RETIRED_LOAD_LATENCY          0xCD  PMC
-UMASK_MEM_TRANS_RETIRED_LOAD_LATENCY           0x01
-
 EVENT_MEM_LOAD_UOPS_RETIRED          0xD1  PMC
 UMASK_MEM_LOAD_UOPS_RETIRED_LLC_HIT           0x04
 UMASK_MEM_LOAD_UOPS_RETIRED_LLC_MISS           0x20
@@ -415,103 +417,36 @@ EVENT_MEM_LOAD_UOPS_LLC_MISS_RETIRED          0xD3  PMC
 UMASK_MEM_LOAD_UOPS_LLC_MISS_RETIRED_LOCAL_DRAM           0x01
 UMASK_MEM_LOAD_UOPS_LLC_MISS_RETIRED_REMOTE_DRAM            0x04
 
-EVENT_DRAM_CLOCKTICKS             0x00  MBOX0FIX|MBOX1FIX|MBOX2FIX|MBOX3FIX
-UMASK_DRAM_CLOCKTICKS             0x00
-
-EVENT_ACT_COUNT                  0x01  MBOX
-UMASK_ACT_COUNT                  0x00
-
-EVENT_CAS_COUNT                  0x04  MBOX
-UMASK_CAS_COUNT_RD_REF           0x01
-UMASK_CAS_COUNT_RD_UNDERFILL     0x02
-UMASK_CAS_COUNT_RD               0x03
-UMASK_CAS_COUNT_WR_WMM           0x04
-UMASK_CAS_COUNT_WR_RMM           0x08
-UMASK_CAS_COUNT_WR               0x0C
-UMASK_CAS_COUNT_ALL              0x0F
-
-EVENT_DRAM_PRE_ALL                  0x06  MBOX
-UMASK_DRAM_PRE_ALL                  0x00
-
-EVENT_DRAM_REFRESH                  0x05  MBOX
-UMASK_DRAM_REFRESH_PANIC            0x02
-UMASK_DRAM_REFRESH_HIGH             0x04
-
-EVENT_ECC_CORRECTABLE_ERRORS           0x09  MBOX
-UMASK_ECC_CORRECTABLE_ERRORS           0x00
-
-EVENT_MAJOR_MODES                  0x07  MBOX
-UMASK_MAJOR_MODES_READ             0x01
-UMASK_MAJOR_MODES_WRITE            0x02
-UMASK_MAJOR_MODES_PARTIAL          0x04
-UMASK_MAJOR_MODES_ISOCH            0x08
-
-EVENT_POWER_CHANNEL_DLLOFF           0x84  MBOX
-UMASK_POWER_CHANNEL_DLLOFF           0x00
-
-EVENT_POWER_CHANNEL_PPD           0x85  MBOX
-UMASK_POWER_CHANNEL_PPD           0x00
-
-EVENT_POWER_CKE_CYCLES                  0x83  MBOX
-UMASK_POWER_CKE_CYCLES_RANK0            0x01
-UMASK_POWER_CKE_CYCLES_RANK1            0x02
-UMASK_POWER_CKE_CYCLES_RANK2            0x04
-UMASK_POWER_CKE_CYCLES_RANK3            0x08
-UMASK_POWER_CKE_CYCLES_RANK4            0x10
-UMASK_POWER_CKE_CYCLES_RANK5            0x20
-UMASK_POWER_CKE_CYCLES_RANK6            0x40
-UMASK_POWER_CKE_CYCLES_RANK7            0x80
-
-EVENT_POWER_CRITICAL_THROTTLE_CYCLES           0x86  MBOX
-UMASK_POWER_CRITICAL_THROTTLE_CYCLES           0x00
-
-EVENT_POWER_SELF_REFRESH           0x43  MBOX
-UMASK_POWER_SELF_REFRESH           0x00
-
-EVENT_POWER_THROTTLE_CYCLES                  0x41  MBOX
-UMASK_POWER_THROTTLE_CYCLES_RANK0            0x01
-UMASK_POWER_THROTTLE_CYCLES_RANK1            0x02
-UMASK_POWER_THROTTLE_CYCLES_RANK2            0x04
-UMASK_POWER_THROTTLE_CYCLES_RANK3            0x08
-UMASK_POWER_THROTTLE_CYCLES_RANK4            0x10
-UMASK_POWER_THROTTLE_CYCLES_RANK5            0x20
-UMASK_POWER_THROTTLE_CYCLES_RANK6            0x40
-UMASK_POWER_THROTTLE_CYCLES_RANK7            0x80
-
-EVENT_PREEMPTION           0x08  MBOX
-UMASK_PREEMPTION_RD_PREEMPT_RD           0x01
-UMASK_PREEMPTION_RD_PREEMPT_WR           0x02
-
-EVENT_PRE_COUNT           0x02  MBOX
-UMASK_PRE_COUNT_PAGE_MISS           0x01
-UMASK_PRE_COUNT_PAGE_CLOSE           0x02
-
-EVENT_RPQ_CYCLES_FULL           0x12  MBOX
-UMASK_RPQ_CYCLES_FULL           0x00
-
-EVENT_RPQ_CYCLES_NE           0x11  MBOX
-UMASK_RPQ_CYCLES_NE           0x00
-
-EVENT_RPQ_INSERTS           0x10  MBOX
-UMASK_RPQ_INSERTS           0x00
-
-EVENT_RPQ_OCCUPANCY           0x80  MBOX
-UMASK_RPQ_OCCUPANCY           0x00
-
-EVENT_WPQ_CYCLES_FULL           0x22  MBOX
-UMASK_WPQ_CYCLES_FULL           0x00
-
-EVENT_WPQ_CYCLES_NE           0x21  MBOX
-UMASK_WPQ_CYCLES_NE           0x00
-
-EVENT_WPQ_INSERTS           0x20  MBOX
-UMASK_WPQ_INSERTS           0x00
-
-EVENT_WPQ_OCCUPANCY           0x81  MBOX
-UMASK_WPQ_OCCUPANCY           0x00
-
-EVENT_WPQ_READ_HIT           0x23  MBOX
-UMASK_WPQ_READ_HIT           0x00
-
-EVENT_WPQ_WRITE_HIT           0x24  MBOX
-UMASK_WPQ_WRITE_HIT           0x00
+EVENT_OFFCORE_RESPONSE_0                            0xB7 PMC
+OPTIONS_OFFCORE_RESPONSE_0_OPTIONS                  EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_0_OPTIONS                    0x01 0xFF 0xFF
+UMASK_OFFCORE_RESPONSE_0_DMND_DATA_RD_ANY           0x01 0x00 0x10
+UMASK_OFFCORE_RESPONSE_0_DMND_RFO_ANY               0x01 0x01 0x10
+UMASK_OFFCORE_RESPONSE_0_DMND_CODE_RD_ANY           0x01 0x02 0x10
+UMASK_OFFCORE_RESPONSE_0_WB_ANY                     0x01 0x03 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_DATA_RD_ANY          0x01 0x04 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_RFO_ANY              0x01 0x05 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_CODE_RD_ANY          0x01 0x06 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_DATA_RD_ANY          0x01 0x07 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_RFO_ANY              0x01 0x08 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_CODE_RD_ANY          0x01 0x09 0x10
+UMASK_OFFCORE_RESPONSE_0_SPLIT_LOCK_UC_LOCK_ANY     0x01 0x0A 0x10
+UMASK_OFFCORE_RESPONSE_0_STREAMING_STORES_ANY       0x01 0x0B 0x10
+UMASK_OFFCORE_RESPONSE_0_OTHER_ANY                  0x01 0x0F 0x10
+
+EVENT_OFFCORE_RESPONSE_1                            0xBB PMC
+OPTIONS_OFFCORE_RESPONSE_1_OPTIONS                  EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_1_OPTIONS                    0x01 0xFF 0xFF
+UMASK_OFFCORE_RESPONSE_1_DMND_DATA_RD_ANY           0x01 0x00 0x10
+UMASK_OFFCORE_RESPONSE_1_DMND_RFO_ANY               0x01 0x01 0x10
+UMASK_OFFCORE_RESPONSE_1_DMND_CODE_RD_ANY           0x01 0x02 0x10
+UMASK_OFFCORE_RESPONSE_1_WB_ANY                     0x01 0x03 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_DATA_RD_ANY          0x01 0x04 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_RFO_ANY              0x01 0x05 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_CODE_RD_ANY          0x01 0x06 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_DATA_RD_ANY          0x01 0x07 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_RFO_ANY              0x01 0x08 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_CODE_RD_ANY          0x01 0x09 0x10
+UMASK_OFFCORE_RESPONSE_1_SPLIT_LOCK_UC_LOCK_ANY     0x01 0x0A 0x10
+UMASK_OFFCORE_RESPONSE_1_STREAMING_STORES_ANY       0x01 0x0B 0x10
+UMASK_OFFCORE_RESPONSE_1_OTHER_ANY                  0x01 0x0F 0x10
diff --git a/src/includes/perfmon_silvermont.h b/src/includes/perfmon_silvermont.h
index 9cfd6f1..1a3dbe7 100644
--- a/src/includes/perfmon_silvermont.h
+++ b/src/includes/perfmon_silvermont.h
@@ -3,15 +3,15 @@
  *
  *      Filename:  perfmon_silvermont.h
  *
- *      Description:  Header file of perfmon module for Intel Atom Silvermont
+ *      Description:  Header file of perfmon module for Intel Atom (Silvermont)
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -29,264 +29,484 @@
  */
  
 #include <perfmon_silvermont_events.h>
-#include <perfmon_silvermont_groups.h>
 #include <perfmon_silvermont_counters.h>
 
 static int perfmon_numCountersSilvermont = NUM_COUNTERS_SILVERMONT;
-static int perfmon_numGroupsSilvermont = NUM_GROUPS_SILVERMONT;
+static int perfmon_numCoreCountersSilvermont = NUM_COUNTERS_SILVERMONT;
 static int perfmon_numArchEventsSilvermont = NUM_ARCH_EVENTS_SILVERMONT;
 
 
-void perfmon_init_silvermont(PerfmonThread *thread)
+int perfmon_init_silvermont(int cpu_id)
 {
-    uint64_t flags = 0x0ULL;
-    int cpu_id = thread->processorId;
     lock_acquire((int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id);
-
-    /* Initialize registers */
-    msr_write(cpu_id, MSR_PERFEVTSEL0, 0x0ULL);
-    msr_write(cpu_id, MSR_PERFEVTSEL1, 0x0ULL);
-    msr_write(cpu_id, MSR_OFFCORE_RESP0, 0x0ULL);
-    msr_write(cpu_id, MSR_OFFCORE_RESP1, 0x0ULL);
-
-    msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
-    msr_write(cpu_id, MSR_PEBS_ENABLE, 0x0ULL);
+    lock_acquire((int*) &tile_lock[affinity_thread2tile_lookup[cpu_id]], cpu_id);
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PEBS_ENABLE, 0x0ULL));
+    return 0;
 }
 
-void perfmon_setupCounterThread_silvermont(
-        int thread_id,
-        PerfmonEvent* event,
-        PerfmonCounterIndex index)
+uint32_t svm_fixed_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
 {
-    int haveLock = 0;
-    uint64_t flags = 0x0ULL;
-    uint32_t uflags;
-    uint64_t reg = silvermont_counter_map[index].configRegister;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
-    uint64_t fixed_flags = msr_read(cpu_id, MSR_PERF_FIXED_CTR_CTRL);
-    uint64_t orig_fixed_flags = fixed_flags;
-    perfmon_threadData[thread_id].counters[index].init = TRUE;
-
-    switch (silvermont_counter_map[index].type)
+    uint32_t flags = (1ULL<<(1+(index*4)));
+    if (event->numberOfOptions > 0)
     {
-        case PMC:
-
-            flags = (1<<16)|(1<<22);
-            flags &= ~(0xFFFFU);   /* clear lower 16bits */
+        for(int i=0;i<event->numberOfOptions;i++)
+        {
+            switch(event->options[i].type)
+            {
+                case EVENT_OPTION_ANYTHREAD:
+                    flags |= (1ULL<<(2+(index*4)));
+                    break;
+                case EVENT_OPTION_COUNT_KERNEL:
+                    flags |= (1ULL<<(index*4));
+                    break;
+                default:
+                    break;
+            }
+        }
+    }
+    return flags;
+}
 
-            /* Intel with standard 8 bit event mask: [7:0] */
-            flags |= (event->umask<<8) + event->eventId;
+int svm_pmc_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+    uint64_t flags = 0x0ULL;
+    uint64_t offcore_flags = 0x0ULL;
 
 
+    flags |= (1ULL<<16)|(1ULL<<22);
+    flags |= (event->umask<<8) + event->eventId;
+    /* For event id 0xB7 the cmask must be written in an extra register */
+    if ((event->cmask != 0x00) && (event->eventId != 0xB7))
+    {
+        flags |= (event->cmask << 24);
+    }
+    /* set custom cfgbits */
+    if ((event->cfgBits != 0x00) && (event->eventId != 0xB7))
+    {
+        flags |= (event->cfgBits << 16);
+    }
 
-            if (perfmon_verbose)
+    if (event->numberOfOptions > 0)
+    {
+        for(int i=0;i<event->numberOfOptions;i++)
+        {
+            switch(event->options[i].type)
             {
-                printf("[%d] perfmon_setup_counter PMC: Write Register 0x%llX , Flags: 0x%llX \n",
-                        cpu_id,
-                        LLU_CAST reg,
-                        LLU_CAST flags);
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_ANYTHREAD:
+                    flags |= (1ULL<<21);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_COUNT_KERNEL:
+                    flags |= (1ULL<<17);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= (event->options[i].value & 0xFFULL)<<24;
+                    break;
+                case EVENT_OPTION_MATCH0:
+                    offcore_flags |= (event->options[i].value & 0xFFFFULL);
+                    break;
+                case EVENT_OPTION_MATCH1:
+                    if ((event->eventId == 0xB7) && (event->umask == 0x01))
+                    {
+                        offcore_flags |= (event->options[i].value & 0x768005ULL)<<16;
+                    }
+                    else if ((event->eventId == 0xB7) && (event->umask == 0x02))
+                    {
+                        offcore_flags |= (event->options[i].value & 0x368005ULL)<<16;
+                    }
+                    break;
+                default:
+                    break;
             }
-            msr_write(cpu_id, reg , flags);
-
-            // Offcore event with additional configuration register
-            // We included the additional register as counterRegister2
-            // to avoid creating a new data structure
-            // cfgBits contain offset of "request type" bit
-            // cmask contain offset of "response type" bit
-            if (event->eventId == 0xB7) 
+        }
+    }
+
+    // Offcore response event (id 0xB7): needs an additional configuration register.
+    // cfgBits contains the bit offset of the "request type" bit,
+    // cmask contains the bit offset of the "response type" bit.
+    if (event->eventId == 0xB7)
+    {
+        uint32_t reg = 0x0;
+        if (event->umask == 0x01)
+        {
+            reg = MSR_OFFCORE_RESP0;
+        }
+        else if (event->umask == 0x02)
+        {
+            reg = MSR_OFFCORE_RESP1;
+        }
+        if (reg)
+        {
+            if ((event->cfgBits != 0xFF) && (event->cmask != 0xFF))
             {
-                if (event->umask == 0x01)
-                {
-                    reg = MSR_OFFCORE_RESP0;
-                }
-                else if (event->umask == 0x02)
-                {
-                    reg = MSR_OFFCORE_RESP1;
-                }
-                flags = 0x0ULL;
-                flags = (1<<event->cfgBits)|(1<<event->cmask);
-                msr_write(cpu_id, reg , flags);
+                offcore_flags = (1ULL<<event->cfgBits)|(1ULL<<event->cmask);
             }
+            VERBOSEPRINTREG(cpu_id, reg, LLU_CAST offcore_flags, SETUP_PMC_OFFCORE);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg , offcore_flags));
+        }
+    }
+    VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_PMC)
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+    return 0;
+}
 
-            break;
+int perfmon_setupCountersThread_silvermont(
+        int thread_id,
+        PerfmonEventSet* eventSet)
+{
+    int haveLock = 0;
+    uint64_t flags = 0x0ULL;
+    uint64_t fixed_flags = 0x0ULL;
+    int cpu_id = groupSet->threads[thread_id].processorId;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+    {
+        haveLock = 1;
+    }
 
-        case FIXED:
-            fixed_flags |= (2ULL<<(index*4));
-            break;
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)))
+    {
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_FIXED_CTR_CTRL, 0x0ULL));
+    }
 
-        case POWER:
-            break;
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        RegisterType type = eventSet->events[i].type;
+        if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+        {
+            continue;
+        }
+        flags = 0x0ULL;
+        RegisterIndex index = eventSet->events[i].index;
+        PerfmonEvent *event = &(eventSet->events[i].event);
+        eventSet->events[i].threadCounter[thread_id].init = TRUE;
+        switch (type)
+        {
+            case PMC:
+                svm_pmc_setup(cpu_id, index, event);
+                break;
+
+            case FIXED:
+                fixed_flags |= svm_fixed_setup(cpu_id, index, event);
+                break;
 
-        default:
-            /* should never be reached */
-            break;
+            case POWER:
+                break;
+
+            default:
+                break;
+        }
     }
-    if (fixed_flags != orig_fixed_flags)
+    if (fixed_flags > 0x0)
     {
-        msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, fixed_flags);
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_FIXED_CTR_CTRL, LLU_CAST fixed_flags, SETUP_FIXED)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_FIXED_CTR_CTRL, fixed_flags));
     }
+    return 0;
 }
 
 
-void perfmon_startCountersThread_silvermont(int thread_id)
+/* Reset the event set's PMC/FIXED counters on this thread's CPU, latch the POWER start
+ * value when the CPU holds the socket lock, then enable counting via MSR_PERF_GLOBAL_CTRL. */
+int perfmon_startCountersThread_silvermont(int thread_id, PerfmonEventSet* eventSet)
 {
     int haveLock = 0;
+    uint64_t tmp;
     uint64_t flags = 0x0ULL;
-    uint32_t uflags = 0x10000UL; /* Clear freeze bit */
-    uint64_t fixed_flags = 0x0ULL;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
     {
         haveLock = 1;
     }
 
-    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
-
-    for ( int i=0; i<perfmon_numCountersSilvermont; i++ )
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        if (perfmon_threadData[thread_id].counters[i].init == TRUE)
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
         {
-            switch (silvermont_counter_map[i].type)
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+            {
+                continue;
+            }
+            tmp = 0x0ULL;
+            RegisterIndex index = eventSet->events[i].index;
+            uint64_t counter1 = counter_map[index].counterRegister;
+            switch (type)
             {
                 case PMC:
-                    msr_write(cpu_id, silvermont_counter_map[i].counterRegister, 0x0ULL);
-                    flags |= (1<<(i-OFFSET_PMC));  /* enable counter */
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter1, 0x0ULL));
+                    flags |= (1ULL<<(index-cpuid_info.perf_num_fixed_ctr));  /* enable counter */
                     break;
 
                 case FIXED:
-                    msr_write(cpu_id, silvermont_counter_map[i].counterRegister, 0x0ULL);
-                    flags |= (1ULL<<(i+32));  /* enable fixed counter */
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter1, 0x0ULL));
+                    flags |= (1ULL<<(index+32));  /* enable fixed counter */
                     break;
 
                 case POWER:
                     if(haveLock)
                     {
-                        perfmon_threadData[thread_id].counters[i].counterData =
-                            power_read(cpu_id, silvermont_counter_map[i].counterRegister);
+                        CHECK_POWER_READ_ERROR(power_read(cpu_id, counter1, (uint32_t*)&tmp));
+                        eventSet->events[i].threadCounter[thread_id].startData = field64(tmp, 0, box_map[type].regWidth);
                     }
-
                     break;
 
                 default:
-                    /* should never be reached */
                     break;
             }
         }
     }
 
-    if (perfmon_verbose)
-    {
-        printf("perfmon_start_counters: Write Register 0x%X , \
-                Flags: 0x%llX \n",MSR_PERF_GLOBAL_CTRL, LLU_CAST flags);
-        printf("perfmon_start_counters: Write Register 0x%X , \
-                Flags: 0x%llX \n",MSR_UNCORE_PERF_GLOBAL_CTRL, LLU_CAST uflags);
-    }
-    if (flags != 0x0ULL)
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
     {
-        msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, flags);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, flags)); /* presumably clears stale overflow bits (write-1-to-clear) — confirm in Intel SDM */
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST flags, UNFREEZE_PMC_OR_FIXED)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, flags));
     }
+
+    return 0;
 }
 
 
-void perfmon_stopCountersThread_silvermont(int thread_id)
+int perfmon_stopCountersThread_silvermont(int thread_id, PerfmonEventSet* eventSet)
 {
-    uint64_t flags;
-    uint32_t uflags = 0x10100UL; /* Set freeze bit */
     uint64_t counter_result = 0x0ULL;
     int haveLock = 0;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
     {
         haveLock = 1;
     }
 
-    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL, FREEZE_PMC_OR_FIXED)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL)); /* stop PMC/FIXED counting before the reads below */
+    }
 
-    for ( int i=0; i < perfmon_numCountersSilvermont; i++ ) 
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        if (perfmon_threadData[thread_id].counters[i].init == TRUE) 
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE) 
         {
-            switch (silvermont_counter_map[i].type)
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+            {
+                continue;
+            }
+            counter_result = 0x0ULL;
+            RegisterIndex index = eventSet->events[i].index;
+            uint64_t counter1 = counter_map[index].counterRegister;
+            switch (type)
             {
                 case PMC:
-
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                    if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData) /* below the previously stored value: check for a wrap */
+                    {
+                        uint64_t ovf_values = 0x0ULL;
+                        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_STATUS, &ovf_values));
+                        if (ovf_values & (1ULL<<(index - cpuid_info.perf_num_fixed_ctr)))
+                        {
+                            eventSet->events[i].threadCounter[thread_id].overflows++;
+                            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL,
+                                                    (1ULL<<(index - cpuid_info.perf_num_fixed_ctr))));
+                        }
+                    }
+                    break;
                 case FIXED:
-                    perfmon_threadData[thread_id].counters[i].counterData =
-                        (double)msr_read(cpu_id, silvermont_counter_map[i].counterRegister);
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                    if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData) /* same wrap check as PMC, using status bit index+32 */
+                    {
+                        uint64_t ovf_values = 0x0ULL;
+                        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_STATUS, &ovf_values));
+                        if (ovf_values & (1ULL<<(index + 32)))
+                        {
+                            eventSet->events[i].threadCounter[thread_id].overflows++;
+                            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, (1ULL<<(index + 32))));
+                        }
+                    }
                     break;
 
                 case POWER:
                     if(haveLock)
                     {
-                        perfmon_threadData[thread_id].counters[i].counterData =
-                            power_info.energyUnit *
-                            ( power_read(cpu_id, silvermont_counter_map[i].counterRegister) -
-                              perfmon_threadData[thread_id].counters[i].counterData);
+                        CHECK_POWER_READ_ERROR(power_read(cpu_id, counter1, (uint32_t*)&counter_result));
+                        if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData) /* no status MSR consulted: a smaller value counts as one wrap */
+                        {
+                            eventSet->events[i].threadCounter[thread_id].overflows++;
+                        }
                     }
                     break;
 
                 case THERMAL:
-                        perfmon_threadData[thread_id].counters[i].counterData =
-                             thermal_read(cpu_id);
+                    CHECK_TEMP_READ_ERROR(thermal_read(cpu_id, (uint32_t*)&counter_result));
                     break;
 
                 default:
-                    /* should never be reached */
                     break;
             }
+            eventSet->events[i].threadCounter[thread_id].counterData = field64(counter_result, 0, box_map[type].regWidth); /* runs for every type; stores 0 when nothing was read. field64 presumably keeps the low regWidth bits — confirm */
         }
     }
-
-    flags = msr_read(cpu_id,MSR_PERF_GLOBAL_STATUS);
-    //    printf ("Status: 0x%llX \n", LLU_CAST flags);
-    if ( (flags & 0x3) || (flags & (0x3ULL<<32)) ) 
-    {
-        printf ("Overflow occured \n");
-    }
+    return 0;
 }
 
-void perfmon_readCountersThread_silvermont(int thread_id)
+int perfmon_readCountersThread_silvermont(int thread_id, PerfmonEventSet* eventSet)
 {
     uint64_t counter_result = 0x0ULL;
+    uint64_t pmc_flags = 0x0ULL;
     int haveLock = 0;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
     {
         haveLock = 1;
     }
 
-    for ( int i=0; i<perfmon_numCountersSilvermont; i++ )
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
     {
-        if (perfmon_threadData[thread_id].counters[i].init == TRUE)
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, &pmc_flags)); /* save the current enable mask; it is written back after the reads */
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL, FREEZE_PMC_OR_FIXED)
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+    }
+
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
         {
-            if ((silvermont_counter_map[i].type == PMC) ||
-                    (silvermont_counter_map[i].type == FIXED))
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
             {
-                perfmon_threadData[thread_id].counters[i].counterData =
-                    msr_read(cpu_id, silvermont_counter_map[i].counterRegister);
+                continue;
             }
-            else
+            counter_result = 0x0ULL;
+            RegisterIndex index = eventSet->events[i].index;
+            uint64_t counter1 = counter_map[index].counterRegister;
+            switch (type)
             {
-                if(haveLock)
+                case PMC:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                    if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData) /* below the previously stored value: check for a wrap */
+                    {
+                        uint64_t ovf_values = 0x0ULL;
+                        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_STATUS, &ovf_values));
+                        if (ovf_values & (1ULL<<(index - cpuid_info.perf_num_fixed_ctr)))
+                        {
+                            eventSet->events[i].threadCounter[thread_id].overflows++;
+                            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL,
+                                                    (1ULL<<(index - cpuid_info.perf_num_fixed_ctr))));
+                        }
+                    }
+                    break;
+                case FIXED:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                    if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData) /* same wrap check as PMC, using status bit index+32 */
+                    {
+                        uint64_t ovf_values = 0x0ULL;
+                        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_STATUS, &ovf_values));
+                        if (ovf_values & (1ULL<<(index + 32)))
+                        {
+                            eventSet->events[i].threadCounter[thread_id].overflows++;
+                            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, (1ULL<<(index + 32))));
+                        }
+                    }
+                    break;
+
+                case POWER:
+                    if(haveLock)
+                    {
+                        CHECK_POWER_READ_ERROR(power_read(cpu_id, counter1, (uint32_t*)&counter_result));
+                        if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData) /* no status MSR consulted: a smaller value counts as one wrap */
+                        {
+                            eventSet->events[i].threadCounter[thread_id].overflows++;
+                        }
+                    }
+                    break;
+
+                case THERMAL:
+                    CHECK_TEMP_READ_ERROR(thermal_read(cpu_id, (uint32_t*)&counter_result));
+                    break;
+
+                default:
+                    break;
+            }
+            eventSet->events[i].threadCounter[thread_id].counterData = field64(counter_result, 0, box_map[type].regWidth); /* runs for every type; stores 0 when nothing was read. field64 presumably keeps the low regWidth bits — confirm */
+        }
+    }
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
+    {
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, pmc_flags)); /* restore the enable mask saved before the reads */
+    }
+    return 0;
+}
+
+/* Zero each selected event's config register, mark it uninitialized, then write 0 to GLOBAL_CTRL and the collected bits to GLOBAL_OVF_CTRL. */
+int perfmon_finalizeCountersThread_silvermont(int thread_id, PerfmonEventSet* eventSet)
+{
+    int haveLock = 0;
+    int haveTileLock = 0;
+    int cpu_id = groupSet->threads[thread_id].processorId;
+    uint64_t ovf_values_core = (1ULL<<63)|(1ULL<<62); /* bits 63 and 62 are always included; per-counter bits are OR-ed in below */
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+    {
+        haveLock = 1;
+    }
+    if (tile_lock[affinity_thread2tile_lookup[cpu_id]] == cpu_id)
+    {
+        haveTileLock = 1;
+    }
+
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        RegisterType type = eventSet->events[i].type;
+        if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+        {
+            continue;
+        }
+        RegisterIndex index = eventSet->events[i].index;
+        PerfmonEvent *event = &(eventSet->events[i].event);
+        uint64_t reg = counter_map[index].configRegister;
+        PciDeviceIndex dev = counter_map[index].device;
+        switch (type)
+        {
+            case PMC:
+                ovf_values_core |= (1ULL<<(index-cpuid_info.perf_num_fixed_ctr));
+                if ((haveTileLock) && (event->eventId == 0xB7)) /* 0xB7 is EVENT_OFFCORE_RESPONSE in the events file; umask picks RESP0 or RESP1 */
                 {
-                    switch (silvermont_counter_map[i].type)
+                    if (event->umask == 0x1)
                     {
-                        case POWER:
-                            perfmon_threadData[thread_id].counters[i].counterData =
-                                power_info.energyUnit *
-                                power_read(cpu_id, silvermont_counter_map[i].counterRegister);
-                            break;
-
-                        default:
-                            /* should never be reached */
-                            break;
+                        VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP0, 0x0ULL, CLEAR_OFFCORE_RESP0);
+                        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP0, 0x0ULL));
+                    }
+                    else if (event->umask == 0x2)
+                    {
+                        VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP1, 0x0ULL, CLEAR_OFFCORE_RESP1);
+                        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP1, 0x0ULL));
                     }
                 }
-            }
+                break;
+            case FIXED:
+                ovf_values_core |= (1ULL<<(index+32));
+                break;
+            default:
+                break;
+        }
+        if ((reg) && ((dev == MSR_DEV) || (haveLock))) /* skip events without a config register; non-MSR devices only on the socket-lock holder */
+        {
+            VERBOSEPRINTPCIREG(cpu_id, dev, reg, 0x0ULL, CLEAR_CTL);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, reg, 0x0ULL));
         }
+        eventSet->events[i].threadCounter[thread_id].init = FALSE;
     }
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL)); /* these two writes run even when the set holds no PMC/FIXED event */
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, ovf_values_core));
+    return 0;
 }
diff --git a/src/includes/perfmon_silvermont_counters.h b/src/includes/perfmon_silvermont_counters.h
index 266ee4b..467ebee 100644
--- a/src/includes/perfmon_silvermont_counters.h
+++ b/src/includes/perfmon_silvermont_counters.h
@@ -3,15 +3,15 @@
  *
  *      Filename:  perfmon_silvermont_counters.h
  *
- *      Description: Counter header file of perfmon module for Silvermont.
+ *      Description: Counter header file of perfmon module for Intel Atom (Silvermont)
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -27,24 +27,33 @@
  *
  * =======================================================================================
  */
+#include <registers.h>
 
 #define NUM_COUNTERS_CORE_SILVERMONT 6
 #define NUM_COUNTERS_UNCORE_SILVERMONT 0
 #define NUM_COUNTERS_SILVERMONT 8
 
-static PerfmonCounterMap silvermont_counter_map[NUM_COUNTERS_SILVERMONT] = {
+#define SVM_VALID_OPTIONS_FIXED (EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_ANYTHREAD_MASK) /* parenthesized so the mask stays intact next to & or ~ */
+#define SVM_VALID_OPTIONS_PMC (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_ANYTHREAD_MASK|EVENT_OPTION_THRESHOLD_MASK)
+
+static RegisterMap silvermont_counter_map[NUM_COUNTERS_SILVERMONT] = {
     /* Fixed Counters: instructions retired, cycles unhalted core */
-    {"FIXC0", PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0},
-    {"FIXC1", PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0},
-    {"FIXC2", PMC2, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR2, 0, 0},
+    {"FIXC0", PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0, SVM_VALID_OPTIONS_FIXED}, /* all three fixed counters share one config register */
+    {"FIXC1", PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0, SVM_VALID_OPTIONS_FIXED},
+    {"FIXC2", PMC2, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR2, 0, 0, SVM_VALID_OPTIONS_FIXED},
     /* PMC Counters: 4 48bit wide */
-    {"PMC0", PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0},
-    {"PMC1", PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0},
+    {"PMC0", PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, MSR_OFFCORE_RESP0, 0, SVM_VALID_OPTIONS_PMC}, /* sixth field now names an offcore-response MSR; its struct member is not visible here */
+    {"PMC1", PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, MSR_OFFCORE_RESP1, 0, SVM_VALID_OPTIONS_PMC},
     /* Temperature Sensor*/
-    {"TMP0", PMC5, THERMAL, 0, 0, 0, 0},
+    {"TMP0", PMC5, THERMAL, 0, IA32_THERM_STATUS, 0, 0}, /* eighth initializer omitted, so that member is zero-initialized */
     /* RAPL counters */
     {"PWR0", PMC6, POWER, 0, MSR_PKG_ENERGY_STATUS, 0, 0},
-    {"PWR1", PMC7, POWER, 0, MSR_PP0_ENERGY_STATUS, 0, 0},
+    {"PWR1", PMC7, POWER, 0, MSR_PP0_ENERGY_STATUS, 0, 0} /* eighth initializer omitted here as well */
 };
 
-
+static BoxMap silvermont_box_map[NUM_UNITS] = { /* indexed by register type; types without an entry are zero-initialized */
+    [PMC] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, -1, 0, 0, 48}, /* last value is presumably regWidth in bits — confirm against the BoxMap definition */
+    [FIXED] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, -1, 0, 0, 48},
+    [THERMAL] = {0, 0, 0, 0, 0, MSR_DEV, 8},
+    [POWER] = {0, 0, 0, 0, 0, MSR_DEV, 32}
+};
diff --git a/src/includes/perfmon_silvermont_events.txt b/src/includes/perfmon_silvermont_events.txt
index b8a088d..8105639 100644
--- a/src/includes/perfmon_silvermont_events.txt
+++ b/src/includes/perfmon_silvermont_events.txt
@@ -1,16 +1,16 @@
 # =======================================================================================
-#  
+#
 #      Filename:  perfmon_silvermont_events.txt
-# 
+#
 #      Description:  Event list for Intel Atom (Silvermont)
-# 
-#      Version:   3.1.3
-#      Released:  4.11.2014
-# 
-#      Author:  Thomas Roehl (tr), thomas.roehl at googlemail.com
+#
+#      Version:   4.0
+#      Released:  16.6.2015
+#
+#      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
 #      Project:  likwid
 #
-#      Copyright (C) 2014 Jan Treibig
+#      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
 #
 #      This program is free software: you can redistribute it and/or modify it under
 #      the terms of the GNU General Public License as published by the Free Software
@@ -64,12 +64,16 @@ UMASK_MEM_UOPS_RETIRED_HITM         0x20
 UMASK_MEM_UOPS_RETIRED_ALL_LOADS    0x40
 UMASK_MEM_UOPS_RETIRED_ALL_STORES   0x80
 
-EVENT_PAGE_WALKS                    0x05 PMC
-UMASK_PAGE_WALKS_D_SIDE_CYCLES      0x01
-UMASK_PAGE_WALKS_I_SIDE_CYCLES      0x02
-UMASK_PAGE_WALKS_WALKS              0x03
+EVENT_PAGE_WALKS                    0x05  PMC
+UMASK_PAGE_WALKS_DTLB_COUNT         0x01 0x04 0x00
+UMASK_PAGE_WALKS_DTLB_CYCLES        0x01
+UMASK_PAGE_WALKS_ITLB_COUNT         0x02 0x04 0x00
+UMASK_PAGE_WALKS_ITLB_CYCLES        0x02
+UMASK_PAGE_WALKS_COUNT              0x03 0x04 0x00
+UMASK_PAGE_WALKS_CYCLES             0x03
+
 
-EVENT_LONGEST_LAT_CACHE             0x2E PMC
+EVENT_LONGEST_LAT_CACHE             0x2E  PMC
 UMASK_LONGEST_LAT_CACHE_MISS        0x41
 UMASK_LONGEST_LAT_CACHE_REFERENCE   0x4F
 
@@ -83,305 +87,15 @@ EVENT_CPU_CLK_UNHALTED              0x3C PMC
 UMASK_CPU_CLK_UNHALTED_CORE_P       0x00
 UMASK_CPU_CLK_UNHALTED_REF_P        0x01
 
-EVENT_ICACHE                        0x80 PMC
-UMASK_ICACHE_HIT                    0x01
-UMASK_ICACHE_MISSES                 0x02
-UMASK_ICACHE_ACCESSES               0x03
-UMASK_ICACHE_IFETCH_STALL           0x04
+EVENT_ICACHE                  0x80   PMC
+UMASK_ICACHE_HITS             0x01
+UMASK_ICACHE_MISSES             0x02
+UMASK_ICACHE_ACCESSES           0x03
+UMASK_ICACHE_IFETCH_STALL       0x04
 
 EVENT_NIP_STALL                     0xB6 PMC
 UMASK_NIP_STALL_ICACHE_MISS         0x04
 
-EVENT_OFFCORE_RESPONSE              0xB7 PMC
-UMASK_OFFCORE_RESPONSE_0_DMND_DATA_RD_ANY           0x01 0x00 0x10
-UMASK_OFFCORE_RESPONSE_0_DMND_DATA_RD_L2_HIT        0x01 0x00 0x12
-UMASK_OFFCORE_RESPONSE_0_DMND_DATA_RD_SNP_NONE      0x01 0x00 0x1F
-UMASK_OFFCORE_RESPONSE_0_DMND_DATA_RD_SNOOP_MISS    0x01 0x00 0x21
-UMASK_OFFCORE_RESPONSE_0_DMND_DATA_RD_SNOOP_HIT     0x01 0x00 0x22
-UMASK_OFFCORE_RESPONSE_0_DMND_DATA_RD_HITM          0x01 0x00 0x24
-UMASK_OFFCORE_RESPONSE_0_DMND_DATA_RD_NON_DRAM      0x01 0x00 0x25
-UMASK_OFFCORE_RESPONSE_0_DMND_DATA_RD_AVG_LAT       0x01 0x00 0x26
-
-UMASK_OFFCORE_RESPONSE_0_DMND_RFO_ANY           0x01 0x01 0x10
-UMASK_OFFCORE_RESPONSE_0_DMND_RFO_L2_HIT        0x01 0x01 0x12
-UMASK_OFFCORE_RESPONSE_0_DMND_RFO_SNP_NONE      0x01 0x01 0x1F
-UMASK_OFFCORE_RESPONSE_0_DMND_RFO_SNOOP_MISS    0x01 0x01 0x21
-UMASK_OFFCORE_RESPONSE_0_DMND_RFO_SNOOP_HIT     0x01 0x01 0x22
-UMASK_OFFCORE_RESPONSE_0_DMND_RFO_HITM          0x01 0x01 0x24
-UMASK_OFFCORE_RESPONSE_0_DMND_RFO_NON_DRAM      0x01 0x01 0x25
-UMASK_OFFCORE_RESPONSE_0_DMND_RFO_AVG_LAT       0x01 0x01 0x26
-
-UMASK_OFFCORE_RESPONSE_0_DMND_IFETCH_ANY           0x01 0x02 0x10
-UMASK_OFFCORE_RESPONSE_0_DMND_IFETCH_L2_HIT        0x01 0x02 0x12
-UMASK_OFFCORE_RESPONSE_0_DMND_IFETCH_SNP_NONE      0x01 0x02 0x1F
-UMASK_OFFCORE_RESPONSE_0_DMND_IFETCH_SNOOP_MISS    0x01 0x02 0x21
-UMASK_OFFCORE_RESPONSE_0_DMND_IFETCH_SNOOP_HIT     0x01 0x02 0x22
-UMASK_OFFCORE_RESPONSE_0_DMND_IFETCH_HITM          0x01 0x02 0x24
-UMASK_OFFCORE_RESPONSE_0_DMND_IFETCH_NON_DRAM      0x01 0x02 0x25
-UMASK_OFFCORE_RESPONSE_0_DMND_IFETCH_AVG_LAT       0x01 0x02 0x26
-
-UMASK_OFFCORE_RESPONSE_0_WB_ANY           0x01 0x03 0x10
-UMASK_OFFCORE_RESPONSE_0_WB_L2_HIT        0x01 0x03 0x12
-UMASK_OFFCORE_RESPONSE_0_WB_SNP_NONE      0x01 0x03 0x1F
-UMASK_OFFCORE_RESPONSE_0_WB_SNOOP_MISS    0x01 0x03 0x21
-UMASK_OFFCORE_RESPONSE_0_WB_SNOOP_HIT     0x01 0x03 0x22
-UMASK_OFFCORE_RESPONSE_0_WB_HITM          0x01 0x03 0x24
-UMASK_OFFCORE_RESPONSE_0_WB_NON_DRAM      0x01 0x03 0x25
-UMASK_OFFCORE_RESPONSE_0_WB_AVG_LAT       0x01 0x03 0x26
-
-UMASK_OFFCORE_RESPONSE_0_PF_DATA_RD_ANY           0x01 0x04 0x10
-UMASK_OFFCORE_RESPONSE_0_PF_DATA_RD_L2_HIT        0x01 0x04 0x12
-UMASK_OFFCORE_RESPONSE_0_PF_DATA_RD_SNP_NONE      0x01 0x04 0x1F
-UMASK_OFFCORE_RESPONSE_0_PF_DATA_RD_SNOOP_MISS    0x01 0x04 0x21
-UMASK_OFFCORE_RESPONSE_0_PF_DATA_RD_SNOOP_HIT     0x01 0x04 0x22
-UMASK_OFFCORE_RESPONSE_0_PF_DATA_RD_HITM          0x01 0x04 0x24
-UMASK_OFFCORE_RESPONSE_0_PF_DATA_RD_NON_DRAM      0x01 0x04 0x25
-UMASK_OFFCORE_RESPONSE_0_PF_DATA_RD_AVG_LAT       0x01 0x04 0x26
-
-UMASK_OFFCORE_RESPONSE_0_PF_RFO_ANY           0x01 0x05 0x10
-UMASK_OFFCORE_RESPONSE_0_PF_RFO_L2_HIT        0x01 0x05 0x12
-UMASK_OFFCORE_RESPONSE_0_PF_RFO_SNP_NONE      0x01 0x05 0x1F
-UMASK_OFFCORE_RESPONSE_0_PF_RFO_SNOOP_MISS    0x01 0x05 0x21
-UMASK_OFFCORE_RESPONSE_0_PF_RFO_SNOOP_HIT     0x01 0x05 0x22
-UMASK_OFFCORE_RESPONSE_0_PF_RFO_HITM          0x01 0x05 0x24
-UMASK_OFFCORE_RESPONSE_0_PF_RFO_NON_DRAM      0x01 0x05 0x25
-UMASK_OFFCORE_RESPONSE_0_PF_RFO_AVG_LAT       0x01 0x05 0x26
-
-UMASK_OFFCORE_RESPONSE_0_PF_IFETCH_ANY           0x01 0x06 0x10
-UMASK_OFFCORE_RESPONSE_0_PF_IFETCH_L2_HIT        0x01 0x06 0x12
-UMASK_OFFCORE_RESPONSE_0_PF_IFETCH_SNP_NONE      0x01 0x06 0x1F
-UMASK_OFFCORE_RESPONSE_0_PF_IFETCH_SNOOP_MISS    0x01 0x06 0x21
-UMASK_OFFCORE_RESPONSE_0_PF_IFETCH_SNOOP_HIT     0x01 0x06 0x22
-UMASK_OFFCORE_RESPONSE_0_PF_IFETCH_HITM          0x01 0x06 0x24
-UMASK_OFFCORE_RESPONSE_0_PF_IFETCH_NON_DRAM      0x01 0x06 0x25
-UMASK_OFFCORE_RESPONSE_0_PF_IFETCH_AVG_LAT       0x01 0x06 0x26
-
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_READ_ANY           0x01 0x07 0x10
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_READ_L2_HIT        0x01 0x07 0x12
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_READ_SNP_NONE      0x01 0x07 0x1F
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_READ_SNOOP_MISS    0x01 0x07 0x21
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_READ_SNOOP_HIT     0x01 0x07 0x22
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_READ_HITM          0x01 0x07 0x24
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_READ_NON_DRAM      0x01 0x07 0x25
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_READ_AVG_LAT       0x01 0x07 0x26
-
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_WRITE_ANY           0x01 0x08 0x10
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_WRITE_L2_HIT        0x01 0x08 0x12
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_WRITE_SNP_NONE      0x01 0x08 0x1F
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_WRITE_SNOOP_MISS    0x01 0x08 0x21
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_WRITE_SNOOP_HIT     0x01 0x08 0x22
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_WRITE_HITM          0x01 0x08 0x24
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_WRITE_NON_DRAM      0x01 0x08 0x25
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_WRITE_AVG_LAT       0x01 0x08 0x26
-
-UMASK_OFFCORE_RESPONSE_0_UC_IFETCH_ANY           0x01 0x09 0x10
-UMASK_OFFCORE_RESPONSE_0_UC_IFETCH_L2_HIT        0x01 0x09 0x12
-UMASK_OFFCORE_RESPONSE_0_UC_IFETCH_SNP_NONE      0x01 0x09 0x1F
-UMASK_OFFCORE_RESPONSE_0_UC_IFETCH_SNOOP_MISS    0x01 0x09 0x21
-UMASK_OFFCORE_RESPONSE_0_UC_IFETCH_SNOOP_HIT     0x01 0x09 0x22
-UMASK_OFFCORE_RESPONSE_0_UC_IFETCH_HITM          0x01 0x09 0x24
-UMASK_OFFCORE_RESPONSE_0_UC_IFETCH_NON_DRAM      0x01 0x09 0x25
-UMASK_OFFCORE_RESPONSE_0_UC_IFETCH_AVG_LAT       0x01 0x09 0x26
-
-UMASK_OFFCORE_RESPONSE_0_BUS_LOCKS_ANY           0x01 0x0A 0x10
-UMASK_OFFCORE_RESPONSE_0_BUS_LOCKS_L2_HIT        0x01 0x0A 0x12
-UMASK_OFFCORE_RESPONSE_0_BUS_LOCKS_SNP_NONE      0x01 0x0A 0x1F
-UMASK_OFFCORE_RESPONSE_0_BUS_LOCKS_SNOOP_MISS    0x01 0x0A 0x21
-UMASK_OFFCORE_RESPONSE_0_BUS_LOCKS_SNOOP_HIT     0x01 0x0A 0x22
-UMASK_OFFCORE_RESPONSE_0_BUS_LOCKS_HITM          0x01 0x0A 0x24
-UMASK_OFFCORE_RESPONSE_0_BUS_LOCKS_NON_DRAM      0x01 0x0A 0x25
-UMASK_OFFCORE_RESPONSE_0_BUS_LOCKS_AVG_LAT       0x01 0x0A 0x26
-
-UMASK_OFFCORE_RESPONSE_0_STRM_ST_ANY           0x01 0x0B 0x10
-UMASK_OFFCORE_RESPONSE_0_STRM_ST_L2_HIT        0x01 0x0B 0x12
-UMASK_OFFCORE_RESPONSE_0_STRM_ST_SNP_NONE      0x01 0x0B 0x1F
-UMASK_OFFCORE_RESPONSE_0_STRM_ST_SNOOP_MISS    0x01 0x0B 0x21
-UMASK_OFFCORE_RESPONSE_0_STRM_ST_SNOOP_HIT     0x01 0x0B 0x22
-UMASK_OFFCORE_RESPONSE_0_STRM_ST_HITM          0x01 0x0B 0x24
-UMASK_OFFCORE_RESPONSE_0_STRM_ST_NON_DRAM      0x01 0x0B 0x25
-UMASK_OFFCORE_RESPONSE_0_STRM_ST_AVG_LAT       0x01 0x0B 0x26
-
-UMASK_OFFCORE_RESPONSE_0_SW_PREFETCH_ANY           0x01 0x0C 0x10
-UMASK_OFFCORE_RESPONSE_0_SW_PREFETCH_L2_HIT        0x01 0x0C 0x12
-UMASK_OFFCORE_RESPONSE_0_SW_PREFETCH_SNP_NONE      0x01 0x0C 0x1F
-UMASK_OFFCORE_RESPONSE_0_SW_PREFETCH_SNOOP_MISS    0x01 0x0C 0x21
-UMASK_OFFCORE_RESPONSE_0_SW_PREFETCH_SNOOP_HIT     0x01 0x0C 0x22
-UMASK_OFFCORE_RESPONSE_0_SW_PREFETCH_HITM          0x01 0x0C 0x24
-UMASK_OFFCORE_RESPONSE_0_SW_PREFETCH_NON_DRAM      0x01 0x0C 0x25
-UMASK_OFFCORE_RESPONSE_0_SW_PREFETCH_AVG_LAT       0x01 0x0C 0x26
-
-UMASK_OFFCORE_RESPONSE_0_DCU_PF_DATA_RD_ANY           0x01 0x0D 0x10
-UMASK_OFFCORE_RESPONSE_0_DCU_PF_DATA_RD_L2_HIT        0x01 0x0D 0x12
-UMASK_OFFCORE_RESPONSE_0_DCU_PF_DATA_RD_SNP_NONE      0x01 0x0D 0x1F
-UMASK_OFFCORE_RESPONSE_0_DCU_PF_DATA_RD_SNOOP_MISS    0x01 0x0D 0x21
-UMASK_OFFCORE_RESPONSE_0_DCU_PF_DATA_RD_SNOOP_HIT     0x01 0x0D 0x22
-UMASK_OFFCORE_RESPONSE_0_DCU_PF_DATA_RD_HITM          0x01 0x0D 0x24
-UMASK_OFFCORE_RESPONSE_0_DCU_PF_DATA_RD_NON_DRAM      0x01 0x0D 0x25
-UMASK_OFFCORE_RESPONSE_0_DCU_PF_DATA_RD_AVG_LAT       0x01 0x0D 0x26
-
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_STRM_ST_ANY           0x01 0x0E 0x10
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_STRM_ST_L2_HIT        0x01 0x0E 0x12
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_STRM_ST_SNP_NONE      0x01 0x0E 0x1F
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_STRM_ST_SNOOP_MISS    0x01 0x0E 0x21
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_STRM_ST_SNOOP_HIT     0x01 0x0E 0x22
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_STRM_ST_HITM          0x01 0x0E 0x24
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_STRM_ST_NON_DRAM      0x01 0x0E 0x25
-UMASK_OFFCORE_RESPONSE_0_PARTIAL_STRM_ST_AVG_LAT       0x01 0x0E 0x26
-
-UMASK_OFFCORE_RESPONSE_0_ANY_ANY           0x01 0x0F 0x10
-UMASK_OFFCORE_RESPONSE_0_ANY_L2_HIT        0x01 0x0F 0x12
-UMASK_OFFCORE_RESPONSE_0_ANY_SNP_NONE      0x01 0x0F 0x1F
-UMASK_OFFCORE_RESPONSE_0_ANY_SNOOP_MISS    0x01 0x0F 0x21
-UMASK_OFFCORE_RESPONSE_0_ANY_SNOOP_HIT     0x01 0x0F 0x22
-UMASK_OFFCORE_RESPONSE_0_ANY_HITM          0x01 0x0F 0x24
-UMASK_OFFCORE_RESPONSE_0_ANY_NON_DRAM      0x01 0x0F 0x25
-UMASK_OFFCORE_RESPONSE_0_ANY_AVG_LAT       0x01 0x0F 0x26
-
-UMASK_OFFCORE_RESPONSE_1_DMND_DATA_RD_ANY           0x02 0x00 0x10
-UMASK_OFFCORE_RESPONSE_1_DMND_DATA_RD_L2_HIT        0x02 0x00 0x12
-UMASK_OFFCORE_RESPONSE_1_DMND_DATA_RD_SNP_NONE      0x02 0x00 0x1F
-UMASK_OFFCORE_RESPONSE_1_DMND_DATA_RD_SNOOP_MISS    0x02 0x00 0x21
-UMASK_OFFCORE_RESPONSE_1_DMND_DATA_RD_SNOOP_HIT     0x02 0x00 0x22
-UMASK_OFFCORE_RESPONSE_1_DMND_DATA_RD_HITM          0x02 0x00 0x24
-UMASK_OFFCORE_RESPONSE_1_DMND_DATA_RD_NON_DRAM      0x02 0x00 0x25
-UMASK_OFFCORE_RESPONSE_1_DMND_DATA_RD_AVG_LAT       0x02 0x00 0x26
-
-UMASK_OFFCORE_RESPONSE_1_DMND_RFO_ANY           0x02 0x01 0x10
-UMASK_OFFCORE_RESPONSE_1_DMND_RFO_L2_HIT        0x02 0x01 0x12
-UMASK_OFFCORE_RESPONSE_1_DMND_RFO_SNP_NONE      0x02 0x01 0x1F
-UMASK_OFFCORE_RESPONSE_1_DMND_RFO_SNOOP_MISS    0x02 0x01 0x21
-UMASK_OFFCORE_RESPONSE_1_DMND_RFO_SNOOP_HIT     0x02 0x01 0x22
-UMASK_OFFCORE_RESPONSE_1_DMND_RFO_HITM          0x02 0x01 0x24
-UMASK_OFFCORE_RESPONSE_1_DMND_RFO_NON_DRAM      0x02 0x01 0x25
-UMASK_OFFCORE_RESPONSE_1_DMND_RFO_AVG_LAT       0x02 0x01 0x26
-
-UMASK_OFFCORE_RESPONSE_1_DMND_IFETCH_ANY           0x02 0x02 0x10
-UMASK_OFFCORE_RESPONSE_1_DMND_IFETCH_L2_HIT        0x02 0x02 0x12
-UMASK_OFFCORE_RESPONSE_1_DMND_IFETCH_SNP_NONE      0x02 0x02 0x1F
-UMASK_OFFCORE_RESPONSE_1_DMND_IFETCH_SNOOP_MISS    0x02 0x02 0x21
-UMASK_OFFCORE_RESPONSE_1_DMND_IFETCH_SNOOP_HIT     0x02 0x02 0x22
-UMASK_OFFCORE_RESPONSE_1_DMND_IFETCH_HITM          0x02 0x02 0x24
-UMASK_OFFCORE_RESPONSE_1_DMND_IFETCH_NON_DRAM      0x02 0x02 0x25
-UMASK_OFFCORE_RESPONSE_1_DMND_IFETCH_AVG_LAT       0x02 0x02 0x26
-
-UMASK_OFFCORE_RESPONSE_1_WB_ANY           0x02 0x03 0x10
-UMASK_OFFCORE_RESPONSE_1_WB_L2_HIT        0x02 0x03 0x12
-UMASK_OFFCORE_RESPONSE_1_WB_SNP_NONE      0x02 0x03 0x1F
-UMASK_OFFCORE_RESPONSE_1_WB_SNOOP_MISS    0x02 0x03 0x21
-UMASK_OFFCORE_RESPONSE_1_WB_SNOOP_HIT     0x02 0x03 0x22
-UMASK_OFFCORE_RESPONSE_1_WB_HITM          0x02 0x03 0x24
-UMASK_OFFCORE_RESPONSE_1_WB_NON_DRAM      0x02 0x03 0x25
-UMASK_OFFCORE_RESPONSE_1_WB_AVG_LAT       0x02 0x03 0x26
-
-UMASK_OFFCORE_RESPONSE_1_PF_DATA_RD_ANY           0x02 0x04 0x10
-UMASK_OFFCORE_RESPONSE_1_PF_DATA_RD_L2_HIT        0x02 0x04 0x12
-UMASK_OFFCORE_RESPONSE_1_PF_DATA_RD_SNP_NONE      0x02 0x04 0x1F
-UMASK_OFFCORE_RESPONSE_1_PF_DATA_RD_SNOOP_MISS    0x02 0x04 0x21
-UMASK_OFFCORE_RESPONSE_1_PF_DATA_RD_SNOOP_HIT     0x02 0x04 0x22
-UMASK_OFFCORE_RESPONSE_1_PF_DATA_RD_HITM          0x02 0x04 0x24
-UMASK_OFFCORE_RESPONSE_1_PF_DATA_RD_NON_DRAM      0x02 0x04 0x25
-UMASK_OFFCORE_RESPONSE_1_PF_DATA_RD_AVG_LAT       0x02 0x04 0x26
-
-UMASK_OFFCORE_RESPONSE_1_PF_RFO_ANY           0x02 0x05 0x10
-UMASK_OFFCORE_RESPONSE_1_PF_RFO_L2_HIT        0x02 0x05 0x12
-UMASK_OFFCORE_RESPONSE_1_PF_RFO_SNP_NONE      0x02 0x05 0x1F
-UMASK_OFFCORE_RESPONSE_1_PF_RFO_SNOOP_MISS    0x02 0x05 0x21
-UMASK_OFFCORE_RESPONSE_1_PF_RFO_SNOOP_HIT     0x02 0x05 0x22
-UMASK_OFFCORE_RESPONSE_1_PF_RFO_HITM          0x02 0x05 0x24
-UMASK_OFFCORE_RESPONSE_1_PF_RFO_NON_DRAM      0x02 0x05 0x25
-UMASK_OFFCORE_RESPONSE_1_PF_RFO_AVG_LAT       0x02 0x05 0x26
-
-UMASK_OFFCORE_RESPONSE_1_PF_IFETCH_ANY           0x02 0x06 0x10
-UMASK_OFFCORE_RESPONSE_1_PF_IFETCH_L2_HIT        0x02 0x06 0x12
-UMASK_OFFCORE_RESPONSE_1_PF_IFETCH_SNP_NONE      0x02 0x06 0x1F
-UMASK_OFFCORE_RESPONSE_1_PF_IFETCH_SNOOP_MISS    0x02 0x06 0x21
-UMASK_OFFCORE_RESPONSE_1_PF_IFETCH_SNOOP_HIT     0x02 0x06 0x22
-UMASK_OFFCORE_RESPONSE_1_PF_IFETCH_HITM          0x02 0x06 0x24
-UMASK_OFFCORE_RESPONSE_1_PF_IFETCH_NON_DRAM      0x02 0x06 0x25
-UMASK_OFFCORE_RESPONSE_1_PF_IFETCH_AVG_LAT       0x02 0x06 0x26
-
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_READ_ANY           0x02 0x07 0x10
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_READ_L2_HIT        0x02 0x07 0x12
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_READ_SNP_NONE      0x02 0x07 0x1F
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_READ_SNOOP_MISS    0x02 0x07 0x21
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_READ_SNOOP_HIT     0x02 0x07 0x22
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_READ_HITM          0x02 0x07 0x24
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_READ_NON_DRAM      0x02 0x07 0x25
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_READ_AVG_LAT       0x02 0x07 0x26
-
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_WRITE_ANY           0x02 0x08 0x10
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_WRITE_L2_HIT        0x02 0x08 0x12
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_WRITE_SNP_NONE      0x02 0x08 0x1F
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_WRITE_SNOOP_MISS    0x02 0x08 0x21
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_WRITE_SNOOP_HIT     0x02 0x08 0x22
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_WRITE_HITM          0x02 0x08 0x24
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_WRITE_NON_DRAM      0x02 0x08 0x25
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_WRITE_AVG_LAT       0x02 0x08 0x26
-
-UMASK_OFFCORE_RESPONSE_1_UC_IFETCH_ANY           0x02 0x09 0x10
-UMASK_OFFCORE_RESPONSE_1_UC_IFETCH_L2_HIT        0x02 0x09 0x12
-UMASK_OFFCORE_RESPONSE_1_UC_IFETCH_SNP_NONE      0x02 0x09 0x1F
-UMASK_OFFCORE_RESPONSE_1_UC_IFETCH_SNOOP_MISS    0x02 0x09 0x21
-UMASK_OFFCORE_RESPONSE_1_UC_IFETCH_SNOOP_HIT     0x02 0x09 0x22
-UMASK_OFFCORE_RESPONSE_1_UC_IFETCH_HITM          0x02 0x09 0x24
-UMASK_OFFCORE_RESPONSE_1_UC_IFETCH_NON_DRAM      0x02 0x09 0x25
-UMASK_OFFCORE_RESPONSE_1_UC_IFETCH_AVG_LAT       0x02 0x09 0x26
-
-UMASK_OFFCORE_RESPONSE_1 BUS_LOCKS_ANY           0x02 0x0A 0x10
-UMASK_OFFCORE_RESPONSE_1_BUS_LOCKS_L2_HIT        0x02 0x0A 0x12
-UMASK_OFFCORE_RESPONSE_1_BUS_LOCKS_SNP_NONE      0x02 0x0A 0x1F
-UMASK_OFFCORE_RESPONSE_1_BUS_LOCKS_SNOOP_MISS    0x02 0x0A 0x21
-UMASK_OFFCORE_RESPONSE_1_BUS_LOCKS_SNOOP_HIT     0x02 0x0A 0x22
-UMASK_OFFCORE_RESPONSE_1_BUS_LOCKS_HITM          0x02 0x0A 0x24
-UMASK_OFFCORE_RESPONSE_1_BUS_LOCKS_NON_DRAM      0x02 0x0A 0x25
-UMASK_OFFCORE_RESPONSE_1_BUS_LOCKS_AVG_LAT       0x02 0x0A 0x26
-
-UMASK_OFFCORE_RESPONSE_1_STRM_ST_ANY           0x02 0x0B 0x10
-UMASK_OFFCORE_RESPONSE_1_STRM_ST_L2_HIT        0x02 0x0B 0x12
-UMASK_OFFCORE_RESPONSE_1_STRM_ST_SNP_NONE      0x02 0x0B 0x1F
-UMASK_OFFCORE_RESPONSE_1_STRM_ST_SNOOP_MISS    0x02 0x0B 0x21
-UMASK_OFFCORE_RESPONSE_1_STRM_ST_SNOOP_HIT     0x02 0x0B 0x22
-UMASK_OFFCORE_RESPONSE_1_STRM_ST_HITM          0x02 0x0B 0x24
-UMASK_OFFCORE_RESPONSE_1_STRM_ST_NON_DRAM      0x02 0x0B 0x25
-UMASK_OFFCORE_RESPONSE_1_STRM_ST_AVG_LAT       0x02 0x0B 0x26
-
-UMASK_OFFCORE_RESPONSE_1_SW_PREFETCH_ANY           0x02 0x0C 0x10
-UMASK_OFFCORE_RESPONSE_1_SW_PREFETCH_L2_HIT        0x02 0x0C 0x12
-UMASK_OFFCORE_RESPONSE_1_SW_PREFETCH_SNP_NONE      0x02 0x0C 0x1F
-UMASK_OFFCORE_RESPONSE_1_SW_PREFETCH_SNOOP_MISS    0x02 0x0C 0x21
-UMASK_OFFCORE_RESPONSE_1_SW_PREFETCH_SNOOP_HIT     0x02 0x0C 0x22
-UMASK_OFFCORE_RESPONSE_1_SW_PREFETCH_HITM          0x02 0x0C 0x24
-UMASK_OFFCORE_RESPONSE_1_SW_PREFETCH_NON_DRAM      0x02 0x0C 0x25
-UMASK_OFFCORE_RESPONSE_1_SW_PREFETCH_AVG_LAT       0x02 0x0C 0x26
-
-UMASK_OFFCORE_RESPONSE_1_DCU_PF_DATA_RD_ANY           0x02 0x0D 0x10
-UMASK_OFFCORE_RESPONSE_1_DCU_PF_DATA_RD_L2_HIT        0x02 0x0D 0x12
-UMASK_OFFCORE_RESPONSE_1_DCU_PF_DATA_RD_SNP_NONE      0x02 0x0D 0x1F
-UMASK_OFFCORE_RESPONSE_1_DCU_PF_DATA_RD_SNOOP_MISS    0x02 0x0D 0x21
-UMASK_OFFCORE_RESPONSE_1_DCU_PF_DATA_RD_SNOOP_HIT     0x02 0x0D 0x22
-UMASK_OFFCORE_RESPONSE_1_DCU_PF_DATA_RD_HITM          0x02 0x0D 0x24
-UMASK_OFFCORE_RESPONSE_1_DCU_PF_DATA_RD_NON_DRAM      0x02 0x0D 0x25
-UMASK_OFFCORE_RESPONSE_1_DCU_PF_DATA_RD_AVG_LAT       0x02 0x0D 0x26
-
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_STRM_ST_ANY           0x02 0x0E 0x10
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_STRM_ST_L2_HIT        0x02 0x0E 0x12
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_STRM_ST_SNP_NONE      0x02 0x0E 0x1F
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_STRM_ST_SNOOP_MISS    0x02 0x0E 0x21
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_STRM_ST_SNOOP_HIT     0x02 0x0E 0x22
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_STRM_ST_HITM          0x02 0x0E 0x24
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_STRM_ST_NON_DRAM      0x02 0x0E 0x25
-UMASK_OFFCORE_RESPONSE_1_PARTIAL_STRM_ST_AVG_LAT       0x02 0x0E 0x26
-
-UMASK_OFFCORE_RESPONSE_1_ANY_ANY           0x02 0x0F 0x10
-UMASK_OFFCORE_RESPONSE_1_ANY_L2_HIT        0x02 0x0F 0x12
-UMASK_OFFCORE_RESPONSE_1_ANY_SNP_NONE      0x02 0x0F 0x1F
-UMASK_OFFCORE_RESPONSE_1_ANY_SNOOP_MISS    0x02 0x0F 0x21
-UMASK_OFFCORE_RESPONSE_1_ANY_SNOOP_HIT     0x02 0x0F 0x22
-UMASK_OFFCORE_RESPONSE_1_ANY_HITM          0x02 0x0F 0x24
-UMASK_OFFCORE_RESPONSE_1_ANY_NON_DRAM      0x02 0x0F 0x25
-UMASK_OFFCORE_RESPONSE_1_ANY_AVG_LAT       0x02 0x0F 0x26
-
-
 EVENT_INST_RETIRED                  0xC0 PMC
 UMASK_INST_RETIRED_ANY_P            0x00
 
@@ -390,32 +104,33 @@ UMASK_UOPS_RETIRED_MS               0x01
 UMASK_UOPS_RETIRED_ALL              0x10
 
 EVENT_MACHINE_CLEARS                0xC3 PMC
-UMASK_MACHINE_CLEARS_SMC            0x01
-UMASK_MACHINE_CLEARS_MEMORY_ORDERING 0x02
-UMASK_MACHINE_CLEARS_FP_ASSIST      0x04
-UMASK_MACHINE_CLEARS_ALL            0x08
-
-EVENT_BR_INST_RETIRED               0xC4  PMC
-UMASK_BR_INST_RETIRED_ALL_BRANCHES  0x00
-UMASK_BR_INST_RETIRED_JCC           0x7E
-UMASK_BR_INST_RETIRED_FAR_BRANCH    0xBF
+UMASK_MACHINE_CLEARS_SMC               0x01
+UMASK_MACHINE_CLEARS_MEMORY_ORDERING   0x02
+UMASK_MACHINE_CLEARS_FP_ASSIST         0x04
+UMASK_MACHINE_CLEARS_ALL               0x08
+
+
+EVENT_BR_INST_RETIRED                0xC4  PMC
+UMASK_BR_INST_RETIRED_ALL_BRANCHES   0x00
+UMASK_BR_INST_RETIRED_JCC            0x7E
+UMASK_BR_INST_RETIRED_TAKEN_JCC      0xFE
+UMASK_BR_INST_RETIRED_FAR_BRANCH     0xBF
 UMASK_BR_INST_RETIRED_NON_RETURN_IND 0xEB
-UMASK_BR_INST_RETIRED_RETURN        0xF7
-UMASK_BR_INST_RETIRED_CALL          0xF9
-UMASK_BR_INST_RETIRED_IND_CALL      0xFB
-UMASK_BR_INST_RETIRED_REL_CALL      0xFD
-UMASK_BR_INST_RETIRED_TAKEN_JCC     0xFE
-
-EVENT_BR_MISP_RETIRED               0xC5  PMC
-UMASK_BR_MISP_RETIRED_ALL_BRANCHES  0x00
-UMASK_BR_MISP_RETIRED_JCC           0x7E
-UMASK_BR_MISP_RETIRED_FAR_BRANCH    0xBF
+UMASK_BR_INST_RETIRED_RETURN         0xF7
+UMASK_BR_INST_RETIRED_CALL           0xF9
+UMASK_BR_INST_RETIRED_IND_CALL       0xFB
+UMASK_BR_INST_RETIRED_REL_CALL       0xFD
+
+EVENT_BR_MISP_RETIRED                0xC5  PMC
+UMASK_BR_MISP_RETIRED_ALL_BRANCHES   0x00
+UMASK_BR_MISP_RETIRED_JCC            0x7E
+UMASK_BR_MISP_RETIRED_TAKEN_JCC      0xFE
+UMASK_BR_MISP_RETIRED_FAR_BRANCH     0xBF
 UMASK_BR_MISP_RETIRED_NON_RETURN_IND 0xEB
-UMASK_BR_MISP_RETIRED_RETURN        0xF7
-UMASK_BR_MISP_RETIRED_CALL          0xF9
-UMASK_BR_MISP_RETIRED_IND_CALL      0xFB
-UMASK_BR_MISP_RETIRED_REL_CALL      0xFD
-UMASK_BR_MISP_RETIRED_TAKEN_JCC     0xFE
+UMASK_BR_MISP_RETIRED_RETURN         0xF7
+UMASK_BR_MISP_RETIRED_CALL           0xF9
+UMASK_BR_MISP_RETIRED_IND_CALL       0xFB
+UMASK_BR_MISP_RETIRED_REL_CALL       0xFD
 
 EVENT_NO_ALLOC_CYCLES               0xCA PMC
 UMASK_NO_ALLOC_CYCLES_ROB_FULL      0x01
@@ -430,7 +145,7 @@ UMASK_RS_FULL_STALL_ALL             0x1F
 EVENT_CYCLES_DIV_BUSY               0xCD PMC
 UMASK_CYCLES_DIV_BUSY_ANY           0x01
 
-EVENT_BACLEARS                      0xE6 PMC
+EVENT_BACLEARS                      0xE6  PMC
 UMASK_BACLEARS_ALL                  0x01
 UMASK_BACLEARS_RETURN               0x08
 UMASK_BACLEARS_COND                 0x10
@@ -438,3 +153,46 @@ UMASK_BACLEARS_COND                 0x10
 EVENT_MS_DECODED                    0xE7 PMC
 UMASK_MS_DECODED_MS_ENTRY           0x01
 
+EVENT_OFFCORE_RESPONSE_0              0xB7 PMC
+OPTIONS_OFFCORE_RESPONSE_0_OPTIONS  EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_0_OPTIONS                    0x01 0xFF 0xFF
+UMASK_OFFCORE_RESPONSE_0_DMND_DATA_RD_ANY           0x01 0x00 0x10
+UMASK_OFFCORE_RESPONSE_0_DMND_RFO_ANY               0x01 0x01 0x10
+UMASK_OFFCORE_RESPONSE_0_DMND_CODE_RD_ANY           0x01 0x02 0x10
+UMASK_OFFCORE_RESPONSE_0_WB_ANY                     0x01 0x03 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_DATA_RD_ANY          0x01 0x04 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_RFO_ANY              0x01 0x05 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_CODE_RD_ANY          0x01 0x06 0x10
+UMASK_OFFCORE_RESPONSE_0_PARTIAL_READ_ANY           0x01 0x07 0x10
+UMASK_OFFCORE_RESPONSE_0_PARTIAL_WRITE_ANY          0x01 0x08 0x10
+UMASK_OFFCORE_RESPONSE_0_UC_CODE_RD_ANY             0x01 0x09 0x10
+UMASK_OFFCORE_RESPONSE_0_BUS_LOCKS_ANY              0x01 0x0A 0x10
+UMASK_OFFCORE_RESPONSE_0_STREAMING_STORES_ANY       0x01 0x0B 0x10
+UMASK_OFFCORE_RESPONSE_0_SW_PREFETCH_ANY            0x01 0x0C 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L1_DATA_RD_ANY          0x01 0x0D 0x10
+UMASK_OFFCORE_RESPONSE_0_PARTIAL_STREAMING_STORES_ANY 0x01 0x0E 0x10
+UMASK_OFFCORE_RESPONSE_0_ANY_ANY                    0x01 0x0F 0x10
+
+EVENT_OFFCORE_RESPONSE_1              0xB7 PMC
+OPTIONS_OFFCORE_RESPONSE_1_OPTIONS  EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_1_OPTIONS                    0x02 0xFF 0xFF
+UMASK_OFFCORE_RESPONSE_1_DMND_DATA_RD_ANY           0x02 0x00 0x10
+UMASK_OFFCORE_RESPONSE_1_DMND_RFO_ANY               0x02 0x01 0x10
+UMASK_OFFCORE_RESPONSE_1_DMND_CODE_RD_ANY           0x02 0x02 0x10
+UMASK_OFFCORE_RESPONSE_1_WB_ANY                     0x02 0x03 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_DATA_RD_ANY          0x02 0x04 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_RFO_ANY              0x02 0x05 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_CODE_RD_ANY          0x02 0x06 0x10
+UMASK_OFFCORE_RESPONSE_1_PARTIAL_READ_ANY           0x02 0x07 0x10
+UMASK_OFFCORE_RESPONSE_1_PARTIAL_WRITE_ANY          0x02 0x08 0x10
+UMASK_OFFCORE_RESPONSE_1_UC_CODE_RD_ANY             0x02 0x09 0x10
+UMASK_OFFCORE_RESPONSE_1_BUS_LOCKS_ANY              0x02 0x0A 0x10
+UMASK_OFFCORE_RESPONSE_1_STREAMING_STORES_ANY       0x02 0x0B 0x10
+UMASK_OFFCORE_RESPONSE_1_SW_PREFETCH_ANY            0x02 0x0C 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L1_DATA_RD_ANY          0x02 0x0D 0x10
+UMASK_OFFCORE_RESPONSE_1_PARTIAL_STREAMING_STORES_ANY 0x02 0x0E 0x10
+UMASK_OFFCORE_RESPONSE_1_ANY_ANY                    0x02 0x0F 0x10
+
+
+
+
diff --git a/src/includes/perfmon_types.h b/src/includes/perfmon_types.h
index 1f0663a..4108536 100644
--- a/src/includes/perfmon_types.h
+++ b/src/includes/perfmon_types.h
@@ -7,13 +7,14 @@
  *                    Configures and reads out performance counters
  *                    on x86 based architectures. Supports multi threading.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -34,145 +35,212 @@
 #define PERFMON_TYPES_H
 
 #include <bstrlib.h>
-#include <perfmon_group_types.h>
+#include <timer.h>
+
+#define MAX_EVENT_OPTIONS 4
 
 /* #####   EXPORTED TYPE DEFINITIONS   #################################### */
 
-typedef enum {
-    PMC0 = 0,
-    PMC1, PMC2, PMC3, PMC4, PMC5, PMC6,
-    PMC7, PMC8, PMC9, PMC10, PMC11, PMC12,
-    PMC13, PMC14, PMC15, PMC16, PMC17, PMC18,
-    PMC19, PMC20, PMC21, PMC22, PMC23, PMC24,
-    PMC25, PMC26, PMC27, PMC28, PMC29, PMC30,
-    PMC31, PMC32, PMC33, PMC34, PMC35, PMC36,
-    PMC37, PMC38, PMC39, PMC40, PMC41, PMC42,
-    PMC43, PMC44, PMC45, PMC46, PMC47, PMC48,
-    PMC49, PMC50, PMC51, PMC52, PMC53, PMC54,
-    PMC55, PMC56, PMC57, PMC58, PMC59, PMC60,
-    PMC61, PMC62, PMC63, PMC64, PMC65, PMC66,
-    PMC67, PMC68, PMC69, PMC70, PMC71, PMC72,
-    PMC73, PMC74, PMC75, PMC76, PMC77, PMC78,
-    PMC79, PMC80, PMC81, PMC82, PMC83, PMC84,
-    PMC85, PMC86, PMC87, PMC88, PMC89, PMC90,
-    PMC91, PMC92, PMC93, PMC94, PMC95, PMC96,
-    PMC97, PMC98, PMC99, PMC100, PMC101, PMC102,
-    PMC103, PMC104, PMC105, PMC106, PMC107, PMC108,
-    NUM_PMC} PerfmonCounterIndex;
+extern int socket_fd;
+extern int thread_sockets[MAX_NUM_THREADS];
 
-typedef enum {
-    PMC = 0,
-    FIXED,
-    THERMAL,
-    UNCORE,
-    MBOX0,
-    MBOX1,
-    MBOX2,
-    MBOX3,
-    MBOXFIX,
-    BBOX0,
-    BBOX1,
-    RBOX0,
-    RBOX1,
-    WBOX,
-    SBOX0,
-    SBOX1,
-    SBOX2,
-    CBOX0,
-    CBOX1,
-    CBOX2,
-    CBOX3,
-    CBOX4,
-    CBOX5,
-    CBOX6,
-    CBOX7,
-    CBOX8,
-    CBOX9,
-    CBOX10,
-    CBOX11,
-    CBOX12,
-    CBOX13,
-    CBOX14,
-    PBOX,
-    POWER,
-    UBOX,
-    NUM_UNITS} PerfmonType;
 
-typedef struct {
-    char* key;
-    PerfmonCounterIndex index;
-    PerfmonType type;
-    uint64_t configRegister;
-    uint64_t counterRegister;
-    uint64_t counterRegister2;
-    PciDeviceIndex device;
-} PerfmonCounterMap;
+/** \addtogroup PerfMon
+ *  @{
+ */
+/////////////////////////////////////////////
 
+/*! \brief Enum of possible event and counter options
+
+List of internally used IDs for all event and counter options that are supported
+by LIKWID.
+\extends PerfmonEventOption
+*/
+typedef enum {
+    EVENT_OPTION_NONE = 0, /*!< \brief No option, used as False value */
+    EVENT_OPTION_OPCODE, /*!< \brief Match opcode */
+    EVENT_OPTION_MATCH0, /*!< \brief Match0 register */
+    EVENT_OPTION_MATCH1, /*!< \brief Match1 register */
+    EVENT_OPTION_MATCH2, /*!< \brief Match2 register */
+    EVENT_OPTION_MATCH3, /*!< \brief Match3 register */
+    EVENT_OPTION_MASK0, /*!< \brief Mask0 register */
+    EVENT_OPTION_MASK1, /*!< \brief Mask1 register */
+    EVENT_OPTION_MASK2, /*!< \brief Mask2 register */
+    EVENT_OPTION_MASK3, /*!< \brief Mask3 register */
+    EVENT_OPTION_NID, /*!< \brief Set NUMA node ID */
+    EVENT_OPTION_TID, /*!< \brief Set Thread ID */
+    EVENT_OPTION_STATE, /*!< \brief Match for state */
+    EVENT_OPTION_EDGE, /*!< \brief Increment counter at each edge */
+    EVENT_OPTION_THRESHOLD, /*!< \brief Increment only if exceeding threshold */
+    EVENT_OPTION_INVERT, /*!< \brief Invert behavior of EVENT_OPTION_THRESHOLD, hence increment only below threshold */
+    EVENT_OPTION_COUNT_KERNEL, /*!< \brief Also count events when in kernel space */
+    EVENT_OPTION_ANYTHREAD, /*!< \brief Increment counter at events of all HW threads in the core */
+    EVENT_OPTION_OCCUPANCY, /*!< \brief Count occupancy not occurrences */
+    EVENT_OPTION_OCCUPANCY_FILTER, /*!< \brief Filter for occupancy counting */
+    EVENT_OPTION_OCCUPANCY_EDGE, /*!< \brief Increment occupancy counter at detection of an edge */
+    EVENT_OPTION_OCCUPANCY_INVERT, /*!< \brief Invert filter for occupancy counting */
+    EVENT_OPTION_IN_TRANS, /*!< \brief Count events during transactions */
+    EVENT_OPTION_IN_TRANS_ABORT, /*!< \brief Count events that aborted during transactions */
+    NUM_EVENT_OPTIONS /*!< \brief Amount of defined options */
+} EventOptionType;
+
+/*! \brief List of option names
+
+List of strings for all event and counter options used for matching and output
+*/
+extern char* eventOptionTypeName[NUM_EVENT_OPTIONS];
+
+/** \brief Bitmask with no event/counter option set */
+#define EVENT_OPTION_NONE_MASK 0x0ULL
+/** \brief Define for easily creating a bitmask of all configured event/counter options */
+#define OPTIONS_TYPE_MASK(type) \
+        (((type == EVENT_OPTION_NONE)||(type >= NUM_EVENT_OPTIONS)) ? \
+        EVENT_OPTION_NONE_MASK : \
+        (1ULL<<type))
+
+
+/** @cond */ 
+#define EVENT_OPTION_OPCODE_MASK (1ULL<<EVENT_OPTION_OPCODE)
+#define EVENT_OPTION_MATCH0_MASK (1ULL<<EVENT_OPTION_MATCH0)
+#define EVENT_OPTION_MATCH1_MASK (1ULL<<EVENT_OPTION_MATCH1)
+#define EVENT_OPTION_MATCH2_MASK (1ULL<<EVENT_OPTION_MATCH2)
+#define EVENT_OPTION_MATCH3_MASK (1ULL<<EVENT_OPTION_MATCH3)
+#define EVENT_OPTION_MASK0_MASK (1ULL<<EVENT_OPTION_MASK0)
+#define EVENT_OPTION_MASK1_MASK (1ULL<<EVENT_OPTION_MASK1)
+#define EVENT_OPTION_MASK2_MASK (1ULL<<EVENT_OPTION_MASK2)
+#define EVENT_OPTION_MASK3_MASK (1ULL<<EVENT_OPTION_MASK3)
+#define EVENT_OPTION_NID_MASK (1ULL<<EVENT_OPTION_NID)
+#define EVENT_OPTION_TID_MASK (1ULL<<EVENT_OPTION_TID)
+#define EVENT_OPTION_STATE_MASK (1ULL<<EVENT_OPTION_STATE)
+#define EVENT_OPTION_EDGE_MASK (1ULL<<EVENT_OPTION_EDGE)
+#define EVENT_OPTION_THRESHOLD_MASK (1ULL<<EVENT_OPTION_THRESHOLD)
+#define EVENT_OPTION_INVERT_MASK (1ULL<<EVENT_OPTION_INVERT)
+#define EVENT_OPTION_COUNT_KERNEL_MASK (1ULL<<EVENT_OPTION_COUNT_KERNEL)
+#define EVENT_OPTION_ANYTHREAD_MASK (1ULL<<EVENT_OPTION_ANYTHREAD)
+#define EVENT_OPTION_OCCUPANCY_MASK (1ULL<<EVENT_OPTION_OCCUPANCY)
+#define EVENT_OPTION_OCCUPANCY_FILTER_MASK (1ULL<<EVENT_OPTION_OCCUPANCY_FILTER)
+#define EVENT_OPTION_OCCUPANCY_EDGE_MASK (1ULL<<EVENT_OPTION_OCCUPANCY_EDGE)
+#define EVENT_OPTION_OCCUPANCY_INVERT_MASK (1ULL<<EVENT_OPTION_OCCUPANCY_INVERT)
+#define EVENT_OPTION_IN_TRANS_MASK (1ULL<<EVENT_OPTION_IN_TRANS)
+#define EVENT_OPTION_IN_TRANS_ABORT_MASK (1ULL<<EVENT_OPTION_IN_TRANS_ABORT)
+/** @endcond */
+
+/*! \brief Structure specifying thread to CPU relation
+
+Threads are always numbered incrementally. This structure is used in order to 
+resolve the real HW thread ID.
+\extends PerfmonGroupSet
+*/
 typedef struct {
-    const char* key;
-    PerfmonGroup index;
-    int isUncore;
-    const char* info;
-    const char* config;
-    int derivedCounters;
-    const char ** derivedCounterNames;
-} PerfmonGroupMap;
+    int             thread_id; /*!< \brief Thread ID how it is used internally */
+    int             processorId; /*!< \brief Real HW thread ID */
+} PerfmonThread;
 
+/*! \brief Structure specifying event/counter options and their value
+
+Most options set a bitfield in registers and their values are stored in this structure.
+If an option is a binary option, the value is set to 1.
+\extends PerfmonEvent
+*/
 typedef struct {
-    char* key;
-    char* msg;
-} PerfmonGroupHelp;
+    EventOptionType      type; /*!< \brief Type of the option */
+    uint64_t             value; /*!< \brief Value of the option */
+} PerfmonEventOption;
+
+/*! \brief Structure specifying a performance monitoring event
 
-/* only used in westmereEX at the moment */
+This structure holds the configuration data for an event. It groups the name,
+the allowed counters and internally used values like event ID and masks. Moreover
+the event options are held here.
+\extends PerfmonEventSetEntry
+*/
 typedef struct {
-    uint32_t ctrlRegister;
-    uint32_t statusRegister;
-    uint32_t ovflRegister;
-} PerfmonUnit;
+    const char*     name; /*!< \brief Name of the event */
+    const char*     limit; /*!< \brief Valid counters for the event */
+    uint16_t        eventId; /*!< \brief ID of the event */
+    uint8_t         umask; /*!< \brief Most events need to specify a mask to limit counting */
+    uint8_t         cfgBits; /*!< \brief Misc configuration bits */
+    uint64_t        cmask; /*!< \brief Misc mask bits */
+    uint8_t         numberOfOptions; /*!< \brief Number of options for the event */
+    uint64_t        optionMask; /*!< \brief Bitmask for fast check of set options */
+    PerfmonEventOption options[NUM_EVENT_OPTIONS]; /*!< \brief List of options */
+} PerfmonEvent;
+
+/*! \brief Structure describing performance monitoring counter data
 
+Each event holds one of these structures for each thread to store the counter
+data, whether it is configured, and the number of overflows that occurred.
+\extends PerfmonEventSetEntry
+*/
 typedef struct {
-    int init;
-    int id;  /* TODO id is only used for EX type processors */
-    double counterData;
+    int         init; /*!< \brief Flag if corresponding control register is set up properly */
+    int         id; /*!< \brief Offset in higher level control register, e.g. position of enable bit */
+    int         overflows; /*!< \brief Amount of overflows */
+    uint64_t    startData; /*!< \brief Start data from the counter */
+    uint64_t    counterData; /*!< \brief Intermediate data from the counters */
 } PerfmonCounter;
 
-typedef struct {
-    int processorId;
-    PerfmonCounter counters[NUM_PMC];
-} PerfmonThread;
 
-typedef struct {
-    const char* name;
-    const char* limit;
-    uint16_t eventId;
-    uint8_t umask;
-    uint8_t cfgBits;
-    uint8_t cmask;
-} PerfmonEvent;
+/*! \brief Structure specifying a performance monitoring event set entry
 
+An eventSet consists of an event and a counter and the read counter values.
+\extends PerfmonEventSet
+*/
 typedef struct {
-    PerfmonEvent event;
-    PerfmonCounterIndex index;
-    double* result;
+    PerfmonEvent        event; /*!< \brief Event configuration */
+    RegisterIndex       index; /*!< \brief Index of the counter register in the counter map */
+    RegisterType        type; /*!< \brief Type of the counter register and event */
+    PerfmonCounter*     threadCounter; /*!< \brief List of counter data for each thread, list length is \a numberOfThreads in PerfmonGroupSet */
 } PerfmonEventSetEntry;
 
+/*! \brief Structure specifying a performance monitoring event group
+
+A PerfmonEventSet holds a set of event and counter combinations and some global information about all eventSet entries
+\extends PerfmonGroupSet
+*/
 typedef struct {
-    int numberOfEvents;
-    PerfmonEventSetEntry* events;
+    int                   numberOfEvents; /*!< \brief Number of eventSets in \a events */
+    PerfmonEventSetEntry* events; /*!< \brief List of eventSets */
+    TimerData             timer; /*!< \brief Time information how long the counters were running */
+    double                rdtscTime; /*!< \brief Evaluation of the Time information in seconds */
+    double                runTime; /*!< \brief Sum of all time information in seconds that the group was running */
+    uint64_t              regTypeMask; /*!< \brief Bitmask for easy checks which types are included in the eventSet */
 } PerfmonEventSet;
 
+/*! \brief Structure specifying all performance monitoring event groups
 
+The global PerfmonGroupSet structure holds all eventSets and threads that are
+configured to measure. Only one eventSet can be measured at a time but the groups
+can be switched to perform some kind of multiplexing.
+*/
 typedef struct {
-    bstring label;
-    double* value;
-} PerfmonResult;
-
-typedef struct {
-    bstrList* header;
-    int numRows;
-    int numColumns;
-    PerfmonResult* rows;
-} PerfmonResultTable;
+    int              numberOfGroups; /*!< \brief List length of \a groups*/
+    int              numberOfActiveGroups; /*!< \brief Amount of added eventSets. Only those eventSets can be accessed in \a groups. */
+    int              activeGroup; /*!< \brief Currently active eventSet */
+    PerfmonEventSet* groups; /*!< \brief List of eventSets */
+    int              numberOfThreads; /*!< \brief Amount of threads in \a threads */
+    PerfmonThread*   threads; /*!< \brief List of threads */
+} PerfmonGroupSet;
+
+/** \brief List of counters with name, config register, counter registers and
+if needed PCI device */
+extern RegisterMap* counter_map;
+/** \brief List of boxes with name, config register, counter registers and if
+needed PCI device. Mainly used in Uncore handling but also core-local counters
+are defined as a box. */
+extern BoxMap* box_map;
+/** \brief List of events available for the current architecture */
+extern PerfmonEvent* eventHash;
+/** \brief List of PCI devices available for the current architecture */
+extern PciDevice* pci_devices;
+/** @}*/
+
+/* perfmon datatypes */
+extern PerfmonGroupSet *groupSet;
+extern int perfmon_numCounters;
+extern int perfmon_numCoreCounters;
+extern int perfmon_numUncoreCounters;
+extern int perfmon_numArchEvents;
 
 
 #endif /*PERFMON_TYPES_H*/
diff --git a/src/includes/perfmon_westmere.h b/src/includes/perfmon_westmere.h
index c469766..c73a140 100644
--- a/src/includes/perfmon_westmere.h
+++ b/src/includes/perfmon_westmere.h
@@ -3,15 +3,16 @@
  *
  *      Filename:  perfmon_westmere.h
  *
- *      Description:  Header File of perfmon module for Westmere.
+ *      Description:  Header File of perfmon module for Intel Westmere.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -29,8 +30,6 @@
  */
 
 #include <perfmon_westmere_events.h>
-#include <perfmon_westmere_groups.h>
 
-static int perfmon_numGroupsWestmere = NUM_GROUPS_WESTMERE;
 static int perfmon_numArchEventsWestmere = NUM_ARCH_EVENTS_WESTMERE;
 
diff --git a/src/includes/perfmon_westmereEX.h b/src/includes/perfmon_westmereEX.h
index 8cbc921..a1fa5b9 100644
--- a/src/includes/perfmon_westmereEX.h
+++ b/src/includes/perfmon_westmereEX.h
@@ -3,15 +3,16 @@
  *
  *      Filename:  perfmon_westmereEX.h
  *
- *      Description:  Header File of perfmon module for Westmere EX.
+ *      Description:  Header File of perfmon module for Intel Westmere EX.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -29,736 +30,957 @@
  */
 
 #include <perfmon_westmereEX_events.h>
-#include <perfmon_westmereEX_groups.h>
 #include <perfmon_westmereEX_counters.h>
+#include <error.h>
+#include <affinity.h>
 
 
 static int perfmon_numCountersWestmereEX = NUM_COUNTERS_WESTMEREEX;
-static int perfmon_numGroupsWestmereEX = NUM_GROUPS_WESTMEREEX;
 static int perfmon_numArchEventsWestmereEX = NUM_ARCH_EVENTS_WESTMEREEX;
 
-static PerfmonUnit westmereEX_PMunits[NUM_UNITS];
 
 /* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
 
-void perfmon_init_westmereEX(PerfmonThread *thread)
+int perfmon_init_westmereEX(int cpu_id)
 { /* Per-CPU init: requests the socket and tile locks for cpu_id, then zeroes MSR_PEBS_ENABLE. */
+    lock_acquire((int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id); /* return value ignored */
+    lock_acquire((int*) &tile_lock[affinity_thread2tile_lookup[cpu_id]], cpu_id); /* return value ignored */
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PEBS_ENABLE, 0x0ULL));
+    return 0;
+}
+
+uint32_t wex_fixed_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{ /* Builds the control bits for fixed counter `index` (one 4-bit field per counter); writes no MSR itself. */
+    int j;
+    uint32_t flags = (1ULL<<(1+(index*4))); /* bit 1 of the field is always set; presumably user-mode counting -- TODO confirm */
+    for(j = 0; j < event->numberOfOptions; j++)
+    {
+        switch (event->options[j].type)
+        {
+            case EVENT_OPTION_COUNT_KERNEL:
+                flags |= (1ULL<<(index*4)); /* bit 0 of the field */
+                break;
+            case EVENT_OPTION_ANYTHREAD:
+                flags |= (1ULL<<(2+(index*4))); /* bit 2 of the field; no break here, falls into default which only breaks */
+            default:
+                break;
+        }
+    }
+    return flags; /* cpu_id is not used in this function */
+}
+
+int wex_pmc_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{
+    int j;
     uint64_t flags = 0x0ULL;
-    int cpu_id = thread->processorId;
-
-    msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, 0x0ULL);
-    msr_write(cpu_id, MSR_PERFEVTSEL0, 0x0ULL);
-    msr_write(cpu_id, MSR_PERFEVTSEL1, 0x0ULL);
-    msr_write(cpu_id, MSR_PERFEVTSEL2, 0x0ULL);
-    msr_write(cpu_id, MSR_PERFEVTSEL3, 0x0ULL);
-    msr_write(cpu_id, MSR_PMC0, 0x0ULL);
-    msr_write(cpu_id, MSR_PMC1, 0x0ULL);
-    msr_write(cpu_id, MSR_PMC2, 0x0ULL);
-    msr_write(cpu_id, MSR_PMC3, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_FIXED_CTR0, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_FIXED_CTR1, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_FIXED_CTR2, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
-    msr_write(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, 0x0ULL);
-    msr_write(cpu_id, MSR_PEBS_ENABLE, 0x0ULL);
-
-    /* initialize fixed counters
-     * FIXED 0: Instructions retired
-     * FIXED 1: Clocks unhalted core
-     * FIXED 2: Clocks unhalted ref */
-    //msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, 0x222ULL);
-
-    /* Preinit of PERFEVSEL registers */
-    //flags |= (1<<22);  /* enable flag */
-    //flags |= (1<<16);  /* user mode flag */
-
-    /*msr_write(cpu_id, MSR_PERFEVTSEL0, flags);
-    msr_write(cpu_id, MSR_PERFEVTSEL1, flags);
-    msr_write(cpu_id, MSR_PERFEVTSEL2, flags);
-    msr_write(cpu_id, MSR_PERFEVTSEL3, flags);*/
-
-    /* Initialize uncore */
-    /* MBOX */
-    thread->counters[PMC7].id  = 0;
-    thread->counters[PMC8].id  = 1;
-    thread->counters[PMC9].id  = 2;
-    thread->counters[PMC10].id = 3;
-    thread->counters[PMC11].id = 4;
-    thread->counters[PMC12].id = 5;
-    westmereEX_PMunits[MBOX0].ctrlRegister = MSR_M0_PMON_BOX_CTRL;
-    westmereEX_PMunits[MBOX0].statusRegister = MSR_M0_PMON_BOX_STATUS;
-    westmereEX_PMunits[MBOX0].ovflRegister = MSR_M0_PMON_BOX_OVF_CTRL;
-
-    thread->counters[PMC13].id = 0;
-    thread->counters[PMC14].id = 1;
-    thread->counters[PMC15].id = 2;
-    thread->counters[PMC16].id = 3;
-    thread->counters[PMC17].id = 4;
-    thread->counters[PMC18].id = 5;
-    westmereEX_PMunits[MBOX1].ctrlRegister = MSR_M1_PMON_BOX_CTRL;
-    westmereEX_PMunits[MBOX1].statusRegister = MSR_M1_PMON_BOX_STATUS;
-    westmereEX_PMunits[MBOX1].ovflRegister = MSR_M1_PMON_BOX_OVF_CTRL;
-
-    /* BBOX */
-    thread->counters[PMC19].id = 0;
-    thread->counters[PMC20].id = 1;
-    thread->counters[PMC21].id = 2;
-    thread->counters[PMC22].id = 3;
-    westmereEX_PMunits[BBOX0].ctrlRegister = MSR_B0_PMON_BOX_CTRL;
-    westmereEX_PMunits[BBOX0].statusRegister =  MSR_B0_PMON_BOX_STATUS;
-    westmereEX_PMunits[BBOX0].ovflRegister = MSR_B0_PMON_BOX_OVF_CTRL;
-
-    thread->counters[PMC23].id = 0;
-    thread->counters[PMC24].id = 1;
-    thread->counters[PMC25].id = 2;
-    thread->counters[PMC26].id = 3;
-    westmereEX_PMunits[BBOX1].ctrlRegister = MSR_B1_PMON_BOX_CTRL;
-    westmereEX_PMunits[BBOX1].statusRegister =  MSR_B1_PMON_BOX_STATUS;
-    westmereEX_PMunits[BBOX1].ovflRegister = MSR_B1_PMON_BOX_OVF_CTRL;
-
-    /* RBOX */
-    thread->counters[PMC27].id = 0;
-    thread->counters[PMC28].id = 1;
-    thread->counters[PMC29].id = 2;
-    thread->counters[PMC30].id = 3;
-    thread->counters[PMC31].id = 4;
-    thread->counters[PMC32].id = 5;
-    thread->counters[PMC33].id = 6;
-    thread->counters[PMC34].id = 7;
-    westmereEX_PMunits[RBOX0].ctrlRegister = MSR_R0_PMON_BOX_CTRL;
-    westmereEX_PMunits[RBOX0].statusRegister =  MSR_R0_PMON_BOX_STATUS;
-    westmereEX_PMunits[RBOX0].ovflRegister = MSR_R0_PMON_BOX_OVF_CTRL;
-
-    thread->counters[PMC35].id = 0;
-    thread->counters[PMC36].id = 1;
-    thread->counters[PMC37].id = 2;
-    thread->counters[PMC38].id = 3;
-    thread->counters[PMC39].id = 4;
-    thread->counters[PMC40].id = 5;
-    thread->counters[PMC41].id = 6;
-    thread->counters[PMC42].id = 7;
-    westmereEX_PMunits[RBOX1].ctrlRegister = MSR_R1_PMON_BOX_CTRL;
-    westmereEX_PMunits[RBOX1].statusRegister =  MSR_R1_PMON_BOX_STATUS;
-    westmereEX_PMunits[RBOX1].ovflRegister = MSR_R1_PMON_BOX_OVF_CTRL;
-
-    /* WBOX */
-    thread->counters[PMC43].id = 0;
-    thread->counters[PMC44].id = 1;
-    thread->counters[PMC45].id = 2;
-    thread->counters[PMC46].id = 3;
-    thread->counters[PMC47].id = 31;
-    westmereEX_PMunits[WBOX].ctrlRegister   = MSR_W_PMON_BOX_CTRL;
-    westmereEX_PMunits[WBOX].statusRegister = MSR_W_PMON_BOX_STATUS;
-    westmereEX_PMunits[WBOX].ovflRegister   = MSR_W_PMON_BOX_OVF_CTRL;
-
-    thread->counters[PMC48].id = 0;
-    westmereEX_PMunits[UBOX].ctrlRegister   = MSR_U_PMON_GLOBAL_CTRL;
-    westmereEX_PMunits[UBOX].statusRegister = MSR_U_PMON_GLOBAL_STATUS;
-    westmereEX_PMunits[UBOX].ovflRegister   = MSR_U_PMON_GLOBAL_OVF_CTRL;
-
-    /* Set IDs for all CBOXes */
-    int walker = 0;
-    for (int i=PMC49; i<=PMC98; i++)
-    {
-        thread->counters[i].id = walker;
-        walker = (walker == 4 ? 0 : walker + 1);
-    }
-    westmereEX_PMunits[CBOX0].ctrlRegister   = MSR_C0_PMON_BOX_CTRL;
-    westmereEX_PMunits[CBOX0].statusRegister = MSR_C0_PMON_BOX_STATUS;
-    westmereEX_PMunits[CBOX0].ovflRegister   = MSR_C0_PMON_BOX_OVF_CTRL;
-    westmereEX_PMunits[CBOX1].ctrlRegister   = MSR_C1_PMON_BOX_CTRL;
-    westmereEX_PMunits[CBOX1].statusRegister = MSR_C1_PMON_BOX_STATUS;
-    westmereEX_PMunits[CBOX1].ovflRegister   = MSR_C1_PMON_BOX_OVF_CTRL;
-    westmereEX_PMunits[CBOX2].ctrlRegister   = MSR_C2_PMON_BOX_CTRL;
-    westmereEX_PMunits[CBOX2].statusRegister = MSR_C2_PMON_BOX_STATUS;
-    westmereEX_PMunits[CBOX2].ovflRegister   = MSR_C2_PMON_BOX_OVF_CTRL;
-    westmereEX_PMunits[CBOX3].ctrlRegister   = MSR_C3_PMON_BOX_CTRL;
-    westmereEX_PMunits[CBOX3].statusRegister = MSR_C3_PMON_BOX_STATUS;
-    westmereEX_PMunits[CBOX3].ovflRegister   = MSR_C3_PMON_BOX_OVF_CTRL;
-    westmereEX_PMunits[CBOX4].ctrlRegister   = MSR_C4_PMON_BOX_CTRL;
-    westmereEX_PMunits[CBOX4].statusRegister = MSR_C4_PMON_BOX_STATUS;
-    westmereEX_PMunits[CBOX4].ovflRegister   = MSR_C4_PMON_BOX_OVF_CTRL;
-    westmereEX_PMunits[CBOX5].ctrlRegister   = MSR_C5_PMON_BOX_CTRL;
-    westmereEX_PMunits[CBOX5].statusRegister = MSR_C5_PMON_BOX_STATUS;
-    westmereEX_PMunits[CBOX5].ovflRegister   = MSR_C5_PMON_BOX_OVF_CTRL;
-    westmereEX_PMunits[CBOX6].ctrlRegister   = MSR_C6_PMON_BOX_CTRL;
-    westmereEX_PMunits[CBOX6].statusRegister = MSR_C6_PMON_BOX_STATUS;
-    westmereEX_PMunits[CBOX6].ovflRegister   = MSR_C6_PMON_BOX_OVF_CTRL;
-    westmereEX_PMunits[CBOX7].ctrlRegister   = MSR_C7_PMON_BOX_CTRL;
-    westmereEX_PMunits[CBOX7].statusRegister = MSR_C7_PMON_BOX_STATUS;
-    westmereEX_PMunits[CBOX7].ovflRegister   = MSR_C7_PMON_BOX_OVF_CTRL;
-    westmereEX_PMunits[CBOX8].ctrlRegister   = MSR_C8_PMON_BOX_CTRL;
-    westmereEX_PMunits[CBOX8].statusRegister = MSR_C8_PMON_BOX_STATUS;
-    westmereEX_PMunits[CBOX8].ovflRegister   = MSR_C8_PMON_BOX_OVF_CTRL;
-    westmereEX_PMunits[CBOX9].ctrlRegister   = MSR_C9_PMON_BOX_CTRL;
-    westmereEX_PMunits[CBOX9].statusRegister = MSR_C9_PMON_BOX_STATUS;
-    westmereEX_PMunits[CBOX9].ovflRegister   = MSR_C9_PMON_BOX_OVF_CTRL;
-
-    thread->counters[PMC99].id = 0;
-    thread->counters[PMC100].id = 1;
-    thread->counters[PMC101].id = 2;
-    thread->counters[PMC102].id = 3;
-    westmereEX_PMunits[SBOX0].ctrlRegister   = MSR_S0_PMON_BOX_CTRL;
-    westmereEX_PMunits[SBOX0].statusRegister = MSR_S0_PMON_BOX_STATUS;
-    westmereEX_PMunits[SBOX0].ovflRegister   = MSR_S0_PMON_BOX_OVF_CTRL;
-    thread->counters[PMC103].id = 0;
-    thread->counters[PMC104].id = 1;
-    thread->counters[PMC105].id = 2;
-    thread->counters[PMC106].id = 3;
-    westmereEX_PMunits[SBOX1].ctrlRegister   = MSR_S1_PMON_BOX_CTRL;
-    westmereEX_PMunits[SBOX1].statusRegister = MSR_S1_PMON_BOX_STATUS;
-    westmereEX_PMunits[SBOX1].ovflRegister   = MSR_S1_PMON_BOX_OVF_CTRL;
-
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id) ||
-            lock_acquire((int*) &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id))
-    {
-        msr_write(cpu_id, MSR_W_PMON_BOX_CTRL,  0x0ULL);
-        msr_write(cpu_id, MSR_W_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_W_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_W_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_W_PMON_EVNT_SEL3, 0x0ULL);
-        msr_write(cpu_id, MSR_W_PMON_FIXED_CTR, 0x0ULL);
-
-        msr_write(cpu_id, MSR_M0_PMON_BOX_CTRL,  0x0ULL);
-        msr_write(cpu_id, MSR_M0_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_M0_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_M0_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_M0_PMON_EVNT_SEL3, 0x0ULL);
-        msr_write(cpu_id, MSR_M0_PMON_EVNT_SEL4, 0x0ULL);
-        msr_write(cpu_id, MSR_M0_PMON_EVNT_SEL5, 0x0ULL);
-
-        msr_write(cpu_id, MSR_M1_PMON_BOX_CTRL,  0x0ULL);
-        msr_write(cpu_id, MSR_M1_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_M1_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_M1_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_M1_PMON_EVNT_SEL3, 0x0ULL);
-        msr_write(cpu_id, MSR_M1_PMON_EVNT_SEL4, 0x0ULL);
-        msr_write(cpu_id, MSR_M1_PMON_EVNT_SEL5, 0x0ULL);
-
-        msr_write(cpu_id, MSR_B0_PMON_BOX_CTRL,  0x0ULL);
-        msr_write(cpu_id, MSR_B0_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_B0_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_B0_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_B0_PMON_EVNT_SEL3, 0x0ULL);
-
-        msr_write(cpu_id, MSR_B1_PMON_BOX_CTRL,  0x0ULL);
-        msr_write(cpu_id, MSR_B1_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_B1_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_B1_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_B1_PMON_EVNT_SEL3, 0x0ULL);
-
-        msr_write(cpu_id, MSR_R0_PMON_BOX_CTRL,  0x0ULL);
-        msr_write(cpu_id, MSR_R0_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_R0_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_R0_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_R0_PMON_EVNT_SEL3, 0x0ULL);
-        msr_write(cpu_id, MSR_R0_PMON_EVNT_SEL4, 0x0ULL);
-        msr_write(cpu_id, MSR_R0_PMON_EVNT_SEL5, 0x0ULL);
-        msr_write(cpu_id, MSR_R0_PMON_EVNT_SEL6, 0x0ULL);
-        msr_write(cpu_id, MSR_R0_PMON_EVNT_SEL7, 0x0ULL);
-
-        msr_write(cpu_id, MSR_R1_PMON_BOX_CTRL,   0x0ULL);
-        msr_write(cpu_id, MSR_R1_PMON_EVNT_SEL8,  0x0ULL);
-        msr_write(cpu_id, MSR_R1_PMON_EVNT_SEL9,  0x0ULL);
-        msr_write(cpu_id, MSR_R1_PMON_EVNT_SEL10, 0x0ULL);
-        msr_write(cpu_id, MSR_R1_PMON_EVNT_SEL11, 0x0ULL);
-        msr_write(cpu_id, MSR_R1_PMON_EVNT_SEL12, 0x0ULL);
-        msr_write(cpu_id, MSR_R1_PMON_EVNT_SEL13, 0x0ULL);
-        msr_write(cpu_id, MSR_R1_PMON_EVNT_SEL14, 0x0ULL);
-        msr_write(cpu_id, MSR_R1_PMON_EVNT_SEL15, 0x0ULL);
-
-        msr_write(cpu_id, MSR_U_PMON_GLOBAL_EVNT_SEL, 0x0ULL);
-
-        msr_write(cpu_id, MSR_C0_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_C0_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_C0_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_C0_PMON_EVNT_SEL3, 0x0ULL);
-        msr_write(cpu_id, MSR_C0_PMON_EVNT_SEL4, 0x0ULL);
-
-        msr_write(cpu_id, MSR_C1_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_C1_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_C1_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_C1_PMON_EVNT_SEL3, 0x0ULL);
-        msr_write(cpu_id, MSR_C1_PMON_EVNT_SEL4, 0x0ULL);
-
-        msr_write(cpu_id, MSR_C2_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_C2_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_C2_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_C2_PMON_EVNT_SEL3, 0x0ULL);
-        msr_write(cpu_id, MSR_C2_PMON_EVNT_SEL4, 0x0ULL);
-
-        msr_write(cpu_id, MSR_C3_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_C3_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_C3_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_C3_PMON_EVNT_SEL3, 0x0ULL);
-        msr_write(cpu_id, MSR_C3_PMON_EVNT_SEL4, 0x0ULL);
-
-        msr_write(cpu_id, MSR_C4_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_C4_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_C4_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_C4_PMON_EVNT_SEL3, 0x0ULL);
-        msr_write(cpu_id, MSR_C4_PMON_EVNT_SEL4, 0x0ULL);
-
-        msr_write(cpu_id, MSR_C5_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_C5_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_C5_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_C5_PMON_EVNT_SEL3, 0x0ULL);
-        msr_write(cpu_id, MSR_C5_PMON_EVNT_SEL4, 0x0ULL);
-
-        msr_write(cpu_id, MSR_C6_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_C6_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_C6_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_C6_PMON_EVNT_SEL3, 0x0ULL);
-        msr_write(cpu_id, MSR_C6_PMON_EVNT_SEL4, 0x0ULL);
-
-        msr_write(cpu_id, MSR_C7_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_C7_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_C7_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_C7_PMON_EVNT_SEL3, 0x0ULL);
-        msr_write(cpu_id, MSR_C7_PMON_EVNT_SEL4, 0x0ULL);
-
-        msr_write(cpu_id, MSR_C8_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_C8_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_C8_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_C8_PMON_EVNT_SEL3, 0x0ULL);
-        msr_write(cpu_id, MSR_C8_PMON_EVNT_SEL4, 0x0ULL);
-
-        msr_write(cpu_id, MSR_C9_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_C9_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_C9_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_C9_PMON_EVNT_SEL3, 0x0ULL);
-        msr_write(cpu_id, MSR_C9_PMON_EVNT_SEL4, 0x0ULL);
-
-        msr_write(cpu_id, MSR_S0_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_S0_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_S0_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_S0_PMON_EVNT_SEL3, 0x0ULL);
-
-        msr_write(cpu_id, MSR_S1_PMON_EVNT_SEL0, 0x0ULL);
-        msr_write(cpu_id, MSR_S1_PMON_EVNT_SEL1, 0x0ULL);
-        msr_write(cpu_id, MSR_S1_PMON_EVNT_SEL2, 0x0ULL);
-        msr_write(cpu_id, MSR_S1_PMON_EVNT_SEL3, 0x0ULL);
+    uint64_t offcore_flags = 0x0ULL; /* filled from the MATCH0/MATCH1 options in the loop below */
+
+    flags = (1ULL<<22)|(1ULL<<16); /* enable flag (bit 22) and user mode flag (bit 16) are always set */
+    /* Intel with standard 8 bit event mask: [7:0] */
+    flags |= (event->umask<<8) + event->eventId;
+
+    /* set custom cfg and cmask */
+    if ((event->cfgBits != 0) &&
+        (event->eventId != 0xB7) &&
+        (event->eventId != 0xBB))
+    { /* not applied to events 0xB7 and 0xBB */
+        flags |= ((event->cmask<<8) + event->cfgBits)<<16; /* cfgBits placed from bit 16, cmask from bit 24 */
+    }
 
+    if (event->numberOfOptions > 0)
+    {
+        for(j = 0; j < event->numberOfOptions; j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_COUNT_KERNEL:
+                    flags |= (1ULL<<17);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= (event->options[j].value & 0xFFULL)<<24; /* low 8 bits of the value go to bits 24-31 */
+                    break;
+                case EVENT_OPTION_MATCH0:
+                    offcore_flags |= (event->options[j].value & 0xFFULL); /* low 8 bits of the value go to bits 0-7 */
+                    break;
+                case EVENT_OPTION_MATCH1:
+                    offcore_flags |= (event->options[j].value & 0xF7ULL)<<8; /* mask 0xF7 drops bit 3 of the value; result goes to bits 8-15 */
+                    break;
+                default:
+                    break; /* all other option types are ignored */
+            }
+        }
+    }
+    if (event->eventId == 0xB7)
+    {
+        if ((event->cfgBits != 0xFF) && (event->cmask != 0xFF))
+        {
+            offcore_flags = (1ULL<<event->cfgBits)|(1ULL<<event->cmask);
+        }
+        VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP0, LLU_CAST offcore_flags, SETUP_PMC_OFFCORE);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP0, offcore_flags));
+    }
+    else if (event->eventId == 0xBB)
+    {
+        if ((event->cfgBits != 0xFF) && (event->cmask != 0xFF))
         {
-            uint32_t ubflags = 0x0UL;
-            ubflags |= (1<<29); /* reset all */
-            msr_write(cpu_id, MSR_U_PMON_GLOBAL_CTRL, ubflags );
+            offcore_flags = (1ULL<<event->cfgBits)|(1ULL<<event->cmask);
         }
+        VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP1, LLU_CAST offcore_flags, SETUP_PMC_OFFCORE);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP1, offcore_flags));
     }
+    VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, SETUP_PMC);
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister , flags));
+    return 0;
 }
 
-#define MBOX_GATE(NUM)  \
-    flags = 0x41ULL; \
-switch (event->cfgBits)  \
-{  \
-    case 0x00:   /* primary Event */  \
-        flags |= (event->eventId<<9);  \
-        break;  \
-    case 0x01: /* secondary Events */  \
-        /* TODO fvid index is missing defaults to 0 */   \
-        flags |= (1<<7); /* toggle flag mode */   \
-        flags |= (event->eventId<<19);   \
-        switch (event->eventId)   \
-        {   \
-            case 0x00: /* CYCLES_DSP_FILL: DSP */   \
+int wex_bbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{ /* Writes the config register of counter `index`; MATCH0/MASK0 options are written to the box's two filter registers. */
+    int j;
+    uint64_t flags = 0x0ULL;
+    RegisterType type = counter_map[index].type; /* selects the box_map entry used for the filter registers */
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+    { /* cpu_id is not the one stored in its socket lock: return 0 without writing anything */
+        return 0;
+    }
+
+    flags = 0x1ULL; /* bit 0 is always set */
+    flags |=  (event->eventId<<1); /* event id is placed from bit 1 upwards */
+    if (event->numberOfOptions > 0)
+    {
+        for(j = 0; j < event->numberOfOptions; j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_MATCH0:
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[type].filterRegister1, event->options[j].value));
+                    VERBOSEPRINTREG(cpu_id, box_map[type].filterRegister1, event->options[j].value, SETUP_BBOX_MATCH);
+                    break;
+                case EVENT_OPTION_MASK0:
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[type].filterRegister2, event->options[j].value));
+                    VERBOSEPRINTREG(cpu_id, box_map[type].filterRegister2, event->options[j].value, SETUP_BBOX_MASK);
+                    break;
+                default:
+                    break; /* all other option types are ignored */
+            }
+        }
+    }
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+    VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_BBOX);
+    return 0;
+}
+
+int wex_uncore_box_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{ /* Assembles event id, umask and EDGE/INVERT/THRESHOLD options and writes them to the config register of counter `index`. */
+    int j;
+    uint64_t flags = 0x0ULL;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+    { /* cpu_id is not the one stored in its socket lock: return 0 without writing anything */
+        return 0;
+    }
+
+    flags = (1ULL<<22); /* bit 22 is always set */
+    flags |= (event->umask<<8) + event->eventId; /* umask from bit 8, event id from bit 0 */
+    if (event->numberOfOptions > 0)
+    {
+        for(j = 0; j < event->numberOfOptions; j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= ((event->options[j].value & 0x1FULL) << 24); /* only the low 5 bits of the value are used */
+                    break;
+                default:
+                    break; /* all other option types are ignored */
+            }
+        }
+    }
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+    VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_BOX);
+    return 0;
+}
+
+
+int wex_sbox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{ /* Writes the config register of counter `index`; for event 0x0, MATCH0/MASK0 also program the filter registers and MM_CFG. */
+    int j;
+    uint64_t flags = 0x0ULL;
+    int write_mm_cfg = 0; /* set once a match or mask filter register has been written */
+    RegisterType type = counter_map[index].type;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+    { /* cpu_id is not the one stored in its socket lock: return 0 without writing anything */
+        return 0;
+    }
+
+    flags = (1ULL<<22); /* bit 22 is always set */
+    flags |= (event->umask<<8) + event->eventId; /* umask from bit 8, event id from bit 0 */
+    if (event->numberOfOptions > 0)
+    {
+        for(j = 0; j < event->numberOfOptions; j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                case EVENT_OPTION_INVERT:
+                    flags |= (1ULL<<23);
+                    break;
+                case EVENT_OPTION_THRESHOLD:
+                    flags |= ((event->options[j].value & 0xFFULL) << 24); /* low 8 bits of the value go to bits 24-31 */
+                    break;
+                case EVENT_OPTION_MATCH0:
+                    if (event->eventId == 0x0)
+                    {
+                        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[type].filterRegister1,event->options[j].value));
+                        VERBOSEPRINTREG(cpu_id, box_map[type].filterRegister1, event->options[j].value, SETUP_SBOX_MATCH);
+                        write_mm_cfg = 1;
+                    }
+                    break;
+                case EVENT_OPTION_MASK0:
+                    if (event->eventId == 0x0)
+                    {
+                        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[type].filterRegister2,event->options[j].value));
+                        VERBOSEPRINTREG(cpu_id, box_map[type].filterRegister2, event->options[j].value, SETUP_SBOX_MASK); /* trace the register that was actually written */
+                        write_mm_cfg = 1;
+                    }
+                    break;
+                default:
+                    break; /* all other option types are ignored */
+            }
+        }
+    }
+    if (write_mm_cfg && event->eventId == 0x0)
+    { /* a filter was programmed: write bit 63 to the MM_CFG register of the matching SBOX */
+        if (type == SBOX0)
+        {
+            VERBOSEPRINTREG(cpu_id, MSR_S0_PMON_MM_CFG, (1ULL<<63), SETUP_SBOX_MATCH_CTRL);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_S0_PMON_MM_CFG ,(1ULL<<63)));
+        }
+        else if (type == SBOX1)
+        {
+            VERBOSEPRINTREG(cpu_id, MSR_S1_PMON_MM_CFG, (1ULL<<63), SETUP_SBOX_MATCH_CTRL);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_S1_PMON_MM_CFG ,(1ULL<<63)));
+        }
+    }
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister, flags));
+    VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, flags, SETUP_SBOX);
+    return 0;
+}
+
+int wex_ubox_setup(int cpu_id, RegisterIndex index, PerfmonEvent *event)
+{ /* Assembles the event id and the optional EDGE flag and writes them to the config register of counter `index`. */
+    int j;
+    uint64_t flags = 0x0ULL;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id)
+    { /* cpu_id is not the one stored in its socket lock: return 0 without writing anything */
+        return 0;
+    }
+
+    flags = (1ULL<<22); /* bit 22 is always set */
+    flags |= (event->eventId & 0xFF); /* only the low 8 bits of the event id are used; umask is not read here */
+    if (event->numberOfOptions > 0)
+    {
+        for(j = 0; j < event->numberOfOptions; j++)
+        {
+            switch (event->options[j].type)
+            {
+                case EVENT_OPTION_EDGE:
+                    flags |= (1ULL<<18);
+                    break;
+                default:
+                    break; /* all other option types are ignored */
+            }
+        }
+    }
+    VERBOSEPRINTREG(cpu_id, counter_map[index].configRegister, LLU_CAST flags, UBOX_CTRL);
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter_map[index].configRegister , flags));
+    return 0;
+}
+
+
+/* MBOX macros */
+
+#define WEX_SETUP_MBOX(number)  \
+    if (haveLock && eventSet->regTypeMask & (REG_TYPE_MASK(MBOX##number))) \
+    { \
+        flags = 0x41ULL; \
+        if (event->numberOfOptions > 0 && (event->cfgBits == 0x02 || event->cfgBits == 0x04)) \
+        { \
+            for (int j=0; j < event->numberOfOptions; j++) \
+            {\
+                switch (event->options[j].type) \
+                { \
+                    case EVENT_OPTION_MATCH0: \
+                        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M##number##_PMON_ADDR_MATCH, (event->options[j].value & 0x3FFFFFFFFULL))); \
+                        VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_ZDP, (event->options[j].value & 0x3FFFFFFFFULL), MBOX##number##_ADDR_MATCH) \
+                        break; \
+                    case EVENT_OPTION_MASK0: \
+                        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M##number##_PMON_ADDR_MASK, (event->options[j].value & 0x1FFFFFFC0ULL)>>6)); \
+                        VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_ZDP, (event->options[j].value & 0x1FFFFFFC0ULL)>>6, MBOX##number##_ADDR_MASK) \
+                        break; \
+                    default: \
+                        break; \
+                } \
+            } \
+        } \
+        switch (event->cfgBits)  \
+        {  \
+            case 0x00:   /* primary Event */  \
+                flags |= (event->eventId & 0x1FULL)<<9;  \
+                break;  \
+            case 0x01: /* secondary Events */  \
+                /* TODO fvid index is missing defaults to 0 */   \
+                flags |= (1ULL<<7); /* toggle flag mode */   \
+                flags |= (event->eventId & 0x7ULL)<<19;   \
+                switch (event->eventId)   \
+                {   \
+                    case 0x00: /* CYCLES_DSP_FILL: DSP */   \
+                        {   \
+                            uint64_t dsp_flags = 0x0ULL;   \
+                            CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_M##number##_PMON_DSP, &dsp_flags));   \
+                            VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_DSP, dsp_flags, MBOX##number##_DSP); \
+                            dsp_flags |= (event->umask & 0xFULL)<<7;  \
+                            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M##number##_PMON_DSP, dsp_flags));   \
+                            VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_DSP, dsp_flags, MBOX##number##_DSP); \
+                        }   \
+                        break;   \
+                    case 0x01: /* CYCLES_SCHED_MODE: ISS */   \
+                        {   \
+                          uint64_t iss_flags = 0x0ULL;   \
+                          CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_M##number##_PMON_ISS, &iss_flags));   \
+                          VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_ISS, iss_flags, MBOX##number##_ISS_READ); \
+                          iss_flags |= (event->umask & 0x7ULL)<<4;   \
+                          CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M##number##_PMON_ISS, iss_flags));   \
+                          VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_ISS, iss_flags, MBOX##number##_ISS); \
+                        }    \
+                        break;   \
+                    case 0x05: /* CYCLES_PGT_STATE: PGT */   \
+                        {   \
+                         uint64_t pgt_flags = 0x0ULL;   \
+                         CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_M##number##_PMON_PGT, &pgt_flags));   \
+                         VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_PGT, pgt_flags, MBOX##number##_PGT_READ); \
+                         pgt_flags |= (event->umask & 0x1ULL)<<6;   \
+                         CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M##number##_PMON_PGT, pgt_flags));   \
+                         VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_PGT, pgt_flags, MBOX##number##_PGT); \
+                        }    \
+                        break;   \
+                    case 0x06: /* BCMD_SCHEDQ_OCCUPANCY: MAP */   \
+                        {   \
+                          uint64_t map_flags = 0x0ULL;   \
+                          CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_M##number##_PMON_MAP, &map_flags));   \
+                          VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_MAP, map_flags, MBOX##number##_MAP_READ); \
+                          map_flags |= (event->umask & 0x7ULL)<<6;   \
+                          CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M##number##_PMON_MAP, map_flags));   \
+                          VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_MAP, map_flags, MBOX##number##_MAP); \
+                        }   \
+                        break;   \
+                }    \
+                break;   \
+            case 0x02: /* DRAM_CMD: PLD/ISS */   \
+                flags |= (event->eventId & 0x1FULL)<<9;   \
+                {   \
+                    uint64_t pld_flags = 0x0ULL;   \
+                    uint64_t iss_flags = 0x0ULL;   \
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_M##number##_PMON_PLD, &pld_flags));   \
+                    VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_PLD, pld_flags, MBOX##number##_PLD_READ); \
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_M##number##_PMON_ISS, &iss_flags));   \
+                    VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_ISS, iss_flags, MBOX##number##_ISS_READ); \
+                    pld_flags |= (event->umask & 0x1FULL)<<8;   \
+                    if ((event->cmask & 0xFULL) != 0)   \
+                    {   \
+                        iss_flags |= (event->cmask & 0x7ULL)<<7;   \
+                    }   \
+                    if ((event->cmask & 0xF0ULL) != 0) \
+                    { \
+                        pld_flags |= (1ULL<<0); /* toggle cmd flag */   \
+                    } \
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M##number##_PMON_PLD, pld_flags));   \
+                    VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_PLD, pld_flags, MBOX##number##_PLD); \
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M##number##_PMON_ISS, iss_flags));   \
+                    VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_ISS, iss_flags, MBOX##number##_ISS); \
+                }   \
+                break;   \
+            case 0x03: /* DSP_FILL: DSP */   \
+                flags |= (event->eventId & 0x1FULL)<<9;   \
                 {   \
                     uint64_t dsp_flags = 0x0ULL;   \
-                    dsp_flags |= (event->umask<<7);  \
-                    msr_write(cpu_id, MSR_M##NUM##_PMON_DSP, dsp_flags);   \
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_M##number##_PMON_DSP, &dsp_flags));   \
+                    VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_DSP, dsp_flags, MBOX##number##_DSP_READ); \
+                    dsp_flags |= (event->umask & 0xFULL)<<7;   \
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M##number##_PMON_DSP, dsp_flags));   \
+                    VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_DSP, dsp_flags, MBOX##number##_DSP); \
                 }   \
                 break;   \
-            case 0x01: /* CYCLES_SCHED_MODE: ISS */   \
+            case 0x04: /* DRAM_MISC: PLD */   \
+                flags |= (event->eventId & 0x1FULL)<<9;   \
                 {   \
-                    uint32_t iss_flags = 0x0UL;   \
-                    iss_flags |= (event->umask<<4);   \
-                    msr_write(cpu_id, MSR_M##NUM##_PMON_ISS, iss_flags);   \
-                }    \
+                    uint64_t pld_flags = 0x0ULL;   \
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_M##number##_PMON_PLD, &pld_flags));   \
+                    VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_PLD, pld_flags, MBOX##number##_PLD_READ); \
+                    switch (event->cmask)   \
+                    {   \
+                        case 0x0:   \
+                            pld_flags |= (1ULL<<16);   \
+                            pld_flags |= (event->umask & 0x1FULL)<<19;   \
+                            break;   \
+                        case 0x1:   \
+                            pld_flags |= (event->umask & 0x1ULL)<<18;   \
+                            break;   \
+                        case 0x2:   \
+                            pld_flags |= (event->umask & 0x1ULL)<<17;   \
+                            break;   \
+                        case 0x3:   \
+                            pld_flags |= (event->umask & 0x1ULL)<<7;   \
+                            break;   \
+                    }   \
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M##number##_PMON_PLD, pld_flags));   \
+                    VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_PLD, pld_flags, MBOX##number##_PLD); \
+                }   \
                 break;   \
-            case 0x05: /* CYCLES_PGT_STATE: PGT */   \
+            case 0x05: /* FRM_TYPE: ISS */   \
+                flags |= (event->eventId & 0x1FULL)<<9;   \
                 {   \
-                    uint32_t pgt_flags = 0x0UL;   \
-                    pgt_flags |= (event->umask<<6);   \
-                    msr_write(cpu_id, MSR_M##NUM##_PMON_PGT, pgt_flags);   \
-                }    \
+                    uint64_t iss_flags = 0x0ULL;   \
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_M##number##_PMON_ISS, &iss_flags));   \
+                    VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_ISS, iss_flags, MBOX##number##_ISS_READ); \
+                    iss_flags |= (event->umask & 0xFULL);   \
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M##number##_PMON_ISS, iss_flags));   \
+                    VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_ISS, iss_flags, MBOX##number##_ISS); \
+                }   \
                 break;   \
-            case 0x06: /* BCMD_SCHEDQ_OCCUPANCY: MAP */   \
+            case 0x06: /* FVC_EV0: FVC */   \
+                flags |= (event->eventId & 0x1FULL)<<9;   \
                 {   \
-                    uint32_t map_flags = 0x0UL;   \
-                    map_flags |= (event->umask<<6);   \
-                    msr_write(cpu_id, MSR_M##NUM##_PMON_MAP, map_flags);   \
+                    uint64_t fvc_flags = 0x0ULL;   \
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_M##number##_PMON_ZDP, &fvc_flags));   \
+                    VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_ZDP, fvc_flags, MBOX##number##_FVC_READ); \
+                    fvc_flags |= (event->umask & 0x7ULL)<<12;   \
+                    if (event->umask == 0x5)   \
+                    {   \
+                        fvc_flags |= (event->cmask & 0x7ULL)<<6;   \
+                    }   \
+                    else   \
+                    {   \
+                        fvc_flags |= (event->cmask & 0x7ULL)<<9;   \
+                    }   \
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M##number##_PMON_ZDP, fvc_flags));   \
+                    VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_ZDP, fvc_flags, MBOX##number##_FVC_EV0); \
                 }   \
                 break;   \
-        }    \
-        break;   \
-    case 0x02: /* DRAM_CMD: PLD/ISS */   \
-        flags |= (event->eventId<<9);   \
-        {   \
-            uint32_t pld_flags = 0x0UL;   \
-            uint32_t iss_flags = 0x0UL;   \
-            pld_flags |= (event->umask<<8);   \
-            if (event->cmask != 0)   \
-            {   \
-                iss_flags |= (event->cmask<<7);   \
-                pld_flags |= 1; /* toggle cmd flag */   \
-            }   \
-            msr_write(cpu_id, MSR_M##NUM##_PMON_PLD, pld_flags);   \
-            msr_write(cpu_id, MSR_M##NUM##_PMON_ISS, iss_flags);   \
-        }   \
-        break;   \
-    case 0x03: /* DSP_FILL: DSP */   \
-        flags |= (event->eventId<<9);   \
-        {   \
-            uint64_t dsp_flags = 0x0ULL;   \
-            dsp_flags |= (event->umask<<7);   \
-            msr_write(cpu_id, MSR_M##NUM##_PMON_DSP, dsp_flags);   \
-        }   \
-        break;   \
-    case 0x04: /* DRAM_MISC: PLD */   \
-        flags |= (event->eventId<<9);   \
-        {   \
-            uint64_t pld_flags = 0x0ULL;   \
-            switch (event->cmask)   \
-            {   \
-                case 0x0:   \
-                            pld_flags |= (1<<16);   \
-                pld_flags |= (event->umask<<19);   \
+            case 0x07: /* FVC_EV1: FVC */   \
+                flags |= (event->eventId & 0x1FULL)<<9;   \
+                {   \
+                    uint64_t fvc_flags = 0x0ULL;   \
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_M##number##_PMON_ZDP, &fvc_flags));   \
+                    VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_ZDP, fvc_flags, MBOX##number##_FVC_READ); \
+                    fvc_flags |= (event->umask & 0x7ULL)<<15;   \
+                    if (event->umask == 0x5)   \
+                    {   \
+                        fvc_flags |= (event->cmask & 0x7ULL)<<6;   \
+                    }   \
+                    else   \
+                    {   \
+                        fvc_flags |= (event->cmask & 0x7ULL)<<9;   \
+                    }   \
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M##number##_PMON_ZDP, fvc_flags));   \
+                    VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_ZDP, fvc_flags, MBOX##number##_FVC_EV1); \
+                }   \
                 break;   \
-                case 0x1:   \
-                            pld_flags |= (event->umask<<18);   \
+            case 0x08: /* FVC_EV2: FVC */   \
+                flags |= (event->eventId & 0x1FULL)<<9;   \
+                {   \
+                    uint64_t fvc_flags = 0x0ULL;   \
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_M##number##_PMON_ZDP, &fvc_flags));   \
+                    VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_ZDP, fvc_flags, MBOX##number##_FVC_READ); \
+                    fvc_flags |= (event->umask & 0x7ULL)<<18;   \
+                    if (event->umask == 0x5)   \
+                    {   \
+                        fvc_flags |= (event->cmask & 0x7ULL)<<6;   \
+                    }   \
+                    else   \
+                    {   \
+                        fvc_flags |= (event->cmask & 0x7ULL)<<9;   \
+                    }   \
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M##number##_PMON_ZDP, fvc_flags));   \
+                    VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_ZDP, fvc_flags, MBOX##number##_FVC_EV2); \
+                }   \
                 break;   \
-                case 0x2:   \
-                            pld_flags |= (event->umask<<17);   \
+            case 0x09: /* FVC_EV3: FVC(ZDP) */   \
+                flags |= (event->eventId & 0x1FULL)<<9;   \
+                {   \
+                    uint64_t fvc_flags = 0x0ULL;   \
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_M##number##_PMON_ZDP, &fvc_flags));   \
+                    VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_ZDP, fvc_flags, MBOX##number##_FVC_READ); \
+                    fvc_flags |= (event->umask & 0x7ULL)<<21;   \
+                    if (event->umask == 0x5)   \
+                    {   \
+                        fvc_flags |= (event->cmask & 0x7ULL)<<6;   \
+                    }   \
+                    else   \
+                    {   \
+                        fvc_flags |= (event->cmask & 0x7ULL)<<9;   \
+                    }   \
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M##number##_PMON_ZDP, fvc_flags));   \
+                    VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_ZDP, fvc_flags, MBOX##number##_FVC_EV3); \
+                }   \
                 break;   \
-                case 0x3:   \
-                            pld_flags |= (event->umask<<7);   \
+            case 0x0A: /* ISS_SCHED: ISS */   \
+                flags |= (event->eventId & 0x1FULL)<<9;   \
+                {   \
+                    uint64_t iss_flags = 0x0ULL;   \
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_M##number##_PMON_ISS, &iss_flags));   \
+                    VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_ISS, iss_flags, MBOX##number##_ISS_READ); \
+                    iss_flags |= (event->umask & 0x1ULL)<<10;   \
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M##number##_PMON_ISS, iss_flags));   \
+                    VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_ISS, iss_flags, MBOX##number##_ISS); \
+                }   \
                 break;   \
-            }   \
-            msr_write(cpu_id, MSR_M##NUM##_PMON_PLD, pld_flags);   \
-        }   \
-        break;   \
-    case 0x05: /* FRM_TYPE: ISS */   \
-        flags |= (event->eventId<<9);   \
-        {   \
-            uint32_t iss_flags = 0x0UL;   \
-            iss_flags |= event->umask;   \
-            msr_write(cpu_id, MSR_M##NUM##_PMON_ISS, iss_flags);   \
-        }   \
-        break;   \
-    case 0x06: /* FVC_EV0: FVC */   \
-        flags |= (event->eventId<<9);   \
-        {   \
-            uint32_t fvc_flags = 0x0UL;   \
-            fvc_flags |= (event->umask<<12);   \
-            if (event->umask == 0x5)   \
-            {   \
-                fvc_flags |= (event->cmask<<6);   \
-            }   \
-            else   \
-            {   \
-                fvc_flags |= (event->cmask<<9);   \
-            }   \
-            msr_write(cpu_id, MSR_M##NUM##_PMON_ZDP, fvc_flags);   \
-            VERBOSEPRINTREG(cpu_id, MSR_M##NUM##_PMON_ZDP, fvc_flags, FVC_EV0) \
-        }   \
-        break;   \
-    case 0x07: /* FVC_EV1: FVC */   \
-        flags |= (event->eventId<<9);   \
-        {   \
-            uint32_t fvc_flags = 0x0UL;   \
-            fvc_flags |= (event->umask<<15);   \
-            if (event->umask == 0x5)   \
-            {   \
-                fvc_flags |= (event->cmask<<6);   \
-            }   \
-            else   \
-            {   \
-                fvc_flags |= (event->cmask<<9);   \
-            }   \
-            msr_write(cpu_id, MSR_M##NUM##_PMON_ZDP, fvc_flags);   \
-            VERBOSEPRINTREG(cpu_id, MSR_M##NUM##_PMON_ZDP, fvc_flags, FVC_EV1) \
-        }   \
-        break;   \
-    case 0x08: /* FVC_EV2: FVC */   \
-        flags |= (event->eventId<<9);   \
-        {   \
-            uint32_t fvc_flags = 0x0UL;   \
-            fvc_flags |= (event->umask<<18);   \
-            if (event->umask == 0x5)   \
-            {   \
-                fvc_flags |= (event->cmask<<6);   \
-            }   \
-            else   \
-            {   \
-                fvc_flags |= (event->cmask<<9);   \
-            }   \
-            msr_write(cpu_id, MSR_M##NUM##_PMON_ZDP, fvc_flags);   \
-            VERBOSEPRINTREG(cpu_id, MSR_M##NUM##_PMON_ZDP, fvc_flags, FVC_EV2) \
-        }   \
-        break;   \
-    case 0x09: /* FVC_EV3: FVC(ZDP) */   \
-        flags |= (event->eventId<<9);   \
-        {   \
-            uint32_t fvc_flags = 0x0UL;   \
-            fvc_flags |= (event->umask<<21);   \
-            if (event->umask == 0x5)   \
-            {   \
-                fvc_flags |= (event->cmask<<6);   \
-            }   \
-            else   \
-            {   \
-                fvc_flags |= (event->cmask<<9);   \
-            }   \
-            msr_write(cpu_id, MSR_M##NUM##_PMON_ZDP, fvc_flags);   \
-        }   \
-        break;   \
-    case 0x0A: /* ISS_SCHED: ISS */   \
-        flags |= (event->eventId<<9);   \
-        {   \
-            uint32_t iss_flags = 0x0UL;   \
-            iss_flags |= (event->umask<<10);   \
-            msr_write(cpu_id, MSR_M##NUM##_PMON_ISS, iss_flags);   \
-        }   \
-        break;   \
-    case 0x0B: /* PGT_PAGE_EV: PGT */   \
-        flags |= (event->eventId<<9);   \
-        {   \
-            uint32_t pgt_flags = 0x0UL;   \
-            pgt_flags |= event->umask;   \
-            msr_write(cpu_id, MSR_M##NUM##_PMON_PGT, pgt_flags);   \
-        }   \
-        break;   \
-    case 0x0C: /* PGT_PAGE_EV2: PGT */   \
-        flags |= (event->eventId<<9);   \
-        {   \
-            uint32_t pgt_flags = 0x0UL;   \
-            pgt_flags |= (event->umask<<11);   \
-            msr_write(cpu_id, MSR_M##NUM##_PMON_PGT, pgt_flags);   \
-        }   \
-        break;   \
-    case 0x0D: /* THERM_TRP_DN: THR */   \
-        flags |= (event->eventId<<9);   \
-        {   \
-            uint32_t thr_flags = 0x0UL;   \
-            thr_flags |= (1<<3);   \
-            thr_flags |= (event->umask<<9);   \
-            msr_write(cpu_id, MSR_M##NUM##_PMON_PGT, thr_flags);   \
-        }   \
-        break;   \
-}
+            case 0x0B: /* PGT_PAGE_EV: PGT */   \
+                flags |= (event->eventId & 0x1FULL)<<9;   \
+                {   \
+                    uint64_t pgt_flags = 0x0ULL;   \
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_M##number##_PMON_PGT, &pgt_flags));   \
+                    VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_PGT, pgt_flags, MBOX##number##_PGT_READ); \
+                    pgt_flags |= (event->umask & 0x1ULL);   \
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M##number##_PMON_PGT, pgt_flags));   \
+                    VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_PGT, pgt_flags, MBOX##number##_PGT); \
+                }   \
+                break;   \
+            case 0x0C: /* PGT_PAGE_EV2: PGT */   \
+                flags |= (event->eventId & 0x1FULL)<<9;   \
+                {   \
+                    uint64_t pgt_flags = 0x0ULL;   \
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_M##number##_PMON_PGT, &pgt_flags));   \
+                    VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_PGT, pgt_flags, MBOX##number##_PGT_READ); \
+                    pgt_flags |= (event->umask & 0x1ULL)<<11;   \
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M##number##_PMON_PGT, pgt_flags));   \
+                    VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_PGT, pgt_flags, MBOX##number##_PGT); \
+                }   \
+                break;   \
+            case 0x0D: /* THERM_TRP_DN: THR */   \
+                flags |= (event->eventId & 0x1FULL)<<9;   \
+                {   \
+                    uint64_t thr_flags = 0x0ULL;   \
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_M##number##_PMON_MSC_THR, &thr_flags));   \
+                    VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_MSC_THR, thr_flags, MBOX##number##_PGT_READ); \
+                    thr_flags |= (event->umask & 0x3ULL)<<9;   \
+                    if (event->cmask == 0x0) \
+                    { \
+                        thr_flags |= (1ULL<<3);   \
+                    } \
+                    else \
+                    { \
+                        thr_flags &= ~(1ULL<<3);   \
+                        thr_flags |= (event->cmask & 0x7ULL)<<4;   \
+                    } \
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M##number##_PMON_MSC_THR, thr_flags));   \
+                    VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_MSC_THR, thr_flags, MBOX##number##_THR); \
+                }   \
+                break;   \
+            case 0x0E: /* THERM_TRP_UP: THR */   \
+                flags |= (event->eventId & 0x1FULL)<<9;   \
+                if (event->cmask == 0x0) \
+                {   \
+                    uint64_t thr_flags = 0x0ULL;   \
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_M##number##_PMON_MSC_THR, &thr_flags));   \
+                    VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_MSC_THR, thr_flags, MBOX##number##_PGT_READ); \
+                    thr_flags |= (event->umask & 0x3ULL)<<7;   \
+                    if (event->cmask == 0x0) \
+                    { \
+                        thr_flags |= (1ULL<<3);   \
+                    } \
+                    else \
+                    { \
+                        thr_flags &= ~(1ULL<<3);   \
+                        thr_flags |= (event->cmask & 0x7ULL)<<4;   \
+                    } \
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M##number##_PMON_MSC_THR, thr_flags));   \
+                    VERBOSEPRINTREG(cpu_id, MSR_M##number##_PMON_MSC_THR, thr_flags, MBOX##number##_THR); \
+                }   \
+                break;   \
+        } \
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg , flags)); \
+    }
 
 /* RBOX macros */
-#define RBOX_GATE(NUM)  \
-    flags = 0x01ULL; /* set local enable flag */ \
-switch (event->eventId) {  \
-    case 0x00:  \
-                flags |= (event->umask<<1); /* configure sub register */   \
-    {  \
-        uint32_t iperf_flags = 0x0UL;   \
-        iperf_flags |= (event->cfgBits<<event->cmask); /* configure event */  \
-        switch (event->umask) { /* pick correct iperf register */  \
-            case 0x00: \
-                       msr_write(cpu_id, MSR_R##NUM##_PMON_IPERF0_P0, iperf_flags);   \
-            break; \
-            case 0x01: \
-                       msr_write(cpu_id, MSR_R##NUM##_PMON_IPERF1_P0, iperf_flags);   \
-            break; \
-            case 0x06: \
-                       msr_write(cpu_id, MSR_R##NUM##_PMON_IPERF0_P1, iperf_flags);   \
-            break; \
-            case 0x07: \
-                       msr_write(cpu_id, MSR_R##NUM##_PMON_IPERF1_P1, iperf_flags);   \
-            break; \
-            case 0x0C: \
-                       msr_write(cpu_id, MSR_R##NUM##_PMON_IPERF0_P2, iperf_flags);   \
-            break; \
-            case 0x0D: \
-                       msr_write(cpu_id, MSR_R##NUM##_PMON_IPERF1_P2, iperf_flags);   \
-            break; \
-            case 0x12: \
-                       msr_write(cpu_id, MSR_R##NUM##_PMON_IPERF0_P3, iperf_flags);   \
-            break; \
-            case 0x13: \
-                       msr_write(cpu_id, MSR_R##NUM##_PMON_IPERF1_P3, iperf_flags);   \
-            break; \
-        } } \
-    break; \
-    case 0x01: \
-               flags |= (event->umask<<1); /* configure sub register */   \
+#define WEX_SETUP_RBOX(number)  /* Program one RBOX##number event: write its IPERF/QLX sub-register, then write flags to reg. Uses haveLock, eventSet, event, flags, reg and cpu_id from the expansion site. */ \
+    if (haveLock && eventSet->regTypeMask & (REG_TYPE_MASK(RBOX##number))) \
     { \
-        uint32_t qlx_flags = 0x0UL;   \
-        qlx_flags |= (event->cfgBits); /* configure event */  \
-        if (event->cmask) qlx_flags |= (event->cmask<<4);  \
-        switch (event->umask) { /* pick correct qlx register */  \
-            case 0x02: \
-                       msr_write(cpu_id, MSR_R##NUM##_PMON_QLX_P0, qlx_flags);   \
-            break; \
-            case 0x03: \
-                       msr_write(cpu_id, MSR_R##NUM##_PMON_QLX_P0, (qlx_flags<<8));   \
-            break; \
-            case 0x08: \
-                       msr_write(cpu_id, MSR_R##NUM##_PMON_QLX_P0, qlx_flags);   \
-            break; \
-            case 0x09: \
-                       msr_write(cpu_id, MSR_R##NUM##_PMON_QLX_P1, (qlx_flags<<8));   \
-            break; \
-            case 0x0E: \
-                       msr_write(cpu_id, MSR_R##NUM##_PMON_QLX_P0, qlx_flags);   \
-            break; \
-            case 0x0F: \
-                       msr_write(cpu_id, MSR_R##NUM##_PMON_QLX_P2, (qlx_flags<<8));   \
-            break; \
-            case 0x14: \
-                       msr_write(cpu_id, MSR_R##NUM##_PMON_QLX_P0, qlx_flags);   \
-            break; \
-            case 0x15: \
-                       msr_write(cpu_id, MSR_R##NUM##_PMON_QLX_P3, (qlx_flags<<8));   \
-            break; \
-        } } \
-    break; \
+        flags = 0x01ULL; /* set local enable flag */ \
+        switch (event->eventId) {  \
+            case 0x00:  /* eventId 0x00: the event is configured in an IPERF register */ \
+                flags |= (event->umask & 0x1FULL)<<1; /* configure sub register */   \
+                {  \
+                    uint64_t iperf_flags = 0x0ULL;   \
+                    iperf_flags |= (event->cfgBits<<event->cmask); /* configure event */  \
+                    switch (event->umask) { /* pick correct iperf register */  \
+                        case 0x00: \
+                            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R##number##_PMON_IPERF0_P0, iperf_flags));   \
+                            break; \
+                        case 0x01: \
+                            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R##number##_PMON_IPERF1_P0, iperf_flags));   \
+                            break; \
+                        case 0x06: \
+                            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R##number##_PMON_IPERF0_P1, iperf_flags));   \
+                            break; \
+                        case 0x07: \
+                            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R##number##_PMON_IPERF1_P1, iperf_flags));   \
+                            break; \
+                        case 0x0C: \
+                            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R##number##_PMON_IPERF0_P2, iperf_flags));   \
+                            break; \
+                        case 0x0D: \
+                            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R##number##_PMON_IPERF1_P2, iperf_flags));   \
+                            break; \
+                        case 0x12: \
+                            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R##number##_PMON_IPERF0_P3, iperf_flags));   \
+                            break; \
+                        case 0x13: \
+                            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R##number##_PMON_IPERF1_P3, iperf_flags));   \
+                            break; \
+                    } \
+                } \
+                break; \
+            case 0x01: /* eventId 0x01: the event is configured in a QLX register */ \
+                flags |= (event->umask & 0x1FULL)<<1; /* configure sub register */   \
+                { \
+                    uint64_t qlx_flags = 0x0ULL;   \
+                    qlx_flags |= (event->cfgBits & 0xFULL); /* configure event */  \
+                    if (event->cmask) qlx_flags |= (event->cmask & 0xFULL)<<4;  \
+                    switch (event->umask) { /* pick correct qlx register */  \
+                        case 0x02: \
+                            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R##number##_PMON_QLX_P0, qlx_flags));   \
+                            break; \
+                        case 0x03: \
+                            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R##number##_PMON_QLX_P0, (qlx_flags<<8))); /* odd umask: same register as the preceding even umask, value shifted up by 8 bits */   \
+                            break; \
+                        case 0x08: \
+                            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R##number##_PMON_QLX_P1, qlx_flags));   \
+                            break; \
+                        case 0x09: \
+                            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R##number##_PMON_QLX_P1, (qlx_flags<<8)));   \
+                            break; \
+                        case 0x0E: \
+                            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R##number##_PMON_QLX_P2, qlx_flags));   \
+                            break; \
+                        case 0x0F: \
+                            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R##number##_PMON_QLX_P2, (qlx_flags<<8)));   \
+                            break; \
+                        case 0x14: \
+                            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R##number##_PMON_QLX_P3, qlx_flags));   \
+                            break; \
+                        case 0x15: \
+                            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R##number##_PMON_QLX_P3, (qlx_flags<<8)));   \
+                            break; \
+                    } \
+                } \
+                break; \
+        } \
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg , flags)); /* reg comes from the expansion site - presumably the counter's config register; confirm at the call site */ \
+    }
+
+
+int wex_uncore_freeze(int cpu_id, PerfmonEventSet* eventSet, int flags)
+{ /* Clears bit 28 of MSR_U_PMON_GLOBAL_CTRL when uncore units are in use, then applies the optional clear action selected by flags. Returns 0 on every path visible here. */
+    uint64_t freeze_flags = 0x0ULL;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id) /* only the CPU recorded in socket_lock for this node touches the registers */
+    {
+        return 0;
+    }
+    if (eventSet->regTypeMask & ~(0xF)) /* some register type above the lowest four mask bits is in the event set */
+    {
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_CTRL, &freeze_flags));
+        freeze_flags &= ~(1ULL<<28); /* read-modify-write: drop only bit 28 (set again by the unfreeze counterpart) */
+        VERBOSEPRINTREG(cpu_id, MSR_U_PMON_GLOBAL_CTRL, LLU_CAST freeze_flags, FREEZE_UNCORE);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_CTRL, freeze_flags));
+    }
+    if (flags != FREEZE_FLAG_ONLYFREEZE)
+    {
+        if (flags & FREEZE_FLAG_CLEAR_CTR)
+        {
+            uint64_t clear_flags = 0x0ULL;
+            CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_CTRL, &clear_flags));
+            clear_flags |= (1ULL<<29); /* set bit 29 - presumably the counter reset bit; confirm against the uncore guide */
+            VERBOSEPRINTREG(cpu_id, MSR_U_PMON_GLOBAL_CTRL, LLU_CAST clear_flags, CLEAR_UNCORE_CTR);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_CTRL, clear_flags)); /* write the value just modified, not freeze_flags */
+        }
+        else if (flags & FREEZE_FLAG_CLEAR_CTL)
+        {
+            for (int i=0;i < eventSet->numberOfEvents;i++)
+            {
+                uint32_t reg = counter_map[eventSet->events[i].index].configRegister;
+                if (reg != 0x0ULL) /* skip counters whose map entry has no config register */
+                {
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg, 0x0ULL));
+                    VERBOSEPRINTREG(cpu_id, reg, 0x0ULL, CLEAR_UNCORE_CTL);
+                }
+            }
+        }
+
+    }
+    return 0;
+}
+
+int wex_uncore_unfreeze(int cpu_id, PerfmonEventSet* eventSet, int flags)
+{ /* Applies the optional clear action selected by flags, then sets bit 28 of MSR_U_PMON_GLOBAL_CTRL when uncore units are in use. Returns 0 on every path visible here. */
+    uint64_t unfreeze_flags = 0x0ULL;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] != cpu_id) /* only the CPU recorded in socket_lock for this node touches the registers */
+    {
+        return 0;
+    }
+    if (flags != FREEZE_FLAG_ONLYFREEZE)
+    {
+        if (flags & FREEZE_FLAG_CLEAR_CTR)
+        {
+            uint64_t clear_flags = 0x0ULL;
+            CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_CTRL, &clear_flags));
+            clear_flags |= (1ULL<<29); /* set bit 29 - presumably the counter reset bit; confirm against the uncore guide */
+            VERBOSEPRINTREG(cpu_id, MSR_U_PMON_GLOBAL_CTRL, LLU_CAST clear_flags, CLEAR_UNCORE_CTR);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_CTRL, clear_flags));
+        }
+        else if (flags & FREEZE_FLAG_CLEAR_CTL)
+        {
+            for (int i=0;i < eventSet->numberOfEvents;i++)
+            {
+                uint32_t reg = counter_map[eventSet->events[i].index].configRegister;
+                if (reg != 0x0ULL) /* skip counters whose map entry has no config register */
+                {
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg, 0x0ULL));
+                    VERBOSEPRINTREG(cpu_id, reg, 0x0ULL, CLEAR_UNCORE_CTL);
+                }
+            }
+        }
+    }
+    if (eventSet->regTypeMask & ~(0xF)) /* some register type above the lowest four mask bits is in the event set */
+    {
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_CTRL, &unfreeze_flags));
+        unfreeze_flags |= (1ULL<<28); /* read-modify-write: set only bit 28, the bit the freeze counterpart clears */
+        VERBOSEPRINTREG(cpu_id, MSR_U_PMON_GLOBAL_CTRL, LLU_CAST unfreeze_flags, UNFREEZE_UNCORE);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_CTRL, unfreeze_flags));
+    }
+    return 0;
 }
 
+#define WEX_RESET_OVF_BOX(id) /* Write 0xFFFFFFFF to box id's overflow register when haveLock is set and the box's bit is in regTypeMask. */ \
+    if (haveLock && eventSet->regTypeMask & (REG_TYPE_MASK(id))) /* '&' binds tighter than '&&'; haveLock, eventSet and cpu_id come from the expansion site */ \
+    { \
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[id].ovflRegister, 0xFFFFFFFF)); /* presumably write-1-to-clear for the overflow bits - confirm against the uncore guide */ \
+    }
 
 
-void perfmon_setupCounterThread_westmereEX(
-        int thread_id,
-        PerfmonEvent* event,
-        PerfmonCounterIndex index)
+int perfmon_setupCounterThread_westmereEX(int thread_id, PerfmonEventSet* eventSet)
 {
     int haveLock = 0;
     uint64_t flags = 0x0ULL;
-    uint64_t reg = westmereEX_counter_map[index].configRegister;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
-    uint64_t fixed_flags = msr_read(cpu_id, MSR_PERF_FIXED_CTR_CTRL);
-    perfmon_threadData[thread_id].counters[index].init = TRUE;
+    uint64_t fixed_flags = 0x0ULL;
+    uint64_t ubox_flags = 0x0ULL;
+    int cpu_id = groupSet->threads[thread_id].processorId;
+    uint32_t uflags[NUM_UNITS] = { [0 ... NUM_UNITS-1] = 0x0U };
 
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
     {
         haveLock = 1;
     }
 
-    switch (westmereEX_counter_map[index].type)
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(FIXED)|REG_TYPE_MASK(PMC)))
     {
-        case PMC:
-            flags = (1<<22)|(1<<16);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, 0x0ULL));
+    }
 
-            /* Intel with standard 8 bit event mask: [7:0] */
-            flags |= (event->umask<<8) + event->eventId;
+    if (haveLock && (eventSet->regTypeMask & ~(0xF)))
+    {
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_CTRL, 0x0ULL));
+    }
 
-            if (event->cfgBits != 0) /* set custom cfg and cmask */
-            {
-                flags &= ~(0xFFFFU<<16);  /* clear upper 16bits */
-                flags |= ((event->cmask<<8) + event->cfgBits)<<16;
-            }
+    if (haveLock && (eventSet->regTypeMask & (REG_TYPE_MASK(MBOX0))))
+    {
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M0_PMON_TIMESTAMP, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M0_PMON_DSP, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M0_PMON_ISS, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M0_PMON_MAP, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M0_PMON_MSC_THR, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M0_PMON_PGT, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M0_PMON_PLD, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M0_PMON_ZDP, 0x0ULL));
+    }
+    if (haveLock && (eventSet->regTypeMask & (REG_TYPE_MASK(MBOX1))))
+    {
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M1_PMON_TIMESTAMP, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M1_PMON_DSP, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M1_PMON_ISS, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M1_PMON_MAP, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M1_PMON_MSC_THR, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M1_PMON_PGT, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M1_PMON_PLD, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_M1_PMON_ZDP, 0x0ULL));
+    }
+    if (haveLock && (eventSet->regTypeMask & (REG_TYPE_MASK(RBOX0))))
+    {
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_IPERF0_P0, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_IPERF0_P1, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_IPERF0_P2, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_IPERF0_P3, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_IPERF1_P0, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_IPERF1_P1, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_IPERF1_P2, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_IPERF1_P3, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_QLX_P0, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_QLX_P1, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_QLX_P2, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R0_PMON_QLX_P3, 0x0ULL));
+    }
+    if (haveLock && (eventSet->regTypeMask & (REG_TYPE_MASK(RBOX1))))
+    {
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_IPERF0_P0, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_IPERF0_P1, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_IPERF0_P2, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_IPERF0_P3, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_IPERF1_P0, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_IPERF1_P1, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_IPERF1_P2, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_IPERF1_P3, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_QLX_P0, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_QLX_P1, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_QLX_P2, 0x0ULL));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_R1_PMON_QLX_P3, 0x0ULL));
+    }
 
-            msr_write(cpu_id, reg , flags);
-            VERBOSEPRINTREG(cpu_id, reg, flags, PMC_EV_SEL)
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        RegisterType type = eventSet->events[i].type;
+        if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type))))
+        {
+            continue;
+        }
+        RegisterIndex index = eventSet->events[i].index;
+        PerfmonEvent *event = &(eventSet->events[i].event);
+        uint64_t reg = counter_map[index].configRegister;
+        eventSet->events[i].threadCounter[thread_id].init = TRUE;
+        flags = 0x0ULL;
+        switch (type)
+        {
+            case PMC:
+                wex_pmc_setup(cpu_id, index, event);
                 break;
 
-        case FIXED:
-            fixed_flags |= (0x2 <<(index*4));
-            msr_write(cpu_id, MSR_PERF_FIXED_CTR_CTRL, fixed_flags);
-            break;
+            case FIXED:
+                fixed_flags |= wex_fixed_setup(cpu_id, index, event);
+                break;
 
-        case MBOX0:
-            if (haveLock)
-            {
-                MBOX_GATE(0);
-                msr_write(cpu_id, reg , flags);
+            case MBOX0:
+                WEX_SETUP_MBOX(0);
                 VERBOSEPRINTREG(cpu_id, reg, flags, MBOX0_CTRL)
-            }
-            break;
+                break;
 
-        case MBOX1:
-            if (haveLock)
-            {
-                MBOX_GATE(1);
-                msr_write(cpu_id, reg , flags);
+            case MBOX1:
+                WEX_SETUP_MBOX(1);
                 VERBOSEPRINTREG(cpu_id, reg, flags, MBOX1_CTRL)
-            }
-            break;
-
-        case BBOX0:
+                break;
 
-        case BBOX1:
-            if (haveLock)
-            {
-                flags = 0x1ULL; /* set enable bit */
-                flags |=  (event->eventId<<1);
-                msr_write(cpu_id, reg , flags);
-                VERBOSEPRINTREG(cpu_id, reg, flags, BBOX_CTRL)
-            }
-            break;
+            case BBOX0:
+            case BBOX1:
+                wex_bbox_setup(cpu_id, index, event);
+                break;
 
-        case RBOX0:
-            if (haveLock)
-            {
-                RBOX_GATE(0);
-                msr_write(cpu_id, reg , flags);
+            case RBOX0:
+                WEX_SETUP_RBOX(0)
                 VERBOSEPRINTREG(cpu_id, reg, flags, RBOX0_CTRL)
-            }
-            break;
+                break;
 
-        case RBOX1:
-            if (haveLock)
-            {
-                RBOX_GATE(1);
-                msr_write(cpu_id, reg , flags);
+            case RBOX1:
+                WEX_SETUP_RBOX(1)
                 VERBOSEPRINTREG(cpu_id, reg, flags, RBOX1_CTRL)
-            }
-            break;
+                break;
 
-        case WBOX:
-            if (haveLock)
-            {
-                if (event->eventId == 0xFF)  /* Fixed Counter */
-                {
-                    flags = 0x1ULL; /* set enable bit */
-                }
-                else
+            case WBOX:
+            case CBOX0:
+            case CBOX1:
+            case CBOX2:
+            case CBOX3:
+            case CBOX4:
+            case CBOX5:
+            case CBOX6:
+            case CBOX7:
+            case CBOX8:
+            case CBOX9:
+                wex_uncore_box_setup(cpu_id, index, event);
+                break;
+
+            case WBOX0FIX:
+                if (haveLock && eventSet->regTypeMask & (REG_TYPE_MASK(WBOX0FIX)))
                 {
-                    flags |= (1<<22); /* set enable bit */
-                    flags |= (event->umask<<8) + event->eventId;
+                    flags = 0x1;
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg , flags));
+                    VERBOSEPRINTREG(cpu_id, reg, LLU_CAST flags, WBOX0FIX_CTRL);
+                    eventSet->regTypeMask |= REG_TYPE_MASK(WBOX);
                 }
-                msr_write(cpu_id, reg , flags);
-                VERBOSEPRINTREG(cpu_id, reg, flags, WBOX_CTRL)
-            }
-            break;
+                break;
 
-        case UBOX:
-            if (haveLock)
-            {
-                flags = 0x0ULL;
-                flags |= (1<<22);
-                flags |= (event->eventId);
-                msr_write(cpu_id, reg , flags);
-            }
+            case UBOX:
+                wex_ubox_setup(cpu_id, index, event);
+                ubox_flags = 0x1ULL;
 
-        case CBOX0:
-        case CBOX1:
-        case CBOX2:
-        case CBOX3:
-        case CBOX4:
-        case CBOX5:
-        case CBOX6:
-        case CBOX7:
-        case CBOX8:
-        case CBOX9:
-        case SBOX0:
-        case SBOX1:
-            if (haveLock)
+            case SBOX0:
+            case SBOX1:
+                wex_sbox_setup(cpu_id, index, event);
+                break;
+            default:
+                break;
+        }
+        if (type != WBOX0FIX)
+        {
+            uflags[type] |= (1U<<getCounterTypeOffset(index));
+        }
+        else
+        {
+            uflags[WBOX] |= (1<<31);
+        }
+    }
+
+    if (haveLock && (eventSet->regTypeMask & ~(0xF)))
+    {
+        for ( int i=0; i<NUM_UNITS; i++ )
+        {
+            if ((uflags[i] != 0x0ULL) && (i != WBOX0FIX))
             {
-                flags = 0x0ULL;
-                flags |= (1<<22);
-                flags |= (event->umask<<8);
-                flags |= (event->eventId);
-                msr_write(cpu_id, reg , flags);
+                VERBOSEPRINTPCIREG(cpu_id, MSR_DEV, box_map[i].ctrlRegister, uflags[i], CLEAR_CTL);
+                CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[i].ctrlRegister, uflags[i]));
+                VERBOSEPRINTPCIREG(cpu_id, MSR_DEV, box_map[i].ovflRegister, uflags[i], CLEAR_OVF_CTL);
+                CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[i].ovflRegister, uflags[i]));
             }
-            break;
+        }
+    }
 
-        default:
-            /* should never be reached */
-            break;
+    if (fixed_flags != 0x0ULL)
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_FIXED_CTR_CTRL, LLU_CAST fixed_flags, SETUP_FIXED);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_FIXED_CTR_CTRL, fixed_flags));
+    }
+    if (ubox_flags != 0x0ULL)
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_U_PMON_GLOBAL_CTRL, LLU_CAST ubox_flags, ACTIVATE_UBOX);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_CTRL, ubox_flags));
     }
+    return 0;
 }
 
 /* Actions for Performance Monitoring Session:
@@ -777,167 +999,317 @@ void perfmon_setupCounterThread_westmereEX(
  * 3) Set enable bit in global U Box control register
  * */
 
-void perfmon_startCountersThread_westmereEX(int thread_id)
+
+int perfmon_startCountersThread_westmereEX(int thread_id, PerfmonEventSet* eventSet) /* zero the PMC/FIXED counters of eventSet, call wex_uncore_unfreeze, then write the core enable mask; returns 0 when the end is reached */
 {
     int haveLock = 0;
-    uint64_t flags = 0x0ULL;
-    uint32_t uflags[NUM_UNITS];
-    int enable_ubox = 0;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
-
-    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
+    uint64_t core_ctrl_flags = 0x0ULL; /* enable bits collected below for MSR_PERF_GLOBAL_CTRL */
+    int cpu_id = groupSet->threads[thread_id].processorId;
 
     if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
     {
-        uint32_t ubflags = 0x0UL;
-        ubflags |= (1<<29); /* reset all */
         haveLock = 1;
-        //        msr_write(cpu_id, MSR_U_PMON_GLOBAL_CTRL, ubflags );
-        //       VERBOSEPRINTREG(cpu_id, MSR_U_PMON_GLOBAL_CTRL, ubflags, UBOX_GLOBAL_CTRL)
     }
 
-    for ( int i=0; i<NUM_UNITS; i++ )
-    {
-        uflags[i] = 0x0UL;
-    }
+    //wex_uncore_freeze(cpu_id, eventSet, FREEZE_FLAG_CLEAR_CTR);
 
-    for ( int i=0; i<NUM_PMC; i++ ) 
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        if (perfmon_threadData[thread_id].counters[i].init == TRUE) {
-            if (westmereEX_counter_map[i].type == PMC)
-            {
-                msr_write(cpu_id, westmereEX_counter_map[i].counterRegister , 0x0ULL);
-                flags |= (1<<(i-OFFSET_PMC));  /* enable counter */
-            }
-            else if (westmereEX_counter_map[i].type == FIXED)
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE) 
+        {
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type)))) /* register type not selected in regTypeMask: skip the event */
             {
-                msr_write(cpu_id, westmereEX_counter_map[i].counterRegister , 0x0ULL);
-                flags |= (1ULL<<(i+32));  /* enable fixed counter */
+                continue;
             }
-            else if (westmereEX_counter_map[i].type > UNCORE)
+            RegisterIndex index = eventSet->events[i].index;
+            uint64_t counter1 = counter_map[index].counterRegister;
+            switch (type)
             {
-                if(haveLock)
-                {
-                    msr_write(cpu_id, westmereEX_counter_map[i].counterRegister , 0x0ULL);
-                    uflags[westmereEX_counter_map[i].type] |=
-                        (1<<(perfmon_threadData[thread_id].counters[i].id));  /* enable uncore counter */
-                    if (westmereEX_counter_map[i].type == UBOX)
-                    {
-                        enable_ubox = 1;
-                    }
-                }
+                case PMC:
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter1, 0x0ULL)); /* reset the counter register to zero */
+                    core_ctrl_flags |= (1ULL<<(index-cpuid_info.perf_num_fixed_ctr)); /* bit position is the index minus the number of fixed counters */
+                    break;
+                case FIXED:
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, counter1, 0x0ULL)); /* reset the counter register to zero */
+                    core_ctrl_flags |= (1ULL<<(index+32)); /* fixed counter bits are placed 32 bits above the index */
+                    break;
+                default: /* all other register types get no per-counter work in this loop */
+                    break;
             }
         }
     }
 
-    VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST flags, GLOBAL_CTRL);
 
-    if (haveLock)
+    wex_uncore_unfreeze(cpu_id, eventSet, FREEZE_FLAG_CLEAR_CTR);
+
+    /* Finally enable counters */
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
     {
-        for ( int i=0; i<NUM_UNITS; i++ )
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST core_ctrl_flags, GLOBAL_CTRL);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, core_ctrl_flags));
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, (1ULL<<63)|(1ULL<<62)|core_ctrl_flags)); /* bits 63 and 62 plus one bit per enabled counter */
+    }
+    return 0;
+}
+
+#define WEX_CHECK_OVERFLOW(id, offset) /* expects i, cpu_id, thread_id, eventSet and counter_result in the expanding scope */ \
+    if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData) /* new reading is below the stored one */ \
+    { \
+        uint64_t tmp = 0x0ULL; \
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, box_map[id].statusRegister, &tmp)); \
+        if (tmp & (1ULL<<(offset))) /* status bit of this counter is set */ \
+        { \
+            eventSet->events[i].threadCounter[thread_id].overflows++; \
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[id].ovflRegister, (1ULL<<(offset)))); \
+        } \
+    }
+
+#define WEX_CLEAR_OVERFLOW(id, offset) /* NOTE(review): writes ctrlRegister, not ovflRegister - confirm this is intended */ \
+    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[id].ctrlRegister, (1ULL<<(offset))));
+
+
+#define WEX_CHECK_UNCORE_OVERFLOW(id, offset) /* expects i, cpu_id, thread_id, eventSet and counter_result in the expanding scope */ \
+    if (counter_result < eventSet->events[i].threadCounter[thread_id].counterData) /* new reading is below the stored one */ \
+    { \
+        uint64_t tmp = 0x0ULL; \
+        int check_local = 0; \
+        if (((id) == SBOX0) || ((id) == SBOX1) || ((id) == WBOX) || ((id) == UBOX)) /* these boxes are looked up in the global status register first */ \
+        { \
+            CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_STATUS, &tmp)); \
+            int gl_offset = -1; \
+            switch (id) \
+            { \
+                case UBOX: \
+                    gl_offset = 0; \
+                    break; \
+                case WBOX: \
+                    gl_offset = 1; \
+                    break; \
+                case SBOX1: \
+                    gl_offset = 2; \
+                    break; \
+                case SBOX0: \
+                    gl_offset = 3; \
+                    break; \
+                default: \
+                    break; \
+            } \
+            if ((gl_offset != -1) && (tmp & (1ULL<<gl_offset))) \
+            { \
+                check_local = 1; \
+                CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_OVF_CTRL, (1ULL<<gl_offset))); \
+            } \
+        } \
+        else /* every other box goes straight to its own status register */ \
+        { \
+            check_local = 1; \
+        } \
+        if (check_local) \
+        { \
+            CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, box_map[id].statusRegister, &tmp)); \
+            if (tmp & (1ULL<<(offset))) /* status bit of this counter is set */ \
+            { \
+                eventSet->events[i].threadCounter[thread_id].overflows++; \
+                CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, box_map[id].ovflRegister, (1ULL<<(offset)))); \
+            } \
+        } \
+    }
+
+int perfmon_stopCountersThread_westmereEX(int thread_id, PerfmonEventSet* eventSet) /* disable the core counters, call wex_uncore_freeze, then read and store every initialized counter; returns 0 when the end is reached */
+{
+    int i; /* declared at function scope; the WEX_CHECK_* macros reference i */
+    int haveLock = 0;
+    uint64_t counter_result = 0x0ULL;
+    int cpu_id = groupSet->threads[thread_id].processorId;
+
+    if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
+    {
+        haveLock = 1; /* gates the reads in the default case of the switch below */
+    }
+
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, LLU_CAST 0x0ULL, GLOBAL_CTRL);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+    }
+    wex_uncore_freeze(cpu_id, eventSet, FREEZE_FLAG_CLEAR_CTL);
+
+    for (i = 0; i < eventSet->numberOfEvents; i++)
+    {
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
         {
-            /* if counters are enabled write the according box ctrl register */
-            if (uflags[i]) 
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type)))) /* register type not selected in regTypeMask: skip the event */
             {
-                msr_write(cpu_id, westmereEX_PMunits[i].ctrlRegister, uflags[i]);
-                VERBOSEPRINTREG(cpu_id, westmereEX_PMunits[i].ctrlRegister, LLU_CAST uflags[i], BOXCTRL);
+                continue;
             }
+            counter_result = 0x0ULL; /* stays zero when no read happens below */
+            RegisterIndex index = eventSet->events[i].index;
+            uint64_t counter1 = counter_map[index].counterRegister;
+            switch (type)
+            {
+                case PMC:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                    WEX_CHECK_OVERFLOW(PMC, index-cpuid_info.perf_num_fixed_ctr);
+                    VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_PMC);
+                    break;
+                case FIXED:
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                    WEX_CHECK_OVERFLOW(PMC, index+32);
+                    VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_FIXED);
+                    break;
+                default:
+                    if(haveLock && (eventSet->regTypeMask & REG_TYPE_MASK(type)))
+                    {
+                        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                        WEX_CHECK_UNCORE_OVERFLOW(type, getCounterTypeOffset(index)); /* per-type bit offset, as in the read function; the raw index is not a valid shift count */
+                        VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_UNCORE);
+                    }
+                    break;
+            }
+            eventSet->events[i].threadCounter[thread_id].counterData = field64(counter_result, 0, box_map[type].regWidth); /* presumably keeps regWidth bits starting at bit 0 - confirm against field64 */
         }
-
-        /* set global enable flag in U BOX ctrl register */
-        uint32_t ubflags = 0x0UL;
-        ubflags |= (1<<28); /* enable all */
-        if (enable_ubox)
-        {
-            ubflags |= (1<<0);
-        }
-        VERBOSEPRINTREG(cpu_id, MSR_U_PMON_GLOBAL_CTRL, LLU_CAST ubflags, UBOX_GLOBAL_CTRL);
-        msr_write(cpu_id, MSR_U_PMON_GLOBAL_CTRL, ubflags );
     }
-    /* Finally enable counters */
-    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, flags);
-    msr_write(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, 0x30000000FULL);
+
+    return 0;
 }
 
-void perfmon_stopCountersThread_westmereEX(int thread_id)
+int perfmon_readCountersThread_westmereEX(int thread_id, PerfmonEventSet* eventSet) /* read and store every initialized counter between wex_uncore_freeze and wex_uncore_unfreeze; returns 0 when the end is reached */
 {
     int haveLock = 0;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
-
-    msr_write(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL);
+    int cpu_id = groupSet->threads[thread_id].processorId;
+    uint64_t counter_result = 0x0ULL;
+    uint64_t core_ctrl_flags = 0x0ULL; /* receives MSR_PERF_GLOBAL_CTRL below; written back at the end if non-zero */
 
     if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
     {
-        uint32_t ubflags = 0x0UL;
         haveLock = 1;
-        //        ubflags |= (1<<29); /* reset all */
-        msr_write(cpu_id, MSR_U_PMON_GLOBAL_CTRL, ubflags );
     }
 
-    for ( int i=0; i<NUM_COUNTERS_WESTMEREEX; i++ ) 
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
     {
-        if (perfmon_threadData[thread_id].counters[i].init == TRUE) 
+        CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, &core_ctrl_flags));
+    }
+    wex_uncore_freeze(cpu_id, eventSet, FREEZE_FLAG_ONLYFREEZE);
+
+    for (int i=0;i < eventSet->numberOfEvents;i++)
+    {
+        if (eventSet->events[i].threadCounter[thread_id].init == TRUE)
         {
-            if (westmereEX_counter_map[i].type > UNCORE)
+            RegisterType type = eventSet->events[i].type;
+            if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type)))) /* register type not selected in regTypeMask: skip the event */
+            {
+                continue;
+            }
+            counter_result = 0x0ULL; /* stays zero when no read happens below */
+            RegisterIndex index = eventSet->events[i].index;
+            uint64_t counter1 = counter_map[index].counterRegister;
+            if (type > UNCORE)
             {
                 if(haveLock)
                 {
-                    perfmon_threadData[thread_id].counters[i].counterData =
-                        msr_read(cpu_id, westmereEX_counter_map[i].counterRegister);
-
-                    VERBOSEPRINTREG(cpu_id, westmereEX_counter_map[i].counterRegister,
-                            LLU_CAST perfmon_threadData[thread_id].counters[i].counterData, READ_UNCORE);
+                    CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                    WEX_CHECK_UNCORE_OVERFLOW(counter_map[index].type, getCounterTypeOffset(index));
+                    VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_UNCORE);
                 }
             }
-            else
+            else if (type == FIXED)
             {
-                perfmon_threadData[thread_id].counters[i].counterData =
-                    msr_read(cpu_id, westmereEX_counter_map[i].counterRegister);
-
-                VERBOSEPRINTREG(cpu_id, westmereEX_counter_map[i].counterRegister,
-                        LLU_CAST perfmon_threadData[thread_id].counters[i].counterData, READ_CORE);
+                CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                WEX_CHECK_OVERFLOW(PMC, index+32); /* fixed counters are checked through box_map[PMC] at bit index+32 */
+                VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_FIXED);
+            }
+            else if (type == PMC)
+            {
+                CHECK_MSR_READ_ERROR(HPMread(cpu_id, MSR_DEV, counter1, &counter_result));
+                WEX_CHECK_OVERFLOW(PMC, index-cpuid_info.perf_num_fixed_ctr);
+                VERBOSEPRINTREG(cpu_id, counter1, LLU_CAST counter_result, READ_PMC);
             }
+            eventSet->events[i].threadCounter[thread_id].counterData = field64(counter_result, 0, box_map[type].regWidth); /* presumably keeps regWidth bits starting at bit 0 - confirm against field64 */
         }
     }
 
-#if 0
-    flags = msr_read(cpu_id,MSR_PERF_GLOBAL_STATUS);
-    printf ("Status: 0x%llX \n", LLU_CAST flags);
-    if((flags & 0x3) || (flags & (0x3ULL<<32)) ) 
+    wex_uncore_unfreeze(cpu_id, eventSet, FREEZE_FLAG_ONLYFREEZE);
+    if ((eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED))) && (core_ctrl_flags != 0x0ULL))
     {
-        printf ("Overflow occured \n");
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, core_ctrl_flags)); /* write back the value read at the top of the function */
     }
-#endif
+    return 0;
 }
 
-void perfmon_readCountersThread_westmereEX(int thread_id)
+
+int perfmon_finalizeCountersThread_westmereEX(int thread_id, PerfmonEventSet* eventSet) /* zero the config registers of the events in eventSet, mark them uninitialized and clear the global control/overflow registers; returns 0 when the end is reached */
 {
     int haveLock = 0;
-    int cpu_id = perfmon_threadData[thread_id].processorId;
+    int haveTileLock = 0;
+    int cpu_id = groupSet->threads[thread_id].processorId;
+    uint64_t ovf_values_core = (1ULL<<63)|(1ULL<<62); /* bits 63 and 62 are always included; per-counter bits are added in the loop */
 
     if (socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id)
     {
         haveLock = 1;
     }
+    if (tile_lock[affinity_thread2tile_lookup[cpu_id]] == cpu_id)
+    {
+        haveTileLock = 1; /* gates the MSR_OFFCORE_RESP0/1 writes in the PMC case below */
+    }
 
-    for ( int i=0; i<NUM_COUNTERS_WESTMEREEX; i++ ) 
+    for (int i=0;i < eventSet->numberOfEvents;i++)
     {
-        if (perfmon_threadData[thread_id].counters[i].init == TRUE) 
+        RegisterType type = eventSet->events[i].type;
+        if (!(eventSet->regTypeMask & (REG_TYPE_MASK(type)))) /* register type not selected in regTypeMask: skip the event */
         {
-            if (westmereEX_counter_map[i].type > UNCORE)
-            {
-                if(haveLock)
+            continue;
+        }
+        RegisterIndex index = eventSet->events[i].index;
+        uint64_t reg = counter_map[index].configRegister;
+        PciDeviceIndex dev = counter_map[index].device;
+        switch (type)
+        {
+            case PMC:
+                ovf_values_core |= (1ULL<<(index-cpuid_info.perf_num_fixed_ctr));
+                if ((haveTileLock) && (eventSet->events[i].event.eventId == 0xB7)) /* event id 0xB7: zero MSR_OFFCORE_RESP0 */
                 {
-                    perfmon_threadData[thread_id].counters[i].counterData =
-                        msr_read(cpu_id, westmereEX_counter_map[i].counterRegister);
+                    VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP0, 0x0ULL, CLEAR_OFFCORE_RESP0);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP0, 0x0ULL));
                 }
-            }
-            else
-            {
-                perfmon_threadData[thread_id].counters[i].counterData =
-                    msr_read(cpu_id, westmereEX_counter_map[i].counterRegister);
-            }
+                else if ((haveTileLock) && (eventSet->events[i].event.eventId == 0xBB)) /* event id 0xBB: zero MSR_OFFCORE_RESP1 */
+                {
+                    VERBOSEPRINTREG(cpu_id, MSR_OFFCORE_RESP1, 0x0ULL, CLEAR_OFFCORE_RESP1);
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_OFFCORE_RESP1, 0x0ULL));
+                }
+                break;
+            case FIXED:
+                ovf_values_core |= (1ULL<<(index+32));
+                break;
+            default:
+                if (((haveLock) && (type > UNCORE)))
+                {
+                    CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, reg, 0x0ULL)); /* NOTE(review): the block after the switch zeroes reg again under the same condition when reg is non-zero */
+                }
+                break;
         }
+        if ((reg) && (((dev == MSR_DEV) && (type < UNCORE)) || (((haveLock) && (type > UNCORE)))))
+        {
+            VERBOSEPRINTPCIREG(cpu_id, dev, reg, 0x0ULL, CLEAR_CTL);
+            CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, dev, reg, 0x0ULL));
+        }
+        eventSet->events[i].threadCounter[thread_id].init = FALSE; /* executed for every event that passed the regTypeMask check */
     }
+    if (eventSet->regTypeMask & (REG_TYPE_MASK(PMC)|REG_TYPE_MASK(FIXED)))
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_CTRL, 0x0ULL, CLEAR_PMC_AND_FIXED_CTL);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_CTRL, 0x0ULL));
+        VERBOSEPRINTREG(cpu_id, MSR_PERF_GLOBAL_OVF_CTRL, ovf_values_core, CLEAR_PMC_AND_FIXED_OVERFLOW);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_PERF_GLOBAL_OVF_CTRL, ovf_values_core));
+    }
+    if (haveLock && (eventSet->regTypeMask & ~(0xF))) /* any regTypeMask bit above the low four is set */
+    {
+        VERBOSEPRINTREG(cpu_id, MSR_U_PMON_GLOBAL_CTRL, 0x0ULL, CLEAR_UNCORE_CTL);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_CTRL, 0x0ULL));
+        VERBOSEPRINTREG(cpu_id, MSR_U_PMON_GLOBAL_OVF_CTRL, 0x0ULL, CLEAR_UNCORE_OVERFLOW);
+        CHECK_MSR_WRITE_ERROR(HPMwrite(cpu_id, MSR_DEV, MSR_U_PMON_GLOBAL_OVF_CTRL, 0x0ULL));
+    }
+    return 0;
 }
-
diff --git a/src/includes/perfmon_westmereEX_counters.h b/src/includes/perfmon_westmereEX_counters.h
index fd65746..af384e0 100644
--- a/src/includes/perfmon_westmereEX_counters.h
+++ b/src/includes/perfmon_westmereEX_counters.h
@@ -5,13 +5,14 @@
  *
  *      Description: Counter Header File of perfmon module for Westmere EX.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -29,125 +30,170 @@
  */
 
 #define NUM_COUNTERS_CORE_WESTMEREEX 7
-#define NUM_COUNTERS_UNCORE_WESTMEREEX 107
-#define NUM_COUNTERS_WESTMEREEX 107
+#define NUM_COUNTERS_UNCORE_WESTMEREEX 117
+#define NUM_COUNTERS_WESTMEREEX 117
 
-static PerfmonCounterMap westmereEX_counter_map[NUM_COUNTERS_WESTMEREEX] = {
+#define WEX_VALID_OPTIONS_FIXED (EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_ANYTHREAD_MASK) /* used by the FIXC* entries */
+#define WEX_VALID_OPTIONS_PMC (EVENT_OPTION_EDGE_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_COUNT_KERNEL_MASK|EVENT_OPTION_THRESHOLD_MASK) /* used by the PMC* entries */
+/* Masks for the uncore boxes follow; every WEX_VALID_OPTIONS_* list is parenthesized so it expands as a single operand next to &, ~ or ==. */
+#define WEX_VALID_OPTIONS_MBOX (EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK)
+#define WEX_VALID_OPTIONS_BBOX (EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK)
+#define WEX_VALID_OPTIONS_CBOX (EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_EDGE_MASK)
+#define WEX_VALID_OPTIONS_SBOX (EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_EDGE_MASK|EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK)
+#define WEX_VALID_OPTIONS_WBOX (EVENT_OPTION_THRESHOLD_MASK|EVENT_OPTION_INVERT_MASK|EVENT_OPTION_EDGE_MASK)
+/* One entry per Westmere EX counter: name string, PMC index, unit type, event-select/control register, counter register; the last field is the EVENT_OPTION_* mask for that counter (field meanings inferred from the initializers -- confirm against the RegisterMap definition). */
+static RegisterMap westmereEX_counter_map[NUM_COUNTERS_WESTMEREEX] = {
     /* Fixed Counters: instructions retired, cycles unhalted core */
-    {"FIXC0", PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0},
-    {"FIXC1", PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0},
-    {"FIXC2", PMC2, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR2, 0, 0},
+    {"FIXC0", PMC0, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR0, 0, 0, WEX_VALID_OPTIONS_FIXED},
+    {"FIXC1", PMC1, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR1, 0, 0, WEX_VALID_OPTIONS_FIXED},
+    {"FIXC2", PMC2, FIXED, MSR_PERF_FIXED_CTR_CTRL, MSR_PERF_FIXED_CTR2, 0, 0, WEX_VALID_OPTIONS_FIXED},
     /* PMC Counters: 4 48bit wide */
-    {"PMC0", PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0},
-    {"PMC1", PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0},
-    {"PMC2", PMC5, PMC, MSR_PERFEVTSEL2, MSR_PMC2, 0, 0},
-    {"PMC3", PMC6, PMC, MSR_PERFEVTSEL3, MSR_PMC3, 0, 0},
+    {"PMC0", PMC3, PMC, MSR_PERFEVTSEL0, MSR_PMC0, 0, 0, WEX_VALID_OPTIONS_PMC},
+    {"PMC1", PMC4, PMC, MSR_PERFEVTSEL1, MSR_PMC1, 0, 0, WEX_VALID_OPTIONS_PMC},
+    {"PMC2", PMC5, PMC, MSR_PERFEVTSEL2, MSR_PMC2, 0, 0, WEX_VALID_OPTIONS_PMC},
+    {"PMC3", PMC6, PMC, MSR_PERFEVTSEL3, MSR_PMC3, 0, 0, WEX_VALID_OPTIONS_PMC},
     /* MBOX */
-    {"MBOX0C0",PMC7, MBOX0, MSR_M0_PMON_EVNT_SEL0, MSR_M0_PMON_CTR0, 0, 0},
-    {"MBOX0C1",PMC8, MBOX0, MSR_M0_PMON_EVNT_SEL1, MSR_M0_PMON_CTR1, 0, 0},
-    {"MBOX0C2",PMC9, MBOX0, MSR_M0_PMON_EVNT_SEL2, MSR_M0_PMON_CTR2, 0, 0},
-    {"MBOX0C3",PMC10, MBOX0, MSR_M0_PMON_EVNT_SEL3, MSR_M0_PMON_CTR3, 0, 0},
-    {"MBOX0C4",PMC11, MBOX0, MSR_M0_PMON_EVNT_SEL4, MSR_M0_PMON_CTR4, 0, 0},
-    {"MBOX0C5",PMC12, MBOX0, MSR_M0_PMON_EVNT_SEL5, MSR_M0_PMON_CTR5, 0, 0},
-    {"MBOX1C0",PMC13, MBOX1, MSR_M1_PMON_EVNT_SEL0, MSR_M1_PMON_CTR0, 0, 0},
-    {"MBOX1C1",PMC14, MBOX1, MSR_M1_PMON_EVNT_SEL1, MSR_M1_PMON_CTR1, 0, 0},
-    {"MBOX1C2",PMC15, MBOX1, MSR_M1_PMON_EVNT_SEL2, MSR_M1_PMON_CTR2, 0, 0},
-    {"MBOX1C3",PMC16, MBOX1, MSR_M1_PMON_EVNT_SEL3, MSR_M1_PMON_CTR3, 0, 0},
-    {"MBOX1C4",PMC17, MBOX1, MSR_M1_PMON_EVNT_SEL4, MSR_M1_PMON_CTR4, 0, 0},
-    {"MBOX1C5",PMC18, MBOX1, MSR_M1_PMON_EVNT_SEL5, MSR_M1_PMON_CTR5, 0, 0},
+    {"MBOX0C0",PMC7, MBOX0, MSR_M0_PMON_EVNT_SEL0, MSR_M0_PMON_CTR0, 0, 0, WEX_VALID_OPTIONS_MBOX},
+    {"MBOX0C1",PMC8, MBOX0, MSR_M0_PMON_EVNT_SEL1, MSR_M0_PMON_CTR1, 0, 0, WEX_VALID_OPTIONS_MBOX},
+    {"MBOX0C2",PMC9, MBOX0, MSR_M0_PMON_EVNT_SEL2, MSR_M0_PMON_CTR2, 0, 0, WEX_VALID_OPTIONS_MBOX},
+    {"MBOX0C3",PMC10, MBOX0, MSR_M0_PMON_EVNT_SEL3, MSR_M0_PMON_CTR3, 0, 0, WEX_VALID_OPTIONS_MBOX},
+    {"MBOX0C4",PMC11, MBOX0, MSR_M0_PMON_EVNT_SEL4, MSR_M0_PMON_CTR4, 0, 0, WEX_VALID_OPTIONS_MBOX},
+    {"MBOX0C5",PMC12, MBOX0, MSR_M0_PMON_EVNT_SEL5, MSR_M0_PMON_CTR5, 0, 0, WEX_VALID_OPTIONS_MBOX},
+    {"MBOX1C0",PMC13, MBOX1, MSR_M1_PMON_EVNT_SEL0, MSR_M1_PMON_CTR0, 0, 0, WEX_VALID_OPTIONS_MBOX},
+    {"MBOX1C1",PMC14, MBOX1, MSR_M1_PMON_EVNT_SEL1, MSR_M1_PMON_CTR1, 0, 0, WEX_VALID_OPTIONS_MBOX},
+    {"MBOX1C2",PMC15, MBOX1, MSR_M1_PMON_EVNT_SEL2, MSR_M1_PMON_CTR2, 0, 0, WEX_VALID_OPTIONS_MBOX},
+    {"MBOX1C3",PMC16, MBOX1, MSR_M1_PMON_EVNT_SEL3, MSR_M1_PMON_CTR3, 0, 0, WEX_VALID_OPTIONS_MBOX},
+    {"MBOX1C4",PMC17, MBOX1, MSR_M1_PMON_EVNT_SEL4, MSR_M1_PMON_CTR4, 0, 0, WEX_VALID_OPTIONS_MBOX},
+    {"MBOX1C5",PMC18, MBOX1, MSR_M1_PMON_EVNT_SEL5, MSR_M1_PMON_CTR5, 0, 0, WEX_VALID_OPTIONS_MBOX},
     /* BBOX */
-    {"BBOX0C0",PMC19, BBOX0, MSR_B0_PMON_EVNT_SEL0, MSR_B0_PMON_CTR0, 0, 0},
-    {"BBOX0C1",PMC20, BBOX0, MSR_B0_PMON_EVNT_SEL1, MSR_B0_PMON_CTR1, 0, 0},
-    {"BBOX0C2",PMC21, BBOX0, MSR_B0_PMON_EVNT_SEL2, MSR_B0_PMON_CTR2, 0, 0},
-    {"BBOX0C3",PMC22, BBOX0, MSR_B0_PMON_EVNT_SEL3, MSR_B0_PMON_CTR3, 0, 0},
-    {"BBOX1C0",PMC23, BBOX1, MSR_B1_PMON_EVNT_SEL0, MSR_B1_PMON_CTR0, 0, 0},
-    {"BBOX1C1",PMC24, BBOX1, MSR_B1_PMON_EVNT_SEL1, MSR_B1_PMON_CTR1, 0, 0},
-    {"BBOX1C2",PMC25, BBOX1, MSR_B1_PMON_EVNT_SEL2, MSR_B1_PMON_CTR2, 0, 0},
-    {"BBOX1C3",PMC26, BBOX1, MSR_B1_PMON_EVNT_SEL3, MSR_B1_PMON_CTR3, 0, 0},
+    {"BBOX0C0",PMC19, BBOX0, MSR_B0_PMON_EVNT_SEL0, MSR_B0_PMON_CTR0, 0, 0, WEX_VALID_OPTIONS_BBOX},
+    {"BBOX0C1",PMC20, BBOX0, MSR_B0_PMON_EVNT_SEL1, MSR_B0_PMON_CTR1, 0, 0, WEX_VALID_OPTIONS_BBOX},
+    {"BBOX0C2",PMC21, BBOX0, MSR_B0_PMON_EVNT_SEL2, MSR_B0_PMON_CTR2, 0, 0, WEX_VALID_OPTIONS_BBOX},
+    {"BBOX0C3",PMC22, BBOX0, MSR_B0_PMON_EVNT_SEL3, MSR_B0_PMON_CTR3, 0, 0, WEX_VALID_OPTIONS_BBOX},
+    {"BBOX1C0",PMC23, BBOX1, MSR_B1_PMON_EVNT_SEL0, MSR_B1_PMON_CTR0, 0, 0, WEX_VALID_OPTIONS_BBOX},
+    {"BBOX1C1",PMC24, BBOX1, MSR_B1_PMON_EVNT_SEL1, MSR_B1_PMON_CTR1, 0, 0, WEX_VALID_OPTIONS_BBOX},
+    {"BBOX1C2",PMC25, BBOX1, MSR_B1_PMON_EVNT_SEL2, MSR_B1_PMON_CTR2, 0, 0, WEX_VALID_OPTIONS_BBOX},
+    {"BBOX1C3",PMC26, BBOX1, MSR_B1_PMON_EVNT_SEL3, MSR_B1_PMON_CTR3, 0, 0, WEX_VALID_OPTIONS_BBOX},
     /* RBOX */
-    {"RBOX0C0",PMC27, RBOX0, MSR_R0_PMON_EVNT_SEL0, MSR_R0_PMON_CTR0, 0, 0},
-    {"RBOX0C1",PMC28, RBOX0, MSR_R0_PMON_EVNT_SEL1, MSR_R0_PMON_CTR1, 0, 0},
-    {"RBOX0C2",PMC29, RBOX0, MSR_R0_PMON_EVNT_SEL2, MSR_R0_PMON_CTR2, 0, 0},
-    {"RBOX0C3",PMC30, RBOX0, MSR_R0_PMON_EVNT_SEL3, MSR_R0_PMON_CTR3, 0, 0},
-    {"RBOX0C4",PMC31, RBOX0, MSR_R0_PMON_EVNT_SEL4, MSR_R0_PMON_CTR4, 0, 0},
-    {"RBOX0C5",PMC32, RBOX0, MSR_R0_PMON_EVNT_SEL5, MSR_R0_PMON_CTR5, 0, 0},
-    {"RBOX0C6",PMC33, RBOX0, MSR_R0_PMON_EVNT_SEL6, MSR_R0_PMON_CTR6, 0, 0},
-    {"RBOX0C7",PMC34, RBOX0, MSR_R0_PMON_EVNT_SEL7, MSR_R0_PMON_CTR7, 0, 0},
-    {"RBOX1C0",PMC35, RBOX1, MSR_R1_PMON_EVNT_SEL8, MSR_R1_PMON_CTR8, 0, 0},
-    {"RBOX1C1",PMC36, RBOX1, MSR_R1_PMON_EVNT_SEL9, MSR_R1_PMON_CTR9, 0, 0},
-    {"RBOX1C2",PMC37, RBOX1, MSR_R1_PMON_EVNT_SEL10, MSR_R1_PMON_CTR10, 0, 0},
-    {"RBOX1C3",PMC38, RBOX1, MSR_R1_PMON_EVNT_SEL11, MSR_R1_PMON_CTR11, 0, 0},
-    {"RBOX1C4",PMC39, RBOX1, MSR_R1_PMON_EVNT_SEL12, MSR_R1_PMON_CTR12, 0, 0},
-    {"RBOX1C5",PMC40, RBOX1, MSR_R1_PMON_EVNT_SEL13, MSR_R1_PMON_CTR13, 0, 0},
-    {"RBOX1C6",PMC41, RBOX1, MSR_R1_PMON_EVNT_SEL14, MSR_R1_PMON_CTR14, 0, 0},
-    {"RBOX1C7",PMC42, RBOX1, MSR_R1_PMON_EVNT_SEL15, MSR_R1_PMON_CTR15, 0, 0},
+    {"RBOX0C0",PMC27, RBOX0, MSR_R0_PMON_EVNT_SEL0, MSR_R0_PMON_CTR0, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"RBOX0C1",PMC28, RBOX0, MSR_R0_PMON_EVNT_SEL1, MSR_R0_PMON_CTR1, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"RBOX0C2",PMC29, RBOX0, MSR_R0_PMON_EVNT_SEL2, MSR_R0_PMON_CTR2, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"RBOX0C3",PMC30, RBOX0, MSR_R0_PMON_EVNT_SEL3, MSR_R0_PMON_CTR3, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"RBOX0C4",PMC31, RBOX0, MSR_R0_PMON_EVNT_SEL4, MSR_R0_PMON_CTR4, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"RBOX0C5",PMC32, RBOX0, MSR_R0_PMON_EVNT_SEL5, MSR_R0_PMON_CTR5, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"RBOX0C6",PMC33, RBOX0, MSR_R0_PMON_EVNT_SEL6, MSR_R0_PMON_CTR6, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"RBOX0C7",PMC34, RBOX0, MSR_R0_PMON_EVNT_SEL7, MSR_R0_PMON_CTR7, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"RBOX1C0",PMC35, RBOX1, MSR_R1_PMON_EVNT_SEL8, MSR_R1_PMON_CTR8, 0, 0, EVENT_OPTION_NONE_MASK}, /* RBOX1C0..C7 use the R1 registers numbered 8..15 */
+    {"RBOX1C1",PMC36, RBOX1, MSR_R1_PMON_EVNT_SEL9, MSR_R1_PMON_CTR9, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"RBOX1C2",PMC37, RBOX1, MSR_R1_PMON_EVNT_SEL10, MSR_R1_PMON_CTR10, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"RBOX1C3",PMC38, RBOX1, MSR_R1_PMON_EVNT_SEL11, MSR_R1_PMON_CTR11, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"RBOX1C4",PMC39, RBOX1, MSR_R1_PMON_EVNT_SEL12, MSR_R1_PMON_CTR12, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"RBOX1C5",PMC40, RBOX1, MSR_R1_PMON_EVNT_SEL13, MSR_R1_PMON_CTR13, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"RBOX1C6",PMC41, RBOX1, MSR_R1_PMON_EVNT_SEL14, MSR_R1_PMON_CTR14, 0, 0, EVENT_OPTION_NONE_MASK},
+    {"RBOX1C7",PMC42, RBOX1, MSR_R1_PMON_EVNT_SEL15, MSR_R1_PMON_CTR15, 0, 0, EVENT_OPTION_NONE_MASK},
     /* WBOX */
-    {"WBOX0",PMC43, WBOX, MSR_W_PMON_EVNT_SEL0, MSR_W_PMON_CTR0, 0, 0},
-    {"WBOX1",PMC44, WBOX, MSR_W_PMON_EVNT_SEL1, MSR_W_PMON_CTR1, 0, 0},
-    {"WBOX2",PMC45, WBOX, MSR_W_PMON_EVNT_SEL2, MSR_W_PMON_CTR2, 0, 0},
-    {"WBOX3",PMC46, WBOX, MSR_W_PMON_EVNT_SEL3, MSR_W_PMON_CTR3, 0, 0},
-    {"WBOX4",PMC47, WBOX, MSR_W_PMON_FIXED_CTR_CTL, MSR_W_PMON_FIXED_CTR, 0, 0},
+    {"WBOX0",PMC43, WBOX, MSR_W_PMON_EVNT_SEL0, MSR_W_PMON_CTR0, 0, 0, WEX_VALID_OPTIONS_WBOX},
+    {"WBOX1",PMC44, WBOX, MSR_W_PMON_EVNT_SEL1, MSR_W_PMON_CTR1, 0, 0, WEX_VALID_OPTIONS_WBOX},
+    {"WBOX2",PMC45, WBOX, MSR_W_PMON_EVNT_SEL2, MSR_W_PMON_CTR2, 0, 0, WEX_VALID_OPTIONS_WBOX},
+    {"WBOX3",PMC46, WBOX, MSR_W_PMON_EVNT_SEL3, MSR_W_PMON_CTR3, 0, 0, WEX_VALID_OPTIONS_WBOX},
+    {"WBOXFIX",PMC47, WBOX0FIX, MSR_W_PMON_FIXED_CTR_CTL, MSR_W_PMON_FIXED_CTR, 0, 0, EVENT_OPTION_NONE_MASK}, /* renamed from "WBOX4" and moved from unit WBOX to WBOX0FIX */
     /* UBOX */
-    {"UBOX0",PMC48, UBOX, MSR_U_PMON_GLOBAL_EVNT_SEL, MSR_U_PMON_GLOBAL_CTR, 0, 0},
+    {"UBOX0",PMC48, UBOX, MSR_U_PMON_GLOBAL_EVNT_SEL, MSR_U_PMON_GLOBAL_CTR, 0, 0, EVENT_OPTION_EDGE_MASK},
     /* CBOXes */
-    {"CBOX0C0",PMC49, CBOX0, MSR_C0_PMON_EVNT_SEL0, MSR_C0_PMON_CTR0, 0, 0},
-    {"CBOX0C1",PMC50, CBOX0, MSR_C0_PMON_EVNT_SEL1, MSR_C0_PMON_CTR1, 0, 0},
-    {"CBOX0C2",PMC51, CBOX0, MSR_C0_PMON_EVNT_SEL2, MSR_C0_PMON_CTR2, 0, 0},
-    {"CBOX0C3",PMC52, CBOX0, MSR_C0_PMON_EVNT_SEL3, MSR_C0_PMON_CTR3, 0, 0},
-    {"CBOX0C4",PMC53, CBOX0, MSR_C0_PMON_EVNT_SEL4, MSR_C0_PMON_CTR4, 0, 0},
-    {"CBOX1C0",PMC54, CBOX1, MSR_C1_PMON_EVNT_SEL0, MSR_C1_PMON_CTR0, 0, 0},
-    {"CBOX1C1",PMC55, CBOX1, MSR_C1_PMON_EVNT_SEL1, MSR_C1_PMON_CTR1, 0, 0},
-    {"CBOX1C2",PMC56, CBOX1, MSR_C1_PMON_EVNT_SEL2, MSR_C1_PMON_CTR2, 0, 0},
-    {"CBOX1C3",PMC57, CBOX1, MSR_C1_PMON_EVNT_SEL3, MSR_C1_PMON_CTR3, 0, 0},
-    {"CBOX1C4",PMC58, CBOX1, MSR_C1_PMON_EVNT_SEL4, MSR_C1_PMON_CTR4, 0, 0},
-    {"CBOX2C0",PMC59, CBOX2, MSR_C2_PMON_EVNT_SEL0, MSR_C2_PMON_CTR0, 0, 0},
-    {"CBOX2C1",PMC60, CBOX2, MSR_C2_PMON_EVNT_SEL1, MSR_C2_PMON_CTR1, 0, 0},
-    {"CBOX2C2",PMC61, CBOX2, MSR_C2_PMON_EVNT_SEL2, MSR_C2_PMON_CTR2, 0, 0},
-    {"CBOX2C3",PMC62, CBOX2, MSR_C2_PMON_EVNT_SEL3, MSR_C2_PMON_CTR3, 0, 0},
-    {"CBOX2C4",PMC63, CBOX2, MSR_C2_PMON_EVNT_SEL4, MSR_C2_PMON_CTR4, 0, 0},
-    {"CBOX3C0",PMC64, CBOX3, MSR_C3_PMON_EVNT_SEL0, MSR_C3_PMON_CTR0, 0, 0},
-    {"CBOX3C1",PMC65, CBOX3, MSR_C3_PMON_EVNT_SEL1, MSR_C3_PMON_CTR1, 0, 0},
-    {"CBOX3C2",PMC66, CBOX3, MSR_C3_PMON_EVNT_SEL2, MSR_C3_PMON_CTR2, 0, 0},
-    {"CBOX3C3",PMC67, CBOX3, MSR_C3_PMON_EVNT_SEL3, MSR_C3_PMON_CTR3, 0, 0},
-    {"CBOX3C4",PMC68, CBOX3, MSR_C3_PMON_EVNT_SEL4, MSR_C3_PMON_CTR4, 0, 0},
-    {"CBOX4C0",PMC69, CBOX4, MSR_C4_PMON_EVNT_SEL0, MSR_C4_PMON_CTR0, 0, 0},
-    {"CBOX4C1",PMC70, CBOX4, MSR_C4_PMON_EVNT_SEL1, MSR_C4_PMON_CTR1, 0, 0},
-    {"CBOX4C2",PMC71, CBOX4, MSR_C4_PMON_EVNT_SEL2, MSR_C4_PMON_CTR2, 0, 0},
-    {"CBOX4C3",PMC72, CBOX4, MSR_C4_PMON_EVNT_SEL3, MSR_C4_PMON_CTR3, 0, 0},
-    {"CBOX4C4",PMC73, CBOX4, MSR_C4_PMON_EVNT_SEL4, MSR_C4_PMON_CTR4, 0, 0},
-    {"CBOX5C0",PMC74, CBOX5, MSR_C5_PMON_EVNT_SEL0, MSR_C5_PMON_CTR0, 0, 0},
-    {"CBOX5C1",PMC75, CBOX5, MSR_C5_PMON_EVNT_SEL1, MSR_C5_PMON_CTR1, 0, 0},
-    {"CBOX5C2",PMC76, CBOX5, MSR_C5_PMON_EVNT_SEL2, MSR_C5_PMON_CTR2, 0, 0},
-    {"CBOX5C3",PMC77, CBOX5, MSR_C5_PMON_EVNT_SEL3, MSR_C5_PMON_CTR3, 0, 0},
-    {"CBOX5C4",PMC78, CBOX5, MSR_C5_PMON_EVNT_SEL4, MSR_C5_PMON_CTR4, 0, 0},
-    {"CBOX6C0",PMC79, CBOX6, MSR_C6_PMON_EVNT_SEL0, MSR_C6_PMON_CTR0, 0, 0},
-    {"CBOX6C1",PMC80, CBOX6, MSR_C6_PMON_EVNT_SEL1, MSR_C6_PMON_CTR1, 0, 0},
-    {"CBOX6C2",PMC81, CBOX6, MSR_C6_PMON_EVNT_SEL2, MSR_C6_PMON_CTR2, 0, 0},
-    {"CBOX6C3",PMC82, CBOX6, MSR_C6_PMON_EVNT_SEL3, MSR_C6_PMON_CTR3, 0, 0},
-    {"CBOX6C4",PMC83, CBOX6, MSR_C6_PMON_EVNT_SEL4, MSR_C6_PMON_CTR4, 0, 0},
-    {"CBOX7C0",PMC84, CBOX7, MSR_C7_PMON_EVNT_SEL0, MSR_C7_PMON_CTR0, 0, 0},
-    {"CBOX7C1",PMC85, CBOX7, MSR_C7_PMON_EVNT_SEL1, MSR_C7_PMON_CTR1, 0, 0},
-    {"CBOX7C2",PMC86, CBOX7, MSR_C7_PMON_EVNT_SEL2, MSR_C7_PMON_CTR2, 0, 0},
-    {"CBOX7C3",PMC87, CBOX7, MSR_C7_PMON_EVNT_SEL3, MSR_C7_PMON_CTR3, 0, 0},
-    {"CBOX7C4",PMC88, CBOX7, MSR_C7_PMON_EVNT_SEL4, MSR_C7_PMON_CTR4, 0, 0},
-    {"CBOX8C0",PMC89, CBOX8, MSR_C8_PMON_EVNT_SEL0, MSR_C8_PMON_CTR0, 0, 0},
-    {"CBOX8C1",PMC90, CBOX8, MSR_C8_PMON_EVNT_SEL1, MSR_C8_PMON_CTR1, 0, 0},
-    {"CBOX8C2",PMC91, CBOX8, MSR_C8_PMON_EVNT_SEL2, MSR_C8_PMON_CTR2, 0, 0},
-    {"CBOX8C3",PMC92, CBOX8, MSR_C8_PMON_EVNT_SEL3, MSR_C8_PMON_CTR3, 0, 0},
-    {"CBOX8C4",PMC93, CBOX8, MSR_C8_PMON_EVNT_SEL4, MSR_C8_PMON_CTR4, 0, 0},
-    {"CBOX9C0",PMC94, CBOX9, MSR_C9_PMON_EVNT_SEL0, MSR_C9_PMON_CTR0, 0, 0},
-    {"CBOX9C1",PMC95, CBOX9, MSR_C9_PMON_EVNT_SEL1, MSR_C9_PMON_CTR1, 0, 0},
-    {"CBOX9C2",PMC96, CBOX9, MSR_C9_PMON_EVNT_SEL2, MSR_C9_PMON_CTR2, 0, 0},
-    {"CBOX9C3",PMC97, CBOX9, MSR_C9_PMON_EVNT_SEL3, MSR_C9_PMON_CTR3, 0, 0},
-    {"CBOX9C4",PMC98, CBOX9, MSR_C9_PMON_EVNT_SEL4, MSR_C9_PMON_CTR4, 0, 0},
+    {"CBOX0C0",PMC49, CBOX0, MSR_C0_PMON_EVNT_SEL0, MSR_C0_PMON_CTR0, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX0C1",PMC50, CBOX0, MSR_C0_PMON_EVNT_SEL1, MSR_C0_PMON_CTR1, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX0C2",PMC51, CBOX0, MSR_C0_PMON_EVNT_SEL2, MSR_C0_PMON_CTR2, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX0C3",PMC52, CBOX0, MSR_C0_PMON_EVNT_SEL3, MSR_C0_PMON_CTR3, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX0C4",PMC53, CBOX0, MSR_C0_PMON_EVNT_SEL4, MSR_C0_PMON_CTR4, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX0C5",PMC54, CBOX0, MSR_C0_PMON_EVNT_SEL5, MSR_C0_PMON_CTR5, 0, 0, WEX_VALID_OPTIONS_CBOX}, /* sixth counter per CBOX is new in this change; every later PMC index shifts accordingly */
+    {"CBOX1C0",PMC55, CBOX1, MSR_C1_PMON_EVNT_SEL0, MSR_C1_PMON_CTR0, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX1C1",PMC56, CBOX1, MSR_C1_PMON_EVNT_SEL1, MSR_C1_PMON_CTR1, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX1C2",PMC57, CBOX1, MSR_C1_PMON_EVNT_SEL2, MSR_C1_PMON_CTR2, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX1C3",PMC58, CBOX1, MSR_C1_PMON_EVNT_SEL3, MSR_C1_PMON_CTR3, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX1C4",PMC59, CBOX1, MSR_C1_PMON_EVNT_SEL4, MSR_C1_PMON_CTR4, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX1C5",PMC60, CBOX1, MSR_C1_PMON_EVNT_SEL5, MSR_C1_PMON_CTR5, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX2C0",PMC61, CBOX2, MSR_C2_PMON_EVNT_SEL0, MSR_C2_PMON_CTR0, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX2C1",PMC62, CBOX2, MSR_C2_PMON_EVNT_SEL1, MSR_C2_PMON_CTR1, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX2C2",PMC63, CBOX2, MSR_C2_PMON_EVNT_SEL2, MSR_C2_PMON_CTR2, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX2C3",PMC64, CBOX2, MSR_C2_PMON_EVNT_SEL3, MSR_C2_PMON_CTR3, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX2C4",PMC65, CBOX2, MSR_C2_PMON_EVNT_SEL4, MSR_C2_PMON_CTR4, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX2C5",PMC66, CBOX2, MSR_C2_PMON_EVNT_SEL5, MSR_C2_PMON_CTR5, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX3C0",PMC67, CBOX3, MSR_C3_PMON_EVNT_SEL0, MSR_C3_PMON_CTR0, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX3C1",PMC68, CBOX3, MSR_C3_PMON_EVNT_SEL1, MSR_C3_PMON_CTR1, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX3C2",PMC69, CBOX3, MSR_C3_PMON_EVNT_SEL2, MSR_C3_PMON_CTR2, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX3C3",PMC70, CBOX3, MSR_C3_PMON_EVNT_SEL3, MSR_C3_PMON_CTR3, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX3C4",PMC71, CBOX3, MSR_C3_PMON_EVNT_SEL4, MSR_C3_PMON_CTR4, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX3C5",PMC72, CBOX3, MSR_C3_PMON_EVNT_SEL5, MSR_C3_PMON_CTR5, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX4C0",PMC73, CBOX4, MSR_C4_PMON_EVNT_SEL0, MSR_C4_PMON_CTR0, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX4C1",PMC74, CBOX4, MSR_C4_PMON_EVNT_SEL1, MSR_C4_PMON_CTR1, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX4C2",PMC75, CBOX4, MSR_C4_PMON_EVNT_SEL2, MSR_C4_PMON_CTR2, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX4C3",PMC76, CBOX4, MSR_C4_PMON_EVNT_SEL3, MSR_C4_PMON_CTR3, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX4C4",PMC77, CBOX4, MSR_C4_PMON_EVNT_SEL4, MSR_C4_PMON_CTR4, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX4C5",PMC78, CBOX4, MSR_C4_PMON_EVNT_SEL5, MSR_C4_PMON_CTR5, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX5C0",PMC79, CBOX5, MSR_C5_PMON_EVNT_SEL0, MSR_C5_PMON_CTR0, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX5C1",PMC80, CBOX5, MSR_C5_PMON_EVNT_SEL1, MSR_C5_PMON_CTR1, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX5C2",PMC81, CBOX5, MSR_C5_PMON_EVNT_SEL2, MSR_C5_PMON_CTR2, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX5C3",PMC82, CBOX5, MSR_C5_PMON_EVNT_SEL3, MSR_C5_PMON_CTR3, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX5C4",PMC83, CBOX5, MSR_C5_PMON_EVNT_SEL4, MSR_C5_PMON_CTR4, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX5C5",PMC84, CBOX5, MSR_C5_PMON_EVNT_SEL5, MSR_C5_PMON_CTR5, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX6C0",PMC85, CBOX6, MSR_C6_PMON_EVNT_SEL0, MSR_C6_PMON_CTR0, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX6C1",PMC86, CBOX6, MSR_C6_PMON_EVNT_SEL1, MSR_C6_PMON_CTR1, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX6C2",PMC87, CBOX6, MSR_C6_PMON_EVNT_SEL2, MSR_C6_PMON_CTR2, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX6C3",PMC88, CBOX6, MSR_C6_PMON_EVNT_SEL3, MSR_C6_PMON_CTR3, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX6C4",PMC89, CBOX6, MSR_C6_PMON_EVNT_SEL4, MSR_C6_PMON_CTR4, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX6C5",PMC90, CBOX6, MSR_C6_PMON_EVNT_SEL5, MSR_C6_PMON_CTR5, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX7C0",PMC91, CBOX7, MSR_C7_PMON_EVNT_SEL0, MSR_C7_PMON_CTR0, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX7C1",PMC92, CBOX7, MSR_C7_PMON_EVNT_SEL1, MSR_C7_PMON_CTR1, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX7C2",PMC93, CBOX7, MSR_C7_PMON_EVNT_SEL2, MSR_C7_PMON_CTR2, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX7C3",PMC94, CBOX7, MSR_C7_PMON_EVNT_SEL3, MSR_C7_PMON_CTR3, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX7C4",PMC95, CBOX7, MSR_C7_PMON_EVNT_SEL4, MSR_C7_PMON_CTR4, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX7C5",PMC96, CBOX7, MSR_C7_PMON_EVNT_SEL5, MSR_C7_PMON_CTR5, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX8C0",PMC97, CBOX8, MSR_C8_PMON_EVNT_SEL0, MSR_C8_PMON_CTR0, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX8C1",PMC98, CBOX8, MSR_C8_PMON_EVNT_SEL1, MSR_C8_PMON_CTR1, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX8C2",PMC99, CBOX8, MSR_C8_PMON_EVNT_SEL2, MSR_C8_PMON_CTR2, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX8C3",PMC100, CBOX8, MSR_C8_PMON_EVNT_SEL3, MSR_C8_PMON_CTR3, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX8C4",PMC101, CBOX8, MSR_C8_PMON_EVNT_SEL4, MSR_C8_PMON_CTR4, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX8C5",PMC102, CBOX8, MSR_C8_PMON_EVNT_SEL5, MSR_C8_PMON_CTR5, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX9C0",PMC103, CBOX9, MSR_C9_PMON_EVNT_SEL0, MSR_C9_PMON_CTR0, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX9C1",PMC104, CBOX9, MSR_C9_PMON_EVNT_SEL1, MSR_C9_PMON_CTR1, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX9C2",PMC105, CBOX9, MSR_C9_PMON_EVNT_SEL2, MSR_C9_PMON_CTR2, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX9C3",PMC106, CBOX9, MSR_C9_PMON_EVNT_SEL3, MSR_C9_PMON_CTR3, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX9C4",PMC107, CBOX9, MSR_C9_PMON_EVNT_SEL4, MSR_C9_PMON_CTR4, 0, 0, WEX_VALID_OPTIONS_CBOX},
+    {"CBOX9C5",PMC108, CBOX9, MSR_C9_PMON_EVNT_SEL5, MSR_C9_PMON_CTR5, 0, 0, WEX_VALID_OPTIONS_CBOX},
     /* SBOXes */
-    {"SBOX0C0",PMC99 , SBOX0, MSR_S0_PMON_EVNT_SEL0, MSR_S0_PMON_CTR0, 0, 0},
-    {"SBOX0C1",PMC100, SBOX0, MSR_S0_PMON_EVNT_SEL1, MSR_S0_PMON_CTR1, 0, 0},
-    {"SBOX0C2",PMC101, SBOX0, MSR_S0_PMON_EVNT_SEL2, MSR_S0_PMON_CTR2, 0, 0},
-    {"SBOX0C3",PMC102, SBOX0, MSR_S0_PMON_EVNT_SEL3, MSR_S0_PMON_CTR3, 0, 0},
-    {"SBOX1C0",PMC103, SBOX1, MSR_S1_PMON_EVNT_SEL0, MSR_S1_PMON_CTR0, 0, 0},
-    {"SBOX1C1",PMC104, SBOX1, MSR_S1_PMON_EVNT_SEL1, MSR_S1_PMON_CTR1, 0, 0},
-    {"SBOX1C2",PMC105, SBOX1, MSR_S1_PMON_EVNT_SEL2, MSR_S1_PMON_CTR2, 0, 0},
-    {"SBOX1C3",PMC106, SBOX1, MSR_S1_PMON_EVNT_SEL3, MSR_S1_PMON_CTR3, 0, 0}
+    {"SBOX0C0",PMC109 , SBOX0, MSR_S0_PMON_EVNT_SEL0, MSR_S0_PMON_CTR0, 0, 0, WEX_VALID_OPTIONS_SBOX},
+    {"SBOX0C1",PMC110, SBOX0, MSR_S0_PMON_EVNT_SEL1, MSR_S0_PMON_CTR1, 0, 0, WEX_VALID_OPTIONS_SBOX},
+    {"SBOX0C2",PMC111, SBOX0, MSR_S0_PMON_EVNT_SEL2, MSR_S0_PMON_CTR2, 0, 0, WEX_VALID_OPTIONS_SBOX},
+    {"SBOX0C3",PMC112, SBOX0, MSR_S0_PMON_EVNT_SEL3, MSR_S0_PMON_CTR3, 0, 0, WEX_VALID_OPTIONS_SBOX},
+    {"SBOX1C0",PMC113, SBOX1, MSR_S1_PMON_EVNT_SEL0, MSR_S1_PMON_CTR0, 0, 0, WEX_VALID_OPTIONS_SBOX},
+    {"SBOX1C1",PMC114, SBOX1, MSR_S1_PMON_EVNT_SEL1, MSR_S1_PMON_CTR1, 0, 0, WEX_VALID_OPTIONS_SBOX},
+    {"SBOX1C2",PMC115, SBOX1, MSR_S1_PMON_EVNT_SEL2, MSR_S1_PMON_CTR2, 0, 0, WEX_VALID_OPTIONS_SBOX},
+    {"SBOX1C3",PMC116, SBOX1, MSR_S1_PMON_EVNT_SEL3, MSR_S1_PMON_CTR3, 0, 0, WEX_VALID_OPTIONS_SBOX}
 };
 
+/* Per-unit register table indexed by unit type; each entry starts with that unit's *_CTRL, *_STATUS and *_OVF_CTRL registers. The 48 is presumably the counter width in bits and the optional trailing pair the match/mask filter registers -- TODO confirm against the BoxMap definition. */
+static BoxMap westmereEX_box_map[NUM_UNITS] = {
+    [PMC] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, 0, 0, 0, 48},
+    [FIXED] = {MSR_PERF_GLOBAL_CTRL, MSR_PERF_GLOBAL_STATUS, MSR_PERF_GLOBAL_OVF_CTRL, 0, 0, 0, 48},
+    [MBOX0] = {MSR_M0_PMON_BOX_CTRL, MSR_M0_PMON_BOX_STATUS, MSR_M0_PMON_BOX_OVF_CTRL, 0, 0, 0, 48, MSR_M0_PMON_ADDR_MATCH, MSR_M0_PMON_ADDR_MASK},
+    [MBOX1] = {MSR_M1_PMON_BOX_CTRL, MSR_M1_PMON_BOX_STATUS, MSR_M1_PMON_BOX_OVF_CTRL, 0, 0, 0, 48, MSR_M1_PMON_ADDR_MATCH, MSR_M1_PMON_ADDR_MASK},
+    [BBOX0] = {MSR_B0_PMON_BOX_CTRL, MSR_B0_PMON_BOX_STATUS, MSR_B0_PMON_BOX_OVF_CTRL, 0, 0, 0, 48, MSR_B0_PMON_MATCH,MSR_B0_PMON_MASK},
+    [BBOX1] = {MSR_B1_PMON_BOX_CTRL, MSR_B1_PMON_BOX_STATUS, MSR_B1_PMON_BOX_OVF_CTRL, 0, 0, 0, 48, MSR_B1_PMON_MATCH,MSR_B1_PMON_MASK},
+    [RBOX0] = {MSR_R0_PMON_BOX_CTRL, MSR_R0_PMON_BOX_STATUS, MSR_R0_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+    [RBOX1] = {MSR_R1_PMON_BOX_CTRL, MSR_R1_PMON_BOX_STATUS, MSR_R1_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+    [SBOX0] = {MSR_S0_PMON_BOX_CTRL, MSR_S0_PMON_BOX_STATUS, MSR_S0_PMON_BOX_OVF_CTRL, 0, 0, 0, 48, MSR_S0_PMON_MATCH, MSR_S0_PMON_MASK},
+    [SBOX1] = {MSR_S1_PMON_BOX_CTRL, MSR_S1_PMON_BOX_STATUS, MSR_S1_PMON_BOX_OVF_CTRL, 0, 0, 0, 48, MSR_S1_PMON_MATCH, MSR_S1_PMON_MASK},
+    [CBOX0] = {MSR_C0_PMON_BOX_CTRL, MSR_C0_PMON_BOX_STATUS, MSR_C0_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+    [CBOX1] = {MSR_C1_PMON_BOX_CTRL, MSR_C1_PMON_BOX_STATUS, MSR_C1_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+    [CBOX2] = {MSR_C2_PMON_BOX_CTRL, MSR_C2_PMON_BOX_STATUS, MSR_C2_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+    [CBOX3] = {MSR_C3_PMON_BOX_CTRL, MSR_C3_PMON_BOX_STATUS, MSR_C3_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+    [CBOX4] = {MSR_C4_PMON_BOX_CTRL, MSR_C4_PMON_BOX_STATUS, MSR_C4_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+    [CBOX5] = {MSR_C5_PMON_BOX_CTRL, MSR_C5_PMON_BOX_STATUS, MSR_C5_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+    [CBOX6] = {MSR_C6_PMON_BOX_CTRL, MSR_C6_PMON_BOX_STATUS, MSR_C6_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+    [CBOX7] = {MSR_C7_PMON_BOX_CTRL, MSR_C7_PMON_BOX_STATUS, MSR_C7_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+    [CBOX8] = {MSR_C8_PMON_BOX_CTRL, MSR_C8_PMON_BOX_STATUS, MSR_C8_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+    [CBOX9] = {MSR_C9_PMON_BOX_CTRL, MSR_C9_PMON_BOX_STATUS, MSR_C9_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+    [WBOX] = {MSR_W_PMON_BOX_CTRL, MSR_W_PMON_BOX_STATUS, MSR_W_PMON_BOX_OVF_CTRL, 0, 0, 0, 48},
+    [WBOX0FIX] = {MSR_W_PMON_BOX_CTRL, MSR_W_PMON_BOX_STATUS, MSR_W_PMON_BOX_OVF_CTRL, 0, 0, 0, 48}, /* shares the WBOX control/status/overflow registers */
+    [UBOX] = {MSR_U_PMON_GLOBAL_CTRL, MSR_U_PMON_GLOBAL_STATUS, MSR_U_PMON_GLOBAL_OVF_CTRL, 0, 0, 0, 48},
+};
diff --git a/src/includes/perfmon_westmereEX_events.txt b/src/includes/perfmon_westmereEX_events.txt
index 2aabf8d..81efc6b 100644
--- a/src/includes/perfmon_westmereEX_events.txt
+++ b/src/includes/perfmon_westmereEX_events.txt
@@ -1,16 +1,17 @@
 # =======================================================================================
-#  
+#
 #      Filename:  perfmon_westmereEX_events.txt
-# 
+#
 #      Description:  Event list for Intel WestmereEX
-# 
-#      Version:   3.1.3
-#      Released:  4.11.2014
-# 
-#      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+#
+#      Version:   4.0
+#      Released:  16.6.2015
+#
+#      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+#                Thomas Roehl (tr), thomas.roehl at googlemail.com
 #      Project:  likwid
 #
-#      Copyright (C) 2014 Jan Treibig
+#      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
 #
 #      This program is free software: you can redistribute it and/or modify it under
 #      the terms of the GNU General Public License as published by the Free Software
@@ -68,16 +69,16 @@ EVENT_MEM_STORE_RETIRED_DTLB        0x0C  PMC
 UMASK_MEM_STORE_RETIRED_DTLB_MISS   0x01
 
 EVENT_UOPS_ISSUED                0x0E   PMC
-UMASK_UOPS_ISSUED_ANY            0x01 
+UMASK_UOPS_ISSUED_ANY            0x01
 UMASK_UOPS_ISSUED_STALLED_CYCLES 0x01 0xC1  0x01
-UMASK_UOPS_ISSUED_FUSED          0x02 
+UMASK_UOPS_ISSUED_FUSED          0x02
 
 EVENT_MEM_UNCORE_RETIRED         0x0F    PMC
-UMASK_MEM_UNCORE_RETIRED_LOCAL_HITM                        0x02 
-UMASK_MEM_UNCORE_RETIRED_LOCAL_DRAM_AND_REMOTE_CACHE_HIT   0x08 
-UMASK_MEM_UNCORE_RETIRED_LOCAL_DRAM                        0x10 
-UMASK_MEM_UNCORE_RETIRED_REMOTE_DRAM                       0x20 
-UMASK_MEM_UNCORE_RETIRED_UNCACHEABLE                       0x80 
+UMASK_MEM_UNCORE_RETIRED_LOCAL_HITM                        0x02
+UMASK_MEM_UNCORE_RETIRED_LOCAL_DRAM_AND_REMOTE_CACHE_HIT   0x08
+UMASK_MEM_UNCORE_RETIRED_LOCAL_DRAM                        0x10
+UMASK_MEM_UNCORE_RETIRED_REMOTE_DRAM                       0x20
+UMASK_MEM_UNCORE_RETIRED_UNCACHEABLE                       0x80
 
 EVENT_FP_COMP_OPS_EXE            0x10   PMC
 UMASK_FP_COMP_OPS_EXE_X87        0x01
@@ -253,10 +254,10 @@ UMASK_BR_INST_EXEC_INDIRECT_NON_CALL     0x04
 UMASK_BR_INST_EXEC_NON_CALLS             0x07
 UMASK_BR_INST_EXEC_RETURN_NEAR           0x08
 UMASK_BR_INST_EXEC_DIRECT_NEAR_CALL      0x10
-UMASK_BR_INST_EXEC_INDIRECT_NEAR_CALL    0x20 
-UMASK_BR_INST_EXEC_NEAR_CALLS            0x30 
-UMASK_BR_INST_EXEC_TAKEN                 0x40 
-UMASK_BR_INST_EXEC_ANY                   0x7F 
+UMASK_BR_INST_EXEC_INDIRECT_NEAR_CALL    0x20
+UMASK_BR_INST_EXEC_NEAR_CALLS            0x30
+UMASK_BR_INST_EXEC_TAKEN                 0x40
+UMASK_BR_INST_EXEC_ANY                   0x7F
 
 EVENT_BR_MISP_EXEC                    0x89   PMC
 UMASK_BR_MISP_EXEC_COND               0x01
@@ -473,8 +474,66 @@ UMASK_SIMD_INT_64_PACKED_LOGICAL        0x10
 UMASK_SIMD_INT_64_PACKED_ARITH          0x20
 UMASK_SIMD_INT_64_SHUFFLE_MOVE          0x40
 
-EVENT_UNCORE_CYCLES                  0xFF  WBOX4
-UMASK_UNCORE_CYCLES                  0x00
+EVENT_OFFCORE_RESPONSE_0                           0xB7 PMC
+OPTIONS_OFFCORE_RESPONSE_0_OPTIONS                 EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_0_OPTIONS                   0x01 0xFF 0xFF
+OPTIONS_OFFCORE_RESPONSE_0_LOCAL_DRAM              EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+DEFAULT_OPTIONS_OFFCORE_RESPONSE_0_LOCAL_DRAM      EVENT_OPTION_MATCH0=0xFF,EVENT_OPTION_MATCH1=0x40
+UMASK_OFFCORE_RESPONSE_0_LOCAL_DRAM                0x01 0xFF 0xFF
+OPTIONS_OFFCORE_RESPONSE_0_LOCAL_CACHE             EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+DEFAULT_OPTIONS_OFFCORE_RESPONSE_0_LOCAL_CACHE     EVENT_OPTION_MATCH0=0xFF,EVENT_OPTION_MATCH1=0x7
+UMASK_OFFCORE_RESPONSE_0_LOCAL_CACHE               0x01 0xFF 0xFF
+OPTIONS_OFFCORE_RESPONSE_0_REMOTE_DRAM             EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+DEFAULT_OPTIONS_OFFCORE_RESPONSE_0_REMOTE_DRAM     EVENT_OPTION_MATCH0=0xFF,EVENT_OPTION_MATCH1=0x20
+UMASK_OFFCORE_RESPONSE_0_REMOTE_DRAM               0x01 0xFF 0xFF
+OPTIONS_OFFCORE_RESPONSE_0_REMOTE_CACHE            EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+DEFAULT_OPTIONS_OFFCORE_RESPONSE_0_REMOTE_CACHE    EVENT_OPTION_MATCH0=0xFF,EVENT_OPTION_MATCH1=0x18
+UMASK_OFFCORE_RESPONSE_0_REMOTE_CACHE              0x01 0xFF 0xFF
+UMASK_OFFCORE_RESPONSE_0_DMND_DATA_RD_ANY          0x01 0x00 0x10
+UMASK_OFFCORE_RESPONSE_0_DMND_RFO_ANY              0x01 0x01 0x10
+UMASK_OFFCORE_RESPONSE_0_DMND_CODE_RD_ANY          0x01 0x02 0x10
+UMASK_OFFCORE_RESPONSE_0_WB_ANY                    0x01 0x03 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_DATA_RD_ANY         0x01 0x04 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_RFO_ANY             0x01 0x05 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L2_CODE_RD_ANY         0x01 0x06 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_DATA_RD_ANY         0x01 0x07 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_RFO_ANY             0x01 0x08 0x10
+UMASK_OFFCORE_RESPONSE_0_PF_L3_CODE_RD_ANY         0x01 0x09 0x10
+UMASK_OFFCORE_RESPONSE_0_BUS_LOCKS_ANY             0x01 0x0A 0x10
+UMASK_OFFCORE_RESPONSE_0_STREAMING_STORES_ANY      0x01 0x0B 0x10
+UMASK_OFFCORE_RESPONSE_0_OTHER_ANY                 0x01 0x0F 0x10
+
+EVENT_OFFCORE_RESPONSE_1                           0xBB PMC
+OPTIONS_OFFCORE_RESPONSE_1_OPTIONS                 EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_1_OPTIONS                   0x01 0xFF 0xFF
+OPTIONS_OFFCORE_RESPONSE_1_LOCAL_DRAM              EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+DEFAULT_OPTIONS_OFFCORE_RESPONSE_1_LOCAL_DRAM      EVENT_OPTION_MATCH0=0xFF,EVENT_OPTION_MATCH1=0x40
+UMASK_OFFCORE_RESPONSE_1_LOCAL_DRAM                0x01 0xFF 0xFF
+OPTIONS_OFFCORE_RESPONSE_1_LOCAL_CACHE             EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+DEFAULT_OPTIONS_OFFCORE_RESPONSE_1_LOCAL_CACHE     EVENT_OPTION_MATCH0=0xFF,EVENT_OPTION_MATCH1=0x7
+UMASK_OFFCORE_RESPONSE_1_LOCAL_CACHE               0x01 0xFF 0xFF
+OPTIONS_OFFCORE_RESPONSE_1_REMOTE_DRAM             EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+DEFAULT_OPTIONS_OFFCORE_RESPONSE_1_REMOTE_DRAM     EVENT_OPTION_MATCH0=0xFF,EVENT_OPTION_MATCH1=0x20
+UMASK_OFFCORE_RESPONSE_1_REMOTE_DRAM               0x01 0xFF 0xFF
+OPTIONS_OFFCORE_RESPONSE_1_REMOTE_CACHE            EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+DEFAULT_OPTIONS_OFFCORE_RESPONSE_1_REMOTE_CACHE    EVENT_OPTION_MATCH0=0xFF,EVENT_OPTION_MATCH1=0x18
+UMASK_OFFCORE_RESPONSE_1_REMOTE_CACHE              0x01 0xFF 0xFF
+UMASK_OFFCORE_RESPONSE_1_DMND_DATA_RD_ANY          0x01 0x00 0x10
+UMASK_OFFCORE_RESPONSE_1_DMND_RFO_ANY              0x01 0x01 0x10
+UMASK_OFFCORE_RESPONSE_1_DMND_CODE_RD_ANY          0x01 0x02 0x10
+UMASK_OFFCORE_RESPONSE_1_WB_ANY                    0x01 0x03 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_DATA_RD_ANY         0x01 0x04 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_RFO_ANY             0x01 0x05 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L2_CODE_RD_ANY         0x01 0x06 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_DATA_RD_ANY         0x01 0x07 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_RFO_ANY             0x01 0x08 0x10
+UMASK_OFFCORE_RESPONSE_1_PF_L3_CODE_RD_ANY         0x01 0x09 0x10
+UMASK_OFFCORE_RESPONSE_1_BUS_LOCKS_ANY             0x01 0x0A 0x10
+UMASK_OFFCORE_RESPONSE_1_STREAMING_STORES_ANY      0x01 0x0B 0x10
+UMASK_OFFCORE_RESPONSE_1_OTHER_ANY                 0x01 0x0F 0x10
+
+EVENT_UNCORE_CLOCKTICKS                  0xFF  WBOXFIX
+UMASK_UNCORE_CLOCKTICKS                  0x00
 
 EVENT_C_CYCLES_TURBO                  0x04  WBOX
 UMASK_C_CYCLES_TURBO_C0               0x01
@@ -488,26 +547,26 @@ UMASK_C_CYCLES_TURBO_C7               0x80
 UMASK_C_CYCLES_TURBO_C_ALL            0xFF
 
 EVENT_C_C0_THROTTLE_DIE              0x01  WBOX
-UMASK_C_C0_THROTTLE_DIE_C0               0x01              
-UMASK_C_C0_THROTTLE_DIE_C1               0x02              
-UMASK_C_C0_THROTTLE_DIE_C2               0x04              
-UMASK_C_C0_THROTTLE_DIE_C3               0x08              
-UMASK_C_C0_THROTTLE_DIE_C4               0x10              
-UMASK_C_C0_THROTTLE_DIE_C5               0x20              
-UMASK_C_C0_THROTTLE_DIE_C6               0x40              
-UMASK_C_C0_THROTTLE_DIE_C7               0x80              
-UMASK_C_C0_THROTTLE_DIE_C_ALL            0xFF              
+UMASK_C_C0_THROTTLE_DIE_C0               0x01
+UMASK_C_C0_THROTTLE_DIE_C1               0x02
+UMASK_C_C0_THROTTLE_DIE_C2               0x04
+UMASK_C_C0_THROTTLE_DIE_C3               0x08
+UMASK_C_C0_THROTTLE_DIE_C4               0x10
+UMASK_C_C0_THROTTLE_DIE_C5               0x20
+UMASK_C_C0_THROTTLE_DIE_C6               0x40
+UMASK_C_C0_THROTTLE_DIE_C7               0x80
+UMASK_C_C0_THROTTLE_DIE_C_ALL            0xFF
 
 EVENT_C_C0_THROTTLE_PROCHOT          0x03  WBOX
-UMASK_C_C0_THROTTLE_PROCHOT_C0               0x01          
-UMASK_C_C0_THROTTLE_PROCHOT_C1               0x02          
-UMASK_C_C0_THROTTLE_PROCHOT_C2               0x04          
-UMASK_C_C0_THROTTLE_PROCHOT_C3               0x08          
-UMASK_C_C0_THROTTLE_PROCHOT_C4               0x10          
-UMASK_C_C0_THROTTLE_PROCHOT_C5               0x20          
-UMASK_C_C0_THROTTLE_PROCHOT_C6               0x40          
-UMASK_C_C0_THROTTLE_PROCHOT_C7               0x80          
-UMASK_C_C0_THROTTLE_PROCHOT_C_ALL            0xFF          
+UMASK_C_C0_THROTTLE_PROCHOT_C0               0x01
+UMASK_C_C0_THROTTLE_PROCHOT_C1               0x02
+UMASK_C_C0_THROTTLE_PROCHOT_C2               0x04
+UMASK_C_C0_THROTTLE_PROCHOT_C3               0x08
+UMASK_C_C0_THROTTLE_PROCHOT_C4               0x10
+UMASK_C_C0_THROTTLE_PROCHOT_C5               0x20
+UMASK_C_C0_THROTTLE_PROCHOT_C6               0x40
+UMASK_C_C0_THROTTLE_PROCHOT_C7               0x80
+UMASK_C_C0_THROTTLE_PROCHOT_C_ALL            0xFF
 
 EVENT_C_C0_THROTTLE_TMP              0x00  WBOX
 UMASK_C_C0_THROTTLE_TMP_C0               0x01
@@ -559,8 +618,8 @@ UMASK_BCMD_SCHEDQ_OCCUPANCY_F2B       0x06 0x01 0x00
 UMASK_BCMD_SCHEDQ_OCCUPANCY_SPRWR     0x07 0x01 0x00
 UMASK_BCMD_SCHEDQ_OCCUPANCY_ALL       0x08 0x01 0x00
 
-EVENT_BBOX_CYCLES                  0x1B  MBOX
-UMASK_BBOX_CYCLES                  0xFF
+EVENT_MBOX_CLOCKTICKS                  0x1B  MBOX0C0|MBOX1C0
+UMASK_MBOX_CLOCKTICKS                  0xFF
 
 EVENT_CYCLES_DSP_FILL                  0x00  MBOX
 UMASK_CYCLES_DSP_FILL_RDQ_FULL         0x01 0x01 0x00
@@ -588,34 +647,35 @@ UMASK_CYCLES_SCHED_MODE_WRPRIO           0x02 0x01 0x00
 UMASK_CYCLES_SCHED_MODE_ADAPTIVE         0x03 0x01 0x00
 
 EVENT_DRAM_CMD                              0x0A  MBOX
+OPTIONS_DRAM_CMD_ALL                        EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
 UMASK_DRAM_CMD_ALL                          0x00 0x02 0x00
-UMASK_DRAM_CMD_ILLEGAL                      0x01 0x02 0x00
+UMASK_DRAM_CMD_ILLEGAL                      0x00 0x02 0x00
 UMASK_DRAM_CMD_PREALL                       0x01 0x02 0x00
-UMASK_DRAM_CMD_PREALL_TRDOFF                0x01 0x02 0x00
-UMASK_DRAM_CMD_PREALL_RDPRIO                0x01 0x02 0x01
-UMASK_DRAM_CMD_PREALL_WRPRIO                0x01 0x02 0x02
-UMASK_DRAM_CMD_PREALL_ADAPTIVE              0x01 0x02 0x02
+UMASK_DRAM_CMD_PREALL_TRDOFF                0x01 0x02 0x10
+UMASK_DRAM_CMD_PREALL_RDPRIO                0x01 0x02 0x11
+UMASK_DRAM_CMD_PREALL_WRPRIO                0x01 0x02 0x12
+UMASK_DRAM_CMD_PREALL_ADAPTIVE              0x01 0x02 0x13
 UMASK_DRAM_CMD_RAS                          0x02 0x02 0x00
-UMASK_DRAM_CMD_RAS_TRDOFF                   0x02 0x02 0x00
-UMASK_DRAM_CMD_RAS_RDPRIO                   0x02 0x02 0x01
-UMASK_DRAM_CMD_RAS_WRPRIO                   0x02 0x02 0x02
-UMASK_DRAM_CMD_RAS_ADAPTIVE                 0x02 0x02 0x02
-UMASK_DRAM_CMD_CAS_RD_OPN                   0x02 0x02 0x00
-UMASK_DRAM_CMD_CAS_WR_OPN                   0x02 0x02 0x00
-UMASK_DRAM_CMD_CAS_WR_OPN_TRDOFF            0x02 0x02 0x00
-UMASK_DRAM_CMD_CAS_WR_OPN_RDPRIO            0x04 0x02 0x01
-UMASK_DRAM_CMD_CAS_WR_OPN_WRPRIO            0x04 0x02 0x02
-UMASK_DRAM_CMD_CAS_WR_OPN_ADAPTIVE          0x04 0x02 0x03
+UMASK_DRAM_CMD_RAS_TRDOFF                   0x02 0x02 0x10
+UMASK_DRAM_CMD_RAS_RDPRIO                   0x02 0x02 0x11
+UMASK_DRAM_CMD_RAS_WRPRIO                   0x02 0x02 0x12
+UMASK_DRAM_CMD_RAS_ADAPTIVE                 0x02 0x02 0x13
+UMASK_DRAM_CMD_CAS_RD_OPN                   0x03 0x02 0x00
+UMASK_DRAM_CMD_CAS_WR_OPN                   0x04 0x02 0x00
+UMASK_DRAM_CMD_CAS_WR_OPN_TRDOFF            0x04 0x02 0x10
+UMASK_DRAM_CMD_CAS_WR_OPN_RDPRIO            0x04 0x02 0x11
+UMASK_DRAM_CMD_CAS_WR_OPN_WRPRIO            0x04 0x02 0x12
+UMASK_DRAM_CMD_CAS_WR_OPN_ADAPTIVE          0x04 0x02 0x13
 UMASK_DRAM_CMD_CAS_RD_CLS                   0x05 0x02 0x00
-UMASK_DRAM_CMD_CAS_RD_CLS_TRDOFF            0x05 0x02 0x00
-UMASK_DRAM_CMD_CAS_RD_CLS_RDPRIO            0x05 0x02 0x01
-UMASK_DRAM_CMD_CAS_RD_CLS_WRPRIO            0x05 0x02 0x02
-UMASK_DRAM_CMD_CAS_RD_CLS_ADAPTIVE          0x05 0x02 0x03
+UMASK_DRAM_CMD_CAS_RD_CLS_TRDOFF            0x05 0x02 0x10
+UMASK_DRAM_CMD_CAS_RD_CLS_RDPRIO            0x05 0x02 0x11
+UMASK_DRAM_CMD_CAS_RD_CLS_WRPRIO            0x05 0x02 0x12
+UMASK_DRAM_CMD_CAS_RD_CLS_ADAPTIVE          0x05 0x02 0x13
 UMASK_DRAM_CMD_CAS_WR_CLS                   0x06 0x02 0x00
-UMASK_DRAM_CMD_CAS_WR_CLS_TRDOFF            0x06 0x02 0x00
-UMASK_DRAM_CMD_CAS_WR_CLS_RDPRIO            0x06 0x02 0x01
-UMASK_DRAM_CMD_CAS_WR_CLS_WRPRIO            0x06 0x02 0x02
-UMASK_DRAM_CMD_CAS_WR_CLS_ADAPTIVE          0x06 0x02 0x03
+UMASK_DRAM_CMD_CAS_WR_CLS_TRDOFF            0x06 0x02 0x10
+UMASK_DRAM_CMD_CAS_WR_CLS_RDPRIO            0x06 0x02 0x11
+UMASK_DRAM_CMD_CAS_WR_CLS_WRPRIO            0x06 0x02 0x12
+UMASK_DRAM_CMD_CAS_WR_CLS_ADAPTIVE          0x06 0x02 0x13
 UMASK_DRAM_CMD_MRS                          0x07 0x02 0x00
 UMASK_DRAM_CMD_RFR                          0x09 0x02 0x00
 UMASK_DRAM_CMD_ENSR                         0x0A 0x02 0x00
@@ -647,7 +707,6 @@ UMASK_DRAM_MISC_RETRIES_ALL              0x00 0x04 0x03
 UMASK_DRAM_MISC_RETRIES_FVID             0x01 0x04 0x03
 UMASK_DRAM_MISC_VALID                    0x01 0x04 0x02
 UMASK_DRAM_MISC_NON_NOP_TRKL             0x01 0x04 0x01
-
 UMASK_DRAM_MISC_ILLEGAL                  0x00 0x04 0x00
 UMASK_DRAM_MISC_PREALL                   0x01 0x04 0x00
 UMASK_DRAM_MISC_RAS                      0x02 0x04 0x00
@@ -704,12 +763,12 @@ UMASK_FVC_EV1_FAST_RESET              0x04 0x07 0x00
 UMASK_FVC_EV1_BBOX_CMDS_READS         0x05 0x07 0x00
 UMASK_FVC_EV1_BBOX_CMDS_WRITES        0x05 0x07 0x01
 UMASK_FVC_EV1_BBOX_RSP_ACK            0x06 0x07 0x00
-UMASK_FVC_EV1_BBOX_RSP_RETRY          0x06 0x07 0x10
-UMASK_FVC_EV1_BBOX_RSP_COR            0x06 0x07 0x20
-UMASK_FVC_EV1_BBOX_RSP_UNCOR          0x06 0x07 0x30
-UMASK_FVC_EV1_BBOX_RSP_SPEC_ACK       0x06 0x07 0x40
-UMASK_FVC_EV1_BBOX_RSP_SPR_ACK        0x06 0x07 0x50
-UMASK_FVC_EV1_BBOX_RSP_SPR_UNCORE     0x06 0x07 0x70
+UMASK_FVC_EV1_BBOX_RSP_RETRY          0x06 0x07 0x01
+UMASK_FVC_EV1_BBOX_RSP_COR            0x06 0x07 0x02
+UMASK_FVC_EV1_BBOX_RSP_UNCOR          0x06 0x07 0x03
+UMASK_FVC_EV1_BBOX_RSP_SPEC_ACK       0x06 0x07 0x04
+UMASK_FVC_EV1_BBOX_RSP_SPR_ACK        0x06 0x07 0x05
+UMASK_FVC_EV1_BBOX_RSP_SPR_UNCORE     0x06 0x07 0x07
 UMASK_FVC_EV1_SMI_NB_TRIG             0x07 0x07 0x00
 
 EVENT_FVC_EV2                         0x0F  MBOX
@@ -721,30 +780,30 @@ UMASK_FVC_EV2_FAST_RESET              0x04 0x08 0x00
 UMASK_FVC_EV2_BBOX_CMDS_READS         0x05 0x08 0x00
 UMASK_FVC_EV2_BBOX_CMDS_WRITES        0x05 0x08 0x01
 UMASK_FVC_EV2_BBOX_RSP_ACK            0x06 0x08 0x00
-UMASK_FVC_EV2_BBOX_RSP_RETRY          0x06 0x08 0x10
-UMASK_FVC_EV2_BBOX_RSP_COR            0x06 0x08 0x20
-UMASK_FVC_EV2_BBOX_RSP_UNCOR          0x06 0x08 0x30
-UMASK_FVC_EV2_BBOX_RSP_SPEC_ACK       0x06 0x08 0x40
-UMASK_FVC_EV2_BBOX_RSP_SPR_ACK        0x06 0x08 0x50
-UMASK_FVC_EV2_BBOX_RSP_SPR_UNCORE     0x06 0x08 0x70
+UMASK_FVC_EV2_BBOX_RSP_RETRY          0x06 0x08 0x01
+UMASK_FVC_EV2_BBOX_RSP_COR            0x06 0x08 0x02
+UMASK_FVC_EV2_BBOX_RSP_UNCOR          0x06 0x08 0x03
+UMASK_FVC_EV2_BBOX_RSP_SPEC_ACK       0x06 0x08 0x04
+UMASK_FVC_EV2_BBOX_RSP_SPR_ACK        0x06 0x08 0x05
+UMASK_FVC_EV2_BBOX_RSP_SPR_UNCORE     0x06 0x08 0x07
 UMASK_FVC_EV2_SMI_NB_TRIG             0x07 0x08 0x00
 
 EVENT_FVC_EV3                         0x10  MBOX
 UMASK_FVC_EV3_SMI_CRC_ERR             0x00 0x09 0x00
-UMASK_FVC_EV3_MEM_ECC_ERR             0x00 0x09 0x00
-UMASK_FVC_EV3_POISON_TXN              0x00 0x09 0x00
-UMASK_FVC_EV3_ALERT_FRAMES            0x00 0x09 0x00
-UMASK_FVC_EV3_FAST_RESET              0x00 0x09 0x00
-UMASK_FVC_EV3_BBOX_CMDS_READS         0x00 0x09 0x00
-UMASK_FVC_EV3_BBOX_CMDS_WRITES        0x00 0x09 0x01
-UMASK_FVC_EV3_BBOX_RSP_ACK            0x00 0x09 0x00
-UMASK_FVC_EV3_BBOX_RSP_RETRY          0x00 0x09 0x10
-UMASK_FVC_EV3_BBOX_RSP_COR            0x00 0x09 0x20
-UMASK_FVC_EV3_BBOX_RSP_UNCOR          0x00 0x09 0x30
-UMASK_FVC_EV3_BBOX_RSP_SPEC_ACK       0x00 0x09 0x40
-UMASK_FVC_EV3_BBOX_RSP_SPR_ACK        0x00 0x09 0x50
-UMASK_FVC_EV3_BBOX_RSP_SPR_UNCORE     0x00 0x09 0x70
-UMASK_FVC_EV3_SMI_NB_TRIG             0x00 0x09 0x00
+UMASK_FVC_EV3_MEM_ECC_ERR             0x01 0x09 0x00
+UMASK_FVC_EV3_POISON_TXN              0x02 0x09 0x00
+UMASK_FVC_EV3_ALERT_FRAMES            0x03 0x09 0x00
+UMASK_FVC_EV3_FAST_RESET              0x04 0x09 0x00
+UMASK_FVC_EV3_BBOX_CMDS_READS         0x05 0x09 0x00
+UMASK_FVC_EV3_BBOX_CMDS_WRITES        0x05 0x09 0x01
+UMASK_FVC_EV3_BBOX_RSP_ACK            0x06 0x09 0x00
+UMASK_FVC_EV3_BBOX_RSP_RETRY          0x06 0x09 0x01
+UMASK_FVC_EV3_BBOX_RSP_COR            0x06 0x09 0x02
+UMASK_FVC_EV3_BBOX_RSP_UNCOR          0x06 0x09 0x03
+UMASK_FVC_EV3_BBOX_RSP_SPEC_ACK       0x06 0x09 0x04
+UMASK_FVC_EV3_BBOX_RSP_SPR_ACK        0x06 0x09 0x05
+UMASK_FVC_EV3_BBOX_RSP_SPR_UNCORE     0x06 0x09 0x07
+UMASK_FVC_EV3_SMI_NB_TRIG             0x07 0x09 0x00
 
 EVENT_FVID_RACE                       0x18  MBOX
 UMASK_FVID_RACE                       0x00 0x00 0x00
@@ -799,12 +858,44 @@ UMASK_THERM_TRP_DN_ALL_GT_MID_RISE    0x03 0x0D 0x00
 UMASK_THERM_TRP_DN_ALL_GT_MID_FALL    0x02 0x0D 0x00
 UMASK_THERM_TRP_DN_ALL_GT_LO          0x01 0x0D 0x00
 UMASK_THERM_TRP_DN_ALL_LT_LO          0x00 0x0D 0x00
+UMASK_THERM_TRP_DN_DIMM0_GT_MID_RISE  0x03 0x0D 0x01
+UMASK_THERM_TRP_DN_DIMM1_GT_MID_RISE  0x03 0x0D 0x02
+UMASK_THERM_TRP_DN_DIMM2_GT_MID_RISE  0x03 0x0D 0x03
+UMASK_THERM_TRP_DN_DIMM3_GT_MID_RISE  0x03 0x0D 0x04
+UMASK_THERM_TRP_DN_DIMM0_GT_MID_FALL  0x02 0x0D 0x01
+UMASK_THERM_TRP_DN_DIMM1_GT_MID_FALL  0x02 0x0D 0x02
+UMASK_THERM_TRP_DN_DIMM2_GT_MID_FALL  0x02 0x0D 0x03
+UMASK_THERM_TRP_DN_DIMM3_GT_MID_FALL  0x02 0x0D 0x04
+UMASK_THERM_TRP_DN_DIMM0_GT_LO        0x01 0x0D 0x01
+UMASK_THERM_TRP_DN_DIMM1_GT_LO        0x01 0x0D 0x02
+UMASK_THERM_TRP_DN_DIMM2_GT_LO        0x01 0x0D 0x03
+UMASK_THERM_TRP_DN_DIMM3_GT_LO        0x01 0x0D 0x04
+UMASK_THERM_TRP_DN_DIMM0_LT_LO        0x00 0x0D 0x01
+UMASK_THERM_TRP_DN_DIMM1_LT_LO        0x00 0x0D 0x02
+UMASK_THERM_TRP_DN_DIMM2_LT_LO        0x00 0x0D 0x03
+UMASK_THERM_TRP_DN_DIMM3_LT_LO        0x00 0x0D 0x04
 
 EVENT_THERM_TRP_UP                    0x04  MBOX
 UMASK_THERM_TRP_UP_ALL_GT_MID_RISE    0x03 0x0E 0x00
 UMASK_THERM_TRP_UP_ALL_GT_MID_FALL    0x02 0x0E 0x00
 UMASK_THERM_TRP_UP_ALL_GT_LO          0x01 0x0E 0x00
 UMASK_THERM_TRP_UP_ALL_LT_LO          0x00 0x0E 0x00
+UMASK_THERM_TRP_UP_DIMM0_GT_MID_RISE  0x03 0x0E 0x01
+UMASK_THERM_TRP_UP_DIMM1_GT_MID_RISE  0x03 0x0E 0x02
+UMASK_THERM_TRP_UP_DIMM2_GT_MID_RISE  0x03 0x0E 0x03
+UMASK_THERM_TRP_UP_DIMM3_GT_MID_RISE  0x03 0x0E 0x04
+UMASK_THERM_TRP_UP_DIMM0_GT_MID_FALL  0x02 0x0E 0x01
+UMASK_THERM_TRP_UP_DIMM1_GT_MID_FALL  0x02 0x0E 0x02
+UMASK_THERM_TRP_UP_DIMM2_GT_MID_FALL  0x02 0x0E 0x03
+UMASK_THERM_TRP_UP_DIMM3_GT_MID_FALL  0x02 0x0E 0x04
+UMASK_THERM_TRP_UP_DIMM0_GT_LO        0x01 0x0E 0x01
+UMASK_THERM_TRP_UP_DIMM1_GT_LO        0x01 0x0E 0x02
+UMASK_THERM_TRP_UP_DIMM2_GT_LO        0x01 0x0E 0x03
+UMASK_THERM_TRP_UP_DIMM3_GT_LO        0x01 0x0E 0x04
+UMASK_THERM_TRP_UP_DIMM0_LT_LO        0x00 0x0E 0x01
+UMASK_THERM_TRP_UP_DIMM1_LT_LO        0x00 0x0E 0x02
+UMASK_THERM_TRP_UP_DIMM2_LT_LO        0x00 0x0E 0x03
+UMASK_THERM_TRP_UP_DIMM3_LT_LO        0x00 0x0E 0x04
 
 EVENT_TRANS_CMDS                      0x12  MBOX
 UMASK_TRANS_CMDS                      0x00 0x00 0x00
@@ -813,112 +904,165 @@ EVENT_TT_CMD_CONFLICT                 0x19  MBOX
 UMASK_TT_CMD_CONFLICT                 0x00 0x00 0x00
 
 EVENT_ACK_BEFORE_LAST_SNP             0x19  BBOX0C3|BBOX1C3
-UMASK_ACK_BEFORE_LAST_SNP             0x03
-
-EVENT_ADDR_IN_MATCH             0x04  BBOX0C2|BBOX1C2
-UMASK_ADDR_IN_MATCH             0x02
+UMASK_ACK_BEFORE_LAST_SNP             0x00
 
 EVENT_CONFLICTS             0x17  BBOX0C3|BBOX1C3
-UMASK_CONFLICTS             0x03
+UMASK_CONFLICTS             0x00
 
 EVENT_COHQ_BYPASS             0x0E  BBOX0C3|BBOX1C3
-UMASK_COHQ_BYPASS             0x03
+UMASK_COHQ_BYPASS             0x00
 
 EVENT_COHQ_IMT_ALLOC_WAIT             0x0E  BBOX0C3|BBOX1C3
-UMASK_COHQ_IMT_ALLOC_WAIT             0x03
+UMASK_COHQ_IMT_ALLOC_WAIT             0x00
 
 EVENT_DIRQ_INSERTS             0x17  BBOX0C1|BBOX1C1
-UMASK_DIRQ_INSERTS             0x01
+UMASK_DIRQ_INSERTS             0x00
 
 EVENT_DIRQ_OCCUPANCY             0x17  BBOX0C0|BBOX1C0
 UMASK_DIRQ_OCCUPANCY             0x00
 
 EVENT_DEMAND_FETCH             0x0F  BBOX0C3|BBOX1C3
-UMASK_DEMAND_FETCH             0x03
+UMASK_DEMAND_FETCH             0x00
 
 EVENT_DRSQ_INSERTS             0x09  BBOX0C1|BBOX1C1
-UMASK_DRSQ_INSERTS             0x01
+UMASK_DRSQ_INSERTS             0x00
 
 EVENT_DRSQ_OCCUPANCY             0x09  BBOX0C0|BBOX1C0
 UMASK_DRSQ_OCCUPANCY             0x00
 
 EVENT_EARLY_ACK             0x02  BBOX0C3|BBOX1C3
-UMASK_EARLY_ACK             0x03
+UMASK_EARLY_ACK             0x00
 
 EVENT_IMPLICIT_WBS             0x12  BBOX0C3|BBOX1C3
-UMASK_IMPLICIT_WBS             0x03
+UMASK_IMPLICIT_WBS             0x00
 
 EVENT_IMT_FULL             0x12  BBOX0C3|BBOX1C3
-UMASK_IMT_FULL             0x03
+UMASK_IMT_FULL             0x00
 
 EVENT_IMT_INSERTS_ALL             0x07  BBOX0C1|BBOX1C1
-UMASK_IMT_INSERTS_ALL             0x01
+UMASK_IMT_INSERTS_ALL             0x00
 
 EVENT_IMT_INSERTS_INVITOE             0x0F  BBOX0C1|BBOX1C1
-UMASK_IMT_INSERTS_INVITOE             0x01
+UMASK_IMT_INSERTS_INVITOE             0x00
 
 EVENT_IMT_INSERTS_IOH             0x0A  BBOX0C1|BBOX1C1
-UMASK_IMT_INSERTS_IOH             0x01
+UMASK_IMT_INSERTS_IOH             0x00
 
 EVENT_IMT_INSERTS_IOH_INVITOE             0x10  BBOX0C1|BBOX1C1
-UMASK_IMT_INSERTS_IOH_INVITOE             0x01
+UMASK_IMT_INSERTS_IOH_INVITOE             0x00
 
 EVENT_IMT_INSERTS_IOH_WR             0x0D  BBOX0C1|BBOX1C1
-UMASK_IMT_INSERTS_IOH_WR             0x01
+UMASK_IMT_INSERTS_IOH_WR             0x00
 
 EVENT_IMT_INSERTS_NON_IOH             0x0B  BBOX0C1|BBOX1C1
-UMASK_IMT_INSERTS_NON_IOH             0x01
+UMASK_IMT_INSERTS_NON_IOH             0x00
 
 EVENT_IMT_INSERTS_NON_IOH_INVITOE             0x1C  BBOX0C1|BBOX1C1
-UMASK_IMT_INSERTS_NON_IOH_INVITOE             0x01
+UMASK_IMT_INSERTS_NON_IOH_INVITOE             0x00
 
-EVENT_INSERTS_NON_IOH_RD             0x1F  BBOX0C1|BBOX1C1
-UMASK_INSERTS_NON_IOH_RD             0x01
+EVENT_IMT_INSERTS_NON_IOH_RD             0x1F  BBOX0C1|BBOX1C1
+UMASK_IMT_INSERTS_NON_IOH_RD             0x00
 
 EVENT_IMT_INSERTS_NON_IOH_WR             0x0E  BBOX0C1|BBOX1C1
-UMASK_IMT_INSERTS_NON_IOH_WR             0x01
+UMASK_IMT_INSERTS_NON_IOH_WR             0x00
 
 EVENT_IMT_INSERTS_RD             0x1D  BBOX0C1|BBOX1C1
-UMASK_IMT_INSERTS_RD             0x01
+UMASK_IMT_INSERTS_RD             0x00
 
 EVENT_IMT_INSERTS_WR             0x0C  BBOX0C1|BBOX1C1
-UMASK_IMT_INSERTS_WR             0x01
+UMASK_IMT_INSERTS_WR             0x00
 
 EVENT_IMT_NE_CYCLES             0x07  BBOX0C2|BBOX1C2
-UMASK_IMT_NE_CYCLES             0x02
+UMASK_IMT_NE_CYCLES             0x00
 
 EVENT_IMT_PREALLOC             0x06  BBOX0C3|BBOX1C3
-UMASK_IMT_PREALLOC             0x03
+UMASK_IMT_PREALLOC             0x00
 
 EVENT_IMT_VALID_OCCUPANCY             0x07  BBOX0C0|BBOX1C0
 UMASK_IMT_VALID_OCCUPANCY             0x00
 
-EVENT_MSG_ADDR_IN_MATCH             0x01  BBOX0C0|BBOX1C0
-UMASK_MSG_ADDR_IN_MATCH             0x00
+EVENT_MSGS_B_TO_S             0x03  BBOX0C2|BBOX1C2
+UMASK_MSGS_B_TO_S             0x00
 
-EVENT_MSGS_B_TO_S             0x03  BBOX0C1|BBOX1C1
-UMASK_MSGS_B_TO_S             0x01
+EVENT_MSGS_S_TO_B             0x02  BBOX0C2|BBOX1C2
+UMASK_MSGS_S_TO_B             0x00
 
-EVENT_MSGS_B_TO_S             0x03  BBOX0C2|BBOX1C2
-UMASK_MSGS_B_TO_S             0x02
+EVENT_MSGS_IN_NON_SNP             0x01  BBOX0C2|BBOX1C2
+UMASK_MSGS_IN_NON_SNP             0x00
 
 EVENT_MSG_IN_MATCH             0x01  BBOX0C1|BBOX1C1
-UMASK_MSG_IN_MATCH             0x01
+OPTIONS_MSG_IN_MATCH           EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
+UMASK_MSG_IN_MATCH             0x00
 
-EVENT_MSGS_IN_NON_SNP             0x01  BBOX0C2|BBOX1C2
-UMASK_MSGS_IN_NON_SNP             0x02
+EVENT_MSG_ADDR_IN_MATCH             0x01  BBOX0C0|BBOX1C0
+OPTIONS_MSG_ADDR_IN_MATCH           EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
+UMASK_MSG_ADDR_IN_MATCH             0x00
 
 EVENT_MSG_OPCODE_ADDR_IN_MATCH             0x03  BBOX0C0|BBOX1C0
+OPTIONS_MSG_OPCODE_ADDR_IN_MATCH           EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
 UMASK_MSG_OPCODE_ADDR_IN_MATCH             0x00
 
 EVENT_MSG_OPCODE_IN_MATCH             0x05  BBOX0C1|BBOX1C1
-UMASK_MSG_OPCODE_IN_MATCH             0x01
+OPTIONS_MSG_OPCODE_IN_MATCH           EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
+UMASK_MSG_OPCODE_IN_MATCH             0x00
 
 EVENT_MSG_OPCODE_OUT_MATCH             0x06  BBOX0C1|BBOX1C1
-UMASK_MSG_OPCODE_OUT_MATCH             0x01
+OPTIONS_MSG_OPCODE_OUT_MATCH           EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
+UMASK_MSG_OPCODE_OUT_MATCH             0x00
 
 EVENT_MSG_OUT_MATCH             0x02  BBOX0C1|BBOX1C1
-UMASK_MSG_OUT_MATCH             0x01
+OPTIONS_MSG_OUT_MATCH           EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
+UMASK_MSG_OUT_MATCH             0x00
+
+EVENT_OPCODE_ADDR_IN_MATCH             0x02  BBOX0C0|BBOX1C0
+OPTIONS_OPCODE_ADDR_IN_MATCH           EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
+UMASK_OPCODE_ADDR_IN_MATCH             0x00
+
+EVENT_OPCODE_IN_MATCH             0x03  BBOX0C1|BBOX1C1
+OPTIONS_OPCODE_IN_MATCH           EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
+UMASK_OPCODE_IN_MATCH             0x00
+
+EVENT_OPCODE_OUT_MATCH             0x04  BBOX0C1|BBOX1C1
+OPTIONS_OPCODE_OUT_MATCH           EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
+UMASK_OPCODE_OUT_MATCH             0x00
+
+EVENT_ADDR_IN_MATCH             0x04  BBOX0C2|BBOX1C2
+OPTIONS_ADDR_IN_MATCH           EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MASK0_MASK
+UMASK_ADDR_IN_MATCH             0x00
+
+EVENT_RBOX_VNA_UNAVAIL             0x15  BBOX0C3|BBOX1C3
+UMASK_RBOX_VNA_UNAVAIL             0x00
+
+EVENT_SBOX_VN0_UNAVAIL             0x14  BBOX0C3|BBOX1C3
+UMASK_SBOX_VN0_UNAVAIL             0x00
+
+EVENT_SNPOQ_INSERTS             0x12  BBOX0C1|BBOX1C1
+UMASK_SNPOQ_INSERTS             0x00
+
+EVENT_SNPOQ_OCCUPANCY             0x12  BBOX0C0|BBOX1C0
+UMASK_SNPOQ_OCCUPANCY             0x00
+
+EVENT_TF_ALL             0x04  BBOX0C0|BBOX1C0
+UMASK_TF_ALL             0x00
+
+EVENT_TF_INVITOE             0x06  BBOX0C0|BBOX1C0
+UMASK_TF_INVITOE             0x00
+
+EVENT_TF_IOH             0x0B  BBOX0C0|BBOX1C0
+UMASK_TF_IOH             0x00
+
+EVENT_TF_IOH_INVITOE             0x0F  BBOX0C0|BBOX1C0
+UMASK_TF_IOH_INVITOE             0x00
+
+EVENT_TF_IOH_NON_INVITOE_RD             0x1C  BBOX0C0|BBOX1C0
+UMASK_TF_IOH_NON_INVITOE_RD             0x00
+
+EVENT_TF_IOH_WR             0x0D  BBOX0C0|BBOX1C0
+UMASK_TF_IOH_WR             0x00
+
+EVENT_TF_WR             0x05  BBOX0C0|BBOX1C0
+UMASK_TF_WR             0x00
+
 
 EVENT_ALLOC_TO_ARB                              0x00  RBOX0
 UMASK_ALLOC_TO_ARB_PORT0_IPERF0_NCB             0x00 0x01  0x09
@@ -3012,6 +3156,7 @@ EVENT_TRANS_VIQ                                 0x1D CBOX
 UMASK_TRANS_VIQ                                 0x00
 
 EVENT_TO_R_PROG_EV                              0x00 SBOX
+OPTIONS_TO_R_PROG_EV                            EVENT_OPTION_MASK0_MASK|EVENT_OPTION_MATCH0_MASK
 UMASK_TO_R_PROG_EV                              0x00
 
 EVENT_TO_R_B_HOM_MSGQ_CYCLES_FULL               0x03 SBOX
diff --git a/src/includes/perfmon_westmere_events.txt b/src/includes/perfmon_westmere_events.txt
index 3c3e66f..53d45f1 100644
--- a/src/includes/perfmon_westmere_events.txt
+++ b/src/includes/perfmon_westmere_events.txt
@@ -1,16 +1,17 @@
 # =======================================================================================
-#  
+#
 #      Filename:  perfmon_westmere_events.txt
-# 
+#
 #      Description:  Event list for Intel Westmere
-# 
-#      Version:   3.1.3
-#      Released:  4.11.2014
-# 
-#      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+#
+#      Version:   4.0
+#      Released:  16.6.2015
+#
+#      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+#                Thomas Roehl (tr), thomas.roehl at googlemail.com
 #      Project:  likwid
 #
-#      Copyright (C) 2014 Jan Treibig
+#      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
 #
 #      This program is free software: you can redistribute it and/or modify it under
 #      the terms of the GNU General Public License as published by the Free Software
@@ -473,6 +474,17 @@ UMASK_SIMD_INT_64_PACKED_LOGICAL        0x10
 UMASK_SIMD_INT_64_PACKED_ARITH          0x20
 UMASK_SIMD_INT_64_SHUFFLE_MOVE          0x40
 
+EVENT_OFFCORE_RESPONSE_0                            0xB7 PMC
+OPTIONS_OFFCORE_RESPONSE_0_OPTIONS                  EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_0_OPTIONS                    0x01 0xFF 0xFF
+
+EVENT_OFFCORE_RESPONSE_1                            0xBB PMC
+OPTIONS_OFFCORE_RESPONSE_1_OPTIONS                  EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_MATCH1_MASK
+UMASK_OFFCORE_RESPONSE_1_OPTIONS                    0x01 0xFF 0xFF
+
+EVENT_UNCORE_CLOCKTICKS                 0x00 UPMCFIX
+UMASK_UNCORE_CLOCKTICKS                 0x00
+
 EVENT_UNC_GQ_CYCLES_FULL                0x00   UPMC
 UMASK_UNC_GQ_CYCLES_FULL_READ_TRACKER         0x01
 UMASK_UNC_GQ_CYCLES_FULL_WRITE_TRACKER        0x02
@@ -678,10 +690,36 @@ UMASK_UNC_QHL_SLEEPS_IOH_CONFLICT             0x08
 UMASK_UNC_QHL_SLEEPS_REMOTE_CONFLICT          0x10
 UMASK_UNC_QHL_SLEEPS_LOCAL_CONFLICT           0x20
 
-EVENT_UNC_ADDR_OPCODE_MATCH                         0x35   UPMC
-UMASK_UNC_ADDR_OPCODE_MATCH_IOH                     0x01
-UMASK_UNC_ADDR_OPCODE_MATCH_REMOTE                  0x02
-UMASK_UNC_ADDR_OPCODE_MATCH_LOCAL                   0x04
+EVENT_UNC_ADDR_OPCODE_MATCH_AND                 0x35   UPMC
+OPTIONS_UNC_ADDR_OPCODE_MATCH_AND_IOH           EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_OPCODE_MASK
+UMASK_UNC_ADDR_OPCODE_MATCH_AND_IOH             0x01 0x02 0x0
+OPTIONS_UNC_ADDR_OPCODE_MATCH_AND_REMOTE        EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_OPCODE_MASK
+UMASK_UNC_ADDR_OPCODE_MATCH_AND_REMOTE          0x02 0x02 0x0
+OPTIONS_UNC_ADDR_OPCODE_MATCH_AND_LOCAL         EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_OPCODE_MASK
+UMASK_UNC_ADDR_OPCODE_MATCH_AND_LOCAL           0x04 0x02 0x0
+
+EVENT_UNC_ADDR_OPCODE_MATCH_OR                  0x35   UPMC
+OPTIONS_UNC_ADDR_OPCODE_MATCH_OR_IOH            EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_OPCODE_MASK
+UMASK_UNC_ADDR_OPCODE_MATCH_OR_IOH              0x01 0x0C 0x0
+OPTIONS_UNC_ADDR_OPCODE_MATCH_OR_REMOTE         EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_OPCODE_MASK
+UMASK_UNC_ADDR_OPCODE_MATCH_OR_REMOTE           0x02 0x0C 0x0
+OPTIONS_UNC_ADDR_OPCODE_MATCH_OR_LOCAL          EVENT_OPTION_MATCH0_MASK|EVENT_OPTION_OPCODE_MASK
+UMASK_UNC_ADDR_OPCODE_MATCH_OR_LOCAL            0x04 0x0C 0x0
+
+EVENT_UNC_ADDR_OPCODE_MATCH_RSPFWDS             0x35   UPMC
+UMASK_UNC_ADDR_OPCODE_MATCH_RSPFWDS_IOH         0x01 0x04 0x1A
+UMASK_UNC_ADDR_OPCODE_MATCH_RSPFWDS_REMOTE      0x02 0x04 0x1A
+UMASK_UNC_ADDR_OPCODE_MATCH_RSPFWDS_LOCAL       0x04 0x04 0x1A
+
+EVENT_UNC_ADDR_OPCODE_MATCH_RSPIWB              0x35   UPMC
+UMASK_UNC_ADDR_OPCODE_MATCH_RSPIWB_IOH          0x01 0x04 0x1D
+UMASK_UNC_ADDR_OPCODE_MATCH_RSPIWB_REMOTE       0x02 0x04 0x1D
+UMASK_UNC_ADDR_OPCODE_MATCH_RSPIWB_LOCAL        0x04 0x04 0x1D
+
+EVENT_UNC_ADDR_OPCODE_MATCH_RSPIWB              0x35   UPMC
+UMASK_UNC_ADDR_OPCODE_MATCH_RSPIWB_IOH          0x01 0x04 0x00
+UMASK_UNC_ADDR_OPCODE_MATCH_RSPIWB_REMOTE       0x02 0x04 0x00
+UMASK_UNC_ADDR_OPCODE_MATCH_RSPIWB_LOCAL        0x04 0x04 0x00
 
 EVENT_UNC_QPI_TX_STALLED_SINGLE_FLIT                0x40  UPMC
 UMASK_UNC_QPI_TX_STALLED_SINGLE_FLIT_HOME_LINK_0    0x01
@@ -789,3 +827,4 @@ UMASK_UNC_CYCLES_UNHALTED_L3_FLL_ENABLE             0x02
 
 EVENT_UNC_CYCLES_UNHALTED_L3_FLL_DISABLE            0x86  UPMC
 UMASK_UNC_CYCLES_UNHALTED_L3_FLL_DISABLE            0x01
+
diff --git a/src/includes/power.h b/src/includes/power.h
index 6cb5fd3..f66b95c 100644
--- a/src/includes/power.h
+++ b/src/includes/power.h
@@ -6,13 +6,14 @@
  *      Description:  Header File Power Module
  *                    Implements Intel RAPL Interface.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -36,45 +37,174 @@
 #include <registers.h>
 #include <bitUtil.h>
 #include <msr.h>
+#include <error.h>
+#include <access.h>
 
-extern PowerInfo power_info;
-extern  const uint32_t power_regs[4];
+const char* power_names[NUM_POWER_DOMAINS] = {"PKG", "PP0", "PP1", "DRAM"};
 
-extern void power_init(int cpuId);
-static inline void power_start(PowerData* data, int cpuId, PowerType type);
-static inline void power_stop(PowerData* data, int cpuId, PowerType type);
-static inline uint32_t power_read(int cpuId, uint64_t reg);
-static inline uint32_t power_tread(int socket_fd, int cpuId, uint64_t reg);
-static inline double power_printEnergy(PowerData* data);
+uint32_t power_regs[NUM_POWER_DOMAINS] = {MSR_PKG_ENERGY_STATUS,
+                                MSR_PP0_ENERGY_STATUS,
+                                MSR_PP1_ENERGY_STATUS,
+                                MSR_DRAM_ENERGY_STATUS};
 
-static double
+uint32_t limit_regs[NUM_POWER_DOMAINS] = {MSR_PKG_RAPL_POWER_LIMIT,
+                                MSR_PP0_RAPL_POWER_LIMIT,
+                                MSR_PP1_RAPL_POWER_LIMIT,
+                                MSR_DRAM_RAPL_POWER_LIMIT};
+
+uint32_t policy_regs[NUM_POWER_DOMAINS] = {0,
+                                MSR_PP0_ENERGY_POLICY,
+                                MSR_PP1_ENERGY_POLICY,
+                                0};
+
+uint32_t perf_regs[NUM_POWER_DOMAINS] = {MSR_PKG_PERF_STATUS,
+                                MSR_PP0_PERF_STATUS,
+                                0,
+                                MSR_DRAM_PERF_STATUS};
+
+uint32_t info_regs[NUM_POWER_DOMAINS] = {MSR_PKG_POWER_INFO,
+                                0,
+                                0,
+                                MSR_DRAM_POWER_INFO};
+
+
+double
 power_printEnergy(PowerData* data)
 {
-    return  (double) ((data->after - data->before) * power_info.energyUnit);
+    return  (double) ((data->after - data->before) * power_info.domains[data->domain].energyUnit);
 }
 
-static void
+int
 power_start(PowerData* data, int cpuId, PowerType type)
 {
-    data->before = extractBitField(msr_read(cpuId, power_regs[type]),32,0);
+    if (power_info.hasRAPL)
+    {
+        if (power_info.domains[type].supportFlags & POWER_DOMAIN_SUPPORT_STATUS)
+        {
+            uint64_t result = 0;
+            data->before = 0;
+            CHECK_MSR_READ_ERROR(HPMread(cpuId, MSR_DEV, power_regs[type], &result))
+            data->before = extractBitField(result,32,0);
+            data->domain = type;
+            return 0;
+        }
+        else
+        {
+            DEBUG_PRINT(DEBUGLEV_DEVELOP, RAPL domain %s not supported, power_names[type]);
+            return -EFAULT;
+        }
+    }
+    else
+    {
+        DEBUG_PLAIN_PRINT(DEBUGLEV_DEVELOP, No RAPL support);
+        return -EIO;
+    }
 }
 
-static void
+int
 power_stop(PowerData* data, int cpuId, PowerType type)
 {
-    data->after = extractBitField(msr_read(cpuId, power_regs[type]),32,0);
+    if (power_info.hasRAPL)
+    {
+        if (power_info.domains[type].supportFlags & POWER_DOMAIN_SUPPORT_STATUS)
+        {
+            uint64_t result = 0;
+            data->after = 0;
+            CHECK_MSR_READ_ERROR(HPMread(cpuId, MSR_DEV, power_regs[type], &result))
+            data->after = extractBitField(result,32,0);
+            data->domain = type;
+            return 0;
+        }
+        else
+        {
+            DEBUG_PRINT(DEBUGLEV_DEVELOP, RAPL domain %s not supported, power_names[type]);
+            return -EFAULT;
+        }
+    }
+    else
+    {
+        DEBUG_PLAIN_PRINT(DEBUGLEV_DEVELOP, No RAPL support);
+        return -EIO;
+    }
+}
+
+int
+power_read(int cpuId, uint64_t reg, uint32_t *data)
+{
+    int i;
+    PowerType type = -1;
+
+    if (power_info.hasRAPL)
+    {
+        for (i = 0; i < NUM_POWER_DOMAINS; i++)
+        {
+            if (reg == power_regs[i])
+            {
+                type = i;
+                break;
+            }
+        }
+        if (power_info.domains[type].supportFlags & POWER_DOMAIN_SUPPORT_STATUS)
+        {
+            uint64_t result = 0;
+            *data = 0;
+            CHECK_MSR_READ_ERROR(HPMread(cpuId, MSR_DEV, reg, &result))
+            *data = extractBitField(result,32,0);
+            return 0;
+        }
+        else
+        {
+            DEBUG_PRINT(DEBUGLEV_DEVELOP, RAPL domain %s not supported, power_names[type]);
+            return -EFAULT;
+        }
+    }
+    else
+    {
+        DEBUG_PLAIN_PRINT(DEBUGLEV_DEVELOP, No RAPL support);
+        return -EIO;
+    }
 }
 
-static uint32_t
-power_read(int cpuId, uint64_t reg)
+int
+power_tread(int socket_fd, int cpuId, uint64_t reg, uint32_t *data)
 {
-    return extractBitField(msr_read(cpuId, reg),32,0);
+    int i;
+    PowerType type;
+    if (power_info.hasRAPL)
+    {
+        for (i = 0; i < NUM_POWER_DOMAINS; i++)
+        {
+            if (reg == power_regs[i])
+            {
+                type = i;
+                break;
+            }
+        }
+        if (power_info.domains[type].supportFlags & POWER_DOMAIN_SUPPORT_STATUS)
+        {
+            uint64_t result = 0;
+            *data = 0;
+            CHECK_MSR_READ_ERROR(HPMread(cpuId, MSR_DEV, reg, &result))
+            *data = extractBitField(result,32,0);
+            return 0;
+        }
+        else
+        {
+            DEBUG_PRINT(DEBUGLEV_DEVELOP, RAPL domain %s not supported, power_names[type]);
+            return -EFAULT;
+        }
+    }
+    else
+    {
+        DEBUG_PLAIN_PRINT(DEBUGLEV_DEVELOP, No RAPL support);
+        return -EIO;
+    }
 }
 
-static uint32_t
-power_tread(int socket_fd, int cpuId, uint64_t reg)
+double
+power_getEnergyUnit(int domain)
 {
-    return extractBitField(msr_tread(socket_fd, cpuId, reg),32,0);
+    return power_info.domains[domain].energyUnit;
 }
 
 #endif /*POWER_H*/
diff --git a/src/includes/power_types.h b/src/includes/power_types.h
index b53ce85..341ceb2 100644
--- a/src/includes/power_types.h
+++ b/src/includes/power_types.h
@@ -5,13 +5,14 @@
  *
  *      Description:  Types file for power module.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -32,36 +33,10 @@
 #define POWER_TYPES_H
 
 #include <stdint.h>
+#include <likwid.h>
 
-typedef enum {
-    PKG = 0,
-    PP0,
-    PP1,
-    DRAM
-} PowerType;
 
-typedef struct {
-    int numSteps;
-    double* steps;
-} TurboBoost;
-
-typedef struct {
-    double baseFrequency;
-    double minFrequency;
-    TurboBoost turbo;
-    double powerUnit;
-    double energyUnit;
-    double timeUnit;
-    double tdp;
-    double minPower;
-    double maxPower;
-    double maxTimeWindow;
-} PowerInfo;
-
-typedef struct {
-    uint32_t before;
-    uint32_t after;
-} PowerData;
+extern uint32_t power_regs[NUM_POWER_DOMAINS];
 
 
 #endif /*POWER_TYPES_H*/
diff --git a/src/includes/registers.h b/src/includes/registers.h
index ae80e28..68c780a 100644
--- a/src/includes/registers.h
+++ b/src/includes/registers.h
@@ -5,13 +5,14 @@
  *
  *      Description:  Register Defines for the perfmon module
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -47,6 +48,10 @@
 #define MSR_PERFEVTSEL1           0x187
 #define MSR_PERFEVTSEL2           0x188
 #define MSR_PERFEVTSEL3           0x189
+#define MSR_PERFEVTSEL4           0x18A /* continues hexadecimally after MSR_PERFEVTSEL3 (0x189) */
+#define MSR_PERFEVTSEL5           0x18B
+#define MSR_PERFEVTSEL6           0x18C
+#define MSR_PERFEVTSEL7           0x18D
 #define MSR_PMC0                  0x0C1
 #define MSR_PMC1                  0x0C2
 #define MSR_PMC2                  0x0C3
@@ -60,6 +65,7 @@
 #define MSR_PERF_GLOBAL_STATUS    0x38E
 #define MSR_PERF_GLOBAL_OVF_CTRL  0x390
 #define MSR_PEBS_ENABLE           0x3F1
+#define MSR_PEBS_LD_LAT           0x3F6
 /* Perfmon V3 */
 #define MSR_OFFCORE_RESP0              0x1A6
 #define MSR_OFFCORE_RESP1              0x1A7
@@ -85,20 +91,21 @@
 #define MSR_UNCORE_PMC5                0x3B5
 #define MSR_UNCORE_PMC6                0x3B6
 #define MSR_UNCORE_PMC7                0x3B7
-/*
- * Perfmon V4 (starting with Haswell, according to
+/* 
+ * Perfmon V4 (starting with Haswell, according to 
  * Intel software developers guide also for SandyBridge,
- * IvyBridge not mentioned in this section)
+ * IvyBridge not mentioned in this section) 
  */
 #define MSR_UNC_PERF_GLOBAL_CTRL       MSR_UNCORE_PERF_GLOBAL_CTRL
 #define MSR_UNC_PERF_GLOBAL_STATUS     MSR_UNCORE_PERF_GLOBAL_STATUS
+#define MSR_UNC_PERF_GLOBAL_OVF_CTRL   MSR_UNCORE_PERF_GLOBAL_OVF_CTRL
 #define MSR_UNC_PERF_FIXED_CTRL        MSR_UNCORE_FIXED_CTR0
 #define MSR_UNC_PERF_FIXED_CTR         MSR_UNCORE_FIXED_CTR_CTRL
 #define MSR_UNC_ARB_PERFEVTSEL0        MSR_UNCORE_PMC2
 #define MSR_UNC_ARB_PERFEVTSEL1        MSR_UNCORE_PMC3
 #define MSR_UNC_ARB_CTR0               MSR_UNCORE_PMC0
 #define MSR_UNC_ARB_CTR1               MSR_UNCORE_PMC1
-#define MSR_UNC_CBO_CONFIG             0x396
+#define MSR_UNC_CBO_CONFIG             MSR_UNCORE_ADDR_OPCODE_MATCH
 #define MSR_UNC_CBO_0_PERFEVTSEL0      0x700
 #define MSR_UNC_CBO_0_PERFEVTSEL1      0x701
 #define MSR_UNC_CBO_0_CTR0             0x706
@@ -324,7 +331,10 @@
 #define MSR_UNC_PCU_PMON_CTL2           0xC32
 #define MSR_UNC_PCU_PMON_CTL3           0xC33
 #define MSR_UNC_PCU_PMON_BOX_FILTER     0xC34
-#define MSR_UNC_PCU_PMON_BOX_CTL        0xD24
+#define MSR_UNC_PCU_PMON_BOX_CTL        0xC24
+#define MSR_UNC_PCU_PMON_BOX_STATUS     0xC35
+#define MSR_UNC_PCU_PMON_FIXED_CTR0     0x3FC
+#define MSR_UNC_PCU_PMON_FIXED_CTR1     0x3FD
 
 /* UBox Performance Monitoring */
 
@@ -342,6 +352,7 @@
 /* HA Box Performance Monitoring */
 
 #define PCI_UNC_HA_PMON_BOX_CTL         0xF4
+#define PCI_UNC_HA_PMON_BOX_STATUS      0xF8
 #define PCI_UNC_HA_PMON_CTL_0           0xD8
 #define PCI_UNC_HA_PMON_CTL_1           0xDC
 #define PCI_UNC_HA_PMON_CTL_2           0xE0
@@ -378,9 +389,22 @@
 #define PCI_UNC_MC_PMON_CTR_2_B         0xB0
 #define PCI_UNC_MC_PMON_CTR_3_B         0xB8
 
+/* IRP Performance Monitoring */
+#define PCI_UNC_IRP_PMON_BOX_STATUS     0xF8
+#define PCI_UNC_IRP_PMON_BOX_CTL        0xF4
+#define PCI_UNC_IRP0_PMON_CTL_0         0xD8
+#define PCI_UNC_IRP0_PMON_CTL_1         0xDC
+#define PCI_UNC_IRP0_PMON_CTR_0         0xA0
+#define PCI_UNC_IRP0_PMON_CTR_1         0xB0
+#define PCI_UNC_IRP1_PMON_CTL_0         0xE0
+#define PCI_UNC_IRP1_PMON_CTL_1         0xE4
+#define PCI_UNC_IRP1_PMON_CTR_0         0xB8
+#define PCI_UNC_IRP1_PMON_CTR_1         0xC0
+
 /* QPI Box Performance Monitoring */
 
 #define PCI_UNC_QPI_PMON_BOX_CTL         0xF4
+#define PCI_UNC_QPI_PMON_BOX_STATUS      0xF8
 #define PCI_UNC_QPI_PMON_CTL_0           0xD8
 #define PCI_UNC_QPI_PMON_CTL_1           0xDC
 #define PCI_UNC_QPI_PMON_CTL_2           0xE0
@@ -402,6 +426,7 @@
 /* R2PCIE Box Performance Monitoring */
 
 #define PCI_UNC_R2PCIE_PMON_BOX_CTL         0xF4
+#define PCI_UNC_R2PCIE_PMON_BOX_STATUS      0xF8
 #define PCI_UNC_R2PCIE_PMON_CTL_0           0xD8
 #define PCI_UNC_R2PCIE_PMON_CTL_1           0xDC
 #define PCI_UNC_R2PCIE_PMON_CTL_2           0xE0
@@ -418,6 +443,7 @@
 /* R3QPI Box Performance Monitoring */
 
 #define PCI_UNC_R3QPI_PMON_BOX_CTL         0xF4
+#define PCI_UNC_R3QPI_PMON_BOX_STATUS      0xF8
 #define PCI_UNC_R3QPI_PMON_CTL_0           0xD8
 #define PCI_UNC_R3QPI_PMON_CTL_1           0xDC
 #define PCI_UNC_R3QPI_PMON_CTL_2           0xE0
@@ -428,6 +454,360 @@
 #define PCI_UNC_R3QPI_PMON_CTR_1_B         0xA8
 #define PCI_UNC_R3QPI_PMON_CTR_2_B         0xB0
 
+/* ########################################################## */
+/* Core v3 type uncore
+ * Naming following Intel Uncore Performance Monitoring Guide
+ * Ref. Nr. 331051-001
+ * */
+
+/* UBox Performance Monitoring */
+#define MSR_UNC_V3_U_PMON_CTR0             0x709
+#define MSR_UNC_V3_U_PMON_CTR1             0x70A
+#define MSR_UNC_V3_U_PMON_CTL0             0x705
+#define MSR_UNC_V3_U_PMON_CTL1             0x706
+#define MSR_UNC_V3_U_UCLK_FIXED_CTR        0x704
+#define MSR_UNC_V3_U_UCLK_FIXED_CTL        0x703
+#define MSR_UNC_V3_U_PMON_BOX_STATUS       0x708
+#define MSR_UNC_V3_U_PMON_GLOBAL_STATUS    0x701
+#define MSR_UNC_V3_U_PMON_GLOBAL_CTL       0x700
+#define MSR_UNC_V3_U_PMON_GLOBAL_CONFIG    0x702
+
+/* CBox Performance Monitoring */
+#define MSR_UNC_V3_C0_PMON_BOX_CTL         0xE00
+#define MSR_UNC_V3_C0_PMON_BOX_STATUS      0xE07
+#define MSR_UNC_V3_C0_PMON_BOX_FILTER0     0xE05
+#define MSR_UNC_V3_C0_PMON_BOX_FILTER1     0xE06
+#define MSR_UNC_V3_C0_PMON_CTL0            0xE01
+#define MSR_UNC_V3_C0_PMON_CTL1            0xE02
+#define MSR_UNC_V3_C0_PMON_CTL2            0xE03
+#define MSR_UNC_V3_C0_PMON_CTL3            0xE04
+#define MSR_UNC_V3_C0_PMON_CTR0            0xE08
+#define MSR_UNC_V3_C0_PMON_CTR1            0xE09
+#define MSR_UNC_V3_C0_PMON_CTR2            0xE0A
+#define MSR_UNC_V3_C0_PMON_CTR3            0xE0B
+
+#define MSR_UNC_V3_C1_PMON_BOX_CTL         0xE10
+#define MSR_UNC_V3_C1_PMON_BOX_STATUS      0xE17
+#define MSR_UNC_V3_C1_PMON_BOX_FILTER0     0xE15
+#define MSR_UNC_V3_C1_PMON_BOX_FILTER1     0xE16
+#define MSR_UNC_V3_C1_PMON_CTL0            0xE11
+#define MSR_UNC_V3_C1_PMON_CTL1            0xE12
+#define MSR_UNC_V3_C1_PMON_CTL2            0xE13
+#define MSR_UNC_V3_C1_PMON_CTL3            0xE14
+#define MSR_UNC_V3_C1_PMON_CTR0            0xE18
+#define MSR_UNC_V3_C1_PMON_CTR1            0xE19
+#define MSR_UNC_V3_C1_PMON_CTR2            0xE1A
+#define MSR_UNC_V3_C1_PMON_CTR3            0xE1B
+
+#define MSR_UNC_V3_C2_PMON_BOX_CTL         0xE20
+#define MSR_UNC_V3_C2_PMON_BOX_STATUS      0xE27
+#define MSR_UNC_V3_C2_PMON_BOX_FILTER0     0xE25
+#define MSR_UNC_V3_C2_PMON_BOX_FILTER1     0xE26
+#define MSR_UNC_V3_C2_PMON_CTL0            0xE21
+#define MSR_UNC_V3_C2_PMON_CTL1            0xE22
+#define MSR_UNC_V3_C2_PMON_CTL2            0xE23
+#define MSR_UNC_V3_C2_PMON_CTL3            0xE24
+#define MSR_UNC_V3_C2_PMON_CTR0            0xE28
+#define MSR_UNC_V3_C2_PMON_CTR1            0xE29
+#define MSR_UNC_V3_C2_PMON_CTR2            0xE2A
+#define MSR_UNC_V3_C2_PMON_CTR3            0xE2B
+
+#define MSR_UNC_V3_C3_PMON_BOX_CTL         0xE30
+#define MSR_UNC_V3_C3_PMON_BOX_STATUS      0xE37
+#define MSR_UNC_V3_C3_PMON_BOX_FILTER0     0xE35
+#define MSR_UNC_V3_C3_PMON_BOX_FILTER1     0xE36
+#define MSR_UNC_V3_C3_PMON_CTL0            0xE31
+#define MSR_UNC_V3_C3_PMON_CTL1            0xE32
+#define MSR_UNC_V3_C3_PMON_CTL2            0xE33
+#define MSR_UNC_V3_C3_PMON_CTL3            0xE34
+#define MSR_UNC_V3_C3_PMON_CTR0            0xE38
+#define MSR_UNC_V3_C3_PMON_CTR1            0xE39
+#define MSR_UNC_V3_C3_PMON_CTR2            0xE3A
+#define MSR_UNC_V3_C3_PMON_CTR3            0xE3B
+
+#define MSR_UNC_V3_C4_PMON_BOX_CTL         0xE40
+#define MSR_UNC_V3_C4_PMON_BOX_STATUS      0xE47
+#define MSR_UNC_V3_C4_PMON_BOX_FILTER0     0xE45
+#define MSR_UNC_V3_C4_PMON_BOX_FILTER1     0xE46
+#define MSR_UNC_V3_C4_PMON_CTL0            0xE41
+#define MSR_UNC_V3_C4_PMON_CTL1            0xE42
+#define MSR_UNC_V3_C4_PMON_CTL2            0xE43
+#define MSR_UNC_V3_C4_PMON_CTL3            0xE44
+#define MSR_UNC_V3_C4_PMON_CTR0            0xE48
+#define MSR_UNC_V3_C4_PMON_CTR1            0xE49
+#define MSR_UNC_V3_C4_PMON_CTR2            0xE4A
+#define MSR_UNC_V3_C4_PMON_CTR3            0xE4B
+
+#define MSR_UNC_V3_C5_PMON_BOX_CTL         0xE50
+#define MSR_UNC_V3_C5_PMON_BOX_STATUS      0xE57
+#define MSR_UNC_V3_C5_PMON_BOX_FILTER0     0xE55
+#define MSR_UNC_V3_C5_PMON_BOX_FILTER1     0xE56
+#define MSR_UNC_V3_C5_PMON_CTL0            0xE51
+#define MSR_UNC_V3_C5_PMON_CTL1            0xE52
+#define MSR_UNC_V3_C5_PMON_CTL2            0xE53
+#define MSR_UNC_V3_C5_PMON_CTL3            0xE54
+#define MSR_UNC_V3_C5_PMON_CTR0            0xE58
+#define MSR_UNC_V3_C5_PMON_CTR1            0xE59
+#define MSR_UNC_V3_C5_PMON_CTR2            0xE5A
+#define MSR_UNC_V3_C5_PMON_CTR3            0xE5B
+
+#define MSR_UNC_V3_C6_PMON_BOX_CTL         0xE60
+#define MSR_UNC_V3_C6_PMON_BOX_STATUS      0xE67
+#define MSR_UNC_V3_C6_PMON_BOX_FILTER0     0xE65
+#define MSR_UNC_V3_C6_PMON_BOX_FILTER1     0xE66
+#define MSR_UNC_V3_C6_PMON_CTL0            0xE61
+#define MSR_UNC_V3_C6_PMON_CTL1            0xE62
+#define MSR_UNC_V3_C6_PMON_CTL2            0xE63
+#define MSR_UNC_V3_C6_PMON_CTL3            0xE64
+#define MSR_UNC_V3_C6_PMON_CTR0            0xE68
+#define MSR_UNC_V3_C6_PMON_CTR1            0xE69
+#define MSR_UNC_V3_C6_PMON_CTR2            0xE6A
+#define MSR_UNC_V3_C6_PMON_CTR3            0xE6B
+
+#define MSR_UNC_V3_C7_PMON_BOX_CTL         0xE70
+#define MSR_UNC_V3_C7_PMON_BOX_STATUS      0xE77
+#define MSR_UNC_V3_C7_PMON_BOX_FILTER0     0xE75
+#define MSR_UNC_V3_C7_PMON_BOX_FILTER1     0xE76
+#define MSR_UNC_V3_C7_PMON_CTL0            0xE71
+#define MSR_UNC_V3_C7_PMON_CTL1            0xE72
+#define MSR_UNC_V3_C7_PMON_CTL2            0xE73
+#define MSR_UNC_V3_C7_PMON_CTL3            0xE74
+#define MSR_UNC_V3_C7_PMON_CTR0            0xE78
+#define MSR_UNC_V3_C7_PMON_CTR1            0xE79
+#define MSR_UNC_V3_C7_PMON_CTR2            0xE7A
+#define MSR_UNC_V3_C7_PMON_CTR3            0xE7B
+
+#define MSR_UNC_V3_C8_PMON_BOX_CTL         0xE80
+#define MSR_UNC_V3_C8_PMON_BOX_STATUS      0xE87
+#define MSR_UNC_V3_C8_PMON_BOX_FILTER0     0xE85
+#define MSR_UNC_V3_C8_PMON_BOX_FILTER1     0xE86
+#define MSR_UNC_V3_C8_PMON_CTL0            0xE81
+#define MSR_UNC_V3_C8_PMON_CTL1            0xE82
+#define MSR_UNC_V3_C8_PMON_CTL2            0xE83
+#define MSR_UNC_V3_C8_PMON_CTL3            0xE84
+#define MSR_UNC_V3_C8_PMON_CTR0            0xE88
+#define MSR_UNC_V3_C8_PMON_CTR1            0xE89
+#define MSR_UNC_V3_C8_PMON_CTR2            0xE8A
+#define MSR_UNC_V3_C8_PMON_CTR3            0xE8B
+
+#define MSR_UNC_V3_C9_PMON_BOX_CTL         0xE90
+#define MSR_UNC_V3_C9_PMON_BOX_STATUS      0xE97
+#define MSR_UNC_V3_C9_PMON_BOX_FILTER0     0xE95
+#define MSR_UNC_V3_C9_PMON_BOX_FILTER1     0xE96
+#define MSR_UNC_V3_C9_PMON_CTL0            0xE91
+#define MSR_UNC_V3_C9_PMON_CTL1            0xE92
+#define MSR_UNC_V3_C9_PMON_CTL2            0xE93
+#define MSR_UNC_V3_C9_PMON_CTL3            0xE94
+#define MSR_UNC_V3_C9_PMON_CTR0            0xE98
+#define MSR_UNC_V3_C9_PMON_CTR1            0xE99
+#define MSR_UNC_V3_C9_PMON_CTR2            0xE9A
+#define MSR_UNC_V3_C9_PMON_CTR3            0xE9B
+
+#define MSR_UNC_V3_C10_PMON_BOX_CTL        0xEA0
+#define MSR_UNC_V3_C10_PMON_BOX_STATUS     0xEA7
+#define MSR_UNC_V3_C10_PMON_BOX_FILTER0    0xEA5
+#define MSR_UNC_V3_C10_PMON_BOX_FILTER1    0xEA6
+#define MSR_UNC_V3_C10_PMON_CTL0           0xEA1
+#define MSR_UNC_V3_C10_PMON_CTL1           0xEA2
+#define MSR_UNC_V3_C10_PMON_CTL2           0xEA3
+#define MSR_UNC_V3_C10_PMON_CTL3           0xEA4
+#define MSR_UNC_V3_C10_PMON_CTR0           0xEA8
+#define MSR_UNC_V3_C10_PMON_CTR1           0xEA9
+#define MSR_UNC_V3_C10_PMON_CTR2           0xEAA
+#define MSR_UNC_V3_C10_PMON_CTR3           0xEAB
+
+#define MSR_UNC_V3_C11_PMON_BOX_CTL        0xEB0
+#define MSR_UNC_V3_C11_PMON_BOX_STATUS     0xEB7
+#define MSR_UNC_V3_C11_PMON_BOX_FILTER0    0xEB5
+#define MSR_UNC_V3_C11_PMON_BOX_FILTER1    0xEB6
+#define MSR_UNC_V3_C11_PMON_CTL0           0xEB1
+#define MSR_UNC_V3_C11_PMON_CTL1           0xEB2
+#define MSR_UNC_V3_C11_PMON_CTL2           0xEB3
+#define MSR_UNC_V3_C11_PMON_CTL3           0xEB4
+#define MSR_UNC_V3_C11_PMON_CTR0           0xEB8
+#define MSR_UNC_V3_C11_PMON_CTR1           0xEB9
+#define MSR_UNC_V3_C11_PMON_CTR2           0xEBA
+#define MSR_UNC_V3_C11_PMON_CTR3           0xEBB
+
+#define MSR_UNC_V3_C12_PMON_BOX_CTL        0xEC0
+#define MSR_UNC_V3_C12_PMON_BOX_STATUS     0xEC7
+#define MSR_UNC_V3_C12_PMON_BOX_FILTER0    0xEC5
+#define MSR_UNC_V3_C12_PMON_BOX_FILTER1    0xEC6
+#define MSR_UNC_V3_C12_PMON_CTL0           0xEC1
+#define MSR_UNC_V3_C12_PMON_CTL1           0xEC2
+#define MSR_UNC_V3_C12_PMON_CTL2           0xEC3
+#define MSR_UNC_V3_C12_PMON_CTL3           0xEC4
+#define MSR_UNC_V3_C12_PMON_CTR0           0xEC8
+#define MSR_UNC_V3_C12_PMON_CTR1           0xEC9
+#define MSR_UNC_V3_C12_PMON_CTR2           0xECA
+#define MSR_UNC_V3_C12_PMON_CTR3           0xECB
+
+#define MSR_UNC_V3_C13_PMON_BOX_CTL        0xED0
+#define MSR_UNC_V3_C13_PMON_BOX_STATUS     0xED7
+#define MSR_UNC_V3_C13_PMON_BOX_FILTER0    0xED5
+#define MSR_UNC_V3_C13_PMON_BOX_FILTER1    0xED6
+#define MSR_UNC_V3_C13_PMON_CTL0           0xED1
+#define MSR_UNC_V3_C13_PMON_CTL1           0xED2
+#define MSR_UNC_V3_C13_PMON_CTL2           0xED3
+#define MSR_UNC_V3_C13_PMON_CTL3           0xED4
+#define MSR_UNC_V3_C13_PMON_CTR0           0xED8
+#define MSR_UNC_V3_C13_PMON_CTR1           0xED9
+#define MSR_UNC_V3_C13_PMON_CTR2           0xEDA
+#define MSR_UNC_V3_C13_PMON_CTR3           0xEDB
+
+#define MSR_UNC_V3_C14_PMON_BOX_CTL        0xEE0
+#define MSR_UNC_V3_C14_PMON_BOX_STATUS     0xEE7
+#define MSR_UNC_V3_C14_PMON_BOX_FILTER0    0xEE5
+#define MSR_UNC_V3_C14_PMON_BOX_FILTER1    0xEE6
+#define MSR_UNC_V3_C14_PMON_CTL0           0xEE1
+#define MSR_UNC_V3_C14_PMON_CTL1           0xEE2
+#define MSR_UNC_V3_C14_PMON_CTL2           0xEE3
+#define MSR_UNC_V3_C14_PMON_CTL3           0xEE4
+#define MSR_UNC_V3_C14_PMON_CTR0           0xEE8
+#define MSR_UNC_V3_C14_PMON_CTR1           0xEE9
+#define MSR_UNC_V3_C14_PMON_CTR2           0xEEA
+#define MSR_UNC_V3_C14_PMON_CTR3           0xEEB
+
+#define MSR_UNC_V3_C15_PMON_BOX_CTL        0xEF0
+#define MSR_UNC_V3_C15_PMON_BOX_STATUS     0xEF7
+#define MSR_UNC_V3_C15_PMON_BOX_FILTER0    0xEF5
+#define MSR_UNC_V3_C15_PMON_BOX_FILTER1    0xEF6
+#define MSR_UNC_V3_C15_PMON_CTL0           0xEF1
+#define MSR_UNC_V3_C15_PMON_CTL1           0xEF2
+#define MSR_UNC_V3_C15_PMON_CTL2           0xEF3
+#define MSR_UNC_V3_C15_PMON_CTL3           0xEF4
+#define MSR_UNC_V3_C15_PMON_CTR0           0xEF8
+#define MSR_UNC_V3_C15_PMON_CTR1           0xEF9
+#define MSR_UNC_V3_C15_PMON_CTR2           0xEFA
+#define MSR_UNC_V3_C15_PMON_CTR3           0xEFB
+
+#define MSR_UNC_V3_C16_PMON_BOX_CTL        0xF00
+#define MSR_UNC_V3_C16_PMON_BOX_STATUS     0xF07
+#define MSR_UNC_V3_C16_PMON_BOX_FILTER0    0xF05
+#define MSR_UNC_V3_C16_PMON_BOX_FILTER1    0xF06
+#define MSR_UNC_V3_C16_PMON_CTL0           0xF01
+#define MSR_UNC_V3_C16_PMON_CTL1           0xF02
+#define MSR_UNC_V3_C16_PMON_CTL2           0xF03
+#define MSR_UNC_V3_C16_PMON_CTL3           0xF04
+#define MSR_UNC_V3_C16_PMON_CTR0           0xF08
+#define MSR_UNC_V3_C16_PMON_CTR1           0xF09
+#define MSR_UNC_V3_C16_PMON_CTR2           0xF0A
+#define MSR_UNC_V3_C16_PMON_CTR3           0xF0B
+
+#define MSR_UNC_V3_C17_PMON_BOX_CTL        0xF10
+#define MSR_UNC_V3_C17_PMON_BOX_STATUS     0xF17
+#define MSR_UNC_V3_C17_PMON_BOX_FILTER0    0xF15
+#define MSR_UNC_V3_C17_PMON_BOX_FILTER1    0xF16
+#define MSR_UNC_V3_C17_PMON_CTL0           0xF11
+#define MSR_UNC_V3_C17_PMON_CTL1           0xF12
+#define MSR_UNC_V3_C17_PMON_CTL2           0xF13
+#define MSR_UNC_V3_C17_PMON_CTL3           0xF14
+#define MSR_UNC_V3_C17_PMON_CTR0           0xF18
+#define MSR_UNC_V3_C17_PMON_CTR1           0xF19
+#define MSR_UNC_V3_C17_PMON_CTR2           0xF1A
+#define MSR_UNC_V3_C17_PMON_CTR3           0xF1B
+
+/* Sbox */
+#define MSR_UNC_V3_S0_PMON_BOX_CTL         0x720
+#define MSR_UNC_V3_S0_PMON_BOX_STATUS      0x725
+#define MSR_UNC_V3_S0_PMON_CTL_0           0x721
+#define MSR_UNC_V3_S0_PMON_CTL_1           0x722
+#define MSR_UNC_V3_S0_PMON_CTL_2           0x723
+#define MSR_UNC_V3_S0_PMON_CTL_3           0x724
+#define MSR_UNC_V3_S0_PMON_CTR_0           0x726
+#define MSR_UNC_V3_S0_PMON_CTR_1           0x727
+#define MSR_UNC_V3_S0_PMON_CTR_2           0x728
+#define MSR_UNC_V3_S0_PMON_CTR_3           0x729
+
+#define MSR_UNC_V3_S1_PMON_BOX_CTL         0x72A
+#define MSR_UNC_V3_S1_PMON_BOX_STATUS      0x72F
+#define MSR_UNC_V3_S1_PMON_CTL_0           0x72B
+#define MSR_UNC_V3_S1_PMON_CTL_1           0x72C
+#define MSR_UNC_V3_S1_PMON_CTL_2           0x72D
+#define MSR_UNC_V3_S1_PMON_CTL_3           0x72E
+#define MSR_UNC_V3_S1_PMON_CTR_0           0x730
+#define MSR_UNC_V3_S1_PMON_CTR_1           0x731
+#define MSR_UNC_V3_S1_PMON_CTR_2           0x732
+#define MSR_UNC_V3_S1_PMON_CTR_3           0x733
+
+#define MSR_UNC_V3_S2_PMON_BOX_CTL         0x734
+#define MSR_UNC_V3_S2_PMON_BOX_STATUS      0x739
+#define MSR_UNC_V3_S2_PMON_CTL_0           0x735
+#define MSR_UNC_V3_S2_PMON_CTL_1           0x736
+#define MSR_UNC_V3_S2_PMON_CTL_2           0x737
+#define MSR_UNC_V3_S2_PMON_CTL_3           0x738
+#define MSR_UNC_V3_S2_PMON_CTR_0           0x73A
+#define MSR_UNC_V3_S2_PMON_CTR_1           0x73B
+#define MSR_UNC_V3_S2_PMON_CTR_2           0x73C
+#define MSR_UNC_V3_S2_PMON_CTR_3           0x73D
+
+#define MSR_UNC_V3_S3_PMON_BOX_CTL         0x73E
+#define MSR_UNC_V3_S3_PMON_BOX_STATUS      0x743
+#define MSR_UNC_V3_S3_PMON_CTL_0           0x73F
+#define MSR_UNC_V3_S3_PMON_CTL_1           0x740
+#define MSR_UNC_V3_S3_PMON_CTL_2           0x741
+#define MSR_UNC_V3_S3_PMON_CTL_3           0x742
+#define MSR_UNC_V3_S3_PMON_CTR_0           0x744
+#define MSR_UNC_V3_S3_PMON_CTR_1           0x745
+#define MSR_UNC_V3_S3_PMON_CTR_2           0x746
+#define MSR_UNC_V3_S3_PMON_CTR_3           0x747
+
+/* V3 HA similar to V1/V2 */
+/* V3 iMC similar to V1/V2 */
+
+
+/* PCU (Power Control) Performance Monitoring */
+
+#define MSR_UNC_V3_PCU_PMON_CTR0           0x717
+#define MSR_UNC_V3_PCU_PMON_CTR1           0x718
+#define MSR_UNC_V3_PCU_PMON_CTR2           0x719
+#define MSR_UNC_V3_PCU_PMON_CTR3           0x71A
+#define MSR_UNC_V3_PCU_PMON_CTL0           0x711
+#define MSR_UNC_V3_PCU_PMON_CTL1           0x712
+#define MSR_UNC_V3_PCU_PMON_CTL2           0x713
+#define MSR_UNC_V3_PCU_PMON_CTL3           0x714
+#define MSR_UNC_V3_PCU_PMON_BOX_FILTER     0x715
+#define MSR_UNC_V3_PCU_PMON_BOX_CTL        0x710
+#define MSR_UNC_V3_PCU_PMON_BOX_STATUS     0x716
+#define MSR_UNC_V3_PCU_CC6_CTR             0x3FD
+#define MSR_UNC_V3_PCU_CC3_CTR             0x3FC
+#define MSR_UNC_V3_PCU_PC2_CTR             0x60D
+#define MSR_UNC_V3_PCU_PC3_CTR             0x3F8
+
+/* V3 QPI Box Performance Monitoring, mostly similar to V1/V2 */
+
+#define PCI_UNC_V3_QPI_PMON_BOX_CTL         0xF4
+#define PCI_UNC_V3_QPI_PMON_BOX_STATUS      0xF8
+#define PCI_UNC_V3_QPI_PMON_CTL_0           0xD8
+#define PCI_UNC_V3_QPI_PMON_CTL_1           0xDC
+#define PCI_UNC_V3_QPI_PMON_CTL_2           0xE0
+#define PCI_UNC_V3_QPI_PMON_CTL_3           0xE4
+#define PCI_UNC_V3_QPI_PMON_CTR_0_A         0xA4
+#define PCI_UNC_V3_QPI_PMON_CTR_1_A         0xAC
+#define PCI_UNC_V3_QPI_PMON_CTR_2_A         0xB4
+#define PCI_UNC_V3_QPI_PMON_CTR_3_A         0xBC
+#define PCI_UNC_V3_QPI_PMON_CTR_0_B         0xA0
+#define PCI_UNC_V3_QPI_PMON_CTR_1_B         0xA8
+#define PCI_UNC_V3_QPI_PMON_CTR_2_B         0xB0
+#define PCI_UNC_V3_QPI_PMON_CTR_3_B         0xB8
+#define PCI_UNC_V3_QPI_PMON_RX_MASK_0          0x238
+#define PCI_UNC_V3_QPI_PMON_RX_MASK_1          0x23C
+#define PCI_UNC_V3_QPI_PMON_RX_MATCH_0         0x228
+#define PCI_UNC_V3_QPI_PMON_RX_MATCH_1         0x22C
+#define PCI_UNC_V3_QPI_PMON_TX_MASK_0          0x210
+#define PCI_UNC_V3_QPI_PMON_TX_MASK_1          0x214
+#define PCI_UNC_V3_QPI_PMON_TX_MATCH_0         0x200
+#define PCI_UNC_V3_QPI_PMON_TX_MATCH_1         0x204
+#define PCI_UNC_V3_QPI_RATE_STATUS          0xD4
+#define PCI_UNC_V3_QPI_LINK_LLR             0xD0
+#define PCI_UNC_V3_QPI_LINK_IDLE            0xC8
+
+
+/* V3 R2PCIE Box Performance Monitoring similar to V1/V2 */
+
+/* V3 R3QPI Box Performance Monitoring similar to V1/V2 */
+
+/* ########################################################## */
 
 /* EX type uncore */
 /* U box - System Config Controller */
@@ -774,6 +1154,7 @@
 /* Match/Mask MSRs */
 #define MSR_B0_PMON_MATCH               0xE45
 #define MSR_B0_PMON_MASK                0xE46
+#define MSR_S0_PMON_MM_CFG              0xE48 /* config register sits directly below MATCH, as for S1 (0xE58/0xE59) */
 #define MSR_S0_PMON_MATCH               0xE49
 #define MSR_S0_PMON_MASK                0xE4A
 #define MSR_B1_PMON_MATCH               0xE4D
@@ -781,6 +1162,7 @@
 #define MSR_M0_PMON_MM_CONFIG           0xE54
 #define MSR_M0_PMON_ADDR_MATCH          0xE55
 #define MSR_M0_PMON_ADDR_MASK           0xE56
+#define MSR_S1_PMON_MM_CFG              0xE58
 #define MSR_S1_PMON_MATCH               0xE59
 #define MSR_S1_PMON_MASK                0xE5A
 #define MSR_M1_PMON_MM_CONFIG           0xE5C
@@ -803,6 +1185,8 @@
 #define MSR_DRAM_ENERGY_STATUS          0x619
 #define MSR_DRAM_PERF_STATUS            0x61B
 #define MSR_DRAM_POWER_INFO             0x61C
+/* Intel Silvermont's RAPL registers */
+#define MSR_PKG_POWER_INFO_SILVERMONT   0x66E
 
 /* TM/TM2 interface */
 #define IA32_THERM_STATUS               0x19C
@@ -814,9 +1198,9 @@
 #define MSR_PLATFORM_INFO               0x0CE
 #define MSR_TURBO_POWER_CURRENT_LIMIT   0x1AC
 #define MSR_TURBO_RATIO_LIMIT           0x1AD
+#define MSR_TURBO_RATIO_LIMIT1          0x1AE
+#define MSR_TURBO_RATIO_LIMIT2          0x1AF
 
-/* Intel Silvermont's RAPL registers */
-#define MSR_PKG_POWER_INFO_SILVERMONT   0x66E
 /*
  * AMD
  */
@@ -866,6 +1250,15 @@
 #define MSR_AMD16_PMC2                  0xC0010006
 #define MSR_AMD16_PMC3                  0xC0010007
 
+#define MSR_AMD16_L2_PERFEVTSEL0        0xC0010230
+#define MSR_AMD16_L2_PERFEVTSEL1        0xC0010232
+#define MSR_AMD16_L2_PERFEVTSEL2        0xC0010234
+#define MSR_AMD16_L2_PERFEVTSEL3        0xC0010236
+#define MSR_AMD16_L2_PMC0               0xC0010231
+#define MSR_AMD16_L2_PMC1               0xC0010233
+#define MSR_AMD16_L2_PMC2               0xC0010235
+#define MSR_AMD16_L2_PMC3               0xC0010237
+
 #define MSR_AMD16_NB_PERFEVTSEL0        0xC0010240
 #define MSR_AMD16_NB_PERFEVTSEL1        0xC0010242
 #define MSR_AMD16_NB_PERFEVTSEL2        0xC0010244
diff --git a/src/includes/registers_types.h b/src/includes/registers_types.h
new file mode 100644
index 0000000..9674ad0
--- /dev/null
+++ b/src/includes/registers_types.h
@@ -0,0 +1,198 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  registers_types.h
+ *
+ *      Description:  Type definitions for counter registers (register indices, unit types, register and box maps).
+ *
+ *      Version:   4.0
+ *      Released:  16.6.2015
+ *
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#ifndef REGISTERS_TYPES_H
+#define REGISTERS_TYPES_H
+
+#include <pci_types.h>
+
+typedef enum { /* consecutive counter slot indices PMC0..PMC204 */
+    PMC0 = 0,
+    PMC1, PMC2, PMC3, PMC4, PMC5, PMC6,
+    PMC7, PMC8, PMC9, PMC10, PMC11, PMC12,
+    PMC13, PMC14, PMC15, PMC16, PMC17, PMC18,
+    PMC19, PMC20, PMC21, PMC22, PMC23, PMC24,
+    PMC25, PMC26, PMC27, PMC28, PMC29, PMC30,
+    PMC31, PMC32, PMC33, PMC34, PMC35, PMC36,
+    PMC37, PMC38, PMC39, PMC40, PMC41, PMC42,
+    PMC43, PMC44, PMC45, PMC46, PMC47, PMC48,
+    PMC49, PMC50, PMC51, PMC52, PMC53, PMC54,
+    PMC55, PMC56, PMC57, PMC58, PMC59, PMC60,
+    PMC61, PMC62, PMC63, PMC64, PMC65, PMC66,
+    PMC67, PMC68, PMC69, PMC70, PMC71, PMC72,
+    PMC73, PMC74, PMC75, PMC76, PMC77, PMC78,
+    PMC79, PMC80, PMC81, PMC82, PMC83, PMC84,
+    PMC85, PMC86, PMC87, PMC88, PMC89, PMC90,
+    PMC91, PMC92, PMC93, PMC94, PMC95, PMC96,
+    PMC97, PMC98, PMC99, PMC100, PMC101, PMC102,
+    PMC103, PMC104, PMC105, PMC106, PMC107, PMC108,
+    PMC109, PMC110, PMC111, PMC112, PMC113, PMC114,
+    PMC115, PMC116, PMC117, PMC118, PMC119, PMC120,
+    PMC121, PMC122, PMC123, PMC124, PMC125, PMC126,
+    PMC127, PMC128, PMC129, PMC130, PMC131, PMC132,
+    PMC133, PMC134, PMC135, PMC136, PMC137, PMC138,
+    PMC139, PMC140, PMC141, PMC142, PMC143, PMC144,
+    PMC145, PMC146, PMC147, PMC148, PMC149, PMC150,
+    PMC151, PMC152, PMC153, PMC154, PMC155, PMC156,
+    PMC157, PMC158, PMC159, PMC160, PMC161, PMC162,
+    PMC163, PMC164, PMC165, PMC166, PMC167, PMC168,
+    PMC169, PMC170, PMC171, PMC172, PMC173, PMC174,
+    PMC175, PMC176, PMC177, PMC178, PMC179, PMC180,
+    PMC181, PMC182, PMC183, PMC184, PMC185, PMC186,
+    PMC187, PMC188, PMC189, PMC190, PMC191, PMC192,
+    PMC193, PMC194, PMC195, PMC196, PMC197, PMC198,
+    PMC199, PMC200, PMC201, PMC202, PMC203, PMC204,
+    NUM_PMC /* == 205, the number of PMCn enumerators above */
+} RegisterIndex;
+
+typedef enum { /* unit/box a counter belongs to; values below NUM_UNITS double as bit positions in REG_TYPE_MASK */
+    PMC = 0, FIXED, THERMAL,
+    POWER, UNCORE, MBOX0,
+    MBOX1, MBOX2, MBOX3,
+    MBOX4, MBOX5, MBOX6, MBOX7,
+    MBOX0FIX, MBOX1FIX, MBOX2FIX,
+    MBOX3FIX, MBOX4FIX, MBOX5FIX,
+    MBOX6FIX, MBOX7FIX,
+    BBOX0, BBOX1,
+    RBOX0, RBOX1, RBOX2,
+    WBOX,
+    WBOX0FIX, WBOX1FIX,
+    SBOX0, SBOX1, SBOX2, SBOX3,
+    SBOX0FIX, SBOX1FIX, SBOX2FIX, SBOX3FIX,
+    CBOX0, CBOX1, CBOX2,
+    CBOX3, CBOX4, CBOX5,
+    CBOX6, CBOX7, CBOX8,
+    CBOX9, CBOX10, CBOX11,
+    CBOX12, CBOX13, CBOX14,
+    CBOX15, CBOX16, CBOX17,
+    PBOX,
+    UBOX,
+    UBOXFIX,
+    IBOX0, IBOX1,
+    QBOX0, QBOX1,
+    QBOX0FIX, QBOX1FIX,
+    NUM_UNITS, NOTYPE, MAX_UNITS /* NUM_UNITS == 64, NOTYPE == 65, MAX_UNITS == 66 (size of RegisterTypeNames) */
+} RegisterType;
+
+static char* RegisterTypeNames[MAX_UNITS] = { /* description string per RegisterType, indexed by the enum value */
+    [PMC] = "Core-local general purpose counters",
+    [FIXED] = "Fixed counters",
+    [THERMAL] = "Thermal",
+    [POWER] = "Energy/Power counters (RAPL)",
+    [UNCORE] = "Socket-local general/fixed purpose counters",
+    [MBOX0] = "Memory Controller 0 Channel 0",
+    [MBOX1] = "Memory Controller 0 Channel 1",
+    [MBOX2] = "Memory Controller 0 Channel 2",
+    [MBOX3] = "Memory Controller 0 Channel 3",
+    [MBOX4] = "Memory Controller 1 Channel 0",
+    [MBOX5] = "Memory Controller 1 Channel 1",
+    [MBOX6] = "Memory Controller 1 Channel 2",
+    [MBOX7] = "Memory Controller 1 Channel 3",
+    [MBOX0FIX] = "Memory Controller 0 Channel 0 Fixed Counter",
+    [MBOX1FIX] = "Memory Controller 0 Channel 1 Fixed Counter",
+    [MBOX2FIX] = "Memory Controller 0 Channel 2 Fixed Counter",
+    [MBOX3FIX] = "Memory Controller 0 Channel 3 Fixed Counter",
+    [MBOX4FIX] = "Memory Controller 1 Channel 0 Fixed Counter",
+    [MBOX5FIX] = "Memory Controller 1 Channel 1 Fixed Counter",
+    [MBOX6FIX] = "Memory Controller 1 Channel 2 Fixed Counter",
+    [MBOX7FIX] = "Memory Controller 1 Channel 3 Fixed Counter",
+    [BBOX0] = "Home Agent box 0",
+    [BBOX1] = "Home Agent box 1",
+    [RBOX0] = "Routing box 0",
+    [RBOX1] = "Routing box 1", [RBOX2] = "Routing box 2", /* RBOX2 named so its slot is not NULL */
+    [WBOX] = "Power control box",
+    [WBOX0FIX] = "Power control box fixed counter 0",
+    [WBOX1FIX] = "Power control box fixed counter 1",
+    [SBOX0] = "QPI Link Layer box 0",
+    [SBOX1] = "QPI Link Layer box 1",
+    [SBOX2] = "QPI Link Layer box 2",
+    [SBOX3] = "QPI Link Layer box 3",
+    [SBOX0FIX] = "QPI Link Layer box fixed 0",
+    [SBOX1FIX] = "QPI Link Layer box fixed 1",
+    [SBOX2FIX] = "QPI Link Layer box fixed 2",
+    [SBOX3FIX] = "QPI Link Layer box fixed 3",
+    [CBOX0] = "Caching Agent box 0",
+    [CBOX1] = "Caching Agent box 1",
+    [CBOX2] = "Caching Agent box 2",
+    [CBOX3] = "Caching Agent box 3",
+    [CBOX4] = "Caching Agent box 4",
+    [CBOX5] = "Caching Agent box 5",
+    [CBOX6] = "Caching Agent box 6",
+    [CBOX7] = "Caching Agent box 7",
+    [CBOX8] = "Caching Agent box 8",
+    [CBOX9] = "Caching Agent box 9",
+    [CBOX10] = "Caching Agent box 10",
+    [CBOX11] = "Caching Agent box 11",
+    [CBOX12] = "Caching Agent box 12",
+    [CBOX13] = "Caching Agent box 13",
+    [CBOX14] = "Caching Agent box 14",
+    [CBOX15] = "Caching Agent box 15",
+    [CBOX16] = "Caching Agent box 16",
+    [CBOX17] = "Caching Agent box 17",
+    [PBOX] = "Physical Layer box",
+    [UBOX] = "System Configuration box",
+    [UBOXFIX] = "System Configuration box fixed counter",
+    [IBOX0] = "Coherency Maintainer for IIO traffic",
+    [IBOX1] = "Coherency Maintainer for IIO traffic",
+    [QBOX0] = "QPI Link Layer 0",
+    [QBOX1] = "QPI Link Layer 1",
+    [QBOX0FIX] = "QPI Link Layer rate status 0",
+    [QBOX1FIX] = "QPI Link Layer rate status 1",
+    [NUM_UNITS] = "Maximally usable register types",
+    [NOTYPE] = "No Type, used for skipping unavailable counters"
+};
+
+#define REG_TYPE_MASK(type) (((type) < NUM_UNITS) ? (0x1ULL<<(type)) : 0x0ULL) /* bit for a RegisterType, 0 if >= NUM_UNITS; argument parenthesized so expressions expand correctly */
+
+typedef struct { /* one entry per counter; NOTE(review): field meanings below are inferred from names — confirm against users */
+    char*               key;              /* presumably the counter's name string */
+    RegisterIndex       index;            /* slot in the PMCn index space */
+    RegisterType        type;             /* unit/box this counter belongs to */
+    uint64_t            configRegister;   /* presumably address of the control register */
+    uint64_t            counterRegister;  /* presumably address of the counter register */
+    uint64_t            counterRegister2; /* presumably a second counter register where one exists — TODO confirm */
+    PciDeviceIndex      device;           /* PciDeviceIndex comes from pci_types.h */
+    uint64_t            optionMask;       /* presumably a bit mask of accepted options — TODO confirm */
+} RegisterMap;
+
+typedef struct { /* per-unit register description; NOTE(review): field meanings below are inferred from names — confirm against users */
+    uint32_t  ctrlRegister;     /* presumably the box control register */
+    uint32_t  statusRegister;   /* presumably the box status register */
+    uint32_t  ovflRegister;     /* presumably the overflow register */
+    int       ovflOffset;       /* presumably a bit offset inside ovflRegister — TODO confirm */
+    uint8_t   isPci;            /* presumably non-zero when the box is accessed through PCI rather than MSR */
+    PciDeviceIndex device;      /* PciDeviceIndex comes from pci_types.h */
+    uint32_t  regWidth;         /* presumably the counter width in bits — TODO confirm */
+    uint32_t  filterRegister1;  /* presumably first filter register of the box */
+    uint32_t  filterRegister2;  /* presumably second filter register of the box */
+} BoxMap;
+
+#endif
diff --git a/src/includes/strUtil.h b/src/includes/strUtil.h
deleted file mode 100644
index 18236b6..0000000
--- a/src/includes/strUtil.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  strUtil.h
- *
- *      Description:  Header File strUtil Module. 
- *                    Helper routines for bstrlib and command line parsing
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-#ifndef STRUTIL_H
-#define STRUTIL_H
-
-#include <bstrlib.h>
-#include <types.h>
-#include <time.h>
-
-#define CHECK_OPTION_STRING  \
-if (! (argString = bSecureInput(400,optarg))) {  \
-    ERROR_PLAIN_PRINT(Failed to read argument string!);  \
-}
-
-extern int str2int(const char* str);
-extern uint32_t bstr_to_cpuset_physical(uint32_t* threads,  const_bstring q);
-extern int bstr_to_cpuset(int* threads,  const_bstring str);
-extern void bstr_to_eventset(StrUtilEventSet* set, const_bstring str);
-extern bstring bSecureInput (int maxlen, char* vgcCtx);
-extern int bJustifyCenter (bstring b, int width);
-extern void bstr_to_workgroup(Workgroup* threads,  const_bstring str, DataType type, int numberOfStreams);
-extern FILE* bstr_to_outstream(const_bstring argString, bstring filter);
-extern uint64_t bstr_to_doubleSize(const_bstring str, DataType type);
-extern void bstr_to_interval(const_bstring str, struct timespec* interval);
-
-#endif /*STRUTIL_H*/
diff --git a/src/includes/strUtil_types.h b/src/includes/strUtil_types.h
deleted file mode 100644
index 25766ff..0000000
--- a/src/includes/strUtil_types.h
+++ /dev/null
@@ -1,61 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  strUtil_types.h
- *
- *      Description:  Types file for strUtil module.
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-#ifndef STRUTIL_TYPES_H
-#define STRUTIL_TYPES_H
-
-#include  <bstrlib.h>
-
-
-typedef struct {
-    bstring eventName;
-    bstring counterName;
-} StrUtilEvent;
-
-typedef struct {
-    StrUtilEvent* events;
-    int numberOfEvents;
-} StrUtilEventSet;
-
-typedef struct {
-    bstring domain;
-    int offset;
-    void* ptr;
-} Stream;
-
-typedef struct {
-    uint32_t numberOfThreads;
-    int* processorIds;
-    uint64_t size;
-    Stream* streams;
-} Workgroup;
-
-
-#endif /*STRUTIL_TYPES_H*/
diff --git a/src/includes/test_types.h b/src/includes/test_types.h
deleted file mode 100644
index 45c0932..0000000
--- a/src/includes/test_types.h
+++ /dev/null
@@ -1,108 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  test_types.h
- *
- *      Description:  Type definitions for benchmarking framework
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-#ifndef TEST_TYPES_H
-#define TEST_TYPES_H
-
-#include <stdint.h>
-#include <bstrlib.h>
-
-typedef void (*FuncPrototype)();
-
-typedef enum {
-    SINGLE = 0,
-    DOUBLE,
-    SINGLE_RAND,
-    DOUBLE_RAND
-} DataType;
-
-typedef enum {
-    STREAM_1 = 1,
-    STREAM_2,
-    STREAM_3,
-    STREAM_4,
-    STREAM_5,
-    STREAM_6,
-    STREAM_7,
-    STREAM_8,
-    STREAM_9,
-    STREAM_10,
-    STREAM_11,
-    STREAM_12,
-    STREAM_13,
-    STREAM_14,
-    STREAM_15,
-    STREAM_16,
-    STREAM_17,
-    STREAM_18,
-    STREAM_19,
-    STREAM_20,
-    STREAM_21,
-    STREAM_22,
-    STREAM_23,
-    STREAM_24,
-    STREAM_25,
-    STREAM_26,
-    STREAM_27,
-    STREAM_28,
-    STREAM_29,
-    STREAM_30,
-    STREAM_31,
-    STREAM_32,
-    STREAM_33,
-    STREAM_34,
-    STREAM_35,
-    STREAM_36,
-    STREAM_37,
-    STREAM_38,
-    MAX_STREAMS
-} Pattern;
-
-typedef struct {
-    char* name;
-    Pattern streams;
-    DataType type ;
-    int stride;
-    FuncPrototype kernel;
-    double flops;
-    int bytes;
-} TestCase;
-
-typedef struct {
-    uint64_t size;
-    uint32_t iter;
-    const TestCase* test;
-    uint64_t cycles;
-    uint32_t numberOfThreads;
-    int* processors;
-    void** streams;
-} ThreadUserData;
-
-#endif /*TEST_TYPES_H*/
diff --git a/src/includes/textcolor.h b/src/includes/textcolor.h
index 4c1b7b1..a16592b 100644
--- a/src/includes/textcolor.h
+++ b/src/includes/textcolor.h
@@ -7,13 +7,13 @@
  *                    Allows toggling of terminal escape sequences for 
  *                    colored text.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
diff --git a/src/includes/thermal.h b/src/includes/thermal.h
index 3153386..24e1dd4 100644
--- a/src/includes/thermal.h
+++ b/src/includes/thermal.h
@@ -6,13 +6,13 @@
  *      Description:  Header File Thermal Module.
  *                    Implements Intel TM/TM2 Interface.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -35,19 +35,43 @@
 #include <registers.h>
 #include <bitUtil.h>
 #include <msr.h>
+#include <error.h>
+#include <access.h>
 
-extern ThermalInfo thermal_info;
 
-extern void thermal_init(int cpuId);
-static inline uint32_t thermal_read(int cpuId);
 
-static uint32_t
-thermal_read(int cpuId)
+int
+thermal_read(int cpuId, uint32_t *data) /* Reads IA32_THERM_STATUS for cpuId through HPMread() and stores (activationT - offset) - readout in *data; returns 0 on success, -EIO if the read fails. */
 {
-    uint32_t readout = extractBitField(msr_read(cpuId, IA32_THERM_STATUS),7,16);
-    return (readout == 0 ? 
-            thermal_info.activationT - thermal_info.offset :
-            (thermal_info.activationT-thermal_info.offset) - readout );
+    uint64_t result = 0; /* receives the raw register contents from HPMread() */
+    uint32_t readout = 0;
+    if (HPMread(cpuId, MSR_DEV, IA32_THERM_STATUS, &result)) /* a nonzero return value is treated as failure */
+    {
+        *data = 0; /* on failure the caller sees 0 in *data */
+        return -EIO;
+    }
+    readout = extractBitField(result,7,16); /* presumably a 7-bit field starting at bit 16 — confirm the argument order in bitUtil.h */
+    *data = (readout == 0 ? /* both branches yield the same value when readout is 0 */
+                thermal_info.activationT - thermal_info.offset :
+                (thermal_info.activationT - thermal_info.offset) - readout );
+    return 0;
+}
+
+/*
+ * Variant of thermal_read() that additionally accepts a socket descriptor.
+ *
+ * socket_fd is not used by this implementation: the register is read
+ * through HPMread() exactly as in thermal_read(), so the work is delegated
+ * to thermal_read() instead of keeping a second copy of the same code.
+ *
+ * On success the value computed from thermal_info and the register readout
+ * is stored in *data and 0 is returned. If the register read fails, 0 is
+ * stored in *data and -EIO is returned.
+ */
+int
+thermal_tread(int socket_fd, int cpuId, uint32_t *data)
+{
+    (void)socket_fd; /* unused; the cast silences unused-parameter warnings */
+    return thermal_read(cpuId, data);
 }
 
 #endif /*THERMAL_H*/
diff --git a/src/includes/thermal_types.h b/src/includes/thermal_types.h
index a619180..1205bd3 100644
--- a/src/includes/thermal_types.h
+++ b/src/includes/thermal_types.h
@@ -5,13 +5,13 @@
  *
  *      Description:  Types file for thermal module.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -33,7 +33,9 @@
 
 #include <stdint.h>
 
-
+/** \addtogroup ThermalMon
+ *  @{
+ */
 typedef struct {
     uint16_t highT;
     uint32_t resolution;
@@ -41,5 +43,11 @@ typedef struct {
     uint32_t offset;
 } ThermalInfo;
 
+/** \brief Pointer for exporting the ThermalInfo data structure */
+typedef ThermalInfo* ThermalInfo_t;
+/** @}*/
+
+extern ThermalInfo thermal_info; /* declaration only, the object is defined elsewhere; thermal_read() and thermal_tread() in thermal.h read its activationT and offset fields */
+
 
 #endif /*THERMAL_TYPES_H*/
diff --git a/src/includes/threads.h b/src/includes/threads.h
deleted file mode 100644
index 6e00191..0000000
--- a/src/includes/threads.h
+++ /dev/null
@@ -1,107 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  threads.h
- *
- *      Description:  Header file of pthread interface module
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-#ifndef THREADS_H
-#define THREADS_H
-
-#include <types.h>
-#include <pthread.h>
-#include <threads_types.h>
-#include <stdio.h>
-
-#define THREADS_BARRIER pthread_barrier_wait(&threads_barrier)
-
-extern pthread_barrier_t threads_barrier;
-extern ThreadData* threads_data;
-extern ThreadGroup* threads_groups;
-
-
-/**
- * @brief  Initialization of the thread module
- * @param  numberOfThreads  The total number of threads
- */
-extern void threads_init(FILE* OUTSTREAM, int numberOfThreads);
-
-/**
- * @brief  Create all threads
- * @param  startRoutine thread entry function pointer
- */
-extern void threads_create(void *(*startRoutine)(void*));
-
-/**
- * @brief  Register User thread data for all threads
- * @param  data  Reference to the user data structo
- * @param  func  Optional function pointer to copy data
- */
-extern void threads_registerDataAll(
-        ThreadUserData* data,
-        threads_copyDataFunc func);
-
-/**
- * @brief  Register User thread data for one thread
- * @param  threadId thread Id 
- * @param  data  Reference to the user data structo
- * @param  func  Optional function pointer to copy data
- */
-extern void threads_registerDataThread(
-        int threadId,
-        ThreadUserData* data,
-        threads_copyDataFunc func);
-
-/**
- * @brief  Register User thread data for a thread group
- * @param  groupId  group Id
- * @param  data  Reference to the user data structo
- * @param  func  Optional function pointer to copy data
- */
-extern void threads_registerDataGroup(
-        int groupId,
-        ThreadUserData* data,
-        threads_copyDataFunc func);
-
-/**
- * @brief  Join the threads and free pthread related data structures
- * @param
- */
-extern void threads_join(void);
-
-/**
- * @brief  Free memory of thread data structures
- * @param  numberOfGroups The number of groups to destroy
- */
-extern void threads_destroy(int numberOfGroups);
-
-/**
- * @brief  Create Thread groups
- * @param  numberOfGroups The number of groups to create
- */
-extern void threads_createGroups(int numberOfGroups);
-
-#endif /* THREADS_H */
diff --git a/src/includes/threads_types.h b/src/includes/threads_types.h
deleted file mode 100644
index dfa13f3..0000000
--- a/src/includes/threads_types.h
+++ /dev/null
@@ -1,57 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  threads_types.h
- *
- *      Description:  Types file for threads module.
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-#ifndef THREADS_TYPES_H
-#define THREADS_TYPES_H
-
-#include <stdio.h>
-#include <stdint.h>
-
-typedef struct {
-    int globalNumberOfThreads;
-    int numberOfThreads;
-    int globalThreadId;
-    int threadId;
-    int numberOfGroups;
-    int groupId;
-    double time;
-    uint64_t cycles;
-    FILE* output;
-    ThreadUserData data;
-} ThreadData;
-
-typedef struct {
-    int numberOfThreads;
-    int* threadIds;
-} ThreadGroup;
-
-typedef void (*threads_copyDataFunc)(ThreadUserData* src,ThreadUserData* dst);
-
-#endif /*THREADS_TYPES_H*/
diff --git a/src/includes/timer.h b/src/includes/timer.h
index b97f4ac..4991ee9 100644
--- a/src/includes/timer.h
+++ b/src/includes/timer.h
@@ -10,13 +10,13 @@
  *      with rdtsc of 100 cycles in the worst case. Therefore sensible
  *      measurements should be over 1000 cycles.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -37,36 +37,6 @@
 
 #include <types.h>
 
-#define RDTSC(cpu_c) \
-    __asm__ volatile("xor %%eax,%%eax\n\t" \
-                     "cpuid\n\t"           \
-                     "rdtsc\n\t"           \
-                     "movl %%eax, %0\n\t"  \
-                     "movl %%edx, %1\n\t"  \
-    : "=r" ((cpu_c).int32.lo), "=r" ((cpu_c).int32.hi) \
-    : : "%eax","%ebx","%ecx","%edx")
-
-#define RDTSC_CR(cpu_c) \
-    __asm__ volatile("rdtsc\n\t"           \
-                     "movl %%eax, %0\n\t"  \
-                     "movl %%edx, %1\n\t"  \
-    : "=r" ((cpu_c).int32.lo), "=r" ((cpu_c).int32.hi) \
-    : : "%eax","%ebx","%ecx","%edx")
-
-#define RDTSCP(cpu_c) \
-    __asm__ volatile("rdtscp\n\t"          \
-                     "movl %%eax, %0\n\t"  \
-                     "movl %%edx, %1\n\t"  \
-                     "cpuid\n\t"           \
-    : "=r" ((cpu_c).int32.lo), "=r" ((cpu_c).int32.hi) \
-    : : "%eax","%ebx","%ecx","%edx")
-
-#ifdef HAS_RDTSCP
-#define RDTSC_STOP(cpu_c) RDTSCP(cpu_c);
-#else
-#define RDTSC_STOP(cpu_c) RDTSC_CR(cpu_c);
-#endif
-
 
 extern void timer_init( void );
 extern double timer_print( TimerData* );
@@ -74,43 +44,11 @@ extern uint64_t timer_printCycles( TimerData* );
 extern uint64_t timer_getCpuClock( void );
 extern uint64_t timer_getBaseline( void );
 
-static inline void timer_start( TimerData* );
-static inline void timer_stop ( TimerData* );
-
-void timer_start( TimerData* time )
-{
-#ifdef __x86_64
-    RDTSC(time->start);
-#endif
-#ifdef _ARCH_PPC
-    uint32_t tbl, tbu0, tbu1;
-
-    do {
-        __asm__ __volatile__ ("mftbu %0" : "=r"(tbu0));
-        __asm__ __volatile__ ("mftb %0" : "=r"(tbl));
-        __asm__ __volatile__ ("mftbu %0" : "=r"(tbu1));
-    } while (tbu0 != tbu1);
+extern void timer_start( TimerData* );
+extern void timer_stop ( TimerData* );
 
-    time->start.int64 = (((uint64_t)tbu0) << 32) | tbl;
-#endif
-}
 
-void timer_stop( TimerData* time )
-{
-#ifdef __x86_64
-    RDTSC_STOP(time->stop)
-#endif
-#ifdef _ARCH_PPC
-    uint32_t tbl, tbu0, tbu1;
-    do {
-        __asm__ __volatile__ ("mftbu %0" : "=r"(tbu0));
-        __asm__ __volatile__ ("mftb %0" : "=r"(tbl));
-        __asm__ __volatile__ ("mftbu %0" : "=r"(tbu1));
-    } while (tbu0 != tbu1);
 
-    time->stop.int64 = (((uint64_t)tbu0) << 32) | tbl;
-#endif
-}
 
 
 #endif /* TIMER_H */
diff --git a/src/includes/timer_types.h b/src/includes/timer_types.h
index 265d5c9..1492e7c 100644
--- a/src/includes/timer_types.h
+++ b/src/includes/timer_types.h
@@ -5,13 +5,13 @@
  *
  *      Description:  Types file for timer module.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
diff --git a/src/includes/tlb-info.h b/src/includes/tlb-info.h
new file mode 100644
index 0000000..fe6c9ba
--- /dev/null
+++ b/src/includes/tlb-info.h
@@ -0,0 +1,89 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  tlb-info.h
+ *
+ *      Description:  Header file of the topology module containing the strings
+ *                    that describe TLBs. Currently not used.
+ *
+ *      Version:   4.0
+ *      Released:  16.6.2015
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+
+#ifndef TLB_INFO_H
+#define TLB_INFO_H
+
+static char* intel_tlb_info[256] = { /* Description strings indexed by an 8-bit code (0..255); codes without a description are NULL. Presumably the codes are Intel CPUID descriptor bytes — TODO confirm. */
+    [0] = NULL,
+    [1] = "Instruction TLB: 4 KByte pages, 4-way set associative, 32 entries",
+    [2] = "Instruction TLB: 4 MByte pages, fully associative, 2 entries",
+    [3] = "Data TLB: 4 KByte pages, 4-way set associative, 64 entries",
+    [4] = "Data TLB: 4 MByte pages, 4-way set associative, 8 entries",
+    [5] = "Data TLB1: 4 MByte pages, 4-way set associative, 32 entries",
+    [6 ... 10] = NULL, /* "[a ... b]" range designators are a GNU C extension, not ISO C */
+    [11] = "Instruction TLB: 4 MByte pages, 4-way set associative, 4 entries",
+    [12 ... 78] = NULL,
+    [79] = "Instruction TLB: 4 KByte pages, 32 entries",
+    [80] = "Instruction TLB: 4 KByte and 2-MByte or 4-MByte pages, 64 entries",
+    [81] = "Instruction TLB: 4 KByte and 2-MByte or 4-MByte pages, 128 entries",
+    [82] = "Instruction TLB: 4 KByte and 2-MByte or 4-MByte pages, 256 entries",
+    [83 ... 84] = NULL,
+    [85] = "Instruction TLB: 2-MByte or 4-MByte pages, fully associative, 7 entries",
+    [86] = "Data TLB0: 4 MByte pages, 4-way set associative, 16 entries",
+    [87] = "Data TLB0: 4 KByte pages, 4-way associative, 16 entries",
+    [88] = NULL,
+    [89] = "Data TLB0: 4 KByte pages, fully associative, 16 entries",
+    [90] = "Data TLB0: 2-MByte or 4 MByte pages, 4-way set associative, 32 entries",
+    [91] = "Data TLB: 4 KByte and 4 MByte pages, 64 entries",
+    [92] = "Data TLB: 4 KByte and 4 MByte pages,128 entries",
+    [93] = "Data TLB: 4 KByte and 4 MByte pages,256 entries",
+    [94 ... 96] = NULL,
+    [97] = "Instruction TLB: 4 KByte pages, fully associative, 48 entries",
+    [98] = NULL,
+    [99] = "Data TLB: 1 GByte pages, 4-way set associative, 4 entries",
+    [100 ... 117] = NULL,
+    [118] = "Instruction TLB: 2M/4M pages, fully associative, 8 entries",
+    [119 ... 159] = NULL,
+    [160] = "DTLB: 4k pages, fully associative, 32 entries",
+    [161 ... 175] = NULL,
+    [176] = "Instruction TLB: 4 KByte pages, 4-way set associative, 128 entries",
+    [177] = "Instruction TLB: 2M pages, 4-way, 8 entries or 4M pages, 4-way, 4 entries",
+    [178] = "Instruction TLB: 4KByte pages, 4-way set associative, 64 entries",
+    [179] = "Data TLB: 4 KByte pages, 4-way set associative, 128 entries",
+    [180] = "Data TLB1: 4 KByte pages, 4-way associative, 256 entries",
+    [181] = "Instruction TLB: 4KByte pages, 8-way set associative, 64 entries",
+    [182] = "Instruction TLB: 4KByte pages, 8-way set associative, 128 entries",
+    [183 ... 185] = NULL,
+    [186] = "Data TLB1: 4 KByte pages, 4-way associative, 64 entries",
+    [187 ... 191] = NULL,
+    [192] = "Data TLB: 4 KByte and 4 MByte pages, 4-way associative, 8 entries",
+    [193] = "Shared 2nd-Level TLB: 4 KByte/2MByte pages, 8-way associative, 1024 entries",
+    [194] = "DTLB: 4 KByte/2 MByte pages, 4-way associative, 16 entries",
+    [195 ... 201] = NULL,
+    [202] = "Shared 2nd-Level TLB: 4 KByte pages, 4-way associative, 512 entries",
+    [203 ... 239] = NULL,
+    [240] = "64-Byte prefetching", /* entries 240 and 241 describe prefetching rather than a TLB */
+    [241] = "128-Byte prefetching",
+    [242 ... 255] = NULL
+}; 
+#endif
diff --git a/src/includes/topology.h b/src/includes/topology.h
new file mode 100644
index 0000000..d470bbe
--- /dev/null
+++ b/src/includes/topology.h
@@ -0,0 +1,139 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  topology.h
+ *
+ *      Description:  Header File of topology module.
+ *
+ *      Version:   4.0
+ *      Released:  16.6.2015
+ *
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#ifndef LIKWID_TOPOLOGY
+#define LIKWID_TOPOLOGY
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#include <topology_cpuid.h>
+#include <topology_proc.h>
+#ifdef LIKWID_USE_HWLOC
+#include <topology_hwloc.h>
+#endif
+#include <types.h>
+#include <tree.h>
+
+
+#define MAX_FEATURE_STRING_LENGTH 512
+#define MAX_MODEL_STRING_LENGTH 512
+
+/* Lookup table with MAX_NUM_THREADS entries; declaration only, the array is defined elsewhere. */
+extern int affinity_thread2tile_lookup[MAX_NUM_THREADS];
+struct topology_functions { /* Table of initialization callbacks; the cpuid_init_* and hwloc_init_* functions declared in topology_cpuid.h and topology_hwloc.h have signatures matching the first four members */
+    void (*init_cpuInfo) (cpu_set_t cpuSet);
+    void (*init_cpuFeatures) (void);
+    void (*init_nodeTopology) (cpu_set_t cpuSet);
+    void (*init_cacheTopology) (void);
+    void (*init_fileTopology) (FILE*); /* presumably initializes the topology from an already opened file — TODO confirm */
+};
+
+/* Intel P6 */
+#define PENTIUM_M_BANIAS     0x09U
+#define PENTIUM_M_DOTHAN     0x0DU
+#define CORE_DUO             0x0EU
+#define CORE2_65             0x0FU
+#define CORE2_45             0x17U
+#define ATOM                 0x1CU
+#define ATOM_45              0x26U
+#define ATOM_32              0x36U
+#define ATOM_22              0x27U
+#define ATOM_SILVERMONT_E    0x37U
+#define ATOM_SILVERMONT_C    0x4DU
+#define ATOM_SILVERMONT_Z1   0x4AU
+#define ATOM_SILVERMONT_Z2   0x5AU
+#define ATOM_SILVERMONT_F    0x5DU
+#define ATOM_SILVERMONT_AIR  0x4CU
+#define NEHALEM              0x1AU
+#define NEHALEM_BLOOMFIELD   0x1AU
+#define NEHALEM_LYNNFIELD    0x1EU
+#define NEHALEM_LYNNFIELD_M  0x1FU
+#define NEHALEM_WESTMERE     0x2CU
+#define NEHALEM_WESTMERE_M   0x25U
+#define SANDYBRIDGE          0x2AU
+#define SANDYBRIDGE_EP       0x2DU
+#define HASWELL              0x3CU
+#define HASWELL_EP           0x3FU
+#define HASWELL_M1           0x45U
+#define HASWELL_M2           0x46U
+#define IVYBRIDGE            0x3AU
+#define IVYBRIDGE_EP         0x3EU
+#define NEHALEM_EX           0x2EU
+#define WESTMERE_EX          0x2FU
+#define XEON_MP              0x1DU
+#define BROADWELL            0x3DU
+#define BROADWELL_E          0x4FU
+#define BROADWELL_D          0x56U
+
+/* Intel MIC */
+#define XEON_PHI           0x01U
+
+/* AMD K10 */
+#define BARCELONA      0x02U
+#define SHANGHAI       0x04U
+#define ISTANBUL       0x08U
+#define MAGNYCOURS     0x09U
+
+/* AMD K8 */
+#define OPTERON_SC_1MB  0x05U
+#define OPTERON_DC_E    0x21U
+#define OPTERON_DC_F    0x41U
+#define ATHLON64_X2     0x43U
+#define ATHLON64_X2_F   0x4BU
+#define ATHLON64_F1     0x4FU
+#define ATHLON64_F2     0x5FU
+#define ATHLON64_X2_G   0x6BU
+#define ATHLON64_G1     0x6FU
+#define ATHLON64_G2     0x7FU
+
+
+#define  P6_FAMILY        0x6U
+#define  MIC_FAMILY       0xBU
+#define  NETBURST_FAMILY  0xFFU
+#define  K15_FAMILY       0x15U
+#define  K16_FAMILY       0x16U
+#define  K10_FAMILY       0x10U
+#define  K8_FAMILY        0xFU
+
+
+
+
+
+extern int cpu_count(cpu_set_t* set);
+
+static inline int cpuid_hasFeature(FeatureBit bit) /* Returns nonzero when bit number 'bit' is set in cpuid_info.featureFlags, 0 otherwise. */
+{
+      return (cpuid_info.featureFlags & (1U<<bit)); /* unsigned literal: shifting a signed 1 into the sign bit (bit 31) is undefined behavior */
+}
+
+
+#endif
diff --git a/src/includes/topology_cpuid.h b/src/includes/topology_cpuid.h
new file mode 100644
index 0000000..7e475d7
--- /dev/null
+++ b/src/includes/topology_cpuid.h
@@ -0,0 +1,43 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  topology_cpuid.h
+ *
+ *      Description:  Header File of topology backend using cpuid instruction.
+ *
+ *      Version:   4.0
+ *      Released:  16.6.2015
+ *
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#ifndef LIKWID_TOPOLOGY_CPUID
+#define LIKWID_TOPOLOGY_CPUID
+
+#include <sched.h>
+
+void cpuid_init_cpuInfo(cpu_set_t cpuSet);
+void cpuid_init_cpuFeatures(void);
+void cpuid_init_nodeTopology(cpu_set_t cpuSet);
+void cpuid_init_cacheTopology(void);
+
+
+#endif
diff --git a/src/includes/topology_hwloc.h b/src/includes/topology_hwloc.h
new file mode 100644
index 0000000..eec67f0
--- /dev/null
+++ b/src/includes/topology_hwloc.h
@@ -0,0 +1,51 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  topology_hwloc.h
+ *
+ *      Description:  Header File of topology backend using the hwloc library
+ *
+ *      Version:   4.0
+ *      Released:  16.6.2015
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#ifndef LIKWID_TOPOLOGY_HWLOC
+#define LIKWID_TOPOLOGY_HWLOC
+
+
+#include <hwloc.h>
+#include <sched.h>
+
+
+extern hwloc_topology_t hwloc_topology;
+
+int hwloc_record_objs_of_type_below_obj(hwloc_topology_t t, hwloc_obj_t obj, hwloc_obj_type_t type, int* index, uint32_t **list);
+
+
+
+void hwloc_init_cpuInfo(cpu_set_t cpuSet);
+void hwloc_init_cpuFeatures(void);
+void hwloc_init_nodeTopology(cpu_set_t cpuSet);
+void hwloc_init_cacheTopology(void);
+
+
+#endif
diff --git a/src/includes/topology_proc.h b/src/includes/topology_proc.h
new file mode 100644
index 0000000..ebdc006
--- /dev/null
+++ b/src/includes/topology_proc.h
@@ -0,0 +1,51 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  topology_proc.h
+ *
+ *      Description:  Header File of topology backend using procfs/sysfs
+ *
+ *      Version:   4.0
+ *      Released:  16.6.2015
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#ifndef LIKWID_TOPOLOGY_PROC
+#define LIKWID_TOPOLOGY_PROC
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <sched.h>
+#include <unistd.h>
+#include <sched.h>
+
+#include <error.h>
+#include <tree.h>
+#include <bitUtil.h>
+#include <topology.h>
+
+void proc_init_cpuInfo(cpu_set_t cpuSet);
+void proc_init_cpuFeatures(void);
+void proc_init_nodeTopology(cpu_set_t cpuSet);
+void proc_init_cacheTopology(void);
+
+
+#endif
diff --git a/src/includes/topology_types.h b/src/includes/topology_types.h
new file mode 100644
index 0000000..8ac4dfb
--- /dev/null
+++ b/src/includes/topology_types.h
@@ -0,0 +1,73 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  topology_types.h
+ *
+ *      Description:  Types file for topology module. External definitions are
+ *                    in likwid.h
+ *
+ *      Version:   4.0
+ *      Released:  16.6.2015
+ *
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com,
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#ifndef CPUID_TYPES_H
+#define CPUID_TYPES_H
+
+/** \addtogroup CPUTopology CPU information module
+*  @{
+*/
+/*! \brief Enum of possible CPU features
+
+CPUs implement different features that can improve application performance if
+the code is optimized for them. The list contains all features that are currently
+supported by LIKWID. LIKWID does not perform any action based on these features;
+it gathers the data only for output purposes. It is not a complete list.
+\extends CpuInfo
+*/
+typedef enum {
+    SSE3=0, /*!< \brief Streaming SIMD Extensions 3 */
+    MMX, /*!< \brief Multi Media Extension */
+    SSE, /*!< \brief Streaming SIMD Extensions */
+    SSE2, /*!< \brief Streaming SIMD Extensions 2 */
+    MONITOR, /*!< \brief MONITOR and MWAIT instructions (part of SSE3) */
+    ACPI, /*!< \brief Advanced Configuration and Power Interface */
+    RDTSCP, /*!< \brief Serializing Read of the Time Stamp Counter */
+    VMX, /*!< \brief Virtual Machine eXtensions (VT-x) */
+    EIST, /*!< \brief Enhanced Intel SpeedStep */
+    TM, /*!< \brief Thermal Monitor */
+    TM2, /*!< \brief Thermal Monitor 2 */
+    AES, /*!< \brief AES instruction set */
+    RDRAND, /*!< \brief Random numbers from an on-chip hardware random number generator */
+    SSSE3, /*!< \brief Supplemental Streaming SIMD Extensions 3 */
+    SSE41, /*!< \brief Streaming SIMD Extensions 4.1 */
+    SSE42, /*!< \brief Streaming SIMD Extensions 4.2 */
+    AVX, /*!< \brief Advanced Vector Extensions */
+    FMA, /*!< \brief Fused multiply-add (FMA3) */
+    AVX2, /*!< \brief Advanced Vector Extensions 2 */
+    RTM, /*!< \brief Restricted Transactional Memory */
+    HLE, /*!< \brief Hardware Lock Elision */
+    HTT, /*!< \brief Hyper-Threading Technology */
+    RDSEED, /*!< \brief Non-deterministic random bit generator */
+} FeatureBit;
+/** @}*/
+#endif /*CPUID_TYPES_H*/
diff --git a/src/includes/tree.h b/src/includes/tree.h
index 9816cf7..8970304 100644
--- a/src/includes/tree.h
+++ b/src/includes/tree.h
@@ -6,13 +6,13 @@
  *      Description:  Header File tree Module. 
  *                    Implements a simple tree data structure.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -36,6 +36,7 @@
 
 extern void tree_init(TreeNode** root, int id);
 extern void tree_print(TreeNode* nodePtr);
+extern void tree_destroy(TreeNode* nodePtr);
 extern void tree_insertNode(TreeNode* nodePtr, int id);
 extern int tree_nodeExists(TreeNode* nodePtr, int id);
 extern int tree_countChildren(TreeNode* nodePtr);
diff --git a/src/includes/tree_types.h b/src/includes/tree_types.h
index b449e39..477c78d 100644
--- a/src/includes/tree_types.h
+++ b/src/includes/tree_types.h
@@ -5,13 +5,13 @@
  *
  *      Description:  Types file for tree module.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -31,14 +31,24 @@
 #ifndef TREE_TYPES_H
 #define TREE_TYPES_H
 
-/* For arbitrary trees llink are the children and
- * rlink are the neighbours
- */
-typedef struct treeNode {
-    int id;
-    struct treeNode* llink;
-    struct treeNode* rlink;
-} TreeNode;
 
+/** \addtogroup CPUTopology
+*  @{
+*/
+/*! \brief Structure of a tree node
+
+This structure is used to form the tree of the system topology. The information
+describing each node is stored in other places, therefore an ID is enough.
+\extends CpuTopology
+*/
+struct treeNode {
+    int id; /*!< \brief ID of the node */
+    struct treeNode* llink; /*!< \brief List of children of the current node */
+    struct treeNode* rlink; /*!< \brief List of neighbors of the current node */
+};
+
+/** \brief Shorter name for struct treeNode */
+typedef struct treeNode TreeNode;
+/** @}*/
 
 #endif /*TREE_TYPES_H*/
diff --git a/src/includes/types.h b/src/includes/types.h
index 2b0745a..c4160f3 100644
--- a/src/includes/types.h
+++ b/src/includes/types.h
@@ -5,13 +5,14 @@
  *
  *      Description:  Global  Types file
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -34,27 +35,18 @@
 
 /* #####   HEADER FILE INCLUDES   ######################################### */
 #include <stdint.h>
+#include <bstrlib.h>
 
 #include <accessClient_types.h>
+#include <registers_types.h>
 #include <pci_types.h>
 #include <power_types.h>
 #include <thermal_types.h>
-#include <strUtil_types.h>
-#include <test_types.h>
-#include <barrier_types.h>
-#include <timer_types.h>
 #include <tree_types.h>
-#include <cpuid_types.h>
-#include <affinity_types.h>
-#include <threads_types.h>
-#include <cpuFeatures_types.h>
-#include <asciiBoxes_types.h>
-#include <asciiTable_types.h>
+#include <topology_types.h>
 #include <perfmon_types.h>
 #include <libperfctr_types.h>
-#include <multiplex_types.h>
-#include <numa_types.h>
-#include <pci_types.h>
+#include <cpuFeatures_types.h>
 
 
 typedef struct {
@@ -83,4 +75,7 @@ typedef struct {
 #define STRINGIFY(x) #x
 #define TOSTRING(x) STRINGIFY(x)
 
+#define likely(x)       __builtin_expect(!!(x), 1)
+#define unlikely(x)     __builtin_expect(!!(x), 0)
+
 #endif /*TYPES_H*/
diff --git a/src/libperfctr.c b/src/libperfctr.c
index a4b2158..0b66095 100644
--- a/src/libperfctr.c
+++ b/src/libperfctr.c
@@ -5,13 +5,14 @@
  *
  *      Description:  Marker API interface of module perfmon
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Authors:  Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -38,26 +39,19 @@
 #include <unistd.h>
 #include <sched.h>
 #include <pthread.h>
+#include <inttypes.h>
 
-#include <error.h>
-#include <types.h>
+#include <likwid.h>
 #include <bitUtil.h>
-#include <bstrlib.h>
-#include <cpuid.h>
-#include <numa.h>
-#include <affinity.h>
 #include <lock.h>
 #include <tree.h>
-#include <accessClient.h>
-#include <msr.h>
-#include <pci.h>
-#include <power.h>
-#include <thermal.h>
 #include <timer.h>
 #include <hashTable.h>
 #include <registers.h>
-#include <likwid.h>
+#include <error.h>
+#include <access.h>
 
+#include <perfmon.h>
 #include <perfmon_core2_counters.h>
 #include <perfmon_haswell_counters.h>
 #include <perfmon_interlagos_counters.h>
@@ -74,15 +68,12 @@
 
 /* #####   VARIABLES  -  LOCAL TO THIS SOURCE FILE   ###################### */
 
-static int perfmon_numCounters=0;     /* total number of counters */
-static int perfmon_numCountersCore=0; /* max index of core counters */
-static int perfmon_numCountersUncore=0; /* max index of conventional uncore counters */
-static PerfmonCounterMap* perfmon_counter_map = NULL;
-static int socket_lock[MAX_NUM_NODES];
-static int thread_socketFD[MAX_NUM_THREADS];
-static int hasPCICounters = 0;
+int socket_lock[MAX_NUM_NODES];
 static int likwid_init = 0;
-static BitMask counterMask;
+static int numberOfGroups = 0;
+static int* groups;
+static int threads2Cpu[MAX_NUM_THREADS];
+static int num_cpus = 0;
 
 /* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
 
@@ -101,15 +92,14 @@ void str2BitMask(const char* str, BitMask* mask)
     for (int i=0; i<tokens->qty; i++)
     {
         uint64_t val =  strtoull((char*) tokens->entry[i]->data, &endptr, 16);
-
         if ((errno == ERANGE && val == LONG_MAX ) || (errno != 0 && val == 0))
         {
-            ERROR;
+          ERROR;
         }
 
         if (endptr == str)
         {
-            ERROR_PLAIN_PRINT(No digits were found);
+          ERROR_PLAIN_PRINT(No digits were found);
         }
 
         mask->mask[i] = val;
@@ -133,247 +123,145 @@ static int getProcessorID(cpu_set_t* cpu_set)
     return processorId;
 }
 
+static int getThreadID(int cpu_id)
+{
+    int i;
+    for(i=0;i<groupSet->numberOfThreads;i++)
+    {
+        if (cpu_id == groupSet->threads[i].processorId)
+        {
+            return i;
+        }
+    }
+    return -1;
+}
+
 /* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
 
 void likwid_markerInit(void)
 {
-    int cpuId = likwid_getProcessorId();
+    int i;
+    int verbosity;
+    bstring bThreadStr;
+    bstring bEventStr;
+    struct bstrList* threadTokens;
+    struct bstrList* eventStrings;
     char* modeStr = getenv("LIKWID_MODE");
-    char* maskStr = getenv("LIKWID_MASK");
-
-    if ((modeStr != NULL) && (maskStr != NULL))
+    char* eventStr = getenv("LIKWID_EVENTS");
+    char* cThreadStr = getenv("LIKWID_THREADS");
+    char* filepath = getenv("LIKWID_FILEPATH");
+    /* Dirty hack to avoid nonnull warnings */
+    int (*ownatoi)(const char*);
+    ownatoi = &atoi;
+
+    if ((modeStr != NULL) && (filepath != NULL) && (eventStr != NULL) && (cThreadStr != NULL))
     {
         likwid_init = 1;
     }
+    else if (likwid_init == 0)
+    {
+        fprintf(stderr, "Cannot initalize LIKWID marker API, environment variables are not set\n");
+        fprintf(stderr, "You have to set the -m commandline switch for likwid-perfctr\n");
+        return;
+    }
     else
     {
         return;
     }
 
+    verbosity = atoi(getenv("LIKWID_DEBUG"));
     if (!lock_check())
     {
         fprintf(stderr,"Access to performance counters is locked.\n");
         exit(EXIT_FAILURE);
     }
 
-    cpuid_init();
+    topology_init();
     numa_init();
     affinity_init();
-    timer_init();
     hashTable_init();
 
-    for(int i=0; i<MAX_NUM_THREADS; i++) thread_socketFD[i] = -1;
+    for(int i=0; i<MAX_NUM_THREADS; i++) thread_sockets[i] = -1;
     for(int i=0; i<MAX_NUM_NODES; i++) socket_lock[i] = LOCK_INIT;
 
-    accessClient_mode = atoi(modeStr);
-    str2BitMask(maskStr, &counterMask);
+    accessClient_setaccessmode(atoi(modeStr));
+    perfmon_verbosity = verbosity;
+
 
-    if (accessClient_mode != DAEMON_AM_DIRECT)
+    bThreadStr = bfromcstr(cThreadStr);
+    threadTokens = bstrListCreate();
+    threadTokens = bsplit(bThreadStr,',');
+    num_cpus = threadTokens->qty;
+    for (i=0; i<num_cpus; i++)
     {
-        accessClient_init(&thread_socketFD[cpuId]);
+        threads2Cpu[i] = ownatoi(bdata(threadTokens->entry[i]));
     }
+    bdestroy(bThreadStr);
+    bstrListDestroy(threadTokens);
 
-    msr_init(thread_socketFD[cpuId]);
-    thermal_init(cpuId);
-
-    switch ( cpuid_info.family )
+    i = perfmon_init(num_cpus, threads2Cpu);
+    if (i<0)
     {
-        case P6_FAMILY:
-
-            switch ( cpuid_info.model )
-            {
-                case PENTIUM_M_BANIAS:
-
-                case PENTIUM_M_DOTHAN:
-
-                    perfmon_counter_map = pm_counter_map;
-                    perfmon_numCounters = NUM_COUNTERS_PM;
-                    perfmon_numCountersCore = NUM_COUNTERS_CORE_PM;
-                    break;
-
-                case ATOM_45:
-
-                case ATOM_32:
-
-                case ATOM_22:
-
-                case ATOM:
-
-                    perfmon_counter_map = core2_counter_map;
-                    perfmon_numCounters = NUM_COUNTERS_CORE2;
-                    perfmon_numCountersCore = NUM_COUNTERS_CORE_CORE2;
-                    break;
-
-                case ATOM_SILVERMONT_C:
-                case ATOM_SILVERMONT_E:
-                case ATOM_SILVERMONT_F1:
-                case ATOM_SILVERMONT_F2:
-                case ATOM_SILVERMONT_F3:
-                    power_init(0);
-                    perfmon_counter_map = silvermont_counter_map;
-                    perfmon_numCounters = NUM_COUNTERS_SILVERMONT;
-                    perfmon_numCountersCore = NUM_COUNTERS_CORE_SILVERMONT;
-                    break;
-
-                case CORE_DUO:
-                    ERROR_PLAIN_PRINT(Unsupported Processor);
-                    break;
-
-                case XEON_MP:
-
-                case CORE2_65:
-
-                case CORE2_45:
-
-                    perfmon_counter_map = core2_counter_map;
-                    perfmon_numCounters = NUM_COUNTERS_CORE2;
-                    perfmon_numCountersCore = NUM_COUNTERS_CORE_CORE2;
-                    break;
-
-                case NEHALEM_EX:
-
-                case WESTMERE_EX:
-
-                    perfmon_counter_map = westmereEX_counter_map;
-                    perfmon_numCounters = NUM_COUNTERS_WESTMEREEX;
-                    perfmon_numCountersCore = NUM_COUNTERS_CORE_WESTMEREEX;
-                    perfmon_numCountersUncore = NUM_COUNTERS_UNCORE_WESTMEREEX;
-                    break;
-
-                case NEHALEM_BLOOMFIELD:
-
-                case NEHALEM_LYNNFIELD:
-
-                case NEHALEM_WESTMERE_M:
-
-                case NEHALEM_WESTMERE:
-
-                    perfmon_counter_map = nehalem_counter_map;
-                    perfmon_numCounters = NUM_COUNTERS_NEHALEM;
-                    perfmon_numCountersCore = NUM_COUNTERS_CORE_NEHALEM;
-                    perfmon_numCountersUncore = NUM_COUNTERS_UNCORE_NEHALEM;
-                    break;
-
-                case IVYBRIDGE:
-
-                case IVYBRIDGE_EP:
-
-                    {
-                        int socket_fd = thread_socketFD[cpuId];
-                        hasPCICounters = 1;
-                        power_init(0); /* FIXME Static coreId is dangerous */
-                        pci_init(socket_fd);
-                        perfmon_counter_map = ivybridge_counter_map;
-                        perfmon_numCounters = NUM_COUNTERS_IVYBRIDGE;
-                        perfmon_numCountersCore = NUM_COUNTERS_CORE_IVYBRIDGE;
-                        perfmon_numCountersUncore = NUM_COUNTERS_UNCORE_IVYBRIDGE;
-                    }
-                    break;
-
-                case HASWELL:
-
-                case HASWELL_EX:
-
-                case HASWELL_M1:
-
-                case HASWELL_M2:
-
-                    power_init(0); /* FIXME Static coreId is dangerous */
-
-                    perfmon_counter_map = haswell_counter_map;
-                    perfmon_numCounters = NUM_COUNTERS_HASWELL;
-                    perfmon_numCountersCore = NUM_COUNTERS_CORE_HASWELL;
-                    break;
-
-                case SANDYBRIDGE:
-
-                case SANDYBRIDGE_EP:
-
-                    {
-                        int socket_fd = thread_socketFD[cpuId];
-                        hasPCICounters = 1;
-                        power_init(0); /* FIXME Static coreId is dangerous */
-                        pci_init(socket_fd);
-                        perfmon_counter_map = sandybridge_counter_map;
-                        perfmon_numCounters = NUM_COUNTERS_SANDYBRIDGE;
-                        perfmon_numCountersCore = NUM_COUNTERS_CORE_SANDYBRIDGE;
-                        perfmon_numCountersUncore = NUM_COUNTERS_UNCORE_SANDYBRIDGE;
-                    }
-                    break;
-
-                default:
-                    ERROR_PLAIN_PRINT(Unsupported Processor);
-                    break;
-            }
-            break;
-
-        case MIC_FAMILY:
-
-            switch ( cpuid_info.model )
-            {
-                case XEON_PHI:
-
-                    perfmon_counter_map = phi_counter_map;
-                    perfmon_numCounters = NUM_COUNTERS_PHI;
-                    perfmon_numCountersCore = NUM_COUNTERS_CORE_PHI;
-                    break;
-
-                default:
-                    ERROR_PLAIN_PRINT(Unsupported Processor);
-                    break;
-            }
-            break;
-
-        case K8_FAMILY:
-
-            perfmon_counter_map = k10_counter_map;
-            perfmon_numCounters = NUM_COUNTERS_K10;
-            perfmon_numCountersCore = NUM_COUNTERS_CORE_K10;
-            break;
-
-        case K10_FAMILY:
+        fprintf(stderr,"Failed to initialize LIKWID perfmon library.\n");
+        return;
+    }
 
-            perfmon_counter_map = k10_counter_map;
-            perfmon_numCounters = NUM_COUNTERS_K10;
-            perfmon_numCountersCore = NUM_COUNTERS_CORE_K10;
-            break;
+    bEventStr = bfromcstr(eventStr);
+    eventStrings = bstrListCreate();
+    eventStrings = bsplit(bEventStr,'|');
+    numberOfGroups = eventStrings->qty;
+    groups = malloc(numberOfGroups * sizeof(int));
+    if (!groups)
+    {
+        fprintf(stderr,"Cannot allocate space for group handling.\n");
+        bstrListDestroy(eventStrings);
+        exit(EXIT_FAILURE);
+    }
+    for (i=0; i<eventStrings->qty; i++)
+    {
+        groups[i] = perfmon_addEventSet(bdata(eventStrings->entry[i]));
+    }
+    bstrListDestroy(eventStrings);
 
-        case K15_FAMILY:
+    groupSet->activeGroup = 0;
+}
 
-            perfmon_counter_map = interlagos_counter_map;
-            perfmon_numCounters = NUM_COUNTERS_INTERLAGOS;
-            perfmon_numCountersCore = NUM_COUNTERS_CORE_INTERLAGOS;
-            break;
+void likwid_markerThreadInit(void)
+{
+    if ( !likwid_init )
+    {
+        return;
+    }
 
-        case K16_FAMILY:
+    int cpu_id = likwid_getProcessorId();
+    int thread_id = getThreadID(cpu_id);
 
-            perfmon_counter_map = kabini_counter_map;
-            perfmon_numCounters = NUM_COUNTERS_KABINI;
-            perfmon_numCountersCore = NUM_COUNTERS_CORE_KABINI;
-            break;
+    HPMaddThread(cpu_id);
+    initThreadArch(cpu_id);
+    hashTable_initThread(cpu_id);
 
-        default:
-            ERROR_PLAIN_PRINT(Unsupported Processor);
-            break;
+    for(int i=0; i<groupSet->groups[groupSet->activeGroup].numberOfEvents;i++)
+    {
+        groupSet->groups[groupSet->activeGroup].events[i].threadCounter[thread_id].init = TRUE;
     }
 }
 
-void likwid_markerThreadInit(void)
+void likwid_markerNextGroup(void)
 {
-    if ( ! likwid_init )
+    int i;
+    int next_group;
+
+    if (!likwid_init)
     {
         return;
     }
 
-    int cpuId = likwid_getProcessorId();
-
-    if (accessClient_mode != DAEMON_AM_DIRECT)
+    next_group = (groupSet->activeGroup + 1) % numberOfGroups;
+    if (next_group != groupSet->activeGroup)
     {
-        if (thread_socketFD[cpuId] == -1)
-        {
-            accessClient_init(&thread_socketFD[cpuId]);
-        }
+        i = perfmon_switchActiveGroup(next_group);
     }
+    return;
 }
 
 /* File format
@@ -389,35 +277,41 @@ void likwid_markerClose(void)
     LikwidResults* results = NULL;
     int numberOfThreads;
     int numberOfRegions;
-
     if ( ! likwid_init )
     {
+        
         return;
     }
-
     hashTable_finalize(&numberOfThreads, &numberOfRegions, &results);
-
     file = fopen(getenv("LIKWID_FILEPATH"),"w");
 
     if (file != NULL)
     {
-        fprintf(file,"%d %d\n",numberOfThreads,numberOfRegions);
-
+        fprintf(file,"%d %d %d\n",numberOfThreads, numberOfRegions, numberOfGroups);
         for (int i=0; i<numberOfRegions; i++)
         {
+            if (results[i].count[0] == 0)
+            {
+                continue;
+            }
             fprintf(file,"%d:%s\n",i,bdata(results[i].tag));
         }
-
         for (int i=0; i<numberOfRegions; i++)
         {
+            if (results[i].count[0] == 0)
+            {
+                continue;
+            }
             for (int j=0; j<numberOfThreads; j++)
             {
                 fprintf(file,"%d ",i);
+                fprintf(file,"%d ",results[i].groupID);
                 fprintf(file,"%d ",j);
                 fprintf(file,"%u ",results[i].count[j]);
                 fprintf(file,"%e ",results[i].time[j]);
+                fprintf(file,"%d ",groupSet->groups[results[i].groupID].numberOfEvents);
 
-                for (int k=0; k<NUM_PMC; k++)
+                for (int k=0; k<groupSet->groups[results[i].groupID].numberOfEvents; k++)
                 {
                     fprintf(file,"%e ",results[i].counters[j][k]);
                 }
@@ -426,6 +320,10 @@ void likwid_markerClose(void)
         }
         fclose(file);
     }
+    else
+    {
+        fprintf(stderr, "Cannot open file %s\n", getenv("LIKWID_FILEPATH"));
+    }
 
     for (int i=0;i<numberOfRegions; i++)
     {
@@ -443,275 +341,125 @@ void likwid_markerClose(void)
     {
         free(results);
     }
+    likwid_init = 0;
+    HPMfinalize();
+}
 
-    msr_finalize();
-    pci_finalize();
-
-    for (int i=0; i<MAX_NUM_THREADS; i++)
+int likwid_markerRegisterRegion(const char* regionTag)
+{
+    if ( ! likwid_init )
     {
-        accessClient_finalize(thread_socketFD[i]);
-        thread_socketFD[i] = -1;
+        return -EFAULT;
     }
+    TimerData timer;
+    bstring tag = bfromcstralloc(100, regionTag);
+    LikwidThreadResults* results;
+    char groupSuffix[10];
+    sprintf(groupSuffix, "-%d", groupSet->activeGroup);
+    bcatcstr(tag, groupSuffix);
+    int cpu_id = hashTable_get(tag, &results);;
+    bdestroy(tag);
+    return 0;
 }
 
 
-void likwid_markerStartRegion(const char* regionTag)
+int likwid_markerStartRegion(const char* regionTag)
 {
     if ( ! likwid_init )
     {
-        return;
+        return -EFAULT;
     }
 
     bstring tag = bfromcstralloc(100, regionTag);
     LikwidThreadResults* results;
-    uint64_t res;
+    char groupSuffix[10];
+    sprintf(groupSuffix, "-%d", groupSet->activeGroup);
+    bcatcstr(tag, groupSuffix);
+    
     int cpu_id = hashTable_get(tag, &results);
-    bdestroy(tag);
-    int socket_fd = thread_socketFD[cpu_id];
-
-    if (accessClient_mode != DAEMON_AM_DIRECT)
-    {
-        if (socket_fd == -1)
-        {
-            printf("ERROR: Invalid socket file handle on processor %d. \
-                    Did you call likwid_markerThreadInit() ?\n", cpu_id);
-        }
-    }
-
-    results->count++;
-
-    /* Core specific counters */
-    for ( int i=0; i<perfmon_numCountersCore; i++ )
-    {
-        bitMask_test(res,counterMask,i);
-        if ( res )
-        {
-            if (perfmon_counter_map[i].type != THERMAL)
-            {
-                results->StartPMcounters[i] =
-                    (double) msr_tread(
-                            socket_fd,
-                            cpu_id,
-                            perfmon_counter_map[i].counterRegister);
-            }
-        }
-    }
+    int thread_id = getThreadID(cpu_id);
+    perfmon_readCountersCpu(cpu_id);
 
-    /* Uncore specific counters */
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id) ||
-            lock_acquire((int*)
-                &socket_lock[affinity_core2node_lookup[cpu_id]], cpu_id))
+    for(int i=0;i<groupSet->groups[groupSet->activeGroup].numberOfEvents;i++)
     {
-        /* Conventional Uncore counters */
-        for ( int i=perfmon_numCountersCore; i<perfmon_numCountersUncore; i++ )
-        {
-            bitMask_test(res,counterMask,i);
-            if ( res )
-            {
-                if (perfmon_counter_map[i].type != POWER)
-                {
-                    results->StartPMcounters[i] =
-                        (double) msr_tread(
-                                socket_fd,
-                                cpu_id,
-                                perfmon_counter_map[i].counterRegister);
-                }
-                else
-                {
-                    results->StartPMcounters[i] =
-                        (double) power_tread(
-                                socket_fd,
-                                cpu_id,
-                                perfmon_counter_map[i].counterRegister);
-                }
-            }
-        }
-
-        /* PCI Uncore counters */
-        if ( hasPCICounters && (accessClient_mode != DAEMON_AM_DIRECT) )
-        {
-            for ( int i=perfmon_numCountersUncore; i<perfmon_numCounters; i++ )
-            {
-                bitMask_test(res,counterMask,i);
-                if ( res )
-                {
-                    uint64_t counter_result =
-                        pci_tread(
-                                socket_fd,
-                                cpu_id,
-                                perfmon_counter_map[i].device,
-                                perfmon_counter_map[i].counterRegister);
-
-                    counter_result = (counter_result<<32) +
-                        pci_tread(
-                                socket_fd,
-                                cpu_id,
-                                perfmon_counter_map[i].device,
-                                perfmon_counter_map[i].counterRegister2);
-
-                    results->StartPMcounters[perfmon_counter_map[i].index] =
-                        (double) counter_result;
-                }
-            }
-        }
+        DEBUG_PRINT(DEBUGLEV_DEVELOP, START [%s] READ EVENT [%d=%d] EVENT %d VALUE %llu , regionTag, thread_id, cpu_id, i,
+                        LLU_CAST groupSet->groups[groupSet->activeGroup].events[i].threadCounter[thread_id].counterData);
+        groupSet->groups[groupSet->activeGroup].events[i].threadCounter[thread_id].startData =
+                groupSet->groups[groupSet->activeGroup].events[i].threadCounter[thread_id].counterData;
     }
-
+    results->groupID = groupSet->activeGroup;
+    bdestroy(tag);
     timer_start(&(results->startTime));
+    return 0;
 }
 
-#define READ_END_MEM_CHANNEL(channel, reg, cid)                      \
-    counter_result = pci_tread(socket_fd, cpu_id, channel, reg##_A); \
-    counter_result = (counter_result<<32) +                          \
-    pci_tread(socket_fd, cpu_id, channel, reg##_B);                  \
-    results->PMcounters[cid] += (double) counter_result - results->StartPMcounters[cid]
-
 
-/* TODO: Readout hash at the end. Compute result at the end of the function to
- * keep overhead in region low */
 
-void likwid_markerStopRegion(const char* regionTag)
+int likwid_markerStopRegion(const char* regionTag)
 {
     if (! likwid_init)
     {
-        return;
+        return -EFAULT;
     }
 
     TimerData timestamp;
     timer_stop(&timestamp);
-    int cpu_id = likwid_getProcessorId();
-    uint64_t res;
-    int socket_fd = thread_socketFD[cpu_id];
-    double PMcounters[NUM_PMC];
-
-    /* Core specific counters */
-    for ( int i=0; i<perfmon_numCountersCore; i++ )
-    {
-        bitMask_test(res,counterMask,i);
-        if ( res )
-        {
-            if (perfmon_counter_map[i].type != THERMAL)
-            {
-                PMcounters[i] = (double) msr_tread(
-                        socket_fd,
-                        cpu_id,
-                        perfmon_counter_map[i].counterRegister);
-            }
-            else
-            {
-                PMcounters[i] = (double) thermal_read(cpu_id);
-            }
-        }
-    }
-
-    /* Uncore specific counters */
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
-    {
-        /* Conventional Uncore counters */
-        for ( int i=perfmon_numCountersCore; i<perfmon_numCountersUncore; i++ )
-        {
-            bitMask_test(res,counterMask,i);
-            if ( res )
-            {
-                if (perfmon_counter_map[i].type != POWER)
-                {
-                    PMcounters[i] = (double) msr_tread(
-                            socket_fd,
-                            cpu_id,
-                            perfmon_counter_map[i].counterRegister);
-                }
-                else
-                {
-                    PMcounters[i] = (double) power_tread(
-                            socket_fd,
-                            cpu_id,
-                            perfmon_counter_map[i].counterRegister);
-                }
-            }
-        }
-
-        /* PCI Uncore counters */
-        if ( hasPCICounters && (accessClient_mode != DAEMON_AM_DIRECT) )
-        {
-            for ( int i=perfmon_numCountersUncore; i<perfmon_numCounters; i++ )
-            {
-                bitMask_test(res,counterMask,i);
-                if ( res )
-                {
-                    uint64_t counter_result =
-                        pci_tread(
-                                socket_fd,
-                                cpu_id,
-                                perfmon_counter_map[i].device,
-                                perfmon_counter_map[i].counterRegister);
-
-                    counter_result = (counter_result<<32) +
-                        pci_tread(
-                                socket_fd,
-                                cpu_id,
-                                perfmon_counter_map[i].device,
-                                perfmon_counter_map[i].counterRegister2);
-
-                    PMcounters[i] = (double) counter_result;
-                }
-            }
-        }
-    }
-
-    bstring tag = bfromcstralloc(100, regionTag);
+    int cpu_id;
+    int thread_id;
+    bstring tag = bfromcstr(regionTag);
+    char groupSuffix[100];
     LikwidThreadResults* results;
-    hashTable_get(tag, &results);
-    results->startTime.stop = timestamp.stop;
+    sprintf(groupSuffix, "-%d", groupSet->activeGroup);
+    bcatcstr(tag, groupSuffix);
+    
+    cpu_id = hashTable_get(tag, &results);
+    thread_id = getThreadID(cpu_id);
+    results->startTime.stop.int64 = timestamp.stop.int64;
     results->time += timer_print(&(results->startTime));
+    results->count++;
     bdestroy(tag);
-
-    /* Accumulate the results */
-    /* Core counters */
-    for ( int i=0; i<perfmon_numCountersCore; i++ )
+    
+    perfmon_readCountersCpu(cpu_id);
+    
+    for(int i=0;i<groupSet->groups[groupSet->activeGroup].numberOfEvents;i++)
     {
-        bitMask_test(res,counterMask,i);
-        if ( res )
-        {
-            if (perfmon_counter_map[i].type != THERMAL)
-            {
-                results->PMcounters[i] += (PMcounters[i] - results->StartPMcounters[i]);
-            }
-            else
-            {
-                results->PMcounters[i] = PMcounters[i];
-            }
-        }
+        DEBUG_PRINT(DEBUGLEV_DEVELOP, STOP [%s] READ EVENT [%d=%d] EVENT %d VALUE %llu, regionTag, thread_id, cpu_id, i,
+                        LLU_CAST groupSet->groups[groupSet->activeGroup].events[i].threadCounter[thread_id].counterData);
+        results->PMcounters[i] += perfmon_getResult(groupSet->activeGroup, i, thread_id);
     }
+    return 0;
+}
+
 
-    /* Uncore counters */
-    if ((socket_lock[affinity_core2node_lookup[cpu_id]] == cpu_id))
+void likwid_markerGetRegion(const char* regionTag, int* nr_events, double* events, double *time, int *count)
+{
+    if (! likwid_init)
     {
-        for ( int i=perfmon_numCountersCore; i<perfmon_numCounters; i++ )
-        {
-            bitMask_test(res,counterMask,i);
-            if ( res )
-            {
-                if ( perfmon_counter_map[i].type == POWER )
-                {
-                    if (PMcounters[i] >= results->StartPMcounters[i])
-                    {
-                        results->PMcounters[i] += power_info.energyUnit *
-                            (PMcounters[i] - results->StartPMcounters[i]);
-                    }
-                    else
-                    {
-                        results->PMcounters[i] += power_info.energyUnit *
-                            (((double)0xFFFFFFFF) - results->StartPMcounters[i] + PMcounters[i]);
-                    }
-                }
-                else
-                {
-                    results->PMcounters[i] += (PMcounters[i] - results->StartPMcounters[i]);
-                }
-            }
-        }
+        return;
+    }
+    int length = 0;
+    int cpu_id;
+    int thread_id;
+    bstring tag = bfromcstr(regionTag);
+    char groupSuffix[100];
+    LikwidThreadResults* results;
+    sprintf(groupSuffix, "-%d", groupSet->activeGroup);
+    bcatcstr(tag, groupSuffix);
+
+    cpu_id = hashTable_get(tag, &results);
+    thread_id = getThreadID(cpu_id);
+    *count = results->count;
+    *time = results->time;
+    length = MIN(groupSet->groups[groupSet->activeGroup].numberOfEvents, *nr_events);
+    for(int i=0;i<length;i++)
+    {
+        events[i] = results->PMcounters[i];
     }
+    *nr_events = length;
+    bdestroy(tag); /* release the lookup key built by bfromcstr(); it was leaked on every call */
 }
 
+
 int  likwid_getProcessorId()
 {
     cpu_set_t  cpu_set;
diff --git a/src/likwid.f90 b/src/likwid.f90
index 1215dd4..dec57e8 100644
--- a/src/likwid.f90
+++ b/src/likwid.f90
@@ -4,13 +4,14 @@
 !
 !     Description: Marker API f90 module
 !
-!      Version:   3.1.3
-!      Released:  4.11.2014
+!      Version:   4.0
+!      Released:  16.6.2015
 !
-!     Author:  Jan Treibig (jt), jan.treibig at gmail.com
+!     Authors:  Jan Treibig (jt), jan.treibig at gmail.com,
+!               Thomas Roehl (tr), thomas.roehl at googlemail.com
 !     Project:  likwid
 !
-!      Copyright (C) 2014 Jan Treibig
+!      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
 !
 !      This program is free software: you can redistribute it and/or modify it under
 !      the terms of the GNU General Public License as published by the Free Software
@@ -26,28 +27,95 @@
 !
 ! =======================================================================================
 
+!> \defgroup Fortran_Interface Likwid Fortran90 Module
 
-
+!> \ingroup Fortran_Interface
+!> Likwid Fortran90 Module for embedding the Marker API into Fortran applications
+!> In the basic configuration the module is compiled with the Intel Fortran Compiler
 module likwid
 
 interface
 
-    subroutine likwid_markerInit()
-    end subroutine likwid_markerInit
+!> \ingroup Fortran_Interface
+!> \brief Initialize the Likwid Marker API
+!! This routine initializes the Marker API for Fortran. It reads some
+!! environment variables commonly set by likwid-perfctr.
+!! \note Must be called once in a serial region.
+  subroutine likwid_markerInit()
+  end subroutine likwid_markerInit
+
+!> \ingroup Fortran_Interface
+!> \brief Add current thread to Likwid for Marker API measurements
+!! This routine registers the current thread with Likwid so that measurements
+!! are performed for it. If using the daemon access mode, it starts a daemon for
+!! the current thread.
+!! \note  Must be called once in a parallel region.
+  subroutine likwid_markerThreadInit()
+  end subroutine likwid_markerThreadInit
+
+!> \ingroup Fortran_Interface
+!> \brief Setup performance counters for the next event set
+!> If multiple groups should be measured this function
+!> switches to the next group in a round robin fashion.
+!> Each call reprograms the performance counters for the current CPU.
+!> \note Do not call it while measuring a code region.
+  subroutine likwid_markerNextGroup()
+  end subroutine likwid_markerNextGroup
+
+!> \ingroup Fortran_Interface
+!> \brief Close the Likwid Marker API
+!> Close the Likwid Marker API and write measured results to temporary file
+!> for evaluation done by likwid-perfctr
+!> \note Must be called once in a serial region and no further
+!> Likwid calls should be used
+  subroutine likwid_markerClose()
+  end subroutine likwid_markerClose
+
+!> \ingroup Fortran_Interface
+!> \brief Register a code region
+!> Initializes the hash table with an empty entry to reduce the overhead
+!> at likwid_markerStartRegion()
+  subroutine likwid_markerRegisterRegion( regionTag )
+!> \param regionTag Name for the code region for later identification
+  character(*) :: regionTag
+  end subroutine likwid_markerRegisterRegion
 
-    subroutine likwid_markerThreadInit()
-    end subroutine likwid_markerThreadInit
 
-    subroutine likwid_markerClose()
-    end subroutine likwid_markerClose
+!> \ingroup Fortran_Interface
+!> \brief Start the measurement for a code region
+!> Reads the currently running event set and stores the results as start values
+!> for the measurement group identified by regionTag.
+  subroutine likwid_markerStartRegion( regionTag )
+!> \param regionTag Name for the code region for later identification
+  character(*) :: regionTag
+  end subroutine likwid_markerStartRegion
 
-    subroutine likwid_markerStartRegion( regionTag )
-    character(*) :: regionTag
-    end subroutine likwid_markerStartRegion
+!> \ingroup Fortran_Interface
+!> \brief Stop the measurement for a code region
+!> Reads the currently running event set and accumulates the difference between
+!> stop and start data in the measurement group identified by regionTag.
+  subroutine likwid_markerStopRegion( regionTag )
+!> \param regionTag Name for the code region for later identification
+  character(*) :: regionTag
+  end subroutine likwid_markerStopRegion
 
-    subroutine likwid_markerStopRegion( regionTag )
-    character(*) :: regionTag
-    end subroutine likwid_markerStopRegion
+!> \ingroup Fortran_Interface
+!> \brief Get accumulated measurement results for a code region
+!> Get the accumulated data in the measurement group identified by regionTag
+!> for the current thread.
+!> \warning Experimental
+  subroutine likwid_markerGetRegion( regionTag, nr_events, events, time, count )
+!> \param regionTag [in] Name for the code region for later identification
+!> \param nr_events [in,out] Length of the events array
+!> \param events [out] Events array to store intermediate results
+!> \param time [out] Accumulated measurement time
+!> \param count [out] Call count of the region
+  character(*) :: regionTag
+  INTEGER :: nr_events
+  DOUBLE PRECISION, DIMENSION(*) :: events
+  DOUBLE PRECISION :: time
+  INTEGER :: count
+  end subroutine likwid_markerGetRegion
 
 end interface
 
diff --git a/src/likwid_f90_interface.c b/src/likwid_f90_interface.c
index 31bad92..36add62 100644
--- a/src/likwid_f90_interface.c
+++ b/src/likwid_f90_interface.c
@@ -5,13 +5,14 @@
  *
  *      Description: F90 interface for marker API
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:  Jan Treibig (jt), jan.treibig at gmail.com,
+ *               Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -48,6 +49,28 @@ void likwid_markerclose_(void)
     likwid_markerClose();
 }
 
+void likwid_markernextgroup_(void)
+{
+    likwid_markerNextGroup();
+}
+
+/* Fortran binding: trim the blank padding of regionTag (len chars) and register it. */
+void likwid_markerregisterregion_(char* regionTag, int len)
+{
+    char* tmp = (char*) malloc((len+1) * sizeof(char) );
+    strncpy(tmp, regionTag, len * sizeof(char) );
+    tmp[len] = 0; /* strncpy copies at most len chars and may leave tmp unterminated */
+    for (int i=(len-1); i >= 0; i--) /* walk back to the last non-blank; i must move, not len */
+    {
+        if (tmp[i] != ' ') {
+            tmp[i+1] = 0;
+            break;
+        }
+    }
+    likwid_markerRegisterRegion( tmp );
+    free(tmp);
+}
+
 void likwid_markerstartregion_(char* regionTag, int len)
 {
     char* tmp = (char*) malloc((len+1) * sizeof(char) );
@@ -82,3 +105,19 @@ void likwid_markerstopregion_(char* regionTag, int len)
     free(tmp);
 }
 
+/* Fortran binding: trim the blank padding of regionTag (len chars) and fetch its results. */
+void likwid_markergetregion_(char* regionTag, int* nr_events, double* events, double *time, int *count, int len)
+{
+    char* tmp = (char*) malloc((len+1) * sizeof(char));
+    strncpy(tmp, regionTag, len * sizeof(char) );
+    tmp[len] = 0; /* strncpy copies at most len chars and may leave tmp unterminated */
+    for (int i=(len-1); i >= 0; i--) /* walk back to the last non-blank; i must move, not len */
+    {
+        if (tmp[i] != ' ') {
+            tmp[i+1] = 0;
+            break;
+        }
+    }
+    likwid_markerGetRegion( tmp, nr_events,  events, time, count);
+    free(tmp);
+}
diff --git a/src/loadData.S b/src/loadData.S
new file mode 100644
index 0000000..86de4d6
--- /dev/null
+++ b/src/loadData.S
@@ -0,0 +1,44 @@
+.intel_syntax noprefix
+/* _loadData(size, ptr): load one word per 64 bytes from ptr, 256 bytes per pass, until offset >= size. */
+.text
+.globl _loadData
+.type _loadData, @function
+_loadData :
+#ifdef __x86_64
+xor rax, rax
+.align 16
+1:
+mov  r8,  [rsi + rax]
+mov  r9,  [rsi + rax + 64]
+mov  r10, [rsi + rax + 128]
+mov r11,  [rsi + rax + 192]
+add rax, 256
+cmp rax, rdi
+jb 1b
+ret
+#else
+#ifdef __i386__
+push	ebp
+mov	ebp, esp
+push edi
+push esi
+mov edx, DWORD PTR [ebp + 12] /* second stack argument: buffer pointer (first, at ebp+8, is the size) */
+xor eax, eax
+1:
+mov edi, DWORD PTR [edx + eax] /* index the buffer, not the stack frame as [ebp + eax + 12] did */
+mov esi, DWORD PTR [edx + eax + 64]
+mov ecx, DWORD PTR [edx + eax + 128]
+mov ecx, DWORD PTR [edx + eax + 192] /* loaded values are discarded, so reusing ecx is harmless */
+add eax, 256
+cmp eax, DWORD PTR [ebp+8]
+jb 1b
+pop esi
+pop edi
+mov esp, ebp
+pop ebp
+ret
+#endif
+#endif
+.size _loadData, .-_loadData
+
+
diff --git a/src/loadData.s b/src/loadData.s
deleted file mode 100644
index e176c53..0000000
--- a/src/loadData.s
+++ /dev/null
@@ -1,22 +0,0 @@
-.intel_syntax noprefix
-
-.text
-.globl _loadData
-.type _loadData, @function
-_loadData :
-
-xor rax, rax
-.align 16
-1:
-mov  r8,  [rsi + rax]
-mov  r9,  [rsi + rax + 64]
-mov  r10, [rsi + rax + 128]
-mov r11,  [rsi + rax + 192]
-add rax, 256
-cmp rax, rdi
-jb 1b
-
-ret
-.size _loadData, .-_loadData
-
-
diff --git a/src/loadData.s.tmp b/src/loadData.s.tmp
deleted file mode 100644
index e69de29..0000000
diff --git a/src/luawid.c b/src/luawid.c
new file mode 100644
index 0000000..ea9b19c
--- /dev/null
+++ b/src/luawid.c
@@ -0,0 +1,1681 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  luawid.c
+ *
+ *      Description:  C part of the Likwid Lua interface
+ *
+ *      Version:   4.0
+ *      Released:  16.6.2015
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/wait.h>
+#include <sys/types.h>
+#include <sys/time.h>
+
+
+#include <lua.h>                               /* Always include this */
+#include <lauxlib.h>                           /* Always include this */
+#include <lualib.h>                            /* Always include this */
+
+#include <likwid.h>
+#include <tree.h>
+
+#ifdef COLOR
+#include <textcolor.h>
+#endif
+
+static int topology_isInitialized = 0;
+CpuInfo_t cpuinfo = NULL;
+CpuTopology_t cputopo = NULL;
+
+static int numa_isInitialized = 0;
+NumaTopology_t numainfo = NULL;
+static int affinity_isInitialized = 0;
+AffinityDomains_t affinity = NULL;
+static int perfmon_isInitialized = 0;
+static int timer_isInitialized = 0;
+static int power_isInitialized = 0;
+PowerInfo_t power;
+static int power_hasRAPL = 0;
+static int config_isInitialized = 0;
+Configuration_t configfile = NULL;
+
+
+static int lua_likwid_getConfiguration(lua_State* L)
+{
+    int ret = 0;
+    if (config_isInitialized == 0)
+    {
+        ret = init_configuration();
+        if (ret == 0)
+        {
+            config_isInitialized = 1;
+            configfile = get_configuration();
+        }
+        else
+        {
+            lua_newtable(L);
+            lua_pushstring(L, "configFile");
+            lua_pushnil(L);
+            lua_settable(L,-3);
+            lua_pushstring(L, "topologyFile");
+            lua_pushnil(L);
+            lua_settable(L,-3);
+            lua_pushstring(L, "daemonPath");
+            lua_pushnil(L);
+            lua_settable(L,-3);
+            lua_pushstring(L, "daemonMode");
+            lua_pushinteger(L, -1);
+            lua_settable(L,-3);
+            lua_pushstring(L, "maxNumThreads");
+            lua_pushinteger(L, MAX_NUM_THREADS);
+            lua_settable(L,-3);
+            lua_pushstring(L, "maxNumNodes");
+            lua_pushinteger(L, MAX_NUM_NODES);
+            lua_settable(L,-3);
+            return 1;
+        }
+    }
+    if ((config_isInitialized) && (configfile == NULL))
+    {
+        configfile = get_configuration();
+    }
+    lua_newtable(L);
+    lua_pushstring(L, "configFile");
+    lua_pushstring(L, configfile->configFileName);
+    lua_settable(L,-3);
+    lua_pushstring(L, "topologyFile");
+    lua_pushstring(L, configfile->topologyCfgFileName);
+    lua_settable(L,-3);
+    lua_pushstring(L, "daemonPath");
+    lua_pushstring(L, configfile->daemonPath);
+    lua_settable(L,-3);
+    lua_pushstring(L, "daemonMode");
+    lua_pushinteger(L, (int)configfile->daemonMode);
+    lua_settable(L,-3);
+    lua_pushstring(L, "maxNumThreads");
+    lua_pushinteger(L, configfile->maxNumThreads);
+    lua_settable(L,-3);
+    lua_pushstring(L, "maxNumNodes");
+    lua_pushinteger(L, configfile->maxNumNodes);
+    lua_settable(L,-3);
+    return 1;
+}
+
+static int lua_likwid_putConfiguration(lua_State* L)
+{
+    if (config_isInitialized == 1)
+    {
+        destroy_configuration();
+        config_isInitialized = 0;
+        configfile = NULL;
+    }
+    return 0;
+}
+
+static int lua_likwid_setAccessMode(lua_State* L)
+{
+    int flag;
+    flag = luaL_checknumber(L,1);
+    luaL_argcheck(L, flag >= 0 && flag <= 1, 1, "invalid access mode, only 0 (direct) and 1 (accessdaemon) allowed");
+    accessClient_setaccessmode(flag);
+    lua_pushnumber(L,0);
+    return 1;
+}
+
+/* Lua: init(nrThreads, cpuTable) -- set up topology, NUMA and perfmon once; pushes 0 or raises a Lua error. */
+static int lua_likwid_init(lua_State* L)
+{
+    int ret;
+    int nrThreads = luaL_checknumber(L,1);
+    luaL_argcheck(L, nrThreads > 0, 1, "CPU count must be greater than 0");
+    int cpus[nrThreads];
+    if (!lua_istable(L, -1)) {
+      lua_pushstring(L,"No table given as second argument");
+      lua_error(L);
+    }
+    for (ret = 1; ret<=nrThreads; ret++)
+    {
+        lua_rawgeti(L,-1,ret);
+        cpus[ret-1] = lua_tounsigned(L,-1);
+        lua_pop(L,1);
+    }
+    if (topology_isInitialized == 0)
+    {
+        topology_init();
+        topology_isInitialized = 1;
+        cpuinfo = get_cpuInfo();
+        cputopo = get_cpuTopology();
+    }
+    if ((topology_isInitialized) && (cpuinfo == NULL))
+    {
+        cpuinfo = get_cpuInfo();
+    }
+    if ((topology_isInitialized) && (cputopo == NULL))
+    {
+        cputopo = get_cpuTopology();
+    }
+    if (numa_isInitialized == 0)
+    {
+        numa_init();
+        numa_isInitialized = 1;
+        numainfo = get_numaTopology();
+    }
+    if ((numa_isInitialized) && (numainfo == NULL))
+    {
+        numainfo = get_numaTopology();
+    }
+    if (perfmon_isInitialized == 0)
+    {
+        ret = perfmon_init(nrThreads, &(cpus[0]));
+        if (ret != 0)
+        {
+            lua_pushstring(L,"Cannot initialize likwid perfmon");
+            lua_error(L); /* raises the error; control does not come back here */
+        }
+        perfmon_isInitialized = 1;
+        timer_isInitialized = 1;
+    }
+    lua_pushinteger(L,0); /* always push a result; a repeated call used to return the CPU table argument */
+    return 1;
+}
+
+
+static int lua_likwid_addEventSet(lua_State* L)
+{
+    int groupId, n;
+    const char* tmpString;
+    if (perfmon_isInitialized == 0)
+    {
+        return 0;
+    }
+    n = lua_gettop(L);
+    tmpString = luaL_checkstring(L, n);
+    luaL_argcheck(L, strlen(tmpString) > 0, n, "Event string must be larger than 0");
+
+    groupId = perfmon_addEventSet((char*)tmpString);
+    lua_pushnumber(L, groupId+1);
+    return 1;
+}
+
+static int lua_likwid_setupCounters(lua_State* L)
+{
+    int ret;
+    int groupId = lua_tonumber(L,1);
+    if (perfmon_isInitialized == 0)
+    {
+        return 0;
+    }
+    ret = perfmon_setupCounters(groupId-1);
+    lua_pushnumber(L,ret);
+    return 1;
+}
+
+
+static int lua_likwid_startCounters(lua_State* L)
+{
+    int ret;
+    if (perfmon_isInitialized == 0)
+    {
+        return 0;
+    }
+    ret = perfmon_startCounters();
+    lua_pushnumber(L,ret);
+    return 1;
+}
+
+static int lua_likwid_stopCounters(lua_State* L)
+{
+    int ret;
+    if (perfmon_isInitialized == 0)
+    {
+        return 0;
+    }
+    ret = perfmon_stopCounters();
+    lua_pushnumber(L,ret);
+    return 1;
+}
+
+static int lua_likwid_readCounters(lua_State* L)
+{
+    int ret;
+    if (perfmon_isInitialized == 0)
+    {
+        return 0;
+    }
+    ret = perfmon_readCounters();
+    lua_pushnumber(L,ret);
+    return 1;
+}
+
+static int lua_likwid_switchGroup(lua_State* L)
+{
+    int ret = -1;
+    int newgroup = lua_tonumber(L,1)-1;
+    if (perfmon_isInitialized == 0)
+    {
+        return 0;
+    }
+    if (newgroup >= perfmon_getNumberOfGroups())
+    {
+        newgroup = 0;
+    }
+    if (newgroup == perfmon_getIdOfActiveGroup())
+    {
+        lua_pushinteger(L, ret);
+        return 1;
+    }
+    ret = perfmon_switchActiveGroup(newgroup);
+    lua_pushinteger(L, ret);
+    return 1;
+}
+
+static int lua_likwid_finalize(lua_State* L)
+{
+    if (topology_isInitialized == 1)
+    {
+        topology_finalize();
+        topology_isInitialized = 0;
+        cputopo = NULL;
+        cpuinfo = NULL;
+    }
+    if (numa_isInitialized == 1)
+    {
+        numa_finalize();
+        numa_isInitialized = 0;
+        numainfo = NULL;
+    }
+    if (affinity_isInitialized == 1)
+    {
+        affinity_finalize();
+        affinity_isInitialized = 0;
+        affinity = NULL;
+    }
+    if (perfmon_isInitialized == 1)
+    {
+        perfmon_finalize();
+        perfmon_isInitialized = 0;
+    }
+    if (config_isInitialized == 1)
+    {
+        destroy_configuration();
+        config_isInitialized = 0;
+        configfile = NULL;
+    }
+    return 0;
+}
+
+static int lua_likwid_getResult(lua_State* L)
+{
+    int groupId, eventId, threadId;
+    double result = 0;
+    groupId = lua_tonumber(L,1);
+    eventId = lua_tonumber(L,2);
+    threadId = lua_tonumber(L,3);
+    result = perfmon_getResult(groupId-1, eventId-1, threadId-1);
+    lua_pushnumber(L,result);
+    return 1;
+}
+
+static int lua_likwid_getNumberOfGroups(lua_State* L)
+{
+    int number;
+    if (perfmon_isInitialized == 0)
+    {
+        return 0;
+    }
+    number = perfmon_getNumberOfGroups();
+    lua_pushnumber(L,number);
+    return 1;
+}
+
+static int lua_likwid_getIdOfActiveGroup(lua_State* L)
+{
+    int number;
+    if (perfmon_isInitialized == 0)
+    {
+        return 0;
+    }
+    number = perfmon_getIdOfActiveGroup();
+    lua_pushnumber(L,number+1);
+    return 1;
+}
+
+static int lua_likwid_getRuntimeOfGroup(lua_State* L)
+{
+    double time;
+    int groupId;
+    if (perfmon_isInitialized == 0)
+    {
+        return 0;
+    }
+    groupId = lua_tonumber(L,1);
+    time = perfmon_getTimeOfGroup(groupId-1);
+    lua_pushnumber(L, time);
+    return 1;
+}
+
+static int lua_likwid_getNumberOfEvents(lua_State* L)
+{
+    int number, groupId;
+    if (perfmon_isInitialized == 0)
+    {
+        return 0;
+    }
+    groupId = lua_tonumber(L,1);
+    number = perfmon_getNumberOfEvents(groupId-1);
+    lua_pushnumber(L,number);
+    return 1;
+}
+
+static int lua_likwid_getNumberOfThreads(lua_State* L)
+{
+    int number;
+    if (perfmon_isInitialized == 0)
+    {
+        return 0;
+    }
+    number = perfmon_getNumberOfThreads();
+    lua_pushnumber(L,number);
+    return 1;
+}
+
+static int lua_likwid_printSupportedCPUs(lua_State* L)
+{
+    print_supportedCPUs();
+    return 0;
+}
+
+/* Lua binding: returns one table describing the CPU.
+ *
+ * Initializes the topology module on first use and caches the pointer
+ * returned by get_cpuInfo() in the file-level variable cpuinfo. The result
+ * table has the string keys family, model, stepping, clock, turbo, name,
+ * osname, short_name, features, isIntel, featureFlags, perf_version,
+ * perf_num_ctr, perf_width_ctr and perf_num_fixed_ctr, each copied from the
+ * field of the same name in *cpuinfo.
+ *
+ * NOTE(review): cpuinfo is dereferenced without a NULL check; this assumes
+ * get_cpuInfo() cannot return NULL after topology_init() -- confirm. */
+static int lua_likwid_getCpuInfo(lua_State* L)
+{
+    if (topology_isInitialized == 0)
+    {
+        topology_init();
+        topology_isInitialized = 1;
+        cpuinfo = get_cpuInfo();
+    }
+    /* Topology may have been initialized by another binding that did not
+     * fetch cpuinfo; fetch it lazily in that case. */
+    if ((topology_isInitialized) && (cpuinfo == NULL))
+    {
+        cpuinfo = get_cpuInfo();
+    }
+    /* Each key/value pair below is pushed and then stored into the table
+     * that sits at stack index -3 at the time of lua_settable(). */
+    lua_newtable(L);
+    lua_pushstring(L,"family");
+    lua_pushunsigned(L,cpuinfo->family);
+    lua_settable(L,-3);
+    lua_pushstring(L,"model");
+    lua_pushunsigned(L,cpuinfo->model);
+    lua_settable(L,-3);
+    lua_pushstring(L,"stepping");
+    lua_pushunsigned(L,cpuinfo->stepping);
+    lua_settable(L,-3);
+    lua_pushstring(L,"clock");
+    lua_pushunsigned(L,cpuinfo->clock);
+    lua_settable(L,-3);
+    lua_pushstring(L,"turbo");
+    lua_pushinteger(L,cpuinfo->turbo);
+    lua_settable(L,-3);
+    lua_pushstring(L,"name");
+    lua_pushstring(L,cpuinfo->name);
+    lua_settable(L,-3);
+    lua_pushstring(L,"osname");
+    lua_pushstring(L,cpuinfo->osname);
+    lua_settable(L,-3);
+    lua_pushstring(L,"short_name");
+    lua_pushstring(L,cpuinfo->short_name);
+    lua_settable(L,-3);
+    lua_pushstring(L,"features");
+    lua_pushstring(L,cpuinfo->features);
+    lua_settable(L,-3);
+    lua_pushstring(L,"isIntel");
+    lua_pushinteger(L,cpuinfo->isIntel);
+    lua_settable(L,-3);
+    lua_pushstring(L,"featureFlags");
+    lua_pushunsigned(L,cpuinfo->featureFlags);
+    lua_settable(L,-3);
+    lua_pushstring(L,"perf_version");
+    lua_pushunsigned(L, cpuinfo->perf_version);
+    lua_settable(L,-3);
+    lua_pushstring(L,"perf_num_ctr");
+    lua_pushunsigned(L,cpuinfo->perf_num_ctr);
+    lua_settable(L,-3);
+    lua_pushstring(L,"perf_width_ctr");
+    lua_pushunsigned(L,cpuinfo->perf_width_ctr);
+    lua_settable(L,-3);
+    lua_pushstring(L,"perf_num_fixed_ctr");
+    lua_pushunsigned(L,cpuinfo->perf_num_fixed_ctr);
+    lua_settable(L,-3);
+    return 1;
+}
+
+/* Lua binding: returns one table describing the CPU topology.
+ *
+ * Initializes the topology module (and, if numa_init() succeeds, the NUMA
+ * module) on first use and caches the pointers in cputopo / numainfo.
+ * numainfo is fetched here but not read by this function.
+ *
+ * Layout of the returned table:
+ *   numHWThreads, activeHWThreads, numSockets, numCoresPerSocket,
+ *   numThreadsPerCore, numCacheLevels   scalar fields of *cputopo
+ *   threadPool[0..numHWThreads-1]       tables with threadId, coreId,
+ *                                       packageId, apicId, inCpuSet
+ *   cacheLevels[1..numCacheLevels]      tables with level, associativity,
+ *                                       sets, lineSize, size, threads,
+ *                                       inclusive, type (a string)
+ *   topologyTree[0..]                   per socket: ID and Childs; Childs is
+ *                                       keyed from 0 per core: ID and Childs;
+ *                                       that Childs maps 0.. to thread ids
+ *
+ * Note the mixed key bases: threadPool and topologyTree are keyed from 0,
+ * cacheLevels from 1. */
+static int lua_likwid_getCpuTopology(lua_State* L)
+{
+    int i;
+    TreeNode* socketNode;
+    int socketCount = 0;
+    TreeNode* coreNode;
+    int coreCount = 0;
+    TreeNode* threadNode;
+    int threadCount = 0;
+    if (topology_isInitialized == 0)
+    {
+        topology_init();
+        topology_isInitialized = 1;
+        cputopo = get_cpuTopology();
+    }
+    if ((topology_isInitialized) && (cputopo == NULL))
+    {
+        cputopo = get_cpuTopology();
+    }
+    if (numa_isInitialized == 0)
+    {
+        /* A failing numa_init() is tolerated: the flag simply stays 0. */
+        if (numa_init() == 0)
+        {
+            numa_isInitialized = 1;
+            numainfo = get_numaTopology();
+        }
+    }
+    if ((numa_isInitialized) && (numainfo == NULL))
+    {
+        numainfo = get_numaTopology();
+    }
+
+    /* Result table; stays at the bottom of everything pushed below. */
+    lua_newtable(L);
+
+    lua_pushstring(L,"numHWThreads");
+    lua_pushunsigned(L,cputopo->numHWThreads);
+    lua_settable(L,-3);
+
+    lua_pushstring(L,"activeHWThreads");
+    lua_pushunsigned(L,cputopo->activeHWThreads);
+    lua_settable(L,-3);
+
+    lua_pushstring(L,"numSockets");
+    lua_pushunsigned(L,cputopo->numSockets);
+    lua_settable(L,-3);
+
+    lua_pushstring(L,"numCoresPerSocket");
+    lua_pushunsigned(L,cputopo->numCoresPerSocket);
+    lua_settable(L,-3);
+
+    lua_pushstring(L,"numThreadsPerCore");
+    lua_pushunsigned(L,cputopo->numThreadsPerCore);
+    lua_settable(L,-3);
+
+    lua_pushstring(L,"numCacheLevels");
+    lua_pushinteger(L,cputopo->numCacheLevels);
+    lua_settable(L,-3);
+
+    /* threadPool: one sub-table per hardware thread, keyed from 0. */
+    lua_pushstring(L,"threadPool");
+    lua_newtable(L);
+    for(i=0;i<cputopo->numHWThreads;i++)
+    {
+        lua_pushnumber(L,i);
+        lua_newtable(L);
+        lua_pushstring(L,"threadId");
+        lua_pushunsigned(L,cputopo->threadPool[i].threadId);
+        lua_settable(L,-3);
+        lua_pushstring(L,"coreId");
+        lua_pushunsigned(L,cputopo->threadPool[i].coreId);
+        lua_settable(L,-3);
+        lua_pushstring(L,"packageId");
+        lua_pushunsigned(L,cputopo->threadPool[i].packageId);
+        lua_settable(L,-3);
+        lua_pushstring(L,"apicId");
+        lua_pushunsigned(L,cputopo->threadPool[i].apicId);
+        lua_settable(L,-3);
+        lua_pushstring(L,"inCpuSet");
+        lua_pushunsigned(L,cputopo->threadPool[i].inCpuSet);
+        lua_settable(L,-3);
+        /* threadPool[i] = per-thread table */
+        lua_settable(L,-3);
+    }
+    /* result["threadPool"] = threadPool */
+    lua_settable(L,-3);
+
+    /* cacheLevels: one sub-table per cache level, keyed from 1. */
+    lua_pushstring(L,"cacheLevels");
+    lua_newtable(L);
+    for(i=0;i<cputopo->numCacheLevels;i++)
+    {
+        lua_pushnumber(L,i+1);
+        lua_newtable(L);
+
+        lua_pushstring(L,"level");
+        lua_pushunsigned(L,cputopo->cacheLevels[i].level);
+        lua_settable(L,-3);
+
+        lua_pushstring(L,"associativity");
+        lua_pushunsigned(L,cputopo->cacheLevels[i].associativity);
+        lua_settable(L,-3);
+
+        lua_pushstring(L,"sets");
+        lua_pushunsigned(L,cputopo->cacheLevels[i].sets);
+        lua_settable(L,-3);
+
+        lua_pushstring(L,"lineSize");
+        lua_pushunsigned(L,cputopo->cacheLevels[i].lineSize);
+        lua_settable(L,-3);
+
+        lua_pushstring(L,"size");
+        lua_pushunsigned(L,cputopo->cacheLevels[i].size);
+        lua_settable(L,-3);
+
+        lua_pushstring(L,"threads");
+        lua_pushunsigned(L,cputopo->cacheLevels[i].threads);
+        lua_settable(L,-3);
+
+        lua_pushstring(L,"inclusive");
+        lua_pushunsigned(L,cputopo->cacheLevels[i].inclusive);
+        lua_settable(L,-3);
+
+        /* The cache type enum is exported as its symbolic name; unknown
+         * values fall back to "NOCACHE". */
+        lua_pushstring(L,"type");
+        switch (cputopo->cacheLevels[i].type)
+        {
+            case DATACACHE:
+                lua_pushstring(L,"DATACACHE");
+                break;
+            case INSTRUCTIONCACHE:
+                lua_pushstring(L,"INSTRUCTIONCACHE");
+                break;
+            case UNIFIEDCACHE:
+                lua_pushstring(L,"UNIFIEDCACHE");
+                break;
+            case ITLB:
+                lua_pushstring(L,"ITLB");
+                break;
+            case DTLB:
+                lua_pushstring(L,"DTLB");
+                break;
+            case NOCACHE:
+            default:
+                lua_pushstring(L,"NOCACHE");
+                break;
+        }
+        lua_settable(L,-3);
+        /* cacheLevels[i+1] = per-level table */
+        lua_settable(L,-3);
+    }
+    /* result["cacheLevels"] = cacheLevels */
+    lua_settable(L,-3);
+
+    /* topologyTree: walk socket -> core -> thread nodes of the tree. */
+    lua_pushstring(L,"topologyTree");
+    lua_newtable(L);
+
+    socketNode = tree_getChildNode(cputopo->topologyTree);
+    while (socketNode != NULL)
+    {
+        /* The key is pushed now; the matching lua_settable() happens at the
+         * end of the loop body, after socketCount was already incremented. */
+        lua_pushinteger(L, socketCount);
+        lua_newtable(L);
+        lua_pushstring(L, "ID");
+        lua_pushunsigned(L,socketNode->id);
+        lua_settable(L, -3);
+        lua_pushstring(L, "Childs");
+        lua_newtable(L);
+        coreCount = 0;
+        coreNode = tree_getChildNode(socketNode);
+        while (coreNode != NULL)
+        {
+            lua_pushinteger(L, coreCount);
+            lua_newtable(L);
+            lua_pushstring(L, "ID");
+            lua_pushunsigned(L,coreNode->id);
+            lua_settable(L,-3);
+            lua_pushstring(L, "Childs");
+            lua_newtable(L);
+            threadNode = tree_getChildNode(coreNode);
+            threadCount = 0;
+            while (threadNode != NULL)
+            {
+                lua_pushunsigned(L,threadCount);
+                lua_pushunsigned(L,threadNode->id);
+                lua_settable(L,-3);
+                threadNode = tree_getNextNode(threadNode);
+                threadCount++;
+            }
+            /* core["Childs"] = thread list */
+            lua_settable(L,-3);
+            coreNode = tree_getNextNode(coreNode);
+            coreCount++;
+            /* socket.Childs[<key pushed above>] = core */
+            lua_settable(L,-3);
+        }
+        /* socket["Childs"] = core list */
+        lua_settable(L,-3);
+        socketNode = tree_getNextNode(socketNode);
+        socketCount++;
+        /* topologyTree[<key pushed above>] = socket */
+        lua_settable(L,-3);
+    }
+    /* result["topologyTree"] = topologyTree */
+    lua_settable(L,-3);
+    return 1;
+}
+
+/* Lua binding: tears down the topology module if this file initialized it
+ * (topology_isInitialized == 1) and drops the cached cpuinfo / cputopo
+ * pointers. Returns nothing to Lua. */
+static int lua_likwid_putTopology(lua_State* L)
+{
+    if (topology_isInitialized != 1)
+    {
+        return 0;
+    }
+    topology_finalize();
+    topology_isInitialized = 0;
+    cpuinfo = NULL;
+    cputopo = NULL;
+    return 0;
+}
+
+
+/* Appends "<name>|" to buf at offset used without writing past bufSize.
+ * Returns the new used length, or -1 when the entry does not fit; in that
+ * case buf is re-terminated at offset used so no partial entry remains. */
+static int lua_likwid_appendOptionName(char* buf, size_t bufSize, int used, const char* name)
+{
+    size_t remaining = bufSize - (size_t)used;
+    int written = snprintf(buf + used, remaining, "%s|", name);
+    if ((written < 0) || ((size_t)written >= remaining))
+    {
+        buf[used] = '\0';
+        return -1;
+    }
+    return used + written;
+}
+
+/* Lua binding: returns one table with two sub-tables.
+ *
+ *   Counters[1..perfmon_numCounters]  Name, Options, Type, TypeName, Index
+ *                                     taken from counter_map[]
+ *   Events[1..perfmon_numArchEvents]  Name, ID, UMask, Limit, Options
+ *                                     taken from eventHash[]
+ *
+ * Options is a '|'-separated list of eventOptionTypeName[] entries for every
+ * bit set in the entry's optionMask, or the empty string when no bit is set.
+ * Initializes the topology module on first use and calls
+ * perfmon_init_maps() before reading the maps. */
+static int lua_likwid_getEventsAndCounters(lua_State* L)
+{
+    int i;
+    char optString[1024];
+    int optStringIndex = 0;
+    if (topology_isInitialized == 0)
+    {
+        topology_init();
+        topology_isInitialized = 1;
+        cpuinfo = get_cpuInfo();
+    }
+    if ((topology_isInitialized) && (cpuinfo == NULL))
+    {
+        cpuinfo = get_cpuInfo();
+    }
+    perfmon_init_maps();
+    lua_newtable(L);
+    lua_pushstring(L,"Counters");
+    lua_newtable(L);
+    for(i=1;i<=perfmon_numCounters;i++)
+    {
+        optStringIndex = 0;
+        optString[0] = '\0';
+        lua_pushunsigned(L,i);
+        lua_newtable(L);
+        lua_pushstring(L,"Name");
+        lua_pushstring(L,counter_map[i-1].key);
+        lua_settable(L,-3);
+        lua_pushstring(L,"Options");
+        for(int j=1; j<NUM_EVENT_OPTIONS; j++)
+        {
+            if (counter_map[i-1].optionMask & REG_TYPE_MASK(j))
+            {
+                int next = lua_likwid_appendOptionName(optString, sizeof(optString),
+                                                       optStringIndex, eventOptionTypeName[j]);
+                if (next < 0)
+                {
+                    /* Buffer exhausted: keep what fits, drop the rest. */
+                    break;
+                }
+                optStringIndex = next;
+            }
+        }
+        /* Strip the trailing '|'. With no option set the index is 0 and
+         * there is nothing to strip (index -1 would be out of bounds). */
+        if (optStringIndex > 0)
+        {
+            optString[optStringIndex-1] = '\0';
+        }
+        lua_pushstring(L,optString);
+        lua_settable(L,-3);
+        lua_pushstring(L,"Type");
+        lua_pushunsigned(L, counter_map[i-1].type);
+        lua_settable(L,-3);
+        lua_pushstring(L,"TypeName");
+        lua_pushstring(L, RegisterTypeNames[counter_map[i-1].type]);
+        lua_settable(L,-3);
+        lua_pushstring(L,"Index");
+        lua_pushunsigned(L,counter_map[i-1].index);
+        lua_settable(L,-3);
+        /* Counters[i] = per-counter table */
+        lua_settable(L,-3);
+    }
+    /* result["Counters"] = Counters */
+    lua_settable(L,-3);
+    lua_pushstring(L,"Events");
+    lua_newtable(L);
+    for(i=1;i<=perfmon_numArchEvents;i++)
+    {
+        optStringIndex = 0;
+        optString[0] = '\0';
+        lua_pushunsigned(L,i);
+        lua_newtable(L);
+        lua_pushstring(L,"Name");
+        lua_pushstring(L,eventHash[i-1].name);
+        lua_settable(L,-3);
+        lua_pushstring(L,"ID");
+        lua_pushunsigned(L,eventHash[i-1].eventId);
+        lua_settable(L,-3);
+        lua_pushstring(L,"UMask");
+        lua_pushunsigned(L,eventHash[i-1].umask);
+        lua_settable(L,-3);
+        lua_pushstring(L,"Limit");
+        lua_pushstring(L,eventHash[i-1].limit);
+        lua_settable(L,-3);
+        lua_pushstring(L,"Options");
+        for(int j=1; j<NUM_EVENT_OPTIONS; j++)
+        {
+            if (eventHash[i-1].optionMask & REG_TYPE_MASK(j))
+            {
+                int next = lua_likwid_appendOptionName(optString, sizeof(optString),
+                                                       optStringIndex, eventOptionTypeName[j]);
+                if (next < 0)
+                {
+                    /* Buffer exhausted: keep what fits, drop the rest. */
+                    break;
+                }
+                optStringIndex = next;
+            }
+        }
+        /* Same guard as above: only strip a '|' that was actually written. */
+        if (optStringIndex > 0)
+        {
+            optString[optStringIndex-1] = '\0';
+        }
+        lua_pushstring(L,optString);
+        lua_settable(L,-3);
+        /* Events[i] = per-event table */
+        lua_settable(L,-3);
+    }
+    /* result["Events"] = Events */
+    lua_settable(L,-3);
+    return 1;
+}
+
+/* Lua binding: returns a table keyed by pci_devices[i].likwid_name with one
+ * entry per device whose online flag is set. Each entry is a table with the
+ * string fields Name, Path, Type and TypeDescription.
+ *
+ * The store into the result table is done only for online devices; doing it
+ * unconditionally would call lua_settable() with no key/value pushed.
+ *
+ * NOTE(review): the loop treats MAX_NUM_PCI_DEVICES as the number of
+ * entries in pci_devices (exclusive bound) -- confirm against the array
+ * declaration. */
+static int lua_likwid_getOnlineDevices(lua_State* L)
+{
+    int i;
+    lua_newtable(L);
+    for(i=0;i<MAX_NUM_PCI_DEVICES;i++)
+    {
+        if (!pci_devices[i].online)
+        {
+            continue;
+        }
+        lua_pushstring(L,pci_devices[i].likwid_name);
+        lua_newtable(L);
+        lua_pushstring(L, "Name");
+        lua_pushstring(L,pci_devices[i].name);
+        lua_settable(L,-3);
+        lua_pushstring(L, "Path");
+        lua_pushstring(L,pci_devices[i].path);
+        lua_settable(L,-3);
+        lua_pushstring(L, "Type");
+        lua_pushstring(L,pci_types[pci_devices[i].type].name);
+        lua_settable(L,-3);
+        lua_pushstring(L, "TypeDescription");
+        lua_pushstring(L,pci_types[pci_devices[i].type].desc);
+        lua_settable(L,-3);
+        /* result[likwid_name] = per-device table */
+        lua_settable(L,-3);
+    }
+    return 1;
+}
+
+/* Lua binding: returns one table describing the NUMA layout.
+ *
+ * Initializes the topology, NUMA and affinity modules on first use. When
+ * numa_init() returns non-zero the function returns early with
+ * { numberOfNodes = 0, nodes = {} }.
+ *
+ * Layout of the regular result:
+ *   numberOfNodes
+ *   nodes[1..numberOfNodes]   tables with id, totalMemory, freeMemory,
+ *                             numberOfProcessors, numberOfDistances,
+ *                             processors[1..numberOfProcessors] and
+ *                             distances[1..numberOfDistances]
+ * Each distances[j+1] is itself a one-element table { [j] = distance }.
+ *
+ * The affinity pointer is fetched here but not read by this function. */
+static int lua_likwid_getNumaInfo(lua_State* L)
+{
+    uint32_t i,j;
+    if (topology_isInitialized == 0)
+    {
+        topology_init();
+        topology_isInitialized = 1;
+        cpuinfo = get_cpuInfo();
+        cputopo = get_cpuTopology();
+    }
+    if ((topology_isInitialized) && (cpuinfo == NULL))
+    {
+        cpuinfo = get_cpuInfo();
+    }
+    if ((topology_isInitialized) && (cputopo == NULL))
+    {
+        cputopo = get_cpuTopology();
+    }
+    if (numa_isInitialized == 0)
+    {
+        if (numa_init() == 0)
+        {
+            numa_isInitialized = 1;
+            numainfo = get_numaTopology();
+        }
+        else
+        {
+            /* NUMA setup failed: hand back an empty description instead of
+             * raising a Lua error. */
+            lua_newtable(L);
+            lua_pushstring(L,"numberOfNodes");
+            lua_pushunsigned(L,0);
+            lua_settable(L,-3);
+            lua_pushstring(L,"nodes");
+            lua_newtable(L);
+            lua_settable(L,-3);
+            return 1;
+        }
+    }
+    if ((numa_isInitialized) && (numainfo == NULL))
+    {
+        numainfo = get_numaTopology();
+    }
+    if (affinity_isInitialized == 0)
+    {
+        affinity_init();
+        affinity_isInitialized = 1;
+        affinity = get_affinityDomains();
+    }
+    if ((affinity_isInitialized) && (affinity == NULL))
+    {
+        affinity = get_affinityDomains();
+    }
+    lua_newtable(L);
+    lua_pushstring(L,"numberOfNodes");
+    lua_pushunsigned(L,numainfo->numberOfNodes);
+    lua_settable(L,-3);
+
+    lua_pushstring(L,"nodes");
+    lua_newtable(L);
+    for(i=0;i<numainfo->numberOfNodes;i++)
+    {
+        /* nodes is keyed from 1 on the Lua side */
+        lua_pushinteger(L, i+1);
+        lua_newtable(L);
+        
+        lua_pushstring(L,"id");
+        lua_pushunsigned(L,numainfo->nodes[i].id);
+        lua_settable(L,-3);
+        lua_pushstring(L,"totalMemory");
+        lua_pushunsigned(L,numainfo->nodes[i].totalMemory);
+        lua_settable(L,-3);
+        lua_pushstring(L,"freeMemory");
+        lua_pushunsigned(L,numainfo->nodes[i].freeMemory);
+        lua_settable(L,-3);
+        lua_pushstring(L,"numberOfProcessors");
+        lua_pushunsigned(L,numainfo->nodes[i].numberOfProcessors);
+        lua_settable(L,-3);
+        lua_pushstring(L,"numberOfDistances");
+        lua_pushunsigned(L,numainfo->nodes[i].numberOfDistances);
+        lua_settable(L,-3);
+        
+        lua_pushstring(L,"processors");
+        lua_newtable(L);
+        for(j=0;j<numainfo->nodes[i].numberOfProcessors;j++)
+        {
+            lua_pushunsigned(L,j+1);
+            lua_pushunsigned(L,numainfo->nodes[i].processors[j]);
+            lua_settable(L,-3);
+        }
+        /* node["processors"] = processors */
+        lua_settable(L,-3);
+        
+        /*lua_pushstring(L,"processorsCompact");
+        lua_newtable(L);
+        for(j=0;j<numa->nodes[i].numberOfProcessors;j++)
+        {
+            lua_pushunsigned(L,j);
+            lua_pushunsigned(L,numa->nodes[i].processorsCompact[j]);
+            lua_settable(L,-3);
+        }
+        lua_settable(L,-3);*/
+        
+        lua_pushstring(L,"distances");
+        lua_newtable(L);
+        for(j=0;j<numainfo->nodes[i].numberOfDistances;j++)
+        {
+            /* distances[j+1] = { [j] = distance }: the outer key is
+             * 1-based, the inner key is the 0-based index j. */
+            lua_pushinteger(L,j+1);
+            lua_newtable(L);
+            lua_pushinteger(L,j);
+            lua_pushunsigned(L,numainfo->nodes[i].distances[j]);
+            lua_settable(L,-3);
+            lua_settable(L,-3);
+        }
+        /* node["distances"] = distances */
+        lua_settable(L,-3);
+        
+        /* nodes[i+1] = node */
+        lua_settable(L,-3);
+    }
+    /* result["nodes"] = nodes */
+    lua_settable(L,-3);
+    return 1;
+}
+
+/* Lua binding: finalizes the NUMA module if it was initialized through this
+ * file and clears the cached numainfo pointer. Returns nothing to Lua. */
+static int lua_likwid_putNumaInfo(lua_State* L)
+{
+    if (!numa_isInitialized)
+    {
+        return 0;
+    }
+    numa_finalize();
+    numa_isInitialized = 0;
+    numainfo = NULL;
+    return 0;
+}
+
+/* Lua binding: setMemInterleaved(count, cpulist).
+ * Argument 1 is the number of entries, the value on top of the stack must be
+ * a table whose entries 1..count are read as CPU ids. The ids are handed to
+ * numa_setInterleaved(). Raises a Lua error for a non-positive count or a
+ * missing table. Returns nothing to Lua. */
+static int lua_likwid_setMemInterleaved(lua_State* L)
+{
+    int threadCount = luaL_checknumber(L,1);
+    luaL_argcheck(L, threadCount > 0, 1, "Thread count must be greater than 0");
+    int cpuList[threadCount];
+    if (!lua_istable(L, -1))
+    {
+        lua_pushstring(L,"No table given as second argument");
+        lua_error(L);
+    }
+    for (int slot = 0; slot < threadCount; slot++)
+    {
+        /* Lua tables are 1-based, the C array is 0-based */
+        lua_rawgeti(L,-1,slot+1);
+        cpuList[slot] = lua_tounsigned(L,-1);
+        lua_pop(L,1);
+    }
+    numa_setInterleaved(cpuList, threadCount);
+    return 0;
+}
+
+/* Lua binding: returns one table describing the affinity domains.
+ *
+ * Initializes the topology, NUMA (best effort) and affinity modules on first
+ * use. Raises a Lua error ("Cannot initialize affinity groups") when no
+ * affinity description is available afterwards.
+ *
+ * Layout of the returned table:
+ *   numberOfAffinityDomains, numberOfSocketDomains, numberOfNumaDomains,
+ *   numberOfProcessorsPerSocket, numberOfCacheDomains,
+ *   numberOfCoresPerCache, numberOfProcessorsPerCache
+ *   domains[1..numberOfAffinityDomains]  tables with tag,
+ *       numberOfProcessors, numberOfCores and
+ *       processorList[1..numberOfProcessors] */
+static int lua_likwid_getAffinityInfo(lua_State* L)
+{
+    int i,j;
+    
+    if (topology_isInitialized == 0)
+    {
+        topology_init();
+        topology_isInitialized = 1;
+        cpuinfo = get_cpuInfo();
+        cputopo = get_cpuTopology();
+    }
+    if ((topology_isInitialized) && (cpuinfo == NULL))
+    {
+        cpuinfo = get_cpuInfo();
+    }
+    if ((topology_isInitialized) && (cputopo == NULL))
+    {
+        cputopo = get_cpuTopology();
+    }
+    if (numa_isInitialized == 0)
+    {
+        /* A failing numa_init() is tolerated: the flag simply stays 0. */
+        if (numa_init() == 0)
+        {
+            numa_isInitialized = 1;
+            numainfo = get_numaTopology();
+        }
+    }
+    if ((numa_isInitialized) && (numainfo == NULL))
+    {
+        numainfo = get_numaTopology();
+    }
+    if (affinity_isInitialized == 0)
+    {
+        affinity_init();
+        affinity_isInitialized = 1;
+        affinity = get_affinityDomains();
+    }
+    if ((affinity_isInitialized) && (affinity == NULL))
+    {
+        affinity = get_affinityDomains();
+    }
+
+    if (!affinity)
+    {
+        /* lua_error() does not return */
+        lua_pushstring(L,"Cannot initialize affinity groups");
+        lua_error(L);
+    }
+    lua_newtable(L);
+    lua_pushstring(L,"numberOfAffinityDomains");
+    lua_pushunsigned(L,affinity->numberOfAffinityDomains);
+    lua_settable(L,-3);
+    lua_pushstring(L,"numberOfSocketDomains");
+    lua_pushunsigned(L,affinity->numberOfSocketDomains);
+    lua_settable(L,-3);
+    lua_pushstring(L,"numberOfNumaDomains");
+    lua_pushunsigned(L,affinity->numberOfNumaDomains);
+    lua_settable(L,-3);
+    lua_pushstring(L,"numberOfProcessorsPerSocket");
+    lua_pushunsigned(L,affinity->numberOfProcessorsPerSocket);
+    lua_settable(L,-3);
+    lua_pushstring(L,"numberOfCacheDomains");
+    lua_pushunsigned(L,affinity->numberOfCacheDomains);
+    lua_settable(L,-3);
+    lua_pushstring(L,"numberOfCoresPerCache");
+    lua_pushunsigned(L,affinity->numberOfCoresPerCache);
+    lua_settable(L,-3);
+    lua_pushstring(L,"numberOfProcessorsPerCache");
+    lua_pushunsigned(L,affinity->numberOfProcessorsPerCache);
+    lua_settable(L,-3);
+    lua_pushstring(L,"domains");
+    lua_newtable(L);
+    for(i=0;i<affinity->numberOfAffinityDomains;i++)
+    {
+        /* domains is keyed from 1 on the Lua side */
+        lua_pushunsigned(L, i+1);
+        lua_newtable(L);
+        lua_pushstring(L,"tag");
+        lua_pushstring(L,bdata(affinity->domains[i].tag));
+        lua_settable(L,-3);
+        lua_pushstring(L,"numberOfProcessors");
+        lua_pushunsigned(L,affinity->domains[i].numberOfProcessors);
+        lua_settable(L,-3);
+        lua_pushstring(L,"numberOfCores");
+        lua_pushunsigned(L,affinity->domains[i].numberOfCores);
+        lua_settable(L,-3);
+        lua_pushstring(L,"processorList");
+        lua_newtable(L);
+        for(j=0;j<affinity->domains[i].numberOfProcessors;j++)
+        {
+            lua_pushunsigned(L,j+1);
+            lua_pushunsigned(L,affinity->domains[i].processorList[j]);
+            lua_settable(L,-3);
+        }
+        /* domain["processorList"] = processorList */
+        lua_settable(L,-3);
+        /* domains[i+1] = domain */
+        lua_settable(L,-3);
+    }
+    /* result["domains"] = domains */
+    lua_settable(L,-3);
+    return 1;
+}
+
+/* Lua binding: finalizes the affinity module if it was initialized through
+ * this file and clears the cached affinity pointer. Returns nothing to Lua. */
+static int lua_likwid_putAffinityInfo(lua_State* L)
+{
+    if (!affinity_isInitialized)
+    {
+        return 0;
+    }
+    affinity_finalize();
+    affinity_isInitialized = 0;
+    affinity = NULL;
+    return 0;
+}
+
+/* Lua binding: returns one table describing the power/energy facilities.
+ *
+ * Initializes the topology module on first use, then calls power_init(0).
+ * When that returns 0 the function returns no value to Lua.
+ *
+ * Layout of the returned table:
+ *   hasRAPL, baseFrequency, minFrequency, powerUnit, timeUnit
+ *   turbo   = { numSteps, steps[1..numSteps] }
+ *   domains = { [power_names[i]] = { ID, energyUnit, supportStatus,
+ *               supportPerf, supportPolicy, supportLimit, supportInfo } }
+ * For domains with POWER_DOMAIN_SUPPORT_INFO set the entry additionally
+ * carries tdp, minPower, maxPower and maxTimeWindow.
+ *
+ * NOTE(review): if power_isInitialized is already set on entry the cached
+ * power pointer is used as-is without a NULL re-fetch, unlike the other
+ * getters in this file -- confirm that it cannot be NULL at that point. */
+static int lua_likwid_getPowerInfo(lua_State* L)
+{
+    
+    int i;
+    if (topology_isInitialized == 0)
+    {
+        topology_init();
+        topology_isInitialized = 1;
+        cpuinfo = get_cpuInfo();
+        cputopo = get_cpuTopology();
+    }
+    if ((topology_isInitialized) && (cpuinfo == NULL))
+    {
+        cpuinfo = get_cpuInfo();
+    }
+    if ((topology_isInitialized) && (cputopo == NULL))
+    {
+        cputopo = get_cpuTopology();
+    }
+    if (power_isInitialized == 0)
+    {
+        power_hasRAPL = power_init(0);
+        if (power_hasRAPL)
+        {
+            power_isInitialized = 1;
+            power = get_powerInfo();
+        }
+        else
+        {
+            /* power_init() reported nothing usable: return nil to Lua */
+            return 0;
+        }
+    }
+
+
+    lua_newtable(L);
+    lua_pushstring(L,"hasRAPL");
+    lua_pushboolean(L,power_hasRAPL);
+    lua_settable(L,-3);
+    lua_pushstring(L,"baseFrequency");
+    lua_pushnumber(L,power->baseFrequency);
+    lua_settable(L,-3);
+    lua_pushstring(L,"minFrequency");
+    lua_pushnumber(L,power->minFrequency);
+    lua_settable(L,-3);
+    lua_pushstring(L,"powerUnit");
+    lua_pushnumber(L,power->powerUnit);
+    lua_settable(L,-3);
+    lua_pushstring(L,"timeUnit");
+    lua_pushnumber(L,power->timeUnit);
+    lua_settable(L,-3);
+    
+    /* turbo = { numSteps = ..., steps = { [1] = ..., ... } } */
+    lua_pushstring(L,"turbo");
+    lua_newtable(L);
+    lua_pushstring(L,"numSteps");
+    lua_pushunsigned(L,power->turbo.numSteps);
+    lua_settable(L,-3);
+    lua_pushstring(L,"steps");
+    lua_newtable(L);
+    for(i=0;i<power->turbo.numSteps;i++)
+    {
+        lua_pushunsigned(L,i+1);
+        lua_pushnumber(L,power->turbo.steps[i]);
+        lua_settable(L,-3);
+    }
+    /* turbo["steps"] = steps */
+    lua_settable(L,-3);
+    /* result["turbo"] = turbo */
+    lua_settable(L,-3);
+
+    /* domains: one sub-table per power domain, keyed by its name */
+    lua_pushstring(L,"domains");
+    lua_newtable(L);
+    for(i=0;i<NUM_POWER_DOMAINS;i++)
+    {
+        lua_pushstring(L,power_names[i]);
+        lua_newtable(L);
+
+        lua_pushstring(L, "ID");
+        lua_pushnumber(L, power->domains[i].type);
+        lua_settable(L,-3);
+        lua_pushstring(L, "energyUnit");
+        lua_pushnumber(L, power->domains[i].energyUnit);
+        lua_settable(L,-3);
+        /* Each support* flag is exported as a boolean derived from the
+         * corresponding bit in supportFlags. */
+        lua_pushstring(L,"supportStatus");
+        if (power->domains[i].supportFlags & POWER_DOMAIN_SUPPORT_STATUS)
+        {
+            lua_pushboolean(L, 1);
+        }
+        else
+        {
+            lua_pushboolean(L, 0);
+        }
+        lua_settable(L,-3);
+        lua_pushstring(L,"supportPerf");
+        if (power->domains[i].supportFlags & POWER_DOMAIN_SUPPORT_PERF)
+        {
+            lua_pushboolean(L, 1);
+        }
+        else
+        {
+            lua_pushboolean(L, 0);
+        }
+        lua_settable(L,-3);
+        lua_pushstring(L,"supportPolicy");
+        if (power->domains[i].supportFlags & POWER_DOMAIN_SUPPORT_POLICY)
+        {
+            lua_pushboolean(L, 1);
+        }
+        else
+        {
+            lua_pushboolean(L, 0);
+        }
+        lua_settable(L,-3);
+        lua_pushstring(L,"supportLimit");
+        if (power->domains[i].supportFlags & POWER_DOMAIN_SUPPORT_LIMIT)
+        {
+            lua_pushboolean(L, 1);
+        }
+        else
+        {
+            lua_pushboolean(L, 0);
+        }
+        lua_settable(L,-3);
+        /* The info fields are only exported when the domain advertises
+         * POWER_DOMAIN_SUPPORT_INFO. */
+        if (power->domains[i].supportFlags & POWER_DOMAIN_SUPPORT_INFO)
+        {
+            lua_pushstring(L,"supportInfo");
+            lua_pushboolean(L, 1);
+            lua_settable(L,-3);
+            lua_pushstring(L,"tdp");
+            lua_pushnumber(L, power->domains[i].tdp);
+            lua_settable(L,-3);
+            lua_pushstring(L,"minPower");
+            lua_pushnumber(L, power->domains[i].minPower);
+            lua_settable(L,-3);
+            lua_pushstring(L,"maxPower");
+            lua_pushnumber(L, power->domains[i].maxPower);
+            lua_settable(L,-3);
+            lua_pushstring(L,"maxTimeWindow");
+            lua_pushnumber(L, power->domains[i].maxTimeWindow);
+            lua_settable(L,-3);
+        }
+        else
+        {
+            lua_pushstring(L,"supportInfo");
+            lua_pushboolean(L, 0);
+            lua_settable(L,-3);
+        }
+
+        /* domains[power_names[i]] = per-domain table */
+        lua_settable(L,-3);
+    }
+    /* result["domains"] = domains */
+    lua_settable(L,-3);
+    
+
+    return 1;
+}
+
+/* Lua binding: finalizes the power module if it was initialized through this
+ * file and clears the cached power pointer. Returns nothing to Lua. */
+static int lua_likwid_putPowerInfo(lua_State* L)
+{
+    if (!power_isInitialized)
+    {
+        return 0;
+    }
+    power_finalize();
+    power_isInitialized = 0;
+    power = NULL;
+    return 0;
+}
+
+/* Lua binding: startPower(cpuId, type).
+ * cpuId must be >= 0; type is the power domain as seen from Lua and must lie
+ * in [PKG+1, DRAM+1]. One is subtracted from type before it is passed to
+ * power_start(). Pushes the 'before' field of the PowerData filled in by
+ * power_start(). Invalid arguments raise a Lua argument error. */
+static int lua_likwid_startPower(lua_State* L)
+{
+    PowerData pwrdata;
+    int cpuId = lua_tonumber(L,1);
+    /* The check accepts 0, so the message must say so as well. */
+    luaL_argcheck(L, cpuId >= 0, 1, "CPU ID must be greater or equal 0");
+    PowerType type = (PowerType) lua_tounsigned(L,2);
+    luaL_argcheck(L, type >= PKG+1 && type <= DRAM+1, 2, "Type not valid");
+    power_start(&pwrdata, cpuId, type-1);
+    lua_pushnumber(L,pwrdata.before);
+    return 1;
+}
+
+/* Lua binding: stopPower(cpuId, type).
+ * cpuId must be >= 0; type is the power domain as seen from Lua and must lie
+ * in [PKG+1, DRAM+1]. One is subtracted from type before it is passed to
+ * power_stop(). Pushes the 'after' field of the PowerData filled in by
+ * power_stop(). Invalid arguments raise a Lua argument error. */
+static int lua_likwid_stopPower(lua_State* L)
+{
+    PowerData pwrdata;
+    int cpuId = lua_tonumber(L,1);
+    /* The check accepts 0, so the message must say so as well. */
+    luaL_argcheck(L, cpuId >= 0, 1, "CPU ID must be greater or equal 0");
+    PowerType type = (PowerType) lua_tounsigned(L,2);
+    luaL_argcheck(L, type >= PKG+1 && type <= DRAM+1, 2, "Type not valid");
+    power_stop(&pwrdata, cpuId, type-1);
+    lua_pushnumber(L,pwrdata.after);
+    return 1;
+}
+
+/* Lua binding: printEnergy(before, after, domain).
+ * Rebuilds a PowerData record from the three Lua arguments and pushes the
+ * value power_printEnergy() computes from it. */
+static int lua_likwid_printEnergy(lua_State* L)
+{
+    PowerData sample;
+    sample.before = lua_tonumber(L,1);
+    sample.after = lua_tonumber(L,2);
+    sample.domain = lua_tonumber(L,3);
+    lua_pushnumber(L, power_printEnergy(&sample));
+    return 1;
+}
+
+/* Lua binding: power_limitGet(cpuId, domain).
+ * On success pushes two numbers, the power and time values written by
+ * power_limitGet(). When that call returns a negative value, pushes only
+ * that value. */
+static int lua_likwid_power_limitGet(lua_State* L)
+{
+    int cpuId = lua_tonumber(L,1);
+    int domain = lua_tonumber(L,2);
+    double limitPower = 0.0;
+    double limitTime = 0.0;
+    int status = power_limitGet(cpuId, domain, &limitPower, &limitTime);
+    if (status >= 0)
+    {
+        lua_pushnumber(L, limitPower);
+        lua_pushnumber(L, limitTime);
+        return 2;
+    }
+    lua_pushnumber(L, status);
+    return 1;
+}
+
+/* Lua binding: power_limitSet(cpuId, domain, power, time, clamp).
+ * Forwards the five arguments to power_limitSet() and pushes its return
+ * value as an integer. */
+static int lua_likwid_power_limitSet(lua_State* L)
+{
+    int cpuId = lua_tonumber(L,1);
+    int domain = lua_tonumber(L,2);
+    double limitPower = lua_tonumber(L,3);
+    double limitTime = lua_tonumber(L,4);
+    int doClamp = lua_tonumber(L,5);
+    int status = power_limitSet(cpuId, domain, limitPower, limitTime, doClamp);
+    lua_pushinteger(L, status);
+    return 1;
+}
+
+/* Lua binding: power_limitState(cpuId, domain).
+ * Pushes the value returned by power_limitState() for the given pair. */
+static int lua_likwid_power_limitState(lua_State* L)
+{
+    int cpu = lua_tonumber(L,1);
+    int dom = lua_tonumber(L,2);
+    lua_pushnumber(L, power_limitState(cpu, dom));
+    return 1;
+}
+
+/* Lua binding: pushes the value returned by timer_getCpuClock(), calling
+ * timer_init() first if the timer module has not been set up yet. */
+static int lua_likwid_getCpuClock(lua_State* L)
+{
+    if (!timer_isInitialized)
+    {
+        timer_init();
+        timer_isInitialized = 1;
+    }
+    lua_pushnumber(L, timer_getCpuClock());
+    return 1;
+}
+
+/* Lua binding: sleeps for the number of seconds given on top of the stack
+ * and pushes the return value of sleep(). */
+static int isleep(lua_State* L)
+{
+    long seconds = lua_tounsigned(L,-1);
+    int unslept = sleep(seconds);
+    lua_pushinteger(L, unslept);
+    return 1;
+}
+
+/* Lua binding: sleeps for the number of microseconds given on top of the
+ * stack and pushes the return value of usleep(). Intervals of 1000000 or
+ * more are rejected: no sleep happens and -1 is pushed. */
+static int iusleep(lua_State* L)
+{
+    unsigned long micros = lua_tounsigned(L,-1);
+    int status = (micros < 1000000) ? usleep(micros) : -1;
+    lua_pushinteger(L, status);
+    return 1;
+}
+
+/* Lua binding: calls timer_start() on a local TimerData and pushes the
+ * recorded start value converted to a double. Sets up the timer module on
+ * first use. */
+static int lua_likwid_startClock(lua_State* L)
+{
+    TimerData stamp;
+    if (!timer_isInitialized)
+    {
+        timer_init();
+        timer_isInitialized = 1;
+    }
+    timer_start(&stamp);
+    lua_pushnumber(L, (double)stamp.start.int64);
+    return 1;
+}
+
+/* Lua binding: calls timer_stop() on a local TimerData and pushes the
+ * recorded stop value converted to a double. Sets up the timer module on
+ * first use. */
+static int lua_likwid_stopClock(lua_State* L)
+{
+    TimerData stamp;
+    if (!timer_isInitialized)
+    {
+        timer_init();
+        timer_isInitialized = 1;
+    }
+    timer_stop(&stamp);
+    lua_pushnumber(L, (double)stamp.stop.int64);
+    return 1;
+}
+
+/* Lua binding: getClockCycles(start, stop).
+ * Rebuilds a TimerData from the two Lua numbers (as produced by the
+ * startClock/stopClock bindings) and pushes timer_printCycles() of it as a
+ * double. Sets up the timer module on first use. */
+static int lua_likwid_getClockCycles(lua_State* L)
+{
+    TimerData span;
+    double begin = lua_tonumber(L,1);
+    double end = lua_tonumber(L,2);
+    span.start.int64 = (uint64_t)begin;
+    span.stop.int64 = (uint64_t)end;
+    if (!timer_isInitialized)
+    {
+        timer_init();
+        timer_isInitialized = 1;
+    }
+    lua_pushnumber(L, (double)timer_printCycles(&span));
+    return 1;
+}
+
+/* Lua binding: getClock(start, stop).
+ * Rebuilds a TimerData from the two Lua numbers (as produced by the
+ * startClock/stopClock bindings) and pushes the value timer_print() returns
+ * for it. Sets up the timer module on first use. */
+static int lua_likwid_getClock(lua_State* L)
+{
+    TimerData span;
+    double begin, end, elapsed;
+    if (!timer_isInitialized)
+    {
+        timer_init();
+        timer_isInitialized = 1;
+    }
+    begin = lua_tonumber(L,1);
+    end = lua_tonumber(L,2);
+    span.start.int64 = (uint64_t)begin;
+    span.stop.int64 = (uint64_t)end;
+    elapsed = timer_print(&span);
+    lua_pushnumber(L, elapsed);
+    return 1;
+}
+
+/* Lua binding: calls thermal_init() for the CPU id on top of the stack.
+ * Returns nothing to Lua. */
+static int lua_likwid_initTemp(lua_State* L)
+{
+    int targetCpu = lua_tounsigned(L,-1);
+    thermal_init(targetCpu);
+    return 0;
+}
+
+/* Lua binding: reads the thermal value for the CPU id on top of the stack
+ * via thermal_read() and pushes it. A non-zero return from thermal_read()
+ * raises the Lua error "Cannot read thermal data". */
+static int lua_likwid_readTemp(lua_State* L)
+{
+    uint32_t reading;
+    int targetCpu = lua_tounsigned(L,-1);
+
+    if (thermal_read(targetCpu, &reading) != 0)
+    {
+        lua_pushstring(L,"Cannot read thermal data");
+        lua_error(L);
+    }
+    lua_pushnumber(L, reading);
+    return 1;
+}
+
+
+/* Number of SIGINTs seen by signal_catcher(). Declared as
+ * volatile sig_atomic_t because it is modified from a signal handler. */
+static volatile sig_atomic_t recv_sigint = 0;
+
+/* Signal handler: counts SIGINT deliveries, ignores every other signal. */
+static void signal_catcher(int signo) 
+{
+    if (signo == SIGINT)
+    {
+        recv_sigint++;
+    }
+    return;
+}
+
+/* Lua binding: installs signal_catcher() as the SIGINT handler.
+ * Returns nothing to Lua. */
+static int lua_likwid_catch_signal(lua_State* L)
+{
+    (void)L; /* the Lua state is not consulted */
+    signal(SIGINT, signal_catcher);
+    return 0;
+}
+
+/* Lua binding: pushes the current value of the SIGINT counter recv_sigint. */
+static int lua_likwid_return_signal_state(lua_State* L)
+{
+    int seen = recv_sigint;
+    lua_pushnumber(L, seen);
+    return 1;
+}
+
+/* Splits line in place into whitespace-separated tokens.
+ *
+ * Every blank, tab and newline in line is overwritten with '\0' and the
+ * start of each token is stored in argv, followed by a terminating NULL
+ * entry. Leading and trailing whitespace produce no tokens; an empty or
+ * all-whitespace line yields argv[0] == NULL.
+ *
+ * The caller must provide an argv array with room for every token plus the
+ * terminator; no bound is checked here. */
+void parse(char *line, char **argv)
+{
+     while (*line != '\0') {       /* if not the end of line ....... */ 
+          while (*line == ' ' || *line == '\t' || *line == '\n')
+               *line++ = '\0';     /* replace white spaces with 0    */
+          if (*line == '\0')       /* only trailing whitespace left: */
+               break;              /* do not store an empty argument */
+          *argv++ = line;          /* save the argument position     */
+          while (*line != '\0' && *line != ' ' && 
+                 *line != '\t' && *line != '\n') 
+               line++;             /* skip the argument until ...    */
+     }
+     *argv = NULL;                 /* mark the end of argument list  */
+}
+
+/* Non-zero while a program started by lua_likwid_startProgram() is assumed
+ * to be running. Declared as volatile sig_atomic_t because it is written
+ * from a signal handler. */
+static volatile sig_atomic_t program_running = 0;
+
+/* SIGCHLD handler: marks the started program as no longer running. */
+static void catch_sigchild(int signo) {
+    (void)signo;
+    program_running = 0;
+}
+
+/* Lua binding: startProgram(commandline).
+ *
+ * Splits the command line at whitespace, forks and executes it with
+ * execvp(). Pushes the child's pid on success. Returns no value to Lua when
+ * the command line is empty, memory for the working copy cannot be
+ * allocated, or fork() fails.
+ *
+ * program_running is set to 1 before the fork and reset by the SIGCHLD
+ * handler catch_sigchild(). */
+static int lua_likwid_startProgram(lua_State* L)
+{
+    pid_t pid, ppid;
+    char *exec;
+    char  *argv[4096];
+
+    /* parse() writes into its input, and the buffer returned by
+     * luaL_checkstring() belongs to Lua and must not be modified, so the
+     * command line is tokenized on a private copy. */
+    exec = strdup(luaL_checkstring(L, 1));
+    if (exec == NULL)
+    {
+        return 0;
+    }
+
+    parse(exec, argv);
+    if ((argv[0] == NULL) || (argv[0][0] == '\0'))
+    {
+        /* nothing to execute */
+        free(exec);
+        return 0;
+    }
+    ppid = getpid();
+    program_running = 1;
+    /* Install the handler before forking so that a child that exits
+     * immediately cannot be missed. */
+    signal(SIGCHLD, catch_sigchild);
+    pid = fork();
+    if (pid < 0)
+    {
+        program_running = 0;
+        free(exec);
+        return 0;
+    }
+    else if ( pid == 0)
+    {
+        execvp(*argv, argv);
+        /* execvp() only returns on failure. Notify the parent and leave
+         * with _exit() so the child neither flushes the stdio buffers it
+         * inherited nor returns into the Lua interpreter. */
+        kill(ppid, SIGCHLD);
+        _exit(1);
+    }
+    /* parent: the child has its own copy of the argument buffer */
+    free(exec);
+    lua_pushnumber(L, pid);
+    return 1;
+}
+
+/* Lua binding: pushes the program_running flag as a boolean. */
+static int lua_likwid_checkProgram(lua_State* L)
+{
+    int alive = program_running;
+    lua_pushboolean(L, alive);
+    return 1;
+}
+
+/* Lua binding: killProgram(pid).
+ * Sends SIGTERM to pid and clears program_running. Values <= 0 are ignored:
+ * kill() interprets 0 and negative pids as process groups, so a missing or
+ * bogus argument would otherwise signal the caller's own process group.
+ * Returns nothing to Lua. */
+static int lua_likwid_killProgram(lua_State* L)
+{
+    pid_t pid = lua_tonumber(L, 1);
+    if (pid > 0)
+    {
+        kill(pid, SIGTERM);
+        program_running = 0;
+    }
+    return 0;
+}
+
+
+/* Lua binding: memSweep(count, cpulist).
+ * Argument 1 is the number of entries, the value on top of the stack must be
+ * a table whose entries 1..count are read as CPU ids. The ids are handed to
+ * memsweep_threadGroup(). Raises a Lua error for a non-positive count or a
+ * missing table. Returns nothing to Lua. */
+static int lua_likwid_memSweep(lua_State* L)
+{
+    int threadCount = luaL_checknumber(L,1);
+    luaL_argcheck(L, threadCount > 0, 1, "Thread count must be greater than 0");
+    int cpuList[threadCount];
+    if (!lua_istable(L, -1))
+    {
+        lua_pushstring(L,"No table given as second argument");
+        lua_error(L);
+    }
+    for (int slot = 0; slot < threadCount; slot++)
+    {
+        /* Lua tables are 1-based, the C array is 0-based */
+        lua_rawgeti(L,-1,slot+1);
+        cpuList[slot] = lua_tounsigned(L,-1);
+        lua_pop(L,1);
+    }
+    memsweep_threadGroup(cpuList, threadCount);
+    return 0;
+}
+
+/* Lua binding: memSweepDomain(domain).
+ * Passes the non-negative domain id to memsweep_domain(); a negative id
+ * raises a Lua argument error. Returns nothing to Lua. */
+static int lua_likwid_memSweepDomain(lua_State* L)
+{
+    int domainId = luaL_checknumber(L,1);
+    luaL_argcheck(L, domainId >= 0, 1, "Domain ID must be greater or equal 0");
+    memsweep_domain(domainId);
+    return 0;
+}
+
+/* Lua binding: pinProcess(cpuID, silent).
+ * Reads the CPU id and the silent flag from the two topmost stack slots,
+ * initializes the affinity module on first use and calls
+ * affinity_pinProcess(). Unless silent is non-zero a confirmation line is
+ * printed to stdout (colored when COLOR is defined). A negative CPU id
+ * raises a Lua argument error. Returns nothing to Lua. */
+static int lua_likwid_pinProcess(lua_State* L)
+{
+    int targetCpu = luaL_checknumber(L,-2);
+    int quiet = luaL_checknumber(L,-1);
+    luaL_argcheck(L, targetCpu >= 0, 1, "CPU ID must be greater or equal 0");
+    if (!affinity_isInitialized)
+    {
+        affinity_init();
+        affinity_isInitialized = 1;
+        affinity = get_affinityDomains();
+    }
+    affinity_pinProcess(targetCpu);
+    if (quiet)
+    {
+        return 0;
+    }
+#ifdef COLOR
+    color_on(BRIGHT, COLOR);
+#endif
+    printf("[likwid-pin] Main PID -> core %d - OK",  targetCpu);
+#ifdef COLOR
+    color_reset();
+#endif
+    /* the newline is emitted after the color reset */
+    printf("\n");
+    return 0;
+}
+
+/* Lua binding: setenv(name, value).
+ * Takes both strings from the two topmost stack slots and sets the
+ * environment variable, overwriting an existing value. Returns nothing to
+ * Lua. */
+static int lua_likwid_setenv(lua_State* L)
+{
+    const char* varName = (const char*)luaL_checkstring(L, -2);
+    const char* varValue = (const char*)luaL_checkstring(L, -1);
+    setenv(varName, varValue, 1);
+    return 0;
+}
+
+/* Lua binding: pushes the process id returned by getpid(). */
+static int lua_likwid_getpid(lua_State* L)
+{
+    pid_t self = getpid();
+    lua_pushunsigned(L, self);
+    return 1;
+}
+
+/* Lua binding: setVerbosity(level).
+ * Stores the level from the top of the stack in perfmon_verbosity. Levels
+ * outside [0, DEBUGLEV_DEVELOP] raise a Lua argument error. Returns nothing
+ * to Lua. */
+static int lua_likwid_setVerbosity(lua_State* L)
+{
+    int level = lua_tointeger(L,-1);
+    int inRange = (level >= 0 && level <= DEBUGLEV_DEVELOP);
+    luaL_argcheck(L, inRange, -1, 
+                "Verbosity must be between 0 (only errors) and 3 (developer)");
+    perfmon_verbosity = level;
+    return 0;
+}
+
+/* Lua binding: access(file [, perm]).
+ *
+ * perm is a string built from the letters 'r', 'w', 'x' and 'e', which map
+ * to R_OK, W_OK, X_OK and F_OK; other characters are ignored. When perm is
+ * absent or nil only existence (F_OK) is checked. Pushes the return value
+ * of access().
+ *
+ * The optional argument is read with luaL_optstring(): luaL_checkstring()
+ * raises an error for a missing argument and never returns NULL, so the
+ * F_OK default could not be reached with it. */
+static int lua_likwid_access(lua_State* L)
+{
+    int flags = 0;
+    const char* file = luaL_checkstring(L, 1);
+    const char* perm = luaL_optstring(L, 2, NULL);
+    if (perm == NULL)
+    {
+        flags = F_OK;
+    }
+    else
+    {
+        for (const char* p = perm; *p != '\0'; p++)
+        {
+            switch (*p)
+            {
+                case 'r':
+                    flags |= R_OK;
+                    break;
+                case 'w':
+                    flags |= W_OK;
+                    break;
+                case 'x':
+                    flags |= X_OK;
+                    break;
+                case 'e':
+                    flags |= F_OK;
+                    break;
+                default:
+                    break;
+            }
+        }
+    }
+    lua_pushinteger(L, access(file, flags));
+    return 1;
+}
+
+int luaopen_liblikwid(lua_State* L){ /* registers every binding below as a global Lua function via lua_register() */
+    // Configuration functions
+    lua_register(L, "likwid_getConfiguration", lua_likwid_getConfiguration);
+    lua_register(L, "likwid_putConfiguration", lua_likwid_putConfiguration);
+    // Perfmon functions
+    //lua_register(L, "accessClient_setaccessmode",lua_accessClient_setaccessmode);
+    lua_register(L, "likwid_setAccessClientMode",lua_likwid_setAccessMode);
+    lua_register(L, "likwid_init",lua_likwid_init);
+    lua_register(L, "likwid_addEventSet", lua_likwid_addEventSet);
+    lua_register(L, "likwid_setupCounters",lua_likwid_setupCounters);
+    lua_register(L, "likwid_startCounters",lua_likwid_startCounters);
+    lua_register(L, "likwid_stopCounters",lua_likwid_stopCounters);
+    lua_register(L, "likwid_readCounters",lua_likwid_readCounters);
+    lua_register(L, "likwid_switchGroup",lua_likwid_switchGroup);
+    lua_register(L, "likwid_finalize",lua_likwid_finalize);
+    lua_register(L, "likwid_getEventsAndCounters", lua_likwid_getEventsAndCounters);
+    // Perfmon results functions
+    lua_register(L, "likwid_getResult",lua_likwid_getResult);
+    lua_register(L, "likwid_getNumberOfGroups",lua_likwid_getNumberOfGroups);
+    lua_register(L, "likwid_getRuntimeOfGroup", lua_likwid_getRuntimeOfGroup);
+    lua_register(L, "likwid_getIdOfActiveGroup",lua_likwid_getIdOfActiveGroup);
+    lua_register(L, "likwid_getNumberOfEvents",lua_likwid_getNumberOfEvents);
+    lua_register(L, "likwid_getNumberOfThreads",lua_likwid_getNumberOfThreads);
+    // Topology functions
+    lua_register(L, "likwid_getCpuInfo",lua_likwid_getCpuInfo);
+    lua_register(L, "likwid_getCpuTopology",lua_likwid_getCpuTopology);
+    lua_register(L, "likwid_putTopology",lua_likwid_putTopology);
+    lua_register(L, "likwid_getNumaInfo",lua_likwid_getNumaInfo);
+    lua_register(L, "likwid_putNumaInfo",lua_likwid_putNumaInfo);
+    lua_register(L, "likwid_setMemInterleaved", lua_likwid_setMemInterleaved);
+    lua_register(L, "likwid_getAffinityInfo",lua_likwid_getAffinityInfo);
+    lua_register(L, "likwid_putAffinityInfo",lua_likwid_putAffinityInfo);
+    lua_register(L, "likwid_getPowerInfo",lua_likwid_getPowerInfo);
+    lua_register(L, "likwid_putPowerInfo",lua_likwid_putPowerInfo);
+    lua_register(L, "likwid_getOnlineDevices", lua_likwid_getOnlineDevices);
+    lua_register(L, "likwid_printSupportedCPUs", lua_likwid_printSupportedCPUs);
+    // Timer functions
+    lua_register(L, "likwid_getCpuClock",lua_likwid_getCpuClock);
+    lua_register(L, "likwid_startClock",lua_likwid_startClock);
+    lua_register(L, "likwid_stopClock",lua_likwid_stopClock);
+    lua_register(L, "likwid_getClockCycles",lua_likwid_getClockCycles);
+    lua_register(L, "likwid_getClock",lua_likwid_getClock);
+    lua_register(L, "sleep",isleep); /* note: exported without the likwid_ prefix */
+    lua_register(L, "usleep",iusleep); /* note: exported without the likwid_ prefix */
+    // Power functions
+    lua_register(L, "likwid_startPower",lua_likwid_startPower);
+    lua_register(L, "likwid_stopPower",lua_likwid_stopPower);
+    lua_register(L, "likwid_printEnergy",lua_likwid_printEnergy);
+    lua_register(L, "likwid_powerLimitGet",lua_likwid_power_limitGet);
+    lua_register(L, "likwid_powerLimitSet",lua_likwid_power_limitSet);
+    lua_register(L, "likwid_powerLimitState",lua_likwid_power_limitState);
+    // Temperature functions
+    lua_register(L, "likwid_initTemp",lua_likwid_initTemp);
+    lua_register(L, "likwid_readTemp",lua_likwid_readTemp);
+    // MemSweep functions
+    lua_register(L, "likwid_memSweep", lua_likwid_memSweep);
+    lua_register(L, "likwid_memSweepDomain", lua_likwid_memSweepDomain);
+    // Pinning functions
+    lua_register(L, "likwid_pinProcess", lua_likwid_pinProcess);
+    // Helper functions
+    lua_register(L, "likwid_setenv", lua_likwid_setenv);
+    lua_register(L, "likwid_getpid", lua_likwid_getpid);
+    lua_register(L, "likwid_access", lua_likwid_access);
+    lua_register(L, "likwid_startProgram", lua_likwid_startProgram);
+    lua_register(L, "likwid_checkProgram", lua_likwid_checkProgram);
+    lua_register(L, "likwid_killProgram", lua_likwid_killProgram);
+    // Verbosity functions
+    lua_register(L, "likwid_setVerbosity", lua_likwid_setVerbosity);
+    lua_register(L, "likwid_catchSignal", lua_likwid_catch_signal);
+    lua_register(L, "likwid_getSignalState", lua_likwid_return_signal_state);
+    return 0; /* nothing is left on the Lua stack for the caller */
+}
diff --git a/src/memsweep.c b/src/memsweep.c
index 8abf796..dfeb74d 100644
--- a/src/memsweep.c
+++ b/src/memsweep.c
@@ -5,13 +5,13 @@
  *
  *      Description:  Implementation of sweeper module.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -37,13 +37,12 @@
 #include <error.h>
 #include <types.h>
 #include <memsweep.h>
-#include <cpuid.h>
+#include <topology.h>
 #include <numa.h>
 #include <affinity.h>
 
 extern void _loadData(uint32_t size, void* ptr);
 
-
 /* #####   EXPORTED VARIABLES   ########################################### */
 
 
@@ -57,14 +56,14 @@ static uint64_t  memoryFraction = 80ULL;
 
 /* #####   FUNCTION DEFINITIONS  -  LOCAL TO THIS SOURCE FILE   ########### */
 
-static void*
+static void* 
 allocateOnNode(size_t size, int domainId)
 {
-    char *ptr; 
+	char *ptr; 
 
-    ptr = mmap(0, size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);
+	ptr = mmap(0, size, PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, 0, 0);  
 
-    if (ptr == (char *)-1)
+	if (ptr == (char *)-1)
     {
         ERROR;
     }
@@ -74,7 +73,7 @@ allocateOnNode(size_t size, int domainId)
     return ptr;
 }
 
-static void
+static void 
 initMemory(size_t size, char* ptr, int domainId)
 {
     affinity_pinProcess(numa_info.nodes[domainId].processors[0]);
@@ -101,20 +100,18 @@ findProcessor(uint32_t nodeId, uint32_t coreId)
 }
 
 /* evict all dirty cachelines from last level cache */
-static void cleanupCache(FILE* OUTSTREAM, char* ptr)
+static void cleanupCache(char* ptr) /* ptr is handed to _loadData() together with twice the last cache level's size */
 {
-#ifdef __x86_64
+#if defined(__x86_64__) || defined(__i386__)
     uint32_t cachesize = 2 * cpuid_topology.cacheLevels[cpuid_topology.numCacheLevels-1].size;
-    if (OUTSTREAM != NULL)
-    {
-        fprintf(OUTSTREAM, "Cleanup LLC using %u MB\n", cachesize / (1000000));
-    }
+    printf("Cleaning LLC with %g MB\n", (double)cachesize/(1024.0 * 1024.0)); /* assumes .size is in bytes - TODO confirm */
     _loadData(cachesize,ptr);
 #else
-    ERROR_PLAIN_PRINT(Cleanup cache is currently only available on 64bit X86 systems.);
+    ERROR_PLAIN_PRINT(Cleanup cache is currently only available on X86 systems.);
 #endif
 }
 
+
 /* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
 
 void
@@ -125,35 +122,32 @@ memsweep_setMemoryFraction(uint64_t fraction)
 
 
 void
-memsweep_node(FILE* OUTSTREAM)
+memsweep_node(void) /* sweeps every domain 0 .. numa_info.numberOfNodes-1, in index order */
 {
     for ( uint32_t i=0; i < numa_info.numberOfNodes; i++)
     {
-        memsweep_domain(OUTSTREAM, i);
+        memsweep_domain(i);
     }
 }
 
 
 void
-memsweep_domain(FILE* OUTSTREAM, int domainId)
+memsweep_domain(int domainId) /* maps, initializes and unmaps memoryFraction percent of node domainId's totalMemory */
 {
     char* ptr = NULL;
     size_t size = numa_info.nodes[domainId].totalMemory * 1024ULL * memoryFraction / 100ULL;
-    if (OUTSTREAM != NULL)
-    {
-        fprintf(OUTSTREAM, "Sweeping domain %d: Using %g MB of %g MB\n",
-                domainId,
-                size / (1000.0 * 1000.0),
-                numa_info.nodes[domainId].totalMemory/ 1000.0);
-    }
+    printf("Sweeping domain %d: Using %g MB of %g MB\n",
+            domainId,
+            size / (1024.0 * 1024.0), /* size is the byte count passed to the allocation below */
+            numa_info.nodes[domainId].totalMemory/ 1024.0); /* presumably kB, since size scales it by 1024 - TODO confirm */
     ptr = (char*) allocateOnNode(size, domainId);
     initMemory(size, ptr, domainId);
-    cleanupCache(OUTSTREAM, ptr);
+    cleanupCache(ptr);
     munmap(ptr, size);
 }
 
 void
-memsweep_threadGroup(FILE* OUTSTREAM, int* processorList, int numberOfProcessors)
+memsweep_threadGroup(int* processorList, int numberOfProcessors)
 {
     for (uint32_t i=0; i<numa_info.numberOfNodes; i++)
     {
@@ -161,10 +155,13 @@ memsweep_threadGroup(FILE* OUTSTREAM, int* processorList, int numberOfProcessors
         {
             if (findProcessor(i,processorList[j]))
             {
-                memsweep_domain(OUTSTREAM, i);
+                memsweep_domain(i);
                 break;
             }
         }
     }
 }
 
+
+
+
diff --git a/src/msr.c b/src/msr.c
index cb867f2..a17e3a9 100644
--- a/src/msr.c
+++ b/src/msr.c
@@ -6,16 +6,17 @@
  *      Description:  Implementation of msr module.
  *                   Provides API to read and write values to the model
  *                   specific registers on x86 processors using the msr
- *                   sys interface of the Linux 2.6 kernel. This module 
+ *                   sys interface of the Linux 2.6 kernel. This module
  *                   is based on the msr-util tools.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -42,36 +43,40 @@
 #include <unistd.h>
 #include <signal.h>
 #include <sys/types.h>
+#include <sys/wait.h>
 #include <sys/stat.h>
 #include <sys/socket.h>
 #include <sys/un.h>
-#include <sys/wait.h>
 
 #include <types.h>
 #include <error.h>
-#include <cpuid.h>
+#include <topology.h>
 #include <accessClient.h>
 #include <msr.h>
 #include <registers.h>
-
+#ifdef LIKWID_PROFILE_COUNTER_READ
+#include <timer.h>
+#endif
 /* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
 #define MAX_LENGTH_MSR_DEV_NAME  20
 #define STRINGIFY(x) #x
 #define TOSTRING(x) STRINGIFY(x)
 
 /* #####   VARIABLES  -  LOCAL TO THIS SOURCE FILE   ###################### */
-static int FD[MAX_NUM_THREADS];
-static int socket_fd = -1;
+static int FD[MAX_NUM_THREADS] = { [0 ... MAX_NUM_THREADS-1] = -1 };
 static int rdpmc_works = 0;
 
 /* #####   FUNCTION DEFINITIONS  -  LOCAL TO THIS SOURCE FILE   ########### */
+
 static inline int __rdpmc(int counter, uint64_t* value)
 {
     unsigned low, high;
+    /* RDPMC takes the counter index in ECX and returns the value split across EAX (low) and EDX (high). */
     __asm__ volatile("rdpmc" : "=a" (low), "=d" (high) : "c" (counter));
     *value = ((low) | ((uint64_t )(high) << 32));
     return 0;
 }
+
 //Needed for rdpmc check
 void segfault_sigaction(int signal, siginfo_t *si, void *arg)
 {
@@ -89,9 +94,10 @@ int test_rdpmc(int flag)
     sigemptyset(&sa.sa_mask);
     sa.sa_sigaction = segfault_sigaction;
     sa.sa_flags   = SA_SIGINFO;
-
+    
+    
     pid = fork();
-
+    
     if (pid < 0)
     {
         return -1;
@@ -121,64 +127,90 @@ int test_rdpmc(int flag)
 /* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
 
 
-void
+int
 msr_init(int initSocket_fd)
 {
-    if (accessClient_mode == DAEMON_AM_DIRECT)
+    int fd = 0;
+    int i = 0;
+    
+    if (accessClient_mode == ACCESSMODE_DIRECT)
     {
+        int fd;
         char* msr_file_name = (char*) malloc(MAX_LENGTH_MSR_DEV_NAME * sizeof(char));
+        if (!msr_file_name)
+        {    
+            return -ENOMEM;
+        }
 
         sprintf(msr_file_name,"/dev/msr0");
-        if( access( msr_file_name, F_OK ) == -1 )
+        fd = open(msr_file_name, O_RDWR);
+        if (fd < 0)
         {
             sprintf(msr_file_name,"/dev/cpu/0/msr");
         }
-
-        if (access(msr_file_name, R_OK|W_OK))
+        else
+        {
+            close(fd);
+        }
+        fd = open(msr_file_name, O_RDWR);   
+        if (fd < 0)
         {
-            ERROR_PRINT(Cannot access MSR device file %s: %s.\n
-                        Please check if 'msr' module is loaded and device files have correct permissions\n
-                        Alternatively you might want to look into (sys)daemonmode\n,msr_file_name , strerror(errno));
+            ERROR_PRINT(Cannot access MSR device file %s: %s.,msr_file_name , strerror(errno))
+            ERROR_PLAIN_PRINT(Please check if 'msr' module is loaded and device files have correct permissions);
+            ERROR_PLAIN_PRINT(Alternatively you might want to look into (sys)daemonmode);
             free(msr_file_name);
-            exit(127);
+            return -EPERM;
+        }
+        else
+        {
+            close(fd);
         }
         rdpmc_works = test_rdpmc(0);
 
         /* NOTICE: This assumes consecutive processor Ids! */
-        for ( uint32_t i=0; i < cpuid_topology.numHWThreads; i++ )
+        for ( i=0; i < cpuid_topology.numHWThreads; i++ )
         {
-            sprintf(msr_file_name,"/dev/msr%d",i);
-            if( access( msr_file_name, F_OK ) == -1 )
+            sprintf(msr_file_name,"/dev/msr%d",cpuid_topology.threadPool[i].apicId);
+            fd = open(msr_file_name, O_RDWR); 
+            if (fd < 0)
+            {
+                sprintf(msr_file_name,"/dev/cpu/%d/msr",cpuid_topology.threadPool[i].apicId);
+            }
+            else
             {
-                sprintf(msr_file_name,"/dev/cpu/%d/msr",i);
+                close(fd);
             }
-            FD[i] = open(msr_file_name, O_RDWR);
-            if ( FD[i] < 0 )
+            FD[cpuid_topology.threadPool[i].apicId] = open(msr_file_name, O_RDWR);
+            if ( FD[cpuid_topology.threadPool[i].apicId] < 0 )
             {
-                ERROR_PRINT(Cannot access MSR device file %s: %s\n,
-                                msr_file_name , strerror(errno));
+                ERROR_PRINT(Cannot access MSR device file %s in direct mode, msr_file_name);
                 free(msr_file_name);
-                ERROR;
+                return -EPERM;
             }
+            DEBUG_PRINT(DEBUGLEV_DEVELOP, Opened MSR for CPU %d FD %d,
+                                        cpuid_topology.threadPool[i].apicId,
+                                        FD[cpuid_topology.threadPool[i].apicId]);
         }
         free(msr_file_name);
     }
     else
     {
+        fd = 1;
         socket_fd = initSocket_fd;
     }
+    return 0;
 }
 
 void
 msr_finalize(void)
 {
-    if (accessClient_mode == DAEMON_AM_DIRECT)
+    int i = 0;
+    if (accessClient_mode == ACCESSMODE_DIRECT)
     {
-        for ( uint32_t i=0; i < cpuid_topology.numHWThreads; i++ )
+        for (i=0; i < cpuid_topology.numHWThreads; i++ )
         {
             close(FD[i]);
         }
-        rdpmc_works = 0;
     }
     else
     {
@@ -187,121 +219,193 @@ msr_finalize(void)
 }
 
 
-uint64_t 
-msr_tread(const int tsocket_fd, const int cpu, uint32_t reg)
+int /* 0 on success with the value in *data; -EIO, -EBADFD or the accessClient_read() code on failure */
+msr_tread(const int tsocket_fd, const int cpu, uint32_t reg, uint64_t *data)
 {
-    if (accessClient_mode == DAEMON_AM_DIRECT) 
-    {
-        uint64_t data;
+    int ret = 0;
 
-        if (rdpmc_works && reg >= MSR_PMC0 && reg <=MSR_PMC3)
+    if (accessClient_mode == ACCESSMODE_DIRECT)
+    {
+        if (rdpmc_works && reg >= MSR_PMC0 && reg <=MSR_PMC7)
         {
-            if (__rdpmc(reg - MSR_PMC0, &data) )
+            if (__rdpmc(reg - MSR_PMC0, data) )
             {
-                ERROR_PRINT(Cannot read MSR reg 0x%x with RDPMC instruction on CPU %d\n,
-                        reg,cpu);
+                //ERROR_PRINT(Cannot read MSR reg 0x%x with RDPMC instruction on CPU %d, reg, cpu);
+                return -EIO;
             }
         }
-        else if (rdpmc_works && reg >= MSR_PERF_FIXED_CTR0 && reg <= MSR_PERF_FIXED_CTR2)
+        /*else if (rdpmc_works && reg >= MSR_PERF_FIXED_CTR0 && reg <= MSR_PERF_FIXED_CTR2)
         {
-            if (__rdpmc(0x4000000ULL + (reg - MSR_PERF_FIXED_CTR0), &data) )
+            if (__rdpmc(0x4000000ULL + (reg - MSR_PERF_FIXED_CTR0), data) )
             {
-                ERROR_PRINT(Cannot read MSR reg 0x%x with RDPMC instruction on CPU %d\n,
-                        reg,cpu);
+                ERROR_PRINT(Cannot read MSR reg 0x%x with RDPMC instruction on CPU %d,reg,cpu);
+                return -EIO;
             }
-        }
+        }*/
         else
         {
-            if ( pread(FD[cpu], &data, sizeof(data), reg) != sizeof(data) )
+            if (FD[cpu] >= 0) /* unopened slots hold -1; descriptor 0 is a valid open file */
             {
-                ERROR_PRINT(Cannot read MSR reg 0x%x with RDMSR instruction on CPU %d\n,
-                        reg, cpu);
+                ret = pread(FD[cpu], data, sizeof(*data), reg);
+                if (ret  != sizeof(*data) )
+                {
+                    //ERROR_PRINT(Cannot read MSR reg 0x%x with RDMSR instruction on CPU %d, reg, cpu);
+                    return -EIO;
+                }
+            }
+            else
+            {
+                //ERROR_PRINT(MSR device for CPU %d not found, cpu);
+                return -EBADFD;
             }
         }
-
-        return data;
     }
     else
     { /* daemon or sysdaemon-mode */
-        return accessClient_read(tsocket_fd, cpu, DAEMON_AD_MSR, reg);
+        if (tsocket_fd != -1)
+        {
+            ret = accessClient_read(tsocket_fd, cpu, DAEMON_AD_MSR, reg, data);
+            if (ret)
+            {
+                //ERROR_PRINT(Cannot read MSR reg 0x%x through accessDaemon on CPU %d, reg, cpu);
+                return ret;
+            }
+        }
+        else
+        {
+            //ERROR_PLAIN_PRINT(Bad socket to accessDaemon);
+            return -EBADFD;
+        }
     }
+    return 0;
 }
 
 
-void 
+int /* 0 on success; -EIO, -EBADFD or the accessClient_write() code on failure */
 msr_twrite(const int tsocket_fd, const int cpu, uint32_t reg, uint64_t data)
 {
-    if (accessClient_mode == DAEMON_AM_DIRECT) 
+    int ret;
+    if (accessClient_mode == ACCESSMODE_DIRECT)
     {
-        if (pwrite(FD[cpu], &data, sizeof(data), reg) != sizeof(data))
+        if (FD[cpu] >= 0) /* unopened slots hold -1; descriptor 0 is a valid open file */
         {
-            ERROR_PRINT(Cannot write MSR reg 0x%x with WRMSR instruction on CPU %d\n,
-                        reg, cpu);
+            ret = pwrite(FD[cpu], &data, sizeof(data), reg);
+            if (ret != sizeof(data))
+            {
+                //ERROR_PRINT(Cannot write MSR reg 0x%x with WRMSR instruction on CPU %d\n,
+                //            reg, cpu);
+                return -EIO;
+            }
+        }
+        else
+        {
+            //ERROR_PRINT(MSR device for CPU %d not found, cpu);
+            return -EBADFD;
         }
     }
     else
     { /* daemon or sysdaemon-mode */
-        accessClient_write(tsocket_fd, cpu, DAEMON_AD_MSR, reg, data);
+        if (tsocket_fd != -1)
+        {
+            ret = accessClient_write(tsocket_fd, cpu, DAEMON_AD_MSR, reg, data);
+            if (ret)
+            {
+                return ret;
+            }
+        }
+        else
+        {
+            //ERROR_PLAIN_PRINT(Bad socket to accessDaemon);
+            return -EBADFD;
+        }
     }
+    return 0;
 }
 
 
-uint64_t 
-msr_read( const int cpu, uint32_t reg)
+int /* 0 on success with the value in *data; -EIO, -EBADFD or the accessClient_read() code on failure */
+msr_read( const int cpu, uint32_t reg, uint64_t *data)
 {
-    if (accessClient_mode == DAEMON_AM_DIRECT) 
+    int ret;
+    if (accessClient_mode == ACCESSMODE_DIRECT)
     {
-        uint64_t data;
-
-        if (rdpmc_works && reg >= MSR_PMC0 && reg <=MSR_PMC3)
+        if (rdpmc_works && reg >= MSR_PMC0 && reg <=MSR_PMC7)
         {
-            if (__rdpmc(reg - MSR_PMC0, &data) )
+            if (__rdpmc(reg - MSR_PMC0, data) )
             {
                 ERROR_PRINT(Cannot read MSR reg 0x%x with RDPMC instruction on CPU %d\n,
                         reg,cpu);
+                return -EIO;
             }
         }
         else if (rdpmc_works && reg >= MSR_PERF_FIXED_CTR0 && reg <= MSR_PERF_FIXED_CTR2)
         {
-            if (__rdpmc(0x4000000ULL + (reg - MSR_PERF_FIXED_CTR0), &data) )
+            if (__rdpmc(0x40000000ULL + (reg - MSR_PERF_FIXED_CTR0), data) ) /* ECX bit 30 selects the fixed-function counters */
             {
-                ERROR_PRINT(Cannot read MSR reg 0x%x with RDPMC instruction on CPU %d\n,
-                        reg,cpu);
+                ERROR_PRINT(Cannot read MSR reg 0x%x with RDPMC instruction on CPU %d,reg,cpu);
+                return -EIO;
             }
         }
         else
         {
-            if ( pread(FD[cpu], &data, sizeof(data), reg) != sizeof(data) )
+            if ( pread(FD[cpu], data, sizeof(*data), reg) != sizeof(*data) )
             {
-                ERROR_PRINT(Cannot read MSR reg 0x%x with RDMSR instruction on CPU %d\n,
+                ERROR_PRINT(Cannot read MSR reg 0x%x with RDMSR instruction on CPU %d,
                         reg, cpu);
+                return -EIO;
             }
         }
-
-        return data;
     }
     else
     { /* daemon or sysdaemon-mode */
-        return accessClient_read(socket_fd, cpu, DAEMON_AD_MSR, reg);
+        if (socket_fd != -1)
+        {
+            ret = accessClient_read(socket_fd, cpu, DAEMON_AD_MSR, reg, data);
+            if (ret)
+            {
+                ERROR_PRINT(Cannot read MSR reg 0x%x through accessDaemon on CPU %d, reg, cpu);
+                return ret;
+            }
+        }
+        else
+        {
+            ERROR_PLAIN_PRINT(Bad socket to accessDaemon);
+            return -EBADFD;
+        }
     }
+    return 0;
 }
 
 
-void
+int /* 0 on success; -EIO, -EBADFD or the accessClient_write() code on failure */
 msr_write( const int cpu, uint32_t reg, uint64_t data)
 {
-    if (accessClient_mode == DAEMON_AM_DIRECT) 
+    int ret;
+    if (accessClient_mode == ACCESSMODE_DIRECT)
     {
-        if (pwrite(FD[cpu], &data, sizeof(data), reg) != sizeof(data))
+        ret = pwrite(FD[cpu], &data, sizeof(data), reg);
+        if (ret != sizeof(data))
         {
-            ERROR_PRINT(Cannot write MSR reg 0x%x with WRMSR instruction on CPU %d\n,
-                        reg, cpu);
+            return -EIO; /* a raw pwrite() result (-1 or a short positive count) is not an error code */
         }
     }
     else
     { /* daemon or sysdaemon-mode */
-        accessClient_write(socket_fd, cpu, DAEMON_AD_MSR, reg, data);
+        if (socket_fd != -1)
+        {
+            ret = accessClient_write(socket_fd, cpu, DAEMON_AD_MSR, reg, data);
+            if (ret)
+            {
+                return ret;
+            }
+        }
+        else
+        {
+            ERROR_PLAIN_PRINT(Bad socket to accessDaemon);
+            return -EBADFD;
+        }
     }
+    return 0;
 }
 
 
diff --git a/src/multiplex.c b/src/multiplex.c
deleted file mode 100644
index 68a6b88..0000000
--- a/src/multiplex.c
+++ /dev/null
@@ -1,165 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  multiplex.c
- *
- *      Description:  
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <errno.h>
-#include <sys/types.h>
-#include <signal.h>
-#include <sys/time.h>
-
-#include <timer.h>
-#include <perfmon.h>
-#include <multiplex.h>
-
-#if 0
-static int currentCollection = -1;
-static MultiplexCollections* multiplex_set = NULL;
-static TimerData timeData;
-static int  multiplex_useMarker = 0;
-
-void
-multiplex_printCounters ()
-{
-
-
-
-}
-
-
-
-void
-multiplex_swapEventSet ()
-{
-    int threadId;
-    PerfmonEventSet* collection;
-
-    /* collection from last run */
-    collection = multiplex_set->collections + currentCollection;
-
-    for (threadId = 0; threadId < perfmon_numThreads; threadId++)
-    {
-        /* Stop counters */
-        if (!multiplex_useMarker) perfmon_stopCountersThread(threadId);
-        /* Accumulate counters */
-        for (int i=0; i<collection->numberOfEvents; i++)
-        {
-//            collection->events[i].result[threadId] += 
- //               (double) perfmon_threadData[threadId].counters[collection->events[i].index].counterData;
-        }
-    }
-
-    /* switch to next collection */
-    if( currentCollection == multiplex_set->numberOfCollections-1)
-    {
-        currentCollection = 0;
-    }
-    else
-    {
-        currentCollection++;
-    }
-    collection = multiplex_set->collections + currentCollection;
-
-    for (threadId = 0; threadId < perfmon_numThreads; threadId++)
-    {
-        /* Reconfigure counters */
-        for (int i=0; i<collection->numberOfEvents; i++)
-        {
-            perfmon_setupCounterThread(threadId,
-                    collection->events[i].event.eventId,
-                    collection->events[i].event.umask,
-                    collection->events[i].index);
-        }
-
-        /* Start counters */
-       if (!multiplex_useMarker)  perfmon_startCountersThread(threadId);
-    }
-}
-
-void
-multiplex_init(MultiplexCollections* set)
-{
-    int i;
-
-    multiplex_set = set;
-
-    for (i=0;i<multiplex_set->numberOfCollections; i++)
-    {
-//        perfmon_initEventset(multiplex_set->collections+i);
-    }
-}
-
-void
-multiplex_start()
-{
-    struct itimerval val;
-    struct sigaction sa;
-
-//    multiplex_useMarker = useMarker;
-
-    val.it_interval.tv_sec = 0;
-    val.it_interval.tv_usec = 500;
-    val.it_value.tv_sec = 0; 
-    val.it_value.tv_usec = 100;
-
-    sa.sa_handler = multiplex_printCounters;
-    sigemptyset(&sa.sa_mask);
-    sa.sa_flags = SA_RESTART;
-    if (sigaction(SIGALRM, &sa, NULL) == -1)
-    {
-        /* Handle error */;
-        perror("sigaction");
-    }
-
-    perfmon_startCounters();
-    setitimer(ITIMER_REAL, &val,0);
-    timer_start(&timeData);
-}
-
-void
-multiplex_stop()
-{
-    struct itimerval val;
-
-    val.it_interval.tv_sec = 0;
-    val.it_interval.tv_usec = 0;
-    val.it_value.tv_sec = 0; 
-    val.it_value.tv_usec = 0;
-
-    timer_stop(&timeData);
-    setitimer(ITIMER_REAL, &val,0);
-    perfmon_stopCounters();
-
-    multiplex_set->time = timer_print(&timeData);
-}
-
-#endif
-
-
diff --git a/src/numa.c b/src/numa.c
index 2f72765..970e4e1 100644
--- a/src/numa.c
+++ b/src/numa.c
@@ -3,15 +3,17 @@
  *
  *      Filename:  numa.c
  *
- *      Description:  Implementation of Linux NUMA interface
+ *      Description:  Implementation of Linux NUMA interface. Selects between hwloc and
+ *                    procfs/sysfs backends.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -37,352 +39,157 @@
 #include <sched.h>
 #include <sys/syscall.h>
 #include <sys/types.h>
+#include <error.h>
 #include <dirent.h>
 #ifdef HAS_MEMPOLICY
 #include <linux/mempolicy.h>
 #endif
+#include <topology.h>
+
+#include <configuration.h>
 
 #include <error.h>
 #include <bstrlib.h>
+//#include <strUtil.h>
+
 #include <numa.h>
-#include <strUtil.h>
+#include <numa_proc.h>
 
-/* #####   EXPORTED VARIABLES   ########################################### */
+#ifdef LIKWID_USE_HWLOC
+#include <hwloc.h>
+#include <topology_hwloc.h>
+#include <numa_hwloc.h>
+#endif
 
 
-NumaTopology numa_info;
+/* #####   EXPORTED VARIABLES   ########################################### */
+NumaTopology numa_info = {0,NULL};
 
 /* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
-
-#ifdef HAS_MEMPOLICY
-#define get_mempolicy(policy,nmask,maxnode,addr,flags) syscall(SYS_get_mempolicy,policy,nmask,maxnode,addr,flags)
-#define set_mempolicy(mode,nmask,maxnode) syscall(SYS_set_mempolicy,mode,nmask,maxnode)
-#define mbind(start, len, nmask, maxnode, flags) syscall(SYS_mbind,(start),len,MPOL_BIND,(nmask),maxnode,flags)
-#endif
-
 /* #####   VARIABLES  -  LOCAL TO THIS SOURCE FILE   ###################### */
-
-static int maxIdConfiguredNode = 0;
-
 /* #####   FUNCTION DEFINITIONS  -  LOCAL TO THIS SOURCE FILE   ########### */
-
-static void
-setConfiguredNodes(void)
+/* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
+/* Parse a base-10 number from str; returns -EINVAL on overflow or if no digits are found. */
+int str2int(const char* str)
 {
-    DIR *dir;
-    struct dirent *de;
-
-    dir = opendir("/sys/devices/system/node");
+    char* endptr;
+    errno = 0; /* strtoul() reports overflow only through errno, so clear stale values */
+    unsigned long val = strtoul(str, &endptr, 10);
 
-    if (!dir) 
+    /* strtoul() saturates at ULONG_MAX, not LONG_MAX, so the old test never fired */
+    if ((errno == ERANGE && val == ULONG_MAX) || (errno != 0 && val == 0))
     {
-        maxIdConfiguredNode = 0;
+        fprintf(stderr, "Value in string out of range\n");
+        return -EINVAL;
     }
-    else
-    {
-        while ((de = readdir(dir)) != NULL) 
-        {
-            int nd;
-            if (strncmp(de->d_name, "node", 4))
-            {
-                continue;
-            }
-
-            nd = str2int(de->d_name+4);
 
-            if (maxIdConfiguredNode < nd)
-            {
-                maxIdConfiguredNode = nd;
-            }
-        }
-        closedir(dir);
-    }
-}
-
-
-static void
-nodeMeminfo(int node, uint64_t* totalMemory, uint64_t* freeMemory)
-{
-    FILE *fp;
-    bstring filename;
-    bstring totalString = bformat("MemTotal:");
-    bstring freeString  = bformat("MemFree:");
-    int i;
-
-    filename = bformat("/sys/devices/system/node/node%d/meminfo", node);
-
-    if (NULL != (fp = fopen (bdata(filename), "r"))) 
-    {
-        bstring src = bread ((bNread) fread, fp);
-        struct bstrList* tokens = bsplit(src,(char) '\n');
-
-        for (i=0;i<tokens->qty;i++)
-        {
-            if (binstr(tokens->entry[i],0,totalString) != BSTR_ERR)
-            {
-                 bstring tmp = bmidstr (tokens->entry[i], 18, blength(tokens->entry[i])-18 );
-                 bltrimws(tmp);
-                 struct bstrList* subtokens = bsplit(tmp,(char) ' ');
-                 *totalMemory = str2int(bdata(subtokens->entry[0]));
-            }
-            else if (binstr(tokens->entry[i],0,freeString) != BSTR_ERR)
-            {
-                 bstring tmp = bmidstr (tokens->entry[i], 18, blength(tokens->entry[i])-18  );
-                 bltrimws(tmp);
-                 struct bstrList* subtokens = bsplit(tmp,(char) ' ');
-                 *freeMemory = str2int(bdata(subtokens->entry[0]));
-            }
-        }
-    }
-    else
+    if (endptr == str)
     {
-        ERROR;
+        fprintf(stderr, "No digits were found\n");
+        return -EINVAL;
     }
 
-    fclose(fp);
+    return (int) val;
 }
 
-static int
-nodeProcessorList(int node, uint32_t** list)
+int
+empty_numa_init()
 {
-    FILE *fp;
-    bstring filename;
-    int count = 0;
-    bstring src;
-    int i,j;
-    struct bstrList* tokens;
-    unsigned long val;
-    char* endptr;
-    int cursor=0;
-//    int unitSize = (int) (sizeof(unsigned long)*8);
-    int unitSize = (int) 32; /* 8 nibbles */
-
-    *list = (uint32_t*) malloc(MAX_NUM_THREADS * sizeof(uint32_t));
-
-    /* the cpumap interface should be always there */
-    filename = bformat("/sys/devices/system/node/node%d/cpumap", node); 
-
-    if (NULL != (fp = fopen (bdata(filename), "r"))) 
-    {
-
-        src = bread ((bNread) fread, fp);
-        tokens = bsplit(src,',');
-
-        for (i=(tokens->qty-1); i>=0 ;i--)
-        {
-            val = strtoul((char*) tokens->entry[i]->data, &endptr, 16);
-
-            if ((errno != 0 && val == LONG_MAX )
-                    || (errno != 0 && val == 0)) 
-            {
-                ERROR;
-            }
-
-            if (endptr == (char*) tokens->entry[i]->data) 
-            {
-                ERROR_PLAIN_PRINT(No digits were found);
-            }
-
-            if (val != 0UL)
-            {
-                for (j=0; j<unitSize; j++)
-                {
-                    if (val&(1UL<<j))
-                    {
-                        if (count < MAX_NUM_THREADS)
-                        {
-                            (*list)[count] = (j+cursor);
-                        }
-                        else
-                        {
-                            ERROR_PRINT(Number Of threads %d too large,count);
-                        }
-                        count++;
-                    }
-                }
-            }
-            cursor += unitSize;
-        }
-
-        bstrListDestroy(tokens);
-        bdestroy(src);
-        bdestroy(filename);
-        fclose(fp); 
-
-        /* FIXME: CPU list here is not physical cores first but numerical sorted */
-
-        return count;
-    }
-
-    /* something went wrong */
-    return -1;
+    printf("MEMPOLICY NOT supported in kernel!\n");
+    return 0;
 }
- 
-static int
-nodeDistanceList(int node, int numberOfNodes, uint32_t** list)
-{
-    FILE *fp;
-    bstring filename;
-    int count = 0;
-    bstring src;
-    struct bstrList* tokens;
-
-    *list = (uint32_t*) malloc(numberOfNodes * sizeof(uint32_t));
-
-    /* the distance interface should be always there */
-    filename = bformat("/sys/devices/system/node/node%d/distance", node);
-
-    if (NULL != (fp = fopen (bdata(filename), "r")))
-    {
-
-        src = bread ((bNread) fread, fp);
-        tokens = bsplit(src,' ');
-
-        for (int i=0; i<(tokens->qty); i++)
-        {
-            if (count < numberOfNodes)
-            {
-                (*list)[count] = (uint32_t)strtoul((char*) (tokens->entry[i]->data), NULL, 10);
-            }
-            else
-            {
-                ERROR_PRINT(Number Of nodes %d too large,count);
-            }
-            count++;
-        }
-
-        bstrListDestroy(tokens);
-        bdestroy(src);
-        bdestroy(filename);
-        fclose(fp);
-        return count;
-    }
 
-    /* something went wrong */
-    return -1;
+void 
+empty_numa_setInterleaved(int* processorList, int numberOfProcessors)
+{
+    printf("MEMPOLICY NOT supported in kernel!\n");
+    return;
 }
 
-
-
-static int
-findProcessor(uint32_t nodeId, uint32_t coreId)
+void
+empty_numa_membind(void* ptr, size_t size, int domainId)
 {
-    int i;
-
-    for (i=0; i<numa_info.nodes[nodeId].numberOfProcessors; i++)
-    {
-        if (numa_info.nodes[nodeId].processors[i] == coreId)
-        {
-            return 1;
-        }
-    }
-    return 0;
+    printf("MBIND NOT supported in kernel!\n");
+    return;
 }
 
 
-/* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
-
-#ifdef HAS_MEMPOLICY
-int
-numa_init()
-{
-    int errno;
-    uint32_t i;
-
-    if (get_mempolicy(NULL, NULL, 0, 0, 0) < 0 && errno == ENOSYS)
-    {
-        return -1; 
-    }
+const struct numa_functions numa_funcs = {
+#ifndef HAS_MEMPOLICY
+    .numa_init = empty_numa_init,
+    .numa_setInterleaved = empty_numa_setInterleaved,
+    .numa_membind = empty_numa_membind
+#else
+#ifdef LIKWID_USE_HWLOC
+    .numa_init = hwloc_numa_init,
+#else
+    .numa_init = proc_numa_init,
+#endif
+    .numa_setInterleaved = proc_numa_setInterleaved,
+    .numa_membind = proc_numa_membind
+#endif
+};
 
-    /* First determine maximum number of nodes */
-    setConfiguredNodes();
-    numa_info.numberOfNodes = maxIdConfiguredNode+1;
-    numa_info.nodes = (NumaNode*) malloc(numa_info.numberOfNodes * sizeof(NumaNode));
 
-    for (i=0; i<numa_info.numberOfNodes; i++)
-    {
-        nodeMeminfo(i, &numa_info.nodes[i].totalMemory, &numa_info.nodes[i].freeMemory);
-        numa_info.nodes[i].id = i;
-        numa_info.nodes[i].numberOfProcessors = nodeProcessorList(i,&numa_info.nodes[i].processors);
-        numa_info.nodes[i].numberOfDistances = nodeDistanceList(i, numa_info.numberOfNodes, &numa_info.nodes[i].distances);
-    }
+int numa_init(void)
+{
+    const struct numa_functions funcs = numa_funcs;
 
-    if (numa_info.nodes[0].numberOfProcessors < 0)
-    {
-        return -1;
-    }
-    else
+    if (init_config == 0)
     {
-        return 0;
+        init_configuration();
     }
-}
-
-void 
-numa_setInterleaved(int* processorList, int numberOfProcessors)
-{
-    long i;
-    int j;
-    int ret=0;
-    unsigned long numberOfNodes = 65;
-    unsigned long mask = 0UL;
 
-    for (i=0; i<numa_info.numberOfNodes; i++)
+    if (access(config.topologyCfgFileName, R_OK) && numa_info.numberOfNodes <= 0)
     {
-        for (j=0; j<numberOfProcessors; j++)
+        cpu_set_t cpuSet;
+        CPU_ZERO(&cpuSet);
+        sched_getaffinity(0,sizeof(cpu_set_t), &cpuSet);
+        if (cpuid_topology.activeHWThreads < cpuid_topology.numHWThreads)
         {
-            if (findProcessor(i,processorList[j]))
-            {
-                mask |= (1UL<<i);
-                break;
-            }
+            return proc_numa_init();
         }
+        return funcs.numa_init();
     }
-
-    ret = set_mempolicy(MPOL_INTERLEAVE,&mask,numberOfNodes);
-
-    if (ret < 0)
-    {
-        ERROR;
-    }
+    return 0;
 }
 
-void
-numa_membind(void* ptr, size_t size, int domainId)
+void numa_setInterleaved(int* processorList, int numberOfProcessors)
 {
-    int ret=0;
-    unsigned long mask = 0UL;
-    unsigned int flags = 0U;
-
-    flags |= MPOL_MF_STRICT;
-    mask |= (1UL<<domainId);
-
-    ret = mbind(ptr, size, &mask, numa_info.numberOfNodes+1, flags);
-
-    if (ret < 0)
-    {
-        ERROR;
-    }
+    const struct numa_functions funcs = numa_funcs;
+    return funcs.numa_setInterleaved(processorList, numberOfProcessors);
 }
 
-#else
-int
-numa_init()
+void numa_membind(void* ptr, size_t size, int domainId)
 {
-    printf("MEMPOLICY NOT supported in kernel!\n");
+    const struct numa_functions funcs = numa_funcs;
+    return funcs.numa_membind(ptr, size, domainId);
 }
 
-void 
-numa_setInterleaved(int* processorList, int numberOfProcessors)
+#ifndef HAS_MEMPOLICY
+void numa_finalize(void)
 {
-    printf("MEMPOLICY NOT supported in kernel!\n");
+    return;
 }
-
-void
-numa_membind(void* ptr, size_t size, int domainId)
+#else
+/* Free all per-node arrays and the node table of numa_info, then reset it to the
+ * empty state so a repeated finalize or a later numa_init() sees no stale pointers. */
+void numa_finalize(void)
 {
-    printf("MBIND NOT supported in kernel!\n");
+    int i;
+
+    /* nodes may still be NULL although numberOfNodes is set (failed allocation) */
+    if (numa_info.nodes)
+    {
+        for(i=0;i<numa_info.numberOfNodes;i++)
+        {
+            /* free(NULL) is a no-op, so no per-pointer guard is needed */
+            free(numa_info.nodes[i].processors);
+            free(numa_info.nodes[i].distances);
+        }
+        free(numa_info.nodes);
+    }
+    numa_info.nodes = NULL;
+    numa_info.numberOfNodes = 0;
 }
-
 #endif
-
-
diff --git a/src/numa_hwloc.c b/src/numa_hwloc.c
new file mode 100644
index 0000000..0c5fd1b
--- /dev/null
+++ b/src/numa_hwloc.c
@@ -0,0 +1,393 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  numa_hwloc.c
+ *
+ *      Description:  Interface to hwloc for NUMA topology
+ *
+ *      Version:   4.0
+ *      Released:  16.6.2015
+ *
+ *      Author:  Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+
+#include <error.h>
+
+#include <numa.h>
+#include <topology.h>
+#ifdef LIKWID_USE_HWLOC
+#include <hwloc.h>
+#include <topology_hwloc.h>
+#endif
+
+
+
+/* #####   FUNCTION DEFINITIONS  -  LOCAL TO THIS SOURCE FILE   ########### */
+#ifdef LIKWID_USE_HWLOC
+/*
+ * Extract one numeric field from an opened meminfo-style stream.
+ *
+ * The whole stream is read and split into lines. For every line that contains
+ * key, the text starting at column offset is trimmed and its first space
+ * separated token is converted with str2int(). If several lines match, the
+ * last one wins.
+ *
+ * fp      stream to read; it is closed before returning
+ * key     text identifying the line, e.g. "MemFree:"
+ * offset  column at which the value part of a matching line starts
+ *
+ * Returns the converted value, or 0 if no line matches. All bstrlib
+ * temporaries are destroyed here; they were leaked on every call before.
+ */
+static uint64_t parseMeminfoStream(FILE* fp, const char* key, int offset)
+{
+    int i;
+    uint64_t value = 0;
+    bstring keyString = bfromcstr(key);
+    bstring src = bread ((bNread) fread, fp);
+    struct bstrList* tokens = bsplit(src,(char) '\n');
+
+    for (i=0; tokens != NULL && i<tokens->qty; i++)
+    {
+        if (binstr(tokens->entry[i],0,keyString) != BSTR_ERR)
+        {
+            bstring tmp = bmidstr (tokens->entry[i], offset, blength(tokens->entry[i])-offset);
+            struct bstrList* subtokens;
+
+            bltrimws(tmp);
+            subtokens = bsplit(tmp,(char) ' ');
+            if (subtokens != NULL && subtokens->qty > 0)
+            {
+                value = str2int(bdata(subtokens->entry[0]));
+            }
+            /* release the per-line temporaries before looking at the next line */
+            bstrListDestroy(subtokens);
+            bdestroy(tmp);
+        }
+    }
+
+    bstrListDestroy(tokens);
+    bdestroy(src);
+    bdestroy(keyString);
+    fclose(fp);
+    return value;
+}
+
+/*
+ * Look up the meminfo field key for NUMA node nodeId.
+ *
+ * The per-node file /sys/devices/system/node/node<nodeId>/meminfo is preferred;
+ * its values start at column 18. If that file cannot be opened but
+ * /proc/meminfo is readable, the system-wide value is used instead (values
+ * start at column 10 there). If neither source is available, ERROR is raised.
+ *
+ * nodeId  id of the NUMA node whose sysfs entry is read
+ * key     field name including the colon, e.g. "MemTotal:"
+ *
+ * Returns the value found, or 0 if the field is missing or /proc/meminfo
+ * could not be opened.
+ */
+static uint64_t getNodeMeminfoValue(int nodeId, const char* key)
+{
+    FILE *fp;
+    uint64_t value = 0;
+    bstring filename = bformat("/sys/devices/system/node/node%d/meminfo", nodeId);
+
+    if (NULL != (fp = fopen (bdata(filename), "r")))
+    {
+        value = parseMeminfoStream(fp, key, 18);
+    }
+    else if (!access("/proc/meminfo", R_OK))
+    {
+        if (NULL != (fp = fopen ("/proc/meminfo", "r")))
+        {
+            value = parseMeminfoStream(fp, key, 10);
+        }
+    }
+    else
+    {
+        ERROR;
+    }
+
+    bdestroy(filename);
+    return value;
+}
+
+/*
+ * Return the "MemFree:" value of NUMA node nodeId.
+ *
+ * Read from the node's sysfs meminfo file, with /proc/meminfo as fallback;
+ * 0 is returned if the entry cannot be found.
+ * The number passes through str2int(), i.e. an int, before it is widened.
+ */
+uint64_t getFreeNodeMem(int nodeId)
+{
+    return getNodeMeminfoValue(nodeId, "MemFree:");
+}
+
+/*
+ * Return the "MemTotal:" value of NUMA node nodeId.
+ *
+ * Read from the node's sysfs meminfo file, with /proc/meminfo as fallback;
+ * 0 is returned if the entry cannot be found.
+ * The number passes through str2int(), i.e. an int, before it is widened.
+ */
+uint64_t getTotalNodeMem(int nodeId)
+{
+    return getNodeMeminfoValue(nodeId, "MemTotal:");
+}
+
+/*
+ * Check whether hardware thread cpuID is listed for the NUMA domain stored at
+ * index nodeID of numa_info.nodes. The old code searched every PU of the whole
+ * topology and ignored nodeID, so any existing CPU matched every domain.
+ *
+ * Returns 1 on a match, otherwise 0 (also for an out-of-range nodeID).
+ */
+int hwloc_findProcessor(int nodeID, int cpuID)
+{
+    int i;
+
+    if (nodeID < 0 || cpuID < 0 || nodeID >= (int) numa_info.numberOfNodes)
+    {
+        return 0;
+    }
+    for (i=0; i<(int) numa_info.nodes[nodeID].numberOfProcessors; i++)
+    {
+        if (numa_info.nodes[nodeID].processors[i] == (uint32_t) cpuID)
+        {
+            return 1;
+        }
+    }
+    return 0;
+}
+
+/* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
+int hwloc_numa_init(void)
+{
+    int errno;
+    uint32_t i;
+    int d;
+    int depth;
+    int cores_per_socket;
+    hwloc_obj_t obj;
+    const struct hwloc_distances_s* distances;
+    hwloc_obj_type_t hwloc_type = HWLOC_OBJ_NODE;
+
+    if (!hwloc_topology)
+    {
+        hwloc_topology_init(&hwloc_topology);
+        hwloc_topology_load(hwloc_topology);
+    }
+
+    numa_info.numberOfNodes = hwloc_get_nbobjs_by_type(hwloc_topology, hwloc_type);
+
+    /* If hwloc reports no NUMA node objects, create a single virtual node (id 0) and
+       record the hardware threads below every socket in its processor list */
+    if (numa_info.numberOfNodes == 0)
+    {
+        hwloc_type = HWLOC_OBJ_SOCKET;
+        numa_info.numberOfNodes = 1;
+
+        numa_info.nodes = (NumaNode*) malloc(sizeof(NumaNode));
+        if (!numa_info.nodes)
+        {
+            fprintf(stderr,"No memory to allocate %ld byte for nodes array\n",sizeof(NumaNode));
+            return -1;
+        }
+        
+        numa_info.nodes[0].id = 0;
+        numa_info.nodes[0].numberOfProcessors = 0;
+        numa_info.nodes[0].totalMemory = getTotalNodeMem(0);
+        numa_info.nodes[0].freeMemory = getFreeNodeMem(0);
+        numa_info.nodes[0].processors = (uint32_t*) malloc(MAX_NUM_THREADS * sizeof(uint32_t));
+        if (!numa_info.nodes[0].processors)
+        {
+            fprintf(stderr,"No memory to allocate %ld byte for processors array of NUMA node %d\n",MAX_NUM_THREADS * sizeof(uint32_t),0);
+            return -1;
+        }
+        numa_info.nodes[0].distances = (uint32_t*) malloc(sizeof(uint32_t));
+        if (!numa_info.nodes[0].distances)
+        {
+            fprintf(stderr,"No memory to allocate %ld byte for distances array of NUMA node %d\n",sizeof(uint32_t),0);
+            return -1;
+        }
+        numa_info.nodes[0].distances[0] = 10;
+        numa_info.nodes[0].numberOfDistances = 1;
+        cores_per_socket = cpuid_topology.numHWThreads/cpuid_topology.numSockets;
+        
+        for (d=0; d<hwloc_get_nbobjs_by_type(hwloc_topology, hwloc_type); d++)
+        {
+            obj = hwloc_get_obj_by_type(hwloc_topology, hwloc_type, d);
+            /* depth is reused here as the index into the processors array (passed to the call below) */
+            depth = d * cores_per_socket;
+            numa_info.nodes[0].numberOfProcessors += hwloc_record_objs_of_type_below_obj(
+                    hwloc_topology, obj, HWLOC_OBJ_PU, &depth, &numa_info.nodes[0].processors);
+        }
+    }
+    else
+    {
+        numa_info.nodes = (NumaNode*) malloc(numa_info.numberOfNodes * sizeof(NumaNode));
+        if (!numa_info.nodes)
+        {
+            fprintf(stderr,"No memory to allocate %ld byte for nodes array\n",numa_info.numberOfNodes * sizeof(NumaNode));
+            return -1;
+        }
+        depth = hwloc_get_type_depth(hwloc_topology, hwloc_type);
+        distances = hwloc_get_whole_distance_matrix_by_type(hwloc_topology, hwloc_type);
+        for (i=0; i<numa_info.numberOfNodes; i++)
+        {
+            obj = hwloc_get_obj_by_depth(hwloc_topology, depth, i);
+
+            numa_info.nodes[i].id = obj->os_index;
+
+            if (obj->memory.local_memory != 0)
+            {
+                numa_info.nodes[i].totalMemory = (uint64_t)(obj->memory.local_memory/1024);
+            }
+            else if (obj->memory.total_memory != 0)
+            {
+                numa_info.nodes[i].totalMemory = (uint64_t)(obj->memory.total_memory/1024);
+            }
+            else
+            {
+                numa_info.nodes[i].totalMemory = getTotalNodeMem(numa_info.nodes[i].id);
+            }
+            
+            /* free memory is not taken from hwloc here; read it via getFreeNodeMem() instead */
+            numa_info.nodes[i].freeMemory = getFreeNodeMem(numa_info.nodes[i].id);
+            numa_info.nodes[i].processors = (uint32_t*) malloc(MAX_NUM_THREADS * sizeof(uint32_t));
+            if (!numa_info.nodes[i].processors)
+            {
+                fprintf(stderr,"No memory to allocate %ld byte for processors array of NUMA node %d\n",MAX_NUM_THREADS * sizeof(uint32_t), i);
+                return -1;
+            }
+            d = 0;
+            numa_info.nodes[i].numberOfProcessors = hwloc_record_objs_of_type_below_obj(
+                    hwloc_topology, obj, HWLOC_OBJ_PU, &d, &numa_info.nodes[i].processors);
+            
+            numa_info.nodes[i].distances = (uint32_t*) malloc(numa_info.numberOfNodes * sizeof(uint32_t));
+            if (!numa_info.nodes[i].distances)
+            {
+                fprintf(stderr,"No memory to allocate %ld byte for distances array of NUMA node %d\n",numa_info.numberOfNodes*sizeof(uint32_t),i);
+                return -1;
+            }
+            if (distances)
+            {
+                numa_info.nodes[i].numberOfDistances = distances->nbobjs;
+                for(d=0;d<distances->nbobjs;d++)
+                {
+                    numa_info.nodes[i].distances[d] = distances->latency[i*distances->nbobjs + d] * distances->latency_base;
+                }
+            }
+            else
+            {
+                numa_info.nodes[i].numberOfDistances = numa_info.numberOfNodes;
+                for(d=0;d<numa_info.numberOfNodes;d++)
+                {
+                    numa_info.nodes[i].distances[d] = 10;
+                }
+            }
+
+        }
+    
+    }
+
+    if (numa_info.nodes[0].numberOfProcessors == 0)
+    {
+        return -1;
+    }
+    else
+    {
+        return 0;
+    }
+}
+
+/* Bind the memory area of size bytes at ptr to the NUMA node domainId through hwloc,
+ * using the BIND policy with the STRICT and PROCESS flags; a failing call ends in ERROR. */
+void hwloc_numa_membind(void* ptr, size_t size, int domainId)
+{
+    int status;
+    hwloc_nodeset_t targetNodes = hwloc_bitmap_alloc();
+
+    /* start from an empty set so only the requested domain is selected */
+    hwloc_bitmap_zero(targetNodes);
+    hwloc_bitmap_set(targetNodes, domainId);
+    status = hwloc_set_area_membind_nodeset(hwloc_topology, ptr, size, targetNodes,
+                HWLOC_MEMBIND_BIND, HWLOC_MEMBIND_STRICT|HWLOC_MEMBIND_PROCESS);
+    hwloc_bitmap_free(targetNodes);
+    if (status < 0)
+    {
+        ERROR;
+    }
+}
+
+
+
+/* Interleave the process' memory over all NUMA domains that contain at least one of
+ * the given processors. The bitmap holds NUMA node ids, so it must be passed to hwloc
+ * as a nodeset; hwloc_set_membind() would interpret the bits as CPU ids. */
+void hwloc_numa_setInterleaved(int* processorList, int numberOfProcessors)
+{
+    int i,j;
+    int ret = 0;
+    hwloc_nodeset_t nodeset = hwloc_bitmap_alloc();
+    hwloc_membind_flags_t flags = HWLOC_MEMBIND_STRICT|HWLOC_MEMBIND_PROCESS;
+
+    hwloc_bitmap_zero(nodeset);
+    for (i=0; i<numa_info.numberOfNodes; i++)
+    {
+        for (j=0; j<numberOfProcessors; j++)
+        {
+            if (hwloc_findProcessor(i,processorList[j]))
+            {
+                hwloc_bitmap_set(nodeset, numa_info.nodes[i].id);
+                break; /* one hit is enough to select this domain */
+            }
+        }
+    }
+
+    ret = hwloc_set_membind_nodeset(hwloc_topology, nodeset, HWLOC_MEMBIND_INTERLEAVE, flags);
+    hwloc_bitmap_free(nodeset);
+    if (ret < 0)
+    {
+        ERROR;
+    }
+}
+#else
+/* Stubs used when LIKWID_USE_HWLOC is not defined. */
+/* Always reports 1. */
+int hwloc_numa_init(void)
+{
+    return 1;
+}
+/* Does nothing. */
+void hwloc_numa_membind(void* ptr, size_t size, int domainId)
+{
+}
+/* Does nothing. */
+void hwloc_numa_setInterleaved(int* processorList, int numberOfProcessors)
+{
+}
+
+#endif
diff --git a/src/numa_proc.c b/src/numa_proc.c
new file mode 100644
index 0000000..6819e2b
--- /dev/null
+++ b/src/numa_proc.c
@@ -0,0 +1,372 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  numa_proc.c
+ *
+ *      Description:  Get NUMA topology from procfs and sysfs
+ *
+ *      Version:   4.0
+ *      Released:  16.6.2015
+ *
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+ 
+#include <stdlib.h>
+#include <stdio.h>
+
+#include <dirent.h>
+#include <error.h>
+//#include <strUtil.h>
+#include <unistd.h>
+#include <sys/syscall.h>
+#ifdef HAS_MEMPOLICY
+#include <linux/mempolicy.h>
+#endif
+
+#include <numa.h>
+#include <topology.h>
+
+/* #####   EXPORTED VARIABLES   ########################################### */
+
+
+
+
+/* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
+
+#ifdef HAS_MEMPOLICY
+#define get_mempolicy(policy,nmask,maxnode,addr,flags) syscall(SYS_get_mempolicy,policy,nmask,maxnode,addr,flags)
+#define set_mempolicy(mode,nmask,maxnode) syscall(SYS_set_mempolicy,mode,nmask,maxnode)
+#define mbind(start, len, nmask, maxnode, flags) syscall(SYS_mbind,(start),len,MPOL_BIND,(nmask),maxnode,flags)
+#endif
+
+/* #####   VARIABLES  -  LOCAL TO THIS SOURCE FILE   ###################### */
+/* Linear search for coreId in the processor list of numa_info.nodes[nodeId];
+ * returns 1 if it is listed there and 0 otherwise. */
+int
+proc_findProcessor(uint32_t nodeId, uint32_t coreId)
+{
+    int idx = 0;
+    int found = 0;
+    while (!found && idx < numa_info.nodes[nodeId].numberOfProcessors)
+    {
+        found = (numa_info.nodes[nodeId].processors[idx] == coreId);
+        idx++;
+    }
+    return found;
+}
+
+/* #####   FUNCTION DEFINITIONS  -  LOCAL TO THIS SOURCE FILE   ########### */
+/*
+ * Scan /sys/devices/system/node for entries whose name starts with "node" and
+ * return the largest id that str2int() yields for the rest of such a name.
+ * Returns 0 if the directory cannot be opened or no id is larger than 0.
+ */
+static int
+setConfiguredNodes(void)
+{
+    int highestId = 0;
+    struct dirent *entry;
+    DIR *nodeDir = opendir("/sys/devices/system/node");
+
+    if (nodeDir == NULL)
+    {
+        return highestId;
+    }
+
+    for (entry = readdir(nodeDir); entry != NULL; entry = readdir(nodeDir))
+    {
+        /* skip everything that is not a node<N> entry */
+        if (strncmp(entry->d_name, "node", 4) == 0)
+        {
+            int id = str2int(entry->d_name+4);
+
+            if (id > highestId)
+            {
+                highestId = id;
+            }
+        }
+    }
+
+    closedir(nodeDir);
+    return highestId;
+}
+
+
+/* Store MemTotal/MemFree of NUMA node `node` (sysfs meminfo, parsed with str2int()) in
+ * *totalMemory / *freeMemory; ERROR if the file cannot be opened. Frees all temporaries. */
+static void
+nodeMeminfo(int node, uint64_t* totalMemory, uint64_t* freeMemory)
+{
+    FILE *fp;
+    bstring totalString = bfromcstr("MemTotal:");
+    bstring freeString  = bfromcstr("MemFree:");
+    bstring filename = bformat("/sys/devices/system/node/node%d/meminfo", node);
+    if (NULL == (fp = fopen (bdata(filename), "r")))
+    {
+        ERROR;
+    }
+    else
+    {
+        bstring src = bread ((bNread) fread, fp);
+        struct bstrList* tokens = bsplit(src,(char) '\n');
+        for (int i=0; tokens != NULL && i<tokens->qty; i++)
+        {
+            int isTotal = (binstr(tokens->entry[i],0,totalString) != BSTR_ERR);
+            if (isTotal || binstr(tokens->entry[i],0,freeString) != BSTR_ERR)
+            {
+                bstring tmp = bmidstr (tokens->entry[i], 18, blength(tokens->entry[i])-18 );
+                bltrimws(tmp);
+                struct bstrList* subtokens = bsplit(tmp,(char) ' ');
+                if (subtokens != NULL && subtokens->qty > 0)
+                {
+                    *(isTotal ? totalMemory : freeMemory) = str2int(bdata(subtokens->entry[0]));
+                }
+                bstrListDestroy(subtokens);
+                bdestroy(tmp);
+            }
+        }
+        bstrListDestroy(tokens);
+        bdestroy(src);
+        fclose(fp); /* only reached with a valid stream now */
+    }
+    bdestroy(filename);
+    bdestroy(totalString);
+    bdestroy(freeString);
+}
+
+/*
+ * Build the list of hardware thread ids of NUMA node `node` from its sysfs
+ * cpumap file: comma separated 32 bit hex words, lowest ids in the last word.
+ * *list is allocated here (MAX_NUM_THREADS entries) and stays allocated on
+ * failure, except for -ENOMEM, so the caller has to free it.
+ *
+ * Returns the number of ids stored, -ENOMEM if the allocation fails, -1 if
+ * the file cannot be opened and -EFAULT on malformed content or when more
+ * than MAX_NUM_THREADS ids are found. Temporaries are released on every path.
+ */
+static int
+nodeProcessorList(int node, uint32_t** list)
+{
+    FILE *fp;
+    bstring filename;
+    bstring src = NULL;
+    struct bstrList* tokens = NULL;
+    int count = 0;
+    int ret = -1;
+    int i,j;
+    int cursor=0;
+    const int unitSize = 32; /* bits per cpumap word (8 nibbles) */
+
+    *list = (uint32_t*) malloc(MAX_NUM_THREADS * sizeof(uint32_t));
+    if (!(*list))
+    {
+        return -ENOMEM;
+    }
+    /* the cpumap interface should be always there */
+    filename = bformat("/sys/devices/system/node/node%d/cpumap", node);
+    fp = fopen (bdata(filename), "r");
+    if (fp == NULL)
+    {
+        goto out_filename;
+    }
+    src = bread ((bNread) fread, fp);
+    tokens = bsplit(src,',');
+    ret = -EFAULT;
+    if (tokens == NULL)
+    {
+        goto out_close;
+    }
+    /* the last word holds the lowest ids, so walk the words backwards */
+    for (i=(tokens->qty-1); i>=0 ;i--)
+    {
+        char* endptr;
+        unsigned long val;
+        errno = 0; /* strtoul() never clears errno; a stale value would fake an error */
+        val = strtoul((char*) tokens->entry[i]->data, &endptr, 16);
+        if (errno != 0)
+        {
+            goto out_close;
+        }
+        if (endptr == (char*) tokens->entry[i]->data)
+        {
+            ERROR_PLAIN_PRINT(No digits were found);
+            goto out_close;
+        }
+        for (j=0; j<unitSize; j++)
+        {
+            if (val&(1UL<<j))
+            {
+                if (count >= MAX_NUM_THREADS)
+                {
+                    ERROR_PRINT(Number Of threads %d too large,count);
+                    goto out_close;
+                }
+                (*list)[count++] = (j+cursor);
+            }
+        }
+        cursor += unitSize;
+    }
+    /* FIXME: CPU list here is not physical cores first but numerical sorted */
+    ret = count;
+out_close:
+    bstrListDestroy(tokens);
+    bdestroy(src);
+    fclose(fp);
+out_filename:
+    bdestroy(filename);
+    return ret;
+}
+ 
+/*
+ * Read the space separated distances of NUMA node `node` from its sysfs distance
+ * file into a newly allocated *list with numberOfNodes entries. Returns the number
+ * of values read, -ENOMEM if the allocation fails, -1 if the file cannot be opened
+ * and -EFAULT if it holds too many values; all temporaries are freed on every path.
+ */
+static int
+nodeDistanceList(int node, int numberOfNodes, uint32_t** list)
+{
+    FILE *fp;
+    bstring filename;
+    bstring src;
+    struct bstrList* tokens;
+    int count = 0;
+    int ret = -1;
+
+    *list = (uint32_t*) malloc(numberOfNodes * sizeof(uint32_t));
+    if (!(*list))
+    {
+        return -ENOMEM;
+    }
+    /* the distance interface should be always there */
+    filename = bformat("/sys/devices/system/node/node%d/distance", node);
+    if (NULL != (fp = fopen (bdata(filename), "r")))
+    {
+        src = bread ((bNread) fread, fp);
+        tokens = bsplit(src,' ');
+        ret = 0;
+        for (int i=0; tokens != NULL && ret == 0 && i<(tokens->qty); i++)
+        {
+            if (count < numberOfNodes)
+            {
+                (*list)[count++] = (uint32_t)strtoul((char*) (tokens->entry[i]->data), NULL, 10);
+            }
+            else
+            {
+                ERROR_PRINT(Number Of nodes %d too large,count);
+                ret = -EFAULT; /* ends the loop; cleanup below still runs */
+            }
+        }
+        ret = (ret == 0) ? count : ret;
+        bstrListDestroy(tokens);
+        bdestroy(src);
+        fclose(fp);
+    }
+    bdestroy(filename);
+    return ret;
+}
+
+/* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
+
+/* Fill numa_info from sysfs for node ids 0..highest configured node. Returns 0 on success,
+ * -1 if get_mempolicy is not implemented (ENOSYS), -ENOMEM if the node table cannot be
+ * allocated and -EFAULT if a node's processor or distance list is empty or unreadable. */
+int proc_numa_init(void)
+{
+    uint32_t i;
+    if (get_mempolicy(NULL, NULL, 0, 0, 0) < 0 && errno == ENOSYS)
+    {
+        numa_info.numberOfNodes = 0;
+        numa_info.nodes = NULL;
+        return -1;
+    }
+    numa_info.numberOfNodes = setConfiguredNodes()+1; /* highest node id + 1 */
+    numa_info.nodes = (NumaNode*) calloc(numa_info.numberOfNodes, sizeof(NumaNode)); /* zeroed: safe to free after a partial fill */
+    if (!numa_info.nodes)
+    {
+        numa_info.numberOfNodes = 0;
+        return -ENOMEM;
+    }
+    for (i=0; i<numa_info.numberOfNodes; i++)
+    {
+        int nprocs, ndists; /* signed: the helpers report failures as negative values */
+        numa_info.nodes[i].id = i;
+        nodeMeminfo(i, &numa_info.nodes[i].totalMemory, &numa_info.nodes[i].freeMemory);
+        numa_info.nodes[i].numberOfProcessors = nprocs = nodeProcessorList(i,&numa_info.nodes[i].processors);
+        if (nprocs <= 0)
+        {
+            return -EFAULT;
+        }
+        numa_info.nodes[i].numberOfDistances = ndists = nodeDistanceList(i, numa_info.numberOfNodes, &numa_info.nodes[i].distances);
+        if (ndists <= 0)
+        {
+            return -EFAULT;
+        }
+    }
+    return 0;
+}
+
+/* Build a bit mask with one bit per NUMA node that contains at least one of the given
+ * processors and install it as MPOL_INTERLEAVE policy; a failing syscall ends in ERROR. */
+void
+proc_numa_setInterleaved(int* processorList, int numberOfProcessors)
+{
+    long node;
+    unsigned long nodeMask = 0UL;
+    const unsigned long maxNode = 65;
+    for (node=0; node<numa_info.numberOfNodes; node++)
+    {
+        int p = 0;
+        int hit = 0;
+        /* stop at the first processor that lives on this node */
+        while (!hit && p<numberOfProcessors)
+        {
+            hit = proc_findProcessor(node,processorList[p]);
+            p++;
+        }
+        if (hit)
+        {
+            nodeMask |= (1UL<<node);
+        }
+    }
+    if (set_mempolicy(MPOL_INTERLEAVE,&nodeMask,maxNode) < 0)
+    {
+        ERROR;
+    }
+}
+
+/* Bind the memory range of size bytes at ptr to NUMA node domainId through the
+ * mbind syscall (MPOL_BIND policy with the MPOL_MF_STRICT flag); a failing call
+ * ends in ERROR. */
+void
+proc_numa_membind(void* ptr, size_t size, int domainId)
+{
+    int status;
+    /* single-bit node mask selecting only the requested domain */
+    unsigned long nodeMask = (1UL<<domainId);
+    unsigned int strictFlag = MPOL_MF_STRICT;
+
+    status = mbind(ptr, size, &nodeMask, numa_info.numberOfNodes+1, strictFlag);
+    if (status < 0)
+    {
+        ERROR;
+    }
+}
diff --git a/src/pci.c b/src/pci.c
index 2e8a22f..17ed1ea 100644
--- a/src/pci.c
+++ b/src/pci.c
@@ -8,13 +8,14 @@
  *                   performance monitoring registers in PCI Cfg space
  *                   for Intel Sandy Bridge Processors.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com,
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -50,35 +51,24 @@
 #include <bstrlib.h>
 #include <error.h>
 #include <pci.h>
-#include <cpuid.h>
+#include <topology.h>
 #include <affinity.h>
+#ifdef LIKWID_USE_HWLOC
+#include <pci_hwloc.h>
+#else
+#include <pci_proc.h>
+#endif
 
 /* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
 #define STRINGIFY(x) #x
 #define TOSTRING(x) STRINGIFY(x)
 
 #define PCI_ROOT_PATH  "/proc/bus/pci/"
+#define PCM_PCI_CLASS  0x1101
 
 /* #####   VARIABLES  -  LOCAL TO THIS SOURCE FILE   ###################### */
 
-static int socket_fd = -1;
-static int FD[MAX_NUM_NODES][MAX_NUM_DEVICES];
-
-static char* pci_DevicePath[MAX_NUM_DEVICES] = {
- "13.5",   /* PCI_R3QPI_DEVICE_LINK_0 */
- "13.6",   /* PCI_R3QPI_DEVICE_LINK_1 */
- "13.1",   /* PCI_R2PCIE_DEVICE */
- "10.0",   /* PCI_IMC_DEVICE_CH_0 */
- "10.1",   /* PCI_IMC_DEVICE_CH_1 */
- "10.4",   /* PCI_IMC_DEVICE_CH_2 */
- "10.5",   /* PCI_IMC_DEVICE_CH_3 */
- "0e.1",   /* PCI_HA_DEVICE */
- "08.2",   /* PCI_QPI_DEVICE_PORT_0 */
- "09.2",   /* PCI_QPI_DEVICE_PORT_1 */
- "08.6",   /* PCI_QPI_MASK_DEVICE_PORT_0 */
- "09.6",   /* PCI_QPI_MASK_DEVICE_PORT_1 */
- "08.0",   /* PCI_QPI_MISC_DEVICE_PORT_0 */
- "09.0" }; /* PCI_QPI_MISC_DEVICE_PORT_1 */
+static int FD[MAX_NUM_NODES][MAX_NUM_PCI_DEVICES];
 
 /* Socket to bus mapping -- will be determined at runtime;
  * typical mappings are:
@@ -89,111 +79,112 @@ static char* pci_DevicePath[MAX_NUM_DEVICES] = {
  *   3                  0xff
  */
 static char* socket_bus[MAX_NUM_NODES];
-static int socket_count = 0;
 
 /* #####   FUNCTION DEFINITIONS  -  LOCAL TO THIS SOURCE FILE   ########### */
-
+/* Dirty hack to avoid nonnull warnings */
+int (*ownaccess)(const char*, int);
+int (*ownopen)(const char*, int, ...);
 
 /* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
 
-void
+int
 pci_init(int initSocket_fd)
 {
-    FILE *fptr;
-    char buf[1024];
-    uint32_t testDevice;
-    uint32_t sbus, sdevfn, svend;
-    int cntr = 0;
-    int active_devs = 0;
-
-    for ( int j=0; j<MAX_NUM_NODES; j++ )
+    uint16_t testDevice;
+    int nr_sockets = 0;
+    int i=0;
+    int j=0;
+    int ret = 0;
+    int access_flags = 0;
+    ownaccess = &access;
+    ownopen = &open;
+
+    for (i=0; i<MAX_NUM_NODES; i++ )
     {
-        socket_bus[j] = "N-A";
-        for (int i=0; i<MAX_NUM_DEVICES; i++)
+        socket_bus[i] = "N-A";
+        for(j=1;j<MAX_NUM_PCI_DEVICES;j++)
         {
-            FD[j][i] = 0;
+            FD[i][j] = -2;
         }
     }
-
-    if (cpuid_info.model == SANDYBRIDGE_EP)
-    {
-        testDevice = 0x80863c44;
-    }
-    else if (cpuid_info.model == IVYBRIDGE_EP)
+    /* PCI is only provided by Intel systems */
+    if (!cpuid_info.isIntel)
     {
-        testDevice = 0x80860e36;
-    }
-    else
-    {
-        /*
-        fprintf(stderr, "Unsupported architecture for pci based uncore. \
-                Thus, no support for PCI based Uncore counters.\n");
-                */
-        return;
+        DEBUG_PLAIN_PRINT(DEBUGLEV_DETAIL, PCI based Uncore performance monitoring only supported on Intel systems);
+        return -ENODEV;
     }
 
-    if ( (fptr = fopen( "/proc/bus/pci/devices", "r")) == NULL )
+    switch (cpuid_info.model)
     {
-        fprintf(stderr, "Unable to open /proc/bus/pci/devices. \
-                Thus, no support for PCI based Uncore counters.\n");
-        return;
+        case SANDYBRIDGE_EP:
+            testDevice = 0x3c44;
+            break;
+        case IVYBRIDGE_EP:
+            testDevice = 0x0e36;
+            break;
+        case HASWELL_EP:
+            testDevice = 0x2f30;
+            break;
+        default:
+            DEBUG_PRINT(DEBUGLEV_INFO,CPU model %s does not support PCI based Uncore performance monitoring, cpuid_info.name);
+            return -ENODEV;
+            break;
     }
 
-    while( fgets(buf, sizeof(buf)-1, fptr) )
-    {
-        if ( sscanf(buf, "%2x%2x %8x", &sbus, &sdevfn, &svend) == 3 &&
-             svend == testDevice )
+#ifdef LIKWID_USE_HWLOC
+        DEBUG_PLAIN_PRINT(DEBUGLEV_DETAIL, Using hwloc to find pci devices);
+        ret = hwloc_pci_init(testDevice, socket_bus, &nr_sockets);
+        if (ret)
         {
-            socket_bus[cntr] = (char*)malloc(4);
-            sprintf(socket_bus[cntr++], "%02x/", sbus);
+            ERROR_PLAIN_PRINT(Using hwloc to find pci devices failed);
+            return -ENODEV;
         }
-    }
-    fclose(fptr);
+#else
+        DEBUG_PLAIN_PRINT(DEBUGLEV_DETAIL, Using procfs to find pci devices);
+        ret = proc_pci_init(testDevice, socket_bus, &nr_sockets);
+        if (ret)
+        {
+            ERROR_PLAIN_PRINT(Using procfs to find pci devices failed);
+            return -ENODEV;
+        }
+#endif
 
-    if ( cntr == 0 )
+    if (accessClient_mode == ACCESSMODE_DIRECT)
     {
-        fprintf(stderr, "Uncore not supported on this system\n");
-        return;
+        access_flags = R_OK|W_OK;
     }
-
-    socket_count = cntr;
-
-    bstring filepath =  bfromcstr ( PCI_ROOT_PATH );
-    bcatcstr(filepath, socket_bus[0]);
-    bcatcstr(filepath, pci_DevicePath[0] );
-
-
-    if (access(bdata(filepath),F_OK))
+    else
     {
-        fprintf(stderr, "INFO\n");
-        fprintf(stderr, "This system has no support for PCI based Uncore counters.\n");
-        fprintf(stderr, "This means you cannot use performance groups as MEM, which require Uncore counters.\n\n");
-        return;
+        access_flags = F_OK;
     }
-    bdestroy(filepath);
 
-    for (int j=0; j<socket_count; j++)
+    for(i=0;i<nr_sockets;i++)
     {
-        for (int i=0; i<MAX_NUM_DEVICES; i++)
+        for(j=1;j<MAX_NUM_PCI_DEVICES;j++)
         {
-
-            bstring filepath =  bfromcstr ( PCI_ROOT_PATH );
-            bcatcstr(filepath, socket_bus[j]);
-            bcatcstr(filepath, pci_DevicePath[i] );
-
-            if (!access(bdata(filepath),F_OK))
-            {
-                FD[j][i] = 0;
-            }
-            else
+            if (pci_devices[j].path != NULL)
             {
-                FD[j][i] = -2;
+                bstring filepath = bformat("%s%s%s",PCI_ROOT_PATH,
+                                                    socket_bus[i],
+                                                    pci_devices[j].path);
+                if (!ownaccess(bdata(filepath),access_flags))
+                {
+                    FD[i][j] = 0;
+                    pci_devices[j].online = 1;
+                    if (i==0)
+                    {
+                        DEBUG_PRINT(DEBUGLEV_DETAIL, PCI device %s (%d) online for socket %d at path %s, pci_devices[j].name,j, i,bdata(filepath));
+                    }
+                }
+                else
+                {
+                    pci_devices[j].online = 0;
+                }
             }
-            bdestroy(filepath);
         }
     }
 
-    if (accessClient_mode == DAEMON_AM_DIRECT)
+    if (accessClient_mode == ACCESSMODE_DIRECT)
     {
         if(geteuid() != 0)
         {
@@ -207,192 +198,282 @@ pci_init(int initSocket_fd)
     {
         socket_fd = initSocket_fd;
     }
+    return 0;
 }
 
 
 void
 pci_finalize()
 {
-    for (int j=0; j<socket_count; j++)
+    int i=0, j=0;
+    if (accessClient_mode == ACCESSMODE_DIRECT) /* device files are only opened by the direct-mode branches of the read/write functions */
     {
-        for (int i=0; i<MAX_NUM_DEVICES; i++)
+        for (i=0; i<MAX_NUM_NODES; i++)
         {
-            if (FD[j][i] > 0)
+            for (j=1; j<MAX_NUM_PCI_DEVICES; j++) /* starts at 1, like the device loops in pci_init */
             {
-                close(FD[j][i]);
+                if (FD[i][j] > 0)
+                {
+                    close(FD[i][j]);
+                    FD[i][j] = 0; /* 0 means "not opened yet", so a stale descriptor is never reused */
+                }
             }
         }
     }
-
-    if (accessClient_mode != DAEMON_AM_DIRECT)
+    else
     {
         socket_fd = -1;
     }
 }
 
 
-uint32_t
-pci_read(int cpu, PciDeviceIndex device, uint32_t reg)
+int /* 0 on success with the register value in *data, negative errno value on failure */
+pci_read(int cpu, PciDeviceIndex device, uint32_t reg, uint32_t* data)
 {
     int socketId = affinity_core2node_lookup[cpu];
-    if ( FD[socketId][device] == -2)
+    uint64_t tmp; /* only the access daemon path fills this 64 bit buffer */
+    int err;
+
+    if (device == MSR_DEV)
     {
-        fprintf(stderr, "Trying to access non-existent PCI device (%s) for reading\n", pci_DevicePath[device]);
-        return 0;
+        return -ENODEV;
     }
 
-    if (accessClient_mode == DAEMON_AM_DIRECT)
+    if (accessClient_mode == ACCESSMODE_DIRECT)
     {
-        uint32_t data = 0;
-        if ( !FD[socketId][device] )
+        *data = 0; /* every direct-mode failure below reports 0, as pci_tread does */
+        if (FD[socketId][device] < 0)
+        {
+            return -ENODEV;
+        }
+        else if ( !FD[socketId][device] )
         {
             bstring filepath =  bfromcstr ( PCI_ROOT_PATH );
             bcatcstr(filepath, socket_bus[socketId]);
-            bcatcstr(filepath, pci_DevicePath[device] );
-            FD[socketId][device] = open( bdata(filepath), O_RDWR);
+            bcatcstr(filepath, pci_devices[device].path);
+            FD[socketId][device] = ownopen( bdata(filepath), O_RDWR);
 
             if ( FD[socketId][device] < 0)
             {
-                fprintf(stderr, "ERROR in pci_read: failed to open pci device %s: %s!\n",
-                        bdata(filepath), strerror(errno));
+                ERROR_PRINT(Failed to open PCI device %s at path %s\n,
+                                pci_devices[device].name,
+                                bdata(filepath));
+                bdestroy(filepath); /* release the path string on the error path too */
+                return -EACCES;
             }
             bdestroy(filepath);
+            DEBUG_PRINT(DEBUGLEV_DETAIL, Opened PCI device %s, pci_devices[device].name);
         }
 
         if ( FD[socketId][device] > 0 &&
-             pread(FD[socketId][device], &data, sizeof data, reg) != sizeof data )
+             pread(FD[socketId][device], data, sizeof(*data), reg) != sizeof(*data) ) /* read exactly one 32 bit register */
         {
-            ERROR_PRINT("ERROR in pci_read: failed on CPU %d Register 0x%x", cpu, reg);
+            ERROR_PRINT(Read from PCI device %s at register 0x%x failed,pci_devices[device].name, reg);
+            *data = 0;
+            return -EIO;
         }
-
-        return data;
     }
     else
     { /* daemon or sysdaemon-mode */
-        return (uint32_t) accessClient_read(socket_fd, socketId, device, reg);
+        if (FD[socketId][device] < 0)
+        {
+            return -ENODEV;
+        }
+        err = accessClient_read(socket_fd, socketId, device, reg, &tmp);
+        if (err)
+        {
+            ERROR_PRINT(Read from PCI device %s at register 0x%x through access daemon failed,pci_devices[device].name, reg);
+            return err;
+        }
+        *data = tmp; /* keeps the low 32 bits of the daemon's 64 bit answer */
     }
+    return 0;
 }
 
 
 
-void
+int /* 0 on success, negative errno value on failure */
 pci_write(int cpu, PciDeviceIndex device, uint32_t reg, uint32_t data)
 {
     int socketId = affinity_core2node_lookup[cpu];
+    int err;
 
-    if ( FD[socketId][device] == -2)
+    if (device == MSR_DEV)
     {
-        fprintf(stderr, "Trying to access non-existent PCI device (%s) for writing\n", pci_DevicePath[device]);
-        return;
+        return -ENODEV;
     }
-    if (accessClient_mode == DAEMON_AM_DIRECT)
+    if (accessClient_mode == ACCESSMODE_DIRECT)
     {
-        if ( !FD[socketId][device] )
+        if (FD[socketId][device] < 0)
+        {
+            return -ENODEV;
+        }
+        else if ( !FD[socketId][device] ) /* 0: device file not opened yet, open it on first use */
         {
             bstring filepath =  bfromcstr ( PCI_ROOT_PATH );
             bcatcstr(filepath, socket_bus[socketId]);
-            bcatcstr(filepath, pci_DevicePath[device] );
-            FD[socketId][device] = open( bdata(filepath), O_RDWR);
+            bcatcstr(filepath, pci_devices[device].path );
+            FD[socketId][device] = ownopen( bdata(filepath), O_RDWR);
 
             if ( FD[socketId][device] < 0)
             {
-                fprintf(stderr, "ERROR in pci_write: failed to open pci device %s: %s!\n",
-                        bdata(filepath), strerror(errno));
+                ERROR_PRINT(Failed to open PCI device %s at path %s\n,
+                                    pci_devices[device].name,
+                                    bdata(filepath));
+                bdestroy(filepath); /* release the path string on the error path too */
+                return -EACCES;
             }
             bdestroy(filepath);
+            DEBUG_PRINT(DEBUGLEV_DETAIL, Opened PCI device %s, pci_devices[device].name);
         }
 
         if ( FD[socketId][device] > 0 &&
              pwrite(FD[socketId][device], &data, sizeof data, reg) != sizeof data)
         {
-            ERROR_PRINT("ERROR in pci_write: failed on CPU %d Register 0x%x", cpu, reg);
+            ERROR_PRINT(Write to PCI device %s at register 0x%x failed,pci_devices[device].name, reg);
+            return -EIO;
         }
     }
     else
     { /* daemon or sysdaemon-mode */
-        accessClient_write(socket_fd, socketId, device, reg, (uint64_t) data);
+        if (FD[socketId][device] < 0)
+        {
+            return -ENODEV;
+        }
+        err = accessClient_write(socket_fd, socketId, device, reg, (uint64_t) data);
+        if (err)
+        {
+            ERROR_PRINT(Write to PCI device %s at register 0x%x through access daemon failed,pci_devices[device].name, reg);
+            return err;
+        }
     }
+    return 0;
 }
 
-uint32_t
-pci_tread(const int tsocket_fd, const int cpu, PciDeviceIndex device, uint32_t reg)
+int /* 0 on success with the register value in *data, negative errno value on failure */
+pci_tread(const int tsocket_fd, const int cpu, PciDeviceIndex device, uint32_t reg, uint32_t *data)
 {
     int socketId = affinity_core2node_lookup[cpu];
-    if ( FD[socketId][device] == -2)
-    {
-        return 0;
-    }
+    uint64_t tmp; /* only the access daemon path fills this 64 bit buffer */
+    int err;
 
-    if (accessClient_mode == DAEMON_AM_DIRECT)
+    if (accessClient_mode == ACCESSMODE_DIRECT)
     {
-        uint32_t data = 0;
-        if ( !FD[socketId][device] )
+        *data = 0;
+        if (FD[socketId][device] < 0)
+        {
+            return -ENODEV;
+        }
+        else if ( !FD[socketId][device] ) /* 0: device file not opened yet, open it on first use */
         {
             bstring filepath =  bfromcstr ( PCI_ROOT_PATH );
             bcatcstr(filepath, socket_bus[socketId]);
-            bcatcstr(filepath, pci_DevicePath[device] );
+            bcatcstr(filepath, pci_devices[device].path );
 
-            FD[socketId][device] = open( bdata(filepath), O_RDWR);
+            FD[socketId][device] = ownopen( bdata(filepath), O_RDWR);
 
             if ( FD[socketId][device] < 0)
             {
-                fprintf(stderr, "ERROR in pci_tread:\n    failed to open pci device %s: %s!\n",
-                        bdata(filepath), strerror(errno));
+                ERROR_PRINT(Failed to open PCI device %s at path %s\n,
+                                pci_devices[device].name,
+                                bdata(filepath));
+                bdestroy(filepath); /* release the path string on the error path too */
+                return -EACCES;
             }
             bdestroy(filepath);
+            DEBUG_PRINT(DEBUGLEV_DETAIL, Opened PCI device %s, pci_devices[device].name);
         }
 
         if ( FD[socketId][device] > 0 &&
-             pread(FD[socketId][device], &data, sizeof data, reg) != sizeof data )
+             pread(FD[socketId][device], data, sizeof(*data), reg) != sizeof(*data) ) /* read exactly one 32 bit register */
         {
-            ERROR_PRINT("ERROR in pci_tread: failed on CPU %d Register 0x%x", cpu, reg);
+            ERROR_PRINT(Read from PCI device %s at register 0x%x failed,pci_devices[device].name, reg);
+            *data = 0;
+            return -EIO;
         }
-
-        return data;
     }
     else
     { /* daemon or sysdaemon-mode */
-        return accessClient_read(tsocket_fd, socketId, device, reg);
+        if (FD[socketId][device] < 0)
+        {
+            return -ENODEV;
+        }
+        err = accessClient_read(tsocket_fd, socketId, device, reg, &tmp);
+        if (err)
+        {
+            ERROR_PRINT(Read from PCI device %s at register 0x%x through access daemon failed,pci_devices[device].name, reg);
+            return err;
+        }
+        *data = tmp; /* keeps the low 32 bits of the daemon's 64 bit answer */
     }
+    return 0;
 }
 
-void
+int /* 0 on success, negative errno value on failure */
 pci_twrite( const int tsocket_fd, const int cpu, PciDeviceIndex device, uint32_t reg, uint32_t data)
 {
     int socketId = affinity_core2node_lookup[cpu];
-    if ( FD[socketId][device] == -2)
-    {
-        return;
-    }
-    if (accessClient_mode == DAEMON_AM_DIRECT)
+    int err;
+
+    if (accessClient_mode == ACCESSMODE_DIRECT)
     {
-        if ( !FD[socketId][device] )
+        if (FD[socketId][device] < 0)
+        {
+            return -ENODEV;
+        }
+        else if ( !FD[socketId][device] ) /* 0: device file not opened yet, open it on first use */
         {
             bstring filepath =  bfromcstr ( PCI_ROOT_PATH );
             bcatcstr(filepath, socket_bus[socketId]);
-            bcatcstr(filepath, pci_DevicePath[device] );
-
-            FD[socketId][device] = open( bdata(filepath), O_RDWR);
+            bcatcstr(filepath, pci_devices[device].path );
+            FD[socketId][device] = ownopen( bdata(filepath), O_RDWR);
 
             if ( FD[socketId][device] < 0)
             {
-                fprintf(stderr, "ERROR in pci_twrite: failed to open pci device %s: %s!\n",
-                        bdata(filepath), strerror(errno));
+                ERROR_PRINT(Failed to open PCI device %s at path %s\n,
+                                pci_devices[device].name,
+                                bdata(filepath));
+                bdestroy(filepath); /* release the path string on the error path too */
+                return -EACCES;
             }
             bdestroy(filepath);
+            DEBUG_PRINT(DEBUGLEV_DETAIL, Opened PCI device %s, pci_devices[device].name);
         }
 
         if ( FD[socketId][device] > 0 &&
              pwrite(FD[socketId][device], &data, sizeof data, reg) != sizeof data)
         {
-            ERROR_PRINT("ERROR in pci_twrite: failed on CPU %d Register 0x%x", cpu, reg);
+            ERROR_PRINT(Write to PCI device %s at register 0x%x failed,pci_devices[device].name, reg);
+            return -EIO;
         }
     }
     else
     { /* daemon or sysdaemon-mode */
-        accessClient_write(tsocket_fd, socketId, device, reg, data);
+        if (FD[socketId][device] < 0)
+        {
+            return -ENODEV;
+        }
+        err = accessClient_write(tsocket_fd, socketId, device, reg, data);
+        if (err)
+        {
+            ERROR_PRINT(Write to PCI device %s at register 0x%x through access daemon failed,pci_devices[device].name, reg);
+            return err; /* hand the daemon's code through, as pci_write, pci_read and pci_tread do */
+        }
     }
+    return 0;
 }
 
-
+/* Return 1 if index is MSR_DEV, if the device is flagged online or if its FD
+ * slot for the socket of cpu is not negative; otherwise return 0. */
+int pci_checkDevice(PciDeviceIndex index, int cpu)
+{
+    const int socketId = affinity_core2node_lookup[cpu];
+
+    if (index == MSR_DEV)
+    {
+        return 1;
+    }
+    return ((pci_devices[index].online == 1) ||
+            (FD[socketId][index] >= 0)) ? 1 : 0;
+}
 
diff --git a/src/pci_hwloc.c b/src/pci_hwloc.c
new file mode 100644
index 0000000..5840f3c
--- /dev/null
+++ b/src/pci_hwloc.c
@@ -0,0 +1,82 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  pci_hwloc.c
+ *
+ *      Description:  Interface to hwloc for PCI device lookup
+ *
+ *      Version:   4.0
+ *      Released:  16.6.2015
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+
+
+#include <hwloc.h>
+#include <types.h>
+#include <accessClient.h>
+#include <bstrlib.h>
+#include <affinity.h>
+#include <topology.h>
+#include <topology_hwloc.h>
+#include <error.h>
+
+/* Collect the PCI bus of every vendor 0x8086 device with ID testDevice that
+ * hwloc reports. Each match stores a malloc'ed "xx/" string in socket_bus
+ * (NOTE(review): its capacity is not passed in); *nrSockets gets the count.
+ * Returns 0, -ENOMEM if an allocation fails, -ENODEV if nothing matched. */
+int
+hwloc_pci_init(uint16_t testDevice, char** socket_bus, int* nrSockets)
+{
+    int cntr = 0;
+    const uint16_t testVendor = 0x8086;
+    hwloc_obj_t obj;
+    int i;
+
+    if (!hwloc_topology)
+    {
+        hwloc_topology_init(&hwloc_topology);
+        hwloc_topology_set_flags(hwloc_topology, HWLOC_TOPOLOGY_FLAG_WHOLE_IO );
+        hwloc_topology_load(hwloc_topology);
+    }
+
+    for(i = 0; i < hwloc_get_nbobjs_by_type(hwloc_topology, HWLOC_OBJ_PCI_DEVICE); i++)
+    {
+        obj = hwloc_get_obj_by_type(hwloc_topology, HWLOC_OBJ_PCI_DEVICE, i);
+        if ((obj->attr->pcidev.vendor_id != testVendor) || (obj->attr->pcidev.device_id != testDevice))
+        {
+            continue;
+        }
+        socket_bus[cntr] = (char*)malloc(4); /* two hex digits, '/', NUL */
+        if (!socket_bus[cntr])
+        {
+            *nrSockets = cntr;
+            return -ENOMEM;
+        }
+        snprintf(socket_bus[cntr++], 4, "%02x/", obj->attr->pcidev.bus);
+    }
+    *nrSockets = cntr;
+
+    return (cntr == 0) ? -ENODEV : 0;
+}
diff --git a/src/pci_proc.c b/src/pci_proc.c
new file mode 100644
index 0000000..a25bed0
--- /dev/null
+++ b/src/pci_proc.c
@@ -0,0 +1,126 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  pci_proc.c
+ *
+ *      Description:  Interface to procfs/sysfs for PCI device lookup
+ *
+ *      Version:   4.0
+ *      Released:  16.6.2015
+ *
+ *      Author:   Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <fcntl.h>
+
+
+#include <types.h>
+#include <accessClient.h>
+#include <bstrlib.h>
+#include <affinity.h>
+#include <topology.h>
+#include <error.h>
+
+/* Walk /proc/bus/pci/<bus>/05.0 starting at bus 0 and return the bus number read
+ * for the given socket (bits 8-15 of the dword at offset 0x108), or -1 on failure. */
+int getBusFromSocket(const uint32_t socket)
+{
+    char devPath[1024];
+    uint32_t busReg = 0;
+    uint32_t sock;
+    int bus = 0;
+    int fd;
+    int nbytes;
+
+    for (sock = 0; sock <= socket; sock++)
+    {
+        sprintf(devPath, "/proc/bus/pci/%02x/05.0", bus);
+        fd = open(devPath, O_RDONLY);
+        if (fd < 0)
+        {
+            return -1;
+        }
+        nbytes = pread(fd, &busReg, sizeof(uint32_t), 0x108);
+        close(fd);
+        if (nbytes != sizeof(uint32_t))
+        {
+            return -1;
+        }
+        bus = (busReg >> 8) & 0x0ff;
+        if (sock == socket)
+            return bus;
+        /* the next socket is looked up on the bus after the one just read */
+        if (++bus > 0x0ff)
+            return -1;
+    }
+    return -1;
+}
+
+/* Scan /proc/bus/pci/devices for vendor 0x8086 devices with ID testDevice. Each
+ * match stores a malloc'ed "xx/" bus string in socket_bus; *nrSockets receives
+ * the match count. Returns 0, -ENOMEM on allocation failure, else -ENODEV. */
+int
+proc_pci_init(uint16_t testDevice, char** socket_bus, int* nrSockets)
+{
+    FILE *fptr;
+    char buf[1024];
+    int cntr = 0;
+    const uint16_t testVendor = 0x8086;
+    uint32_t sbus, sdevfn, svend, sdev;
+    int busID;
+
+    if ( (fptr = fopen( "/proc/bus/pci/devices", "r")) == NULL )
+    {
+        fprintf(stderr, "Unable to open /proc/bus/pci/devices. "
+                "Thus, no support for PCI based Uncore counters.\n");
+        return -ENODEV;
+    }
+
+    while( fgets(buf, sizeof(buf)-1, fptr) )
+    {
+        if ( sscanf(buf, "%2x%2x %4x%4x", &sbus, &sdevfn, &svend, &sdev) != 4 ||
+             svend != testVendor || sdev != testDevice )
+        {
+            continue;
+        }
+        socket_bus[cntr] = (char*)malloc(4); /* two hex digits, '/', NUL */
+        if (!socket_bus[cntr])
+        {
+            fclose(fptr);
+            *nrSockets = cntr;
+            return -ENOMEM;
+        }
+        busID = getBusFromSocket(cntr);
+        /* getBusFromSocket returns -1 on failure, which would print as
+         * "ffffffff/" and overflow the 4 bytes: fall back to the listed bus. */
+        snprintf(socket_bus[cntr], 4, "%02x/", (busID >= 0) ? (uint32_t)busID : sbus);
+        cntr++;
+    }
+    fclose(fptr);
+    *nrSockets = cntr;
+    if ( cntr == 0 )
+    {
+        return -ENODEV;
+    }
+    return 0;
+}
diff --git a/src/perfmon.c b/src/perfmon.c
index 30cacba..d718ec5 100644
--- a/src/perfmon.c
+++ b/src/perfmon.c
@@ -3,15 +3,16 @@
  *
  *      Filename:  perfmon.c
  *
- *      Description:  Implementation of perfmon Module.
+ *      Description:  Main implementation of the performance monitoring module
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -28,8 +29,6 @@
  * =======================================================================================
  */
 
-/* #####   HEADER FILE INCLUDES   ######################################### */
-
 #include <stdlib.h>
 #include <stdio.h>
 #include <string.h>
@@ -37,100 +36,19 @@
 #include <float.h>
 #include <unistd.h>
 #include <sys/types.h>
-#include <assert.h>
+
 
 #include <types.h>
+#include <likwid.h>
 #include <bitUtil.h>
-#include <bstrlib.h>
-#include <strUtil.h>
-#include <bitUtil.h>
-#include <error.h>
 #include <timer.h>
-#include <accessClient.h>
 #include <msr.h>
 #include <pci.h>
 #include <lock.h>
-#include <cpuid.h>
-#include <affinity.h>
-#include <tree.h>
-#include <power.h>
-#include <thermal.h>
 #include <perfmon.h>
-#include <asciiTable.h>
 #include <registers.h>
-
-
-/* #####   EXPORTED VARIABLES   ########################################### */
-
-int perfmon_verbose = 0;
-int perfmon_csvoutput = 0;
-
-/* #####   VARIABLES  -  LOCAL TO THIS SOURCE FILE   ###################### */
-
-static PerfmonGroup groupSet = _NOGROUP;
-static PerfmonEvent* eventHash;
-static PerfmonCounterMap* counter_map;
-static PerfmonGroupMap* group_map;
-static PerfmonGroupHelp* group_help;
-static EventSetup * eventSetup;
-
-static TimerData timeData;
-static double rdtscTime;
-static PerfmonEventSet perfmon_set;
-static int perfmon_numGroups;
-static int perfmon_numCounters;
-static int perfmon_numArchEvents;
-static int perfmon_numThreads;
-static int perfmon_numRegions;
-static FILE* OUTSTREAM;
-static double** perfmon_threadState;
-static PerfmonThread* perfmon_threadData;
-
-static int socket_fd = -1;
-static int socket_lock[MAX_NUM_NODES];
-
-/* #####   PROTOTYPES  -  LOCAL TO THIS SOURCE FILE   ##################### */
-
-static void initResultTable(PerfmonResultTable* tableData,
-        bstrList* firstColumn,
-        int numRows,
-        int numColumns);
-
-static void initStatisticTable(PerfmonResultTable* tableData,
-        bstrList* firstColumn,
-        int numRows);
-
-static void printResultTable(PerfmonResultTable* tableData);
-static void freeResultTable(PerfmonResultTable* tableData);
-static void initThread(int , int );
-
-/* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
-
-#define CHECKERROR \
-        if (ret == EOF) \
-        { \
-            fprintf (stderr, "sscanf: Failed to read marker file!\n" ); \
-            exit (EXIT_FAILURE);}
-
-#define bstrListAdd(bl,id,name) \
-    label = bfromcstr(#name);  \
-    (bl)->entry[id] = bstrcpy(label);  \
-    (bl)->qty++; \
-    bdestroy(label);
-
-#define INIT_EVENTS   \
-    fc = bstrListCreate(); \
-    bstrListAlloc(fc, numRows+1); \
-    bstrListAdd(fc,0,Event); \
-    for (i=0; i<numRows; i++) \
-    { \
-        fc->entry[1+i] = \
-           bfromcstr(perfmon_set.events[i].event.name); }
-
-#define INIT_BASIC  \
-    fc = bstrListCreate(); \
-    bstrListAlloc(fc, numRows+1); \
-    bstrListAdd(fc,0,Metric);
+#include <topology.h>
+#include <access.h>
 
 #include <perfmon_pm.h>
 #include <perfmon_atom.h>
@@ -148,318 +66,155 @@ static void initThread(int , int );
 #include <perfmon_interlagos.h>
 #include <perfmon_kabini.h>
 #include <perfmon_silvermont.h>
+#include <perfmon_broadwell.h>
+
+
+PerfmonEvent* eventHash;
+RegisterMap* counter_map = NULL;
+BoxMap* box_map = NULL;
+PciDevice* pci_devices = NULL;
+int perfmon_numCounters = 0;
+int perfmon_numCoreCounters = 0;
+int perfmon_numArchEvents = 0;
+int perfmon_verbosity = DEBUGLEV_ONLY_ERROR;
+
+int socket_fd = -1;
+int thread_sockets[MAX_NUM_THREADS] = { [0 ... MAX_NUM_THREADS-1] = -1};
+
+
+PerfmonGroupSet* groupSet = NULL;
+
+int (*perfmon_startCountersThread) (int thread_id, PerfmonEventSet* eventSet);
+int (*perfmon_stopCountersThread) (int thread_id, PerfmonEventSet* eventSet);
+int (*perfmon_readCountersThread) (int thread_id, PerfmonEventSet* eventSet);
+int (*perfmon_setupCountersThread) (int thread_id, PerfmonEventSet* eventSet);
+int (*perfmon_finalizeCountersThread) (int thread_id, PerfmonEventSet* eventSet);
+
+int (*initThreadArch) (int cpu_id);
+
+
+char* eventOptionTypeName[NUM_EVENT_OPTIONS] = {
+    "NONE",
+    "OPCODE",
+    "MATCH0",
+    "MATCH1",
+    "MATCH2",
+    "MATCH3",
+    "MASK0",
+    "MASK1",
+    "MASK2",
+    "MASK3",
+    "NID",
+    "TID",
+    "STATE",
+    "EDGEDETECT",
+    "THRESHOLD",
+    "INVERT",
+    "KERNEL",
+    "ANYTHREAD",
+    "OCCUPANCY",
+    "OCCUPANCY_FILTER",
+    "OCCUPANCY_EDGEDETECT",
+    "OCCUPANCY_INVERT",
+    "IN_TRANSACTION",
+    "IN_TRANSACTION_ABORTED"
+};
 
-/* #####  EXPORTED  FUNCTION POINTERS   ################################### */
-void (*perfmon_startCountersThread) (int thread_id);
-void (*perfmon_stopCountersThread) (int thread_id);
-void (*perfmon_readCountersThread) (int thread_id);
-void (*perfmon_setupCounterThread) (int thread_id,
-        PerfmonEvent* event, PerfmonCounterIndex index);
-void (*printDerivedMetrics) (PerfmonGroup group);
-void (*logDerivedMetrics) (PerfmonGroup group, double time, double timeStamp);
-void (*perfmon_getDerivedCounterValuesArch)(PerfmonGroup group, float * values, float * out_max, float * out_min);
-
-
-/* #####   FUNCTION POINTERS  -  LOCAL TO THIS SOURCE FILE ################ */
-
-static void (*initThreadArch) (PerfmonThread *thread);
-
-/* #####   FUNCTION DEFINITIONS  -  LOCAL TO THIS SOURCE FILE   ########### */
-
-static int getIndex (bstring reg, PerfmonCounterIndex* index)
+static int
+getIndexAndType (bstring reg, RegisterIndex* index, RegisterType* type)
 {
-    int ret = FALSE;
     int err = 0;
+    int ret = FALSE;
     uint64_t tmp;
+    int (*ownstrcmp)(const char*, const char*);
+    ownstrcmp = &strcmp;
     for (int i=0; i< perfmon_numCounters; i++)
     {
         if (biseqcstr(reg, counter_map[i].key))
         {
             *index = counter_map[i].index;
+            *type = counter_map[i].type;
             ret = TRUE;
+            break;
         }
     }
-    if ((ret) && (counter_map[*index].type != THERMAL) && (counter_map[*index].type != POWER))
-    {
-        if (counter_map[*index].device == 0)
-        {
-            tmp = msr_read(0, counter_map[*index].configRegister);
-            msr_write(0, counter_map[*index].configRegister,0x0ULL);
-        }
-        else
-        {
-            tmp = pci_read(0, counter_map[*index].device, counter_map[*index].configRegister);
-            pci_write(0, counter_map[*index].device, counter_map[*index].configRegister, 0x0U);
-        }
-    }
-    else if ((ret) && (counter_map[*index].type == POWER))
-    {
-        tmp = msr_read(0, counter_map[*index].counterRegister);
-    }
-
-    return ret;
-}
-
-
-static int
-getEvent(bstring event_str, PerfmonEvent* event)
-{
-    for (int i=0; i< perfmon_numArchEvents; i++)
-    {
-        if (biseqcstr(event_str, eventHash[i].name))
-        {
-            *event = eventHash[i];
-
-            if (perfmon_verbose)
-            {
-                fprintf(OUTSTREAM,"Found event %s : \
-                    Event_id 0x%02X Umask 0x%02X CfgBits 0x%02X Cmask 0x%02X \n",
-                        bdata( event_str),
-                        event->eventId,
-                        event->umask,
-                        event->cfgBits,
-                        event->cmask);
-            }
-            return TRUE;
-        }
-    }
-
-    return FALSE;
-}
-
-static void
-initThread(int thread_id, int cpu_id)
-{
-    for (int i=0; i<NUM_PMC; i++)
-    {
-        perfmon_threadData[thread_id].counters[i].init = FALSE;
-    }
-
-    perfmon_threadData[thread_id].processorId = cpu_id;
-    initThreadArch(&perfmon_threadData[thread_id]);
-}
-
-struct cbsScan{
-    /* Parse state */
-    bstring src;
-    int line;
-    LikwidResults* results;
-};
-
-static int lineCb (void* parm, int ofs, int len)
-{
-    int ret;
-    struct cbsScan* st = (struct cbsScan*) parm;
-    struct bstrList* strList;
-    bstring line;
-
-    if (!len) return 1;
-    strList = bstrListCreate();
-
-    line = blk2bstr (st->src->data + ofs, len);
-
-    if (st->line < perfmon_numRegions)
+    if (ret && (ownstrcmp(bdata(reg), counter_map[*index].key) != 0))
     {
-        int id;
-        strList = bsplit(line,':');
-
-        if( strList->qty < 2 )
-        {
-            ERROR_PLAIN_PRINT(Failed to read marker file);
-        }
-        ret = sscanf (bdata(strList->entry[0]), "%d", &id); CHECKERROR;
-        st->results[id].tag = bstrcpy(line);
-	 bdelete(st->results[id].tag, 0, blength(strList->entry[0])+1);
+        *type = NOTYPE;
+        return FALSE;
     }
-    else
+    if (!pci_checkDevice(counter_map[*index].device, 0))
     {
-        int tagId;
-        int threadId;
-
-        strList = bsplit(line,32);
-
-        if( strList->qty < (3+NUM_PMC))
-        {
-            ERROR_PLAIN_PRINT(Failed to read marker file);
-        }
-
-        ret = sscanf(bdata(strList->entry[0]), "%d", &tagId); CHECKERROR;
-        ret = sscanf(bdata(strList->entry[1]), "%d", &threadId); CHECKERROR;
-        ret = sscanf(bdata(strList->entry[2]), "%u", &st->results[tagId].count[threadId]); CHECKERROR;
-        ret = sscanf(bdata(strList->entry[3]), "%lf", &st->results[tagId].time[threadId]); CHECKERROR;
-
-        for (int i=0;i<NUM_PMC; i++)
-        {
-            ret = sscanf(bdata(strList->entry[4+i]), "%lf", &st->results[tagId].counters[threadId][i]); CHECKERROR;
-        }
+        *type = NOTYPE;
+        return FALSE;
     }
-
-    bstrListDestroy(strList);
-    st->line++;
-    bdestroy(line);
-    return 1;
-}
-
-static void
-readMarkerFile(bstring filename, LikwidResults** resultsRef)
-{
-    int numberOfThreads=0;
-    int ret;
-    int i,j,k;
-    struct cbsScan sl;
-    FILE * fp;
-    LikwidResults* results = *resultsRef;
-
-    if (NULL != (fp = fopen (bdata(filename), "r")))
+    if ((ret) && (*type != THERMAL) && (*type != POWER) && (*type != WBOX0FIX))
     {
-        bstring src = bread ((bNread) fread, fp);
-
-        /* read header info */
-        ret = sscanf (bdata(src), "%d %d", &numberOfThreads, &perfmon_numRegions); CHECKERROR;
-        results = (LikwidResults*) malloc(perfmon_numRegions * sizeof(LikwidResults));
-
-        if (perfmon_numRegions == 0)
-        {
-            fprintf(OUTSTREAM,"ERROR: No region results are listed in marker file\n");
-            ERROR_PLAIN_PRINT(No region results in marker file);
-        }
-        else if (numberOfThreads != perfmon_numThreads)
+        int check_settings = 1;
+        uint32_t reg = counter_map[*index].configRegister;
+        if (reg == 0x0)
         {
-            fprintf(OUTSTREAM,"ERROR: Is the number of threads for likwid-perfctr equal to the number in the measured application?\n");
-            fprintf(OUTSTREAM,"likwid_markerInit and likwid_markerClose must be called in serial region.\n");
-
-            ERROR_PRINT(Number of threads %d in marker file unequal to number of threads in likwid-perfCtr %d,numberOfThreads,perfmon_numThreads);
+            reg = counter_map[*index].counterRegister;
+            check_settings = 0;
         }
-
-        /* allocate  LikwidResults struct */
-        for (i=0;i<perfmon_numRegions; i++)
+        err = HPMread(0, counter_map[*index].device, reg, &tmp);
+        if (err != 0)
         {
-            results[i].time = (double*) malloc(numberOfThreads * sizeof(double));
-            results[i].count = (uint32_t*) malloc(numberOfThreads * sizeof(uint32_t));
-            results[i].counters = (double**) malloc(numberOfThreads * sizeof(double*));
-
-            for (j=0;j<numberOfThreads; j++)
+            if (err == -ENODEV)
             {
-                results[i].time[j] = 0.0;
-                results[i].counters[j] = (double*) malloc(NUM_PMC * sizeof(double));
-
-                for (k=0;k<NUM_PMC; k++)
-                {
-                        results[i].counters[j][k] = 0.0;
-                }
+                DEBUG_PRINT(DEBUGLEV_DETAIL, Device %s not accessible on this machine,
+                                         pci_devices[box_map[*type].device].name);
             }
+            else
+            {
+                DEBUG_PRINT(DEBUGLEV_DETAIL, Counter %s not readable on this machine,
+                                             counter_map[*index].key);
+            }
+            *type = NOTYPE;
+            ret = FALSE;
         }
-
-        sl.src = src;
-        sl.line = 0;
-        sl.results = results;
-        bsplitcb (src, (char) '\n', bstrchr(src,10)+1, lineCb, &sl);
-
-        fclose (fp);
-        bdestroy (src);
-    }
-    else
-    {
-        fprintf(OUTSTREAM,"ERROR: The marker result file could not be found!\n");
-        fprintf(OUTSTREAM,"Did you call likwid_markerClose() at the end of your measurement?\n");
-        ERROR;
-    }
-
-    *resultsRef = results;
-    bstring exeString = bformat("rm  -f %s",bdata(filename));
-    ret = system(bdata(exeString));
-
-    if (ret == EOF)
-    {
-        ERROR;
-    }
-
-    bdestroy(exeString);
-}
-
-static void
-printResultTable(PerfmonResultTable * tableData)
-{
-    if (perfmon_csvoutput)
-    {
-        int r, c;
-        for (c = 0; c < tableData->header->qty; c++)
-        {
-            fprintf(OUTSTREAM, "%s%s", ((c == 0) ? "\n" : ","), tableData->header->entry[c]->data);
-        }
-        fprintf(OUTSTREAM, "%s", "\n");
-
-        for (r = 0; r < tableData->numRows; r++)
+        else if (tmp == 0x0)
         {
-            fprintf(OUTSTREAM, "%s", tableData->rows[r].label->data);
-
-            for (c = 0; c < tableData->numColumns; c++)
+            err = HPMwrite(0, counter_map[*index].device, reg, 0x0ULL);
+            if (err != 0)
             {
-                if (!isnan(tableData->rows[r].value[c]))
+                if (err == -ENODEV)
                 {
-                    fprintf(OUTSTREAM, ",%lf", tableData->rows[r].value[c]);
+                    DEBUG_PRINT(DEBUGLEV_DETAIL, Device %s not accessible on this machine,
+                                             pci_devices[box_map[*type].device].name);
                 }
                 else
                 {
-                    fprintf(OUTSTREAM, ",%s", "nan");
+                    DEBUG_PRINT(DEBUGLEV_DETAIL, Counter %s not writeable on this machine,
+                                             counter_map[*index].key);
                 }
+                *type = NOTYPE;
+                ret = FALSE;
             }
-            fprintf(OUTSTREAM, "%s", "\n");
         }
-        fprintf(OUTSTREAM, "%s", "\n");
-    }
-    else
-    {
-        int i,j;
-        TableContainer* table;
-        bstrList* labelStrings = NULL;
-        bstring label = bfromcstr("NO");
-
-        table = asciiTable_allocate(tableData->numRows,
-                tableData->numColumns+1,
-                tableData->header);
-        asciiTable_setOutput(OUTSTREAM);
-
-        labelStrings = bstrListCreate();
-        bstrListAlloc(labelStrings, tableData->numColumns+1);
-
-        for (i=0; i<tableData->numRows; i++)
+        else if (check_settings)
         {
-            labelStrings->qty = 0;
-            labelStrings->entry[0] = bstrcpy(tableData->rows[i].label);
-            labelStrings->qty++;
-
-            for (j=0; j<(tableData->numColumns);j++)
-            {
-                label = bformat("%g", tableData->rows[i].value[j]);
-                labelStrings->entry[1+j] = bstrcpy(label);
-                labelStrings->qty++;
-            }
-            asciiTable_appendRow(table,labelStrings);
+            DEBUG_PRINT(DEBUGLEV_DETAIL, Counter %s has bits set but we ignore it,
+                                             counter_map[*index].key);
         }
-
-        asciiTable_print(table);
-        bdestroy(label);
-        bstrListDestroy(labelStrings);
-        asciiTable_free(table);
     }
-}
-
-static int
-getGroupId(bstring groupStr,PerfmonGroup* group)
-{
-    *group = _NOGROUP;
-
-    for (int i=0; i<perfmon_numGroups; i++)
+    else if ((ret) && ((*type == POWER) || (*type == WBOX0FIX) || (*type == THERMAL)))
     {
-        if (biseqcstr(groupStr,group_map[i].key))
+        err = HPMread(0, MSR_DEV, counter_map[*index].counterRegister, &tmp);
+        if (err != 0)
         {
-            *group = group_map[i].index;
-            return i;
+            DEBUG_PRINT(DEBUGLEV_DETAIL, Counter %s not readable on this machine,
+                                         counter_map[*index].key);
+            *type = NOTYPE;
+            ret = FALSE;
         }
     }
-
-    return -1;
+    else
+    {
+        *type = NOTYPE;
+        ret = FALSE;
+    }
+    return ret;
 }
 
 static int
@@ -467,795 +222,341 @@ checkCounter(bstring counterName, const char* limit)
 {
     int i;
     struct bstrList* tokens;
-    int value = FALSE;
+    int ret = FALSE;
     bstring limitString = bfromcstr(limit);
 
     tokens = bstrListCreate();
     tokens = bsplit(limitString,'|');
-
     for(i=0; i<tokens->qty; i++)
     {
         if(bstrncmp(counterName, tokens->entry[i], blength(tokens->entry[i])))
         {
-            value = FALSE;
+            ret = FALSE;
         }
         else
         {
-            value = TRUE;
+            ret = TRUE;
             break;
         }
     }
-
     bdestroy(limitString);
     bstrListDestroy(tokens);
-    return value;
+    return ret;
 }
 
-static void
-freeResultTable(PerfmonResultTable* tableData)
+static int
+getEvent(bstring event_str, bstring counter_str, PerfmonEvent* event)
 {
-    int i;
-
-    bstrListDestroy(tableData->header);
-
-    for (i=0; i<tableData->numRows; i++)
+    int ret = FALSE;
+    int (*ownstrncmp)(const char *, const char *, size_t);
+    ownstrncmp = &strncmp;
+    for (int i=0; i< perfmon_numArchEvents; i++)
     {
-        free(tableData->rows[i].value);
+        if (biseqcstr(event_str, eventHash[i].name))
+        {
+            if (!checkCounter(counter_str, eventHash[i].limit))
+            {
+                continue;
+            }
+            *event = eventHash[i];
+            ret = TRUE;
+            break;
+        }
     }
 
-    free(tableData->rows);
+    return ret;
 }
 
-static void
-initResultTable(PerfmonResultTable* tableData,
-        bstrList* firstColumn,
-        int numRows,
-        int numColumns)
+static int 
+assignOption(PerfmonEvent* event, bstring entry, int index, EventOptionType type, int zero_value)
 {
-    int i;
-    bstrList* header;
-    bstring label;
-
-    header = bstrListCreate();
-    bstrListAlloc(header, numColumns+1);
-    header->entry[0] = bstrcpy(firstColumn->entry[0]); header->qty++;
-
-    for (i=0; i<perfmon_numThreads;i++)
+    int found_double = -1;
+    int return_index = index;
+    long long unsigned int value;
+    for (int k = 0; k < index; k++)
     {
-        label = bformat("core %d",perfmon_threadData[i].processorId);
-        header->entry[1+i] = bstrcpy(label); header->qty++;
+        if (event->options[k].type == type)
+        {
+            found_double = k;
+            break;
+        }
     }
-
-    tableData->numRows = numRows;
-    tableData->numColumns = numColumns;
-    tableData->header = header;
-    tableData->rows = (PerfmonResult*) malloc(numRows*sizeof(PerfmonResult));
-
-    for (i=0; i<numRows; i++)
+    if (found_double >= 0)
     {
-        tableData->rows[i].label = firstColumn->entry[1+i];
-        tableData->rows[i].value =
-            (double*) malloc((numColumns)*sizeof(double));
+        index = found_double;
     }
-}
-
-static void
-initStatisticTable(PerfmonResultTable* tableData,
-        bstrList* firstColumn,
-        int numRows)
-{
-    int i;
-    int numColumns = 4;
-    bstrList* header;
-    bstring label;
-
-    header = bstrListCreate();
-    bstrListAlloc(header, numColumns+1);
-    header->entry[0] = bstrcpy(firstColumn->entry[0]); header->qty++;
-
-    label = bformat("Sum");
-    header->entry[1] = bstrcpy(label); header->qty++;
-    label = bformat("Max");
-    header->entry[2] = bstrcpy(label); header->qty++;
-    label = bformat("Min");
-    header->entry[3] = bstrcpy(label); header->qty++;
-    label = bformat("Avg");
-    header->entry[4] = bstrcpy(label); header->qty++;
-
-    tableData->numRows = numRows;
-    tableData->numColumns = numColumns;
-    tableData->header = header;
-    tableData->rows = (PerfmonResult*) malloc(numRows*sizeof(PerfmonResult));
-
-    for (i=0; i<numRows; i++)
-    {
-        tableData->rows[i].label = firstColumn->entry[1+i];
-        bcatcstr(tableData->rows[i].label," STAT");
-        tableData->rows[i].value =
-            (double*) malloc((numColumns)*sizeof(double));
+    else
+    {
+        return_index++;
     }
-}
-
-static void printDerivedMetricsFixed(void)
-{
-    int threadId;
-    double time = rdtscTime;
-    double inverseClock = 1.0 /(double) timer_getCpuClock();
-    PerfmonResultTable tableData;
-    int numRows;
-    int numColumns = perfmon_numThreads;
-    bstring label;
-    bstrList* fc;
-    double tmpValue;
-
-    numRows = 4;
-    INIT_BASIC;
-
-    bstrListAdd(fc,1,Runtime (RDTSC) [s]);
-    bstrListAdd(fc,2,Runtime unhalted [s]);
-    bstrListAdd(fc,3,Clock [MHz]);
-    bstrListAdd(fc,4,CPI);
-
-    initResultTable(&tableData, fc, numRows, numColumns);
-
-    for(threadId=0; threadId < perfmon_numThreads; threadId++)
-    {
-        tmpValue = time;
-        if (!isnan(tmpValue))
-        {
-            tableData.rows[0].value[threadId] = tmpValue;
-        }
-        else
-        {
-            tableData.rows[0].value[threadId] = 0.0;
-        }
-
-        tmpValue = perfmon_getResult(threadId,"FIXC1")*inverseClock;
-        if (!isnan(tmpValue))
-        {
-            tableData.rows[1].value[threadId] = tmpValue;
-        }
-        else
-        {
-            tableData.rows[1].value[threadId] = 0.0;
-        }
-
-        tmpValue = 1.E-06*(perfmon_getResult(threadId,"FIXC1")/perfmon_getResult(threadId,"FIXC2"))/inverseClock;
-        if (!isnan(tmpValue))
-        {
-            tableData.rows[2].value[threadId] = tmpValue;
-        }
-        else
-        {
-            tableData.rows[2].value[threadId] = 0.0;
-        }
-
-        tmpValue = perfmon_getResult(threadId,"FIXC1")/perfmon_getResult(threadId,"FIXC0");
-        if (!isnan(tmpValue))
-        {
-            tableData.rows[3].value[threadId] = tmpValue;
-        }
-        else
-        {
-            tableData.rows[3].value[threadId] = 0.0;
-        }
-
+    event->options[index].type = type;
+    if (zero_value)
+    {
+        event->options[index].value = 0;
     }
-    printResultTable(&tableData);
-    freeResultTable(&tableData);
-}
-
-/* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
-
-void
-perfmon_setCSVMode(int v)
-{
-    perfmon_csvoutput = v;
-}
-
-void
-perfmon_printCounters(void)
-{
-    fprintf(OUTSTREAM,"This architecture has %d counters.\n", perfmon_numCounters);
-    fprintf(OUTSTREAM,"Counters names:  ");
-
-    for (int i=0; i<perfmon_numCounters; i++)
+    else
     {
-        fprintf(OUTSTREAM,"%s\t",counter_map[i].key);
+        value = 0;
+        sscanf(bdata(entry), "%llx", &value);
+        event->options[index].value = value;
     }
-    fprintf(OUTSTREAM,".\n");
+    return return_index;
 }
 
-void
-perfmon_printEvents(void)
+static int
+parseOptions(struct bstrList* tokens, PerfmonEvent* event, RegisterIndex index)
 {
-    int i;
+    int i,j;
+    struct bstrList* subtokens;
 
-    fprintf(OUTSTREAM,"This architecture has %d events.\n", perfmon_numArchEvents);
-    fprintf(OUTSTREAM,"Event tags (tag, id, umask, counters):\n");
-
-    for (i=0; i<perfmon_numArchEvents; i++)
+    for (i = event->numberOfOptions; i < MAX_EVENT_OPTIONS; i++)
     {
-        fprintf(OUTSTREAM,"%s, 0x%X, 0x%X, %s \n",
-                eventHash[i].name,
-                eventHash[i].eventId,
-                eventHash[i].umask,
-                eventHash[i].limit);
+        event->options[i].type = EVENT_OPTION_NONE;
     }
-}
-
-
-double
-perfmon_getResult(int threadId, char* counterString)
-{
-    bstring counter = bfromcstr(counterString);
-    PerfmonCounterIndex  index;
 
-   if (getIndex(counter,&index))
-   {
-           return perfmon_threadData[threadId].counters[index].counterData;
-   }
-
-   fprintf (stderr, "perfmon_getResult: Failed to get counter Index!\n" );
-   return 0.0;
-}
-
-
-void
-perfmon_initEventSet(StrUtilEventSet* eventSetConfig, PerfmonEventSet* set)
-{
-    set->numberOfEvents = eventSetConfig->numberOfEvents;
-    set->events = (PerfmonEventSetEntry*)
-        malloc(set->numberOfEvents * sizeof(PerfmonEventSetEntry));
-
-    for (int i=0; i<set->numberOfEvents; i++)
+    if (tokens->qty-2 > MAX_EVENT_OPTIONS)
     {
-        /* get register index */
-        if (!getIndex(eventSetConfig->events[i].counterName,
-                    &set->events[i].index))
-        {
-            ERROR_PRINT(Counter register %s not supported,bdata(
-                  eventSetConfig->events[i].counterName));
-        }
-
-        /* setup event */
-        if (!getEvent(eventSetConfig->events[i].eventName,
-                    &set->events[i].event))
-        {
-            ERROR_PRINT(Event %s not found for current architecture,
-                bdata(eventSetConfig->events[i].eventName));
-        }
-
-        /* is counter allowed for event */
-        if (!checkCounter(eventSetConfig->events[i].counterName,
-                    set->events[i].event.limit))
-        {
-            ERROR_PRINT(Register not allowed  for event  %s,
-                bdata(eventSetConfig->events[i].eventName));
-        }
+        bstrListDestroy(tokens);
+        return -ERANGE;
     }
-}
 
-void
-perfmon_printMarkerResults(bstring filepath)
-{
-    int i;
-    int j;
-    int region;
-    LikwidResults* results = NULL;
-    PerfmonResultTable tableData;
-    PerfmonResultTable regionData;
-    int numRows = perfmon_set.numberOfEvents;
-    int numColumns = perfmon_numThreads;
-    bstrList* fc;
-    bstrList* regionLabels;
-    bstring label;
-    INIT_EVENTS;
-
-    readMarkerFile(filepath, &results);
-    initResultTable(&tableData, fc, numRows, numColumns);
-    regionLabels = bstrListCreate();
-    bstrListAlloc(regionLabels, 3);
-    bstrListAdd(regionLabels, 0, Region Info);
-    bstrListAdd(regionLabels, 1, RDTSC Runtime [s]);
-    bstrListAdd(regionLabels, 2, call count);
-
-    for (region=0; region<perfmon_numRegions; region++)
-    {
-        initResultTable(&tableData, fc, numRows, numColumns);
-        fprintf(OUTSTREAM,"\n=====================\n");
-        fprintf(OUTSTREAM,"Region: %s \n", bdata(results[region].tag));
-        fprintf(OUTSTREAM,"=====================\n");
-        initResultTable(&regionData, regionLabels, 2, numColumns);
-
-        for (j=0; j<numColumns; j++)
-        {
-            regionData.rows[0].value[j] = results[region].time[j];
-            regionData.rows[1].value[j] = (double) results[region].count[j];
-        }
-        printResultTable(&regionData);
+    subtokens = bstrListCreate();
 
-        for (i=0; i<numRows; i++)
+    for (i=2;i<tokens->qty;i++)
+    {
+        subtokens = bsplit(tokens->entry[i],'=');
+        btolower(subtokens->entry[0]);
+        if (subtokens->qty == 1)
         {
-            for (j=0; j<numColumns; j++)
+            if (biseqcstr(subtokens->entry[0], "edgedetect") == 1)
             {
-                tableData.rows[i].value[j] =
-                    results[region].counters[j][perfmon_set.events[i].index];
+                event->numberOfOptions = assignOption(event, subtokens->entry[1],
+                                    event->numberOfOptions, EVENT_OPTION_EDGE, 1);
             }
-        }
-
-        printResultTable(&tableData);
-
-        for (j=0; j<numColumns; j++)
-        {
-            for (i=0; i<numRows; i++)
+            else if (biseqcstr(subtokens->entry[0], "invert") == 1)
             {
-                perfmon_threadData[j].counters[perfmon_set.events[i].index].counterData =
-                    results[region].counters[j][perfmon_set.events[i].index];
+                event->numberOfOptions = assignOption(event, subtokens->entry[1],
+                                    event->numberOfOptions, EVENT_OPTION_INVERT, 1);
             }
+            else if (biseqcstr(subtokens->entry[0], "kernel") == 1)
+            {
+                event->numberOfOptions = assignOption(event, subtokens->entry[1],
+                                    event->numberOfOptions, EVENT_OPTION_COUNT_KERNEL, 1);
+            }
+            else if (biseqcstr(subtokens->entry[0], "anythread") == 1)
+            {
+                event->numberOfOptions = assignOption(event, subtokens->entry[1],
+                                    event->numberOfOptions, EVENT_OPTION_ANYTHREAD, 1);
+            }
+            else if (biseqcstr(subtokens->entry[0], "occ_edgedetect") == 1)
+            {
+                event->numberOfOptions = assignOption(event, subtokens->entry[1],
+                                    event->numberOfOptions, EVENT_OPTION_OCCUPANCY_EDGE, 1);
+            }
+            else if (biseqcstr(subtokens->entry[0], "occ_invert") == 1)
+            {
+                event->numberOfOptions = assignOption(event, subtokens->entry[1],
+                                    event->numberOfOptions, EVENT_OPTION_OCCUPANCY_INVERT, 1);
+            }
+            else if (biseqcstr(subtokens->entry[0], "in_trans") == 1)
+            {
+                event->numberOfOptions = assignOption(event, subtokens->entry[1],
+                                    event->numberOfOptions, EVENT_OPTION_IN_TRANS, 1);
+            }
+            else if (biseqcstr(subtokens->entry[0], "in_trans_aborted") == 1)
+            {
+                event->numberOfOptions = assignOption(event, subtokens->entry[1],
+                                    event->numberOfOptions, EVENT_OPTION_IN_TRANS_ABORT, 1);
+            }
+            else
+            {
+                continue;
+            }
+            event->options[event->numberOfOptions].value = 0;
         }
-        rdtscTime = results[region].time[0];
-        if (groupSet != _NOGROUP)
-        {
-            printDerivedMetrics(groupSet);
-        }
-        else if ( cpuid_info.family == P6_FAMILY )
-        {
-            printDerivedMetricsFixed();
-        }
-    }
-
-    for (i=0;i<perfmon_numRegions; i++)
-    {
-        for (j=0;j<perfmon_numThreads; j++)
+        else if (subtokens->qty == 2)
         {
-            free(results[i].counters[j]);
-        }
-
-        free(results[i].counters);
-        free(results[i].time);
-    }
-
-    freeResultTable(&tableData);
-    freeResultTable(&regionData);
-    bstrListDestroy(fc);
-    bstrListDestroy(regionLabels);
-}
-
-void
-perfmon_logCounterResults(double time)
-{
-    int i;
-    int j;
-    double tmp;
-    static double timeStamp = 0.0;
-
-    timeStamp += time;
-
-    for (i=0; i<perfmon_set.numberOfEvents; i++)
-    {
-        fprintf(OUTSTREAM, "%s %e ", perfmon_set.events[i].event.name, timeStamp);
-        for (j=0; j<perfmon_numThreads; j++)
-        {
-            fprintf(OUTSTREAM, "%e ",
-                    (double) (perfmon_threadData[j].counters[perfmon_set.events[i].index].counterData) - perfmon_threadState[j][perfmon_set.events[i].index]);
-            perfmon_threadState[j][perfmon_set.events[i].index] = perfmon_threadData[j].counters[perfmon_set.events[i].index].counterData;
-        }
-        fprintf(OUTSTREAM,"\n");
-    }
-
-    if (groupSet != _NOGROUP)
-    {
-        logDerivedMetrics(groupSet, time, timeStamp);
-    }
-
-    fflush(OUTSTREAM);
-}
-
-void
-perfmon_printCounterResults()
-{
-    int i;
-    int j;
-    PerfmonResultTable tableData;
-    int numRows = perfmon_set.numberOfEvents;
-    int numColumns = perfmon_numThreads;
-    double stat[perfmon_set.numberOfEvents][4]; /* 0:sum, 1:max, 2:min, 3:avg */
-    bstrList* fc;
-    bstring label;
-    INIT_EVENTS;
-
-    for (i=0; i<numRows; i++)
-    {
-        stat[i][0] = 0;
-        stat[i][1] = 0;
-        stat[i][2] = DBL_MAX;
-    }
-
-    initResultTable(&tableData, fc, numRows, numColumns);
-
-    /* print raw event data */
-    for (i=0; i<numRows; i++)
-    {
-        for (j=0; j<numColumns; j++)
-        {
-            tableData.rows[i].value[j] =
-                (double) perfmon_threadData[j].counters[perfmon_set.events[i].index].counterData;
-            stat[i][0] +=
-                (double) perfmon_threadData[j].counters[perfmon_set.events[i].index].counterData;
-            stat[i][1] =  MAX(stat[i][1],
-                    (double) perfmon_threadData[j].counters[perfmon_set.events[i].index].counterData);
-            stat[i][2] =  MIN(stat[i][2],
-                    (double) perfmon_threadData[j].counters[perfmon_set.events[i].index].counterData);
-        }
-    }
-    printResultTable(&tableData);
-    freeResultTable(&tableData);
-
-
-    /* for threaded results print sum, max, min and avg */
-    if (perfmon_numThreads > 1)
-    {
-        initStatisticTable(&tableData, fc, numRows);
-
-        for (i=0; i<numRows; i++)
-        {
-            stat[i][3] =  stat[i][0]/perfmon_numThreads;
-
-            for (j=0; j<4; j++)
+            if (biseqcstr(subtokens->entry[0], "opcode") == 1)
+            {
+                event->numberOfOptions = assignOption(event, subtokens->entry[1],
+                                    event->numberOfOptions, EVENT_OPTION_OPCODE, 0);
+            }
+            else if (biseqcstr(subtokens->entry[0], "match0") == 1)
+            {
+                event->numberOfOptions = assignOption(event, subtokens->entry[1],
+                                    event->numberOfOptions, EVENT_OPTION_MATCH0, 0);
+            }
+            else if (biseqcstr(subtokens->entry[0], "match1") == 1)
+            {
+                event->numberOfOptions = assignOption(event, subtokens->entry[1],
+                                    event->numberOfOptions, EVENT_OPTION_MATCH1, 0);
+            }
+            else if (biseqcstr(subtokens->entry[0], "match2") == 1)
+            {
+                event->numberOfOptions = assignOption(event, subtokens->entry[1],
+                                    event->numberOfOptions, EVENT_OPTION_MATCH2, 0);
+            }
+            else if (biseqcstr(subtokens->entry[0], "match3") == 1)
+            {
+                event->numberOfOptions = assignOption(event, subtokens->entry[1],
+                                    event->numberOfOptions, EVENT_OPTION_MATCH3, 0);
+            }
+            else if (biseqcstr(subtokens->entry[0], "mask0") == 1)
+            {
+                event->numberOfOptions = assignOption(event, subtokens->entry[1],
+                                    event->numberOfOptions, EVENT_OPTION_MASK0, 0);
+            }
+            else if (biseqcstr(subtokens->entry[0], "mask1") == 1)
+            {
+                event->numberOfOptions = assignOption(event, subtokens->entry[1],
+                                    event->numberOfOptions, EVENT_OPTION_MASK1, 0);
+            }
+            else if (biseqcstr(subtokens->entry[0], "mask2") == 1)
+            {
+                event->numberOfOptions = assignOption(event, subtokens->entry[1],
+                                    event->numberOfOptions, EVENT_OPTION_MASK2, 0);
+            }
+            else if (biseqcstr(subtokens->entry[0], "mask3") == 1)
+            {
+                event->numberOfOptions = assignOption(event, subtokens->entry[1],
+                                    event->numberOfOptions, EVENT_OPTION_MASK3, 0);
+            }
+            else if (biseqcstr(subtokens->entry[0], "nid") == 1)
+            {
+                event->numberOfOptions = assignOption(event, subtokens->entry[1],
+                                    event->numberOfOptions, EVENT_OPTION_NID, 0);
+            }
+            else if (biseqcstr(subtokens->entry[0], "tid") == 1)
+            {
+                event->numberOfOptions = assignOption(event, subtokens->entry[1],
+                                    event->numberOfOptions, EVENT_OPTION_TID, 0);
+            }
+            else if (biseqcstr(subtokens->entry[0], "state") == 1)
+            {
+                event->numberOfOptions = assignOption(event, subtokens->entry[1],
+                                    event->numberOfOptions, EVENT_OPTION_STATE, 0);
+            }
+            else if (biseqcstr(subtokens->entry[0], "threshold") == 1)
+            {
+                event->numberOfOptions = assignOption(event, subtokens->entry[1],
+                                    event->numberOfOptions, EVENT_OPTION_THRESHOLD, 0);
+            }
+            else if (biseqcstr(subtokens->entry[0], "occupancy") == 1)
             {
-                tableData.rows[i].value[j] = stat[i][j];
+                event->numberOfOptions = assignOption(event, subtokens->entry[1],
+                                    event->numberOfOptions, EVENT_OPTION_OCCUPANCY, 0);
+            }
+            else if (biseqcstr(subtokens->entry[0], "occ_filter") == 1)
+            {
+                event->numberOfOptions = assignOption(event, subtokens->entry[1],
+                                    event->numberOfOptions, EVENT_OPTION_OCCUPANCY_FILTER, 0);
+            }
+            else
+            {
+                continue;
             }
         }
-        printResultTable(&tableData);
-        freeResultTable(&tableData);
     }
-
-    if (groupSet != _NOGROUP)
-    {
-        /* print derived metrics */
-        printDerivedMetrics(groupSet);
-    }
-    else if ( cpuid_info.family == P6_FAMILY )
+    for(i=event->numberOfOptions-1;i>=0;i--)
     {
-        printDerivedMetricsFixed();
-    }
-}
-
-double
-perfmon_getEventResult(int thread, int index)
-{
-    return (double) perfmon_threadData[thread].counters[perfmon_set.events[index].index].counterData;
-}
-
-EventSetup perfmon_prepareEventSetup(char* eventGroupString){
-     EventSetup setup;
-     bstring eventString = bfromcstr(eventGroupString);
-
-     setup.eventSetConfig = malloc(sizeof(setup.eventSetConfig));
-     setup.perfmon_set = malloc(sizeof(setup.perfmon_set));
-
-     int groupId = getGroupId(eventString, & setup.groupSet);
-     setup.groupName = strdup(eventGroupString);
-     setup.groupIndex = groupId;
-     if (setup.groupSet == _NOGROUP)
-     {
-        /* eventString is a custom eventSet */
-        bstr_to_eventset(setup.eventSetConfig, eventString);
-     }
-     else
-     {
-        /* eventString is a group */
-        eventString = bfromcstr(group_map[groupId].config);
-        bstr_to_eventset(setup.eventSetConfig, eventString);
-     }
-
-     perfmon_initEventSet(setup.eventSetConfig, setup.perfmon_set);
-     bdestroy(eventString);
-
-     setup.eventNames = (const char**) malloc(setup.perfmon_set->numberOfEvents * sizeof(const char*));
-
-     setup.numberOfEvents = setup.perfmon_set->numberOfEvents;
-     for (int i=0; i< setup.perfmon_set->numberOfEvents; i++)
-     {
-        setup.eventNames[i] = setup.perfmon_set->events[i].event.name;
-     }
-
-     setup.numberOfDerivedCounters = group_map[groupId].derivedCounters;
-     setup.derivedNames = (const char**) malloc(setup.numberOfDerivedCounters * sizeof(const char*));
-
-     for(int i=0; i < group_map[groupId].derivedCounters; i++){
-        setup.derivedNames[i] = group_map[groupId].derivedCounterNames[i];
-     }
-
-     return setup;
-}
-
-
-void perfmon_setupCountersForEventSet(EventSetup * setup){
-    perfmon_set = *setup->perfmon_set;
-    groupSet = setup->groupSet;
-    eventSetup = setup;
-    perfmon_setupCounters();
-}
-
-void perfmon_getEventCounterValues(uint64_t * values, uint64_t * out_max, uint64_t * out_min){
-
-    for(int e = 0; e < perfmon_set.numberOfEvents; e++ ){
-        uint64_t sum = 0;
-        uint64_t min = (uint64_t) -1;
-        uint64_t max = 0;
-
-        for(int i = 0; i < perfmon_numThreads; i++){
-            uint64_t cur = perfmon_threadData[i].counters[e].counterData;
-            sum += cur;
-            max = max > cur ? max : cur;
-            min = min < cur ? min : cur;
+        if (!(OPTIONS_TYPE_MASK(event->options[i].type) & (counter_map[index].optionMask|event->optionMask)))
+        {
+            DEBUG_PRINT(DEBUGLEV_INFO,Removing Option %s not valid for register %s,
+                        eventOptionTypeName[event->options[i].type],
+                        counter_map[index].key);
+            event->options[i].type = EVENT_OPTION_NONE;
+            event->numberOfOptions--;
         }
-        values[e] = sum / perfmon_numThreads;
-        out_min[e] = min;
-        out_max[e] = max;
-    }
-}
-
-void perfmon_getDerivedCounterValues(float * values, float * out_max, float * out_min){
-    perfmon_getDerivedCounterValuesArch(eventSetup->groupSet, values, out_max, out_min);
-}
-
-int
-perfmon_setupEventSetC(char* eventCString, const char*** eventnames)
-{
-     int i;
-     bstring eventString = bfromcstr(eventCString);
-     StrUtilEventSet eventSetConfig;
-     int groupId;
-
-     groupId = getGroupId(eventString, &groupSet);
-     if (groupSet == _NOGROUP)
-     {
-        /* eventString is a custom eventSet */
-        bstr_to_eventset(&eventSetConfig, eventString);
-     }
-     else
-     {
-        /* eventString is a group */
-        eventString = bfromcstr(group_map[groupId].config);
-        bstr_to_eventset(&eventSetConfig, eventString);
-     }
-
-     perfmon_initEventSet(&eventSetConfig, &perfmon_set);
-     perfmon_setupCounters();
-     bdestroy(eventString);
-
-     (*eventnames) = (const char**) malloc(perfmon_set.numberOfEvents * sizeof(const char*));
-
-     for (i=0; i<perfmon_set.numberOfEvents; i++)
-     {
-         (*eventnames)[i] = perfmon_set.events[i].event.name;
-     }
-
-     return perfmon_set.numberOfEvents;
-}
-
-void
-perfmon_setupEventSet(bstring eventString, BitMask* counterMask)
-{
-    int groupId;
-    int eventBool = FALSE;
-    StrUtilEventSet eventSetConfig;
-    PerfmonEvent eventSet;
-    struct bstrList* subStr;
-
-    groupId = getGroupId(eventString, &groupSet);
-
-    if (groupSet == _NOGROUP)
-    {
-        subStr = bstrListCreate();
-        subStr = bsplit(eventString,':');
-        eventBool = getEvent(subStr->entry[0], &eventSet);
-        bstrListDestroy(subStr);
     }
 
-    if (groupSet == _NOGROUP && eventBool != FALSE)
+    for(i=0;i<event->numberOfOptions;i++)
     {
-        /* eventString is a custom eventSet */
-        /* append fixed counters for Intel processors */
-        if ( cpuid_info.family == P6_FAMILY )
+        if (event->options[i].type == EVENT_OPTION_EDGE)
         {
-            if (cpuid_info.perf_num_fixed_ctr > 0)
+            int threshold_set = FALSE;
+            for (j=0;j<event->numberOfOptions;j++)
             {
-                bcatcstr(eventString,",INSTR_RETIRED_ANY:FIXC0");
+                if (event->options[i].type == EVENT_OPTION_THRESHOLD)
+                {
+                    threshold_set = TRUE;
+                    break;
+                }
             }
-            if (cpuid_info.perf_num_fixed_ctr > 1)
+            if ((threshold_set == FALSE) && (event->numberOfOptions < MAX_EVENT_OPTIONS))
             {
-                bcatcstr(eventString,",CPU_CLK_UNHALTED_CORE:FIXC1");
+                event->options[event->numberOfOptions].type = EVENT_OPTION_THRESHOLD;
+                event->options[event->numberOfOptions].value = 0x1;
+                event->numberOfOptions++;
             }
-            if (cpuid_info.perf_num_fixed_ctr > 2)
+            else
             {
-                bcatcstr(eventString,",CPU_CLK_UNHALTED_REF:FIXC2");
+                ERROR_PLAIN_PRINT(Cannot set threshold option to default. no more space in options list);
             }
         }
-        bstr_to_eventset(&eventSetConfig, eventString);
-    }
-    else if (groupId < 0 && eventBool == FALSE)
-    {
-        ERROR_PLAIN_PRINT(Unsupported group or event for this architecture!);
-        exit(EXIT_FAILURE);
-    }
-    else
-    {
-        if ( group_map[groupId].isUncore )
+        else if (event->options[i].type == EVENT_OPTION_OCCUPANCY)
         {
-            if ( (cpuid_info.model != SANDYBRIDGE_EP) &&
-                    (cpuid_info.model != IVYBRIDGE_EP) &&
-                    (cpuid_info.model != WESTMERE_EX) &&
-                    (cpuid_info.model != NEHALEM_EX))
+            int threshold_set = FALSE;
+            int edge_set = FALSE;
+            int invert_set = FALSE;
+            for (j=0;j<event->numberOfOptions;j++)
             {
-                ERROR_PLAIN_PRINT(Uncore not supported on Desktop processors!);
-                exit(EXIT_FAILURE);
+                if (event->options[i].type == EVENT_OPTION_THRESHOLD)
+                {
+                    threshold_set = TRUE;
+                    break;
+                }
+                if (event->options[i].type == EVENT_OPTION_EDGE)
+                {
+                    edge_set = TRUE;
+                    break;
+                }
+                if (event->options[i].type == EVENT_OPTION_INVERT)
+                {
+                    invert_set = TRUE;
+                    break;
+                }
             }
-        }
-
-        fprintf(OUTSTREAM,"Measuring group %s\n", group_map[groupId].key);
-        /* eventString is a group */
-        eventString = bfromcstr(group_map[groupId].config);
-        bstr_to_eventset(&eventSetConfig, eventString);
-    }
-
-    perfmon_initEventSet(&eventSetConfig, &perfmon_set);
-    perfmon_setupCounters();
-
-    if ( counterMask != NULL )
-    {
-        bitMask_init((*counterMask));
-        /* Extract counter mask from first thread */
-        for (int index=0; index<perfmon_numCounters; index++)
-        {
-            if ( perfmon_threadData[0].counters[index].init == TRUE )
+            if ((threshold_set == FALSE) && (event->numberOfOptions < MAX_EVENT_OPTIONS) && 
+                (edge_set == TRUE || invert_set == TRUE ))
             {
-                bitMask_set((*counterMask),index);
+                event->options[event->numberOfOptions].type = EVENT_OPTION_THRESHOLD;
+                event->options[event->numberOfOptions].value = 0x1;
+                event->numberOfOptions++;
+            }
+            else
+            {
+                ERROR_PLAIN_PRINT(Cannot set threshold option to default. no more space in options list);
             }
         }
     }
-}
-
-
-void
-perfmon_setupCounters()
-{
-    for (int j=0; j<perfmon_set.numberOfEvents; j++)
-    {
-        for (int i=0; i<perfmon_numThreads; i++)
-        {
-            perfmon_setupCounterThread(i,
-                    &perfmon_set.events[j].event,
-                    perfmon_set.events[j].index);
-        }
-    }
-}
-
-void
-perfmon_startCounters(void)
-{
-    for (int i=0;i<perfmon_numThreads;i++)
-    {
-        perfmon_startCountersThread(i);
-    }
-
-    timer_start(&timeData);
-}
-
-void
-perfmon_stopCounters(void)
-{
-    int i;
-
-    timer_stop(&timeData);
-
-    for (i=0;i<perfmon_numThreads;i++)
-    {
-        perfmon_stopCountersThread(i);
-    }
-
-    rdtscTime = timer_print(&timeData);
-}
-
-void
-perfmon_readCounters(void)
-{
-    int i;
 
-    for (i=0;i<perfmon_numThreads;i++)
-    {
-        perfmon_readCountersThread(i);
-    }
+    bstrListDestroy(subtokens);
+    return event->numberOfOptions;
 }
 
-
-void
-perfmon_printAvailableGroups()
+int
+getCounterTypeOffset(int index)
 {
-    int i;
-
-    fprintf(OUTSTREAM,"Available groups on %s:\n",cpuid_info.name);
-
-    for(i=0; i<perfmon_numGroups; i++)
+    int off = 0;
+    for (int j=index-1;j>=NUM_COUNTERS_CORE_IVYBRIDGE;j--)
     {
-        if ( group_map[i].isUncore )
+        if (counter_map[index].type == counter_map[j].type)
         {
-            if ( (cpuid_info.model == SANDYBRIDGE_EP) ||
-                 (cpuid_info.model == IVYBRIDGE_EP) ||
-                 (cpuid_info.model == WESTMERE_EX) ||
-                 (cpuid_info.model == NEHALEM_EX))
-            {
-                fprintf(OUTSTREAM,"%s: %s\n",group_map[i].key,
-                        group_map[i].info);
-            }
+            off++;
         }
         else
         {
-            fprintf(OUTSTREAM,"%s: %s\n",group_map[i].key,
-                    group_map[i].info);
+            break;
         }
     }
+    return off;
 }
 
-void
-perfmon_printGroupHelp(bstring group)
-{
-    int i;
-    PerfmonGroup groupDummy;
-
-    if ((i = getGroupId(group,&groupDummy))<0)
-    {
-        ERROR_PLAIN_PRINT(Group not found);
-    }
-    else
-    {
-        fprintf(OUTSTREAM,"Group %s:\n",bdata(group));
-        fprintf(OUTSTREAM,"%s",group_help[i].msg);
-    }
-}
-
-
 
 void
-perfmon_init(int numThreads_local, int threads[], FILE* outstream)
+perfmon_init_maps(void)
 {
-    if (!lock_check())
-    {
-        fprintf(stderr,"Access to performance counters is locked.\n");
-        exit(EXIT_FAILURE);
-    }
-
-    perfmon_numThreads = numThreads_local;
-    perfmon_threadData = (PerfmonThread*)
-        malloc(perfmon_numThreads * sizeof(PerfmonThread));
-    /* This is specific for daemon mode. */
-    perfmon_threadState = (double**)
-        malloc(perfmon_numThreads * sizeof(double*));
-
-    for (int i=0; i<perfmon_numThreads; i++)
-    {
-        perfmon_threadState[i] = (double*)
-            malloc(NUM_PMC * sizeof(double));
-        for(int j=0; j<NUM_PMC;j++)
-        {
-            perfmon_threadState[i][j] = 0.0;
-        }
-    }
-
-    OUTSTREAM = outstream;
-
-    for(int i=0; i<MAX_NUM_NODES; i++) socket_lock[i] = LOCK_INIT;
-
-    if (accessClient_mode != DAEMON_AM_DIRECT)
-    {
-        accessClient_init(&socket_fd);
-    }
-
-    msr_init(socket_fd);
-
+    box_map = NULL;
     switch ( cpuid_info.family )
     {
         case P6_FAMILY:
@@ -1263,75 +564,37 @@ perfmon_init(int numThreads_local, int threads[], FILE* outstream)
             switch ( cpuid_info.model )
             {
                 case PENTIUM_M_BANIAS:
-
                 case PENTIUM_M_DOTHAN:
-
                     eventHash = pm_arch_events;
                     perfmon_numArchEvents = perfmon_numArchEvents_pm;
-
-                    group_map = pm_group_map;
-                 //   group_help = pm_group_help;
-                    perfmon_numGroups = perfmon_numGroups_pm;
-
                     counter_map = pm_counter_map;
+                    box_map = pm_box_map;
                     perfmon_numCounters = perfmon_numCounters_pm;
-
-                    initThreadArch = perfmon_init_pm;
-                    printDerivedMetrics = perfmon_printDerivedMetrics_pm;
-                    assert(FALSE && "NOT SUPPORTED");
-                    perfmon_startCountersThread = perfmon_startCountersThread_pm;
-                    perfmon_stopCountersThread = perfmon_stopCountersThread_pm;
-                    perfmon_setupCounterThread = perfmon_setupCounterThread_pm;
                     break;
 
                 case ATOM_45:
-
                 case ATOM_32:
-
                 case ATOM_22:
-
                 case ATOM:
-
                     eventHash = atom_arch_events;
                     perfmon_numArchEvents = perfmon_numArchEventsAtom;
-
-                    group_map = atom_group_map;
-                    group_help = atom_group_help;
-                    perfmon_numGroups = perfmon_numGroupsAtom;
-
                     counter_map = core2_counter_map;
                     perfmon_numCounters = perfmon_numCountersCore2;
-
-                    initThreadArch = perfmon_init_core2;
-                    printDerivedMetrics = perfmon_printDerivedMetricsAtom;
-                    perfmon_getDerivedCounterValuesArch = perfmon_getDerivedCounterValuesAtom;
-                    perfmon_startCountersThread = perfmon_startCountersThread_core2;
-                    perfmon_stopCountersThread = perfmon_stopCountersThread_core2;
-                    perfmon_setupCounterThread = perfmon_setupCounterThread_core2;
+                    box_map = core2_box_map;
                     break;
 
-                case ATOM_SILVERMONT_C:
                 case ATOM_SILVERMONT_E:
-                case ATOM_SILVERMONT_F1:
-                case ATOM_SILVERMONT_F2:
-                case ATOM_SILVERMONT_F3:
-                    power_init(0);
-                    thermal_init(0);
+                case ATOM_SILVERMONT_C:
+                case ATOM_SILVERMONT_Z1:
+                case ATOM_SILVERMONT_Z2:
+                case ATOM_SILVERMONT_F:
+                case ATOM_SILVERMONT_AIR:
                     eventHash = silvermont_arch_events;
                     perfmon_numArchEvents = perfmon_numArchEventsSilvermont;
-
-                    group_map = silvermont_group_map;
-                    group_help = silvermont_group_help;
-                    perfmon_numGroups = perfmon_numGroupsSilvermont;
-
                     counter_map = silvermont_counter_map;
+                    box_map = silvermont_box_map;
                     perfmon_numCounters = perfmon_numCountersSilvermont;
-
-                    initThreadArch = perfmon_init_silvermont;
-                    printDerivedMetrics = perfmon_printDerivedMetricsSilvermont;
-                    perfmon_startCountersThread = perfmon_startCountersThread_silvermont;
-                    perfmon_stopCountersThread = perfmon_stopCountersThread_silvermont;
-                    perfmon_setupCounterThread = perfmon_setupCounterThread_silvermont;
+                    perfmon_numCoreCounters = perfmon_numCoreCountersSilvermont;
                     break;
 
                 case CORE_DUO:
@@ -1339,216 +602,334 @@ perfmon_init(int numThreads_local, int threads[], FILE* outstream)
                     break;
 
                 case XEON_MP:
-
                 case CORE2_65:
-
                 case CORE2_45:
-
                     eventHash = core2_arch_events;
                     perfmon_numArchEvents = perfmon_numArchEventsCore2;
-
-                    group_map = core2_group_map;
-                    group_help = core2_group_help;
-                    perfmon_numGroups = perfmon_numGroupsCore2;
-
                     counter_map = core2_counter_map;
                     perfmon_numCounters = perfmon_numCountersCore2;
-
-                    initThreadArch = perfmon_init_core2;
-                    printDerivedMetrics = perfmon_printDerivedMetricsCore2;
-                    perfmon_getDerivedCounterValuesArch = perfmon_getDerivedCounterValuesCore2;
-
-                    logDerivedMetrics = perfmon_logDerivedMetricsCore2;
-                    perfmon_startCountersThread = perfmon_startCountersThread_core2;
-                    perfmon_stopCountersThread = perfmon_stopCountersThread_core2;
-                    perfmon_readCountersThread = perfmon_readCountersThread_core2;
-                    perfmon_setupCounterThread = perfmon_setupCounterThread_core2;
+                    box_map = core2_box_map;
                     break;
 
                 case NEHALEM_EX:
-
                     eventHash = nehalemEX_arch_events;
                     perfmon_numArchEvents = perfmon_numArchEventsNehalemEX;
-
-                    group_map = nehalemEX_group_map;
-                    group_help = nehalemEX_group_help;
-                    perfmon_numGroups = perfmon_numGroupsNehalemEX;
-
-                    counter_map = westmereEX_counter_map;
-                    perfmon_numCounters = perfmon_numCountersWestmereEX;
-
-                    initThreadArch = perfmon_init_nehalemEX;
-                    printDerivedMetrics = perfmon_printDerivedMetricsNehalemEX;
-                    perfmon_getDerivedCounterValuesArch = perfmon_getDerivedCounterValuesNehalemEX;
-                    logDerivedMetrics = perfmon_logDerivedMetricsNehalemEX;
-                    perfmon_startCountersThread = perfmon_startCountersThread_nehalemEX;
-                    perfmon_stopCountersThread = perfmon_stopCountersThread_nehalemEX;
-                    perfmon_readCountersThread = perfmon_readCountersThread_nehalemEX;
-                    perfmon_setupCounterThread = perfmon_setupCounterThread_nehalemEX;
+                    counter_map = nehalemEX_counter_map;
+                    perfmon_numCounters = perfmon_numCountersNehalemEX;
+                    box_map = nehalemEX_box_map;
                     break;
 
                 case WESTMERE_EX:
-
                     eventHash = westmereEX_arch_events;
                     perfmon_numArchEvents = perfmon_numArchEventsWestmereEX;
-
-                    group_map = westmereEX_group_map;
-                    group_help = westmereEX_group_help;
-                    perfmon_numGroups = perfmon_numGroupsWestmereEX;
-
                     counter_map = westmereEX_counter_map;
                     perfmon_numCounters = perfmon_numCountersWestmereEX;
-
-                    initThreadArch = perfmon_init_westmereEX;
-                    printDerivedMetrics = perfmon_printDerivedMetricsWestmereEX;
-                    perfmon_getDerivedCounterValuesArch = perfmon_getDerivedCounterValuesWestmereEX;
-                    logDerivedMetrics = perfmon_logDerivedMetricsWestmereEX;
-                    perfmon_startCountersThread = perfmon_startCountersThread_westmereEX;
-                    perfmon_stopCountersThread = perfmon_stopCountersThread_westmereEX;
-                    perfmon_readCountersThread = perfmon_readCountersThread_westmereEX;
-                    perfmon_setupCounterThread = perfmon_setupCounterThread_westmereEX;
+                    box_map = westmereEX_box_map;
                     break;
 
                 case NEHALEM_BLOOMFIELD:
-
                 case NEHALEM_LYNNFIELD:
-
-                    thermal_init(0);
-
+                case NEHALEM_LYNNFIELD_M:
                     eventHash = nehalem_arch_events;
                     perfmon_numArchEvents = perfmon_numArchEventsNehalem;
-
-                    group_map = nehalem_group_map;
-                    group_help = nehalem_group_help;
-                    perfmon_numGroups = perfmon_numGroupsNehalem;
-
                     counter_map = nehalem_counter_map;
                     perfmon_numCounters = perfmon_numCountersNehalem;
-
-                    initThreadArch = perfmon_init_nehalem;
-                    printDerivedMetrics = perfmon_printDerivedMetricsNehalem;
-                    perfmon_getDerivedCounterValuesArch = perfmon_getDerivedCounterValuesNehalem;
-
-                    logDerivedMetrics = perfmon_logDerivedMetricsNehalem;
-                    perfmon_startCountersThread = perfmon_startCountersThread_nehalem;
-                    perfmon_stopCountersThread = perfmon_stopCountersThread_nehalem;
-                    perfmon_readCountersThread = perfmon_readCountersThread_nehalem;
-                    perfmon_setupCounterThread = perfmon_setupCounterThread_nehalem;
+                    box_map = nehalem_box_map;
                     break;
 
                 case NEHALEM_WESTMERE_M:
-
                 case NEHALEM_WESTMERE:
-
-                    thermal_init(0);
-
                     eventHash = westmere_arch_events;
                     perfmon_numArchEvents = perfmon_numArchEventsWestmere;
-
-                    group_map = westmere_group_map;
-                    group_help = westmere_group_help;
-                    perfmon_numGroups = perfmon_numGroupsWestmere;
-
                     counter_map = nehalem_counter_map;
                     perfmon_numCounters = perfmon_numCountersNehalem;
-
-                    initThreadArch = perfmon_init_nehalem;
-                    printDerivedMetrics = perfmon_printDerivedMetricsWestmere;
-                    perfmon_getDerivedCounterValuesArch = perfmon_getDerivedCounterValuesWestmere;
-
-                    logDerivedMetrics = perfmon_logDerivedMetricsWestmere;
-                    perfmon_startCountersThread = perfmon_startCountersThread_nehalem;
-                    perfmon_stopCountersThread = perfmon_stopCountersThread_nehalem;
-                    perfmon_readCountersThread = perfmon_readCountersThread_nehalem;
-                    perfmon_setupCounterThread = perfmon_setupCounterThread_nehalem;
+                    box_map = nehalem_box_map;
                     break;
 
+                case IVYBRIDGE_EP:
+                    pci_devices = ivybridgeEP_pci_devices;
+                    box_map = ivybridgeEP_box_map;
+                    eventHash = ivybridgeEP_arch_events;
+                    perfmon_numArchEvents = perfmon_numArchEventsIvybridgeEP;
+                    counter_map = ivybridgeEP_counter_map;
+                    perfmon_numCounters = perfmon_numCountersIvybridgeEP;
+                    perfmon_numCoreCounters = perfmon_numCoreCountersIvybridgeEP;
+                    break;
                 case IVYBRIDGE:
+                    eventHash = ivybridge_arch_events;
+                    box_map = ivybridge_box_map;
+                    perfmon_numArchEvents = perfmon_numArchEventsIvybridge;
+                    counter_map = ivybridge_counter_map;
+                    perfmon_numCounters = perfmon_numCountersIvybridge;
+                    perfmon_numCoreCounters = perfmon_numCoreCountersIvybridge;
+                    break;
 
-                case IVYBRIDGE_EP:
+                case HASWELL_EP:
+                    eventHash = haswellEP_arch_events;
+                    perfmon_numArchEvents = perfmon_numArchEventsHaswellEP;
+                    counter_map = haswellEP_counter_map;
+                    perfmon_numCounters = perfmon_numCountersHaswellEP;
+                    perfmon_numCoreCounters = perfmon_numCoreCountersHaswellEP;
+                    box_map = haswellEP_box_map;
+                    pci_devices = haswellEP_pci_devices;
+                    break;
+                case HASWELL:
+                case HASWELL_M1:
+                case HASWELL_M2:
+                    eventHash = haswell_arch_events;
+                    perfmon_numArchEvents = perfmon_numArchEventsHaswell;
+                    counter_map = haswell_counter_map;
+                    perfmon_numCounters = perfmon_numCountersHaswell;
+                    perfmon_numCoreCounters = perfmon_numCoreCountersHaswell;
+                    box_map = haswell_box_map;
+                    break;
 
-                    power_init(0); /* FIXME Static coreId is dangerous */
-                    thermal_init(0);
-                    pci_init(socket_fd);
+                case SANDYBRIDGE_EP:
+                    pci_devices = sandybridgeEP_pci_devices;
+                    box_map = sandybridgeEP_box_map;
+                    eventHash = sandybridgeEP_arch_events;
+                    perfmon_numArchEvents = perfmon_numArchEventsSandybridgeEP;
+                    counter_map = sandybridgeEP_counter_map;
+                    perfmon_numCounters = perfmon_numCountersSandybridgeEP;
+                    perfmon_numCoreCounters = perfmon_numCoreCountersSandybridgeEP;
+                    break;
+                case SANDYBRIDGE:
+                    box_map = sandybridge_box_map;
+                    eventHash = sandybridge_arch_events;
+                    perfmon_numArchEvents = perfmon_numArchEventsSandybridge;
+                    counter_map = sandybridge_counter_map;
+                    perfmon_numCounters = perfmon_numCountersSandybridge;
+                    perfmon_numCoreCounters = perfmon_numCoreCountersSandybridge;
+                    break;
 
-                    eventHash = ivybridge_arch_events;
-                    perfmon_numArchEvents = perfmon_numArchEventsIvybridge;
+                case BROADWELL:
+                case BROADWELL_E:
+                case BROADWELL_D:
+                    box_map = broadwell_box_map;
+                    eventHash = broadwell_arch_events;
+                    counter_map = broadwell_counter_map;
+                    perfmon_numArchEvents = perfmon_numArchEventsBroadwell;
+                    perfmon_numCounters = perfmon_numCountersBroadwell;
+                    perfmon_numCoreCounters = perfmon_numCoreCountersBroadwell;
+                    break;
 
-                    group_map = ivybridge_group_map;
-                    group_help = ivybridge_group_help;
-                    perfmon_numGroups = perfmon_numGroupsIvybridge;
+                default:
+                    ERROR_PLAIN_PRINT(Unsupported Processor);
+                    break;
+            }
+            break;
 
-                    counter_map = ivybridge_counter_map;
-                    perfmon_numCounters = perfmon_numCountersIvybridge;
+        case MIC_FAMILY:
 
-                    initThreadArch = perfmon_init_ivybridge;
-                    printDerivedMetrics = perfmon_printDerivedMetricsIvybridge;
-                    perfmon_getDerivedCounterValuesArch = perfmon_getDerivedCounterValuesIvybridge;
+            switch ( cpuid_info.model )
+            {
+                case XEON_PHI:
+                    eventHash = phi_arch_events;
+                    perfmon_numArchEvents = perfmon_numArchEventsPhi;
+                    counter_map = phi_counter_map;
+                    box_map = phi_box_map;
+                    perfmon_numCounters = perfmon_numCountersPhi;
+                    break;
 
-                    logDerivedMetrics = perfmon_logDerivedMetricsIvybridge;
-                    perfmon_startCountersThread = perfmon_startCountersThread_ivybridge;
-                    perfmon_stopCountersThread = perfmon_stopCountersThread_ivybridge;
-                    perfmon_readCountersThread = perfmon_readCountersThread_ivybridge;
-                    perfmon_setupCounterThread = perfmon_setupCounterThread_ivybridge;
+                default:
+                    ERROR_PLAIN_PRINT(Unsupported Processor);
                     break;
+            }
+            break;
 
-                case HASWELL:
+        case K8_FAMILY:
+            eventHash = k8_arch_events;
+            perfmon_numArchEvents = perfmon_numArchEventsK8;
+            counter_map = k10_counter_map;
+            box_map = k10_box_map;
+            perfmon_numCounters = perfmon_numCountersK10;
+            break;
 
-                case HASWELL_EX:
+        case K10_FAMILY:
+            eventHash = k10_arch_events;
+            perfmon_numArchEvents = perfmon_numArchEventsK10;
+            counter_map = k10_counter_map;
+            box_map = k10_box_map;
+            perfmon_numCounters = perfmon_numCountersK10;
+            break;
 
-                case HASWELL_M1:
+        case K15_FAMILY:
+            eventHash = interlagos_arch_events;
+            perfmon_numArchEvents = perfmon_numArchEventsInterlagos;
+            counter_map = interlagos_counter_map;
+            box_map = interlagos_box_map;
+            perfmon_numCounters = perfmon_numCountersInterlagos;
+            break;
 
-                case HASWELL_M2:
+        case K16_FAMILY:
+            eventHash = kabini_arch_events;
+            perfmon_numArchEvents = perfmon_numArchEventsKabini;
+            counter_map = kabini_counter_map;
+            box_map = kabini_box_map;
+            perfmon_numCounters = perfmon_numCountersKabini;
+           break;
 
-                    power_init(0); /* FIXME Static coreId is dangerous */
-                    thermal_init(0);
+        default:
+            ERROR_PLAIN_PRINT(Unsupported Processor);
+            break;
+    }
+    return;
+}
 
-                    eventHash = haswell_arch_events;
-                    perfmon_numArchEvents = perfmon_numArchEventsHaswell;
+void
+perfmon_init_funcs(int* init_power, int* init_temp)
+{
+    int initialize_power = FALSE;
+    int initialize_thermal = FALSE;
+    switch ( cpuid_info.family )
+    {
+        case P6_FAMILY:
 
-                    group_map = haswell_group_map;
-                    group_help = haswell_group_help;
-                    perfmon_numGroups = perfmon_numGroupsHaswell;
+            switch ( cpuid_info.model )
+            {
+                case PENTIUM_M_BANIAS:
+                case PENTIUM_M_DOTHAN:
+                    initThreadArch = perfmon_init_pm;
+                    perfmon_startCountersThread = perfmon_startCountersThread_pm;
+                    perfmon_stopCountersThread = perfmon_stopCountersThread_pm;
+                    perfmon_setupCountersThread = perfmon_setupCounterThread_pm;
+                    perfmon_readCountersThread = perfmon_readCountersThread_pm;
+                    perfmon_finalizeCountersThread = perfmon_finalizeCountersThread_pm;
+                    break;
 
-                    counter_map = haswell_counter_map;
-                    perfmon_numCounters = perfmon_numCountersHaswell;
+                case ATOM_45:
+                case ATOM_32:
+                case ATOM_22:
+                case ATOM:
+                    initThreadArch = perfmon_init_core2;
+                    perfmon_startCountersThread = perfmon_startCountersThread_core2;
+                    perfmon_stopCountersThread = perfmon_stopCountersThread_core2;
+                    perfmon_setupCountersThread = perfmon_setupCounterThread_core2;
+                    perfmon_readCountersThread = perfmon_readCountersThread_core2;
+                    perfmon_finalizeCountersThread = perfmon_finalizeCountersThread_core2;
+                    break;
 
-                    initThreadArch = perfmon_init_haswell;
-                    printDerivedMetrics = perfmon_printDerivedMetricsHaswell;
-                    perfmon_getDerivedCounterValuesArch = perfmon_getDerivedCounterValuesHaswell;
-                    logDerivedMetrics = perfmon_logDerivedMetricsHaswell;
-                    perfmon_startCountersThread = perfmon_startCountersThread_haswell;
-                    perfmon_stopCountersThread = perfmon_stopCountersThread_haswell;
-                    perfmon_readCountersThread = perfmon_readCountersThread_haswell;
-                    perfmon_setupCounterThread = perfmon_setupCounterThread_haswell;
+                case ATOM_SILVERMONT_E:
+                case ATOM_SILVERMONT_C:
+                case ATOM_SILVERMONT_Z1:
+                case ATOM_SILVERMONT_Z2:
+                case ATOM_SILVERMONT_F:
+                case ATOM_SILVERMONT_AIR:
+                    initialize_power = TRUE;
+                    initialize_thermal = TRUE;
+                    initThreadArch = perfmon_init_silvermont;
+                    perfmon_startCountersThread = perfmon_startCountersThread_silvermont;
+                    perfmon_stopCountersThread = perfmon_stopCountersThread_silvermont;
+                    perfmon_setupCountersThread = perfmon_setupCountersThread_silvermont;
+                    perfmon_readCountersThread = perfmon_readCountersThread_silvermont;
+                    perfmon_finalizeCountersThread = perfmon_finalizeCountersThread_silvermont;
                     break;
 
-                case SANDYBRIDGE:
 
-                case SANDYBRIDGE_EP:
+                case CORE_DUO:
+                    ERROR_PLAIN_PRINT(Unsupported Processor);
+                    break;
 
-                    power_init(0); /* FIXME Static coreId is dangerous */
-                    thermal_init(0);
-                    pci_init(socket_fd);
+                case XEON_MP:
+                case CORE2_65:
+                case CORE2_45:
+                    initThreadArch = perfmon_init_core2;
+                    perfmon_startCountersThread = perfmon_startCountersThread_core2;
+                    perfmon_stopCountersThread = perfmon_stopCountersThread_core2;
+                    perfmon_readCountersThread = perfmon_readCountersThread_core2;
+                    perfmon_setupCountersThread = perfmon_setupCounterThread_core2;
+                    perfmon_finalizeCountersThread = perfmon_finalizeCountersThread_core2;
+                    break;
 
-                    eventHash = sandybridge_arch_events;
-                    perfmon_numArchEvents = perfmon_numArchEventsSandybridge;
+                case NEHALEM_EX:
+                    initThreadArch = perfmon_init_nehalemEX;
+                    perfmon_startCountersThread = perfmon_startCountersThread_nehalemEX;
+                    perfmon_stopCountersThread = perfmon_stopCountersThread_nehalemEX;
+                    perfmon_readCountersThread = perfmon_readCountersThread_nehalemEX;
+                    perfmon_setupCountersThread = perfmon_setupCounterThread_nehalemEX;
+                    perfmon_finalizeCountersThread = perfmon_finalizeCountersThread_nehalemEX;
+                    break;
 
-                    group_map = sandybridge_group_map;
-                    group_help = sandybridge_group_help;
-                    perfmon_numGroups = perfmon_numGroupsSandybridge;
+                case WESTMERE_EX:
+                    initThreadArch = perfmon_init_westmereEX;
+                    perfmon_startCountersThread = perfmon_startCountersThread_westmereEX;
+                    perfmon_stopCountersThread = perfmon_stopCountersThread_westmereEX;
+                    perfmon_readCountersThread = perfmon_readCountersThread_westmereEX;
+                    perfmon_setupCountersThread = perfmon_setupCounterThread_westmereEX;
+                    perfmon_finalizeCountersThread = perfmon_finalizeCountersThread_westmereEX;
+                    break;
 
-                    counter_map = sandybridge_counter_map;
-                    perfmon_numCounters = perfmon_numCountersSandybridge;
+                case NEHALEM_BLOOMFIELD:
+                case NEHALEM_LYNNFIELD:
+                    initialize_thermal = TRUE;
+                    initThreadArch = perfmon_init_nehalem;
+                    perfmon_startCountersThread = perfmon_startCountersThread_nehalem;
+                    perfmon_stopCountersThread = perfmon_stopCountersThread_nehalem;
+                    perfmon_readCountersThread = perfmon_readCountersThread_nehalem;
+                    perfmon_setupCountersThread = perfmon_setupCounterThread_nehalem;
+                    perfmon_finalizeCountersThread = perfmon_finalizeCountersThread_nehalem;
+                    break;
+
+                case NEHALEM_WESTMERE_M:
+                case NEHALEM_WESTMERE:
+                    initialize_thermal = TRUE;
+                    initThreadArch = perfmon_init_nehalem;
+                    perfmon_startCountersThread = perfmon_startCountersThread_nehalem;
+                    perfmon_stopCountersThread = perfmon_stopCountersThread_nehalem;
+                    perfmon_readCountersThread = perfmon_readCountersThread_nehalem;
+                    perfmon_setupCountersThread = perfmon_setupCounterThread_nehalem;
+                    perfmon_finalizeCountersThread = perfmon_finalizeCountersThread_nehalem;
+                    break;
+
+                case IVYBRIDGE_EP:
+                case IVYBRIDGE:
+                    initialize_power = TRUE;
+                    initialize_thermal = TRUE;
+                    initThreadArch = perfmon_init_ivybridge;
+                    perfmon_startCountersThread = perfmon_startCountersThread_ivybridge;
+                    perfmon_stopCountersThread = perfmon_stopCountersThread_ivybridge;
+                    perfmon_readCountersThread = perfmon_readCountersThread_ivybridge;
+                    perfmon_setupCountersThread = perfmon_setupCounterThread_ivybridge;
+                    perfmon_finalizeCountersThread = perfmon_finalizeCountersThread_ivybridge;
+                    break;
+
+                case HASWELL_EP:
+                case HASWELL:
+                case HASWELL_M1:
+                case HASWELL_M2:
+                    initialize_power = TRUE;
+                    initialize_thermal = TRUE;
+                    initThreadArch = perfmon_init_haswell;
+                    perfmon_startCountersThread = perfmon_startCountersThread_haswell;
+                    perfmon_stopCountersThread = perfmon_stopCountersThread_haswell;
+                    perfmon_readCountersThread = perfmon_readCountersThread_haswell;
+                    perfmon_setupCountersThread = perfmon_setupCounterThread_haswell;
+                    perfmon_finalizeCountersThread = perfmon_finalizeCountersThread_haswell;
+                    break;
 
+                case SANDYBRIDGE_EP:
+                case SANDYBRIDGE:
+                    initialize_power = TRUE;
+                    initialize_thermal = TRUE;
                     initThreadArch = perfmon_init_sandybridge;
-                    printDerivedMetrics = perfmon_printDerivedMetricsSandybridge;
-                    perfmon_getDerivedCounterValuesArch = perfmon_getDerivedCounterValuesSandybridge;
-                    logDerivedMetrics = perfmon_logDerivedMetricsSandybridge;
                     perfmon_startCountersThread = perfmon_startCountersThread_sandybridge;
                     perfmon_stopCountersThread = perfmon_stopCountersThread_sandybridge;
                     perfmon_readCountersThread = perfmon_readCountersThread_sandybridge;
-                    perfmon_setupCounterThread = perfmon_setupCounterThread_sandybridge;
+                    perfmon_setupCountersThread = perfmon_setupCounterThread_sandybridge;
+                    perfmon_finalizeCountersThread = perfmon_finalizeCountersThread_sandybridge;
+                    break;
+
+                case BROADWELL:
+                case BROADWELL_E:
+                case BROADWELL_D:
+                    initialize_power = TRUE;
+                    initialize_thermal = TRUE;
+                    initThreadArch = perfmon_init_broadwell;
+                    perfmon_startCountersThread = perfmon_startCountersThread_broadwell;
+                    perfmon_stopCountersThread = perfmon_stopCountersThread_broadwell;
+                    perfmon_readCountersThread = perfmon_readCountersThread_broadwell;
+                    perfmon_setupCountersThread = perfmon_setupCounterThread_broadwell;
+                    perfmon_finalizeCountersThread = perfmon_finalizeCountersThread_broadwell;
                     break;
 
                 default:
@@ -1562,25 +943,12 @@ perfmon_init(int numThreads_local, int threads[], FILE* outstream)
             switch ( cpuid_info.model )
             {
                 case XEON_PHI:
-
-                    eventHash = phi_arch_events;
-                    perfmon_numArchEvents = perfmon_numArchEventsPhi;
-
-                    group_map = phi_group_map;
-                    group_help = phi_group_help;
-                    perfmon_numGroups = perfmon_numGroupsPhi;
-
-                    counter_map = phi_counter_map;
-                    perfmon_numCounters = perfmon_numCountersPhi;
-
                     initThreadArch = perfmon_init_phi;
-                    printDerivedMetrics = perfmon_printDerivedMetricsPhi;
-                    perfmon_getDerivedCounterValuesArch = perfmon_getDerivedCounterValuesPhi;
-                    logDerivedMetrics = perfmon_logDerivedMetricsPhi;
                     perfmon_startCountersThread = perfmon_startCountersThread_phi;
                     perfmon_stopCountersThread = perfmon_stopCountersThread_phi;
                     perfmon_readCountersThread = perfmon_readCountersThread_phi;
-                    perfmon_setupCounterThread = perfmon_setupCounterThread_phi;
+                    perfmon_setupCountersThread = perfmon_setupCounterThread_phi;
+                    perfmon_finalizeCountersThread = perfmon_finalizeCountersThread_phi;
                     break;
 
                 default:
@@ -1590,115 +958,647 @@ perfmon_init(int numThreads_local, int threads[], FILE* outstream)
             break;
 
         case K8_FAMILY:
-            eventHash = k8_arch_events;
-            perfmon_numArchEvents = perfmon_numArchEventsK8;
-
-            group_map = k8_group_map;
-            group_help = k8_group_help;
-            perfmon_numGroups = perfmon_numGroupsK8;
-
-            counter_map = k10_counter_map;
-            perfmon_numCounters = perfmon_numCountersK10;
-
             initThreadArch = perfmon_init_k10;
-            printDerivedMetrics = perfmon_printDerivedMetricsK8;
-            perfmon_getDerivedCounterValuesArch = perfmon_getDerivedCounterValuesK8;
-            logDerivedMetrics = perfmon_logDerivedMetricsK8;
             perfmon_startCountersThread = perfmon_startCountersThread_k10;
             perfmon_stopCountersThread = perfmon_stopCountersThread_k10;
             perfmon_readCountersThread = perfmon_readCountersThread_k10;
-            perfmon_setupCounterThread = perfmon_setupCounterThread_k10;
+            perfmon_setupCountersThread = perfmon_setupCounterThread_k10;
+            perfmon_finalizeCountersThread = perfmon_finalizeCountersThread_k10;
             break;
 
         case K10_FAMILY:
-            eventHash = k10_arch_events;
-            perfmon_numArchEvents = perfmon_numArchEventsK10;
-
-            group_map = k10_group_map;
-            group_help = k10_group_help;
-            perfmon_numGroups = perfmon_numGroupsK10;
-
-            counter_map = k10_counter_map;
-            perfmon_numCounters = perfmon_numCountersK10;
-
             initThreadArch = perfmon_init_k10;
-            printDerivedMetrics = perfmon_printDerivedMetricsK10;
-            perfmon_getDerivedCounterValuesArch = perfmon_getDerivedCounterValuesK10;
-            logDerivedMetrics = perfmon_logDerivedMetricsK10;
             perfmon_startCountersThread = perfmon_startCountersThread_k10;
             perfmon_stopCountersThread = perfmon_stopCountersThread_k10;
             perfmon_readCountersThread = perfmon_readCountersThread_k10;
-            perfmon_setupCounterThread = perfmon_setupCounterThread_k10;
+            perfmon_setupCountersThread = perfmon_setupCounterThread_k10;
+            perfmon_finalizeCountersThread = perfmon_finalizeCountersThread_k10;
             break;
 
         case K15_FAMILY:
-            eventHash = interlagos_arch_events;
-            perfmon_numArchEvents = perfmon_numArchEventsInterlagos;
-
-            group_map = interlagos_group_map;
-            group_help = interlagos_group_help;
-            perfmon_numGroups = perfmon_numGroupsInterlagos;
-
-            counter_map = interlagos_counter_map;
-            perfmon_numCounters = perfmon_numCountersInterlagos;
-
             initThreadArch = perfmon_init_interlagos;
-            printDerivedMetrics = perfmon_printDerivedMetricsInterlagos;
-            perfmon_getDerivedCounterValuesArch = perfmon_getDerivedCounterValuesInterlagos;
-            logDerivedMetrics = perfmon_logDerivedMetricsInterlagos;
             perfmon_startCountersThread = perfmon_startCountersThread_interlagos;
             perfmon_stopCountersThread = perfmon_stopCountersThread_interlagos;
             perfmon_readCountersThread = perfmon_readCountersThread_interlagos;
-            perfmon_setupCounterThread = perfmon_setupCounterThread_interlagos;
+            perfmon_setupCountersThread = perfmon_setupCounterThread_interlagos;
+            perfmon_finalizeCountersThread = perfmon_finalizeCountersThread_interlagos;
             break;
 
         case K16_FAMILY:
-            eventHash = kabini_arch_events;
-            perfmon_numArchEvents = perfmon_numArchEventsKabini;
-
-            group_map = kabini_group_map;
-            group_help = kabini_group_help;
-            perfmon_numGroups = perfmon_numGroupsKabini;
-
-            counter_map = kabini_counter_map;
-            perfmon_numCounters = perfmon_numCountersKabini;
-
             initThreadArch = perfmon_init_kabini;
-            printDerivedMetrics = perfmon_printDerivedMetricsKabini;
-            perfmon_getDerivedCounterValuesArch = perfmon_getDerivedCounterValuesKabini;
-            logDerivedMetrics = perfmon_logDerivedMetricsKabini;
             perfmon_startCountersThread = perfmon_startCountersThread_kabini;
             perfmon_stopCountersThread = perfmon_stopCountersThread_kabini;
             perfmon_readCountersThread = perfmon_readCountersThread_kabini;
-            perfmon_setupCounterThread = perfmon_setupCounterThread_kabini;
+            perfmon_setupCountersThread = perfmon_setupCounterThread_kabini;
+            perfmon_finalizeCountersThread = perfmon_finalizeCountersThread_kabini;
            break;
 
         default:
             ERROR_PLAIN_PRINT(Unsupported Processor);
             break;
     }
+    *init_power = initialize_power;
+    *init_temp = initialize_thermal;
+}
+
+
+/* Allocate the global groupSet for nrThreads threads running on threadsToCpu[],
+ * select the architecture maps and functions and initialize every thread.
+ * Returns 0, -EINVAL (bad thread count or locked registers), -EEXIST (already
+ * initialized), -ENOMEM, or the HPMaddThread() error. groupSet is NULL again
+ * after any failure so that a later call can retry. */
+int
+perfmon_init(int nrThreads, int threadsToCpu[])
+{
+    int i;
+    int ret;
+    int initialize_power = FALSE;
+    int initialize_thermal = FALSE;
+
+    if (nrThreads <= 0)
+    {
+        ERROR_PRINT(Number of threads must be greater than 0 but only %d given,nrThreads);
+        return -EINVAL;
+    }
+    if (!lock_check())
+    {
+        ERROR_PLAIN_PRINT(Access to performance monitoring registers locked);
+        return -EINVAL;
+    }
+    /* TODO: Decide whether a second call should resize the thread set; for now refuse */
+    if (groupSet != NULL)
+    {
+        return -EEXIST;
+    }
+    groupSet = (PerfmonGroupSet*) malloc(sizeof(PerfmonGroupSet));
+    if (groupSet == NULL)
+    {
+        ERROR_PLAIN_PRINT(Cannot allocate group descriptor);
+        return -ENOMEM;
+    }
+    groupSet->threads = (PerfmonThread*) malloc(nrThreads * sizeof(PerfmonThread));
+    if (groupSet->threads == NULL)
+    {
+        ERROR_PLAIN_PRINT(Cannot allocate set of threads);
+        free(groupSet);
+        groupSet = NULL; /* do not leave a dangling global behind */
+        return -ENOMEM;
+    }
+    groupSet->numberOfThreads = nrThreads;
+    groupSet->numberOfGroups = 0;
+    groupSet->numberOfActiveGroups = 0;
+    groupSet->groups = NULL;   /* malloc() does not zero the descriptor */
+    groupSet->activeGroup = 0;
+
+    for(i=0; i<MAX_NUM_NODES; i++) socket_lock[i] = LOCK_INIT;
+    for(i=0; i<MAX_NUM_THREADS; i++) tile_lock[i] = LOCK_INIT;
+    /* Initialize maps pointer to current architecture maps */
+    perfmon_init_maps();
+
+    /* Initialize access interface */
+    ret = HPMaddThread(threadsToCpu[0]);
+    if (ret)
+    {
+        ERROR_PLAIN_PRINT(Cannot get access to performance counters);
+        free(groupSet->threads);
+        free(groupSet);
+        groupSet = NULL; /* otherwise every retry fails with -EEXIST */
+        return ret;
+    }
+    timer_init();
+
+    /* Initialize function pointer to current architecture functions */
+    perfmon_init_funcs(&initialize_power, &initialize_thermal);
+
+    /* Store thread information and reset counters for processor*/
+    /* If the arch supports it, initialize power and thermal measurements */
+    for(i=0;i<nrThreads;i++)
+    {
+        groupSet->threads[i].thread_id = i;
+        groupSet->threads[i].processorId = threadsToCpu[i];
+        if (initialize_power == TRUE)
+        {
+            power_init(threadsToCpu[i]);
+        }
+        if (initialize_thermal == TRUE)
+        {
+            thermal_init(threadsToCpu[i]);
+        }
+        initThreadArch(threadsToCpu[i]);
+    }
+    return 0;
+}
+
+/* Finalize all counters, free every group, event and thread record and reset groupSet to NULL. */
+void perfmon_finalize(void)
+{
+    int group, event, thread;
+    if (groupSet == NULL) return; /* not initialized or already finalized */
+    for(group=0;group < groupSet->numberOfGroups; group++)
+    {
+        for (thread=0;thread< groupSet->numberOfThreads; thread++)
+        {
+            perfmon_finalizeCountersThread(thread, &(groupSet->groups[group]));
+        }
+        for (event=0;event < groupSet->groups[group].numberOfEvents; event++)
+        {
+            free(groupSet->groups[group].events[event].threadCounter);
+        }
+        free(groupSet->groups[group].events);
+    }
+    if (groupSet->numberOfGroups > 0) free(groupSet->groups); /* array exists only once a group was added */
+    free(groupSet->threads);
+    free(groupSet);
+    groupSet = NULL; /* lets perfmon_init() run again instead of returning -EEXIST */
+    HPMfinalize();
+    power_finalize();
+}
+
+/* Parse a comma separated list of EVENT:COUNTER[:OPTIONS] descriptors (or the
+ * environment variable LIKWID_EVENTS if eventCString is NULL) into a new event
+ * group. Returns the id of the new group, -EINVAL if the string is missing,
+ * contains '-' or '.', or yields no usable event, and -ENOMEM on allocation
+ * failure. Descriptors that cannot be split are skipped; descriptors with an
+ * unknown counter/event are kept with type NOTYPE. */
+int
+perfmon_addEventSet(char* eventCString)
+{
+    int i, j;
+    bstring eventBString;
+    struct bstrList* eventtokens;
+    struct bstrList* subtokens;
+    PerfmonEventSet* eventSet;
+    PerfmonEventSet* grown;
+    PerfmonEventSetEntry* event;
+
+    if (eventCString == NULL)
+    {
+        DEBUG_PLAIN_PRINT(1, Event string is empty. Trying environment variable LIKWID_EVENTS);
+        eventCString = getenv("LIKWID_EVENTS");
+        if (eventCString == NULL)
+        {
+            ERROR_PLAIN_PRINT(Cannot read event string. Also event string from environment variable is empty);
+            return -EINVAL;
+        }
+    }
+
+    if (strchr(eventCString, '-') != NULL)
+    {
+        ERROR_PLAIN_PRINT(Event string contains invalid character -);
+        return -EINVAL;
+    }
+    if (strchr(eventCString, '.') != NULL)
+    {
+        ERROR_PLAIN_PRINT(Event string contains invalid character .);
+        return -EINVAL;
+    }
+    if (groupSet->numberOfGroups == 0)
+    {
+        groupSet->groups = (PerfmonEventSet*) malloc(sizeof(PerfmonEventSet));
+        if (groupSet->groups == NULL)
+        {
+            ERROR_PLAIN_PRINT(Cannot allocate initialize of event group list);
+            return -ENOMEM;
+        }
+
+        groupSet->numberOfGroups = 1;
+        groupSet->numberOfActiveGroups = 0;
+
+        /* Only one group exists by now */
+        groupSet->groups[0].rdtscTime = 0;
+        groupSet->groups[0].runTime = 0;
+        groupSet->groups[0].numberOfEvents = 0;
+    }
+
+    if (groupSet->numberOfActiveGroups == groupSet->numberOfGroups)
+    {
+        /* Keep the old array and count intact if realloc() fails */
+        grown = (PerfmonEventSet*)realloc(groupSet->groups, (groupSet->numberOfGroups+1)*sizeof(PerfmonEventSet));
+        if (grown == NULL)
+        {
+            ERROR_PLAIN_PRINT(Cannot allocate additional group);
+            return -ENOMEM;
+        }
+        groupSet->groups = grown;
+        groupSet->numberOfGroups++;
+        groupSet->groups[groupSet->numberOfActiveGroups].rdtscTime = 0;
+        groupSet->groups[groupSet->numberOfActiveGroups].runTime = 0;
+        groupSet->groups[groupSet->numberOfActiveGroups].numberOfEvents = 0;
+        DEBUG_PLAIN_PRINT(DEBUGLEV_INFO, Allocating new group structure for group.);
+    }
+    DEBUG_PRINT(DEBUGLEV_INFO, Currently %d groups of %d active,
+                    groupSet->numberOfActiveGroups+1,
+                    groupSet->numberOfGroups+1);
+
+    eventSet = &(groupSet->groups[groupSet->numberOfActiveGroups]);
+
+    /* bsplit() allocates the list itself; a preceding bstrListCreate() only leaks */
+    eventBString = bfromcstr(eventCString);
+    eventtokens = bsplit(eventBString,',');
+    bdestroy(eventBString);
+    if (eventtokens == NULL)
+    {
+        ERROR_PLAIN_PRINT(Cannot split event string);
+        return -ENOMEM;
+    }
+    eventSet->events = (PerfmonEventSetEntry*) malloc(eventtokens->qty * sizeof(PerfmonEventSetEntry));
+    if (eventSet->events == NULL)
+    {
+        ERROR_PRINT(Cannot allocate event list for group %d\n, groupSet->numberOfActiveGroups);
+        bstrListDestroy(eventtokens);
+        return -ENOMEM;
+    }
+    eventSet->numberOfEvents = 0;
+    eventSet->regTypeMask = 0x0ULL;
+
+    for(i=0;i<eventtokens->qty;i++)
+    {
+        /* Always fill the next free slot so skipped descriptors leave no holes */
+        event = &(eventSet->events[eventSet->numberOfEvents]);
+        subtokens = bsplit(eventtokens->entry[i],':');
+        if ((subtokens == NULL) || (subtokens->qty < 2))
+        {
+            fprintf(stderr,"Cannot parse event descriptor %s\n", bdata(eventtokens->entry[i]));
+            bstrListDestroy(subtokens);
+            continue;
+        }
+        if (!getIndexAndType(subtokens->entry[1], &event->index, &event->type))
+        {
+            fprintf(stderr,"Counter register %s not supported or PCI device not available\n",bdata(
+                    subtokens->entry[1]));
+            event->type = NOTYPE;
+        }
+        else if (!getEvent(subtokens->entry[0], subtokens->entry[1], &event->event))
+        {
+            fprintf(stderr,"Event %s not found for current architecture\n",
+                 bdata(subtokens->entry[0]));
+            event->type = NOTYPE;
+        }
+        else if (!checkCounter(subtokens->entry[1], event->event.limit))
+        {
+            fprintf(stderr,"Register %s not allowed for event %s\n",
+                 bdata(subtokens->entry[1]),bdata(subtokens->entry[0]));
+            event->type = NOTYPE;
+        }
+        else if (parseOptions(subtokens, &event->event, event->index) < 0)
+        {
+            fprintf(stderr,"Cannot parse options in %s\n", bdata(eventtokens->entry[i]));
+            event->type = NOTYPE;
+        }
+        else
+        {
+            eventSet->regTypeMask |= REG_TYPE_MASK(event->type);
+        }
+        bstrListDestroy(subtokens); /* one list per descriptor, release it here */
+
+        event->threadCounter = (PerfmonCounter*) malloc(
+            groupSet->numberOfThreads * sizeof(PerfmonCounter));
+        if (event->threadCounter == NULL)
+        {
+            ERROR_PRINT(Cannot allocate counter for all threads in group %d,groupSet->numberOfActiveGroups);
+            continue;
+        }
+        for(j=0;j<groupSet->numberOfThreads;j++)
+        {
+            event->threadCounter[j].counterData = 0;
+            event->threadCounter[j].startData = 0;
+            event->threadCounter[j].overflows = 0;
+            event->threadCounter[j].init = FALSE;
+        }
+        eventSet->numberOfEvents++;
+        if (event->type != NOTYPE)
+        {
+            DEBUG_PRINT(DEBUGLEV_INFO,
+                    Added event %s for counter %s to group %d,
+                    event->event.name,
+                    counter_map[event->index].key,
+                    groupSet->numberOfActiveGroups);
+        }
+    }
+    bstrListDestroy(eventtokens);
+    groupSet->numberOfActiveGroups++;
+    if ((eventSet->numberOfEvents > 0) && (eventSet->regTypeMask != 0x0ULL))
+    {
+        return groupSet->numberOfActiveGroups-1;
+    }
+    fprintf(stderr,"No event in given event string can be configured\n");
+    return -EINVAL;
+}
+
+/* Program the counters of group groupId for one thread and mark that group active. */
+int
+__perfmon_setupCountersThread(int thread_id, int groupId)
+{
+    if ((groupId < 0) || (groupId >= groupSet->numberOfActiveGroups))
+    {
+        ERROR_PRINT(Group %d does not exist in groupSet, groupId);
+        return -ENOENT;
+    }
+
+    CHECK_AND_RETURN_ERROR(perfmon_setupCountersThread(thread_id, &groupSet->groups[groupId]),
+            Setup of counters failed);
+
+    groupSet->activeGroup = groupId;
+    return 0;
+}
+
+/* Set up the counters of group groupId on every registered thread.
+ * Stops at the first thread that fails and returns its error, otherwise 0. */
+int
+perfmon_setupCounters(int groupId)
+{
+    int status = 0;
+    int idx = 0;
+
+    while ((status == 0) && (idx < groupSet->numberOfThreads))
+    {
+        status = __perfmon_setupCountersThread(groupSet->threads[idx].thread_id, groupId);
+        idx++;
+    }
+    return status;
+}
+
+/* Start group groupId on all threads, then its timer; an out-of-range id selects
+ * the active group. Returns 0 or -(thread_id+1) of the first failing thread. */
+int
+__perfmon_startCounters(int groupId)
+{
+    int t;
+    if (!((groupId >= 0) && (groupId < groupSet->numberOfActiveGroups)))
+    {
+        groupId = groupSet->activeGroup;
+    }
+    for (t = 0; t < groupSet->numberOfThreads; t++)
+    {
+        if (perfmon_startCountersThread(groupSet->threads[t].thread_id, &groupSet->groups[groupId]) != 0)
+        {
+            return -groupSet->threads[t].thread_id-1;
+        }
+    }
+    timer_start(&groupSet->groups[groupId].timer);
+    return 0;
+}
+
+int perfmon_startCounters(void) /* start the currently active group on all threads */
+{
+    return __perfmon_startCounters(-1); /* an id < 0 makes the callee use groupSet->activeGroup */
+}
+
+int perfmon_startGroupCounters(int groupId) /* start a specific group on all threads */
+{
+    return __perfmon_startCounters(groupId); /* an out-of-range id falls back to the active group */
+}
+
+/* Stop the timer and the counters of group groupId on all threads and add the
+ * measured time to the group's runTime. An out-of-range groupId selects the
+ * active group. Returns 0, or -(thread_id+1) of the first thread that failed
+ * (the time is not accumulated in that case). */
+int
+__perfmon_stopCounters(int groupId)
+{
+    int t;
+    PerfmonEventSet* grp;
+    if (!((groupId >= 0) && (groupId < groupSet->numberOfActiveGroups)))
+    {
+        groupId = groupSet->activeGroup;
+    }
+    grp = &groupSet->groups[groupId];
+    /* The timer is stopped before the per-thread stop routines run */
+    timer_stop(&grp->timer);
+    for (t = 0; t < groupSet->numberOfThreads; t++)
+    {
+        if (perfmon_stopCountersThread(groupSet->threads[t].thread_id, grp) != 0)
+        {
+            return -groupSet->threads[t].thread_id-1;
+        }
+    }
+    grp->rdtscTime = timer_print(&grp->timer);
+    grp->runTime += grp->rdtscTime;
+    return 0;
+}
+
+int perfmon_stopCounters(void) /* stop the currently active group on all threads */
+{
+    return __perfmon_stopCounters(-1); /* an id < 0 makes the callee use groupSet->activeGroup */
+}
+
+int perfmon_stopGroupCounters(int groupId) /* stop a specific group on all threads */
+{
+    return __perfmon_stopCounters(groupId); /* an out-of-range id falls back to the active group */
+}
+
+/* Read the counters of group groupId (out of range: the active group).
+ * threadId -1 reads all threads, a valid index only that thread, anything else
+ * nothing. Returns 0, or -(threadId+1) of the thread whose read failed. */
+int
+__perfmon_readCounters(int groupId, int threadId)
+{
+    int first, last;
+    if (!((groupId >= 0) && (groupId < groupSet->numberOfActiveGroups)))
+    {
+        groupId = groupSet->activeGroup;
+    }
+    if (threadId == -1)
+    {
+        first = 0;
+        last = groupSet->numberOfThreads;
+    }
+    else
+    {
+        /* Empty range unless threadId is a valid thread index */
+        first = threadId;
+        last = ((threadId >= 0) && (threadId < groupSet->numberOfThreads)) ? threadId + 1 : threadId;
+    }
+    for (threadId = first; threadId < last; threadId++)
+    {
+        if (perfmon_readCountersThread(threadId, &groupSet->groups[groupId]) != 0)
+        {
+            return -threadId-1;
+        }
+    }
+    return 0;
+}
+
+int perfmon_readCounters(void) /* read the active group on all threads */
+{
+    return __perfmon_readCounters(-1,-1); /* group -1: active group, thread -1: every thread */
+}
+
+/* Read the active group's counters for the thread registered on cpu_id.
+ * Returns -EINVAL if cpu_id belongs to no registered thread. */
+int perfmon_readCountersCpu(int cpu_id)
+{
+    int i;
+    for(i=0;i<groupSet->numberOfThreads;i++)
+    {
+        if (groupSet->threads[i].processorId == cpu_id)
+        {
+            return perfmon_readCountersThread(groupSet->threads[i].thread_id, &groupSet->groups[groupSet->activeGroup]);
+        }
+    }
+    return -EINVAL; /* previously fell through and silently read thread 0 */
+}
+
+int perfmon_readGroupCounters(int groupId) /* read one group on all threads */
+{
+    return __perfmon_readCounters(groupId,-1); /* thread -1 selects every thread */
+}
+int perfmon_readGroupThreadCounters(int groupId, int threadId) /* read one group on one thread */
+{
+    return __perfmon_readCounters(groupId,threadId); /* an invalid threadId other than -1 reads nothing and yields 0 */
+}
+
+
+/* Return the measured value of one event for one thread of a group.
+ * Out-of-range groupId selects the active group; bad event/thread ids return 0. */
+double
+perfmon_getResult(int groupId, int eventId, int threadId)
+{
+    double result = 0.0;
+    PerfmonEventSetEntry* event;
+    PerfmonCounter* counter;
+    if (unlikely(groupSet == NULL))
+    {
+        return 0;
+    }
+    if ((groupId < 0) || (groupId >= groupSet->numberOfActiveGroups))
+    {
+        groupId = groupSet->activeGroup;
+    }
+    if ((eventId < 0) || (eventId >= groupSet->groups[groupId].numberOfEvents))
+    {
+        printf("ERROR: EventID greater than defined events\n");
+        return 0;
+    }
+    if ((threadId < 0) || (threadId >= groupSet->numberOfThreads))
+    {
+        printf("ERROR: ThreadID greater than defined threads\n");
+        return 0;
+    }
+    event = &(groupSet->groups[groupId].events[eventId]);
+    counter = &(event->threadCounter[threadId]);
 
+    if (counter->overflows == 0)
+    {
+        result = (double) (counter->counterData - counter->startData);
+    }
+    else if (counter->overflows > 0)
+    {
+        /* First wrap plus one full range per further overflow; the stored count is left untouched */
+        result += (double) ((perfmon_getMaxCounterValue(counter_map[event->index].type) - counter->startData) + counter->counterData);
+        result += (double) ((counter->overflows - 1) * perfmon_getMaxCounterValue(counter_map[event->index].type));
+    }
 
-    for (int i=0; i<perfmon_numThreads; i++)
+    if (counter_map[event->index].type == POWER) /* scale raw energy counts by the unit of this counter */
+    {
+        result *= power_getEnergyUnit(getCounterTypeOffset(event->index));
+    }
+    else if (counter_map[event->index].type == THERMAL) /* thermal values are reported as read, not as a difference */
     {
-        initThread(i,threads[i]);
+        result = (double)counter->counterData;
     }
+    return result;
 }
 
-void
-perfmon_finalize()
+int __perfmon_switchActiveGroupThread(int thread_id, int new_group) /* switch from the active group to new_group; returns 0 or the first error */
 {
+    int ret;
     int i;
+    ret = perfmon_stopCounters(); /* stops the active group on ALL threads, not only thread_id */
+    if (ret != 0)
+    {
+        return ret;
+    }
+    for(i=0; i<groupSet->groups[groupSet->activeGroup].numberOfEvents;i++)
+    {
+        groupSet->groups[groupSet->activeGroup].events[i].threadCounter[thread_id].init = FALSE; /* only this thread's counters of the old group are marked uninitialized */
+    }
+    /*for(i=0; i<groupSet->groups[new_group].numberOfEvents;i++)
+    {
+        groupSet->groups[new_group].events[i].threadCounter[cpu_id].init = TRUE;
+    }*/
+    ret = perfmon_setupCounters(new_group); /* sets up new_group on all threads; on success it becomes the active group */
+    if (ret != 0)
+    {
+        return ret;
+    }
+    ret = perfmon_startCounters(); /* starts the (now active) group on all threads */
+    if (ret != 0)
+    {
+        return ret;
+    }
+    return 0;
+}
 
-    free(perfmon_threadData);
+int
+perfmon_switchActiveGroup(int new_group)
+{
+    int i=0;
+    int ret=0;
+    for(i=0;i<groupSet->numberOfThreads;i++)
+    {
+        ret = __perfmon_switchActiveGroupThread(groupSet->threads[i].thread_id, new_group);
+        if (ret != 0)
+        {
+            return ret;
+        }
+    }
+    return 0;
+}
+
+int
+perfmon_getNumberOfGroups(void) /* NOTE(review): dereferences groupSet without a NULL check */
+{
+    return groupSet->numberOfActiveGroups; /* incremented once per perfmon_addEventSet() that reaches its end */
+}
+
+int
+perfmon_getIdOfActiveGroup(void) /* NOTE(review): dereferences groupSet without a NULL check */
+{
+    return groupSet->activeGroup; /* assigned by __perfmon_setupCountersThread() after a successful setup */
+}
+
+int
+perfmon_getNumberOfThreads(void) /* NOTE(review): dereferences groupSet without a NULL check */
+{
+    return groupSet->numberOfThreads; /* the nrThreads value given to perfmon_init() */
+}
+
+/* Number of events in group groupId; a negative id selects the active group. */
+int
+perfmon_getNumberOfEvents(int groupId)
+{
+    const PerfmonEventSet* grp;
+
+    grp = &groupSet->groups[(groupId < 0) ? groupSet->activeGroup : groupId];
+    return grp->numberOfEvents;
+}
 
-    for (i=0; i<perfmon_numThreads; i++)
+double
+perfmon_getTimeOfGroup(int groupId) /* accumulated run time of a group */
+{
+    if (groupId < 0) /* a negative id selects the active group; too-large ids are not checked */
     {
-        free(perfmon_threadState[i]);
+        groupId = groupSet->activeGroup;
     }
-    free(perfmon_threadState);
-    msr_finalize();
-    pci_finalize();
-    accessClient_finalize(socket_fd);
+    return groupSet->groups[groupId].runTime; /* summed up by __perfmon_stopCounters() on every successful stop */
 }
 
+/* Largest value a counter of the given register type can hold: a mask of
+ * regWidth one-bits (48 if box_map is unset or gives no positive width). */
+uint64_t
+perfmon_getMaxCounterValue(RegisterType type)
+{
+    int width = 48;
+
+    if (box_map && (box_map[type].regWidth > 0))
+    {
+        width = box_map[type].regWidth;
+    }
+    /* Shifting a 64 bit value by 64 or more is undefined, so saturate there */
+    return (width >= 64) ? ~0x0ULL : ((1ULL << width) - 1ULL);
+}
+
+
+
+
diff --git a/src/power.c b/src/power.c
index 3f4118c..2e12252 100644
--- a/src/power.c
+++ b/src/power.c
@@ -5,13 +5,14 @@
  *
  *      Description:  Module implementing Intel RAPL interface
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Authors:  Jan Treibig (jt), jan.treibig at gmail.com,
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -33,136 +34,458 @@
 
 #include <types.h>
 #include <power.h>
-#include <cpuid.h>
+#include <topology.h>
 
 /* #####   EXPORTED VARIABLES   ########################################### */
 
 PowerInfo power_info;
-const uint32_t power_regs[4] = {MSR_PKG_ENERGY_STATUS,
-                                MSR_PP0_ENERGY_STATUS,
-                                MSR_PP1_ENERGY_STATUS,
-                                MSR_DRAM_ENERGY_STATUS};
 
 /* #####   FUNCTION DEFINITIONS  -  LOCAL TO THIS SOURCE FILE   ########### */
-
+static int power_initialized = 0;
 
 
 /* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
 
-void
+int
 power_init(int cpuId)
 {
     uint64_t flags;
-    int hasRAPL = 0;
-    uint32_t info_register = 0x0;
+    int i;
+    int err;
 
     /* determine Turbo Mode features */
     double busSpeed;
 
-    if ((cpuid_info.model == SANDYBRIDGE_EP) ||
-            (cpuid_info.model == SANDYBRIDGE) ||
-            (cpuid_info.model == HASWELL) ||
-            (cpuid_info.model == HASWELL_EX) ||
-            (cpuid_info.model == IVYBRIDGE_EP) ||
-            (cpuid_info.model == IVYBRIDGE))
+    power_info.baseFrequency = 0;
+    power_info.minFrequency = 0;
+    power_info.turbo.numSteps = 0;
+    power_info.powerUnit = 0;
+    power_info.timeUnit = 0;
+    power_info.hasRAPL = 0;
+
+    switch (cpuid_info.model)
     {
-        hasRAPL = 1;
-        info_register = MSR_PKG_POWER_INFO;
+        case SANDYBRIDGE:
+        case IVYBRIDGE:
+        case HASWELL:
+        case SANDYBRIDGE_EP:
+        case IVYBRIDGE_EP:
+        case HASWELL_EP:
+        case ATOM_SILVERMONT_E:
+        case ATOM_SILVERMONT_Z1:
+        case ATOM_SILVERMONT_Z2:
+        case ATOM_SILVERMONT_F:
+        case BROADWELL:
+        case BROADWELL_E:
+        case BROADWELL_D:
+            power_info.hasRAPL = 1;
+            break;
+        case ATOM_SILVERMONT_C:
+            power_info.hasRAPL = 1;
+            /* The info_regs list needs an update for Silvermont Type C
+               because it uses another info register */
+            info_regs[PKG] = MSR_PKG_POWER_INFO_SILVERMONT;
+            break;
+        default:
+            DEBUG_PLAIN_PRINT(DEBUGLEV_INFO, NO RAPL SUPPORT);
+            return 0;
+            break;
     }
-    else if (cpuid_info.model == ATOM_SILVERMONT_C)
+
+    perfmon_init_maps();
+    if (!HPMinitialized())
     {
-        hasRAPL = 1;
-        info_register = MSR_PKG_POWER_INFO_SILVERMONT;
+        HPMaddThread(cpuId);
     }
-    else if ((cpuid_info.model == ATOM_SILVERMONT_E) ||
-             (cpuid_info.model == ATOM_SILVERMONT_F1) ||
-             (cpuid_info.model == ATOM_SILVERMONT_F2) ||
-             (cpuid_info.model == ATOM_SILVERMONT_F3))
+    if (power_initialized)
     {
-        hasRAPL = 1;
+        return 0;
+    }
+    if ( power_info.hasRAPL )
+    {
+        busSpeed = 100.0;
+    }
+    else
+    {
+        busSpeed = 133.33;
     }
-
     if (cpuid_info.turbo)
     {
-        flags = msr_read(cpuId, MSR_PLATFORM_INFO);
-
-        if ( hasRAPL )
-        {
-            busSpeed = 100.0;
-        }
-        else 
+        err = HPMread(cpuId, MSR_DEV, MSR_PLATFORM_INFO, &flags);
+        if (err == 0)
         {
-            busSpeed = 133.33;
-        }
-
-        power_info.baseFrequency = busSpeed * (double) extractBitField(flags,8,8);
-        power_info.minFrequency  = busSpeed * (double) extractBitField((flags>>(32)),8,8);
+            power_info.baseFrequency = busSpeed * (double) extractBitField(flags,8,8);
+            power_info.minFrequency  = busSpeed * (double) extractBitField((flags>>(32)),8,8);
 
-        power_info.turbo.numSteps = cpuid_topology.numCoresPerSocket;
-        power_info.turbo.steps = (double*) malloc(power_info.turbo.numSteps * sizeof(double));
-
-        flags = msr_read(cpuId, MSR_TURBO_RATIO_LIMIT);
+            power_info.turbo.numSteps = cpuid_topology.numCoresPerSocket;
+            if (cpuid_info.model == WESTMERE_EX)
+            {
+                power_info.turbo.numSteps = 4;
+            }
+            power_info.turbo.steps = (double*) malloc(power_info.turbo.numSteps * sizeof(double));
+            if (!power_info.turbo.steps)
+            {
+                return -ENOMEM;
+            }
 
-        for (int i=0; i < power_info.turbo.numSteps; i++)
-        {
-            if (i < 8)
+            err = HPMread(cpuId, MSR_DEV, MSR_TURBO_RATIO_LIMIT, &flags);
+            if (err)
             {
-                power_info.turbo.steps[i] = busSpeed * (double) field64(flags,i*8, 8);
+                fprintf(stderr,"Cannot gather values from MSR_TURBO_RATIO_LIMIT,\n");
             }
             else
             {
-                power_info.turbo.steps[i] = power_info.turbo.steps[7];
+                for (int i=0; i < power_info.turbo.numSteps; i++)
+                {
+                    if (i < 8)
+                    {
+                        power_info.turbo.steps[i] = busSpeed * (double) field64(flags,i*8, 8);
+                    }
+                    else
+                    {
+                        power_info.turbo.steps[i] = power_info.turbo.steps[7];
+                    }
+                }
             }
+            //TODO: Haswell EP and possibly Broadwell EP support multiple turbo 
+            //      registers besides MSR_TURBO_RATIO_LIMIT:
+            //      MSR_TURBO_RATIO_LIMIT1 and MSR_TURBO_RATIO_LIMIT2
+        }
+        else
+        {
+            fprintf(stderr,"Cannot gather values from MSR_PLATFORM_INFO,\n");
         }
-    }
-    else
-    {
-        power_info.turbo.numSteps = 0;
     }
 
     /* determine RAPL parameters */
-    if ( hasRAPL )
+    if ( power_info.hasRAPL )
     {
-        flags = msr_read(cpuId, MSR_RAPL_POWER_UNIT);
-
-        power_info.powerUnit = pow(0.5,(double) extractBitField(flags,4,0));
-        power_info.energyUnit = pow(0.5,(double) extractBitField(flags,5,8));
-        power_info.timeUnit = pow(0.5,(double) extractBitField(flags,4,16));
-
-        if (info_register != 0x0)
+        err = HPMread(cpuId, MSR_DEV, MSR_RAPL_POWER_UNIT, &flags);
+        if (err == 0)
         {
-            flags = msr_read(cpuId, info_register);
-            power_info.tdp = (double) extractBitField(flags,15,0) * power_info.powerUnit;
-            if (cpuid_info.model != ATOM_SILVERMONT_C)
+            double energyUnit;
+            power_info.powerUnit = 1000000 / (1<<(flags & 0xF));
+            power_info.timeUnit = 1000000 / (1 << ((flags>>16) & 0xF));
+            if (cpuid_info.model != ATOM_SILVERMONT_E)
             {
-                power_info.minPower =  (double) extractBitField(flags,15,16) * power_info.powerUnit;
-                power_info.maxPower = (double) extractBitField(flags,15,32) * power_info.powerUnit;
-                power_info.maxTimeWindow = (double) extractBitField(flags,7,48) * power_info.timeUnit;
+                energyUnit = 1.0 / (1 << ((flags >> 8) & 0x1F));
             }
             else
             {
-                power_info.minPower = 0.0;
-                power_info.maxPower = 0.0;
-                power_info.maxTimeWindow = 0.0;
+                energyUnit = 1.0 * (1 << ((flags >> 8) & 0x1F)) / 1000000;
+            }
+            
+            for (i = 0; i < NUM_POWER_DOMAINS; i++)
+            {
+                power_info.domains[i].energyUnit = energyUnit;
+                power_info.domains[i].type = i;
+                power_info.domains[i].supportFlags = 0x0U;
+                power_info.domains[i].tdp = 0.0;
+                power_info.domains[i].minPower = 0.0;
+                power_info.domains[i].maxPower = 0.0;
+                power_info.domains[i].maxTimeWindow = 0.0;
+            }
+            
+            if ((cpuid_info.model == HASWELL_EP) ||
+                (cpuid_info.model == HASWELL_M1) ||
+                (cpuid_info.model == HASWELL_M2))
+            {
+                power_info.domains[DRAM].energyUnit = 15.3E-6;
+            }
+
+            for(i = 0; i < NUM_POWER_DOMAINS; i++)
+            {
+                err = HPMread(cpuId, MSR_DEV, power_regs[i], &flags);
+                if (err == 0)
+                {
+                    power_info.domains[i].supportFlags |= POWER_DOMAIN_SUPPORT_STATUS;
+                }
+                else
+                {
+                    DEBUG_PRINT(DEBUGLEV_DETAIL, RAPL domain %s not supported, power_names[i]);
+                    continue;
+                }
+                if (limit_regs[i] != 0x0)
+                {
+                    err = HPMread(cpuId, MSR_DEV, limit_regs[i], &flags);
+                    if (err == 0)
+                    {
+                        power_info.domains[i].supportFlags |= POWER_DOMAIN_SUPPORT_LIMIT;
+                    }
+                    else
+                    {
+                        DEBUG_PRINT(DEBUGLEV_DETAIL, Deactivating limit register for RAPL domain %s, power_names[i]);
+                        limit_regs[i] = 0x0;
+                    }
+                }
+                if (info_regs[i] != 0x0)
+                {
+                    err = HPMread(cpuId, MSR_DEV, info_regs[i], &flags);
+                    if (err == 0)
+                    {
+                        power_info.domains[i].supportFlags |= POWER_DOMAIN_SUPPORT_INFO;
+                        power_info.domains[i].tdp = (double) extractBitField(flags,15,0) * power_info.powerUnit;
+                        if (cpuid_info.model != ATOM_SILVERMONT_C)
+                        {
+                            power_info.domains[i].minPower = (double) extractBitField(flags,15,16) * power_info.powerUnit;
+                            power_info.domains[i].maxPower = (double) extractBitField(flags,15,32) * power_info.powerUnit;
+                            power_info.domains[i].maxTimeWindow = (double) extractBitField(flags,7,48) * power_info.timeUnit;
+                        }
+                    }
+                    else
+                    {
+                        DEBUG_PRINT(DEBUGLEV_DETAIL, Deactivating info register for RAPL domain %s, power_names[i]);
+                        info_regs[i] = 0x0;
+                    }
+                }
+                if (policy_regs[i] != 0x0)
+                {
+                    err = HPMread(cpuId, MSR_DEV, policy_regs[i], &flags);
+                    if (err == 0)
+                    {
+                        power_info.domains[i].supportFlags |= POWER_DOMAIN_SUPPORT_POLICY;
+                    }
+                    else
+                    {
+                        DEBUG_PRINT(DEBUGLEV_DETAIL, Deactivating policy register for RAPL domain %s, power_names[i]);
+                        policy_regs[i] = 0x0;
+                    }
+                }
+                if (perf_regs[i] != 0x0)
+                {
+                    err = HPMread(cpuId, MSR_DEV, perf_regs[i], &flags);
+                    if (err == 0)
+                    {
+                        power_info.domains[i].supportFlags |= POWER_DOMAIN_SUPPORT_PERF;
+                    }
+                    else
+                    {
+                        DEBUG_PRINT(DEBUGLEV_DETAIL, Deactivating perf register for RAPL domain %s, power_names[i]);
+                        perf_regs[i] = 0x0;
+                    }
+                }
             }
         }
         else
         {
-            power_info.tdp = 0;
-            power_info.minPower = 0.0;
-            power_info.maxPower = 0.0;
-            power_info.maxTimeWindow = 0.0;
+            fprintf(stderr,"Cannot gather values from MSR_RAPL_POWER_UNIT, deactivating RAPL support\n");
+            power_info.hasRAPL =  0;
         }
+        power_initialized = 1;
+        return power_info.hasRAPL;
     }
     else
     {
-        power_info.powerUnit = 0.0;
-        power_info.energyUnit = 0.0;
-        power_info.timeUnit = 0.0;
-        power_info.tdp = 0;
-        power_info.minPower = 0.0;
-        power_info.maxPower = 0.0;
-        power_info.maxTimeWindow = 0.0;
+        return power_info.hasRAPL;
+    }
+    return 0;
+}
+
+/* All functions below are experimental and probably don't work */
+/*
+ * Read the perf register of a RAPL domain (experimental).
+ *
+ * cpuId   CPU on which the register is read
+ * domain  RAPL domain, must be below NUM_POWER_DOMAINS
+ * status  receives the lower 32 bits of the register; it is set to 0 when
+ *         the domain is invalid, lacks perf support or the read fails
+ *
+ * Returns 0 on success (also when the domain has no perf support),
+ * -EINVAL for a missing status pointer or an out-of-range domain and
+ * -EFAULT when the register read fails.
+ */
+int power_perfGet(int cpuId, PowerType domain, uint32_t* status)
+{
+    int err = 0;
+    /* HPMread stores a full 64 bit value. Reading into a local avoids
+       writing 8 bytes through the caller's 32 bit pointer. */
+    uint64_t value = 0x0ULL;
+
+    if (!status)
+    {
+        return -EINVAL;
+    }
+    *status = 0x0U;
+    if (domain >= NUM_POWER_DOMAINS)
+    {
+        return -EINVAL;
+    }
+    if (power_info.domains[domain].supportFlags & POWER_DOMAIN_SUPPORT_PERF)
+    {
+        err = HPMread(cpuId, MSR_DEV, perf_regs[domain], &value);
+        if (err)
+        {
+            ERROR_PRINT(Failed to get power perf value for domain %s on CPU %d,power_names[domain], cpuId);
+            return -EFAULT;
+        }
+        *status = (uint32_t)(value & 0xFFFFFFFFULL);
+    }
+    return 0;
+}
 
+/*
+ * Set the power limit of a RAPL domain (experimental, NOT implemented).
+ *
+ * After validating the domain the function only prints "Not implemented"
+ * to stderr and returns 0; no register is written. -EINVAL is returned
+ * for a domain that is not below NUM_POWER_DOMAINS.
+ *
+ * The unfinished draft of the register update is kept for reference but
+ * compiled out: it sat behind the unconditional return and was never
+ * executed.
+ */
+int power_limitSet(int cpuId, PowerType domain, double power, double time, int doClamping)
+{
+    if (domain >= NUM_POWER_DOMAINS)
+    {
+        return -EINVAL;
+    }
+    /* The arguments are only consumed by the disabled draft below. */
+    (void)cpuId;
+    (void)power;
+    (void)time;
+    (void)doClamping;
+    fprintf(stderr, "Not implemented\n");
+    return 0;
+
+#if 0
+    int err = 0;
+    uint32_t X = (log(time) - log(power_info.timeUnit))/log(2);
+    uint32_t powerField = (uint32_t)(power/(power_info.domains[domain].energyUnit));
+    uint64_t flags = (powerField & 0xFFFF)|((X & (0x1F))<<17);
+    // Construct flags missing. How is timeField calculated?
+    if (doClamping)
+    {
+        flags |= (1ULL<<16);
+    }
+    if (power_info.domains[domain].supportFlags & POWER_DOMAIN_SUPPORT_LIMIT)
+    {
+        err = HPMwrite(cpuId, MSR_DEV, limit_regs[domain], flags);
+        if (err)
+        {
+            fprintf(stderr, "Failed to set power limit for domain %s on CPU %d\n",power_names[domain], cpuId);
+            return -EFAULT;
+        }
+    }
+    return 0;
+#endif
+}
+
+/*
+ * Read the current power limit and time window of a RAPL domain
+ * (experimental).
+ *
+ * cpuId   CPU on which the limit register is read
+ * domain  RAPL domain, must be below NUM_POWER_DOMAINS
+ * power   receives the decoded limit; set to 0 if nothing could be read
+ * time    receives the decoded time window; set to 0 if nothing could be read
+ *
+ * Returns 0 on success (also when the domain has no limit register
+ * support), -EINVAL for an out-of-range domain and -EFAULT when the
+ * register read fails.
+ */
+int power_limitGet(int cpuId, PowerType domain, double* power, double* time)
+{
+    int err = 0;
+    unsigned int Y,Z;
+    uint64_t flags = 0x0ULL;
+
+    /* Outputs are zeroed first so they are defined on every return path. */
+    *power = 0;
+    *time = 0;
+    if (domain >= NUM_POWER_DOMAINS)
+    {
+        return -EINVAL;
+    }
+    if (power_info.domains[domain].supportFlags & POWER_DOMAIN_SUPPORT_LIMIT)
+    {
+        err = HPMread(cpuId, MSR_DEV, limit_regs[domain], &flags);
+        if (err)
+        {
+            fprintf(stderr, "Failed to get power limit for domain %s on CPU %d\n",power_names[domain], cpuId);
+            return -EFAULT;
+        }
+        /* NOTE(review): the limit field is scaled with the domain's
+           energyUnit while power_init scales power fields with
+           power_info.powerUnit; confirm which unit is intended. */
+        *power = ((double)extractBitField(flags, 15, 0)) * power_info.domains[domain].energyUnit;
+        /* Time window = 2^Y * (1 + Z/4) * timeUnit; presumably the
+           arguments of extractBitField are (value, width, offset). */
+        Y = extractBitField(flags, 5, 17);
+        Z = extractBitField(flags, 2, 22);
+        *time = pow(2,((double)Y)) * (1.0 + (((double)Z)/4.0)) * power_info.timeUnit;
+    }
+    return 0;
+}
+
+/*
+ * Report whether the power limit of a RAPL domain is enabled
+ * (experimental).
+ *
+ * Returns 1 when bit 15 of the domain's limit register is set, 0 when it
+ * is clear or the domain has no limit register support, -EINVAL for an
+ * out-of-range domain and -EFAULT when the register read fails.
+ */
+int power_limitState(int cpuId, PowerType domain)
+{
+    int err = 0;
+    uint64_t flags = 0x0ULL;
+
+    if (domain >= NUM_POWER_DOMAINS)
+    {
+        return -EINVAL;
+    }
+    if (power_info.domains[domain].supportFlags & POWER_DOMAIN_SUPPORT_LIMIT)
+    {
+        err = HPMread(cpuId, MSR_DEV, limit_regs[domain], &flags);
+        if (err)
+        {
+            ERROR_PRINT(Failed to read power limit state for domain %s on CPU %d,power_names[domain], cpuId);
+            return -EFAULT;
+        }
+    }
+    /* flags is still 0 when the domain is unsupported, so this yields 0. */
+    return (flags & (1ULL<<15)) ? 1 : 0;
+}
+
+/*
+ * Enable the power limit of a RAPL domain (experimental).
+ *
+ * Reads the domain's limit register, sets bit 15 and writes the value
+ * back. Domains without limit register support are left untouched.
+ *
+ * Returns 0 on success or when the domain has no limit support, -EINVAL
+ * for an out-of-range domain and -EFAULT when the register read or write
+ * fails.
+ */
+int power_limitActivate(int cpuId, PowerType domain)
+{
+    uint64_t regValue = 0x0ULL;
+    int err = 0;
+
+    if (domain >= NUM_POWER_DOMAINS)
+    {
+        return -EINVAL;
+    }
+    if (!(power_info.domains[domain].supportFlags & POWER_DOMAIN_SUPPORT_LIMIT))
+    {
+        return 0;
+    }
+
+    err = HPMread(cpuId, MSR_DEV, limit_regs[domain], &regValue);
+    if (err)
+    {
+        ERROR_PRINT(Failed to activate power limit for domain %s on CPU %d,power_names[domain], cpuId);
+        return -EFAULT;
+    }
+
+    regValue |= (1ULL<<15);
+    err = HPMwrite(cpuId, MSR_DEV, limit_regs[domain], regValue);
+    if (err)
+    {
+        ERROR_PRINT(Failed to activate power limit for domain %s on CPU %d,power_names[domain], cpuId);
+        return -EFAULT;
+    }
+    return 0;
+}
+
+/*
+ * Disable the power limit of a RAPL domain (experimental).
+ *
+ * Reads the domain's limit register, clears bit 15 and writes the value
+ * back. Domains without limit register support are left untouched.
+ *
+ * Returns 0 on success or when the domain has no limit support, -EINVAL
+ * for an out-of-range domain and -EFAULT when the register read or write
+ * fails.
+ */
+int power_limitDectivate(int cpuId, PowerType domain)
+{
+    int err = 0;
+    uint64_t flags = 0x0ULL;
+
+    /* Reject invalid domains before they are used as array index. */
+    if (domain >= NUM_POWER_DOMAINS)
+    {
+        return -EINVAL;
+    }
+    if (power_info.domains[domain].supportFlags & POWER_DOMAIN_SUPPORT_LIMIT)
+    {
+        err = HPMread(cpuId, MSR_DEV, limit_regs[domain], &flags);
+        if (err)
+        {
+            ERROR_PRINT(Failed to deactivate power limit for domain %s on CPU %d,power_names[domain], cpuId);
+            return -EFAULT;
+        }
+        flags &= ~(1ULL<<15);
+        err = HPMwrite(cpuId, MSR_DEV, limit_regs[domain], flags);
+        if (err)
+        {
+            ERROR_PRINT(Failed to deactivate power limit for domain %s on CPU %d,power_names[domain], cpuId);
+            return -EFAULT;
+        }
+    }
+    return 0;
+}
+
+/*
+ * Write the policy register of a RAPL domain (experimental).
+ *
+ * The priority is reduced with extractBitField(priority, 5, 0) before it
+ * is written. Domains without policy register support are left untouched.
+ *
+ * Returns 0 on success or when the domain has no policy support, -EINVAL
+ * for an out-of-range domain and -EFAULT when the register write fails.
+ */
+int power_policySet(int cpuId, PowerType domain, uint32_t priority)
+{
+    int err = 0;
+
+    if (domain >= NUM_POWER_DOMAINS)
+    {
+        return -EINVAL;
+    }
+
+    priority = extractBitField(priority, 5, 0);
+    if (!(power_info.domains[domain].supportFlags & POWER_DOMAIN_SUPPORT_POLICY))
+    {
+        return 0;
+    }
+
+    err = HPMwrite(cpuId, MSR_DEV, policy_regs[domain], priority);
+    if (err)
+    {
+        ERROR_PRINT(Failed to set power policy for domain %s on CPU %d,power_names[domain], cpuId);
+        return -EFAULT;
+    }
+    return 0;
+}
+
+/*
+ * Read the policy register of a RAPL domain (experimental).
+ *
+ * cpuId     CPU on which the register is read
+ * domain    RAPL domain, must be below NUM_POWER_DOMAINS
+ * priority  receives the lower 32 bits of the register; it is set to 0
+ *           when the domain is invalid, lacks policy support or the read
+ *           fails
+ *
+ * Returns 0 on success (also when the domain has no policy support),
+ * -EINVAL for a missing priority pointer or an out-of-range domain and
+ * -EFAULT when the register read fails.
+ */
+int power_policyGet(int cpuId, PowerType domain, uint32_t* priority)
+{
+    int err = 0;
+    /* HPMread stores a full 64 bit value. Reading into a local avoids
+       writing 8 bytes through the caller's 32 bit pointer. */
+    uint64_t value = 0x0ULL;
+
+    if (!priority)
+    {
+        return -EINVAL;
+    }
+    *priority = 0x0U;
+    if (domain >= NUM_POWER_DOMAINS)
+    {
+        return -EINVAL;
+    }
+    if (power_info.domains[domain].supportFlags & POWER_DOMAIN_SUPPORT_POLICY)
+    {
+        err = HPMread(cpuId, MSR_DEV, policy_regs[domain], &value);
+        if (err)
+        {
+            ERROR_PRINT(Failed to get power policy for domain %s on CPU %d,power_names[domain], cpuId);
+            return -EFAULT;
+        }
+        *priority = (uint32_t)(value & 0xFFFFFFFFULL);
+    }
+    return 0;
+}
+
+
+/*
+ * Release the turbo step table stored in power_info.turbo.steps.
+ *
+ * The pointer and the step count are reset afterwards, so calling this
+ * function again does not free the same memory twice.
+ */
+void power_finalize(void)
+{
+    /* free(NULL) is a no-op, no guard needed. */
+    free(power_info.turbo.steps);
+    power_info.turbo.steps = NULL;
+    power_info.turbo.numSteps = 0;
+}
+
+/* Hand out the address of the module-wide power_info structure. */
+PowerInfo_t get_powerInfo(void)
+{
+    PowerInfo_t info = &power_info;
+    return info;
+}
diff --git a/src/pthread-overload/Makefile b/src/pthread-overload/Makefile
index 5f460a5..fdeea0a 100644
--- a/src/pthread-overload/Makefile
+++ b/src/pthread-overload/Makefile
@@ -1,16 +1,18 @@
 # =======================================================================================
-#  
+#
 #      Filename:  Makefile
-# 
+#
 #      Description:  pthread-overload Makefile
-# 
-#      Version:   3.1.3
-#      Released:  4.11.2014
-# 
+#
+#      Version:   4.0
+#      Released:  16.6.2015
+#
 #      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+#               Thomas Roehl (tr), thomas.roehl at googlemail.com
+#
 #      Project:  likwid
 #
-#      Copyright (C) 2014 Jan Treibig
+#      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
 #
 #      This program is free software: you can redistribute it and/or modify it under
 #      the terms of the GNU General Public License as published by the Free Software
@@ -28,6 +30,9 @@
 
 include  ../../config.mk
 include  ../../make/include_$(COMPILER).mk
+include  ../../make/config_checks.mk
+include  ../../make/config_defines.mk
+
 
 TARGET   = liblikwidpin.so
 
@@ -35,7 +40,7 @@ ifneq ($(COLOR),NONE)
 DEFINES += -DCOLOR=$(COLOR)
 endif
 
-DEFINES  += -DMAX_NUM_THREADS=$(MAX_NUM_THREADS)
+DEFINES  += -DMAX_NUM_THREADS=$(MAX_NUM_THREADS) -D_GNU_SOURCE
 INCLUDES += -I../includes
 LIBS     += -ldl
 CPPFLAGS := $(CPPFLAGS) $(DEFINES) $(INCLUDES) 
@@ -43,5 +48,5 @@ CPPFLAGS := $(CPPFLAGS) $(DEFINES) $(INCLUDES)
 all: $(TARGET)
 
 $(TARGET): pthread-overload.c
-	$(CC) $(CFLAGS) $(ANSI_CFLAGS) $(CPPFLAGS) $(INCLUDES) $(SHARED_CFLAGS) $(SHARED_LFLAGS) -o ../../$(TARGET) pthread-overload.c $(LIBS)
+	$(CC) -Wl,-soname,$(TARGET).$(VERSION) $(CFLAGS) $(ANSI_CFLAGS) $(CPPFLAGS) $(INCLUDES) $(SHARED_CFLAGS) $(SHARED_LFLAGS) -o ../../$(TARGET) pthread-overload.c $(LIBS)
 
diff --git a/src/pthread-overload/pthread-overload.c b/src/pthread-overload/pthread-overload.c
index e9d5dcc..7acb9c7 100644
--- a/src/pthread-overload/pthread-overload.c
+++ b/src/pthread-overload/pthread-overload.c
@@ -3,16 +3,16 @@
  *
  *      Filename:  pthread-overload.c
  *
- *      Description:  Overloaded library for pthread_create call.
+ *      Description:  Overloaded library for pthread_create call. 
  *                    Implements pinning of threads together with likwid-pin.
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -77,25 +77,28 @@ pthread_create(pthread_t* thread,
     static int silent = 0;
     static int pin_ids[MAX_NUM_THREADS];
     static uint64_t skipMask = 0;
-    static int got_skipMask = 0;
 
 
     /* On first entry: Get Evironment Variable and initialize pin_ids */
     if (ncalled == 0)
     {
-        char *str = getenv("LIKWID_SKIP");
+        char *str;
         char *token, *saveptr;
         char *delimiter = ",";
         int i = 0;
         int ncpus = 0;
 
+        str = getenv("LIKWID_SKIP");
         if (str != NULL)
         {
-            skipMask = strtoul(str, &str, 10);
-            got_skipMask = 1;
+            skipMask = strtoul(str, &str, 16);
+        }
+        else
+        {
+            printf("[pthread wrapper] ERROR: Environment Variabel LIKWID_SKIP not set!\n");
         }
 
-        if ( got_skipMask == 0 && skipMask == 0x0 )
+        if ( skipMask == 0 )
         {
             dlerror();    /* Clear any existing error */
             dlsym(RTLD_DEFAULT,"__kmpc_begin");
@@ -104,14 +107,17 @@ pthread_create(pthread_t* thread,
                 skipMask = 0x1;
             }
         }
+
         if (getenv("LIKWID_SILENT") != NULL)
         {
             silent = 1;
         }
+#ifdef COLOR
         else
         {
             color_on(BRIGHT, COLOR);
         }
+#endif
 
         if (!silent)
         {
@@ -171,7 +177,7 @@ pthread_create(pthread_t* thread,
         {
             break;
         }
-        if (sosearchpaths[reallpthrindex] != NULL)
+        if (sosearchpaths[reallpthrindex] != NULL) 
         {
             reallpthrindex++;
         }
diff --git a/src/strUtil.c b/src/strUtil.c
deleted file mode 100644
index cf37920..0000000
--- a/src/strUtil.c
+++ /dev/null
@@ -1,975 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  strUtil.c
- *
- *      Description:  Utility routines for strings. Depends on bstring lib.
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <sys/types.h>
-#include <unistd.h>
-#include <sched.h>
-
-#include <error.h>
-#include <types.h>
-#include <bstrlib.h>
-#include <strUtil.h>
-#include <affinity.h>
-#include <cpuid.h>
-#include <pci.h>
-
-/* #####   FUNCTION DEFINITIONS  -  LOCAL TO THIS SOURCE FILE   ########### */
-static int
-cpu_count(cpu_set_t* set)
-{
-    uint32_t i;
-    int s = 0;
-    const __cpu_mask *p = set->__bits;
-    const __cpu_mask *end = &set->__bits[sizeof(cpu_set_t) / sizeof (__cpu_mask)];
-
-    while (p < end)
-    {
-        __cpu_mask l = *p++;
-
-        if (l == 0)
-        {
-            continue;
-        }
-
-        for (i=0; i< (sizeof(__cpu_mask)*8); i++)
-        {
-            if (l&(1UL<<i))
-            {
-            s++;
-            }
-        }
-    }
-
-    return s;
-}
-
-/* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
-int str2int(const char* str)
-{
-    char* endptr;
-    errno = 0;
-    unsigned long val;
-    val = strtoul(str, &endptr, 10);
-    if ((errno == ERANGE && val == LONG_MAX )
-        || (errno != 0 && val == 0))
-    {
-        ERROR;
-    }
-
-    if (endptr == str)
-    {
-        ERROR_PRINT(Cannot parse string %s to digits, str);
-    }
-
-    return (int) val;
-}
-
-uint32_t
-bstr_to_cpuset_physical(uint32_t* threads,  const_bstring q)
-{
-    int i;
-    unsigned int rangeBegin;
-    unsigned int rangeEnd;
-    uint32_t numThreads=0;
-    struct bstrList* tokens;
-    struct bstrList* subtokens;
-
-    tokens = bsplit(q,',');
-
-    for (i=0;i<tokens->qty;i++)
-    {
-        subtokens = bsplit(tokens->entry[i],'-');
-
-        if( subtokens->qty == 1 )
-        {
-            threads[numThreads] = str2int((char *) bdata(subtokens->entry[0]));
-            numThreads++;
-        }
-        else if ( subtokens->qty == 2 )
-        {
-            rangeBegin = str2int((char*) bdata(subtokens->entry[0]));
-            rangeEnd = str2int((char*) bdata(subtokens->entry[1]));
-
-            if (!(rangeBegin <= rangeEnd))
-            {
-                ERROR_PRINT(Range End %d bigger than begin %d, rangeEnd, rangeBegin);
-            }
-
-            while (rangeBegin <= rangeEnd) {
-                threads[numThreads] = rangeBegin;
-                numThreads++;
-                rangeBegin++;
-            }
-        }
-        else
-        {
-            ERROR_PLAIN_PRINT(Parse Error);
-        }
-        bstrListDestroy(subtokens);
-    }
-    if (numThreads > MAX_NUM_THREADS)
-    {
-        ERROR_PRINT(Number Of threads %d too large, numThreads);
-    }
-
-    bstrListDestroy(tokens);
-
-    return numThreads;
-}
-
-uint32_t
-bstr_to_cpuset_logical(uint32_t* threads,  const_bstring q)
-{
-    int i;
-    uint32_t j;
-    int id;
-    uint32_t tmpThreads[MAX_NUM_THREADS];
-    int globalNumThreads=0;
-    uint32_t numThreads=0;
-    struct bstrList* tokens;
-    struct bstrList* subtokens;
-    const AffinityDomain* domain;
-
-    tokens = bsplit(q,'@');
-
-    for (i=0;i<tokens->qty;i++)
-    {
-        subtokens = bsplit(tokens->entry[i],':');
-
-        if ( subtokens->qty == 2 )
-        {
-            domain =  affinity_getDomain(subtokens->entry[0]);
-
-            if (!domain)
-            {
-                ERROR_PRINT(Unknown domain ##%s##,bdata(subtokens->entry[0]));
-            }
-
-            numThreads = bstr_to_cpuset_physical(tmpThreads, subtokens->entry[1]);
-
-            for (j=0; j<numThreads; j++)
-                {
-                if (! (tmpThreads[j] >= domain->numberOfProcessors))
-                {
-                    id = (tmpThreads[j]/domain->numberOfCores) +
-                        (tmpThreads[j]%domain->numberOfCores) * cpuid_topology.numThreadsPerCore;
-                    threads[globalNumThreads++] = domain->processorList[id];
-                }
-                else
-                {
-                    ERROR_PRINT(Too many threads requested. Avaialable 0-%d,domain->numberOfProcessors-1);
-                }
-            }
-        }
-        else
-        {
-            ERROR_PLAIN_PRINT(Parse Error);
-        }
-        bstrListDestroy(subtokens);
-    }
-
-    bstrListDestroy(tokens);
-
-    return globalNumThreads;
-}
-
-#define PRINT_EXPR_ERR printf("SYNTAX ERROR: Expression must have the format E:<thread domain>:<num threads>[:chunk size>:<stride>]\n")
-
-uint32_t
-bstr_to_cpuset_expression(uint32_t* threads,  const_bstring qi)
-{
-    int i;
-    uint32_t j;
-    bstring q = (bstring) qi;
-    int globalNumThreads=0;
-    uint32_t numThreads=0;
-    struct bstrList* tokens;
-    struct bstrList* subtokens;
-    const AffinityDomain* domain;
-
-    bdelete (q, 0, 2);
-    tokens = bsplit(q,'@');
-
-    for (i=0;i<tokens->qty;i++)
-    {
-        subtokens = bsplit(tokens->entry[i],':');
-
-        if ( subtokens->qty == 2 )
-        {
-            domain =  affinity_getDomain(subtokens->entry[0]);
-
-            if (!domain)
-            {
-                ERROR_PRINT(Unknown domain ##%s##,bdata(subtokens->entry[0]));
-            }
-
-            numThreads = str2int(bdata(subtokens->entry[1]));
-
-            if (numThreads > domain->numberOfProcessors)
-            {
-                ERROR_PRINT(Invalid processor id requested. Avaialable 0-%d,
-                            domain->numberOfProcessors-1);
-            }
-
-            for (j=0; j<numThreads; j++)
-            {
-                threads[globalNumThreads++] = domain->processorList[j];
-            }
-        }
-        else if ( subtokens->qty == 4 )
-        {
-            int counter;
-            int currentId = 0;
-            int startId = 0;
-            int chunksize =  str2int(bdata(subtokens->entry[2]));
-            int stride =  str2int(bdata(subtokens->entry[3]));
-            domain = affinity_getDomain(subtokens->entry[0]);
-
-            if (!domain)
-            {
-                ERROR_PRINT(Unknown domain ##%s##,bdata(subtokens->entry[0]));
-            }
-
-            numThreads = str2int(bdata(subtokens->entry[1]));
-
-            if (numThreads > domain->numberOfProcessors)
-            {
-                ERROR_PRINT(Invalid number of processors requested. Available 0-%d,
-                            domain->numberOfProcessors-1);
-            }
-
-
-            counter = 0;
-            for (j=0; j<numThreads; j+=chunksize)
-            {
-                for(i=0;i<chunksize && j+i<numThreads ;i++)
-                {
-                    threads[globalNumThreads++] = domain->processorList[counter+i];
-                }
-                counter += stride;
-                if (counter >= domain->numberOfProcessors)
-                {
-                    counter = 0;
-                }
-            }
-        }
-        else
-        {
-            PRINT_EXPR_ERR;
-            ERROR_PLAIN_PRINT(Parse Error);
-        }
-        bstrListDestroy(subtokens);
-    }
-
-    bstrListDestroy(tokens);
-
-    return globalNumThreads;
-}
-
-uint32_t
-bstr_to_cpuset_scatter(uint32_t* threads,  const_bstring qi)
-{
-    int domainId = 0;
-    int id = 0;
-    int threadId = 0;
-    bstring q = (bstring) qi;
-    bstring domaintag;
-    int globalNumThreads=0;
-    struct bstrList* subtokens;
-    int numberOfDomains = 0;
-    AffinityDomain* domain;
-    AffinityDomain* tmpDomainPtr;
-
-    domain = (AffinityDomain*) malloc(cpuid_topology.numHWThreads * sizeof(AffinityDomain));
-
-    subtokens = bsplit(q,':');
-
-    if ( subtokens->qty == 2 )
-    {
-        for(int i =0;;i++)
-        {
-            domaintag = bformat("%s%d",bdata(subtokens->entry[0]),i);
-            tmpDomainPtr = (AffinityDomain*) affinity_getDomain(domaintag);
-
-            if (tmpDomainPtr == NULL)
-            {
-                break;
-            }
-            else
-            {
-                memcpy(domain+i,tmpDomainPtr,sizeof(AffinityDomain));
-                numberOfDomains++;
-            }
-        }
-
-        threads[globalNumThreads++] = domain[domainId].processorList[0];
-
-        for (uint32_t i=1; i<cpuid_topology.numHWThreads; i++)
-        {
-            domainId = i%numberOfDomains;
-
-            if (domainId == 0)
-            {
-                threadId++;
-            }
-
-            id = (threadId/domain->numberOfCores) +
-                (threadId%domain->numberOfCores) * cpuid_topology.numThreadsPerCore;
-
-            threads[globalNumThreads++] = domain[domainId].processorList[id];
-        }
-    }
-    else
-    {
-        PRINT_EXPR_ERR;
-        ERROR_PLAIN_PRINT(Parse Error);
-    }
-    bstrListDestroy(subtokens);
-    free(domain);
-
-    return globalNumThreads;
-}
-
-
-
-#define CPUSET_ERROR  \
-    if (cpuid_isInCpuset()) {  \
-        ERROR_PLAIN_PRINT(You are running inside a cpuset. In cpusets only logical pinning inside set is allowed!);  \
-    }
-
-
-
-int
-bstr_to_cpuset(int* threadsIN,  const_bstring q)
-{
-    uint32_t i;
-    int num=0;
-    int cpuMapping[cpuid_topology.numHWThreads];
-    cpu_set_t cpu_set;
-    uint32_t numThreads;
-    bstring domainStr = bformat("NSCM");
-    const_bstring  scatter = bformat("scatter");
-    struct bstrList* tokens;
-    CPU_ZERO(&cpu_set);
-    sched_getaffinity(0,sizeof(cpu_set_t), &cpu_set);
-    uint32_t* threads = (uint32_t*) threadsIN;
-
-    if (binchr (q, 0, domainStr) !=  BSTR_ERR)
-    {
-        CPUSET_ERROR;
-
-        if (binstr (q, 0 , scatter ) !=  BSTR_ERR)
-        {
-          numThreads =  bstr_to_cpuset_scatter(threads,q);
-        }
-        else if (bstrchr (q, 'E') !=  BSTR_ERR)
-        {
-          numThreads =  bstr_to_cpuset_expression(threads,q);
-        }
-        else
-        {
-          numThreads =  bstr_to_cpuset_logical(threads,q);
-        }
-    }
-    else if (bstrchr (q, 'L') !=  BSTR_ERR)
-    {
-        uint32_t count = cpu_count(&cpu_set);
-
-        tokens = bsplit(q,':');
-        numThreads = bstr_to_cpuset_physical(threads,tokens->entry[1]);
-
-        for (i=0; i <  cpuid_topology.numHWThreads; i++)
-        {
-            if (CPU_ISSET(i,&cpu_set))
-            {
-                cpuMapping[num++]=i;
-            }
-        }
-
-        for (i=0; i < numThreads; i++)
-        {
-            if (!(threads[i] >= count))
-            {
-                threads[i] = cpuMapping[threads[i]];
-            }
-            else
-            {
-                fprintf(stderr, "Available CPUs: ");
-                for (int j=0; j< num-1;j++)
-                {
-                    fprintf(stderr, "%d,", cpuMapping[j]);
-                }
-                fprintf(stderr, "%d\n", cpuMapping[num-1]);
-                ERROR_PRINT(Index %d out of range.,threads[i]);
-            }
-        }
-        bstrListDestroy(tokens);
-    }
-    else
-    {
-        CPUSET_ERROR;
-        numThreads = bstr_to_cpuset_physical(threads,q);
-    }
-
-    bdestroy(domainStr);
-    return (int) numThreads;
-}
-
-
-void
-bstr_to_eventset(StrUtilEventSet* set, const_bstring q)
-{
-    int i;
-    struct bstrList* tokens;
-    struct bstrList* subtokens;
-
-    tokens = bsplit(q,',');
-    set->numberOfEvents = tokens->qty;
-    set->events = (StrUtilEvent*)
-    malloc(set->numberOfEvents * sizeof(StrUtilEvent));
-
-    for (i=0;i<tokens->qty;i++)
-    {
-        subtokens = bsplit(tokens->entry[i],':');
-
-        if ( subtokens->qty != 2 )
-        {
-          
-            fprintf(stderr, "Cannot parse event string %s, probably missing counter name\n"
-                          ,bdata(tokens->entry[i]));
-            fprintf(stderr, "Format: <eventName>:<counter>,...\n");
-            msr_finalize();
-            pci_finalize();
-            exit(EXIT_FAILURE);
-
-        }
-        else
-        {
-            set->events[i].eventName = bstrcpy(subtokens->entry[0]);
-            set->events[i].counterName = bstrcpy(subtokens->entry[1]);
-        }
-
-        bstrListDestroy(subtokens);
-    }
-
-    bstrListDestroy(tokens);
-}
-
-FILE*
-bstr_to_outstream(const_bstring argString, bstring filter)
-{
-    int i;
-    char* cstr;
-    FILE* STREAM;
-    struct bstrList* tokens;
-    bstring base;
-    bstring suffix = bfromcstr(".");
-    bstring filename;
-
-    /* configure filter */
-    tokens = bsplit(argString,'.');
-
-    if (tokens->qty < 2)
-    {
-        fprintf(stderr, "Outputfile has no filetype suffix!\n");
-        fprintf(stderr, "Add suffix .txt for raw output or any supported filter suffix.\n");
-        exit(EXIT_FAILURE);
-    }
-
-    base = bstrcpy(tokens->entry[0]);
-
-    if (biseqcstr(tokens->entry[1],"txt"))
-    {
-        bassigncstr(filter, "NO");
-    }
-    else
-    {
-        bassigncstr(filter, TOSTRING(LIKWIDFILTERPATH));
-        bconchar(filter,'/');
-        bconcat(filter,tokens->entry[1]);
-    }
-
-    bconcat(suffix,tokens->entry[1]);
-    bstrListDestroy(tokens);
-
-    tokens = bsplit(base,'_');
-
-    if (tokens->qty < 1)
-    {
-        ERROR_PLAIN_PRINT(Error in parsing file string);
-    }
-
-    filename = bstrcpy(tokens->entry[0]);
-
-    for (i=1; i<tokens->qty; i++)
-    {
-        if (biseqcstr(tokens->entry[i],"%j"))
-        {
-            cstr = getenv("PBS_JOBID");
-            if (cstr != NULL) 
-            {
-                bcatcstr(filename, "_");
-                bcatcstr(filename, cstr);
-            }
-        }
-        else if (biseqcstr(tokens->entry[i],"%r"))
-        {
-            cstr = getenv("PMI_RANK");
-            if (cstr == NULL) 
-            {
-                cstr = getenv("OMPI_COMM_WORLD_RANK");
-            }
-            if (cstr != NULL) 
-            {
-                bcatcstr(filename, "_");
-                bcatcstr(filename, cstr);
-            }
-        }
-        else if (biseqcstr(tokens->entry[i],"%h"))
-        {
-            cstr = (char*) malloc(HOST_NAME_MAX * sizeof(char));
-            gethostname(cstr,HOST_NAME_MAX);
-            bcatcstr(filename, "_");
-            bcatcstr(filename, cstr);
-            free(cstr);
-        }
-        else if (biseqcstr(tokens->entry[i],"%p"))
-        {
-            bstring pid = bformat("_%d",getpid());
-            bconcat(filename, pid);
-            bdestroy(pid);
-        }
-        else 
-        {
-            ERROR_PLAIN_PRINT(Unsupported placeholder in filename!);
-        }
-    }
-
-    if (biseqcstr(filter,"NO"))
-    {
-        bconcat(filename, suffix);
-    }
-    else
-    {
-        bcatcstr(filter, " ");
-        bcatcstr(filename, ".tmp");
-        bconcat(filter, filename);
-    }
-
-    bstrListDestroy(tokens);
-    STREAM = fopen(bdata(filename),"w");
-    bdestroy(filename);
-    bdestroy(suffix);
-    bdestroy(base);
-
-    return STREAM;
-}
-
-
-uint64_t
-bstr_to_doubleSize(const_bstring str, DataType type)
-{
-    bstring unit = bmidstr(str, blength(str)-2, 2);
-    bstring sizeStr = bmidstr(str, 0, blength(str)-2);
-    uint64_t sizeU = str2int(bdata(sizeStr));
-    uint64_t junk = 0;
-    uint64_t bytesize = 0;
-
-    switch (type)
-    {
-        case SINGLE:
-        case SINGLE_RAND:
-            bytesize = sizeof(float);
-            break;
-
-        case DOUBLE:
-        case DOUBLE_RAND:
-            bytesize = sizeof(double);
-            break;
-    }
-
-    if (biseqcstr(unit, "kB")) {
-        junk = (sizeU *1024)/bytesize;
-    } else if (biseqcstr(unit, "MB")) {
-        junk = (sizeU *1024*1024)/bytesize;
-    } else if (biseqcstr(unit, "GB")) {
-        junk = (sizeU *1024*1024*1024)/bytesize;
-    }
-
-    return junk;
-}
-
-void
-bstr_to_interval(const_bstring str, struct timespec* interval)
-{
-    int size;
-    int pos;
-    bstring ms = bformat("ms");
-
-    if ((pos = bstrrchr (str, 's')) != BSTR_ERR)
-    {
-        if (pos != (blength(str)-1))
-        {
-            fprintf(stderr, "You need to specify a time unit s or ms like 200ms\n");
-            msr_finalize();
-            exit(EXIT_FAILURE);
-        }
-
-        /* unit is ms */
-        if (binstrr (str, blength(str), ms) != BSTR_ERR)
-        {
-            bstring sizeStr = bmidstr(str, 0, blength(str)-2);
-            size = str2int(bdata(sizeStr));
-            if (size >= 1000)
-            {
-                interval->tv_sec = size/1000;
-                interval->tv_nsec = (size%1000) * 1.E06;
-            }
-            else
-            {
-                interval->tv_sec = 0L;
-                interval->tv_nsec = size * 1.E06;
-            }
-        }
-        /* unit is s */
-        else 
-        {
-            bstring sizeStr = bmidstr(str, 0, blength(str)-1);
-            size = str2int(bdata(sizeStr));
-            interval->tv_sec = size;
-            interval->tv_nsec = 0L;
-        }
-    }
-    else
-    {
-        fprintf(stderr, "You need to specify a time unit s or ms like 200ms\n");
-        msr_finalize();
-        exit(EXIT_FAILURE);
-    }
-}
-
-
-void
-bstr_to_workgroup(Workgroup* group,
-    const_bstring str,
-    DataType type,
-    int numberOfStreams)
-{
-    uint32_t i;
-    int parseStreams = 0;
-    bstring threadInfo;
-    bstring streams= bformat("0");
-    struct bstrList* tokens;
-    struct bstrList* subtokens;
-    const AffinityDomain* domain;
-
-    /* split the workgroup into the thread and the streams part */
-    tokens = bsplit(str,'-');
-
-    if (tokens->qty == 2)
-    {
-        threadInfo = bstrcpy(tokens->entry[0]);
-        streams = bstrcpy(tokens->entry[1]);
-        parseStreams = 1;
-    }
-    else if (tokens->qty == 1)
-    {
-        threadInfo = bstrcpy(tokens->entry[0]);
-    }
-    else
-    {
-        ERROR_PLAIN_PRINT(Error in parsing workgroup string);
-    }
-
-    bstrListDestroy (tokens);
-    tokens = bsplit(threadInfo,':');
-
-    if (tokens->qty == 5)
-    {
-        uint32_t maxNumThreads;
-        int chunksize;
-        int stride;
-        int counter;
-        int currentId = 0;
-        int startId = 0;
-
-        domain = affinity_getDomain(tokens->entry[0]);
-
-        if (domain == NULL)
-        {
-          fprintf(stderr, "Error: Domain %s not available on current machine.\nTry likwid-bench -p for supported domains.",
-              bdata(tokens->entry[0]));
-          exit(EXIT_FAILURE);
-        }
-
-        group->size = bstr_to_doubleSize(tokens->entry[1], type);
-        group->numberOfThreads = str2int(bdata(tokens->entry[2]));
-        chunksize = str2int(bdata(tokens->entry[3]));
-        stride = str2int(bdata(tokens->entry[4]));
-        maxNumThreads = (domain->numberOfProcessors / stride) * chunksize;
-
-        if (group->numberOfThreads > maxNumThreads)
-        {
-          fprintf(stderr, "Error: Domain %s supports only up to %d threads with used expression.\n",
-                        bdata(tokens->entry[0]), maxNumThreads);
-          exit(EXIT_FAILURE);
-        }
-
-        group->processorIds = (int*) malloc(group->numberOfThreads * sizeof(int));
-
-        counter = chunksize;
-
-        for (i=0; i<group->numberOfThreads; i++)
-        {
-            if (counter)
-            {
-                group->processorIds[i] = domain->processorList[currentId++];
-            }
-            else
-            {
-                startId += stride;
-                currentId = startId;
-                group->processorIds[i] = domain->processorList[currentId++];
-                counter = chunksize;
-            }
-            counter--;
-        }
-    }
-    else if (tokens->qty == 3)
-    {
-        domain = affinity_getDomain(tokens->entry[0]);
-
-        if (domain == NULL)
-        {
-            fprintf(stderr, "Error: Domain %s not available on current machine.\n", bdata(tokens->entry[0]));
-            fprintf(stderr, "Try likwid-bench -p for supported domains.\n");
-            exit(EXIT_FAILURE);
-        }
-
-        group->size = bstr_to_doubleSize(tokens->entry[1], type);
-        group->numberOfThreads = str2int(bdata(tokens->entry[2]));
-
-        if (group->numberOfThreads > domain->numberOfProcessors)
-        {
-            fprintf(stderr, "Error: Domain %s supports only up to %d threads.\n",
-                            bdata(tokens->entry[0]),domain->numberOfProcessors);
-            exit(EXIT_FAILURE);
-        }
-
-        group->processorIds = (int*) malloc(group->numberOfThreads * sizeof(int));
-
-        for (i=0; i<group->numberOfThreads; i++)
-        {
-            group->processorIds[i] = domain->processorList[i];
-        }
-    }
-    else if (tokens->qty == 2)
-    {
-        domain = affinity_getDomain(tokens->entry[0]);
-
-        if (domain == NULL)
-        {
-            fprintf(stderr, "Error: Domain %s not available on current machine.\n",
-                            bdata(tokens->entry[0]));
-            fprintf(stderr, "Try likwid-bench -p for supported domains.\n");
-            exit(EXIT_FAILURE);
-        }
-
-        group->size = bstr_to_doubleSize(tokens->entry[1], type);
-        group->numberOfThreads = domain->numberOfProcessors;
-        group->processorIds = (int*) malloc(group->numberOfThreads * sizeof(int));
-
-        for (i=0; i<group->numberOfThreads; i++)
-        {
-            group->processorIds[i] = domain->processorList[i];
-        }
-    }
-    else
-    {
-    ERROR_PLAIN_PRINT(Error in parsing workgroup string);
-    }
-
-    bstrListDestroy(tokens);
-
-    /* parse stream list */
-    if (parseStreams)
-    {
-        tokens = bsplit(streams,',');
-
-        if (tokens->qty < numberOfStreams)
-        {
-            ERROR_PRINT(Testcase requires at least %d streams, numberOfStreams);
-        }
-
-        group->streams = (Stream*) malloc(numberOfStreams * sizeof(Stream));
-
-        for (i=0;i<(uint32_t) tokens->qty;i++)
-        {
-            subtokens = bsplit(tokens->entry[i],':');
-
-            if ( subtokens->qty == 3 )
-            {
-                int index = str2int(bdata(subtokens->entry[0]));
-                if (index >= numberOfStreams)
-                {
-                    ERROR_PRINT(Stream Index %d out of range,index);
-                }
-                group->streams[index].domain = bstrcpy(subtokens->entry[1]);
-                group->streams[index].offset = str2int(bdata(subtokens->entry[2]));
-            }
-            else if ( subtokens->qty == 2 )
-            {
-                int index = str2int(bdata(subtokens->entry[0]));
-                if (index >= numberOfStreams)
-                {
-                    ERROR_PRINT(Stream Index %d out of range,index);
-                }
-                group->streams[index].domain = bstrcpy(subtokens->entry[1]);
-                group->streams[index].offset = 0;
-            }
-            else
-            {
-                ERROR_PLAIN_PRINT(Error in parsing event string);
-            }
-
-            bstrListDestroy(subtokens);
-        }
-
-        bstrListDestroy(tokens);
-    }
-    else
-    {
-        group->streams = (Stream*) malloc(numberOfStreams * sizeof(Stream));
-
-        for (i=0; i< (uint32_t)numberOfStreams; i++)
-        {
-            group->streams[i].domain = domain->tag;
-            group->streams[i].offset = 0;
-        }
-    }
-
-    group->size /= numberOfStreams;
-}
-
-
-#define INIT_SECURE_INPUT_LENGTH 256
-
-bstring
-bSecureInput (int maxlen, char* vgcCtx) {
-    int i, m, c = 1;
-    bstring b, t;
-    int termchar = 0;
-
-    if (!vgcCtx) return NULL;
-
-    b = bfromcstralloc (INIT_SECURE_INPUT_LENGTH, "");
-
-    for (i=0; ; i++)
-    {
-        if (termchar == c)
-        {
-            break;
-        }
-        else if ((maxlen > 0) && (i >= maxlen))
-        {
-            b = NULL;
-            return b;
-        }
-        else
-        {
-            c = *(vgcCtx++);
-        }
-
-        if (EOF == c)
-        {
-            break;
-        }
-
-        if (i+1 >= b->mlen) {
-
-            /* Double size, but deal with unusual case of numeric
-             overflows */
-
-            if ((m = b->mlen << 1)   <= b->mlen &&
-                (m = b->mlen + 1024) <= b->mlen &&
-                (m = b->mlen + 16)   <= b->mlen &&
-                (m = b->mlen + 1)    <= b->mlen)
-            {
-                t = NULL;
-            }
-            else
-            {
-                t = bfromcstralloc (m, "");
-            }
-
-            if (t)
-            {
-                memcpy (t->data, b->data, i);
-            }
-
-            bdestroy (b); /* Clean previous buffer */
-            b = t;
-            if (!b)
-            {
-                return b;
-            }
-        }
-
-        b->data[i] = (unsigned char) c;
-    }
-
-    i--;
-    b->slen = i;
-    b->data[i] = (unsigned char) '\0';
-    return b;
-}
-
-
-int
-bJustifyCenter (bstring b, int width) 
-{
-    unsigned char space  = ' ';
-    int alignSpace = (width - b->slen) / 2;
-    int restSpace = (width - b->slen) % 2;
-    if (width <= 0) return -__LINE__;
-
-    if (b->slen <= width)
-    {
-        binsertch (b, 0, alignSpace, space);
-    }
-
-    binsertch (b, b->slen , alignSpace+restSpace, space);
-
-    return BSTR_OK;
-}
-
-
diff --git a/src/thermal.c b/src/thermal.c
index 0812086..5db0d5c 100644
--- a/src/thermal.c
+++ b/src/thermal.c
@@ -5,13 +5,13 @@
  *
  *      Description:  Module implementing Intel TM/TM2 interface
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -34,7 +34,7 @@
 
 #include <types.h>
 #include <thermal.h>
-#include <cpuid.h>
+#include <topology.h>
 
 /* #####   EXPORTED VARIABLES   ########################################### */
 
@@ -52,7 +52,10 @@ void thermal_init(int cpuId)
 
     if ( cpuid_hasFeature(TM2) )
     {
-        flags = msr_read(cpuId, IA32_THERM_STATUS);
+        if (HPMread(cpuId, MSR_DEV, IA32_THERM_STATUS, &flags))
+        {
+            return;
+        }
 
         if ( flags & 0x1 )
         {
@@ -66,7 +69,10 @@ void thermal_init(int cpuId)
         thermal_info.resolution =  extractBitField(flags,4,27);
 
         flags = 0ULL;
-        flags = msr_read(cpuId, MSR_TEMPERATURE_TARGET);
+        if (HPMread(cpuId, MSR_DEV, MSR_TEMPERATURE_TARGET, &flags))
+        {
+            return;
+        }
         thermal_info.activationT =  extractBitField(flags,8,16);
         thermal_info.offset = extractBitField(flags,6,24);
     }
diff --git a/src/threads.c b/src/threads.c
deleted file mode 100644
index 87fa2b2..0000000
--- a/src/threads.c
+++ /dev/null
@@ -1,217 +0,0 @@
-/*
- * =======================================================================================
- *
- *      Filename:  threads.c
- *
- *      Description:  High level interface to pthreads
- *
- *      Version:   3.1.3
- *      Released:  4.11.2014
- *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
- *      Project:  likwid
- *
- *      Copyright (C) 2014 Jan Treibig
- *
- *      This program is free software: you can redistribute it and/or modify it under
- *      the terms of the GNU General Public License as published by the Free Software
- *      Foundation, either version 3 of the License, or (at your option) any later
- *      version.
- *
- *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
- *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
- *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
- *
- *      You should have received a copy of the GNU General Public License along with
- *      this program.  If not, see <http://www.gnu.org/licenses/>.
- *
- * =======================================================================================
- */
-
-/* #####   HEADER FILE INCLUDES   ######################################### */
-
-#include <stdlib.h>
-#include <stdio.h>
-
-#include <error.h>
-#include <types.h>
-#include <threads.h>
-
-
-/* #####   EXPORTED VARIABLES   ########################################### */
-
-pthread_barrier_t threads_barrier;
-ThreadData* threads_data;
-ThreadGroup* threads_groups;
-
-/* #####   VARIABLES  -  LOCAL TO THIS SOURCE FILE   ###################### */
-
-static pthread_t* threads = NULL;
-static pthread_attr_t attr;
-static int numThreads = 0;
-
-
-/* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
-
-void
-threads_init(FILE* OUTSTREAM, int numberOfThreads)
-{
-    int i;
-    numThreads = numberOfThreads;
-
-    threads = (pthread_t*) malloc(numThreads * sizeof(pthread_t));
-    threads_data = (ThreadData*) malloc(numThreads * sizeof(ThreadData));
-
-    for(i = 0; i < numThreads; i++)
-    {
-        threads_data[i].numberOfThreads = numThreads;
-        threads_data[i].globalNumberOfThreads = numThreads;
-        threads_data[i].globalThreadId = i;
-        threads_data[i].threadId = i;
-        threads_data[i].output = OUTSTREAM;
-    }
-
-    pthread_barrier_init(&threads_barrier, NULL, numThreads);
-    pthread_attr_init(&attr);
-    pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_JOINABLE);
-}
-
-
-void 
-threads_create(void *(*startRoutine)(void*))
-{
-    int i;
-
-    for(i = 0; i < numThreads; i++)
-    {
-        pthread_create(&threads[i],
-                &attr,
-                startRoutine,
-                (void*) &threads_data[i]); 
-    }
-}
-
-void 
-threads_createGroups(int numberOfGroups)
-{
-    int i;
-    int j;
-    int numThreadsPerGroup;
-    int globalId = 0;
-
-    if (numThreads % numberOfGroups)
-    {
-        ERROR_PRINT(Not enough threads %d to create %d groups,numThreads,numberOfGroups);
-    }
-    else
-    {
-        numThreadsPerGroup = numThreads / numberOfGroups;
-    }
-
-    threads_groups = (ThreadGroup*) malloc(numberOfGroups *
-            sizeof(ThreadGroup));
-
-    for (i = 0; i < numberOfGroups; i++)
-    {
-        threads_groups[i].numberOfThreads = numThreadsPerGroup;
-        threads_groups[i].threadIds = (int*) malloc(numThreadsPerGroup *
-                sizeof(int));
-
-        for (j = 0; j < numThreadsPerGroup; j++)
-        {
-            threads_data[globalId].threadId = j;
-            threads_data[globalId].groupId = i;
-            threads_data[globalId].numberOfGroups = numberOfGroups;
-            threads_data[globalId].numberOfThreads = numThreadsPerGroup;
-            threads_groups[i].threadIds[j] = globalId++;
-        }
-    }
-}
-
-
-void 
-threads_registerDataAll(ThreadUserData* data, threads_copyDataFunc func)
-{
-    int i;
-
-    if (func == NULL)
-    {
-        for(i = 0; i < numThreads; i++)
-        {
-            threads_data[i].data = (*data);
-        }
-    }
-    else
-    {
-        for(i = 0; i < numThreads; i++)
-        {
-            func( data, &threads_data[i].data);
-        }
-    }
-}
-
-void
-threads_registerDataThread(int threadId,
-        ThreadUserData* data,
-        threads_copyDataFunc func)
-{
-    if (func == NULL)
-    {
-        threads_data[threadId].data = (*data);
-    }
-    else
-    {
-        func( data, &threads_data[threadId].data);
-    }
-}
-
-void
-threads_registerDataGroup(int groupId,
-        ThreadUserData* data,
-        threads_copyDataFunc func)
-{
-    int i;
-
-    if (func == NULL)
-    {
-        for (i = 0; i < threads_groups[groupId].numberOfThreads; i++)
-        {
-            threads_data[threads_groups[groupId].threadIds[i]].data = (*data);
-        }
-    }
-    else
-    {
-        for (i = 0; i < threads_groups[groupId].numberOfThreads; i++)
-        {
-            func( data,
-                    &threads_data[threads_groups[groupId].threadIds[i]].data);
-        }
-    }
-}
-
-void
-threads_join(void)
-{
-    int i;
-
-    for(i=0; i < numThreads; i++)
-    {
-        pthread_join(threads[i], NULL);
-    }
-
-    pthread_attr_destroy(&attr);
-    pthread_barrier_destroy(&threads_barrier);
-}
-
-void
-threads_destroy(int numberOfGroups)
-{
-    int i;
-    free(threads_data);
-    for(i=0;i<numberOfGroups;i++)
-    {
-        free(threads_groups[i].threadIds);
-    }
-    free(threads_groups);
-    free(threads);
-}
diff --git a/src/timer.c b/src/timer.c
index 337c13d..6896d76 100644
--- a/src/timer.c
+++ b/src/timer.c
@@ -5,13 +5,13 @@
  *
  *      Description:  Implementation of timer module
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
  *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -34,16 +34,59 @@
 #include <sys/time.h>
 
 #include <types.h>
-#include <timer.h>
+#include <likwid.h>
 
 static uint64_t baseline = 0ULL;
 static uint64_t cpuClock = 0ULL;
 
+void (*TSTART)(TscCounter*) = NULL;
+void (*TSTOP)(TscCounter*) = NULL;
+
+#define CPUID                              \
+    __asm__ volatile ("cpuid"                             \
+            : "=a" (eax),     \
+            "=b" (ebx),     \
+            "=c" (ecx),     \
+            "=d" (edx)      \
+            : "0" (eax), "2" (ecx))
+
+static void fRDTSC(TscCounter* cpu_c)
+{
+    __asm__ volatile("xor %%eax,%%eax\n\t"           \
+    "cpuid\n\t"           \
+    "rdtsc\n\t"           \
+    "movl %%eax, %0\n\t"  \
+    "movl %%edx, %1\n\t"  \
+    : "=r" ((cpu_c)->int32.lo), "=r" ((cpu_c)->int32.hi) \
+    : : "%eax","%ebx","%ecx","%edx");
+}
+
+static void fRDTSC_CR(TscCounter* cpu_c)
+{
+    __asm__ volatile(   \
+    "rdtsc\n\t"           \
+    "movl %%eax, %0\n\t"  \
+    "movl %%edx, %1\n\t"  \
+    : "=r" ((cpu_c)->int32.lo), "=r" ((cpu_c)->int32.hi) \
+    : : "%eax","%ebx","%ecx","%edx");
+}
+
+static void fRDTSCP(TscCounter* cpu_c)
+{
+    __asm__ volatile(     \
+    "rdtscp\n\t"          \
+    "movl %%eax, %0\n\t"  \
+    "movl %%edx, %1\n\t"  \
+    "cpuid\n\t"           \
+    : "=r" ((cpu_c)->int32.lo), "=r" ((cpu_c)->int32.hi) \
+    : : "%eax","%ebx","%ecx","%edx");
+}
 
 static uint64_t
 getCpuSpeed(void)
 {
 #ifdef __x86_64
+    int i;
     TimerData data;
     TscCounter start;
     TscCounter stop;
@@ -51,9 +94,9 @@ getCpuSpeed(void)
     struct timeval tv1;
     struct timeval tv2;
     struct timezone tzp;
-    struct timespec delay = { 0, 800000000 }; /* calibration time: 800 ms */
+    struct timespec delay = { 0, 500000000 }; /* calibration time: 500 ms */
 
-    for (int i=0; i< 10; i++)
+    for (i=0; i< 10; i++)
     {
         timer_start(&data);
         timer_stop(&data);
@@ -63,12 +106,12 @@ getCpuSpeed(void)
     baseline = result;
     result = 0xFFFFFFFFFFFFFFFFULL;
 
-    for (int i=0; i< 2; i++)
+    for (i=0; i< 2; i++)
     {
-        RDTSC(start);
+        TSTART(&start);
         gettimeofday( &tv1, &tzp);
         nanosleep( &delay, NULL);
-        RDTSC_STOP(stop);
+        TSTOP(&stop);
         gettimeofday( &tv2, &tzp);
 
         result = MIN(result,(stop.int64 - start.int64));
@@ -98,7 +141,25 @@ getCpuSpeed(void)
 
 void timer_init( void )
 {
-    cpuClock = getCpuSpeed();
+    uint32_t eax,ebx,ecx,edx;
+    if ((!TSTART) && (!TSTOP))
+    {
+        TSTART = fRDTSC;
+        eax = 0x80000001;
+        CPUID;
+        if (edx & (1<<27))
+        {
+            TSTOP = fRDTSCP;
+        }
+        else
+        {
+            TSTOP = fRDTSC_CR;
+        }
+    }
+    if (cpuClock == 0ULL)
+    {
+        cpuClock = getCpuSpeed();
+    }
 }
 
 uint64_t timer_printCycles( TimerData* time )
@@ -128,7 +189,6 @@ double timer_print( TimerData* time )
     {
         cycles = time->stop.int64 - time->start.int64 - baseline;
     }
-
     return  ((double) cycles / (double) cpuClock);
 }
 
@@ -142,4 +202,41 @@ uint64_t timer_getBaseline( void )
     return baseline;
 }
 
+void timer_start( TimerData* time )
+{
+#ifdef __x86_64
+    if (TSTART)
+        TSTART(&(time->start));
+#endif
+#ifdef _ARCH_PPC
+    uint32_t tbl, tbu0, tbu1;
+
+    do {
+        __asm__ __volatile__ ("mftbu %0" : "=r"(tbu0));
+        __asm__ __volatile__ ("mftb %0" : "=r"(tbl));
+        __asm__ __volatile__ ("mftbu %0" : "=r"(tbu1));
+    } while (tbu0 != tbu1);
+
+    time->start.int64 = (((uint64_t)tbu0) << 32) | tbl;
+#endif
+}
+
+
+void timer_stop( TimerData* time )
+{
+#ifdef __x86_64
+    if (TSTOP)
+        TSTOP(&(time->stop));
+#endif
+#ifdef _ARCH_PPC
+    uint32_t tbl, tbu0, tbu1;
+    do {
+        __asm__ __volatile__ ("mftbu %0" : "=r"(tbu0));
+        __asm__ __volatile__ ("mftb %0" : "=r"(tbl));
+        __asm__ __volatile__ ("mftbu %0" : "=r"(tbu1));
+    } while (tbu0 != tbu1);
+
+    time->stop.int64 = (((uint64_t)tbu0) << 32) | tbl;
+#endif
+}
 
diff --git a/src/topology.c b/src/topology.c
new file mode 100644
index 0000000..2ba3b35
--- /dev/null
+++ b/src/topology.c
@@ -0,0 +1,965 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  topology.c
+ *
+ *      Description:  Interface to the topology backends
+ *
+ *      Version:   4.0
+ *      Released:  16.6.2015
+ *
+ *      Authors:  Jan Treibig (jt), jan.treibig at gmail.com,
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <unistd.h>
+#include <sched.h>
+
+#include <likwid.h>
+
+#include <topology.h>
+#include <error.h>
+#include <tree.h>
+#include <bitUtil.h>
+//#include <strUtil.h>
+#include <configuration.h>
+
+
+static volatile int init = 0; /* set to 1 by topology_init() so that later calls return early */
+CpuInfo cpuid_info;
+CpuTopology cpuid_topology;
+/* Filled by the tree builders in this file: indexed by a thread's apicId, holds that thread's coreId. */
+int affinity_thread2tile_lookup[MAX_NUM_THREADS];
+/* Long processor names; topology_setName() stores one of them in cpuid_info.name. */
+static char* pentium_m_b_str = "Intel Pentium M Banias processor";
+static char* pentium_m_d_str = "Intel Pentium M Dothan processor";
+static char* core_duo_str = "Intel Core Duo processor";
+static char* core_2a_str = "Intel Core 2 65nm processor";
+static char* core_2b_str = "Intel Core 2 45nm processor";
+static char* atom_45_str = "Intel Atom 45nm processor";
+static char* atom_32_str = "Intel Atom 32nm processor";
+static char* atom_22_str = "Intel Atom 22nm processor";
+static char* atom_silvermont_str = "Intel Atom (Silvermont) processor";
+static char* atom_airmont_str = "Intel Atom (Airmont) processor";
+static char* nehalem_bloom_str = "Intel Core Bloomfield processor";
+static char* nehalem_lynn_str = "Intel Core Lynnfield processor";
+static char* nehalem_west_str = "Intel Core Westmere processor";
+static char* sandybridge_str = "Intel Core SandyBridge processor";
+static char* ivybridge_str = "Intel Core IvyBridge processor";
+static char* ivybridge_ep_str = "Intel Xeon IvyBridge EN/EP/EX processor";
+static char* sandybridge_ep_str = "Intel Xeon SandyBridge EN/EP processor";
+static char* haswell_str = "Intel Core Haswell processor";
+static char* haswell_ep_str = "Intel Xeon Haswell EN/EP/EX processor";
+static char* broadwell_str = "Intel Core Broadwell processor";
+static char* broadwell_d_str = "Intel Xeon D Broadwell processor";
+static char* broadwell_ep_str = "Intel Xeon Broadwell EN/EP/EX processor";
+static char* nehalem_ex_str = "Intel Nehalem EX processor";
+static char* westmere_ex_str = "Intel Westmere EX processor";
+static char* xeon_mp_string = "Intel Xeon MP processor";
+static char* xeon_phi_string = "Intel Xeon Phi Coprocessor";
+static char* barcelona_str = "AMD Barcelona processor";
+static char* shanghai_str = "AMD Shanghai processor";
+static char* istanbul_str = "AMD Istanbul processor";
+static char* magnycours_str = "AMD Magny Cours processor";
+static char* interlagos_str = "AMD Interlagos processor";
+static char* kabini_str = "AMD Family 16 model - Kabini processor";
+static char* opteron_sc_str = "AMD Opteron single core 130nm processor";
+static char* opteron_dc_e_str = "AMD Opteron Dual Core Rev E 90nm processor";
+static char* opteron_dc_f_str = "AMD Opteron Dual Core Rev F 90nm processor";
+static char* athlon64_str = "AMD Athlon64 X2 (AM2) Rev F 90nm processor";
+static char* athlon64_f_str = "AMD Athlon64 (AM2) Rev F 90nm processor";
+static char* athlon64_X2_g_str = "AMD Athlon64 X2 (AM2) Rev G 65nm processor";
+static char* athlon64_g_str = "AMD Athlon64 (AM2) Rev G 65nm processor";
+static char* amd_k8_str = "AMD K8 architecture";
+static char* unknown_intel_str = "Unknown Intel Processor";
+static char* unknown_amd_str = "Unknown AMD Processor";
+/* Short architecture identifiers; topology_setName() stores one of them in cpuid_info.short_name. */
+static char* short_core2 = "core2";
+static char* short_atom = "atom";
+static char* short_pm = "pentiumm";
+static char* short_silvermont = "silvermont";
+static char* short_nehalem = "nehalem";
+static char* short_nehalemEX = "nehalemEX";
+static char* short_westmere = "westmere";
+static char* short_westmereEX = "westmereEX";
+static char* short_haswell = "haswell";
+static char* short_haswell_ep = "haswellEP";
+static char* short_broadwell = "broadwell";
+static char* short_broadwell_ep = "broadwellEP";
+static char* short_ivybridge = "ivybridge";
+static char* short_ivybridge_ep = "ivybridgeEP";
+static char* short_sandybridge = "sandybridge";
+static char* short_sandybridge_ep = "sandybridgeEP";
+static char* short_phi = "phi";
+static char* short_k8 = "k8";
+static char* short_k10 = "k10";
+static char* short_k15 = "interlagos";
+static char* short_k16 = "kabini";
+static char* short_unknown = "unknown";
+
+
+
+/*
+ * Count the CPUs that are members of an affinity mask.
+ *
+ * set      affinity mask to inspect, e.g. one filled by sched_getaffinity()
+ * returns  number of CPUs set in the mask, or 0 when set is NULL
+ *
+ * The count is taken with the public CPU_COUNT() macro of glibc. The
+ * previous version walked the private __bits array of cpu_set_t using the
+ * private __cpu_mask type and tested bits with 1UL<<i, which ties the code
+ * to the internal layout of the type and to the width of unsigned long.
+ *
+ * CPU_COUNT() requires _GNU_SOURCE, exactly like the CPU_ZERO() and
+ * CPU_ISSET() macros that topology_init() in this file already relies on.
+ */
+int cpu_count(cpu_set_t* set)
+{
+    if (set == NULL)
+    {
+        return 0;
+    }
+
+    /*
+     * CPU_COUNT() covers sizeof(cpu_set_t) bytes, i.e. the same mask words
+     * the hand-written loop iterated over, so the result is unchanged.
+     */
+    return CPU_COUNT(set);
+}
+
+static void initTopologyFile(FILE* file) /* fills cpuid_topology from a binary dump read from file and rebuilds the tree */
+{
+    size_t items; /* NOTE(review): receives every fread() result below but is never checked */
+    HWThread* hwThreadPool;
+    CacheLevel* cacheLevels;
+    TreeNode* currentNode;
+    /* The dump starts with one raw CpuTopology record; its counts size the two arrays read next. */
+    items = fread((void*) &cpuid_topology, sizeof(CpuTopology), 1, file);
+    /* The pointer members just read are overwritten with fresh allocations (NOTE(review): malloc results unchecked). */
+    hwThreadPool = (HWThread*) malloc(cpuid_topology.numHWThreads * sizeof(HWThread));
+    items = fread((void*) hwThreadPool, sizeof(HWThread), cpuid_topology.numHWThreads, file);
+    cpuid_topology.threadPool = hwThreadPool;
+
+    cacheLevels = (CacheLevel*) malloc(cpuid_topology.numCacheLevels * sizeof(CacheLevel));
+    items = fread((void*) cacheLevels, sizeof(CacheLevel), cpuid_topology.numCacheLevels, file);
+    cpuid_topology.cacheLevels = cacheLevels;
+    cpuid_topology.topologyTree = NULL;
+
+    tree_init(&cpuid_topology.topologyTree, 0);
+    /* Build a three-level tree keyed by packageId, then coreId, then the thread's pool index i. */
+    for (uint32_t i=0; i<  cpuid_topology.numHWThreads; i++)
+    {
+        if (!tree_nodeExists(cpuid_topology.topologyTree,
+                    hwThreadPool[i].packageId))
+        {
+            tree_insertNode(cpuid_topology.topologyTree,
+                    hwThreadPool[i].packageId);
+        }
+        currentNode = tree_getNode(cpuid_topology.topologyTree,
+                hwThreadPool[i].packageId);
+
+        if (!tree_nodeExists(currentNode, hwThreadPool[i].coreId))
+        {
+            tree_insertNode(currentNode, hwThreadPool[i].coreId);
+        }
+        currentNode = tree_getNode(currentNode, hwThreadPool[i].coreId);
+
+        if (!tree_nodeExists(currentNode, i)) /* NOTE(review): topology_setupTree() keys this level by apicId, not by i */
+        {
+            tree_insertNode(currentNode, i);
+            affinity_thread2tile_lookup[hwThreadPool[i].apicId] = hwThreadPool[i].coreId; /* NOTE(review): apicId is not checked against the array size */
+        }
+    }
+}
+
+
+static int readTopologyFile(const char* filename) /* parses the text topology file into cpuid_topology, cpuid_info and numa_info; always returns 0 */
+{
+    FILE* fp;
+    char structure[256];
+    char field[256];
+    char value[256];
+    char line[512]; /* NOTE(review): tokens of a line are copied with unbounded %s into the 256-byte buffers above */
+    int numHWThreads = -1; /* -1 marks "not found yet" for the three counts */
+    int numCacheLevels = -1;
+    int numberOfNodes = -1;
+    int* tmpNumberOfProcessors;
+    int counter;
+    int i;
+    uint32_t tmp, tmp1; /* NOTE(review): filled through sscanf "%d" although the type is unsigned */
+    /* NOTE(review): the fopen() result is used below without a NULL check. */
+    fp = fopen(filename, "r");
+    /* Pass 1: find the three element counts that size the arrays; stop once all are known. */
+    while (fgets(line, 512, fp) != NULL) {
+        sscanf(line,"%s %s", structure, field);
+        if ((strcmp(structure, "cpuid_topology") == 0) && (strcmp(field, "numHWThreads") == 0))
+        {
+            sscanf(line,"%s %s = %d", structure, field, &numHWThreads);
+        }
+        else if ((strcmp(structure, "cpuid_topology") == 0) && (strcmp(field, "numCacheLevels") == 0))
+        {
+            sscanf(line,"%s %s = %d", structure, field, &numCacheLevels);
+        }
+        else if ((strcmp(structure, "numa_info") == 0) && (strcmp(field, "numberOfNodes") == 0))
+        {
+            sscanf(line,"%s %s = %d", structure, field, &numberOfNodes);
+        }
+        if ((numHWThreads >= 0) && (numCacheLevels >= 0) && (numberOfNodes >= 0))
+        {
+            break;
+        }
+    }
+    /* Pass 2: collect the processor count of every NUMA node. NOTE(review): a count missing from the file is still -1 here. */
+    tmpNumberOfProcessors = (int*) malloc(numberOfNodes *sizeof(int));
+    fseek(fp, 0, SEEK_SET);
+    counter = 0;
+    while (fgets(line, 512, fp) != NULL) {
+        sscanf(line,"%s %s %d %s = %d", structure, field, &tmp, value, &tmp1);
+        if ((strcmp(structure, "numa_info") == 0) && (strcmp(value, "numberOfProcessors") == 0))
+        {
+            tmpNumberOfProcessors[tmp-1] = tmp1; /* node numbers in the file start at 1; NOTE(review): tmp is not range-checked */
+            counter++;
+        }
+        if (counter == numberOfNodes)
+        {
+            break;
+        }
+    }
+    /* Allocate the pools. NOTE(review): the memory is not zeroed and the malloc() results are not checked. */
+    cpuid_topology.threadPool = (HWThread*)malloc(numHWThreads * sizeof(HWThread));
+    cpuid_topology.cacheLevels = (CacheLevel*)malloc(numCacheLevels * sizeof(CacheLevel));
+    cpuid_topology.numHWThreads = numHWThreads;
+    cpuid_topology.numCacheLevels = numCacheLevels;
+
+    numa_info.nodes = (NumaNode*) malloc(numberOfNodes * sizeof(NumaNode));
+    numa_info.numberOfNodes = numberOfNodes;
+
+    for(i=0;i<numberOfNodes;i++)
+    {
+        numa_info.nodes[i].processors = (uint32_t*) malloc (tmpNumberOfProcessors[i] * sizeof(int));
+        numa_info.nodes[i].distances = (uint32_t*) malloc (numberOfNodes * sizeof(int));
+    }
+    free(tmpNumberOfProcessors);
+
+    fseek(fp, 0, SEEK_SET);
+    /* Pass 3: read every line again and store each recognised field; unknown lines are ignored. */
+    while (fgets(line, 512, fp) != NULL) {
+        sscanf(line,"%s %s", structure, field);
+        if (strcmp(structure, "cpuid_topology") == 0)
+        {
+            if (strcmp(field, "numSockets") == 0)
+            {
+                sscanf(line, "%s %s = %d", structure, field, &tmp);
+                cpuid_topology.numSockets = tmp;
+            }
+            else if (strcmp(field, "numCoresPerSocket") == 0)
+            {
+                sscanf(line, "%s %s = %d", structure, field, &tmp);
+                cpuid_topology.numCoresPerSocket = tmp;
+            }
+            else if (strcmp(field, "numThreadsPerCore") == 0)
+            {
+                sscanf(line, "%s %s = %d", structure, field, &tmp);
+                cpuid_topology.numThreadsPerCore = tmp;
+            }
+            else if (strcmp(field, "threadPool") == 0)
+            {
+                int thread; /* used directly as index into threadPool; NOTE(review): not range-checked */
+
+                sscanf(line, "%s %s %d %s = %d", structure, field, &thread, value, &tmp);
+
+                if (strcmp(value, "threadId") == 0)
+                {
+                    cpuid_topology.threadPool[thread].threadId = tmp;
+                }
+                else if (strcmp(value, "coreId") == 0)
+                {
+                    cpuid_topology.threadPool[thread].coreId = tmp;
+                }
+                else if (strcmp(value, "packageId") == 0)
+                {
+                    cpuid_topology.threadPool[thread].packageId = tmp;
+                }
+                else if (strcmp(value, "apicId") == 0)
+                {
+                    cpuid_topology.threadPool[thread].apicId = tmp;
+                }
+                
+            }
+            else if (strcmp(field, "cacheLevels") == 0)
+            {
+                int level; /* the file numbers cache entries from 1; entry N is stored at index N-1 */
+                char type[128];
+                sscanf(line, "%s %s %d %s", structure, field, &level, value);
+                
+                cpuid_topology.cacheLevels[level-1].level = level-1;
+                if (strcmp(value, "type") == 0)
+                {
+                    sscanf(line, "%s %s %d %s = %s", structure, field, &level, value, type);
+                    if (strcmp(type, "UNIFIEDCACHE") == 0)
+                    {
+                        cpuid_topology.cacheLevels[level-1].type = UNIFIEDCACHE;
+                    } 
+                    else if (strcmp(type, "DATACACHE") == 0)
+                    {
+                        cpuid_topology.cacheLevels[level-1].type = DATACACHE;
+                    } 
+                    else if (strcmp(type, "INSTRUCTIONCACHE") == 0)
+                    {
+                        cpuid_topology.cacheLevels[level-1].type = INSTRUCTIONCACHE;
+                    } 
+                    else if (strcmp(type, "ITLB") == 0)
+                    {
+                        cpuid_topology.cacheLevels[level-1].type = ITLB;
+                    } 
+                    else if (strcmp(type, "DTLB") == 0)
+                    {
+                        cpuid_topology.cacheLevels[level-1].type = DTLB;
+                    }
+                    else if (strcmp(type, "NOCACHE") == 0)
+                    {
+                        cpuid_topology.cacheLevels[level-1].type = NOCACHE;
+                    }
+                }
+                else
+                {
+                    sscanf(line, "%s %s %d %s = %d", structure, field, &level, value, &tmp);
+                    if (strcmp(value, "associativity") == 0)
+                    {
+                        cpuid_topology.cacheLevels[level-1].associativity = tmp;
+                    }
+                    else if (strcmp(value, "sets") == 0)
+                    {
+                        cpuid_topology.cacheLevels[level-1].sets = tmp;
+                    }
+                    else if (strcmp(value, "lineSize") == 0)
+                    {
+                        cpuid_topology.cacheLevels[level-1].lineSize = tmp;
+                    }
+                    else if (strcmp(value, "size") == 0)
+                    {
+                        cpuid_topology.cacheLevels[level-1].size = tmp;
+                    }
+                    else if (strcmp(value, "threads") == 0)
+                    {
+                        cpuid_topology.cacheLevels[level-1].threads = tmp;
+                    }
+                    else if (strcmp(value, "inclusive") == 0)
+                    {
+                        cpuid_topology.cacheLevels[level-1].inclusive = tmp;
+                    }
+                }
+                
+            }
+        }
+        else if (strcmp(structure, "cpuid_info") == 0)
+        {
+            if (strcmp(field, "family") == 0)
+            {
+                sscanf(line, "%s %s = %d", structure, field, &tmp);
+                cpuid_info.family = tmp;
+                
+            }
+            else if (strcmp(field, "model") == 0)
+            {
+                sscanf(line, "%s %s = %d", structure, field, &tmp);
+                cpuid_info.model = tmp;
+            }
+            else if (strcmp(field, "osname") == 0)
+            {
+                strcpy(value,&(line[strlen(structure)+strlen(field)+4])); /* +4 skips the blank between the two names plus " = " */
+                cpuid_info.osname = (char*) malloc((strlen(value)+1) * sizeof(char));
+                strncpy(cpuid_info.osname, value, strlen(value));
+                cpuid_info.osname[strlen(value)-1] = '\0'; /* overwrites the last copied character, presumably the newline kept by fgets() */
+            }
+            else if (strcmp(field, "stepping") == 0)
+            {
+                sscanf(line, "%s %s = %d", structure, field, &tmp);
+                cpuid_info.stepping = tmp;
+                
+            }
+            else if (strcmp(field, "clock") == 0)
+            {
+                sscanf(line, "%s %s = %d", structure, field, &tmp);
+                cpuid_info.clock = tmp;
+                
+            }
+            else if (strcmp(field, "turbo") == 0)
+            {
+                sscanf(line, "%s %s = %d", structure, field, &tmp);
+                cpuid_info.turbo = tmp;
+                
+            }
+            else if (strcmp(field, "isIntel") == 0)
+            {
+                sscanf(line, "%s %s = %d", structure, field, &tmp);
+                cpuid_info.isIntel = tmp;
+                
+            }
+            else if (strcmp(field, "featureFlags") == 0)
+            {
+                sscanf(line, "%s %s = %d", structure, field, &tmp);
+                cpuid_info.featureFlags = tmp;
+                
+            }
+            else if (strcmp(field, "perf_version") == 0)
+            {
+                sscanf(line, "%s %s = %d", structure, field, &tmp);
+                cpuid_info.perf_version = tmp;
+                
+            }
+            else if (strcmp(field, "perf_num_ctr") == 0)
+            {
+                sscanf(line, "%s %s = %d", structure, field, &tmp);
+                cpuid_info.perf_num_ctr = tmp;
+                
+            }
+            else if (strcmp(field, "perf_width_ctr") == 0)
+            {
+                sscanf(line, "%s %s = %d", structure, field, &tmp);
+                cpuid_info.perf_width_ctr = tmp;
+                
+            }
+            else if (strcmp(field, "perf_num_fixed_ctr") == 0)
+            {
+                sscanf(line, "%s %s = %d", structure, field, &tmp);
+                cpuid_info.perf_num_fixed_ctr = tmp;
+                
+            }
+            else if (strcmp(field, "features") == 0)
+            {
+                strcpy(value,&(line[strlen(structure)+strlen(field)+4])); /* same fixed-offset extraction as for osname */
+                cpuid_info.features = (char*) malloc((strlen(value)+1) * sizeof(char));
+                strncpy(cpuid_info.features, value, strlen(value));
+                cpuid_info.features[strlen(value)-1] = '\0';
+            }
+        }
+        else if (strcmp(structure, "numa_info") == 0)
+        {
+            if (strcmp(field, "nodes") == 0)
+            {
+                int id; /* node number as written in the file; stored at index id-1 */
+                sscanf(line, "%s %s %d %s", structure, field, &id, value);
+                    
+                if (strcmp(value,"numberOfProcessors") == 0)
+                {
+                    sscanf(line, "%s %s %d %s = %d", structure, field, &id, value, &tmp);
+                    numa_info.nodes[id-1].numberOfProcessors = tmp;
+                }
+                else if (strcmp(value, "freeMemory") == 0)
+                {
+                    sscanf(line, "%s %s %d %s = %d", structure, field, &id, value, &tmp);
+                    numa_info.nodes[id-1].freeMemory = tmp;
+                }
+                else if (strcmp(value, "id") == 0)
+                {
+                    sscanf(line, "%s %s %d %s = %d", structure, field, &id, value, &tmp);
+                    numa_info.nodes[id-1].id = tmp;
+                }
+                else if (strcmp(value, "totalMemory") == 0)
+                {
+                    sscanf(line, "%s %s %d %s = %d", structure, field, &id, value, &tmp);
+                    numa_info.nodes[id-1].totalMemory = tmp;
+                }
+                else if (strcmp(value, "numberOfDistances") == 0)
+                {
+                    sscanf(line, "%s %s %d %s = %d", structure, field, &id, value, &tmp);
+                    numa_info.nodes[id-1].numberOfDistances = tmp;
+                }
+                if (strcmp(value, "processors") == 0) /* NOTE(review): starts a new if chain instead of continuing the else-if chain above */
+                {
+                    sscanf(line, "%s %s %d %s %d = %d", structure, field, &id, value, &tmp, &tmp1);
+                    numa_info.nodes[id-1].processors[tmp-1] = tmp1; /* processor entries are stored at index tmp-1 */
+                }
+                else if (strcmp(value,"distances") == 0)
+                {
+                    sscanf(line, "%s %s %d %s %d = %d", structure, field, &id, value, &tmp, &tmp1);
+                    numa_info.nodes[id-1].distances[tmp] = tmp1; /* distance entries use tmp itself as index */
+                }
+            }
+        }
+    }
+    fclose(fp);
+
+    return 0;
+}
+
+/* Sets cpuid_info.name and cpuid_info.short_name (plus supportUncore for some EP parts) from
+ * cpuid_info.family/model. Returns EXIT_SUCCESS, or EXIT_FAILURE when the family is unknown. */
+int topology_setName(void)
+{
+    switch ( cpuid_info.family )
+    {
+        case P6_FAMILY:
+            switch ( cpuid_info.model )
+            {
+                case PENTIUM_M_BANIAS:
+                    cpuid_info.name = pentium_m_b_str;
+                    cpuid_info.short_name = short_pm;
+                    break;
+
+                case PENTIUM_M_DOTHAN:
+                    cpuid_info.name = pentium_m_d_str;
+                    cpuid_info.short_name = short_pm;
+                    break;
+
+                case CORE_DUO:
+                    cpuid_info.name = core_duo_str;
+                    cpuid_info.short_name = short_core2;
+                    break;
+
+                case CORE2_65:
+                    cpuid_info.name = core_2a_str;
+                    cpuid_info.short_name = short_core2;
+                    break;
+
+                case CORE2_45:
+                    cpuid_info.name = core_2b_str;
+                    cpuid_info.short_name = short_core2;
+                    break;
+
+                case NEHALEM_BLOOMFIELD:
+                    cpuid_info.name = nehalem_bloom_str;
+                    cpuid_info.short_name = short_nehalem;
+                    break;
+
+                case NEHALEM_LYNNFIELD:
+                    cpuid_info.name = nehalem_lynn_str;
+                    cpuid_info.short_name = short_nehalem;
+                    break;
+
+                case NEHALEM_WESTMERE_M: /* fallthrough */
+                case NEHALEM_WESTMERE:
+                    cpuid_info.name = nehalem_west_str;
+                    cpuid_info.short_name = short_westmere;
+                    break;
+
+                case SANDYBRIDGE:
+                    cpuid_info.name = sandybridge_str;
+                    cpuid_info.short_name = short_sandybridge;
+                    break;
+
+                case SANDYBRIDGE_EP:
+                    cpuid_info.supportUncore = 1;
+                    cpuid_info.name = sandybridge_ep_str;
+                    cpuid_info.short_name = short_sandybridge_ep;
+                    break;
+
+                case IVYBRIDGE:
+                    cpuid_info.name = ivybridge_str;
+                    cpuid_info.short_name = short_ivybridge;
+                    break;
+
+                case IVYBRIDGE_EP:
+                    cpuid_info.supportUncore = 1;
+                    cpuid_info.name = ivybridge_ep_str;
+                    cpuid_info.short_name = short_ivybridge_ep;
+                    break;
+
+                case HASWELL_EP:
+                    cpuid_info.supportUncore = 1;
+                    cpuid_info.name = haswell_ep_str;
+                    cpuid_info.short_name = short_haswell_ep;
+                    break;
+                case HASWELL:
+                case HASWELL_M1:
+                case HASWELL_M2:
+                    cpuid_info.name = haswell_str;
+                    cpuid_info.short_name = short_haswell;
+                    break;
+
+                case BROADWELL:
+                    cpuid_info.name = broadwell_str;
+                    cpuid_info.short_name = short_broadwell;
+                    break;
+                case BROADWELL_D:
+                    cpuid_info.name = broadwell_d_str;
+                    cpuid_info.short_name = short_broadwell;
+                    break;
+                case BROADWELL_E:
+                    cpuid_info.name = broadwell_ep_str;
+                    cpuid_info.short_name = short_broadwell_ep;
+                    break;
+
+                case NEHALEM_EX:
+                    cpuid_info.name = nehalem_ex_str;
+                    cpuid_info.short_name = short_nehalemEX;
+                    break;
+
+                case WESTMERE_EX:
+                    cpuid_info.name = westmere_ex_str;
+                    cpuid_info.short_name = short_westmereEX;
+                    break;
+
+                case XEON_MP:
+                    cpuid_info.name = xeon_mp_string;
+                    cpuid_info.short_name = short_core2;
+                    break;
+
+                case ATOM_45: /* fallthrough */
+                case ATOM:
+                    cpuid_info.name = atom_45_str;
+                    cpuid_info.short_name = short_atom;
+                    break;
+
+                case ATOM_32:
+                    cpuid_info.name = atom_32_str;
+                    cpuid_info.short_name = short_atom;
+                    break;
+
+                case ATOM_22:
+                    cpuid_info.name = atom_22_str;
+                    cpuid_info.short_name = short_atom;
+                    break;
+
+                case ATOM_SILVERMONT_E:
+                case ATOM_SILVERMONT_C:
+                case ATOM_SILVERMONT_Z1:
+                case ATOM_SILVERMONT_Z2:
+                case ATOM_SILVERMONT_F:
+                    cpuid_info.name = atom_silvermont_str;
+                    cpuid_info.short_name = short_silvermont;
+                    break;
+                case ATOM_SILVERMONT_AIR:
+                    cpuid_info.name = atom_airmont_str;
+                    cpuid_info.short_name = short_silvermont;
+                    break;
+
+                default:
+                    cpuid_info.name = unknown_intel_str;
+                    cpuid_info.short_name = short_unknown;
+                    break;
+            }
+            break;
+
+        case MIC_FAMILY:
+            switch ( cpuid_info.model )
+            {
+                case XEON_PHI:
+                    cpuid_info.name = xeon_phi_string;
+                    cpuid_info.short_name = short_phi;
+                    break;
+
+                default: /* unknown models used to leave both names unset */
+                    cpuid_info.name = unknown_intel_str;
+                    cpuid_info.short_name = short_unknown;
+                    break;
+            }
+            break;
+
+        case K8_FAMILY:
+
+            if (cpuid_info.isIntel)
+            {
+                ERROR_PLAIN_PRINT(Netburst architecture is not supported);
+            }
+
+            switch ( cpuid_info.model )
+            {
+                case OPTERON_DC_E:
+                    cpuid_info.name = opteron_dc_e_str;
+                    break;
+
+                case OPTERON_DC_F:
+                    cpuid_info.name = opteron_dc_f_str;
+                    break;
+
+                case ATHLON64_X2: /* fallthrough */
+                case ATHLON64_X2_F:
+                    cpuid_info.name = athlon64_str;
+                    break;
+
+                case ATHLON64_F1: /* fallthrough */
+                case ATHLON64_F2:
+                    cpuid_info.name = athlon64_f_str;
+                    break;
+
+                case ATHLON64_X2_G:
+                    cpuid_info.name = athlon64_X2_g_str;
+                    break;
+
+                case ATHLON64_G1: /* fallthrough */
+                case ATHLON64_G2:
+                    cpuid_info.name = athlon64_g_str;
+                    break;
+
+                case OPTERON_SC_1MB:
+                    cpuid_info.name = opteron_sc_str;
+                    break;
+
+                default:
+                    cpuid_info.name = amd_k8_str;
+                    break;
+            }
+            cpuid_info.short_name = short_k8; /* one short name for the whole K8 family */
+            break;
+
+        case K10_FAMILY:
+            switch ( cpuid_info.model )
+            {
+                case BARCELONA:
+                    cpuid_info.name = barcelona_str;
+                    break;
+
+                case SHANGHAI:
+                    cpuid_info.name = shanghai_str;
+                    break;
+
+                case ISTANBUL:
+                    cpuid_info.name = istanbul_str;
+                    break;
+
+                case MAGNYCOURS:
+                    cpuid_info.name = magnycours_str;
+                    break;
+
+                default:
+                    cpuid_info.name = unknown_amd_str;
+                    break;
+            }
+            cpuid_info.short_name = short_k10; /* one short name for the whole K10 family */
+            break;
+
+        case K15_FAMILY:
+            cpuid_info.name = interlagos_str;
+            cpuid_info.short_name = short_k15;
+            break;
+
+        case K16_FAMILY:
+            cpuid_info.name = kabini_str;
+            cpuid_info.short_name = short_k16;
+            break;
+
+        default:
+            return EXIT_FAILURE;
+    }
+    return EXIT_SUCCESS;
+}
+
+const struct topology_functions topology_funcs = { /* default backend hooks; topology_init() works on a local copy */
+#ifndef LIKWID_USE_HWLOC
+    .init_cpuInfo = cpuid_init_cpuInfo,
+    .init_cpuFeatures = cpuid_init_cpuFeatures,
+    .init_nodeTopology = cpuid_init_nodeTopology,
+    .init_cacheTopology = cpuid_init_cacheTopology,
+#else
+    .init_cpuInfo = hwloc_init_cpuInfo,
+    .init_nodeTopology = hwloc_init_nodeTopology,
+    .init_cacheTopology = hwloc_init_cacheTopology,
+    .init_cpuFeatures = proc_init_cpuFeatures, /* the hwloc build takes the CPU features from the proc backend */
+#endif
+    .init_fileTopology = initTopologyFile, /* same file loader in both builds */
+};
+
+
+void topology_setupTree(void)
+{
+    /* Builds the package -> core -> thread (apicId) tree from the thread pool,
+     * then reads the socket, core and thread counts off its first branch. */
+    HWThread* const pool = cpuid_topology.threadPool;
+    TreeNode* socketNode;
+    TreeNode* coreNode;
+    uint32_t idx = 0;
+
+    tree_init(&cpuid_topology.topologyTree, 0);
+    while (idx < cpuid_topology.numHWThreads)
+    {
+        HWThread* hw = &pool[idx];
+
+        if (!tree_nodeExists(cpuid_topology.topologyTree, hw->packageId))
+        {
+            tree_insertNode(cpuid_topology.topologyTree, hw->packageId);
+        }
+        socketNode = tree_getNode(cpuid_topology.topologyTree, hw->packageId);
+
+        if (!tree_nodeExists(socketNode, hw->coreId))
+        {
+            tree_insertNode(socketNode, hw->coreId);
+        }
+        coreNode = tree_getNode(socketNode, hw->coreId);
+        /* a thread is inserted only once; the lookup maps its apicId to its coreId */
+        if (!tree_nodeExists(coreNode, hw->apicId))
+        {
+            tree_insertNode(coreNode, hw->apicId);
+            affinity_thread2tile_lookup[hw->apicId] = hw->coreId;
+        }
+        idx++;
+    }
+
+    cpuid_topology.numSockets = tree_countChildren(cpuid_topology.topologyTree);
+    socketNode = tree_getChildNode(cpuid_topology.topologyTree);
+    cpuid_topology.numCoresPerSocket = tree_countChildren(socketNode);
+    coreNode = tree_getChildNode(socketNode);
+    cpuid_topology.numThreadsPerCore = tree_countChildren(coreNode);
+}
+
+/*
+ * Initialise the topology module; only the first call does any work.
+ * Without a readable topology file the hardware is probed (through the proc
+ * backend when the process is pinned to fewer CPUs than are configured),
+ * otherwise the file is parsed. Always returns EXIT_SUCCESS.
+ */
+int topology_init(void)
+{
+    struct topology_functions funcs = topology_funcs;
+    cpu_set_t cpuSet;
+
+    if (init)
+    {
+        return EXIT_SUCCESS;
+    }
+    init = 1;
+
+    init_configuration();
+    /* Both paths need the affinity mask of the calling process. */
+    CPU_ZERO(&cpuSet);
+    sched_getaffinity(0,sizeof(cpu_set_t), &cpuSet);
+
+    if (access(config.topologyCfgFileName, R_OK))
+    {
+        int allowedCpus = cpu_count(&cpuSet);
+        if (allowedCpus < sysconf(_SC_NPROCESSORS_CONF))
+        {
+            funcs.init_cpuInfo = proc_init_cpuInfo;
+            funcs.init_cpuFeatures = proc_init_cpuFeatures;
+            funcs.init_nodeTopology = proc_init_nodeTopology;
+            funcs.init_cacheTopology = proc_init_cacheTopology;
+            cpuid_topology.activeHWThreads = allowedCpus;
+        }
+        else
+        {
+            cpuid_topology.activeHWThreads = sysconf(_SC_NPROCESSORS_CONF);
+        }
+        funcs.init_cpuInfo(cpuSet);
+        topology_setName();
+        funcs.init_cpuFeatures();
+        funcs.init_nodeTopology(cpuSet);
+        topology_setupTree();
+        funcs.init_cacheTopology();
+        sched_setaffinity(0, sizeof(cpu_set_t), &cpuSet);
+    }
+    else
+    {
+        DEBUG_PRINT(DEBUGLEV_INFO, Reading topology information from %s, config.topologyCfgFileName);
+        readTopologyFile(config.topologyCfgFileName);
+        cpuid_topology.activeHWThreads = 0;
+        for (uint32_t i=0;i<cpuid_topology.numHWThreads;i++)
+        {
+            /* The pool comes from malloc() and the file carries no inCpuSet
+             * value, so the flag is written for every thread, not only set. */
+            int allowed = CPU_ISSET(cpuid_topology.threadPool[i].apicId, &cpuSet) ? 1 : 0;
+            cpuid_topology.threadPool[i].inCpuSet = allowed;
+            cpuid_topology.activeHWThreads += allowed;
+        }
+        topology_setName();
+        topology_setupTree();
+    }
+    return EXIT_SUCCESS;
+}
+
+
+/*
+ * Release everything the topology module allocated and allow a later
+ * topology_init() to run again.
+ *
+ * Previously the init flag stayed set after the data had been freed, so a
+ * second topology_init() returned at once and left all pools NULL.
+ * free(NULL) is a no-op, so only tree_destroy() keeps its NULL guard.
+ */
+void topology_finalize(void)
+{
+    free(cpuid_info.features);
+    cpuid_info.features = NULL;
+    free(cpuid_info.osname);
+    cpuid_info.osname = NULL;
+    free(cpuid_topology.cacheLevels);
+    cpuid_topology.cacheLevels = NULL;
+    free(cpuid_topology.threadPool);
+    cpuid_topology.threadPool = NULL;
+
+    if (cpuid_topology.topologyTree != NULL)
+    {
+        tree_destroy(cpuid_topology.topologyTree);
+        cpuid_topology.topologyTree = NULL;
+    }
+
+    /* The data is gone: make topology_init() rebuild it on its next call. */
+    init = 0;
+}
+
+
+
+
+
+void print_supportedCPUs (void)
+{
+    /* Prints the supported Intel and AMD processor names, one tab-indented
+     * name per line, with an empty line after each of the two lists. */
+    char* intelNames[] = {
+        core_2a_str, core_2b_str, xeon_mp_string,
+        atom_45_str, atom_32_str, atom_22_str,
+        nehalem_bloom_str, nehalem_lynn_str, nehalem_west_str,
+        nehalem_ex_str, westmere_ex_str,
+        sandybridge_str, sandybridge_ep_str,
+        ivybridge_str, ivybridge_ep_str,
+        haswell_str, haswell_ep_str,
+        atom_silvermont_str, atom_airmont_str,
+        xeon_phi_string, broadwell_str
+    };
+    char* amdNames[] = {
+        opteron_sc_str, opteron_dc_e_str, opteron_dc_f_str,
+        barcelona_str, shanghai_str, istanbul_str,
+        magnycours_str, interlagos_str, kabini_str
+    };
+    size_t k;
+
+    printf("Supported Intel processors:\n");
+    for (k = 0; k + 1 < sizeof(intelNames)/sizeof(intelNames[0]); k++)
+    {
+        printf("\t%s\n",intelNames[k]);
+    }
+    printf("\t%s\n\n",intelNames[k]); /* last entry closes the list with an empty line */
+
+    printf("Supported AMD processors:\n");
+    for (k = 0; k + 1 < sizeof(amdNames)/sizeof(amdNames[0]); k++)
+    {
+        printf("\t%s\n",amdNames[k]);
+    }
+    printf("\t%s\n\n",amdNames[k]);
+}
+
+
+
+CpuTopology_t get_cpuTopology(void) /* accessor: returns the address of the global cpuid_topology */
+{
+    return &cpuid_topology;
+}
+
+CpuInfo_t get_cpuInfo(void) /* accessor: returns the address of the global cpuid_info */
+{
+    return &cpuid_info;
+}
+NumaTopology_t get_numaTopology(void) /* accessor: returns the address of numa_info, which is declared outside this file */
+{
+    return &numa_info;
+}
+
diff --git a/src/topology_cpuid.c b/src/topology_cpuid.c
new file mode 100644
index 0000000..a4ad6ad
--- /dev/null
+++ b/src/topology_cpuid.c
@@ -0,0 +1,943 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  topology_cpuid.c
+ *
+ *      Description:  Interface to the cpuid based topology backend
+ *
+ *      Version:   4.0
+ *      Released:  16.6.2015
+ *
+ *      Authors:  Jan Treibig (jt), jan.treibig at gmail.com,
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <sched.h>
+#include <unistd.h>
+
+#include <error.h>
+
+#include <tree.h>
+#include <bitUtil.h>
+#include <tlb-info.h>
+#include <topology.h>
+
+/* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
+
+/* this was taken from the linux kernel
+ *
+ * Executes the cpuid instruction. The file-scope variables eax and ecx are
+ * the inputs, the results are written back to the file-scope variables
+ * eax, ebx, ecx and edx. Code using the macro therefore assigns eax (and
+ * ecx where a sub-leaf is needed) directly before it.
+ */
+#define CPUID                              \
+    __asm__ volatile ("cpuid"                             \
+            : "=a" (eax),     \
+            "=b" (ebx),     \
+            "=c" (ecx),     \
+            "=d" (edx)      \
+            : "0" (eax), "2" (ecx))
+            
+            
+/* #####   VARIABLES  -  LOCAL TO THIS SOURCE FILE   ###################### */
+static int largest_function = 0;        
+static uint32_t eax, ebx, ecx, edx;
+
+/* Dirty hack to avoid nonnull warnings */
+char* (*ownstrcpy)(char *__restrict __dest, const char *__restrict __src);
+
+/* #####   FUNCTION DEFINITIONS  -  LOCAL TO THIS SOURCE FILE   ########### */
+/*
+ * Enumerate the caches reported by CPUID leaf 0x04 and store them in a
+ * newly allocated array.
+ *
+ * cachePool: out parameter that receives the malloc'ed array with one
+ *            entry per reported cache. It is NULL if nothing could be
+ *            allocated. The caller owns the memory.
+ * Returns the number of entries in *cachePool, 0 if the allocation failed.
+ *
+ * Reads cpuid_info.model as well as numThreadsPerCore and
+ * numCoresPerSocket of cpuid_topology, so these have to be set before the
+ * function is called.
+ */
+static int intelCpuidFunc_4(CacheLevel** cachePool)
+{
+    int i;
+    int maxNumLevels = 0;
+    int smtWorkaroundModel;
+    CacheLevel* pool;
+
+    /* Count the sub-leaves: the first one with cache type 0 ends the list */
+    for (;;)
+    {
+        eax = 0x04;
+        ecx = maxNumLevels;
+        CPUID;
+        if (extractBitField(eax,5,0) == 0)
+        {
+            break;
+        }
+        maxNumLevels++;
+    }
+
+    pool = (CacheLevel*) malloc(maxNumLevels * sizeof(CacheLevel));
+    *cachePool = pool;
+    if (pool == NULL)
+    {
+        /* malloc(0) may return NULL as well, only report real failures */
+        if (maxNumLevels > 0)
+        {
+            ERROR_PLAIN_PRINT(Cannot allocate memory for cache levels);
+        }
+        return 0;
+    }
+
+    /* Models for which the thread count of the first three entries is
+     * corrected when only one thread per core is active (see below) */
+    smtWorkaroundModel =
+        ((cpuid_info.model == NEHALEM_BLOOMFIELD) ||
+         (cpuid_info.model == NEHALEM_LYNNFIELD) ||
+         (cpuid_info.model == NEHALEM_WESTMERE) ||
+         (cpuid_info.model == NEHALEM_WESTMERE_M) ||
+         (cpuid_info.model == SANDYBRIDGE) ||
+         (cpuid_info.model == SANDYBRIDGE_EP) ||
+         (cpuid_info.model == IVYBRIDGE) ||
+         (cpuid_info.model == IVYBRIDGE_EP) ||
+         (cpuid_info.model == HASWELL) ||
+         (cpuid_info.model == HASWELL_EP) ||
+         (cpuid_info.model == HASWELL_M1) ||
+         (cpuid_info.model == HASWELL_M2) ||
+         (cpuid_info.model == WESTMERE_EX) ||
+         (cpuid_info.model == NEHALEM_EX));
+
+    for (i=0; i < maxNumLevels; i++)
+    {
+        eax = 0x04;
+        ecx = i;
+        CPUID;
+
+        pool[i].level = extractBitField(eax,3,5);
+        pool[i].type = (CacheType) extractBitField(eax,5,0);
+        pool[i].associativity = extractBitField(ebx,8,22)+1;
+        pool[i].sets = ecx+1;
+        pool[i].lineSize = extractBitField(ebx,12,0)+1;
+        pool[i].size = pool[i].sets *
+            pool[i].associativity *
+            pool[i].lineSize;
+        pool[i].threads = extractBitField(eax,10,14)+1;
+
+        /* WORKAROUND cpuid reports wrong number of threads on SMT processor
+         * with SMT turned off */
+        if ((i < 3) && smtWorkaroundModel &&
+                (cpuid_topology.numThreadsPerCore == 1))
+        {
+            pool[i].threads = 1;
+        }
+
+        /* :WORKAROUND:08/13/2009 08:34:15 AM:jt: For L3 caches the value is
+         * sometimes too large in here. Ask Intel what is wrong here!
+         * Limit threads per Socket then to the maximum possible value.*/
+        if(pool[i].threads > (int)
+                (cpuid_topology.numCoresPerSocket*
+                 cpuid_topology.numThreadsPerCore))
+        {
+            pool[i].threads = cpuid_topology.numCoresPerSocket*
+                cpuid_topology.numThreadsPerCore;
+        }
+        /* bit 1 of edx, stored unshifted (value 0 or 2) */
+        pool[i].inclusive = edx&0x2;
+    }
+
+    return maxNumLevels;
+}
+
+/*
+ * Decode an encoded associativity field into a number of ways. The callers
+ * in this file pass 4-bit fields read from CPUID leaf 0x80000006.
+ *
+ * flag: encoded associativity value.
+ * Returns the decoded associativity. The codes 0x0 and 0xF as well as
+ * every value without an entry in the table yield 0.
+ */
+static uint32_t amdGetAssociativity(uint32_t flag)
+{
+    /* Indexed by the encoded value, entries not listed are 0 */
+    static const uint32_t waysByCode[16] = {
+        [0x1] = 1,
+        [0x2] = 2,
+        [0x4] = 4,
+        [0x6] = 8,
+        [0x8] = 16,
+        [0xA] = 32,
+        [0xB] = 48,
+        [0xC] = 64,
+        [0xD] = 96,
+        [0xE] = 128
+    };
+
+    if (flag >= (sizeof(waysByCode) / sizeof(waysByCode[0])))
+    {
+        return 0;
+    }
+    return waysByCode[flag];
+}
+
+
+/* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
+
+/*
+ * Print the description of every non-zero descriptor byte of reg, starting
+ * with the byte at bit position firstBit. Descriptors without an entry in
+ * intel_tlb_info are skipped.
+ */
+static void cpuid_printIntelDescriptors(uint32_t reg, int firstBit)
+{
+    int bit;
+
+    for (bit = firstBit; bit < 32; bit += 8)
+    {
+        uint32_t descriptor = extractBitField(reg,8,bit);
+
+        if ((descriptor != 0x0) && intel_tlb_info[descriptor])
+        {
+            printf("%s\n",intel_tlb_info[descriptor]);
+        }
+    }
+}
+
+/*
+ * Print the TLB information reported by CPUID to stdout.
+ *
+ * If cpuid_info.isIntel is set, the descriptor bytes of leaf 0x02 are
+ * translated with the intel_tlb_info table. Otherwise the raw TLB fields
+ * of the leaves 0x80000005 (L1) and 0x80000006 (L2) are printed.
+ */
+void cpuid_printTlbTopology()
+{
+    if (cpuid_info.isIntel)
+    {
+        uint32_t i;
+        uint32_t loop;
+
+        eax = 0x02;
+        CPUID;
+
+        /* The low byte of eax is the number of times leaf 0x02 is queried */
+        loop = extractBitField(eax,8,0);
+        for (i=1; i<loop; i++)
+        {
+            eax = 0x02;
+            CPUID;
+        }
+
+        /* Byte 0 of eax is the query count and no descriptor. Each register
+         * is checked against its own bytes. */
+        cpuid_printIntelDescriptors(eax, 8);
+        cpuid_printIntelDescriptors(ebx, 0);
+        cpuid_printIntelDescriptors(ecx, 0);
+        cpuid_printIntelDescriptors(edx, 0);
+    }
+    else
+    {
+        /* One query per leaf is enough: eax holds the values printed under
+         * the 2M/4M labels and ebx those printed under the 4K labels. */
+        eax = 0x80000005;
+        CPUID;
+        printf("L1DTlb2and4MAssoc: 0x%x\n",extractBitField(eax,8,24));
+        printf("L1DTlb2and4MSize: %d entries for 2MB pages\n",(uint32_t)extractBitField(eax,8,16));
+        printf("L1ITlb2and4MAssoc: 0x%x\n",extractBitField(eax,8,8));
+        printf("L1ITlb2and4MSize: %d entries for 2MB pages\n",(uint32_t)extractBitField(eax,8,0));
+        printf("L1DTlb4KAssoc: 0x%x\n",extractBitField(ebx,8,24));
+        printf("L1DTlb4KSize: 0x%x\n",extractBitField(ebx,8,16));
+        printf("L1ITlb4KAssoc: 0x%x\n",extractBitField(ebx,8,8));
+        printf("L1ITlb4KSize: 0x%x\n",extractBitField(ebx,8,0));
+
+        eax = 0x80000006;
+        CPUID;
+        printf("L2DTlb2and4MAssoc: 0x%x\n",extractBitField(eax,4,24));
+        printf("L2DTlb2and4MAssoc_c: %d\n",amdGetAssociativity(extractBitField(eax,4,24)));
+        printf("L2DTlb2and4MSize: 0x%x\n",extractBitField(eax,12,16));
+        printf("L2ITlb2and4MAssoc: 0x%x\n",extractBitField(eax,4,12));
+        printf("L2ITlb2and4MAssoc_c: %d\n",amdGetAssociativity(extractBitField(eax,4,12)));
+        printf("L2ITlb2and4MSize: 0x%x\n",extractBitField(eax,12,0));
+        printf("L2DTlb4KAssoc: 0x%x\n",extractBitField(ebx,4,24));
+        printf("L2DTlb4KAssoc_c: %d\n",amdGetAssociativity(extractBitField(ebx,4,24)));
+        printf("L2DTlb4KSize: 0x%x\n",extractBitField(ebx,12,16));
+        printf("L2ITlb4KAssoc: 0x%x\n",extractBitField(ebx,4,12));
+        printf("L2ITlb4KAssoc_c: %d\n",amdGetAssociativity(extractBitField(ebx,4,12)));
+        printf("L2ITlb4KSize: 0x%x\n",extractBitField(ebx,12,0));
+    }
+    return;
+}
+
+/*
+ * Store the processor name listed in /proc/cpuinfo in cpuid_info.osname.
+ *
+ * A buffer of MAX_MODEL_STRING_LENGTH bytes is allocated for the name. The
+ * file is scanned for lines containing "model name"; from such a line the
+ * field following the first ':' is copied with leading whitespace removed.
+ * If several lines match, the last one wins. The name is empty if no line
+ * matches and is truncated if it does not fit into the buffer.
+ *
+ * All temporary bstrings and the file handle are released before return.
+ */
+static void
+cpuid_set_osname(void)
+{
+    FILE *fp;
+    bstring nameString;
+    bstring src;
+    struct bstrList* tokens;
+    int i;
+
+    /* kept so that the file-level function pointer is set as before */
+    ownstrcpy = strcpy;
+
+    cpuid_info.osname = malloc(MAX_MODEL_STRING_LENGTH * sizeof(char));
+    if (cpuid_info.osname == NULL)
+    {
+        ERROR_PLAIN_PRINT(Cannot allocate memory for the OS model name);
+        return;
+    }
+    cpuid_info.osname[0] = '\0';
+
+    fp = fopen ("/proc/cpuinfo", "r");
+    if (fp == NULL)
+    {
+        ERROR;
+        /* no stream was opened, so there is nothing to close */
+        return;
+    }
+
+    nameString = bformat("model name");
+    src = bread ((bNread) fread, fp);
+    tokens = (src != NULL) ? bsplit(src,(char) '\n') : NULL;
+
+    for (i=0; (tokens != NULL) && (i<tokens->qty); i++)
+    {
+        struct bstrList* subtokens;
+
+        if (binstr(tokens->entry[i],0,nameString) == BSTR_ERR)
+        {
+            continue;
+        }
+
+        subtokens = bsplit(tokens->entry[i],(char) ':');
+        /* a line without ':' has no value field */
+        if ((subtokens != NULL) && (subtokens->qty > 1))
+        {
+            const char* value;
+
+            bltrimws(subtokens->entry[1]);
+            value = bdata(subtokens->entry[1]);
+            if (value != NULL)
+            {
+                /* bounded copy, always NUL terminated */
+                snprintf(cpuid_info.osname, MAX_MODEL_STRING_LENGTH,
+                        "%s", value);
+            }
+        }
+        if (subtokens != NULL)
+        {
+            bstrListDestroy(subtokens);
+        }
+    }
+
+    if (tokens != NULL)
+    {
+        bstrListDestroy(tokens);
+    }
+    if (src != NULL)
+    {
+        bdestroy(src);
+    }
+    if (nameString != NULL)
+    {
+        bdestroy(nameString);
+    }
+    fclose(fp);
+}
+
+
+/*
+ * Fill the global cpuid_info with vendor flag, family, model, stepping and
+ * the name found by cpuid_set_osname(), and set
+ * cpuid_topology.numHWThreads.
+ *
+ * cpuSet: CPU set whose size, as returned by cpu_count(), replaces the
+ *         number of configured processors if it is smaller.
+ *
+ * The value returned in eax by CPUID leaf 0x00 is remembered in
+ * largest_function.
+ */
+void cpuid_init_cpuInfo(cpu_set_t cpuSet)
+{
+    uint32_t signature;
+    int cpusInSet;
+
+    eax = 0x00;
+    CPUID;
+    largest_function = eax;
+    /* 0x68747541 are the characters "Auth" as returned for AMD CPUs */
+    cpuid_info.isIntel = (ebx == 0x68747541U) ? 0 : 1;
+
+    eax = 0x01;
+    CPUID;
+    signature = eax;
+    /* family and model are sums of the base and the extended fields */
+    cpuid_info.family = ((signature>>8)&0xFU) + ((signature>>20)&0xFFU);
+    cpuid_info.model = (((signature>>16)&0xFU)<<4) + ((signature>>4)&0xFU);
+    cpuid_info.stepping =  (signature&0xFU);
+    cpuid_set_osname();
+
+    cpuid_topology.numHWThreads = sysconf(_SC_NPROCESSORS_CONF);
+    cpusInSet = cpu_count(&cpuSet);
+    if (cpusInSet < cpuid_topology.numHWThreads)
+    {
+        cpuid_topology.numHWThreads = cpusInSet;
+    }
+    DEBUG_PRINT(DEBUGLEV_DEVELOP, CPU-ID CpuInfo Family %d Model %d Stepping %d isIntel %d numHWThreads %d activeHWThreads %d,
+                            cpuid_info.family,
+                            cpuid_info.model,
+                            cpuid_info.stepping,
+                            cpuid_info.isIntel,
+                            cpuid_topology.numHWThreads,
+                            cpuid_topology.activeHWThreads)
+}
+
+/* One CPUID feature bit with its printable name and its flag position */
+typedef struct {
+    int         bit;   /* bit position in the CPUID result register */
+    const char* name;  /* text appended to cpuid_info.features */
+    int         flag;  /* bit position in cpuid_info.featureFlags */
+} CpuidFeatureEntry;
+
+/*
+ * For every entry whose bit is set in reg, append the name to
+ * cpuid_info.features (if that string exists) and set the flag in
+ * cpuid_info.featureFlags. Entries are handled in table order.
+ */
+static void cpuid_recordFeatures(uint32_t reg,
+        const CpuidFeatureEntry* entries, int numEntries)
+{
+    int i;
+
+    for (i = 0; i < numEntries; i++)
+    {
+        if (reg & (1U << entries[i].bit))
+        {
+            if (cpuid_info.features != NULL)
+            {
+                strcat(cpuid_info.features, entries[i].name);
+            }
+            cpuid_info.featureFlags |= (1<<entries[i].flag);
+        }
+    }
+}
+
+/*
+ * Detect the CPU features with CPUID and store them in cpuid_info.
+ *
+ * cpuid_info.features receives a newly allocated string of
+ * MAX_FEATURE_STRING_LENGTH bytes with the feature names separated by
+ * blanks, cpuid_info.featureFlags the corresponding bit mask. If the string
+ * cannot be allocated only the bit mask is filled.
+ *
+ * For P6_FAMILY CPUs that support leaf 0x0A the performance monitoring
+ * fields and the turbo flag of cpuid_info are set as well; otherwise
+ * perf_version is 0.
+ *
+ * Uses largest_function, so cpuid_init_cpuInfo() has to run before.
+ */
+void cpuid_init_cpuFeatures(void)
+{
+    const CpuidFeatureEntry leaf1Ecx[] = {
+        { 0, "SSE3 ",    SSE3},
+        { 3, "MONITOR ", MONITOR},
+        { 5, "VMX ",     VMX},
+        { 7, "EIST ",    EIST},
+        { 8, "TM2 ",     TM2},
+        { 9, "SSSE3 ",   SSSE3},
+        {12, "FMA ",     FMA},
+        {19, "SSE4.1 ",  SSE41},
+        {20, "SSE4.2 ",  SSE42},
+        {25, "AES ",     AES},
+        {28, "AVX ",     AVX},
+        {30, "RDRAND ",  RDRAND}
+    };
+    const CpuidFeatureEntry leaf1Edx[] = {
+        {22, "ACPI ",    ACPI},
+        {23, "MMX ",     MMX},
+        {25, "SSE ",     SSE},
+        {26, "SSE2 ",    SSE2},
+        {28, "HTT ",     HTT},
+        {29, "TM ",      TM}
+    };
+    const CpuidFeatureEntry leaf7Ebx[] = {
+        { 5, "AVX2 ",    AVX2},
+        {11, "RTM ",     RTM},
+        { 4, "HLE ",     HLE},
+        {18, "RDSEED ",  RDSEED}
+    };
+    const CpuidFeatureEntry extLeaf1Edx[] = {
+        {27, "RDTSCP ",  RDTSCP}
+    };
+
+    cpuid_info.featureFlags = 0;
+    cpuid_info.features = (char*) malloc(MAX_FEATURE_STRING_LENGTH*sizeof(char));
+    if (cpuid_info.features != NULL)
+    {
+        cpuid_info.features[0] = '\0';
+    }
+    else
+    {
+        ERROR_PLAIN_PRINT(Cannot allocate memory for the feature string);
+    }
+
+    eax = 0x01;
+    CPUID;
+    cpuid_recordFeatures(ecx, leaf1Ecx,
+            (int) (sizeof(leaf1Ecx)/sizeof(leaf1Ecx[0])));
+    cpuid_recordFeatures(edx, leaf1Edx,
+            (int) (sizeof(leaf1Edx)/sizeof(leaf1Edx[0])));
+
+    /* Leaf 0x07 must only be evaluated if the CPU implements it. A query
+     * above the highest basic leaf returns unrelated data. */
+    if (largest_function >= 0x07)
+    {
+        eax = 0x7;
+        ecx = 0x0;
+        CPUID;
+        cpuid_recordFeatures(ebx, leaf7Ebx,
+                (int) (sizeof(leaf7Ebx)/sizeof(leaf7Ebx[0])));
+    }
+
+    eax = 0x80000001;
+    CPUID;
+    cpuid_recordFeatures(edx, extLeaf1Edx,
+            (int) (sizeof(extLeaf1Edx)/sizeof(extLeaf1Edx[0])));
+
+    cpuid_info.perf_version   =  0;
+    if( cpuid_info.family == P6_FAMILY && 0x0A <= largest_function)
+    {
+        eax = 0x0A;
+        CPUID;
+        cpuid_info.perf_version   =  (eax&0xFFU);
+        cpuid_info.perf_num_ctr   =   ((eax>>8)&0xFFU);
+        cpuid_info.perf_width_ctr =  ((eax>>16)&0xFFU);
+        cpuid_info.perf_num_fixed_ctr =  (edx&0xFU);
+
+        eax = 0x06;
+        CPUID;
+        /* bit 1 of eax decides the turbo flag */
+        if (eax & (1<<1))
+        {
+            cpuid_info.turbo = 1;
+        }
+        else
+        {
+            cpuid_info.turbo = 0;
+        }
+    }
+
+    return;
+}
+
+/*
+ * Restrict the calling thread to the hardware thread with the OS index cpu.
+ * The return value of sched_setaffinity() is not evaluated.
+ */
+static void cpuid_pinToHwThread(uint32_t cpu)
+{
+    cpu_set_t set;
+
+    CPU_ZERO(&set);
+    CPU_SET(cpu,&set);
+    sched_setaffinity(0, sizeof(cpu_set_t), &set);
+}
+
+/*
+ * Build the pool of hardware threads and store it in
+ * cpuid_topology.threadPool.
+ *
+ * The function pins itself to each of the cpuid_topology.numHWThreads
+ * hardware threads in turn and queries CPUID there to fill the thread,
+ * core and package id of the pool entry. CPUID leaf 0x0B is used if it is
+ * available, otherwise a method selected by cpuid_info.family.
+ *
+ * cpuSet: hardware threads contained in this set get inCpuSet = 1 in their
+ *         pool entry, all others 0.
+ *
+ * The pool is zero initialised; threadPool is NULL if the allocation
+ * failed. The affinity of the caller is changed and not restored here.
+ * Requires that cpuid_init_cpuInfo() was called before.
+ */
+void cpuid_init_nodeTopology(cpu_set_t cpuSet)
+{
+    uint32_t apicId;
+    uint32_t bitField;
+    int level;
+    int prevOffset = 0;
+    int currOffset = 0;
+    HWThread* hwThreadPool;
+    int hasBLeaf = 0;
+    int maxNumLogicalProcs;
+    int maxNumLogicalProcsPerCore;
+    int maxNumCores;
+    int width;
+
+    /* calloc: entries that are never written below stay zeroed */
+    hwThreadPool = (HWThread*) calloc(cpuid_topology.numHWThreads, sizeof(HWThread));
+    if (hwThreadPool == NULL)
+    {
+        ERROR_PLAIN_PRINT(Cannot allocate memory for the hardware thread pool);
+        cpuid_topology.threadPool = NULL;
+        return;
+    }
+
+    /* check if 0x0B cpuid leaf is supported */
+    if (largest_function >= 0x0B)
+    {
+        eax = 0x0B;
+        ecx = 0;
+        CPUID;
+
+        if (ebx)
+        {
+            hasBLeaf = 1;
+        }
+    }
+
+    if (hasBLeaf)
+    {
+        for (uint32_t i=0; i < cpuid_topology.numHWThreads; i++)
+        {
+            int id = i;
+
+            cpuid_pinToHwThread(i);
+            eax = 0x0B;
+            ecx = 0;
+            CPUID;
+            apicId = edx;
+            /* the pool entry stores the OS index, the ids below are taken
+             * from the value returned in edx */
+            hwThreadPool[id].apicId = i;
+            hwThreadPool[id].inCpuSet = CPU_ISSET(id, &cpuSet) ? 1 : 0;
+
+            for (level=0; level < 3; level++)
+            {
+                eax = 0x0B;
+                ecx = level;
+                CPUID;
+                currOffset = eax&0xFU;
+
+                switch ( level ) {
+                    case 0:  /* SMT thread */
+                        bitField = extractBitField(apicId,
+                                currOffset,
+                                0);
+                        hwThreadPool[id].threadId = bitField;
+                        break;
+
+                    case 1:  /* Core */
+                        bitField = extractBitField(apicId,
+                                currOffset-prevOffset,
+                                prevOffset);
+                        hwThreadPool[id].coreId = bitField;
+                        break;
+
+                    case 2:  /* Package */
+                        bitField = extractBitField(apicId,
+                                32-prevOffset,
+                                prevOffset);
+                        hwThreadPool[id].packageId = bitField;
+                        break;
+
+                }
+                prevOffset = currOffset;
+            }
+            DEBUG_PRINT(DEBUGLEV_DEVELOP, I[%d] ID[%d] APIC[%d] T[%d] C[%d] P [%d], i, id,
+                                    hwThreadPool[id].apicId, hwThreadPool[id].threadId,
+                                    hwThreadPool[id].coreId, hwThreadPool[id].packageId);
+        }
+    }
+    else
+    {
+        switch ( cpuid_info.family )
+        {
+
+            case MIC_FAMILY:
+                /* fallthrough */
+            case P6_FAMILY:
+                eax = 0x01;
+                CPUID;
+                maxNumLogicalProcs = extractBitField(ebx,8,16);
+
+                /* Check number of cores per package */
+                eax = 0x04;
+                ecx = 0;
+                CPUID;
+                maxNumCores = extractBitField(eax,6,26)+1;
+
+                maxNumLogicalProcsPerCore = maxNumLogicalProcs/maxNumCores;
+
+                for (uint32_t i=0; i<  cpuid_topology.numHWThreads; i++)
+                {
+                    int id = i;
+
+                    cpuid_pinToHwThread(i);
+                    eax = 0x01;
+                    CPUID;
+                    hwThreadPool[id].apicId = i;//extractBitField(ebx,8,24);
+                    hwThreadPool[id].inCpuSet = CPU_ISSET(i, &cpuSet) ? 1 : 0;
+
+                    /* ThreadId is extracted from the apicId using the bit
+                     * width of the number of logical processors
+                     * */
+                    hwThreadPool[id].threadId =
+                        extractBitField(hwThreadPool[id].apicId,
+                                getBitFieldWidth(maxNumLogicalProcsPerCore),0);
+
+                    /* CoreId is extracted from the apicId using the bitWidth
+                     * of the number of logical processors as offset and the
+                     * bit width of the number of cores as width
+                     * */
+                    hwThreadPool[id].coreId =
+                        extractBitField(hwThreadPool[id].apicId,
+                                getBitFieldWidth(maxNumCores),
+                                getBitFieldWidth(maxNumLogicalProcsPerCore));
+
+                    hwThreadPool[id].packageId =
+                        extractBitField(hwThreadPool[id].apicId,
+                                8-getBitFieldWidth(maxNumLogicalProcs),
+                                getBitFieldWidth(maxNumLogicalProcs));
+                    DEBUG_PRINT(DEBUGLEV_DEVELOP, I[%d] ID[%d] APIC[%d] T[%d] C[%d] P [%d], i, id,
+                                    hwThreadPool[id].apicId, hwThreadPool[id].threadId,
+                                    hwThreadPool[id].coreId, hwThreadPool[id].packageId);
+                }
+                break;
+
+            case K8_FAMILY:
+                /* AMD Bios manual Rev. 2.28 section 3.1
+                 * Legacy method */
+                /*FIXME: This is a bit of a hack */
+
+                maxNumLogicalProcsPerCore = 1;
+
+                eax = 0x80000008;
+                CPUID;
+
+                maxNumCores =  extractBitField(ecx,8,0)+1;
+
+                for (uint32_t i=0; i<  cpuid_topology.numHWThreads; i++)
+                {
+                    int id;
+
+                    cpuid_pinToHwThread(i);
+                    eax = 0x01;
+                    CPUID;
+                    /* in this branch the pool is indexed by the APIC id */
+                    id = extractBitField(ebx,8,24);
+                    if ((uint32_t) id >= cpuid_topology.numHWThreads)
+                    {
+                        /* would write behind the end of the pool */
+                        ERROR_PLAIN_PRINT(APIC id exceeds the number of hardware threads);
+                        continue;
+                    }
+                    hwThreadPool[id].apicId = id;
+                    hwThreadPool[id].inCpuSet = CPU_ISSET(i, &cpuSet) ? 1 : 0;
+
+                    /* ThreadId is extracted from the apicId using the bit
+                     * width of the number of logical processors
+                     * */
+                    hwThreadPool[id].threadId =
+                        extractBitField(hwThreadPool[id].apicId,
+                                getBitFieldWidth(maxNumLogicalProcsPerCore),0);
+
+                    /* CoreId is extracted from the apicId using the bit
+                     * width of the number of cores as width
+                     * */
+                    hwThreadPool[id].coreId =
+                        extractBitField(hwThreadPool[id].apicId,
+                                getBitFieldWidth(maxNumCores),
+                                0);
+
+                    hwThreadPool[id].packageId =
+                        extractBitField(hwThreadPool[id].apicId,
+                                8-getBitFieldWidth(maxNumCores),
+                                getBitFieldWidth(maxNumCores));
+                    DEBUG_PRINT(DEBUGLEV_DEVELOP, I[%d] ID[%d] APIC[%d] T[%d] C[%d] P [%d], i, id,
+                                    hwThreadPool[id].apicId, hwThreadPool[id].threadId,
+                                    hwThreadPool[id].coreId, hwThreadPool[id].packageId);
+                }
+                break;
+
+            case K16_FAMILY:
+                /* fallthrough */
+            case K15_FAMILY:
+                /* fallthrough */
+            case K10_FAMILY:
+                /* AMD Bios manual Rev. 2.28 section 3.2
+                 * Extended method */
+                eax = 0x80000008;
+                CPUID;
+
+                width =  extractBitField(ecx,4,12);
+
+                /* NOTE(review): the fallback uses the core count itself as
+                 * bit width - confirm against the AMD manual */
+                if (width == 0)
+                {
+                    width =  extractBitField(ecx,8,0)+1;
+                }
+
+                for (uint32_t i=0; i<  cpuid_topology.numHWThreads; i++)
+                {
+                    int id;
+
+                    cpuid_pinToHwThread(i);
+                    eax = 0x01;
+                    CPUID;
+                    /* in this branch the pool is indexed by the APIC id */
+                    id = extractBitField(ebx,8,24);
+                    if ((uint32_t) id >= cpuid_topology.numHWThreads)
+                    {
+                        /* would write behind the end of the pool */
+                        ERROR_PLAIN_PRINT(APIC id exceeds the number of hardware threads);
+                        continue;
+                    }
+                    hwThreadPool[id].apicId = id;
+                    hwThreadPool[id].inCpuSet = CPU_ISSET(i, &cpuSet) ? 1 : 0;
+                    /* AMD only knows cores */
+                    hwThreadPool[id].threadId = 0;
+
+                    hwThreadPool[id].coreId =
+                        extractBitField(hwThreadPool[id].apicId,
+                                width, 0);
+                    hwThreadPool[id].packageId =
+                        extractBitField(hwThreadPool[id].apicId,
+                                (8-width), width);
+                    DEBUG_PRINT(DEBUGLEV_DEVELOP, I[%d] ID[%d] APIC[%d] T[%d] C[%d] P [%d], i, id,
+                                    hwThreadPool[id].apicId, hwThreadPool[id].threadId,
+                                    hwThreadPool[id].coreId, hwThreadPool[id].packageId);
+                }
+
+                break;
+
+            default:
+                /* other families: the pool keeps its zeroed entries */
+                break;
+        }
+    }
+    cpuid_topology.threadPool = hwThreadPool;
+
+    return;
+}
+
+
+/*
+ * Set the number of sets of a cache level to
+ * size / (associativity * lineSize), or to 0 if the divisor is 0.
+ */
+static void cpuid_setCacheSets(CacheLevel* cache)
+{
+    if ((cache->associativity * cache->lineSize) != 0)
+    {
+        cache->sets = cache->size /
+            (cache->associativity * cache->lineSize);
+    }
+    else
+    {
+        cache->sets = 0;
+    }
+}
+
+/*
+ * Fill pool[0] (L1 data cache) and pool[1] (L2 cache) from the CPUID
+ * leaves 0x80000005 and 0x80000006. On return the file-scope registers
+ * still hold the result of leaf 0x80000006.
+ */
+static void cpuid_amdLegacyL1L2(CacheLevel* pool)
+{
+    eax = 0x80000005;
+    CPUID;
+    pool[0].level = 1;
+    pool[0].type = DATACACHE;
+    pool[0].associativity = extractBitField(ecx,8,16);
+    pool[0].lineSize = extractBitField(ecx,8,0);
+    pool[0].size =  extractBitField(ecx,8,24) * 1024;
+    cpuid_setCacheSets(&pool[0]);
+    pool[0].threads = 1;
+    pool[0].inclusive = 1;
+
+    eax = 0x80000006;
+    CPUID;
+    pool[1].level = 2;
+    pool[1].type = UNIFIEDCACHE;
+    pool[1].associativity =
+        amdGetAssociativity(extractBitField(ecx,4,12));
+    pool[1].lineSize = extractBitField(ecx,8,0);
+    pool[1].size =  extractBitField(ecx,16,16) * 1024;
+    cpuid_setCacheSets(&pool[1]);
+    pool[1].threads = 1;
+    pool[1].inclusive = 1;
+}
+
+/*
+ * Determine the cache levels of the CPU and store them in
+ * cpuid_topology.cacheLevels / cpuid_topology.numCacheLevels.
+ *
+ * The method depends on cpuid_info.family: CPUID leaf 0x04 for MIC_FAMILY
+ * and P6_FAMILY, the leaves 0x80000005/0x80000006 for K8_FAMILY and
+ * K10_FAMILY and leaf 0x8000001D for K15_FAMILY and K16_FAMILY. For other
+ * families, or if the memory for the levels cannot be allocated, an error
+ * is printed and no cache levels are recorded.
+ *
+ * The array is allocated with malloc and handed over to cpuid_topology.
+ * numCoresPerSocket (and for Intel numThreadsPerCore) of cpuid_topology
+ * are read, so they have to be set before.
+ */
+void cpuid_init_cacheTopology(void)
+{
+    /* size of the array allocated for K15/K16 */
+    enum { AMD_EXT_MAX_LEVELS = 3 };
+    int maxNumLevels=0;
+    int id=0;
+    CacheLevel* cachePool = NULL;
+    CacheType type = DATACACHE;
+
+    switch ( cpuid_info.family )
+    {
+        case MIC_FAMILY:
+            /* fallthrough */
+        case P6_FAMILY:
+
+            if (largest_function >= 4)
+            {
+                maxNumLevels = intelCpuidFunc_4(&cachePool);
+            }
+            else
+            {
+                //                intelCpuidFunc_2(&cachePool);
+            }
+
+            break;
+
+        case K8_FAMILY:
+            cachePool = (CacheLevel*) malloc(2 * sizeof(CacheLevel));
+            if (cachePool == NULL)
+            {
+                ERROR_PLAIN_PRINT(Cannot allocate memory for cache levels);
+                break;
+            }
+            maxNumLevels = 2;
+            cpuid_amdLegacyL1L2(cachePool);
+
+            break;
+
+
+        case K10_FAMILY:
+            /* FIXME: Adds one level for the instruction cache on Intel
+             * This fixes the level for the cores
+             */
+            cachePool = (CacheLevel*) malloc(3 * sizeof(CacheLevel));
+            if (cachePool == NULL)
+            {
+                ERROR_PLAIN_PRINT(Cannot allocate memory for cache levels);
+                break;
+            }
+            maxNumLevels = 3;
+            cpuid_amdLegacyL1L2(cachePool);
+
+            /* edx still holds the result of leaf 0x80000006 */
+            cachePool[2].level = 3;
+            cachePool[2].type = UNIFIEDCACHE;
+            cachePool[2].associativity =
+                amdGetAssociativity(extractBitField(edx,4,12));
+            cachePool[2].lineSize = extractBitField(edx,8,0);
+            cachePool[2].size =  (extractBitField(edx,14,18)+1) * 524288;
+
+            if (cpuid_info.model != MAGNYCOURS)
+            {
+                cachePool[2].threads = cpuid_topology.numCoresPerSocket;
+            }
+            else
+            {
+                cachePool[2].threads = cpuid_topology.numCoresPerSocket/2;
+                cachePool[2].size /= 2 ;
+            }
+            /* computed from the L3 values after the size is final */
+            cpuid_setCacheSets(&cachePool[2]);
+
+            cachePool[2].inclusive = 1;
+
+            break;
+
+        case K16_FAMILY:
+            /* fallthrough */
+        case K15_FAMILY:
+
+            cachePool = (CacheLevel*) malloc(AMD_EXT_MAX_LEVELS * sizeof(CacheLevel));
+            if (cachePool == NULL)
+            {
+                ERROR_PLAIN_PRINT(Cannot allocate memory for cache levels);
+                break;
+            }
+
+            /* Walk the sub-leaves until type 0 is returned. Stop as well
+             * when the array is full to avoid writing behind its end. */
+            while (type && (maxNumLevels < AMD_EXT_MAX_LEVELS))
+            {
+                ecx = id;
+                eax = 0x8000001D;
+                CPUID;
+                type = (CacheType) extractBitField(eax,4,0);
+
+                /* other cache types are not recorded */
+                if ((type == DATACACHE) || (type == UNIFIEDCACHE))
+                {
+                    cachePool[maxNumLevels].level =   extractBitField(eax,3,5);
+                    cachePool[maxNumLevels].type = type;
+                    cachePool[maxNumLevels].associativity = extractBitField(ebx,10,22)+1;
+                    cachePool[maxNumLevels].lineSize = extractBitField(ebx,12,0)+1;
+                    cachePool[maxNumLevels].sets =  extractBitField(ecx,32,0)+1;
+                    cachePool[maxNumLevels].size = cachePool[maxNumLevels].associativity *
+                        cachePool[maxNumLevels].lineSize * cachePool[maxNumLevels].sets;
+                    cachePool[maxNumLevels].threads =  extractBitField(eax,12,14)+1;
+                    cachePool[maxNumLevels].inclusive =  (edx & (0x1<<1));
+                    maxNumLevels++;
+                }
+                id++;
+            }
+            break;
+
+        default:
+            ERROR_PLAIN_PRINT(Processor is not supported);
+            break;
+    }
+
+
+    cpuid_topology.numCacheLevels = maxNumLevels;
+    cpuid_topology.cacheLevels = cachePool;
+
+    return;
+}
diff --git a/src/topology_hwloc.c b/src/topology_hwloc.c
new file mode 100644
index 0000000..97e19ff
--- /dev/null
+++ b/src/topology_hwloc.c
@@ -0,0 +1,277 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  topology_hwloc.c
+ *
+ *      Description:  Interface to the hwloc based topology backend
+ *
+ *      Version:   4.0
+ *      Released:  16.6.2015
+ *
+ *      Authors:  Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+ 
+#include <stdlib.h>
+#include <stdio.h>
+#include <error.h>
+
+#include <topology.h>
+#ifdef LIKWID_USE_HWLOC
+#include <hwloc.h>
+#include <topology_hwloc.h>
+#endif
+
+hwloc_topology_t hwloc_topology = NULL;
+
+
+/* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
+
+/* #####   VARIABLES  -  LOCAL TO THIS SOURCE FILE   ###################### */
+
+/* #####   FUNCTION DEFINITIONS  -  LOCAL TO THIS SOURCE FILE   ########### */
+
+/* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
+#ifdef LIKWID_USE_HWLOC
+/*
+ * Count all objects of the given type in the subtree below obj (obj itself is
+ * not tested). If list, *list and index are all non-NULL, the os_index of
+ * every match is additionally stored at (*list)[*index] and *index is
+ * advanced. The topology handle t is only passed through to the recursion.
+ */
+int hwloc_record_objs_of_type_below_obj(hwloc_topology_t t, hwloc_obj_t obj, hwloc_obj_type_t type, int* index, uint32_t **list)
+{
+    /* Recording is optional: it needs an output array and a write cursor. */
+    const int record = (list != NULL && *list != NULL && index != NULL);
+    int found = 0;
+    unsigned int c;
+
+    for (c = 0; c < obj->arity; c++)
+    {
+        hwloc_obj_t child = obj->children[c];
+
+        if (child->type == type)
+        {
+            if (record)
+            {
+                (*list)[*index] = child->os_index;
+                *index += 1;
+            }
+            found++;
+        }
+        /* Descend regardless of whether the child itself matched. */
+        found += hwloc_record_objs_of_type_below_obj(t, child, type, index, list);
+    }
+    return found;
+}
+
+/*
+ * Initialise and load the global hwloc topology, then fill cpuid_info
+ * (family, model, stepping, isIntel, osname) from the info attributes of the
+ * first socket object and set cpuid_topology.numHWThreads to the number of
+ * PU objects. Attributes that hwloc does not provide keep their defaults
+ * (0 / empty string).
+ *
+ * cpuSet is not read by this function.
+ */
+void hwloc_init_cpuInfo(cpu_set_t cpuSet)
+{
+    hwloc_obj_t obj;
+    const char * info;
+
+    hwloc_topology_init(&hwloc_topology);
+    hwloc_topology_set_flags(hwloc_topology, HWLOC_TOPOLOGY_FLAG_WHOLE_IO );
+    hwloc_topology_load(hwloc_topology);
+    obj = hwloc_get_obj_by_type(hwloc_topology, HWLOC_OBJ_SOCKET, 0);
+
+    cpuid_info.model = 0;
+    cpuid_info.family = 0;
+    cpuid_info.isIntel = 0;
+    cpuid_info.stepping = 0;
+    cpuid_info.osname = malloc(MAX_MODEL_STRING_LENGTH * sizeof(char));
+    if (cpuid_info.osname != NULL)
+    {
+        /* Keep the name a valid string even if "CPUModel" is missing. */
+        cpuid_info.osname[0] = '\0';
+    }
+
+    /* There may be no socket object at all (hwloc_init_nodeTopology handles
+     * that case as well); the info lookups would dereference it. */
+    if (obj != NULL)
+    {
+        if ((info = hwloc_obj_get_info_by_name(obj, "CPUModelNumber")) != NULL)
+        {
+            cpuid_info.model = atoi(info);
+        }
+        if ((info = hwloc_obj_get_info_by_name(obj, "CPUFamilyNumber")) != NULL)
+        {
+            cpuid_info.family = atoi(info);
+        }
+        if ((info = hwloc_obj_get_info_by_name(obj, "CPUVendor")) != NULL)
+        {
+            cpuid_info.isIntel = strcmp(info, "GenuineIntel") == 0;
+        }
+        if (((info = hwloc_obj_get_info_by_name(obj, "CPUModel")) != NULL) &&
+            (cpuid_info.osname != NULL))
+        {
+            /* Bounded copy: the model string length is not under our control. */
+            snprintf(cpuid_info.osname, MAX_MODEL_STRING_LENGTH, "%s", info);
+        }
+        if ((info = hwloc_obj_get_info_by_name(obj, "CPUStepping")) != NULL)
+        {
+            cpuid_info.stepping = atoi(info);
+        }
+    }
+
+    cpuid_topology.numHWThreads = hwloc_get_nbobjs_by_type(hwloc_topology, HWLOC_OBJ_PU);
+    DEBUG_PRINT(DEBUGLEV_DEVELOP, HWLOC CpuInfo Family %d Model %d Stepping %d isIntel %d numHWThreads %d activeHWThreads %d,
+                            cpuid_info.family,
+                            cpuid_info.model,
+                            cpuid_info.stepping,
+                            cpuid_info.isIntel,
+                            cpuid_topology.numHWThreads,
+                            cpuid_topology.activeHWThreads)
+    return;
+}
+
+/*
+ * Build cpuid_topology.threadPool from the PU objects of the loaded hwloc
+ * topology. The pool has cpuid_topology.numHWThreads entries and is indexed
+ * by the OS index of each PU; entries without a matching PU keep -1 in all
+ * id fields and inCpuSet == 0. If the topology has no socket objects, NUMA
+ * node objects are used to determine the package id instead.
+ *
+ * cpuSet is not read by this function. On allocation failure
+ * cpuid_topology.threadPool is set to NULL.
+ */
+void hwloc_init_nodeTopology(cpu_set_t cpuSet)
+{
+    HWThread* hwThreadPool;
+    hwloc_obj_t obj;
+    hwloc_obj_t up;
+    int id = 0;
+    hwloc_obj_type_t socket_type = HWLOC_OBJ_SOCKET;
+
+    hwThreadPool = (HWThread*) malloc(cpuid_topology.numHWThreads * sizeof(HWThread));
+    if (hwThreadPool == NULL)
+    {
+        ERROR_PLAIN_PRINT(Cannot allocate memory for the hardware thread pool);
+        cpuid_topology.threadPool = NULL;
+        return;
+    }
+    for (uint32_t i=0;i<cpuid_topology.numHWThreads;i++)
+    {
+        hwThreadPool[i].apicId = -1;
+        hwThreadPool[i].threadId = -1;
+        hwThreadPool[i].coreId = -1;
+        hwThreadPool[i].packageId = -1;
+        hwThreadPool[i].inCpuSet = 0;
+    }
+
+    if (hwloc_get_nbobjs_by_type(hwloc_topology, socket_type) == 0)
+    {
+        socket_type = HWLOC_OBJ_NODE;
+    }
+    for (uint32_t i=0; i< cpuid_topology.numHWThreads; i++)
+    {
+        obj = hwloc_get_obj_by_type(hwloc_topology, HWLOC_OBJ_PU, i);
+        if (!obj)
+        {
+            printf("No obj for CPU %d\n", i);
+            continue;
+        }
+        id = obj->os_index;
+        /* The pool is indexed by OS index, which is not guaranteed to be
+         * smaller than the number of PUs; never write past the allocation. */
+        if ((id < 0) || ((uint32_t) id >= cpuid_topology.numHWThreads))
+        {
+            fprintf(stderr, "OS index %d of PU %d does not fit into the thread pool\n",
+                    id, (int) i);
+            continue;
+        }
+        hwThreadPool[id].inCpuSet = 1;
+        hwThreadPool[id].apicId = obj->os_index;
+        hwThreadPool[id].threadId = obj->sibling_rank;
+
+        /* Walk up to the enclosing core; stop at the root if there is none. */
+        up = obj;
+        while ((up != NULL) && (up->type != HWLOC_OBJ_CORE)) {
+            up = up->parent;
+        }
+        if (up != NULL)
+        {
+            hwThreadPool[id].coreId = up->os_index;
+        }
+        else
+        {
+            /* No core level: search the package starting from the PU again. */
+            up = obj;
+        }
+        while ((up != NULL) && (up->type != socket_type)) {
+            up = up->parent;
+        }
+        if (up != NULL)
+        {
+            hwThreadPool[id].packageId = up->os_index;
+        }
+        DEBUG_PRINT(DEBUGLEV_DEVELOP, I[%d] ID[%d] APIC[%d] T[%d] C[%d] P [%d], i, id,
+                                    hwThreadPool[id].apicId, hwThreadPool[id].threadId,
+                                    hwThreadPool[id].coreId, hwThreadPool[id].packageId);
+    }
+
+    cpuid_topology.threadPool = hwThreadPool;
+
+    return;
+}
+
+
+/*
+ * Fill cpuid_topology.cacheLevels / numCacheLevels from the cache levels of
+ * the loaded hwloc topology. Levels are visited from the bottom of the tree
+ * upwards, and the first cache object of each level is taken as
+ * representative. If hwloc provides no "inclusiveness" info for a cache an
+ * error is printed and the cache is recorded as non-inclusive.
+ */
+void hwloc_init_cacheTopology(void)
+{
+    int maxNumLevels=0;
+    int id=0;
+    CacheLevel* cachePool = NULL;
+    hwloc_obj_t obj;
+    const char* info;
+    int depth;
+    int d;
+
+    /* Sum up all depths with caches */
+    depth = hwloc_topology_get_depth(hwloc_topology);
+    for (d = 0; d < depth; d++)
+    {
+        if (hwloc_get_depth_type(hwloc_topology, d) == HWLOC_OBJ_CACHE)
+            maxNumLevels++;
+    }
+    if (maxNumLevels > 0)
+    {
+        cachePool = (CacheLevel*) malloc(maxNumLevels * sizeof(CacheLevel));
+    }
+    if (cachePool == NULL)
+    {
+        /* No caches reported or allocation failed: publish an empty list. */
+        cpuid_topology.numCacheLevels = 0;
+        cpuid_topology.cacheLevels = NULL;
+        return;
+    }
+
+    /* Start at the bottom of the tree to get all cache levels in order */
+    for(d=depth-1;d >= 0; d--)
+    {
+        /* We only need caches, so skip other levels */
+        if (hwloc_get_depth_type(hwloc_topology, d) != HWLOC_OBJ_CACHE)
+        {
+            continue;
+        }
+        /* Get the cache object */
+        obj = hwloc_get_obj_by_depth(hwloc_topology, d, 0);
+        if ((obj == NULL) || (obj->attr == NULL))
+        {
+            continue;
+        }
+        switch (obj->attr->cache.type)
+        {
+            case HWLOC_OBJ_CACHE_DATA:
+                cachePool[id].type = DATACACHE;
+                break;
+            case HWLOC_OBJ_CACHE_INSTRUCTION:
+                cachePool[id].type = INSTRUCTIONCACHE;
+                break;
+            case HWLOC_OBJ_CACHE_UNIFIED:
+                cachePool[id].type = UNIFIEDCACHE;
+                break;
+            default:
+                cachePool[id].type = NOCACHE;
+                break;
+        }
+
+        cachePool[id].associativity = obj->attr->cache.associativity;
+        cachePool[id].level = obj->attr->cache.depth;
+        cachePool[id].lineSize = obj->attr->cache.linesize;
+        cachePool[id].size = obj->attr->cache.size;
+        cachePool[id].sets = 0;
+        /* sets = size / (ways * line size); guard against a zero divisor */
+        if ((cachePool[id].associativity * cachePool[id].lineSize) != 0)
+        {
+            cachePool[id].sets = cachePool[id].size /
+                (cachePool[id].associativity * cachePool[id].lineSize);
+        }
+        /* Count all HWThreads below the current cache */
+        cachePool[id].threads = hwloc_record_objs_of_type_below_obj(
+                        hwloc_topology, obj, HWLOC_OBJ_PU, NULL, NULL);
+
+        /* Never leave the flag uninitialised when the info is missing. */
+        cachePool[id].inclusive = 0;
+        if ((info = hwloc_obj_get_info_by_name(obj, "inclusiveness")) != NULL)
+        {
+            cachePool[id].inclusive = info[0]=='t';
+        }
+        else
+        {
+            ERROR_PLAIN_PRINT(Processor is not supported);
+        }
+        id++;
+    }
+
+    /* id counts the entries that were actually filled in. */
+    cpuid_topology.numCacheLevels = id;
+    cpuid_topology.cacheLevels = cachePool;
+    return;
+}
+
+#else
+
+/*
+ * Stubs compiled when LIKWID_USE_HWLOC is not defined: each one does nothing.
+ * NOTE(review): the hwloc variants of hwloc_init_cpuInfo and
+ * hwloc_init_nodeTopology above take a cpu_set_t argument while these stubs
+ * take none - confirm against the prototypes the callers see.
+ */
+void hwloc_init_cpuInfo(void)
+{
+}
+
+void hwloc_init_cpuFeatures(void)
+{
+}
+
+void hwloc_init_nodeTopology(void)
+{
+}
+
+void hwloc_init_cacheTopology(void)
+{
+}
+#endif
diff --git a/src/topology_proc.c b/src/topology_proc.c
new file mode 100644
index 0000000..277c0fb
--- /dev/null
+++ b/src/topology_proc.c
@@ -0,0 +1,627 @@
+/*
+ * =======================================================================================
+ *
+ *      Filename:  topology_proc.c
+ *
+ *      Description:  Interface to the procfs/sysfs based topology backend
+ *
+ *      Version:   4.0
+ *      Released:  16.6.2015
+ *
+ *      Authors:  Jan Treibig (jt), jan.treibig at gmail.com,
+ *                Thomas Roehl (tr), thomas.roehl at googlemail.com
+ *      Project:  likwid
+ *
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
+ *
+ *      This program is free software: you can redistribute it and/or modify it under
+ *      the terms of the GNU General Public License as published by the Free Software
+ *      Foundation, either version 3 of the License, or (at your option) any later
+ *      version.
+ *
+ *      This program is distributed in the hope that it will be useful, but WITHOUT ANY
+ *      WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+ *      PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+ *
+ *      You should have received a copy of the GNU General Public License along with
+ *      this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * =======================================================================================
+ */
+
+#include <topology_proc.h>
+
+
+/* #####   MACROS  -  LOCAL TO THIS SOURCE FILE   ######################### */
+/* Execute the x86 "cpuid" instruction (macro taken from the Linux kernel).
+ * It expands to an inline asm statement operating on variables named eax,
+ * ebx, ecx and edx that must exist in the expanding scope (the functions in
+ * this file declare them as uint32_t). eax and ecx are read as inputs; all
+ * four variables are overwritten with the resulting register values. */
+#define CPUID                              \
+    __asm__ volatile ("cpuid"                             \
+            : "=a" (eax),     \
+            "=b" (ebx),     \
+            "=c" (ecx),     \
+            "=d" (edx)      \
+            : "0" (eax), "2" (ecx))
+/* #####   FUNCTION DEFINITIONS  -  LOCAL TO THIS SOURCE FILE   ########### */
+/*
+ * Query CPUID for performance monitoring data and store it in cpuid_info
+ * (perf_version, perf_num_ctr, perf_width_ctr, perf_num_fixed_ctr, turbo).
+ * Nothing is written unless the CPU family is P6_FAMILY and CPUID leaf 0
+ * reports a maximum leaf of at least 0x0A. Always returns 0.
+ */
+static int get_cpu_perf_data(void)
+{
+    uint32_t eax = 0x0U, ebx = 0x0U, ecx = 0x0U, edx = 0x0U;
+    int maxLeaf;
+
+    /* Leaf 0 returns the largest supported leaf number in eax. */
+    eax = 0x00;
+    CPUID;
+    maxLeaf = eax;
+
+    if (cpuid_info.family != P6_FAMILY || maxLeaf < 0x0A)
+    {
+        return 0;
+    }
+
+    /* Leaf 0x0A: the counter description is packed into eax and edx. */
+    eax = 0x0A;
+    CPUID;
+    cpuid_info.perf_version = (eax & 0xFFU);
+    cpuid_info.perf_num_ctr = ((eax >> 8) & 0xFFU);
+    cpuid_info.perf_width_ctr = ((eax >> 16) & 0xFFU);
+    cpuid_info.perf_num_fixed_ctr = (edx & 0xFU);
+
+    /* Leaf 0x06: bit 1 of eax is used as the turbo indicator. */
+    eax = 0x06;
+    CPUID;
+    cpuid_info.turbo = (eax & (1<<1)) ? 1 : 0;
+
+    return 0;
+}
+
+/*
+ * Return the zero-based position of ownid in a comma separated CPU list such
+ * as the content of a sysfs thread_siblings_list file, or -1 if ownid is not
+ * part of the list (or the list cannot be parsed).
+ *
+ * Entries may be single ids ("0,4") or ranges ("0-1"); a range counts as all
+ * the ids it covers, the same way fillList() expands it. For lists without
+ * ranges the result equals the index of the matching entry.
+ * The entries of list are whitespace-trimmed in place as a side effect.
+ */
+int get_listPosition(int ownid, bstring list)
+{
+    int position = -1;
+    int offset = 0;     /* position of the first id of the current entry */
+    int (*ownatoi)(const char*);
+    bstring ownStr = bformat("%d",ownid);
+    struct bstrList* tokens = bsplit(list,(char) ',');
+    ownatoi = &atoi;
+
+    for (int i=0; ownStr != NULL && tokens != NULL && position < 0 && i<tokens->qty; i++)
+    {
+        bstring token = tokens->entry[i];
+        btrimws(token);
+        if (bstrchrp(token,'-',0) == BSTR_ERR)
+        {
+            if (bstrcmp(ownStr, token) == BSTR_OK)
+            {
+                position = offset;
+            }
+            offset++;
+        }
+        else
+        {
+            struct bstrList* range = bsplit(token,'-');
+            if (range != NULL && range->qty == 2)
+            {
+                int first = ownatoi(bdata(range->entry[0]));
+                int last = ownatoi(bdata(range->entry[1]));
+                if (ownid >= first && ownid <= last)
+                {
+                    position = offset + (ownid - first);
+                }
+                else if (last >= first)
+                {
+                    offset += last - first + 1;
+                }
+            }
+            bstrListDestroy(range);
+        }
+    }
+
+    /* Both were allocated here and used to leak on every call. */
+    bstrListDestroy(tokens);
+    bdestroy(ownStr);
+    return position;
+}
+
+/*
+ * Expand a comma separated CPU list with optional ranges ("0,2-4,7") into
+ * individual ids.
+ *
+ * outList   destination array, or NULL to only count the ids
+ * outOffset index in outList at which the first id is stored
+ * list      the list to parse; its entries are whitespace-trimmed in place
+ *
+ * Returns the number of ids in the list (0 if the list cannot be split).
+ * An entry containing '-' that does not split into exactly two parts is
+ * ignored.
+ */
+int fillList(int* outList, int outOffset, bstring list)
+{
+    int current = 0;
+    int (*ownatoi)(const char*);
+    struct bstrList* tokens = bsplit(list,',');
+    ownatoi = &atoi;
+
+    if (tokens == NULL)
+    {
+        return 0;
+    }
+    for(int i=0;i<tokens->qty;i++)
+    {
+        btrimws(tokens->entry[i]);
+        if (bstrchrp(tokens->entry[i],'-',0) == BSTR_ERR)
+        {
+            if (outList)
+            {
+                outList[outOffset+current] = ownatoi(bdata(tokens->entry[i]));
+            }
+            current++;
+        }
+        else
+        {
+            struct bstrList* range = bsplit(tokens->entry[i],'-');
+            if (range != NULL && range->qty == 2)
+            {
+                /* Parse both bounds once instead of on every iteration. */
+                const int first = ownatoi(bdata(range->entry[0]));
+                const int last = ownatoi(bdata(range->entry[1]));
+                for (int j=first;j<=last;j++)
+                {
+                    if (outList)
+                    {
+                        outList[outOffset+current] = j;
+                    }
+                    current++;
+                }
+            }
+            bstrListDestroy(range);
+        }
+    }
+    bstrListDestroy(tokens);
+    return current;
+}
+
+/*
+ * Run CPUID leaf 0x04 with ecx = level and return bit 1 of edx (non-zero if
+ * set, 0 otherwise), which the caller stores as the cache's inclusive flag.
+ * NOTE(review): level is passed as the CPUID sub-leaf index - confirm that
+ * cache level and sub-leaf index line up on the supported CPUs.
+ */
+static int readCacheInclusiveIntel(int level)
+{
+    const uint32_t inclusiveMask = 0x2;
+    uint32_t eax = 0x04;
+    uint32_t ebx = 0x0U;
+    uint32_t ecx = level;
+    uint32_t edx = 0x0U;
+
+    CPUID;
+    return edx & inclusiveMask;
+}
+
+/*
+ * Run CPUID leaf 0x8000001D with ecx = level and return bit 1 of edx
+ * (non-zero if set, 0 otherwise), which the caller stores as the cache's
+ * inclusive flag.
+ * NOTE(review): level is passed as the CPUID sub-leaf index - confirm that
+ * cache level and sub-leaf index line up on the supported CPUs.
+ */
+static int readCacheInclusiveAMD(int level)
+{
+    const uint32_t inclusiveMask = 0x2;
+    uint32_t eax = 0x8000001D;
+    uint32_t ebx = 0x0U;
+    uint32_t ecx = level;
+    uint32_t edx = 0x0U;
+
+    CPUID;
+    return edx & inclusiveMask;
+}
+
+/* #####   FUNCTION DEFINITIONS  -  EXPORTED FUNCTIONS   ################## */
+/* Return the text between the first and second ':' of a /proc/cpuinfo line
+ * with leading whitespace removed, as a new bstring the caller must destroy.
+ * Returns NULL if the line contains no ':' or on allocation failure. */
+static bstring proc_cpuinfo_value(const_bstring line)
+{
+    bstring value = NULL;
+    struct bstrList* parts = bsplit(line,(char) ':');
+    if (parts != NULL)
+    {
+        if (parts->qty >= 2)
+        {
+            value = bstrcpy(parts->entry[1]);
+            if (value != NULL)
+            {
+                bltrimws(value);
+            }
+        }
+        bstrListDestroy(parts);
+    }
+    return value;
+}
+
+/*
+ * Parse /proc/cpuinfo and fill cpuid_info (family, model, stepping, isIntel,
+ * osname) and cpuid_topology.numHWThreads (number of "processor" lines).
+ * Family and model are taken from the first entry that yields a non-zero
+ * value; stepping and model name from the last one. If the file cannot be
+ * opened all values keep their defaults (0 / empty name).
+ *
+ * cpuSet is not read by this function.
+ */
+void proc_init_cpuInfo(cpu_set_t cpuSet)
+{
+    int i;
+    int HWthreads = 0;
+    FILE *fp;
+
+    int (*ownatoi)(const char*);
+    char* (*ownstrcpy)(char*,const char*);
+    ownatoi = &atoi;
+    ownstrcpy = &strcpy;
+
+    bstring countString = bformat("processor\t:");
+    bstring modelString = bformat("model\t\t:");
+    bstring familyString = bformat("cpu family\t:");
+    bstring steppingString = bformat("stepping\t:");
+    bstring vendorString = bformat("vendor_id\t:");
+    bstring vendorIntelString = bformat("GenuineIntel");
+    bstring nameString = bformat("model name\t:");
+
+    cpuid_info.isIntel = 0;
+    cpuid_info.model = 0;
+    cpuid_info.family = 0;
+    cpuid_info.stepping = 0;
+    cpuid_topology.numHWThreads = 0;
+    cpuid_info.osname = malloc(MAX_MODEL_STRING_LENGTH * sizeof(char));
+    if (cpuid_info.osname != NULL)
+    {
+        /* Keep the name a valid string even without a "model name" line. */
+        cpuid_info.osname[0] = '\0';
+    }
+
+    if (NULL != (fp = fopen ("/proc/cpuinfo", "r")))
+    {
+        bstring src = bread ((bNread) fread, fp);
+        struct bstrList* tokens = NULL;
+        /* The whole file is in memory now; the stream was never closed before. */
+        fclose(fp);
+        if (src != NULL)
+        {
+            tokens = bsplit(src,(char) '\n');
+        }
+        for (i=0; tokens != NULL && i<tokens->qty; i++)
+        {
+            bstring line = tokens->entry[i];
+            bstring value = NULL;
+
+            if (binstr(line,0,countString) != BSTR_ERR)
+            {
+                HWthreads++;
+            }
+            else if ((cpuid_info.model == 0) && (binstr(line,0,modelString) != BSTR_ERR))
+            {
+                value = proc_cpuinfo_value(line);
+                if (value != NULL)
+                {
+                    cpuid_info.model = ownatoi(bdata(value));
+                }
+            }
+            else if ((cpuid_info.family == 0) && (binstr(line,0,familyString) != BSTR_ERR))
+            {
+                value = proc_cpuinfo_value(line);
+                if (value != NULL)
+                {
+                    cpuid_info.family = ownatoi(bdata(value));
+                }
+            }
+            else if (binstr(line,0,steppingString) != BSTR_ERR)
+            {
+                value = proc_cpuinfo_value(line);
+                if (value != NULL)
+                {
+                    cpuid_info.stepping = ownatoi(bdata(value));
+                }
+            }
+            else if (binstr(line,0,nameString) != BSTR_ERR)
+            {
+                value = proc_cpuinfo_value(line);
+                if ((value != NULL) && (cpuid_info.osname != NULL))
+                {
+                    /* Cut the name so that it fits into the osname buffer. */
+                    btrunc(value, MAX_MODEL_STRING_LENGTH-1);
+                    ownstrcpy(cpuid_info.osname, bdata(value));
+                }
+            }
+            else if (binstr(line,0,vendorString) != BSTR_ERR)
+            {
+                value = proc_cpuinfo_value(line);
+                if ((value != NULL) && (bstrcmp(value, vendorIntelString) == BSTR_OK))
+                {
+                    cpuid_info.isIntel = 1;
+                }
+            }
+            bdestroy(value);
+        }
+        bstrListDestroy(tokens);
+        bdestroy(src);
+        cpuid_topology.numHWThreads = HWthreads;
+        DEBUG_PRINT(DEBUGLEV_DEVELOP, PROC CpuInfo Family %d Model %d Stepping %d isIntel %d numHWThreads %d,
+                            cpuid_info.family,
+                            cpuid_info.model,
+                            cpuid_info.stepping,
+                            cpuid_info.isIntel,
+                            cpuid_topology.numHWThreads)
+    }
+
+    bdestroy(countString);
+    bdestroy(modelString);
+    bdestroy(familyString);
+    bdestroy(steppingString);
+    bdestroy(vendorString);
+    bdestroy(vendorIntelString);
+    bdestroy(nameString);
+    return;
+}
+
+/*
+ * Read the first "flags" line of /proc/cpuinfo and translate the flags known
+ * to LIKWID into cpuid_info.featureFlags (bit mask) and cpuid_info.features
+ * (space separated names). SSE3 is added implicitly when SSSE3 is present.
+ * Afterwards the performance monitoring data is read via get_cpu_perf_data().
+ *
+ * If the file cannot be opened, contains no "flags" line, or the feature
+ * string cannot be allocated, the function returns without calling
+ * get_cpu_perf_data().
+ */
+void proc_init_cpuFeatures(void)
+{
+    /* Flag name in /proc/cpuinfo, bit index in featureFlags and the text
+     * appended to the feature string. */
+    const struct {
+        const char* flag;
+        int bit;
+        const char* label;
+    } knownFeatures[] = {
+        {"ssse3",   SSSE3,   "SSSE3 "},
+        {"sse3",    SSE3,    "SSE3 "},
+        {"monitor", MONITOR, "MONITOR "},
+        {"mmx",     MMX,     "MMX "},
+        {"sse",     SSE,     "SSE "},
+        {"sse2",    SSE2,    "SSE2 "},
+        {"acpi",    ACPI,    "ACPI "},
+        {"rdtscp",  RDTSCP,  "RDTSCP "},
+        {"vmx",     VMX,     "VMX "},
+        {"est",     EIST,    "EIST "},
+        {"tm",      TM,      "TM "},
+        {"tm2",     TM2,     "TM2 "},
+        {"aes",     AES,     "AES "},
+        {"rdrand",  RDRAND,  "RDRAND "},
+        {"sse4_1",  SSE41,   "SSE4.1 "},
+        {"sse4_2",  SSE42,   "SSE4.2 "},
+        {"avx",     AVX,     "AVX "},
+        {"fma",     FMA,     "FMA "},
+        {"avx2",    AVX2,    "AVX2 "},
+        {"rtm",     RTM,     "RTM "},
+        {"hle",     HLE,     "HLE "},
+        {"rdseed",  RDSEED,  "RDSEED "},
+        {"ht",      HTT,     "HTT "}
+    };
+    const size_t numKnown = sizeof(knownFeatures) / sizeof(knownFeatures[0]);
+    int found = 0;
+    FILE* file;
+    /* The flags line of current CPUs is longer than 1 KiB; a short buffer
+     * silently drops the flags at the end of the line. */
+    char buf[4096];
+    char ident[30];
+    char delimiter[] = " ";
+    char* cptr;
+
+    if ( (file = fopen( "/proc/cpuinfo", "r")) == NULL )
+    {
+        fprintf(stderr, "Cannot open /proc/cpuinfo\n");
+        return;
+    }
+    while( fgets(buf, sizeof(buf), file) )
+    {
+        /* %29s keeps the key within ident[30]. */
+        if ((sscanf(buf, "%29s\t:", &(ident[0])) == 1) && (strcmp(ident,"flags") == 0))
+        {
+            found = 1;
+            break;
+        }
+    }
+    fclose(file);
+    /* Track the match explicitly: the sscanf result of the last line read
+     * says nothing about whether a flags line was seen. */
+    if (!found)
+    {
+        return;
+    }
+
+    cpuid_info.featureFlags = 0;
+    cpuid_info.features = (char*) malloc(MAX_FEATURE_STRING_LENGTH*sizeof(char));
+    if (cpuid_info.features == NULL)
+    {
+        ERROR_PLAIN_PRINT(Cannot allocate memory for the feature string);
+        return;
+    }
+    cpuid_info.features[0] = '\0';
+    buf[strcspn(buf, "\n")] = '\0';
+    /* Skip the leading "flags" key; the remaining ":" token matches no flag. */
+    cptr = strtok(&(buf[6]),delimiter);
+
+    while (cptr != NULL)
+    {
+        for (size_t k = 0; k < numKnown; k++)
+        {
+            if (strcmp(cptr, knownFeatures[k].flag) == 0)
+            {
+                cpuid_info.featureFlags |= (1<<knownFeatures[k].bit);
+                strcat(cpuid_info.features, knownFeatures[k].label);
+                break;
+            }
+        }
+        cptr = strtok(NULL, delimiter);
+    }
+
+    if ((cpuid_info.featureFlags & (1<<SSSE3)) && !((cpuid_info.featureFlags) & (1<<SSE3)))
+    {
+        cpuid_info.featureFlags |= (1<<SSE3);
+        strcat(cpuid_info.features, "SSE3 ");
+    }
+
+    get_cpu_perf_data();
+    return;
+}
+
+
+
+/*
+ * Build cpuid_topology.threadPool with one entry per hardware thread from
+ * /sys/devices/system/cpu/cpu<N>/topology: core_id, physical_package_id and
+ * the position of the CPU in thread_siblings_list. Ids whose file cannot be
+ * read stay -1. inCpuSet records whether the CPU is a member of cpuSet.
+ * On allocation failure cpuid_topology.threadPool is set to NULL.
+ */
+void proc_init_nodeTopology(cpu_set_t cpuSet)
+{
+    HWThread* hwThreadPool;
+    FILE *fp;
+    bstring cpudir;
+    bstring file;
+    bstring src;
+    int (*ownatoi)(const char*);
+    ownatoi = &atoi;
+
+    hwThreadPool = (HWThread*) malloc(cpuid_topology.numHWThreads * sizeof(HWThread));
+    if (hwThreadPool == NULL)
+    {
+        ERROR_PLAIN_PRINT(Cannot allocate memory for the hardware thread pool);
+        cpuid_topology.threadPool = NULL;
+        return;
+    }
+    for (uint32_t i=0;i<cpuid_topology.numHWThreads;i++)
+    {
+        hwThreadPool[i].apicId = i;
+        hwThreadPool[i].threadId = -1;
+        hwThreadPool[i].coreId = -1;
+        hwThreadPool[i].packageId = -1;
+        hwThreadPool[i].inCpuSet = 1;
+        if (!CPU_ISSET(i, &cpuSet))
+        {
+            hwThreadPool[i].inCpuSet = 0;
+        }
+        cpudir = bformat("/sys/devices/system/cpu/cpu%d/topology",i);
+        file = bformat("%s/core_id", bdata(cpudir));
+        if (NULL != (fp = fopen (bdata(file), "r")))
+        {
+            src = bread ((bNread) fread, fp);
+            if (src != NULL)
+            {
+                hwThreadPool[i].coreId = ownatoi(bdata(src));
+            }
+            bdestroy(src);
+            fclose(fp);
+        }
+        bdestroy(file);
+        file = bformat("%s/physical_package_id", bdata(cpudir));
+        if (NULL != (fp = fopen (bdata(file), "r")))
+        {
+            src = bread ((bNread) fread, fp);
+            if (src != NULL)
+            {
+                hwThreadPool[i].packageId = ownatoi(bdata(src));
+            }
+            bdestroy(src);
+            fclose(fp);
+        }
+        bdestroy(file);
+        file = bformat("%s/thread_siblings_list", bdata(cpudir));
+        if (NULL != (fp = fopen (bdata(file), "r")))
+        {
+            src = bread ((bNread) fread, fp);
+            if (src != NULL)
+            {
+                hwThreadPool[i].threadId = get_listPosition(i, src);
+            }
+            bdestroy(src);
+            fclose(fp);
+        }
+        bdestroy(file);
+        /* The directory name is rebuilt for every CPU, so release it here. */
+        bdestroy(cpudir);
+        DEBUG_PRINT(DEBUGLEV_DEVELOP, PROC Thread Pool PU %d Thread %d Core %d Socket %d,
+                            hwThreadPool[i].apicId,
+                            hwThreadPool[i].threadId,
+                            hwThreadPool[i].coreId,
+                            hwThreadPool[i].packageId)
+    }
+    cpuid_topology.threadPool = hwThreadPool;
+    return;
+}
+
+/* Read the file <cachedir>/index<index>/<attr> and return its content with
+ * surrounding whitespace removed as a new bstring the caller must destroy.
+ * Returns NULL if the file cannot be opened or read. */
+static bstring proc_read_cache_attr(const_bstring cachedir, int index, const char* attr)
+{
+    FILE *fp;
+    bstring content = NULL;
+    bstring path = bformat("%s/index%d/%s", bdata(cachedir), index, attr);
+    if (path == NULL)
+    {
+        return NULL;
+    }
+    if (NULL != (fp = fopen (bdata(path), "r")))
+    {
+        content = bread ((bNread) fread, fp);
+        fclose(fp);
+        if (content != NULL)
+        {
+            btrimws(content);
+        }
+    }
+    bdestroy(path);
+    return content;
+}
+
+/*
+ * Fill cpuid_topology.cacheLevels / numCacheLevels from the sysfs cache
+ * description of CPU 0 (/sys/devices/system/cpu/cpu0/cache/index<N>). At most
+ * 10 entries are probed and counting stops at the first missing index.
+ * Attributes whose file is missing are recorded as 0 (type NOCACHE); the
+ * number of sets is then derived from size, associativity and line size if
+ * possible. The inclusive flag comes from CPUID or is fixed per CPU family.
+ */
+void proc_init_cacheTopology(void)
+{
+    CacheLevel* cachePool = NULL;
+    int nrCaches = 0;
+    bstring cpudir = bformat("/sys/devices/system/cpu/cpu0/cache");
+    bstring src;
+    int (*ownatoi)(const char*);
+    ownatoi = &atoi;
+
+    for (int i=0;i<10;i++)
+    {
+        src = proc_read_cache_attr(cpudir, i, "level");
+        if (src == NULL)
+        {
+            break;
+        }
+        bdestroy(src);
+        nrCaches++;
+    }
+
+    if (nrCaches > 0)
+    {
+        cachePool = (CacheLevel*) malloc(nrCaches * sizeof(CacheLevel));
+    }
+    if (cachePool == NULL)
+    {
+        /* No cache entries found or allocation failed: publish an empty list. */
+        bdestroy(cpudir);
+        cpuid_topology.numCacheLevels = 0;
+        cpuid_topology.cacheLevels = NULL;
+        return;
+    }
+
+    for (int i=0;i<nrCaches;i++)
+    {
+        /* Defaults for every attribute whose sysfs file is not available. */
+        cachePool[i].level = 0;
+        cachePool[i].type = NOCACHE;
+        cachePool[i].size = 0;
+        cachePool[i].associativity = 0;
+        cachePool[i].lineSize = 0;
+        cachePool[i].sets = 0;
+        cachePool[i].threads = 0;
+        cachePool[i].inclusive = 0;
+
+        if ((src = proc_read_cache_attr(cpudir, i, "level")) != NULL)
+        {
+            cachePool[i].level = ownatoi(bdata(src));
+            bdestroy(src);
+        }
+        if ((src = proc_read_cache_attr(cpudir, i, "type")) != NULL)
+        {
+            if (biseqcstr(src, "Data") == 1)
+            {
+                cachePool[i].type = DATACACHE;
+            }
+            else if (biseqcstr(src, "Instruction") == 1)
+            {
+                cachePool[i].type = INSTRUCTIONCACHE;
+            }
+            else if (biseqcstr(src, "Unified") == 1)
+            {
+                cachePool[i].type = UNIFIEDCACHE;
+            }
+            bdestroy(src);
+        }
+        if ((src = proc_read_cache_attr(cpudir, i, "size")) != NULL)
+        {
+            /* Drop the last character, which is taken to be the unit suffix,
+             * and scale the remaining number by 1024. */
+            bdelete(src, blength(src)-1, 1);
+            cachePool[i].size = ownatoi(bdata(src)) * 1024;
+            bdestroy(src);
+        }
+        if ((src = proc_read_cache_attr(cpudir, i, "ways_of_associativity")) != NULL)
+        {
+            cachePool[i].associativity = ownatoi(bdata(src));
+            bdestroy(src);
+        }
+        if ((src = proc_read_cache_attr(cpudir, i, "coherency_line_size")) != NULL)
+        {
+            cachePool[i].lineSize = ownatoi(bdata(src));
+            bdestroy(src);
+        }
+        if ((src = proc_read_cache_attr(cpudir, i, "number_of_sets")) != NULL)
+        {
+            cachePool[i].sets = ownatoi(bdata(src));
+            bdestroy(src);
+        }
+        else if ((cachePool[i].associativity * cachePool[i].lineSize) != 0)
+        {
+            /* sets = size / (ways * line size) */
+            cachePool[i].sets = cachePool[i].size /
+                (cachePool[i].associativity * cachePool[i].lineSize);
+        }
+        if ((src = proc_read_cache_attr(cpudir, i, "shared_cpu_list")) != NULL)
+        {
+            /* Only count the CPUs sharing this cache. */
+            cachePool[i].threads = fillList(NULL, 0, src);
+            bdestroy(src);
+        }
+
+        switch ( cpuid_info.family )
+        {
+            case MIC_FAMILY:
+            case P6_FAMILY:
+                cachePool[i].inclusive = readCacheInclusiveIntel(cachePool[i].level);
+                break;
+            case K16_FAMILY:
+            case K15_FAMILY:
+                cachePool[i].inclusive = readCacheInclusiveAMD(cachePool[i].level);
+                break;
+            /* For K8 and K10 it is known that they are inclusive */
+            case K8_FAMILY:
+            case K10_FAMILY:
+                cachePool[i].inclusive = 1;
+                break;
+            default:
+                ERROR_PLAIN_PRINT(Processor is not supported);
+                break;
+        }
+    }
+    bdestroy(cpudir);
+    cpuid_topology.numCacheLevels = nrCaches;
+    cpuid_topology.cacheLevels = cachePool;
+    return;
+}
+
diff --git a/src/tree.c b/src/tree.c
index 795dd17..4ce3249 100644
--- a/src/tree.c
+++ b/src/tree.c
@@ -5,13 +5,13 @@
  *
  *      Description:  Module implementing a tree data structure
  *
- *      Version:   3.1.3
- *      Released:  4.11.2014
+ *      Version:   4.0
+ *      Released:  16.6.2015
  *
- *      Author:  Jan Treibig (jt), jan.treibig at gmail.com
+ *      Author:   Jan Treibig (jt), jan.treibig at gmail.com
  *      Project:  likwid
  *
- *      Copyright (C) 2014 Jan Treibig
+ *      Copyright (C) 2015 RRZE, University Erlangen-Nuremberg
  *
  *      This program is free software: you can redistribute it and/or modify it under
  *      the terms of the GNU General Public License as published by the Free Software
@@ -48,32 +48,62 @@ tree_init(TreeNode** root, int id)
 void
 tree_print(TreeNode* nodePtr)
 {
-    int level = 0;
+  int level = 0;
+
+  if (nodePtr != NULL)
+  {
+
+    TreeNode* digger;
+    TreeNode* walker;
+
+    digger = nodePtr->llink;
+
+    while (digger != NULL)
+    {
+      printf("\n Level %d:\n", level++);
+      printf("%d ", digger->id);
+      walker = digger->rlink;
+
+      while (walker != NULL)
+      {
+        printf("%d ", walker->id);
+        walker = walker->rlink;
+      }
+
+      digger = digger->llink;
+    }
+
+    printf("\n ");
+  }
+}
+
+void
+tree_destroy(TreeNode* nodePtr)
+{
 
     if (nodePtr != NULL)
     {
 
         TreeNode* digger;
         TreeNode* walker;
+        TreeNode* tmp;
 
         digger = nodePtr->llink;
 
         while (digger != NULL)
         {
-            printf("\n Level %d:\n", level++);
-            printf("%d ", digger->id);
             walker = digger->rlink;
 
             while (walker != NULL)
             {
-            printf("%d ", walker->id);
-            walker = walker->rlink;
+                tmp = walker;
+                walker = walker->rlink;
+                free(tmp);
             }
-
+            tmp = digger;
             digger = digger->llink;
+            free(tmp);
         }
-
-        printf("\n ");
     }
 }
 
diff --git a/test/MPI_pin_test.c b/test/MPI_pin_test.c
index 5624a95..f0e1271 100644
--- a/test/MPI_pin_test.c
+++ b/test/MPI_pin_test.c
@@ -1,15 +1,46 @@
 #include <stdio.h>
+#include <stdlib.h>
 #include <unistd.h>
 #include <mpi.h>
+#include <sys/types.h>
+#include <string.h>
+#include <sys/syscall.h>
+
 #ifdef _OPENMP
 extern int omp_get_num_threads();
 extern int omp_get_thread_num();
 #endif
 
-#include <affinity.h>
+#include <sched.h>
+
+int get_cpu_id()
+{
+    int i;
+    int cpu_id = -1; /* returned when /proc/self/stat cannot be read or parsed */
+    /* Return the CPU id found in the current process' stat file of the proc filesystem */
+    FILE* procfile = fopen("/proc/self/stat", "r");
+    char buffer[8192];
+    char* line;
+    size_t nread;
+    if (procfile == NULL) return cpu_id;
+    nread = fread(buffer, sizeof(char), sizeof(buffer) - 1, procfile);
+    fclose(procfile);
+    buffer[nread] = '\0'; /* fread() does not terminate the data, strtok() needs it */
+    // Field with index 38 (zero-based counting) is the one we want
+    line = strtok(buffer, " ");
+    for (i = 1; i < 38 && line != NULL; i++)
+    {
+        line = strtok(NULL, " ");
+    }
+    if (line != NULL) line = strtok(NULL, " ");
+    if (line != NULL) cpu_id = atoi(line);
+    return cpu_id;
+}
 
+#define HOST_NAME_MAX 1024
 #define MASTER(msg) \
     if (rank == 0)  printf(#msg "\n")
+#define gettid() (int)syscall(SYS_gettid)
 
 main(int argc, char **argv)
 {
@@ -19,27 +50,31 @@ main(int argc, char **argv)
     MPI_Init(&argc,&argv);
     MPI_Comm_rank(MPI_COMM_WORLD, &rank);
     host = (char*) malloc(HOST_NAME_MAX * sizeof(char));
-    gethostname(host,HOST_NAME_MAX);
+    gethostname(host, HOST_NAME_MAX);
 
     MASTER(MPI started);
     MPI_Barrier(MPI_COMM_WORLD);
-    printf("Process with rank %d running on Node %s Core %d\n",rank ,host, likwid_getProcessorId());
-    fflush(stdout);
+    printf("Process with rank %d running on Node %s Core %d/%d\n",rank ,host, sched_getcpu(),get_cpu_id());
     MPI_Barrier(MPI_COMM_WORLD);
 
     MASTER(Enter OpenMP parallel region);
     MPI_Barrier(MPI_COMM_WORLD);
 #pragma omp parallel
     {
-        int coreId = likwid_getProcessorId();
+#pragma omp master
+        {
+            pid_t pid = getppid();
+            char cmd[1024];
+            sprintf(cmd, "pstree -p -H %d %d",pid, pid);
+            system(cmd);
+        }
 #pragma omp critical
         {
-            printf ("Rank %d Thread %d running on core %d \n",rank,omp_get_thread_num(), coreId);
-            fflush(stdout);
+            printf ("Rank %d Thread %d running on core %d/%d with pid %d and tid %d\n",rank,omp_get_thread_num(), sched_getcpu(),get_cpu_id(), getpid(),gettid());
         }
-    }
 
-    sleep(2);
+    }
 
+    free(host);
     MPI_Finalize();
 }
diff --git a/test/Makefile b/test/Makefile
index 56fece1..f7ee2c6 100644
--- a/test/Makefile
+++ b/test/Makefile
@@ -1,5 +1,7 @@
-LIKWID_LIB = -L../ -llikwid 
-INCLUDES = -I../src/includes -I../ -I../MIC
+include ../config.mk
+
+LIKWID_LIB = -L$(PREFIX)/lib -llikwid
+INCLUDES = -I$(PREFIX)/include
 
 all:  testmarker testmarkerF90 stream streamM
 
@@ -7,10 +9,10 @@ testmarkerF90: chaos.F90
 	ifort $(INCLUDES)  -O3  -o $@ chaos.F90 $(LIKWID_LIB) -lpthread
 
 stream: stream.c
-	icc -O3 $(INCLUDES) -mmic  -openmp  -o $@  -DLIKWID_PERFMON stream.c $(LIKWID_LIB) -lm
+	gcc -O3 -std=c99 $(INCLUDES) -fopenmp  -o $@  -DLIKWID_PERFMON stream.c $(LIKWID_LIB) -lm
 
 streamM: stream.c
-	gcc -O3 $(INCLUDES) -fopenmp  -o $@  -DLIKWID_PERFMON stream.c $(LIKWID_LIB) -lm
+	icc -O3 -std=c99 $(INCLUDES) -openmp  -o $@  -DLIKWID_PERFMON stream.c $(LIKWID_LIB) -lm
 
 testmarker:
 	gcc -O3 -std=c99  $(INCLUDES) -fopenmp -DLIKWID_PERFMON  -o $@ testmarker-cnt.c $(LIKWID_LIB) -lm
@@ -19,11 +21,11 @@ testmarker-omp:
 	gcc -O3 -std=c99  $(INCLUDES) -openmp -DLIKWID_PERFMON  -o $@ testmarker-omp.c $(LIKWID_LIB)
 
 test-mpi:
-	mpicc -DMAX_NUM_THREADS=128 -O2 -openmp -I../src/includes  -I../GCC -D_GNU_SOURCE  -o $@ MPI_pin_test.c $(LIKWID_LIB)
+	mpicc -O2 -fopenmp -D_GNU_SOURCE  -o $@ MPI_pin_test.c
 
-.PHONY: clean
+.PHONY: clean streamM test-mpi
 
 clean: 
-	rm -f stream streamM  testmarker testmarkerF90
+	rm -f stream streamM  testmarker testmarkerF90 test-mpi testmarker-omp
 
 
diff --git a/test/accuracy/Makefile b/test/accuracy/Makefile
index f84b1cd..144db67 100644
--- a/test/accuracy/Makefile
+++ b/test/accuracy/Makefile
@@ -1,25 +1,37 @@
 LIKWID_PATH=../..
+LIKWID_BENCH_PATH=../../bench
 LIKWID_APP=likwid-bench
 HOST=$(shell hostname -s)
 
 
-all: plain marker
+all: clean plain marker localize_likwid
 
 plain:
-	sed -i -e s/'INSTRUMENT_BENCH = .*#'/'INSTRUMENT_BENCH = false#'/g $(LIKWID_PATH)/config.mk
-	sed -i -e s/'CPPFLAGS := -DPAPI '/'CPPFLAGS := '/g $(LIKWID_PATH)/Makefile
-	cd $(LIKWID_PATH) && make distclean && make
-	cp $(LIKWID_PATH)/$(LIKWID_APP) $(LIKWID_APP)-plain
+	@echo "===> Building uninstrumented likwid-bench"
+	@sed -i -e s/'INSTRUMENT_BENCH = .*#'/'INSTRUMENT_BENCH = false#'/g $(LIKWID_PATH)/config.mk
+	@sed -i -e s/'CPPFLAGS := -DPAPI '/'CPPFLAGS := '/g $(LIKWID_PATH)/Makefile
+	@cd $(LIKWID_PATH) && make distclean >/dev/null && make >/dev/null 2>/dev/null
+	@cp $(LIKWID_BENCH_PATH)/$(LIKWID_APP) $(LIKWID_APP)-plain
 
 marker:
-	sed -i -e s/'INSTRUMENT_BENCH = .*#'/'INSTRUMENT_BENCH = true#'/g $(LIKWID_PATH)/config.mk
-	sed -i -e s/'CPPFLAGS := -DPAPI '/'CPPFLAGS := '/g $(LIKWID_PATH)/Makefile
-	cd $(LIKWID_PATH) && make distclean && make
-	cp $(LIKWID_PATH)/$(LIKWID_APP) $(LIKWID_APP)-marker
+	@echo "===> Building instrumented likwid-bench"
+	@sed -i -e s/'INSTRUMENT_BENCH = .*#'/'INSTRUMENT_BENCH = true#'/g $(LIKWID_PATH)/config.mk
+	@sed -i -e s/'CPPFLAGS := -DPAPI '/'CPPFLAGS := '/g $(LIKWID_PATH)/Makefile
+	@cd $(LIKWID_PATH) && make distclean >/dev/null && make >/dev/null 2>/dev/null
+	@cp $(LIKWID_BENCH_PATH)/$(LIKWID_APP) $(LIKWID_APP)-marker
+
 papi:
-	sed -i -e s/'INSTRUMENT_BENCH = .*#'/'INSTRUMENT_BENCH = false#'/g $(LIKWID_PATH)/config.mk
-	cp $(LIKWID_PATH)/Makefile $(LIKWID_PATH)/Makefile.orig
-	sed -i -e s/'CPPFLAGS := '/'CPPFLAGS := -DPAPI '/g $(LIKWID_PATH)/Makefile
-	cd $(LIKWID_PATH) && make distclean && make
-	cp $(LIKWID_PATH)/$(LIKWID_APP) $(LIKWID_APP)-papi
-	mv $(LIKWID_PATH)/Makefile.orig $(LIKWID_PATH)/Makefile
+	@echo "===> Building instrumented likwid-bench using PAPI API"
+	@sed -i -e s/'INSTRUMENT_BENCH = .*#'/'INSTRUMENT_BENCH = false#'/g $(LIKWID_PATH)/config.mk
+	@cp $(LIKWID_PATH)/Makefile $(LIKWID_PATH)/Makefile.orig
+	@sed -i -e s/'CPPFLAGS := '/'CPPFLAGS := -DPAPI '/g $(LIKWID_PATH)/Makefile
+	@cd $(LIKWID_PATH) && make distclean >/dev/null && make >/dev/null 2>/dev/null
+	@cp $(LIKWID_BENCH_PATH)/$(LIKWID_APP) $(LIKWID_APP)-papi
+	@mv $(LIKWID_PATH)/Makefile.orig $(LIKWID_PATH)/Makefile
+
+localize_likwid:
+	@cd $(LIKWID_PATH) && make local >/dev/null && cd - >/dev/null
+
+clean:
+	@echo "===> Cleaning old likwid-bench executables"
+	@rm -f $(LIKWID_APP)-plain $(LIKWID_APP)-marker $(LIKWID_APP)-papi
diff --git a/test/accuracy/README b/test/accuracy/README
index 9dd8a78..6baaa01 100644
--- a/test/accuracy/README
+++ b/test/accuracy/README
@@ -1,6 +1,6 @@
 LIKWID accuracy tester
 
-likwid-tester and likwid-tester-plot are test applications written in Perl. The likwid-accuracy.py application does the same but is written in Python.
+The likwid-accuracy.py application tests the accuracy of LIKWID's measurements. The tool is written in Python. The measurements are compared to an instrumented version of likwid-bench. By scaling the calculated likwid-bench results, it also takes write-allocates into account.
 
 Usage:
 make #build non-instrumentated and LIKWID-instrumentated versions of
@@ -9,10 +9,11 @@ Adjust test files in TESTS.
 Adjust test set file SET.txt or use the -s/--sets switch on commandline.
 likwid-accuracy.py #Runs the tests of all sets and saves results in folder RESULTS/<hostname>
 
+Pass at least one plotting option (--pgf, --grace or --gnuplot) on the command line to generate plots.
+
 Options for likwid-accuracy.py:
 --pgf: Create a TeX file containing the definition of a PGF plot with suffix .tex -> .pdf
 --grace: Create grace batch file for further manipulation with XMgrace or create plot with gracebat .agr/.bat -> .png
 --gnuplot: Create GNUplot script .plot -> .jpg
---script: Create a Bash script containing all commands to create all plots using pdflatex, gracebat and gnuplot.
+--script: Create a Bash script containing all commands to create all plots using pdflatex, gracebat and/or gnuplot.
 --scriptname: Set name for Bash script, default is $CWD/create_plots.sh
---wiki/--only_wiki: Create a Wiki page for the Google Code Wiki including the .png pics found in Google Code Wiki picture path (http://<project>.googlecode.com/svn/wiki/images). 
diff --git a/test/accuracy/TESTS/FLOPS_AVX.txt b/test/accuracy/TESTS/FLOPS_AVX.txt
index f5ce80e..ff7bbe8 100644
--- a/test/accuracy/TESTS/FLOPS_AVX.txt
+++ b/test/accuracy/TESTS/FLOPS_AVX.txt
@@ -1,12 +1,6 @@
 REGEX_BENCH MFlops\/s:\s+([0-9]+)
-REGEX_PERF \|\s+DP MFlops\/s\s+\|\s+([0-9]+)
+REGEX_PERF \|\s+Packed DP MFlops\/s\s+\|\s+([0-9\.e\+\-]+)
 
-TEST stream_avx
-RUNS 10
-VARIANT 24kB 20000
-VARIANT 128kB 10000
-VARIANT  2MB 5000
-VARIANT  1GB 50
 
 TEST triad_avx
 RUNS 10
diff --git a/test/accuracy/TESTS/FLOPS_DP.txt b/test/accuracy/TESTS/FLOPS_DP.txt
index da6f8be..2e7219d 100644
--- a/test/accuracy/TESTS/FLOPS_DP.txt
+++ b/test/accuracy/TESTS/FLOPS_DP.txt
@@ -1,5 +1,5 @@
 REGEX_BENCH MFlops\/s:\s+([0-9]+)
-REGEX_PERF \|\s+MFlops\/s\s+\|\s+([0-9]+)
+REGEX_PERF \|\s+MFlops\/s\s+\|\s+([0-9\.e\+\-]+)
 
 TEST stream
 RUNS 10
diff --git a/test/accuracy/TESTS/FLOPS_SP.txt b/test/accuracy/TESTS/FLOPS_SP.txt
index 3bad7d7..62e728a 100644
--- a/test/accuracy/TESTS/FLOPS_SP.txt
+++ b/test/accuracy/TESTS/FLOPS_SP.txt
@@ -1,17 +1,11 @@
 REGEX_BENCH MFlops\/s:\s+([0-9]+)
-REGEX_PERF \|\s+MFlops\/s\s+\|\s+([0-9]+)
+REGEX_PERF \|\s+MFlops\/s\s+\|\s+([0-9\.e\+\-]+)
 
-TEST stream_sp
+TEST sum
 RUNS 10
 VARIANT 24kB 20000
 VARIANT 128kB 10000
 VARIANT  2MB 5000
 VARIANT  1GB 50
 
-TEST triad_sp
-RUNS 10
-VARIANT 24kB 20000
-VARIANT 128kB 10000
-VARIANT  2MB 5000
-VARIANT  1GB 50
 
diff --git a/test/accuracy/TESTS/L2.txt b/test/accuracy/TESTS/L2.txt
index 35b2bea..586e206 100644
--- a/test/accuracy/TESTS/L2.txt
+++ b/test/accuracy/TESTS/L2.txt
@@ -1,8 +1,9 @@
 REGEX_BENCH MByte\/s:\s+([0-9]+)
-REGEX_PERF \|\s+L2 bandwidth \[MBytes\/s\]\s+\|\s+([0-9]+)
+REGEX_PERF \|\s+L2 bandwidth \[MBytes\/s\]\s+\|\s+([0-9\.e\+\-]+)
 
 TEST load
 RUNS 5
+WA_FACTOR 1.0
 VARIANT 12kB 20000
 VARIANT 1MB 10000
 VARIANT  4MB 7500
@@ -10,6 +11,7 @@ VARIANT  1GB 50
 
 TEST store
 RUNS 5
+WA_FACTOR 2.0
 VARIANT 12kB 20000
 VARIANT 1MB 10000
 VARIANT  4MB 7500
@@ -17,6 +19,7 @@ VARIANT  1GB 50
 
 TEST copy
 RUNS 5
+WA_FACTOR 1.5
 VARIANT 12kB 20000
 VARIANT 1MB 10000
 VARIANT  4MB 7500
@@ -24,6 +27,7 @@ VARIANT  1GB 50
 
 TEST stream
 RUNS 5
+WA_FACTOR 1.3333
 VARIANT 12kB 20000
 VARIANT 1MB 10000
 VARIANT  4MB 7500
@@ -31,6 +35,7 @@ VARIANT  1GB 50
 
 TEST triad
 RUNS 5
+WA_FACTOR 1.25
 VARIANT 12kB 20000
 VARIANT 1MB 10000
 VARIANT  4MB 7500
diff --git a/test/accuracy/TESTS/L3.txt b/test/accuracy/TESTS/L3.txt
index 8ff6c62..c0b06cf 100644
--- a/test/accuracy/TESTS/L3.txt
+++ b/test/accuracy/TESTS/L3.txt
@@ -1,8 +1,9 @@
 REGEX_BENCH MByte\/s:\s+([0-9]+)
-REGEX_PERF \|\s+L3 bandwidth \[MBytes\/s\]\s+\|\s+([0-9]+)
+REGEX_PERF \|\s+L3 bandwidth \[MBytes\/s\]\s+\|\s+([0-9\.e\+\-]+)
 
 TEST load
 RUNS 5
+WA_FACTOR 1.0
 VARIANT 12kB 20000
 VARIANT 1MB 10000
 VARIANT  4MB 7500
@@ -10,6 +11,7 @@ VARIANT  1GB 50
 
 TEST store
 RUNS 5
+WA_FACTOR 2.0
 VARIANT 12kB 20000
 VARIANT 1MB 10000
 VARIANT  4MB 2000
@@ -17,6 +19,7 @@ VARIANT  1GB 50
 
 TEST copy
 RUNS 5
+WA_FACTOR 1.5
 VARIANT 12kB 20000
 VARIANT 1MB 10000
 VARIANT  4MB 2000
@@ -24,6 +27,7 @@ VARIANT  1GB 50
 
 TEST stream
 RUNS 5
+WA_FACTOR 1.333
 VARIANT 12kB 20000
 VARIANT 1MB 10000
 VARIANT  4MB 2000
@@ -31,6 +35,7 @@ VARIANT  1GB 50
 
 TEST triad
 RUNS 5
+WA_FACTOR 1.25
 VARIANT 12kB 20000
 VARIANT 1MB 10000
 VARIANT  4MB 2000
diff --git a/test/accuracy/TESTS/MEM.txt b/test/accuracy/TESTS/MEM.txt
index 09993f6..3978d83 100644
--- a/test/accuracy/TESTS/MEM.txt
+++ b/test/accuracy/TESTS/MEM.txt
@@ -1,8 +1,9 @@
 REGEX_BENCH MByte\/s:\s+([0-9]+)
-REGEX_PERF \|\s+Memory bandwidth \[MBytes\/s\]\s+\|\s+([0-9]+)
+REGEX_PERF \|\s+Memory bandwidth \[MBytes\/s\]\s+\|\s+([0-9\.e\+\-]+)
 
 TEST load
 RUNS 10
+WA_FACTOR 1.0
 VARIANT 24kB 20000
 VARIANT 128kB 10000
 VARIANT  2MB 7500
@@ -10,6 +11,7 @@ VARIANT  1GB 50
 
 TEST store
 RUNS 10
+WA_FACTOR 2.0
 VARIANT 24kB 20000
 VARIANT 128kB 10000
 VARIANT  2MB 7500
@@ -17,6 +19,7 @@ VARIANT  1GB 50
 
 TEST copy
 RUNS 10
+WA_FACTOR 1.5
 VARIANT 24kB 20000
 VARIANT 128kB 10000
 VARIANT  2MB 7500
@@ -24,6 +27,7 @@ VARIANT  1GB 50
 
 TEST stream
 RUNS 10
+WA_FACTOR 1.3333
 VARIANT 24kB 20000
 VARIANT 128kB 10000
 VARIANT  2MB 7500
@@ -31,6 +35,7 @@ VARIANT  1GB 50
 
 TEST triad
 RUNS 10
+WA_FACTOR 1.25
 VARIANT 24kB 20000
 VARIANT 128kB 10000
 VARIANT  2MB 7500
diff --git a/test/accuracy/likwid-accuracy.py b/test/accuracy/likwid-accuracy.py
index 3d2d63c..534007b 100755
--- a/test/accuracy/likwid-accuracy.py
+++ b/test/accuracy/likwid-accuracy.py
@@ -24,10 +24,10 @@ testlist = "SET.txt"
 testfolder = "TESTS"
 resultfolder = "RESULTS"
 hostname = socket.gethostname()
-picture_base = "http://likwid.googlecode.com/svn/wiki/images"
+picture_base = ".."
 
-gnu_colors = ["red","blue","green"]#,"black","brown", "gray","violet", "cyan", "magenta","orange","#4B0082","#800000","turquoise","#006400","yellow"]
-gnu_marks = [5,13,9]#,2,3,4,6,7,8,9,10,11,12,14,15]
+gnu_colors = ["red","blue","green","black"]#,"brown", "gray","violet", "cyan", "magenta","orange","#4B0082","#800000","turquoise","#006400","yellow"]
+gnu_marks = [5,13,9,2]#,3,4,6,7,8,9,10,11,12,14,15]
 
 wiki = False
 papi = False
@@ -38,14 +38,27 @@ out_gnuplot = False
 out_grace = False
 scriptfilename = "create_plots.sh"
 out_script = False
+test_set = {}
+plain_set = {}
+corrected_set = {}
+marker_set = {}
+papi_set = {}
+
+if not os.path.exists(bench_plain) or not os.path.exists(bench_marker):
+    print "Please run make before using likwid-accuracy.py"
+    sys.exit(1)
+if not os.path.exists(perfctr):
+    print "Cannot find likwid-perfctr"
+    sys.exit(1)
+
 
 def usage():
     print "Execute and evaluate accuracy tests for LIKWID with likwid-bench and likwid-perfctr"
     print
     print "-h/--help:\tPrint this help text"
     print "-s/--sets:\tSpecifiy testgroups (comma separated). Can also be set in SET.txt"
-    print "--wiki:\t\tBesides testing write out results in Google code wiki syntax"
-    print "--only_wiki:\tDo not run benchmarks, read results from file and write out results in Google code wiki syntax"
+#    print "--wiki:\t\tBesides testing write out results in Google code wiki syntax"
+#    print "--only_wiki:\tDo not run benchmarks, read results from file and write out results in Google code wiki syntax"
     print "Picture options:"
     print "--pgf:\t\tCreate TeX document for each test with PGFPlot"
     print "--gnuplot:\tCreate GNUPlot script for each test"
@@ -87,8 +100,11 @@ def get_groups():
     for line in p.stdout.read().split("\n"):
         if line.startswith("-") or not line.strip(): continue
         if line.startswith("Available"): continue
-        name, description = line.split(":")
-        groups[name.strip()] = description.strip()
+        # First whitespace-separated token is the group name, the rest is its description
+        linelist = re.split(r"\s+", line.strip())
+        name = linelist[0]
+        description = " ".join(linelist[1:])
+        groups[name] = description
     return groups
 
 def get_test_groups(groupdict):
@@ -130,7 +146,7 @@ def get_values_from_file(file, lineoffset, linecount):
             print line
     return results
 
-def write_pgf(group, test, plain_file, marker_file, papi_file=None, execute=False, script=None):
+def write_pgf(group, test, plain_file, correct_file, marker_file, papi_file=None, execute=False, script=None):
     filename = os.path.join(os.path.join(resultfolder,hostname),group+"_"+test+".tex")
     fp = open(filename,'w')
     fp.write("\documentclass{article}\n")
@@ -141,10 +157,12 @@ def write_pgf(group, test, plain_file, marker_file, papi_file=None, execute=Fals
     fp.write("\\begin{axis}[xlabel={Run}, ylabel={MFlops/s / MBytes/s},title={%s\_%s},legend pos=south east,xtick=data,width=.75\\textwidth]\n" % (group.replace("_","\_"),test.replace("_","\_"),))
     fp.write("\\addplot+[red,mark=square*,mark options={draw=red, fill=red}] table {%s};\n" % (os.path.basename(plain_file),))
     fp.write("\\addlegendentry{plain};\n")
-    fp.write("\\addplot+[blue,mark=diamond*,mark options={draw=blue, fill=blue}] table {%s};\n" % (os.path.basename(marker_file),))
+    fp.write("\\addplot+[blue,mark=*,mark options={draw=blue, fill=blue}] table {%s};\n" % (os.path.basename(correct_file),))
+    fp.write("\\addlegendentry{corrected};\n")
+    fp.write("\\addplot+[green,mark=diamond*,mark options={draw=green, fill=green}] table {%s};\n" % (os.path.basename(marker_file),))
     fp.write("\\addlegendentry{marker};\n")
     if papi and papi_file:
-        fp.write("\\addplot+[green,mark=triangle*,mark options={draw=green, fill=green}] table {%s};\n" % (os.path.basename(papi_file),))
+        fp.write("\\addplot+[black,mark=triangle*,mark options={draw=black, fill=black}] table {%s};\n" % (os.path.basename(papi_file),))
         fp.write("\\addlegendentry{papi};\n")
     fp.write("\\end{axis}\n")
     fp.write("\\end{tikzpicture}\n")
@@ -162,7 +180,7 @@ def write_pgf(group, test, plain_file, marker_file, papi_file=None, execute=Fals
         script.write("pdflatex %s\n" % (os.path.basename(filename),))
     return filename
     
-def write_gnuplot(group, test, plain_file, marker_file, papi_file, execute=False, script=None):
+def write_gnuplot(group, test, plain_file, correct_file, marker_file, papi_file=None, execute=False, script=None):
     filename = os.path.join(os.path.join(resultfolder,hostname),group+"_"+test+".plot")
     fp = open(filename,'w')
     for i,color in enumerate(gnu_colors):
@@ -170,12 +188,15 @@ def write_gnuplot(group, test, plain_file, marker_file, papi_file, execute=False
     fp.write("set terminal jpeg\n")
     fp.write("set title '%s_%s'\n" % (group, test,))
     fp.write("set output '%s'\n" % (os.path.basename(os.path.join(os.path.join(resultfolder,hostname),group+"_"+test+".jpg")),))
-    fp.write("set xlabel 'Run'\n")
+    fp.write("set xlabel 'Size - %d runs each'\n" % (test_set[group][test]["RUNS"],))
     fp.write("set ylabel 'MFlops/s / MBytes/s'\n")
-    #fp.write("set xtics 1\n")
-    plot_string = "plot '%s' using 1:2 title 'plain' with linespoints ls 1, \\\n '%s' using 1:2 title 'marker' with linespoints ls 2" % (os.path.basename(plain_file), os.path.basename(marker_file),)
+    #fp.write("set xtics 0,%d,%d\n" % (test_set[group][test]["RUNS"], test_set[group][test]["RUNS"]*len(test_set[group][test]["variants"]),))
+    fp.write("set xtics %d\n" % (test_set[group][test]["RUNS"]*len(test_set[group][test]["variants"]),))
+    for i,variant in enumerate(test_set[group][test]["variants"]):
+        fp.write("set xtics add (\"%s\" %f)\n" % (variant, (i*test_set[group][test]["RUNS"])+(0.5*test_set[group][test]["RUNS"]),))
+    plot_string = "plot '%s' using 1:2 title 'plain' with linespoints ls 1, \\\n '%s' using 1:2 title 'corrected' with linespoints ls 2, \\\n '%s' using 1:2 title 'marker' with linespoints ls 3" % (os.path.basename(plain_file), os.path.basename(correct_file), os.path.basename(marker_file),)
     if papi and papi_file:
-        plot_string += ", \\\n '%s' using 1:2 title 'papi' with linespoints ls 3\n" % (os.path.basename(papi_file),)
+        plot_string += ", \\\n '%s' using 1:2 title 'papi' with linespoints ls 4\n" % (os.path.basename(papi_file),)
     fp.write(plot_string+"\n")
     fp.close()
     if execute:
@@ -190,17 +211,18 @@ def write_gnuplot(group, test, plain_file, marker_file, papi_file, execute=False
         script.write("gnuplot %s\n" % (os.path.basename(filename),))
     return filename
 
-def write_grace(group, test, plain_file, marker_file, papi_file=None, execute=False, script=None):
+def write_grace(group, test, plain_file, correct_file, marker_file, papi_file=None, execute=False, script=None):
     filename = os.path.join(os.path.join(resultfolder,hostname),group+"_"+test+".bat")
     agrname = os.path.join(os.path.join(resultfolder,hostname),group+"_"+test+".agr")
     pngname = os.path.join(os.path.join(resultfolder,hostname),group+"_"+test+".png")
     if execute or script:
         plain_file = os.path.basename(plain_file)
         marker_file = os.path.basename(marker_file)
+        correct_file = os.path.basename(correct_file)
         if papi_file: papi_file = os.path.basename(papi_file)
         pngname = os.path.basename(pngname)
         agrname = os.path.basename(agrname)
-    cmd_options = "-autoscale xy -nxy %s -nxy %s "% (plain_file,marker_file,)
+    cmd_options = "-autoscale xy -nxy %s -nxy %s -nxy %s " % (plain_file, correct_file, marker_file,)
     if papi and papi_file:
         cmd_options += "-nxy %s " % (papi_file,)
     out_options = "-hdevice PNG -printfile %s " % (pngname,)
@@ -228,7 +250,7 @@ def write_grace(group, test, plain_file, marker_file, papi_file=None, execute=Fa
     fp.write("s0 line linestyle 1\n")
     fp.write("s0 line linewidth 2\n")
     fp.write("s0 line pattern 1\n")
-    fp.write("s1 legend \"marker\"\n")
+    fp.write("s1 legend \"corrected\"\n")
     fp.write("s1 symbol 3\n")
     fp.write("s1 symbol size 1\n")
     fp.write("s1 symbol color 4\n")
@@ -242,21 +264,35 @@ def write_grace(group, test, plain_file, marker_file, papi_file=None, execute=Fa
     fp.write("s1 line linestyle 1\n")
     fp.write("s1 line linewidth 2\n")
     fp.write("s1 line pattern 1\n")
+    fp.write("s2 legend \"marker\"\n")
+    fp.write("s2 symbol 4\n")
+    fp.write("s2 symbol size 1\n")
+    fp.write("s2 symbol color 3\n")
+    fp.write("s2 symbol pattern 1\n")
+    fp.write("s2 symbol fill color 3\n")
+    fp.write("s2 symbol fill pattern 1\n")
+    fp.write("s2 symbol linewidth 2\n")
+    fp.write("s2 symbol linestyle 1\n")
+    fp.write("s2 line type 1\n")
+    fp.write("s2 line color 3\n")
+    fp.write("s2 line linestyle 1\n")
+    fp.write("s2 line linewidth 2\n")
+    fp.write("s2 line pattern 1\n")
     if papi and papi_file:
-        fp.write("s2 legend \"papi\"\n")
-        fp.write("s2 symbol 4\n")
-        fp.write("s2 symbol size 1\n")
-        fp.write("s2 symbol color 3\n")
-        fp.write("s2 symbol pattern 1\n")
-        fp.write("s2 symbol fill color 3\n")
-        fp.write("s2 symbol fill pattern 1\n")
-        fp.write("s2 symbol linewidth 2\n")
-        fp.write("s2 symbol linestyle 1\n")
-        fp.write("s2 line type 1\n")
-        fp.write("s2 line color 3\n")
-        fp.write("s2 line linestyle 1\n")
-        fp.write("s2 line linewidth 2\n")
-        fp.write("s2 line pattern 1\n")
+        fp.write("s3 legend \"papi\"\n")
+        fp.write("s3 symbol 5\n")
+        fp.write("s3 symbol size 1\n")
+        fp.write("s3 symbol color \"black\"\n")
+        fp.write("s3 symbol pattern 1\n")
+        fp.write("s3 symbol fill color \"black\"\n")
+        fp.write("s3 symbol fill pattern 1\n")
+        fp.write("s3 symbol linewidth 2\n")
+        fp.write("s3 symbol linestyle 1\n")
+        fp.write("s3 line type 1\n")
+        fp.write("s3 line color \"black\"\n")
+        fp.write("s3 line linestyle 1\n")
+        fp.write("s3 line linewidth 2\n")
+        fp.write("s3 line pattern 1\n")
     fp.close()
     if execute:
         cmd = "cd %s && gracebat %s -param %s %s && cd -" % (os.path.dirname(filename), cmd_options, os.path.basename(filename),out_options,)
@@ -303,23 +339,27 @@ for o, a in opts:
     if o == "--scriptname":
         scriptfilename = a
 
-if not os.path.exists(testlist):
+if len(sets) == 0 and not os.path.exists(testlist):
     print "Cannot find file %s containing list of testgroups" % (testlist,)
     sys.exit(1)
 if not os.path.exists(testfolder):
     print "Cannot find folder %s containing the testgroups" % (testfolder,)
     sys.exit(1)
 
-test_set = {}
-plain_set = {}
-marker_set = {}
-papi_set = {}
-fp = open(testlist,'r')
-for line in fp.read().split("\n"):
+
+if len(sets) == 0:
+    fp = open(testlist,'r')
+    tmp = fp.read().split("\n")
+    for item in tmp:
+        if not item.strip() or item.startswith("#"): continue
+        sets.append(item)
+    fp.close()
+for line in sets:
     if not line.strip() or line.startswith("#"): continue
     if os.path.exists("%s/%s.txt" % (testfolder,line.strip(),)):
         test_set[line.strip()] = {}
         plain_set[line.strip()] = {}
+        corrected_set[line.strip()] = {}
         marker_set[line.strip()] = {}
         papi_set[line.strip()] = {}
         testfp = open("%s/%s.txt" % (testfolder,line.strip(),),'r')
@@ -335,11 +375,15 @@ for line in fp.read().split("\n"):
             if testline.startswith("TEST"):
                 test = testline.split(" ")[1]
                 test_set[line.strip()][test] = {}
+                test_set[line.strip()][test]["WA_FACTOR"] = 1.0
                 plain_set[line.strip()][test] = {}
+                corrected_set[line.strip()][test] = {}
                 marker_set[line.strip()][test] = {}
                 papi_set[line.strip()][test] = {}
             if testline.startswith("RUNS") and test:
                 test_set[line.strip()][test]["RUNS"] = int(testline.split(" ")[1])
+            if testline.startswith("WA_FACTOR") and test:
+                test_set[line.strip()][test]["WA_FACTOR"] = float(testline.split(" ")[1])
             if testline.startswith("VARIANT") and test:
                 linelist = re.split("\s+",testline);
                 variant = linelist[1]
@@ -348,10 +392,11 @@ for line in fp.read().split("\n"):
                 test_set[line.strip()][test][variant] = linelist[2]
                 test_set[line.strip()][test]["variants"].append(linelist[1])
                 plain_set[line.strip()][test][variant] = []
+                corrected_set[line.strip()][test][variant] = []
                 marker_set[line.strip()][test][variant] = []
                 papi_set[line.strip()][test][variant] = []
         testfp.close()
-fp.close()
+
 
 
 if len(test_set.keys()) == 0:
@@ -374,10 +419,12 @@ if not only_wiki:
             if test.startswith("REGEX"): continue
             file_plain = os.path.join(os.path.join(resultfolder,hostname),group+"_"+test+"_plain.dat")
             raw_plain = os.path.join(os.path.join(resultfolder,hostname),group+"_"+test+"_plain.raw")
+            file_correct = os.path.join(os.path.join(resultfolder,hostname),group+"_"+test+"_correct.dat")
             file_marker = os.path.join(os.path.join(resultfolder,hostname),group+"_"+test+"_marker.dat")
             raw_marker = os.path.join(os.path.join(resultfolder,hostname),group+"_"+test+"_marker.raw")
             outfp_plain = open(file_plain,'w')
             rawfp_plain = open(raw_plain,'w')
+            outfp_correct = open(file_correct,'w')
             outfp_marker = open(file_marker,'w')
             rawfp_marker = open(raw_marker,'w')
             if papi:
@@ -391,9 +438,11 @@ if not only_wiki:
             counter = 1
             for size in test_set[group][test]["variants"]:
                 if size.startswith("RUNS"): continue
-                bench_options = "-t %s -i %s -g 1 -w N:%s:1" % (test, test_set[group][test][size], size,)
+                bench_options = "-t %s -w S0:%s:1" % (test, size,)
                 for i in range(0,test_set[group][test]["RUNS"]):
                     # Run with plain likwid-bench
+                    print "*",
+                    sys.stdout.flush()
                     p = subprocess.Popen(bench_plain+" "+bench_options, shell=True, stdout=subprocess.PIPE,stderr=subprocess.STDOUT)
                     try:
                         p.wait()
@@ -402,11 +451,14 @@ if not only_wiki:
                     except:
                         sys.exit(1)
                     for line in stdout.split("\n"):
-                        if p.returncode != 0: print line
+                        #if p.returncode != 0: print line
                         match = test_set[group]["REGEX_BENCH"].match(line)
                         if match:
+                            value = float(match.group(1)) * test_set[group][test]["WA_FACTOR"]
                             plain_set[group][test][size].append(match.group(1))
+                            corrected_set[group][test][size].append(str(value))
                             outfp_plain.write(str(counter)+" "+match.group(1)+"\n")
+                            outfp_correct.write(str(counter)+" "+str(value)+"\n")
                         rawfp_plain.write(line+"\n")
                     # Run with papi instrumented likwid-bench
                     if papi:
@@ -419,7 +471,7 @@ if not only_wiki:
                         except:
                             sys.exit(1)
                         for line in stdout.split("\n"):
-                            if p.returncode != 0: print line
+                            #if p.returncode != 0: print line
                             match = test_set[group]["REGEX_PAPI"].match(line)
                             if match:
                                 papi_set[group][test][size].append(match.group(1))
@@ -435,99 +487,107 @@ if not only_wiki:
                     except:
                         sys.exit(1)
                     for line in stdout.split("\n"):
-                        if p.returncode != 0: print line
+                        #if p.returncode != 0: print line
                         match = test_set[group]["REGEX_PERF"].match(line)
                         if match:
                             marker_set[group][test][size].append(float(match.group(1)))
                             outfp_marker.write(str(counter)+" "+str(float(match.group(1)))+"\n")
                         rawfp_marker.write(line+"\n")
                     counter += 1
+                print("")
             outfp_plain.close()
             rawfp_plain.close()
+            outfp_correct.close()
             outfp_marker.close()
             rawfp_marker.close()
             if papi:
                 outfp_papi.close()
                 rawfp_papi.close()
-            if out_pgf: pgf_file = write_pgf(group, test, file_plain, file_marker, file_papi, script=script)
-            if out_gnuplot: plot_file = write_gnuplot(group, test, file_plain, file_marker, file_papi, script=script)
-            if out_grace: grace_file = write_grace(group, test, file_plain, file_marker, file_papi, script=script)
+            if out_pgf: pgf_file = write_pgf(group, test, file_plain, file_correct, file_marker, file_papi, script=script)
+            if out_gnuplot: plot_file = write_gnuplot(group, test, file_plain, file_correct, file_marker, file_papi, script=script)
+            if out_grace: grace_file = write_grace(group, test, file_plain, file_correct, file_marker, file_papi, script=script)
 
 
     script.close()
     os.chmod(scriptfile, stat.S_IRWXU)
-#if only_wiki:
-#    for group in test_set.keys():
-#        for test in test_set[group].keys():
-#            if test.startswith("REGEX"): continue
-#            filename = os.path.join(os.path.join(resultfolder,hostname),group+"_"+test+"_plain.dat")
-#            for i,size in enumerate(test_set[group][test]["variants"]):
-#                start = i*test_set[group][test]["RUNS"]
-#                end = (i+1)*test_set[group][test]["RUNS"]
-#                runs = test_set[group][test]["RUNS"]
-#                print "Read file %s for size %s from %d to %d" % (filename,size, start, end,)
-#                plain_set[group][test][size] = get_values_from_file(filename, start, runs)
-#                if len(plain_set[group][test][size]) == 0: plain_set[group][test][size].append(0)
-#            filename = os.path.join(os.path.join(resultfolder,hostname),group+"_"+test+"_marker.dat")
-#            for i,size in enumerate(test_set[group][test]["variants"]):
-#                start = i*test_set[group][test]["RUNS"]
-#                end = (i+1)*test_set[group][test]["RUNS"]
-#                runs = test_set[group][test]["RUNS"]
-#                print "Read file %s for size %s from %d to %d" % (filename,size, start, end,)
-#                marker_set[group][test][size] = get_values_from_file(filename, start, runs)
-#                if len(marker_set[group][test][size]) == 0: marker_set[group][test][size].append(0)
+if only_wiki:
+    for group in test_set.keys():
+        for test in test_set[group].keys():
+            if test.startswith("REGEX"): continue
+            filename = os.path.join(os.path.join(resultfolder,hostname),group+"_"+test+"_plain.dat")
+            for i,size in enumerate(test_set[group][test]["variants"]):
+                start = i*test_set[group][test]["RUNS"]
+                end = (i+1)*test_set[group][test]["RUNS"]
+                runs = test_set[group][test]["RUNS"]
+                print "Read file %s for size %s from %d to %d" % (filename,size, start, end,)
+                plain_set[group][test][size] = get_values_from_file(filename, start, runs)
+                if len(plain_set[group][test][size]) == 0: plain_set[group][test][size].append(0)
+            filename = os.path.join(os.path.join(resultfolder,hostname),group+"_"+test+"_marker.dat")
+            for i,size in enumerate(test_set[group][test]["variants"]):
+                start = i*test_set[group][test]["RUNS"]
+                end = (i+1)*test_set[group][test]["RUNS"]
+                runs = test_set[group][test]["RUNS"]
+                print "Read file %s for size %s from %d to %d" % (filename,size, start, end,)
+                marker_set[group][test][size] = get_values_from_file(filename, start, runs)
+                if len(marker_set[group][test][size]) == 0: marker_set[group][test][size].append(0)
 
 
 if wiki or only_wiki:
     name, sockets, corespersocket, threadspercore = get_system_info();
     groups = get_groups()
     testable_groups = get_test_groups(groups)
-    #print groups
-    #print testable_groups
-    #if testable_groups.has_key("FLOPS_DP"): del testable_groups["FLOPS_DP"]
+    if testable_groups.has_key("FLOPS_DP"): del testable_groups["FLOPS_DP"]
 
-    print "#summary Accuracy Tests for %s\n" % (name,)
-    print "= Hardware description ="
+    print "# Accuracy Tests for %s\n" % (name,)
+    print "## Hardware description"
     print "Sockets: %d<br>" % (sockets,)
     print "Cores per socket: %d<br>" % (corespersocket,)
     print "Threads per core: %d<br>" % (threadspercore,)
     print "Total number of processing units: %d<br>" % (sockets * corespersocket * threadspercore)
     print
-    print "= Available groups ="
-    print "Each architecture defines a different set of groups. Here all the groups available for the %s are listed:<br>" % (name,)
+    print "## Available groups"
+    print "Each architecture defines a different set of performance groups. These groups help users to measure their derived metrics. Besides the event and counter defintion, a performance groups contains derived metrics that are calculated based on the measured data.<br>Here all the groups available for the %s are listed:<br>\n" % (name,)
+    print "| Name | Description |"
+    print "| ---- | ----------- |"
     for grp in groups.keys():
-        print "%s: %s<br>" % (grp, groups[grp],)
+        print "| %s | %s |" % (grp, groups[grp],)
     print
-    print "= Available verification tests ="
-    print "Not all groups can be tested for accuracy. Here only the groups are listed that can be verified. Each group is followed by the low-level benchmarks that are performed for comparison.<br>"
+    print "## Available verification tests"
+    print "Not all groups can be tested for accuracy. We don't have a test application for each performance group. Here only the groups are listed that can be verified. Each group is followed by the low-level benchmarks that are performed for comparison.<br>\n"
     #print testable_groups
+    print "| Group | Tests |"
+    print "|-------|-------|"
     for grp in testable_groups.keys():
-        print "%s: %s<br>" % (grp, ", ".join (testable_groups[grp]))
+        print "| %s | %s |" % (grp, ", ".join (testable_groups[grp]))
     print
-    print "= Accuracy comparison ="
-    print "For each varification group, the tests are performed twice. Once in a plain manner without measuring but calculating the resulting values and once through an instumented code with LIKWID.<br>"
+    print "## Accuracy comparison"
+    print "For each varification group, the tests are performed twice. Once in a plain manner without measuring but calculating the resulting values and once through an instumented code with LIKWID.<br>\n"
     
     
     for grp in testable_groups.keys():
-        print "== Verification of Group %s ==" % (grp,)
+        print "### Verification of Group %s" % (grp,)
         for test in testable_groups[grp]:
             #print grp, test, test_set[grp][test]
-            print "=== Verification of Group %s with Test %s ===" % (grp, test,)
-            print "|| *Stream size* || *Iterations* ||"
+            print "#### Verification of Group %s with Test %s\n" % (grp, test,)
+            print "| *Stream size* | *Iterations* |"
+            print "|---------------|--------------|"
             for variant in test_set[grp][test]["variants"]:
-                print "|| %s || %s ||" % (variant, test_set[grp][test][variant], )
+                print "| %s | %s |" % (variant, test_set[grp][test][variant], )
             print 
-            print "Each data size is tested %d times, hence the first %d entries on the x-axis correspond to the %d runs for the first data size of %s and so on.<br>" % (test_set[grp][test]["RUNS"],test_set[grp][test]["RUNS"],test_set[grp][test]["RUNS"],test_set[grp][test]["variants"][0],)
-            print "%s/accuracy/%s/%s_%s.png" % (picture_base,hostname, grp, test,)
+            print "Each data size is tested %d times, hence the first %d entries on the x-axis correspond to the %d runs for the first data size of %s and so on.<br>\n" % (test_set[grp][test]["RUNS"],test_set[grp][test]["RUNS"],test_set[grp][test]["RUNS"],test_set[grp][test]["variants"][0],)
+            print "%s/images/%s/%s_%s.png" % (picture_base,hostname, grp, test,)
             print
             file_plain = os.path.join(os.path.join(resultfolder,hostname),grp+"_"+test+"_plain.dat")
             file_marker = os.path.join(os.path.join(resultfolder,hostname),grp+"_"+test+"_marker.dat")
-            print "|| Variant || Plain (Min) || LIKWID (Min) || Plain (Max) || LIKWID (Max) || Plain (Avg) || LIKWID (Avg) ||"
+            print "| Variant | Plain (Min) | LIKWID (Min) | Plain (Max) | LIKWID (Max) | Plain (Avg) | LIKWID (Avg) |"
+            print "| ------- | ------- | ------- | ------- | ------- | ------- | ------- |"
             for i, variant in enumerate(test_set[grp][test]["variants"]):
                 results_plain = get_values_from_file(file_plain, i*test_set[grp][test]["RUNS"], test_set[grp][test]["RUNS"])
+                results_correct = get_values_from_file(file_correct, i*test_set[grp][test]["RUNS"], test_set[grp][test]["RUNS"])
                 results_marker = get_values_from_file(file_marker, i*test_set[grp][test]["RUNS"], test_set[grp][test]["RUNS"])
-                 if results_plain == []: results_plain.append(0)
-                 if results_marker == []: results_marker.append(0)
-                 print "|| %s || %d || %d || %d || %d || %d || %d ||" % (variant, min(results_plain), min(results_marker), max(results_plain), max(results_marker), int(statistics.mean(results_plain)), int(statistics.mean(results_marker)),)
+                if results_plain == []: results_plain.append(0)
+                if results_marker == []: results_marker.append(0)
+                if results_correct == []: results_correct.append(0)
+                print "| %s | %d | %d | %d | %d | %d | %d |" % (variant, min(results_correct), min(results_marker), max(results_correct), max(results_marker), int(statistics.mean(results_correct)), int(statistics.mean(results_marker)),)
             print
             print
diff --git a/test/accuracy/likwid-tester b/test/accuracy/likwid-tester
deleted file mode 100755
index ea264ae..0000000
--- a/test/accuracy/likwid-tester
+++ /dev/null
@@ -1,220 +0,0 @@
-#!/usr/bin/perl
-use lib '../../perl';
-use strict;
-use warnings;
-use xmgrace;
-use Cwd 'abs_path';
-use Data::Dumper;
-use File::Copy;
-
-my $domain = 'S0';
-
-my $hostname = `hostname`;
-chomp $hostname;
-my %GROUPS;
-my $TEST_ROOT = abs_path('./');
-my $RESULT_DIR   =  "$TEST_ROOT/RESULTS/$hostname";
-my $LIKWID_ROOT  =  "$TEST_ROOT/../..";
-my $PERFCTR      =  "$LIKWID_ROOT/likwid-perfctr";
-my $BENCH_PLAIN  =  "$LIKWID_ROOT/likwid-bench-plain";
-my $BENCH_MARKER =  "$LIKWID_ROOT/likwid-bench-marker";
-
-sub extract_result 
-{
-	my $type = shift;
-    my $REGEX;
-	my $REGEX_PLAIN = shift;
-	my $REGEX_MARKER = shift;
-
-	if ( $type eq 'plain' ) {
-		$REGEX = $REGEX_PLAIN;
-	}
-	elsif ($type eq 'marker') {
-		$REGEX = $REGEX_MARKER;
-	}
-
-	open (INPUT,"<out-$hostname.txt");
-	while (<INPUT>) {
-		if (/$REGEX/) {
-			return $1;
-		}
-	}
-	close INPUT;
-
-	return 0;
-}
-
-# determine capabilities of platform
-open (INPUT, "$PERFCTR -a |");
-
-while (<INPUT>) {
-    if (/(.+):/) {
-        $GROUPS{$1}='true';
-    }
-}
-
-close INPUT;
-
-mkdir $RESULT_DIR if (not -d $RESULT_DIR);
-
-
-# collect tests
-chdir ("$TEST_ROOT/TESTS") or die "Cannot change in $TEST_ROOT/TESTS $!\n";
-opendir (DIR, './') or die "Cannot open current directory: $!\n";
-my $TESTS = {};
-my $test_ptr;
-
-while (defined(my $file = readdir(DIR))) {
-    if ($file !~ /^\./) {
-        print "SCANNING $file\n";
-        open (TESTCASE, "<$file");
-        $file =~ s/.txt//;
-        $TESTS->{$file}->{benchmarks} = [];
-
-        while ( <TESTCASE> ) {
-
-            if (/REGEX_BENCH[ ](.+)/) {
-                $TESTS->{$file}->{REGEX_BENCH} = $1;
-            } elsif (/REGEX_PERF[ ](.+)/) {
-                $TESTS->{$file}->{REGEX_PERF} = $1;
-            } elsif (/TEST\s+(.+)/) {
-                push (@{ $TESTS->{$file}->{benchmarks} },
-                    {name => $1,
-                     runs => 0,
-                     variants => []});
-
-                $test_ptr = $TESTS->{$file}->{benchmarks}[-1];
-
-            } elsif (/RUNS\s+(.+)/) {
-                $test_ptr->{runs} = $1;
-            } elsif (/VARIANT\s+(.+B)\s+([0-9]+)/) {
-                push (@{ $test_ptr->{variants} },{size => $1, iter => $2});
-            }
-        }
-        close TESTCASE;
-    }
-}
-
-closedir DIR;
-chdir "$TEST_ROOT";
-
-# Read in Test set
-my %FILTER;
-open FILE,"<SET.txt";
-while ( <FILE> ) {
-    if ( not /^#/ ) {
-        chomp;
-        $FILTER{$_} = 'true';
-    }
-}
-close FILE;
-
-#run tests
-foreach my $test ( keys %$TESTS ) {
-
-    if ((exists $GROUPS{$test}) and (exists $FILTER{$test})) {
-        print "RUNNING $test : ";
-
-        foreach my $bench ( @{ $TESTS->{$test}->{benchmarks} } ) {
-            my $benchmark = $bench->{name};
-            my $runs      = $bench->{runs};
-            open (DATAFILE1, ">out-$hostname-1.dat");
-            open (DATAFILE2, ">out-$hostname-2.dat");
-            my $globalrun = 0;
-            print "$bench->{name} ";
-
-            foreach my $variant ( @{ $bench->{variants} } ) {
-                foreach ( 0 ... $runs ) {
-                    print DATAFILE1 "$globalrun ";
-                    print DATAFILE2 "$globalrun ";
-                    #print "$BENCH_PLAIN -g 1 -t $benchmark -i $variant->{iter} -w $domain:$variant->{size}:1 > out-$hostname.txt";
-                    system ("$BENCH_PLAIN -g 1 -t $benchmark -i $variant->{iter} -w $domain:$variant->{size}:1 > out-$hostname.txt");
-                    my $result = extract_result('plain',$TESTS->{$test}->{REGEX_BENCH},$TESTS->{$test}->{REGEX_PERF});
-                    print DATAFILE1 "$result\n";
-                    #print "$PERFCTR  -C E:". $domain .":0 -m -g $test $BENCH_MARKER -g 1 -t $benchmark -i $variant->{iter} -w $domain:$variant->{size}:1 > out-$hostname.txt";
-                    system ("$PERFCTR  -C E:". $domain .":1 -m -g $test $BENCH_MARKER -g 1 -t $benchmark -i $variant->{iter} -w $domain:$variant->{size}:1 > out-$hostname.txt");
-                    $result = extract_result('marker',$TESTS->{$test}->{REGEX_BENCH},$TESTS->{$test}->{REGEX_PERF});
-                    print DATAFILE2 "$result\n";
-                    $globalrun++;
-                }
-            }
-
-            close DATAFILE1;
-            close DATAFILE2;
-
-#output results
-            if (system('gracebat >/dev/null 2>&1') ) {
-                mkdir "$RESULT_DIR/tmp" if (not -d "$RESULT_DIR/tmp");
-
-                copy ("$LIKWID_ROOT/test/accuracy/out-$hostname-1.dat", "$RESULT_DIR/tmp/$test-$bench->{name}-plain.dat");
-                copy ("$LIKWID_ROOT/test/accuracy/out-$hostname-2.dat", "$RESULT_DIR/tmp/$test-$bench->{name}-marker.dat");
-
-            } else {
-
-                my $series = [];
-
-                push @{$series}, 
-                { "title"     =>  "plain",
-                    "data file" =>  "$LIKWID_ROOT/test/accuracy/out-$hostname-1.dat",
-                    "line" => {
-                        "type"      => "1",
-                        "color"     => "2",
-                        "linewidth" => "2",
-                        "linestyle" => "1",
-                        "pattern"   => "1",
-                    },
-                    "symbol" => {
-                        "type"      => "2",
-                        "color"     => "2",
-                        "pattern"   => "1",
-                        "linewidth" => "2",
-                        "linestyle" => "1",
-                        "size"      => "1",
-                        "fill pattern" => "1",
-                        "fill color" => "2",
-                    }
-                };
-
-                push @{$series}, 
-                { "title"     =>  "marker",
-                    "data file" =>  "$LIKWID_ROOT/test/accuracy/out-$hostname-2.dat",
-                    "line" => {
-                        "type"      => "1",
-                        "color"     => "4",
-                        "linewidth" => "2",
-                        "linestyle" => "1",
-                        "pattern"   => "1",
-                    },
-                    "symbol" => {
-                        "type"      => "3",
-                        "color"     => "4",
-                        "pattern"   => "1",
-                        "linewidth" => "2",
-                        "linestyle" => "1",
-                        "size"      => "1",
-                        "fill pattern" => "1",
-                        "fill color" => "4",
-                    }
-                };
-
-                xmgrace ({"title"           => "$test",
-                        "subtitle"          => "$bench->{name}",
-                        "legend"            => "0.8,0.7",
-                        "device"            => 'PNG',
-                        "output file"       => "$RESULT_DIR/$test\_".$bench->{name}.".png",
-                        "grace output file" => "$RESULT_DIR/$test\_".$bench->{name}.".agr",
-                        "xaxis label"       => "run",
-                        "yaxis label"       => "MFlops/s / MBytes/s"
-                    },
-                    $series);
-            }
-        }
-        print "\n";
-    }
-}
-
-unlink 'out-$hostname.txt';
-unlink 'out-$hostname-1.dat';
-unlink 'out-$hostname-2.dat';
-
-
diff --git a/test/accuracy/likwid-tester-plot b/test/accuracy/likwid-tester-plot
deleted file mode 100755
index ec6af41..0000000
--- a/test/accuracy/likwid-tester-plot
+++ /dev/null
@@ -1,78 +0,0 @@
-#!/usr/bin/perl
-use lib '../../perl';
-use strict;
-use warnings;
-use xmgrace;
-use Cwd 'abs_path';
-
-my $TEST_ROOT = abs_path('./');
-my $machine = $ARGV[0];
-my $RESULT_DIR = "$TEST_ROOT/RESULTS/$machine";
-
-chdir "$TEST_ROOT/RESULTS/$machine/tmp/";
-opendir (DIR, './') or die "Cannot open current directory: $!\n";
-
-while (defined(my $file = readdir(DIR))) {
-
-    if ($file =~ /([A-Z0-9_]+)-(.*)-marker\.dat/) {
-                my $series = [];
-                my $test = $1;
-                my $name = $2;
-
-                push @{$series}, 
-                { "title"     =>  "plain",
-                    "data file" =>  "$TEST_ROOT/RESULTS/$machine/tmp/$test-$name-plain.dat",
-                    "line" => {
-                        "type"      => "1",
-                        "color"     => "2",
-                        "linewidth" => "2",
-                        "linestyle" => "1",
-                        "pattern"   => "1",
-                    },
-                    "symbol" => {
-                        "type"      => "2",
-                        "color"     => "2",
-                        "pattern"   => "1",
-                        "linewidth" => "2",
-                        "linestyle" => "1",
-                        "size"      => "1",
-                        "fill pattern" => "1",
-                        "fill color" => "2",
-                    }
-                };
-
-                push @{$series}, 
-                { "title"     =>  "marker",
-                    "data file" =>  "$TEST_ROOT/RESULTS/$machine/tmp/$file",
-                    "line" => {
-                        "type"      => "1",
-                        "color"     => "4",
-                        "linewidth" => "2",
-                        "linestyle" => "1",
-                        "pattern"   => "1",
-                    },
-                    "symbol" => {
-                        "type"      => "3",
-                        "color"     => "4",
-                        "pattern"   => "1",
-                        "linewidth" => "2",
-                        "linestyle" => "1",
-                        "size"      => "1",
-                        "fill pattern" => "1",
-                        "fill color" => "4",
-                    }
-                };
-
-                xmgrace ({"title"           => "$test",
-                        "subtitle"          => "$name",
-                        "legend"            => "0.8,0.7",
-                        "device"            => 'PNG',
-                        "output file"       => "$RESULT_DIR/$test\_".$name.".png",
-                        "grace output file" => "$RESULT_DIR/$test\_".$name.".agr",
-                        "xaxis label"       => "run",
-                        "yaxis label"       => "MFlops/s / MBytes/s"
-                    },
-                    $series);
-    }
-}
-
diff --git a/test/executable_tests/Makefile b/test/executable_tests/Makefile
deleted file mode 100644
index 08acc2a..0000000
--- a/test/executable_tests/Makefile
+++ /dev/null
@@ -1,22 +0,0 @@
-
-
-all: topology pin perfctr memsweeper powermeter features bench genCfg setFreq
-
-topology:
-	./tester.sh likwid-topology
-pin:
-	./tester.sh likwid-pin
-perfctr:
-	./tester.sh likwid-perfctr
-memsweeper:
-	./tester.sh likwid-memsweeper
-powermeter:
-	./tester.sh likwid-powermeter
-features:
-	./tester.sh likwid-features
-bench:
-	./tester.sh likwid-bench
-genCfg:
-	./tester.sh likwid-genCfg
-setFreq:
-	./tester.sh likwid-setFreq
diff --git a/test/executable_tests/README b/test/executable_tests/README
deleted file mode 100644
index 99ab560..0000000
--- a/test/executable_tests/README
+++ /dev/null
@@ -1,8 +0,0 @@
-Simple commandline argument evaluation tool
-
-Usage: ./tester.sh <executable>
-
-For batch testing all executables simply type make
-
-All lines in the <executable>.txt file are executed and the output evaluated.
-Only simple checks are made using grep.
diff --git a/test/executable_tests/likwid-bench.txt b/test/executable_tests/likwid-bench.txt
deleted file mode 100644
index 474b160..0000000
--- a/test/executable_tests/likwid-bench.txt
+++ /dev/null
@@ -1,29 +0,0 @@
-| EXIT 0 | GREP Help message
--h | EXIT 0 | GREP Help message
--v | EXIT 0 | GREP likwid-bench
--p | EXIT 0 | GREP Domain
--a | EXIT 0 | GREP sum
--i | EXIT 1 | GREP requires an argument
--i 0 | EXIT 1 | GREP Iterations must be greater than 0
--i 100 | EXIT 1 | GREP Number of Workgroups must be 1 or greater
--l | EXIT 1 | GREP requires an argument
--l sum | EXIT 0 | GREP Name: sum
--l XXX | EXIT 0 | GREP Unknown test case XXX
--t | EXIT 1 | GREP requires an argument
--t sum | EXIT 1 | GREP Number of Workgroups must be 1 or greater
--t XXX | EXIT 1 | GREP Number of Workgroups must be 1 or greater
--g | EXIT 1 | GREP requires an argument
--g 0 | EXIT 1 | GREP Number of Workgroups must be 1 or greater
--g 1 | EXIT 1 | GREP workgroups requested but only 0 given on commandline
--g X | EXIT 1 | GREP Number of Workgroups must be 1 or greater
--w | EXIT 1 | GREP requires an argument
--g 1 -w X | EXIT 1 | GREP You need to specify a test case first
--t sum -g 1 -w X | EXIT 1 | GREP Error in parsing workgroup string
--t sum -g 1 -w N:1 | EXIT 1 | GREP Cannot parse string
--t XXX -g 1 -w N:1MB:1 | EXIT 1 | GREP You need to specify a test case first
--g 1 -w N:100kB:1 | EXIT 1 | GREP You need to specify a test case first
--i 100 -t sum -g 1 -w N:100kB:1 | EXIT 0 | GREP Number of Flops
--i 100 -t sum -g 2 -w N:100kB:1 | EXIT 1 | GREP workgroups requested but only 1 given on commandline
--i 100 -t sum -g 2 -w N:100kB:1 -w N:100kB:1 | EXIT 0 | GREP Number of Flops
--i 100 -t sum -g 1 -w N:100kB:2:1 | EXIT 1 | GREP Error in parsing workgroup string
--i 100 -t sum -g 1 -w N:100kB:2:1:2 | EXIT 0 | GREP Number of Flops
diff --git a/test/executable_tests/likwid-features.txt b/test/executable_tests/likwid-features.txt
deleted file mode 100644
index ce95592..0000000
--- a/test/executable_tests/likwid-features.txt
+++ /dev/null
@@ -1,9 +0,0 @@
-| EXIT 0 | GREP Performance monitoring | GREP CPU core id
--h | EXIT 0 | GREP Help message
--v | EXIT 0 | GREP likwid-features
--c | EXIT 1 | GREP option requires an argument
--s | EXIT 1 | GREP option requires an argument
--u | EXIT 1 | GREP option requires an argument
--c 0 | EXIT 0 | GREP Performance monitoring | GREP CPU core id
--s HW_PREFETCHER | EXIT 0 | GREP Performance monitoring | GREP CPU core id
--u HW_PREFETCHER | EXIT 0 | GREP Performance monitoring | GREP CPU core id
diff --git a/test/executable_tests/likwid-genCfg.txt b/test/executable_tests/likwid-genCfg.txt
deleted file mode 100644
index 6369b70..0000000
--- a/test/executable_tests/likwid-genCfg.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-| EXIT 1 | GREP Permission denied
--h | EXIT 0 | GREP Help message
--v | EXIT 0 | GREP likwid-genCfg
--o | EXIT 1 | GREP option requires an argument
--o /tmp/topo.txt | EXIT 0 | GREP CPU name
diff --git a/test/executable_tests/likwid-memsweeper.txt b/test/executable_tests/likwid-memsweeper.txt
deleted file mode 100644
index 6c4cd0e..0000000
--- a/test/executable_tests/likwid-memsweeper.txt
+++ /dev/null
@@ -1,8 +0,0 @@
-| EXIT 0 | GREP Sweeping domain
--h | EXIT 0 | GREP Help message
--v | EXIT 0 | GREP likwid-memsweeper
--c | EXIT 1 | GREP option requires an argument
--c - | EXIT 1 | GREP Cannot parse string
--c -1 | EXIT 0 | GREP Sweeping domain
--c 0 | EXIT 0 | GREP Sweeping domain
--c 10 | EXIT 1 | GREP ERROR | GREP numa
diff --git a/test/executable_tests/likwid-perfctr.txt b/test/executable_tests/likwid-perfctr.txt
deleted file mode 100644
index 80ac60d..0000000
--- a/test/executable_tests/likwid-perfctr.txt
+++ /dev/null
@@ -1,38 +0,0 @@
-| EXIT 0 | GREP Help message
--h | EXIT 0 | GREP Help message
--v | EXIT 0 | GREP likwid-perfctr
--i | EXIT 0 | GREP CPU family
--V -c 0 hostname | EXIT 0 | GREP NOTICE
--V | EXIT 1 | GREP You must specify at least one processor
--g | EXIT 1 | GREP option requires an argument
--g BRANCH -H | EXIT 0 | GREP Group BRANCH:
--a | EXIT 0 | GREP Available groups
--V -e | EXIT 0 | GREP This architecture
--t 200ms | EXIT 1 | GREP You must specify at least one processor
--c | EXIT 1 | GREP option requires an argument
--c 0 | EXIT 1 | GREP You have to specify a program to measure as argument
--t 200ms -c 0 | EXIT 1 | GREP Executable must be given on commandline
--S | EXIT 1 | GREP option requires an argument
--o | EXIT 1 | GREP option requires an argument
--o /tmp/test | EXIT 1 | GREP Outputfile has no filetype suffix
--o /tmp/test.txt | EXIT 1 | GREP You must specify at least one processor
--S 1 | EXIT 1 | GREP You must specify at least one processor
--S 1 -c 0 | EXIT 1 | GREP You have to specify a group or event set to measure using the -g option.
--S 1 -C 0 | EXIT 1 | GREP You have to specify a group or event set to measure using the -g option.
--S 1 -c 0 -g BRANCH | EXIT 0 | GREP Measuring group BRANCH | GREP Branch
--S 1 -C 0 -g BRANCH | EXIT 0 | GREP Measuring group BRANCH | GREP Branch
--S 1 -c 0,1 -g BRANCH | EXIT 0 | GREP Measuring group BRANCH | GREP core 0 | GREP core 1 | GREP Branch
--S 1 -c 0-1 -g BRANCH | EXIT 0 | GREP Measuring group BRANCH | GREP core 0 | GREP core 1 | GREP Branch
--S 1 -c 0,1-1 -g BRANCH | EXIT 0 | GREP Measuring group BRANCH | GREP core 0 | GREP core 1 | GREP Branch
--S 1 -C 0,1 -g BRANCH | EXIT 0 | GREP Measuring group BRANCH | GREP core 0 | GREP core 1 | GREP Branch
--S 1 -C 0-1 -g BRANCH | EXIT 0 | GREP Measuring group BRANCH | GREP core 0 | GREP core 1 | GREP Branch
--S 1 -C 0,1-1 -g BRANCH | EXIT 0 | GREP Measuring group BRANCH | GREP core 0 | GREP core 1 | GREP Branch
--S 1 -c E:N:2 -g BRANCH | EXIT 0 | GREP Measuring group BRANCH | GREP core 0 | GREP Branch
--S 1 -c E:N:2:1:2 -g BRANCH | EXIT 0 | GREP Measuring group BRANCH | GREP core 0 | GREP Branch
--S 1 -c M:scatter -g BRANCH | EXIT 0 | GREP Measuring group BRANCH | GREP core 0 | GREP Branch
--S 1 -C E:N:2 -g BRANCH | EXIT 0 | GREP Measuring group BRANCH | GREP core 0 | GREP Branch
--S 1 -C E:N:2:1:2 -g BRANCH | EXIT 0 | GREP Measuring group BRANCH | GREP core 0 | GREP Branch
--S 1 -C M:scatter -g BRANCH | EXIT 0 | GREP Measuring group BRANCH | GREP core 0 | GREP Branch
--c 0 -g BRANCH hostname | EXIT 0 | GREP Measuring group BRANCH | GREP core 0 | GREP Branch
--C 0 -g BRANCH hostname | EXIT 0 | GREP Measuring group BRANCH | GREP core 0 | GREP Branch
--C 0 -g BRANCH -m hostname | EXIT 1 | GREP  The marker result file could not be found
diff --git a/test/executable_tests/likwid-pin.txt b/test/executable_tests/likwid-pin.txt
deleted file mode 100644
index 801f79c..0000000
--- a/test/executable_tests/likwid-pin.txt
+++ /dev/null
@@ -1,26 +0,0 @@
--h | EXIT 0 | GREP Help message
--v | EXIT 0 | GREP likwid-pin
--i | EXIT 1 | GREP Executable must be given on commandline
--i hostname | EXIT 0 | GREP Set mem_policy to interleaved
--S | EXIT 1 |GREP Executable must be given on commandline
--S hostname | EXIT 0 | GREP Sweeping memory
--c | EXIT 1 |GREP option requires an argument
--p | EXIT 0 | GREP Domain | GREP Tag
--c 0 | EXIT 1 | GREP Executable must be given on commandline
--c 0 -p | EXIT 0 | GREP 0
--c N:0 -p | EXIT 0 | GREP 0
--c S0:0-1 -p | EXIT 0 | GREP 0,1
--c N:0 at N:1 -p | EXIT 0 | GREP 0,1
--c N:0 at N:1 at N:2 -p | EXIT 0 | GREP 0,1,2
--c C0:1-0 -p | EXIT 1 | GREP Range End
--c E:N:1 -p | EXIT 0 | GREP 0
--c E:N:2 -p | EXIT 0 | LISTLEN , 2
--c E:N:2:1:2 -p | EXIT 0 | LISTLEN , 2
--c E:N:2:1:2 -d . -p | EXIT 0 | LISTLEN . 2
--c M:scatter -p | EXIT 0
--s | EXIT 1 | GREP option requires an argument
--s 0x1 | EXIT 1 | GREP Executable must be given on commandline
--s 0x1 hostname | EXIT 0 | GREP Main PID
--q | EXIT 1 | GREP Executable must be given on commandline
--q hostname | EXIT 1 | NGREP Main PID
-
diff --git a/test/executable_tests/likwid-powermeter.txt b/test/executable_tests/likwid-powermeter.txt
deleted file mode 100644
index f733b06..0000000
--- a/test/executable_tests/likwid-powermeter.txt
+++ /dev/null
@@ -1,14 +0,0 @@
-| EXIT 0 | GREP Help message
--h | EXIT 0 | GREP Help message
--v | EXIT 0 | GREP likwid-powermeter
--i | EXIT 0 | GREP Base clock | GREP Power
--c | EXIT 1 | GREP option requires an argument | GREP Help message
--s | EXIT 1 | GREP option requires an argument | GREP Help message
--M | EXIT 1 | GREP option requires an argument | GREP Help message
--s 1 | EXIT 0 | GREP consumed
--c 0 | EXIT 1 | GREP Commandline option -c requires an executable if not used in combination with -s
--p | EXIT 1 | GREP Commandline option -p requires an executable
--c 0 -s 1 | EXIT 0 | GREP consumed | GREP Socket 0
--p hostname | EXIT 0 | Measuring group CLOCK
--c 0 hostname | EXIT 0 | GREP consumed | GREP Socket 0
--M 1 | EXIT 1 | GREP Either -s <seconds> or executable must be given on commandline
diff --git a/test/executable_tests/likwid-setFreq.txt b/test/executable_tests/likwid-setFreq.txt
deleted file mode 100644
index 56c495b..0000000
--- a/test/executable_tests/likwid-setFreq.txt
+++ /dev/null
@@ -1,6 +0,0 @@
-| EXIT 1 | GREP Usage
-0 | EXIT 1 | GREP Usage
-0 0 | EXIT 1 | GREP Frequency must be greater than 0
-0 -1 | EXIT 1 | GREP Frequency must be greater than 0
--1 -1 | EXIT 1 | GREP not a valid CPU ID. Range from 0 to
-100 0 | EXIT 1 | GREP not a valid CPU ID. Range from 0 to
diff --git a/test/executable_tests/likwid-topology.txt b/test/executable_tests/likwid-topology.txt
deleted file mode 100644
index 810b1e9..0000000
--- a/test/executable_tests/likwid-topology.txt
+++ /dev/null
@@ -1,11 +0,0 @@
--h | EXIT 0 | Help message
--v | EXIT 0 | GREP likwid-topology
--c | EXIT 0 | GREP Cache line size
--C | EXIT 0 | GREP CPU clock
--g | EXIT 0 | GREP +--------
--g -v | EXIT 0 | GREP likwid-topology
--c -g | EXIT 0 | GREP +-------- | GREP Cache line size
--c -g -C | EXIT 0 | GREP +-------- | GREP Cache line size | GREP CPU clock
--o | EXIT 1
--o /tmp/out | EXIT 1 | GREP filter suffix
--o /tmp/out.txt | EXIT 0
diff --git a/test/executable_tests/tester.sh b/test/executable_tests/tester.sh
deleted file mode 100755
index 71342df..0000000
--- a/test/executable_tests/tester.sh
+++ /dev/null
@@ -1,80 +0,0 @@
-#!/bin/bash
-
-if [ $# -ne 1 ]; then
-    echo "You need to give application to test on commandline"
-    exit 1
-fi
-
-EXECPATH=../..
-EXEC=$1
-TMPFILE=/tmp/testout
-
-f_grep() {
-    ARG="$1"
-    if [ `grep "${ARG}" ${TMPFILE} | wc -l` == "0" ]; then
-        return 1
-    fi
-    return 0
-}
-
-f_ngrep() {
-    ARG="$1"
-    if [ `grep "${ARG}" ${TMPFILE} | wc -l` != "0" ]; then
-        return 1
-    fi
-    return 0
-}
-
-f_listlen() {
-    LIST=$(cat ${TMPFILE})
-    DELIM=$(echo ${1} | cut -d ' ' -f 1)
-    COUNT=$(echo ${1} | cut -d ' ' -f 2)
-    CHARS=${LIST//[^${DELIM}]}
-    LENGTH=$(expr ${#CHARS} + 1)
-    if [ ${LENGTH} != "${COUNT}" ]; then
-        return 1
-    fi
-    return 0
-}
-
-if [ ! -e ${EXEC}.txt ]; then
-    echo "Cannot find testfile ${EXEC}.txt"
-    exit 1
-fi
-
-while read -r LINE || [[ -n $LINE ]]; do
-    if [ -z "${LINE}" ]; then continue; fi
-    if [[ "${LINE}" =~ \#.* ]]; then continue; fi
-
-    OPTIONS=$(echo "${LINE}" | cut -d '|' -f 1)
-    RESULTS=$(echo "${LINE}" | cut -d '|' -f 2-)
-    NUM_RESULTS="${RESULTS//[^|]}"
-    EXITCODE=$(${EXECPATH}/${EXEC} ${OPTIONS} 1>${TMPFILE} 2>&1  ; echo $?)
-    STATE=0
-    for ((i=1;i<=${#NUM_RESULTS}+1;i++)); do
-        RESULT=$(echo ${RESULTS} | cut -d '|' -f ${i})
-        RESULT_CMD=$(echo $RESULT | cut -d' ' -f1)
-        RESULT_OPTS=$(echo $RESULT | cut -d ' ' -f 2-)
-        if [ ${RESULT_CMD} == "EXIT" ]; then
-            if [ "${RESULT_OPTS}" != "$EXITCODE" ]; then
-                STATE=1
-            fi
-        elif [ ${RESULT_CMD} == "GREP" ]; then
-            f_grep "${RESULT_OPTS}"
-            STATE=$?
-        elif [ ${RESULT_CMD} == "NGREP" ]; then
-            f_ngrep "${RESULT_OPTS}"
-            STATE=$?
-        elif [ ${RESULT_CMD} == "LISTLEN" ]; then
-            f_listlen "${RESULT_OPTS}"
-            STATE=$?
-        fi
-    done
-    if [ $STATE -eq 0 ]; then
-        echo "SUCCESS : ${EXEC}" "${OPTIONS}"
-    else
-        echo "FAIL : ${EXEC}" "${OPTIONS}"
-    fi
-done < ${EXEC}.txt
-
-
diff --git a/test/stream.c b/test/stream.c
new file mode 100644
index 0000000..8abf05b
--- /dev/null
+++ b/test/stream.c
@@ -0,0 +1,191 @@
+#define _GNU_SOURCE
+#include <stdlib.h>
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/syscall.h>
+#ifdef _OPENMP
+#include <omp.h>
+# endif
+#include <stdint.h>
+#include <sys/time.h>
+#include <unistd.h>
+#include <errno.h>
+#include <sched.h>
+#include <pthread.h>
+
+#define ITER 10
+#define SIZE 40000000
+
+#define gettid() syscall(SYS_gettid)
+#include <likwid.h>
+#define HLINE "-------------------------------------------------------------\n"
+
+#ifndef MIN
+#define MIN(x,y) ((x)<(y)?(x):(y))
+#endif
+
+/* Two gettimeofday() samples that bracket one measured interval. */
+typedef struct {
+    struct timeval before, after;
+} TimeData;
+
+/* Store the current gettimeofday() reading as the start of the interval. */
+void time_start(TimeData* time)
+{
+    gettimeofday(&time->before, NULL);
+}
+
+/* Store the current gettimeofday() reading as the end of the interval. */
+void time_stop(TimeData* time)
+{
+    gettimeofday(&time->after, NULL);
+}
+
+/* Return the seconds elapsed between time->before and time->after. */
+double time_print(TimeData* time)
+{
+    /* Work in double: sec*1000000 evaluated in `long` overflows where long
+     * is 32 bits as soon as the interval exceeds about 2147 seconds. */
+    double sec  = (double)(time->after.tv_sec - time->before.tv_sec);
+    double usec = (double)time->after.tv_usec - (double)time->before.tv_usec;
+
+    return (sec * 1000000.0 + usec) / 1000000.0;
+}
+
+/* Return the lowest CPU index set in cpu_set, or CPU_SETSIZE if it is empty. */
+static int
+getProcessorID(cpu_set_t* cpu_set)
+{
+    int processorId;
+
+    /* Cover the whole set; the old fixed bound of 128 misreported larger hosts. */
+    for (processorId = 0; processorId < CPU_SETSIZE; processorId++) {
+        if (CPU_ISSET(processorId, cpu_set)) {
+            break;
+        }
+    }
+    return processorId;
+}
+
+/* Lowest-numbered CPU in the calling thread's affinity mask. */
+int  threadGetProcessorId()
+{
+    cpu_set_t  mask;
+    CPU_ZERO(&mask);   /* start from an empty mask */
+    sched_getaffinity(gettid(), sizeof(mask), &mask);
+    return getProcessorID(&mask);
+}
+
+/* Allocate `size` doubles with 64-byte alignment into *ptr; exit on failure. */
+void allocate_vector(double** ptr, uint64_t size)
+{
+    const char* reason;
+    int errorCode = ENOMEM;
+
+    /* Only call the allocator when size*sizeof(double) cannot wrap in size_t. */
+    if (size <= SIZE_MAX / sizeof(double))
+    {
+        errorCode = posix_memalign((void**) ptr, 64, size*sizeof(double));
+    }
+    if (errorCode == 0)
+    {
+        return;
+    }
+    /* Every non-zero code is fatal; previously unknown codes were ignored. */
+    reason = (errorCode == EINVAL) ? "Alignment parameter is not a power of two"
+           : (errorCode == ENOMEM) ? "Insufficient memory to fulfill the request"
+           : "Aligned allocation failed";
+    fprintf(stderr, "%s\n", reason);
+    exit(EXIT_FAILURE);
+}
+
+/* STREAM-like copy and triad bandwidth test with LIKWID marker regions. */
+int main(int argn, char** argc)
+{
+    double *a,*b,*c,*d;
+    TimeData timer;
+    double triad_time, copy_time;
+    /* Four vectors of SIZE doubles each, obtained through allocate_vector(). */
+    allocate_vector(&a, SIZE);
+    allocate_vector(&b, SIZE);
+    allocate_vector(&c, SIZE);
+    allocate_vector(&d, SIZE);
+
+#ifdef LIKWID_PERFMON
+    printf("Using likwid\n");
+#endif
+
+    LIKWID_MARKER_INIT;
+    /* OpenMP builds only: print the team size and threadGetProcessorId() per thread. */
+#ifdef _OPENMP
+    printf(HLINE);
+#pragma omp parallel
+    {
+	LIKWID_MARKER_THREADINIT;
+#pragma omp master
+	{
+	    printf ("Number of Threads requested = %i\n",omp_get_num_threads());
+	}
+
+	printf ("Thread %d running on processor %d ....\n",omp_get_thread_num(),threadGetProcessorId());
+    }
+#endif
+    /* Fill all four vectors before any timing starts. */
+#pragma omp parallel for
+    for (int j=0; j<SIZE; j++) {
+	a[j] = 1.0;
+	b[j] = 2.0;
+	c[j] = 0.0;
+	d[j] = 1.0;
+    }
+    /* Copy kernel (c = a): ITER sweeps, each bracketed by the "copy" marker. */
+    time_start(&timer);
+#pragma omp parallel
+    {
+        for (int k=0; k<ITER; k++) /* run by every thread; only the j loop is work-shared */
+        {
+            LIKWID_MARKER_START("copy");
+#pragma omp for
+            for (int j=0; j<SIZE; j++)
+            {
+                c[j] = a[j];
+            }
+            LIKWID_MARKER_STOP("copy");
+        }
+    }
+    time_stop(&timer);
+    copy_time = time_print(&timer)/(double)ITER; /* mean seconds per sweep */
+    /* Triad kernel (a = b + c*d): measured the same way under the "triad" marker. */
+    time_start(&timer);
+#pragma omp parallel
+    {
+        for (int k=0; k<ITER; k++) /* run by every thread; only the j loop is work-shared */
+        {
+
+            LIKWID_MARKER_START("triad");
+#pragma omp for
+            for (int j=0; j<SIZE; j++)
+            {
+                a[j] = b[j] +  c[j] * d[j];
+            }
+            LIKWID_MARKER_STOP("triad");
+        }
+    }
+    time_stop(&timer);
+    triad_time = time_print(&timer)/(double)ITER; /* mean seconds per sweep */
+
+    /* Bytes per sweep: copy touches 2 vectors (a, c), triad touches 4 (a, b, c, d). */
+    printf("Processed %.1f Mbyte at copy benchmark in %.4f seconds: %.2f MByte/s\n",
+                        1E-6*(2*SIZE*sizeof(double)),
+                        copy_time,
+                        1E-6*((2*SIZE*sizeof(double))/copy_time));
+    printf("Processed %.1f Mbyte at triad benchmark in %.4f seconds: %.2f MByte/s\n",
+                        1E-6*(4*SIZE*sizeof(double)),
+                        triad_time,
+                        1E-6*((4*SIZE*sizeof(double))/triad_time));
+
+
+    LIKWID_MARKER_CLOSE;
+    return 0;
+}
+

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/likwid/likwid.git



More information about the Likwid-commit mailing list